欢迎光临散文网 会员登陆 & 注册

WGCNA极速入门

2021-12-29 16:12 作者:Biogeek_Lau  | 我要投稿

setwd("E:/demo") #设定工作目录,路径需修改

options(digits=2)

###############原始数据下载base on R3.5.3###############

#GEOquery包,此处以数据集GSE3910为例,用getGEOSUppFiles()函数获取原始数据

library(GEOquery)

rawdata = getGEOSuppFiles("GSE42568")

#因网速问题,本人手动下载了该数据集的前6个样本数据并用于后面的分析,正常情况下使用上述函数下载数据后需要解压成单个CEL格式文件,并存放在同一个文件夹中,以备后面使用

###############原始数据读取----ReadAffy()函数

#使用choose.dir()函数选择文件夹

dir <- choose.dir(caption = "Select folder")

#列出CEL文件,保存到变量

cel.files <- list.files(path = dir, pattern = ".+\\.cel", ignore.case = TRUE,

            full.names = TRUE, recursive = TRUE)

#查看文件名

basename(cel.files)

library(affy)

#读入数据

data.raw <- ReadAffy(filenames = cel.files)

#读入芯片的默认样品名称是文件名,用sampleNames函数查看并修改:

sampleNames(data.raw)

###############样本重命名--使用stri_sub函数删除CEL等,仅保留GSM号

library(stringi)

#8或9或10

sampleNames(data.raw)<-stri_sub(sampleNames(data.raw),1,10)

sampleNames(data.raw)

###############下载样本信息,即每个gsm的临床信息

library(dplyr);library(rvest);library(stringr)

getmultiplegsmclin=function(a){

 baseurl="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="

 link=paste(baseurl,a,sep ='')

 con = url(link)

 htmlCode = readLines(con)

 clin=read_html(htmlCode[302]) %>%

  html_nodes("td")

 clin=str_sub(clin, end=-11)

  clin=stri_replace_all(clin,replacement=",",regex="<br>")

 print(paste("writing clin data for",a))

 write.table(clin, "clin.csv", sep = ",", row.names = a,col.names=F, append = T)}

lapply(sampleNames(data.raw),getmultiplegsmclin)

###############构建样本分组信息

group_file=pData(data.raw)

#group_file$sample=rownames(group_file)

#根据实际情况修改

group_file$disease=c(rep(0,time=17),rep(1,time=104))

#查看分组信息

group_file

###############基因芯片数据预处理---质量控制--查找并删除芯片质量差的芯片optional###############

#质量控制包括:质量分析报告、RLE箱线图、NUSE箱线图、RNA降解图,通过上述分析,进而剔除质量不合格的样本

library(hgu133plus2cdf)

#R>3.5使用BiocManager::install("hgu133plus2.db")

#source("http://bioconductor.org/biocLite.R")

#biocLite("hgu133plus2.db")

library(hgu133plus2.db)

T##3.2.10 使用 arrayQualityMetrics 包进行质量控制(一键生成所有图),消耗大量内存

library(arrayQualityMetrics)

arrayQualityMetrics(expressionset = data.raw,

          outdir = "fig",

          force = TRUE,

          do.logtransform = T)

dev.off()


###############基因芯片数据预处理:背景校正,标准化,汇总###############

#option1,如果样本多会消耗大量内存,三合一函数

#eset.mas<-expresso(afbatch = data.raw,bgcorrect.method = "mas",

#          normalize.method = "constant",pmcorrect.method ="mas",

#          summary.method = "mas")

#option2

#eset.mas<-justrma(data.raw)

eset.mas <- mas5(data.raw)

exprmat=exprs(eset.mas)

str(exprmat)

#删除内参探针

del <- grep("AFFX",rownames(exprmat)) #获取affx的基因名

exprmat <- exprmat[-del,]

dim(exprmat)

#对数据做了归一化处理

exprmat.quantiles = preprocessCore::normalize.quantiles(exprmat)

dimnames(exprmat.quantiles)=dimnames(exprmat)

#求探针表达均值并过滤低表达基因

expmean=apply(exprmat.quantiles,1,mean)

sd=apply(exprmat.quantiles,1,sd)

cv=sd/expmean

exprmat.quantiles<- exprmat.quantiles[expmean>quantile(expmean,0.25) & cv>quantile(cv,0.25),]

dim(exprmat.quantiles)

exprmat.quantiles<- log2(exprmat.quantiles)

#保存表达矩阵

#save(dataexp,files="GSE**_exp.RData")

write.csv(exprmat.quantiles,file="GS42568.csv",quote=F)

#limma进行差异基因分析

library(limma)

Exp <- factor(group_file$disease)

design <- model.matrix(~ 0+Exp)

rownames(design)=colnames(exprmat.quantiles)

#比较应根据实际修改,实验组-对照组

cont_matrix <- makeContrasts(bc = Exp1-Exp0,levels=design)

fit <- lmFit(exprmat.quantiles, design)

fit_contrast <- contrasts.fit(fit, cont_matrix)

fit_contrast <- eBayes(fit_contrast, trend=TRUE, robust=TRUE)

results <- decideTests(fit_contrast,p.value = 0.05,lfc = 1.3) #设置p和变化倍数阈值

summary(results)

volcanoplot(fit_contrast,coef=1,highlight=10,names=rownames(exprmat.quantiles))

top_genes <- topTable(fit_contrast, number = 100, adjust = "BH")

heatmap(exprmat.quantiles[rownames(top_genes),],labRow=F,labCol=F,ColSideColors=c(rep("#FF0000FF",17),rep("#BDFF00FF",104)),distfun=dist, 

    hclustfun = hclust,main = "blood from HIV resistant and negative women") #热图样本bar需要根据实际情况设置ColSideColors,需根据矩阵样本顺序设定颜色

DE_genes <- topTable(fit_contrast, number =summary(results)[1]+summary(results)[3], adjust = "BH")

write.csv(DE_genes,"DE_genes.csv")

write.csv(exprmat.quantiles[rownames(DE_genes),],"exprSet.significant.csv") #将差异基因保存输出,下面分析可直接读取该文件;

#exprSet.significant=read.csv("exprSet.significant.csv",header=T) #下次直接读取,进行后面的分析,差异基因一般要求几百或者几千个


#####WgcNA:读取准备数据#####

library(WgcNA);

enableWgcNAThreads(2) 

#memory.limit(6000) #限制占用最大内存数

options(stringsAsFactors = FALSE);

dat0=read.csv("exprSet.significant.csv",header=TRUE)  

datSummary=dat0[,1];

dim(dat0) #查看数据形状

datExpr = t(dat0[,2: ncol(dat0)]); #转置表达矩阵

dim(datExpr)

ArrayName= names(data.frame(dat0[,-1])) #列名提取为样本名

GeneName= dat0[,1] #首列提取为基因名

names(datExpr)=dat0[,1] #指定datExpr列名


####有trait数据或运行demo数据可以去掉注释#####

#datTraits=read.csv("trait.csv") #按照NCBI GEO数据集的样本信息构建csv,1列为样本名称,其他为性状及其值,对照可为0,实验组为1

#table(dimnames(datExpr)[[1]]==datTraits$sample) 

y=design[,2] #只有1种性状时,有2个的要设置z

#z=datTraits$age

#sizeGrWindow(9, 5)

#pdf("ClusterTreeSamples.pdf")

#plotClusterTreeSamples(datExpr=datExpr, y=y)

#dev.off()

#rm(dat0);

#gc()


#####WgcNA:进行Power选择#####

powers=c(seq(1,10,by=1),seq(12,16,by=2));

sft=pickSoftThreshold(datExpr, powerVector=powers,networkType = "signed") 

sizeGrWindow(9, 5); #打开画布

pdf('choosing power.pdf');#生成pdf文件

par(mfrow = c(1,1)); #画布布局,1行1列

cex1 = 0.9; #power阈值,可减为0.8

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],

   xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence")); #画图

text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red"); #添加文本

abline(h=0.90,col="red"); #画高位0.9的水平红色直线

dev.off() #关闭

# 同上画Mean connectivity对power的函数

sizeGrWindow(9, 5);

pdf('mean connectivity.pdf');

plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))

text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red");

dev.off()


#####进行WgcNA#####

softPower =14 #参数可改,在上面那power图里曲线平滑时的第一个数,一般不大于12

Connectivity=softConnectivity(datExpr,corFnc = "cor", corOptions = "use ='p'",power=softPower,type = "signed")

pdf("scale-free.pdf");

scaleFreePlot(Connectivity,nBreaks = 10,truncated = FALSE,removeFirst = FALSE, main = "");

dev.off()

adjacency = adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'",type = "signed", power = softPower)

TOM = TOMsimilarity(adjacency,TOMType="signed");

dissTOM = 1-TOM

geneTree = hclust(as.dist(dissTOM), method = "average")

minModuleSize =30; #模块最小基因数,参数可改,模块太少要改小

dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,deepSplit = 4, pamRespectsDendro = FALSE,minClusterSize = minModuleSize,cutHeight=0.99); #deepSplit = 0参数可改,0-4越大越敏感,模块数越多,5000个基因一般10-40个模块

table(dynamicMods)

dynamicColors = labels2colors(dynamicMods) #模块编号转换为颜色

table(dynamicColors)

MEList = moduleEigengenes(datExpr, colors = dynamicMods) #colors = dynamicColors

MEs = MEList$eigengenes

MEDiss = 1-cor(MEs);

METree = hclust(as.dist(MEDiss), method = "average");#

sizeGrWindow(7, 6)

plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")

MEDissThres = 0.2

abline(h=MEDissThres, col = "red")

merge = mergeCloseModules(datExpr, dynamicMods, cutHeight = MEDissThres, verbose = 3); #合并相似的模块为1个

mergedColors = merge$colors;

table(mergedColors)

mergedMEs = merge$newMEs;

sizeGrWindow(12, 9)

pdf("DendroAndColors.pdf")

plotDendroAndColors(geneTree, cbind(dynamicMods, mergedColors),c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)

dev.off()

moduleColors = mergedColors

MEs = mergedMEs;

MEDiss = 1-cor(MEs);

METree = hclust(as.dist(MEDiss), method = "average");#

pdf("METree.pdf")

plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")

dev.off()

nSamples=nrow(datExpr)

geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p")); #计算基因属于某个模块的程度

MMPvalue = cbind.data.frame(datSummary,corPvalueStudent(as.matrix(geneModuleMembership), nSamples)); #计算其P值

write.table(data.frame(ArrayName,MEs),"MEs.csv",row.name=F) #输出模块基因表达信息

kMEdat=data.frame(geneModuleMembership,MMPvalue)

write.table(data.frame(datSummary,kMEdat),"kME-MMPvalue.csv",row.names=FALSE)

k.in=intramodularConnectivity(adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'", type = "signed", power = softPower), moduleColors,scaleByMax = FALSE)

datout=data.frame(datSummary, colorNEW=moduleColors, k.in) 

write.table(datout, file="OutputCancerNetwork.csv", sep=",", row.names=F) #输出连接度及模块信息

hubs  = chooseTopHubInEachModule(datExpr, moduleColors)

write.csv(data.frame(module=names(hubs),moduleColor=labels2colors(names(hubs)),hub=hubs),"num2color.csv",row.names=F)

#####模块基因功能富集分析#####

gene=read.csv("OutputCancerNetwork.csv",header=T)

library(gProfileR)

for (i in unique(gene$colorNEW)[unique(gene$colorNEW)>0]){

 genes=subset(gene$datSummary,gene$colorNEW==i)

 go=gprofiler(as.vector(genes), 

        organism = "hsapiens",numeric_ns="ENTREZGENE_ACC")[,-14]

 write.table(go,"module_enrichment.csv",append =T,row.names=rep(i,nrow(go)),sep=",")}


#####画TOM及热图:可不做#####

pdf("TOM.pdf")

TOMplot(dissTOM , geneTree, moduleColors)

dev.off()

#画MDS图:可不做

pdf("MDS.pdf")

cmd1=cmdscale(as.dist(dissTOM),2)

par(mfrow=c(1,1))

plot(cmd1, col=as.character(moduleColors), main="MDS plot",xlab="Scaling Dimension 1",ylab="Scaling Dimension 2")

dev.off()


#####画module heatmap and the eigengene#####

pdf("modheatmap.pdf")

for (i in unique(moduleColors)[unique(moduleColors)!=0]){

ME=MEs[, paste("ME",i, sep="")] #ME是模块的基因表达方向

par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))

plotMat(t(scale(datExpr[,moduleColors==i ])),nrgcols=30,rlabels=F,rcols=labels2colors(i),main=labels2colors(i), cex.main=2)}

dev.off()

pdf("eigenebar.pdf")

for (i in unique(moduleColors)[unique(moduleColors)!=0]){

par(mar=c(5, 4.2, 0, 0.7))

barplot(ME, col=i, main="", cex.main=2,ylab="Eigengene expression",xlab=labels2colors(i))}

dev.off()


#####有性状数据时做,模块基因表达与trait进行关联#####

signif(cor(y,MEs, use="p"),2)

p.values = corPvalueStudent(cor(y,MEs, use="p"), nSamples = nSamples) #trait和模块相关分析的P值

#Measure of module significance as average gene significance

GS1=as.numeric(cor(y,datExpr, use="p")) #trait和基因表达的相关系数

GeneSignificance=abs(GS1)

# Next, module significance is defined as average gene significance.

ModuleSignificance=tapply(GeneSignificance, moduleColors, mean, na.rm=T) #求模块内GS1的均值

sizeGrWindow(8,7)

par(mfrow = c(1,1))

plotModuleSignificance(GeneSignificance[moduleColors!=0],moduleColors[moduleColors!=0])

#plot Gene significance (y-axis) vs. intramodular connectivity (x-axis) 画图gene sig和连接度散点

colorlevels=unique(moduleColors)

colorlevels=colorlevels[colorlevels!=0]

sizeGrWindow(9,6)

par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))

par(mar = c(4,5,3,1))

for (i in c(1:length(colorlevels)))

{

 whichmodule=colorlevels[[i]];

 restrict1 = (moduleColors==whichmodule);

 verboseScatterplot(k.in$kWithin[restrict1],

           GeneSignificance[restrict1], col=moduleColors[restrict1],

           main=whichmodule,

           xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)

}

#Generalizing intramodular connectivity for all genes on the array

datKME=signedKME(datExpr, MEs, outputColumnName="MM.")

# Display the first few rows of the data frame

head(datKME)


#####有性状时做,筛选和性状相关的基因#####

NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=softPower, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)

GeneResultsNetworkScreening=data.frame(GeneName=row.names(NS1), NS1)

#write.table(GeneResultsNetworkScreening, file="GeneResultsNetworkScreening.csv",row.names=F,sep=",")

MEsy = data.frame(y, MEs)

eigengeneSignificance = cor(MEsy, y); #计算模块和性状的相关系数

eigengeneSignificance[1,1] = (1+max(eigengeneSignificance[-1, 1]))/2

eigengeneSignificance.pvalue = corPvalueStudent(eigengeneSignificance, nSamples = length(y)) #计算p值

namesME=names(MEsy)

# Form a summary data frame

out1=data.frame(t(data.frame(eigengeneSignificance,eigengeneSignificance.pvalue, namesME, t(MEsy))))

# Set appropriate row names

dimnames(out1)[[1]][1]="EigengeneSignificance"

dimnames(out1)[[1]][2]="EigengeneSignificancePvalue"

dimnames(out1)[[1]][3]="ModuleEigengeneName"

dimnames(out1)[[1]][-c(1:3)]=dimnames(datExpr)[[1]]

# Write the data frame into a file

write.table(out1, file="MEResultsNetworkScreening.csv", row.names=TRUE, col.names = TRUE, sep=",")

GeneName=dimnames(datExpr)[[2]]

GeneSummary=data.frame(GeneName, moduleColors, NS1)

write.table(GeneSummary, file="GeneSummaryTutorial.csv", row.names=F,sep=",")

datTraits=data.frame(ArrayName, MEsy)

dimnames(datTraits)[[2]][2:length(namesME)]=paste("Trait",dimnames(datTraits)[[2]][2:length(namesME)],sep=".")

write.table(datTraits, file="TraitsTutorial.csv", row.names=F,sep=",")

#Relationships among the top 30 most significant genes,correlation heatmaps for signed network:

sizeGrWindow(7,7)

NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=softPower, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)

topList=rank(NS1$p.Weighted,ties.method="first")<=30

#gene.names= names(datExpr)[topList]

gene.names=GeneName[topList]

colnames(datExpr)=GeneName

# The following shows the correlations between the top genes

plotNetworkHeatmap(datExpr, plotGenes = gene.names,networkType="signed", useTOM=TRUE,power=softPower, main="signed correlations")


#####自动进行所有模块的输出automatic finish the Cytoscape mods,必须做shiny用#####

probes = dat0[,1]

n=length(unique(moduleColors)[unique(moduleColors)!=0])

pb <- txtProgressBar(min = 0, max = n, style = 3) #添加进度条

for (p in 1:n) { modules=unique(moduleColors)[unique(moduleColors)!=0][p]

inModule = is.finite(match(moduleColors,modules));

modProbes = probes[inModule];

modTOM = TOM[inModule, inModule];

dimnames(modTOM) = list(modProbes, modProbes)

cyt = exportNetworkToCytoscape(modTOM,

                edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),

                nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),

                weighted = TRUE,threshold = quantile(abs(modTOM),probs=0.8,nodeNames = modProbes ,nodeAttr = moduleColors[inModule]))  

#threshold can be replaced by quantile(abs(modTOM),probs=0.8)

setTxtProgressBar(pb, p)}

close(pb) #关闭进度条


#####有性状数据时做,多个trait和module联系起来#####

allTraits = read.csv("trait.csv",row.names = 1);

femaleSamples=rownames(datExpr);

#traitRows = match(femaleSamples, allTraits$arrayname);

traitRows=match(femaleSamples, names(allTraits))

datTraits = allTraits[,1:6];

nGenes = ncol(datExpr);

nSamples = nrow(datExpr);

moduleTraitCor = WgcNA::cor(MEs[names(MEs)!="ME0"], datTraits, use = "p");

moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

sizeGrWindow(10,6)

# Will display correlations and their p-values

textMatrix = paste(signif(moduleTraitCor, 2), "\n(",

          signif(moduleTraitPvalue, 1), ")", sep = "");

dim(textMatrix) = dim(moduleTraitCor)

par(mar = c(6, 8.5, 3, 3));

# Display the correlation values within a heatmap plot

labeledHeatmap(Matrix = moduleTraitCor,#moduleTraitCor

        # xLabels = names(datTraits),

        xLabels =names(datTraits), #如果只有1个性状,图比较难看,显示性状与模块的关系

        yLabels = names(MEs[names(MEs)!="ME0"]),

        ySymbols = names(MEs[names(MEs)!="ME0"]),

        colorLabels = FALSE,

        colors = greenWhiteRed(50),

        textMatrix =textMatrix,#textMatrix

        setStdMargins = FALSE,

        cex.text = 1,

        zlim = c(-1,1),

        main = paste("Module-trait relationships"))

#可以输出热图中的数据,textMatrix可以删去

out2=data.frame(data.frame(moduleTraitCor,moduleTraitPvalue))

write.table(out2, file="trait-module relationship.csv", row.names=TRUE, col.names = TRUE, sep=",")

#cor函数可以有use和method参数的设置 : Missing values present in input data. Consider using use = 'pairwise.complete.obs'.

#注意:大小写是不同的,特别是文件名等!


###############suvival analysis(optional)##############

library(tidyverse);library(tidytidbits);library(survivalanalysis);library(survival);library(survminer);library(pROC)

#####Fit survival data using the Kaplan-Meier method

data=cbind(MEs[names(MEs)!="ME0"],allTraits[,7:10])

surv_object <- Surv(time = allTraits$overall.survival.time_days, event = allTraits$overall.survival.event)

pdf("overall_sruvival.pdf")

for (a in 1:6){

 group=as.numeric(MEs[,a]>median(unlist(MEs[,a]))) #change a in combine_exp[a,]

 fit2 <- survfit(surv_object ~ group, data = data)

 print(ggsurvplot(fit2, data =data, pval = TRUE,risk.table=TRUE, xlab="Time (days)",title=names(data)[a])) #use print

}

dev.off()


WGCNA极速入门的评论 (共 条)

分享到微博请遵守国家法律