欢迎光临散文网 会员登陆 & 注册

「生信技能树」三阴性乳腺癌表达矩阵探索

2021-11-14 11:09 作者:zlzrzp  | 我要投稿

操作数据与视频案例不一样,已根据实际情况改动,请注意契合度。


library("GEOquery")

load("GSE55235.Rdata")


a=gset[[1]]

dat=exprs(a)

pd=pData(a)

#这里跟案例操作不一样,因为我用的GSE里有三种类型数据,所以我选取了其中两种


RA=rownames(pd[grepl('disease state: rheumatoid arthritis',as.character(pd$characteristics_ch1)),])

NOR=rownames(pd[grepl('disease state: healthy control',as.character(pd$characteristics_ch1)),])

dat=dat[,c(RA,NOR)]

group_list=c(rep("RA",times=10),rep("NOR",times=10))

save(dat,group_list,file="expression and pd")


rm(list = ls())

#PCA

load(file = "expression and pd")

dat=t(dat)

dat=as.data.frame(dat)

dat=cbind(dat,group_list)  

library("ggplot2")

library("factoextra")

library("FactoMineR")

#the the variable group_list (index =22284)is removed 

dat.pca <- PCA(dat[,-22284],graph=FALSE)

fviz_pca_ind(dat.pca,

       geom.ind = "point",

       col.ind = dat$group_list,

       palette = c("#00AFBB","#E7B800"),

       addEllipses = TRUE,

       legend.title = "Groups"

)



rm(list = ls())

# heatmap

load(file = "expression and pd")

cg=names(tail(sort(apply(dat,1,sd)),20))

library(pheatmap)

n=t(scale(t(dat[cg,])))

#让图区别明显

n[n>2]=2

n[n<-2]=-2

pheatmap(n,show_rownames = F, show_colnames = F)

#加分组信息

ac=data.frame(g=group_list)

row.names(ac)=colnames(n)

pheatmap(n,annotation_col = ac)


#boxplot

rm(list = ls())

load(file = "expression and pd")

library("ggplot2")

library("ggpubr")

bp=function(g){library("ggplot2")

 library("ggpubr")

 df=data.frame(gene=g,stage=group_list)

 p <- ggboxplot(df,x="stage",y="gene",

        color="stage",palette="jco",

        add="jitter")

 #add p value

 p + stat_compare_means()

}

bp(dat[1,])

bp(dat[2,])

rm(list = ls())

#差异分析

load(file = "expression and pd")


library("limma")

design=model.matrix(~factor(group_list))

fit=lmFit(dat,design)

fit=eBayes(fit)

options(digits = 4)

topTable(fit,coef=2,adjust="BH")

deg=topTable(fit,coef=2,adjust="BH",number = Inf)

save(deg,file = "P value ")


rm(list = ls())

#基因ID转换

load(file = "P value")

deg$probe_id=rownames(deg)

library("hgu133plus2.db")

ids=toTable(hgu133plus2SYMBOL)


deg=merge(deg,ids,by="probe_id")

save(deg,file = "symbol")


#绘制火山图

if(T){

 nrDEG=deg

 head(nrDEG)

 attach(nrDEG)

 plot(logFC,-log10(P.Value))

 library("ggplot2")

 library("ggpubr")

 df=nrDEG

 df$v=-log10(P.Value)

 ggscatter(df,x="logFC",y="v",size=0.5)

  

 df$g=ifelse(df$P.Value>0.01,"stable",

       ifelse(df$logFC>2,"up",

           ifelse(df$logFC<-2,"down","stable"))

 )

 table(df$g)

 df$name=rownames(df)

 ggscatter(df,x="logFC",y="v",size=0.5,color="g")

 ggscatter(df,x="logFC",y="v",color="g",size=0.5,

      label="symbol",repel=T,

      label.select=c("MMP3","MT1X"),

      palette=c("#00AFBB","#E78800","#FC4E07"))

  

 ggscatter(df,x="AveExpr",y="logFC",size=0.2)

 df$p_c=ifelse(df$P.Value<0.001,"p<0.001",

        ifelse(df$P.Value<0.01,"0.001<p<0.01","p>0.01"))

 table(df$p_c)

 ggscatter(df,x="AveExpr",y="logFC",color="p_c",size=0.2,

      palette=c("green","red","black"))

  

}

#绘制热图


if(T){

 #热图需要矩阵数据

 load(file = "expression and pd")

 load(file = "P value")

 dat[1:4,1:4]

 #取探针名

 x=deg$logFC

 deg$probe_id=rownames(deg)

 names(x)=deg$probe_id

 #根据logFC大小来取探针名;head前多少;tail后多少

 cg=c(names(head(sort(x),100)),

    names(tail(sort(x),100)))

 library(pheatmap)

 pheatmap(dat[cg,],show_colnames=F,show_rownames=F)

 #归一化后画图

 n=t(scale(t(dat[cg,])))

 n[n>2]=2

 n[n<-2]=-2

 n[1:4,1:4]

 pheatmap(n,show_colnames=F,show_rownames=F)

 #加回group_list

 ac=data.frame(g=group_list)

 rownames(ac)=colnames(n)

 pheatmap(n,show_colnames=F,

      show_rownames=F,

      #不排序就是F,默认排序T

      cluster_cols=F,

      annotation_col=ac)

}



rm(list = ls())

#注释

load(file = "symbol")

head(deg)

##这里出大问题,我的logFC运行代码后全部变成阈值了,待解决;然后我绕了一下路采用辅助列恢复原值

logFC_t=648.43

logFC_true=deg$logFC


deg=cbind(deg,logFC_true)

deg$g=ifelse(deg$P.Value>0.01,"stable",

       ifelse( deg$logFC >logFC_t,"up",

          ifelse( deg$logFC<-logFC_t,"down","stable"))

)

table(deg$g)

deg$logFC=deg$logFC_true

deg=deg[,-9]


#有些包不需要加载,同时关系到下面的一些函数,发现函数找不到时尝试重新加载包

library("ggplot2")

library("clusterProfiler")

library("AnnotationDbi")

library("stats4")

library("BiocGenerics")

library("org.Hs.eg.db")

df <- bitr(unique(deg$symbol),fromType="SYMBOL",

      toType=c("ENTREZID"),

      OrgDb=org.Hs.eg.db)


head(df)

head(deg)

deg$SYMBOL=rownames(deg)

deg=merge(deg,df,by.y="SYMBOL",by.x="symbol")

head(deg)


gene_up=deg[deg$g=="up","ENTREZID"]

gene_down=deg[deg$g=="down","ENTREZID"]

gene_diff=c(gene_up,gene_down)

#

gene_all=as.character(deg[,"ENTREZID"])

data(geneList,package = "DOSE")

head(geneList)

#这里两个图还没有理解

boxplot(geneList)

boxplot(deg$logFC)


geneList=deg$logFC

names(geneList)=deg$ENTREZID

geneList=sort(geneList,decreasing = T)


## KEGG pathway analysis

if(T){

 ### over=representation test

 kk.up <- enrichKEGG(gene = gene_up,

           organism = "hsa",

           universe = gene_all,

           pvalueCutoff = 0.9,

           qvalueCutoff = 0.9)

 head(kk.up)[,1:6]

 kk.down <- enrichKEGG(gene = gene_down,

            organism = "hsa",

            universe = gene_all,

            pvalueCutoff = 0.05)

 head(kk.down)[,1:6]

 kk.diff <- enrichKEGG(gene = gene_diff,

            organism = "hsa",

            pvalueCutoff = 0.05)

 head(kk.diff)[,1:6]

  

 kegg_diff_dt <- as.data.frame(kk.diff)

 kegg_down_dt <- as.data.frame(kk.down)

 kegg_up_dt <- as.data.frame(kk.up)

 down_kegg <- kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1

 up_kegg <- kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1

 #这里的自定义函数指路https://www.jianshu.com/p/0dd8f42a92ba

 

 source("functions.R")

 g_kegg=kegg_plot(up_kegg,down_kegg)

 print(g_kegg)

  

 ggsave(g_kegg,filename="kegg_up_down.png")

  

 #GSEA

 #这里报错不用管

 kk_gse <- gseKEGG(geneList = geneList,

          organism = "hsa",

          verbose = FALSE)

 head(kk_gse)[,1:6]

 gseaplot(kk_gse,geneSetID = rownames(kk_gse[1,]))

  

 down_kegg <- kk_gse[kk_gse$pvalue<0.05&kk_gse$enrichmentScore<0,];down_kegg$group

 up_kegg <- kk_gse[kk_gse$pvalue<0.05&kk_gse$enrichmentScore<0,];up_kegg$group

  

 source("functions.R")

 #这里有错,待解决:Error in `$<-.data.frame`(`*tmp*`, "pvalue", value = numeric(0)) : 替换数据里有0行,但数据有4


 g_kegg=kegg_plot(up_kegg,down_kegg)

 print(g_kegg)

 ggsave(g_kegg,filename = "kegg_up_down_gsea.png")

  

}


#GO databases analysis

{

g_list=list(gene_up=gene_up,

      gene_down=gene_down,

      gene_diff=gene_diff)

if(F){

 go_enrich_results <- lapply(g_list,function(gene){

  lapply(c("BP","MF","CC"),function(ont){

   cat(paste("Now process",ont))

   ego <- enrichGO(gene = gene,

           universe = gene_all,

           OrgDb = org.Hs.eg.db,

           ont = ont,

           pAdjustMethod = "BH",

           pvalueCutoff = 0.99,

           qvalueCutoff = 0.99,

           readable = TRUE)

   print(head(ego))

   return(ego)

  })

 })

}

}


#根据需要画图

rm(list = ls())

options(stringsAsFactors = F)

load(file = "expression and pd")

dat[1:4,1:4]

library("hgu133plus2.db")

p2s=toTable(hgu133plus2SYMBOL)


a=p2s$symbol %in% c()

np=p2s[a,1]

ng=p2s[a,2]

dat[np,1:4]

x=dat[np,]

rownames(x)=paste(ng,np,sep = ":")

library('pheatmap')

tmp=data.frame(group=group_list)

rownames(tmp)=colnames(x)

pheatmap(x,annotation_col=tmp)


「生信技能树」三阴性乳腺癌表达矩阵探索的评论 (共 条)

分享到微博请遵守国家法律