「生信技能树」三阴性乳腺癌表达矩阵探索

操作数据与视频案例不一样,已根据实际情况改动,请注意契合度。
library("GEOquery")
load("GSE55235.Rdata")
a=gset[[1]]
dat=exprs(a)
pd=pData(a)
#这里跟案例操作不一样,因为我用的GSE里有三种类型数据,所以我选取了其中两种
RA=rownames(pd[grepl('disease state: rheumatoid arthritis',as.character(pd$characteristics_ch1)),])
NOR=rownames(pd[grepl('disease state: healthy control',as.character(pd$characteristics_ch1)),])
dat=dat[,c(RA,NOR)]
group_list=c(rep("RA",times=10),rep("NOR",times=10))
save(dat,group_list,file="expression and pd")
rm(list = ls())
#PCA
load(file = "expression and pd")
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list)
library("ggplot2")
library("factoextra")
library("FactoMineR")
#the the variable group_list (index =22284)is removed
dat.pca <- PCA(dat[,-22284],graph=FALSE)
fviz_pca_ind(dat.pca,
geom.ind = "point",
col.ind = dat$group_list,
palette = c("#00AFBB","#E7B800"),
addEllipses = TRUE,
legend.title = "Groups"
)
rm(list = ls())
# heatmap
load(file = "expression and pd")
cg=names(tail(sort(apply(dat,1,sd)),20))
library(pheatmap)
n=t(scale(t(dat[cg,])))
#让图区别明显
n[n>2]=2
n[n<-2]=-2
pheatmap(n,show_rownames = F, show_colnames = F)
#加分组信息
ac=data.frame(g=group_list)
row.names(ac)=colnames(n)
pheatmap(n,annotation_col = ac)
#boxplot
rm(list = ls())
load(file = "expression and pd")
library("ggplot2")
library("ggpubr")
bp=function(g){library("ggplot2")
library("ggpubr")
df=data.frame(gene=g,stage=group_list)
p <- ggboxplot(df,x="stage",y="gene",
color="stage",palette="jco",
add="jitter")
#add p value
p + stat_compare_means()
}
bp(dat[1,])
bp(dat[2,])
rm(list = ls())
#差异分析
load(file = "expression and pd")
library("limma")
design=model.matrix(~factor(group_list))
fit=lmFit(dat,design)
fit=eBayes(fit)
options(digits = 4)
topTable(fit,coef=2,adjust="BH")
deg=topTable(fit,coef=2,adjust="BH",number = Inf)
save(deg,file = "P value ")
rm(list = ls())
#基因ID转换
load(file = "P value")
deg$probe_id=rownames(deg)
library("hgu133plus2.db")
ids=toTable(hgu133plus2SYMBOL)
deg=merge(deg,ids,by="probe_id")
save(deg,file = "symbol")
#绘制火山图
if(T){
nrDEG=deg
head(nrDEG)
attach(nrDEG)
plot(logFC,-log10(P.Value))
library("ggplot2")
library("ggpubr")
df=nrDEG
df$v=-log10(P.Value)
ggscatter(df,x="logFC",y="v",size=0.5)
df$g=ifelse(df$P.Value>0.01,"stable",
ifelse(df$logFC>2,"up",
ifelse(df$logFC<-2,"down","stable"))
)
table(df$g)
df$name=rownames(df)
ggscatter(df,x="logFC",y="v",size=0.5,color="g")
ggscatter(df,x="logFC",y="v",color="g",size=0.5,
label="symbol",repel=T,
label.select=c("MMP3","MT1X"),
palette=c("#00AFBB","#E78800","#FC4E07"))
ggscatter(df,x="AveExpr",y="logFC",size=0.2)
df$p_c=ifelse(df$P.Value<0.001,"p<0.001",
ifelse(df$P.Value<0.01,"0.001<p<0.01","p>0.01"))
table(df$p_c)
ggscatter(df,x="AveExpr",y="logFC",color="p_c",size=0.2,
palette=c("green","red","black"))
}
#绘制热图
if(T){
#热图需要矩阵数据
load(file = "expression and pd")
load(file = "P value")
dat[1:4,1:4]
#取探针名
x=deg$logFC
deg$probe_id=rownames(deg)
names(x)=deg$probe_id
#根据logFC大小来取探针名;head前多少;tail后多少
cg=c(names(head(sort(x),100)),
names(tail(sort(x),100)))
library(pheatmap)
pheatmap(dat[cg,],show_colnames=F,show_rownames=F)
#归一化后画图
n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n<-2]=-2
n[1:4,1:4]
pheatmap(n,show_colnames=F,show_rownames=F)
#加回group_list
ac=data.frame(g=group_list)
rownames(ac)=colnames(n)
pheatmap(n,show_colnames=F,
show_rownames=F,
#不排序就是F,默认排序T
cluster_cols=F,
annotation_col=ac)
}
rm(list = ls())
#注释
load(file = "symbol")
head(deg)
##这里出大问题,我的logFC运行代码后全部变成阈值了,待解决;然后我绕了一下路采用辅助列恢复原值
logFC_t=648.43
logFC_true=deg$logFC
deg=cbind(deg,logFC_true)
deg$g=ifelse(deg$P.Value>0.01,"stable",
ifelse( deg$logFC >logFC_t,"up",
ifelse( deg$logFC<-logFC_t,"down","stable"))
)
table(deg$g)
deg$logFC=deg$logFC_true
deg=deg[,-9]
#有些包不需要加载,同时关系到下面的一些函数,发现函数找不到时尝试重新加载包
library("ggplot2")
library("clusterProfiler")
library("AnnotationDbi")
library("stats4")
library("BiocGenerics")
library("org.Hs.eg.db")
df <- bitr(unique(deg$symbol),fromType="SYMBOL",
toType=c("ENTREZID"),
OrgDb=org.Hs.eg.db)
head(df)
head(deg)
deg$SYMBOL=rownames(deg)
deg=merge(deg,df,by.y="SYMBOL",by.x="symbol")
head(deg)
gene_up=deg[deg$g=="up","ENTREZID"]
gene_down=deg[deg$g=="down","ENTREZID"]
gene_diff=c(gene_up,gene_down)
#
gene_all=as.character(deg[,"ENTREZID"])
data(geneList,package = "DOSE")
head(geneList)
#这里两个图还没有理解
boxplot(geneList)
boxplot(deg$logFC)
geneList=deg$logFC
names(geneList)=deg$ENTREZID
geneList=sort(geneList,decreasing = T)
## KEGG pathway analysis
if(T){
### over=representation test
kk.up <- enrichKEGG(gene = gene_up,
organism = "hsa",
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk.up)[,1:6]
kk.down <- enrichKEGG(gene = gene_down,
organism = "hsa",
universe = gene_all,
pvalueCutoff = 0.05)
head(kk.down)[,1:6]
kk.diff <- enrichKEGG(gene = gene_diff,
organism = "hsa",
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg <- kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg <- kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
#这里的自定义函数指路https://www.jianshu.com/p/0dd8f42a92ba
source("functions.R")
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename="kegg_up_down.png")
#GSEA
#这里报错不用管
kk_gse <- gseKEGG(geneList = geneList,
organism = "hsa",
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse,geneSetID = rownames(kk_gse[1,]))
down_kegg <- kk_gse[kk_gse$pvalue<0.05&kk_gse$enrichmentScore<0,];down_kegg$group
up_kegg <- kk_gse[kk_gse$pvalue<0.05&kk_gse$enrichmentScore<0,];up_kegg$group
source("functions.R")
#这里有错,待解决:Error in `$<-.data.frame`(`*tmp*`, "pvalue", value = numeric(0)) : 替换数据里有0行,但数据有4
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = "kegg_up_down_gsea.png")
}
#GO databases analysis
{
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
if(F){
go_enrich_results <- lapply(g_list,function(gene){
lapply(c("BP","MF","CC"),function(ont){
cat(paste("Now process",ont))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print(head(ego))
return(ego)
})
})
}
}
#根据需要画图
rm(list = ls())
options(stringsAsFactors = F)
load(file = "expression and pd")
dat[1:4,1:4]
library("hgu133plus2.db")
p2s=toTable(hgu133plus2SYMBOL)
a=p2s$symbol %in% c()
np=p2s[a,1]
ng=p2s[a,2]
dat[np,1:4]
x=dat[np,]
rownames(x)=paste(ng,np,sep = ":")
library('pheatmap')
tmp=data.frame(group=group_list)
rownames(tmp)=colnames(x)
pheatmap(x,annotation_col=tmp)