泛癌多组学之基因表达与体细胞拷贝数变异相关性分析

今天小云想带着小伙伴进行泛癌联多组学分析,通过该推文完全掌握泛癌中基因表达与体细胞拷贝数变异相关性分析,通过基因发生的拷贝数变异情况,来研究拷贝数变异对基因表达的影响,从而可以推断出影响该疾病的分子机制,全方面来研究调控机制,分析内容会更有说服性,接下来跟这小云开始今天的学习奥。
1.如何进行基因表达与体细胞拷贝数变异相关性分析?
其实该分析很简单,只需要准备感兴趣基因集文件,泛癌表达矩阵文件和泛癌样本信息文件,分别提取基因集的基因表达和体细胞CNV文件,最后进行二者相关性分析,并绘制相关性气泡图,小云觉得小伙伴肯定能迅速掌握该分析奥,那就跟着小云开始实操学习吧!
2.需要的R包
#下载需要的R包
BiocManager::install("ComplexHeatmap")
install.packages("ggpubr", "randomcoloR")
install.packages("ggplot2")
install.packages("data.table")
BiocManager::install("impute")
#载入需要的R包
library(ggplot2)
library(data.table)
library(impute)
library(ComplexHeatmap)
library(randomcoloR)
library(ggpubr)
library(GSVA)
library(clusterProfiler)
3.数据下载
#泛癌样本信息下载-https://gdc.cancer.gov/about-data/publications/pancanatlas
merged_sample_quality_annotations.tsv,第一列和第二列为样本信息,第三列为肿瘤类型;其他列可以忽略奥。

#泛癌表达矩阵文件下载-https://gdc.cancer.gov/about-data/publications/pancanatlas
EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv,第一列为基因名,其他列为样本名;注意,基因名是以“|”分割,需要做格式转化。

#泛癌体细胞拷贝数变异文件-https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FGistic2_CopyNumber_Gistic2_all_data_by_genes&host=https%3A%2F%2Ftcga.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443
Gistic2_CopyNumber_Gistic2_all_data_by_genes.gz,第一列为基因名,其他列为样本信息。

4.泛癌中基因表达与体细胞拷贝数变异相关性分析
# 获得同时有肿瘤和正常样本的肿瘤名
tumors <- c("BLCA","BRCA","CESC","CHOL","COAD",
"ESCA","GBM","HNSC","KICH","KIRC",
"KIRP","LIHC","LUAD","LUSC","PAAD",
"PRAD","READ","STAD","THCA","UCEC")
# 获得感兴趣的基因集
frg<- c("CDKN1A","HSPA5","TTC35","SLC7A11","NFE2L2","MT1G","HSPB1","GPX4","FANCD2","CISD1","FDFT1","SLC1A5","SAT1",
"TFRC","RPL8","NCOA4","LPCAT3","GLS2","DPP4","CS","CARS","ATP5G3","ALOX15","ACSL4","EMC2")
# 简化泛癌样本信息文件
rawAnno <- read.delim("merged_sample_quality_annotations.tsv",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # 数据来自PanCanAtlas
#提取样本名1-15个字符
rawAnno$simple_barcode <- substr(rawAnno$aliquot_barcode,1,15)
#去除重复的样本名
samAnno <- rawAnno[!duplicated(rawAnno$simple_barcode),c("cancer type", "simple_barcode")]
#去除组织类型为空的数据
samAnno <- samAnno[which(samAnno$`cancer type` != ""),]
write.table(samAnno,"simple_sample_annotation.txt",sep = "\t",row.names = F,col.names = T,quote = F)
# 快速读取表达矩阵文件
expr <- fread("EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv",sep = "\t",stringsAsFactors = F,check.names = F,header = T)
#将数据转换为数据框
expr <- as.data.frame(expr)
#将第一列作为行名
rownames(expr) <- expr[,1]
#删除第一列
expr <- expr[,-1]
# 调整基因名
gene <- sapply(strsplit(rownames(expr),"|",fixed = T), "[",1)
expr$gene <- gene
# 移除重复样本
expr <- expr[!duplicated(expr$gene),]
rownames(expr) <- expr$gene
expr <- expr[,-ncol(expr)]
# 取部分表达谱,感兴趣的基因集
comgene <- intersect(rownames(expr),frg)
expr_sub <- expr[comgene,]
colnames(expr_sub) <- substr(colnames(expr_sub),1,15)
expr_sub <- expr_sub[,!duplicated(colnames(expr_sub))]
#通过循环提取每种肿瘤组织的表达矩阵文件
for (i in tumors) {
message("--",i,"...")
sam <- samAnno[which(samAnno$`cancer type` == i),"simple_barcode"] # 获取该肿瘤的全部样本
comsam <- intersect(colnames(expr_sub), sam) # 取出与基因表达泛癌数据交集的样本
tumsam <- comsam[substr(comsam,14,14) == "0"] # 获得肿瘤样本
norsam <- comsam[substr(comsam,14,14) == "1"] # 获得正常样本
expr_subset <- expr_sub[,c(tumsam,norsam)]
expr_subset[expr_subset < 0] <- 0 # 这份数据里存在负值,即便负值比较小,但也要矫正,如果使用其他泛癌表达谱根据情况而定
expr_subset <- as.data.frame(impute.knn(as.matrix(expr_subset))$data)
write.table(expr_subset, paste0("TCGA_",i,"_expr_subset.txt"),sep = "\t",row.names = T,col.names = NA,quote = F)
}
#快速读取CNV数据
cna <- fread("Gistic2_CopyNumber_Gistic2_all_data_by_genes.gz",sep = "\t",stringsAsFactors = F,header = T,check.names = F)
#将数据转换为数据框
cna <- as.data.frame(cna)
comgene <- intersect(cna$Sample,frg)
# 只包含感兴趣基因的CNA子集
cna <- cna[cna$Sample %in% comgene,]
rownames(cna) <- cna[,1]
cna <- cna[,-1]
#相关性计算
corCnaExpr <- NULL
for (i in tumors) {
message("--",i,"...")
# 读取上面拆分好的表达数据
expr_subset <- read.table(paste0("TCGA_",i,"_expr_subset.txt"),sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
# 取出同时具有表达且具有拷贝数的样本
comsam <- intersect(colnames(expr_subset),colnames(cna))
cna_subset <- cna[,comsam] # 生成拷贝数子集
rownames(cna_subset) <- gsub("EMC2","TTC35",rownames(cna_subset)) # EMC2是原文使用的TTC35基因的同名,使用子集数据时无须修正
expr_subset <- expr_subset[,comsam] # 生成基因表达子集
# 把cnv按癌症名保存到单个文件
write.table(cna_subset, paste0("TCGA_",i,"_broadcnv_subset_matched_expr.txt"),sep = "\t",row.names = T,col.names = T,quote = F)
# 计算相关系数
corTab <- NULL
for (j in rownames(cna_subset)) {
tmp1 <- as.numeric(cna_subset[j,]) # 相应的CNA数据
tmp2 <- as.numeric(expr_subset[j,]) # 相应的表达数据
cor.res <- cor.test(tmp1,tmp2, method = "spearman") # spearman相关性
#将计算的相关性矩阵合并起来
corTab <- rbind.data.frame(corTab,
data.frame(gene = j,
tumor = i,
Correlation = ifelse(is.na(cor.res$estimate), 0, cor.res$estimate),
Pvalue = ifelse(is.na(cor.res$p.value), 1, cor.res$p.value),
stringsAsFactors = F),
stringsAsFactors = F)
}
corCnaExpr <- rbind.data.frame(corCnaExpr,
corTab,
stringsAsFactors = F)
}
5.绘制相关性气泡图
# 设置颜色
blue <- "#4577FF"
red <- "#C2151A"
orange <- "#E45737"
#确定绘图基因的顺序
corCnaExpr$gene <- factor(corCnaExpr$gene,
levels= rev(c("CDKN1A","HSPA5","TTC35","SLC7A11","NFE2L2","MT1G","HSPB1","GPX4","FANCD2","CISD1", "FDFT1","SLC1A5","SAT1","TFRC","RPL8","NCOA4","LPCAT3","GLS2","DPP4","CS","CARS","ATP5G3","ALOX15","ACSL4")))
#绘制泡泡图
my_palette <- colorRampPalette(c(blue,"white",orange), alpha=TRUE)(n=128)
ggplot(corCnaExpr, aes(x=tumor,y=gene)) +
geom_point(aes(size=-log10(Pvalue), color=Correlation)) +
scale_color_gradientn('Correlation',
colors=my_palette) +
scale_size_continuous(range = c(1,12)) + #圆点的大小范围
#修改主题
theme_bw() +
theme(axis.text.x = element_text(angle = 45, size = 12, hjust = 0.3, vjust = 0.5, color = "black"),
axis.text.y = element_text(size = 12, color = rep(c(red,blue),c(14,10))),
axis.title = element_blank(),
panel.border = element_rect(size = 0.7, linetype = "solid", colour = "black"),
legend.position = "bottom",
plot.margin = unit(c(1,1,1,1), "lines"))
#保存图片
ggsave("correlation_between_cna_and_expression_of_interested_genes_in_pancancer.pdf", width = 8,height = 8)
6.结果文件
1. simple_sample_annotation.txt
该文件为提取的简化的泛癌样本信息文件,第一列为肿瘤组织,第二列为样本名。

2. TCGA_BLCA_expr_subset.txt
该文件为提取的感兴趣基因集在特定肿瘤组织中的表达矩阵文件,行名为基因名,列名为样本信息。

3. TCGA_BLCA_broadcnv_subset_matched_expr.txt
该文件为提取的感兴趣基因集在特定肿瘤组织中的CNV文件,行名为基因名,列名为样本信息。

4. TCGA_pancan_correlation_cnv_expr_subset.txt
该文件为计算的感兴趣基因集在泛癌中基因表达与CNV的相关性分析结果文件,第一列为基因名,第二列为肿瘤类型,第三列为计算的相关系数,第四列为Pvalue值。

5. correlation_between_cna_and_expression_of_interested_genes_in_pancancer.pdf
该图为感兴趣基因在泛癌中基因表达与体细胞CNV的相关性气泡图,横坐标为肿瘤组织名称,纵坐标为基因名,点的大小表示pvalue值,用不同颜色表示相关系数大小。

小云通过一文完成了泛癌中感兴趣基因集基因表达与体细胞拷贝数变异之间的相关性分析,该代码只需要提供感兴趣基因集文件就可以完成分析滴,泛癌分析的其他相关内容可以尝试使用本公司新开发的云平台生物信息分析小工具,零代码完成分析,云平台网址:http://www.biocloudservice.com/home.html,主要包括泛癌中特定通路基因的突变分析(http://www.biocloudservice.com/740/740.php),泛癌中拷贝数变异分析(http://www.biocloudservice.com/700/700.php),泛癌中热图和箱线图(http://www.biocloudservice.com/773/773.php),基因在肿瘤与正常组织中差异分析(http://www.biocloudservice.com/710/710.php)等相关泛癌分析小工具
好了,今天小云的分享就到这里啦,完整版代码上“云生信学生物信息学”公正号回复“代码”领取哟~小伙伴们有什么问题欢迎来和小云讨论分享呀。
