R/tcgaClinic.R

Defines functions tcgaClinic

Documented in tcgaClinic

#' Combine the interesting genes with the corresponed clinical traits to analysis
#'
#' use the tcgaEnrichment() fuction to get the enrichment results of DEGs.
#' @import survival progress survminer
#' @param dataPath the direction of download Data
#' @param clinical_data the downloaded clinical.tsv Data
#' @param transformed default is Ture,log2 transformed the gene expression,else not transform applied
#' @param groupby the quantitle threshold to determine the gene is relatively high expression or low expression in samples,and the range is 0.1~0.9.
#' @export
tcgaClinic<-function(dataPath,clinical_data,transfromed=T,groupby=0.5){
  message("读入临床数据,进行初步生存分析...",appendLF =T)
  setwd(dataPath)
  load(file=".//output//step05.RData")
  #带有symbold的TCGA差异表达结果:TCGA_result_genesymbol
  load(file=".//output//step07.RData")
  clinical_traits_cancer<-read.table(clinical_data,
                                     stringsAsFactors = F,sep = "\t",header = T,quote ="")
  #去重
  clinical_traits_cancer<-clinical_traits_cancer[!duplicated(clinical_traits_cancer$case_id),]
  clinical_traits_cancer_selected<-clinical_traits_cancer[,c("days_to_death",
                                                             "tumor_stage",
                                                             "days_to_last_follow_up",
                                                             "gender",
                                                             colnames(clinical_traits_cancer)[grepl("submitter",
                                                                                                    colnames(clinical_traits_cancer))])]

  #followup+death
  clinical_traits_cancer_selected$days_to_death=as.numeric(as.character(clinical_traits_cancer_selected$days_to_death))
  clinical_traits_cancer_selected$days_to_last_follow_up=as.numeric(as.character(clinical_traits_cancer_selected$days_to_last_follow_up))
  day0=ifelse(is.na(clinical_traits_cancer_selected$days_to_last_follow_up),0,clinical_traits_cancer_selected$days_to_last_follow_up)
  day1=ifelse(is.na(clinical_traits_cancer_selected$days_to_death),0,clinical_traits_cancer_selected$days_to_death)
  clinical_traits_cancer_selected$final_days=as.numeric(day0)+as.numeric(day1)
  clinical_traits_cancer_selected$survive_group<-ifelse(is.na(clinical_traits_cancer_selected$days_to_death),0,1)

 #表达矩阵在x_d中
  #差异分析结果在result中
  #TCGA_result 为过滤之后的结果
  x_td=as.data.frame(x_d)
  #d=TCGA_result_genesymbol
  #获取表达矩阵
  interesting_genes_matrix=x_td[match(TCGA_result_genesymbol$ensembl_gene_id,rownames(x_td)),]
  #选取肿瘤样本
  interesting_genes_matrix_df=interesting_genes_matrix[,group=="cancer"]

  #获取注释结果,同时对没有注释结果的ID进行预处理
  interesting_genes_df=data.frame(ensembl_gene_id=TCGA_result_genesymbol$ensembl_gene_id)
  ipo=data.frame(ensembl_gene_id=TCGA_result_genesymbol$ensembl_gene_id,
                 hgnc_symbol=TCGA_result_genesymbol$hgnc_symbol)
  c=merge(interesting_genes_df,ipo,all.x=T)
  c$ensembl_gene_id=as.character(c$ensembl_gene_id)
  c$hgnc_symbol=as.character(c$hgnc_symbol)
  c$hgnc_symbol=ifelse(is.na(c$hgnc_symbol),c$ensembl_gene_id,
                       ifelse(c$hgnc_symbol=="",c$ensembl_gene_id,c$hgnc_symbol)
  )
  rownames(interesting_genes_matrix_df)=c$hgnc_symbol
  rm(interesting_genes_df)

  #替换表达矩阵中的列名,需要和clinic_traits中的submitter_id保持一致
  colnames(interesting_genes_matrix_df)=substr(colnames(interesting_genes_matrix_df),1,12)
  #表达矩阵转置
  interesting_genes_df=as.data.frame(t(interesting_genes_matrix_df))

  clinical_traits_cancer_selected[,5]=gsub("\\-","\\.",clinical_traits_cancer_selected[,5])
  rownames(interesting_genes_df)=gsub("\\-","\\.",rownames(interesting_genes_df))

  #有“-”或“ ”的出现,不能进行下游的生存分析
  colnames(interesting_genes_df)=gsub("[[:punct:]]","\\_",colnames(interesting_genes_df))
  #选出对应样本的病理信息
  clinical_traits_cancer_selected_x<-clinical_traits_cancer_selected[match(rownames(interesting_genes_df),
                                                                           clinical_traits_cancer_selected[,5]),]
  #将两组数据结果:表达矩阵+病理信息
  #先转换
  #1. log2转换与否
  #2. median分组或者分位数分组

  if(transfromed){
    clinical_traits_cancer_selected_group<-apply(interesting_genes_df,2,
                                                 function(x){
                                                   ifelse(log2(x)<quantile(log2(x),groupby),"low expression","high expression")
                                                 }
    )
    message(paste0("采用Log对基因表达值进行转换,使用",round(groupby*100,2),"%分位数,对基因表达高低进行分组"),appendLF=T)
  }else{
    clinical_traits_cancer_selected_group<-apply(interesting_genes_df,2,
                                                 function(x){
                                                   ifelse(x<quantile(x,groupby),"low expression","high expression")
                                                 }
    )
    message(paste0("未对基因表达值进行转换,使用",round(groupby*100,2),"%分位数,对基因表达高低进行分组"),appendLF=T)
  }

  clinical_traits_cancer_selected_group_df=as.data.frame(clinical_traits_cancer_selected_group)
  clinical_traits_cancer_selected_merge<-cbind(clinical_traits_cancer_selected_x,
                                               clinical_traits_cancer_selected_group_df)

  geneList<<-colnames(interesting_genes_df)
  #y <- survival::Surv(as.numeric(clinical_traits_cancer_selected_merge$final_days),
  #         clinical_traits_cancer_selected_merge$survive_group)

  Cairo::CairoPDF(file = ".//output//step09_survive_analysis.pdf",
                  width = 8,height = 5,
                  family="Arial")
  #pvalues=rep(0,length(geneList))
  #p_genes=rep(0,length(geneList))
  pvalues<<-rep(0,length(geneList))
  for(i in 1:length(geneList)){
    geneName<<-geneList[i]
    clinical_traits_cancer_selected_mergex<<-clinical_traits_cancer_selected_merge
    fit<-survival::survfit(
      #始终还是不行
      #那就把“-”,替换为"_"把
      eval(parse(text = sprintf("survival::Surv(final_days,survive_group)~`%s`",geneName))
           ),
      #survival::Surv(final_days,survive_group)~eval(parse(text=geneName)),
      data=clinical_traits_cancer_selected_mergex
    )

    p<<-survminer::ggsurvplot(fit,data=clinical_traits_cancer_selected_merge,
               legend = c(0.8,0.8),
               legend.title = geneName,
               legend.labs = c("High Expression", "Low Expression"),
               pval = TRUE,
               conf.int = TRUE
               )+ggplot2::labs(x="Time (Days)")

    print(p)
    #获取logRank检验的P值
    logRanks=survminer::surv_pvalue(fit)
    pvalues[i]=round(logRanks$pval,3)
 }
  dev.off()

  pgenes=data.frame(pvalues,GeneSymbol=geneList)
  write.csv(pgenes,".//output//step09_survivalOfGenes.csv")
  write.csv(clinical_traits_cancer_selected,".//output//step09_followUpInformation.csv")
  message("Complete!")

}
dming1024/TCGApackages0226 documentation built on April 9, 2021, 7:48 a.m.