R/getMAFDataTCGA.R

Defines functions getMAFdataTCGA

Documented in getMAFdataTCGA

## To Supress Note
utils::globalVariables(c(".", "..mycols","..tcga_pheno_columns","tempdir","heat.colors","topo.colors"))

#' Function to extract the mutation data in MAF format from TCGA
#' @description This function download and extract the mutation
#'  data in MAF format from TCGA.
#' @author Mayank Tandon, Ashish Jain
#' @param cancerCodes A character vector of TCGA cancer codes
#' @param outputFolder path of the file containing the mutation
#' information in the MAF format
#' @param variant_caller type of variant caller in TCGA
#' @export
#' @return A vector containing the path of the downloaded mutation annotation files
#'
#' @examples
#' library(MAFDash)
#' cancerCodes <- c("ACC","THCA")
#' outputFolderPath <- tempdir()
#' #maf <- getMAFdataTCGA(cancerCodes = cancerCodes,outputFolder = outputFolderPath)
#' @importFrom TCGAbiolinks GDCquery_Maf
#' @importFrom ensurer ensure_that
#' @importFrom dplyr mutate group_by

getMAFdataTCGA<-function(cancerCodes=c("ACC"),outputFolder=tempdir(),variant_caller="mutect2"){

  cancerCodes <- ensurer::ensure_that(cancerCodes,
                                   !is.null(.) && (class(.) == "character") && all(. %in% TCGAbiolinks::getGDCprojects()$tumor),
                                   err_desc = "Please enter the correct TCGA cancer code. See TCGAbiolinks::getGDCprojects().")
  variant_caller <- ensurer::ensure_that(variant_caller,
                                     !is.null(.) && (class(.) == "character"),
                                     err_desc = "Please enter the variant caller type in correct format.")

  tcga_maf_file_list<-c()
  for(cancerCode in cancerCodes)
  {
    outputFolder=file.path(outputFolder,paste0("TCGA_",cancerCode),variant_caller)
    tcga_maf_file=file.path(outputFolder,paste0("TCGA_",cancerCode,".",variant_caller,".maf"))

    if (!file.exists(tcga_maf_file)) {
      if(!dir.exists(outputFolder)) {dir.create(outputFolder, recursive = TRUE)}
      tcga_maf <- TCGAbiolinks::GDCquery_Maf(gsub("TCGA-","",cancerCode),pipelines = variant_caller,directory = outputFolder)
      tcga_maf$Tumor_Sample_Barcode_original <- tcga_maf$Tumor_Sample_Barcode
      tcga_maf$Tumor_Sample_Barcode <-unlist(lapply(strsplit(tcga_maf$Tumor_Sample_Barcode, "-"), function(x) {paste0(x[1:3], collapse="-")}))
      tcga_maf$caller <- variant_caller
      write.table(tcga_maf, file=tcga_maf_file, quote=FALSE, sep="\t", row.names = FALSE, col.names = TRUE)
    }
    tcga_maf_file_list<-c(tcga_maf_file_list,tcga_maf_file)
  }
  # tcga_clinical_file=file.path(outputFolder,paste0("TCGA_",cancerCode,".clinical.txt"))
  # if (! file.exists(tcga_clinical_file)) {
  #   if (!dir.exists(dirname(tcga_clinical_file))) {dir.create(dirname(tcga_clinical_file), recursive = T)}
  #   tcga_clinical <- GDCquery_clinic(project = paste0("TCGA-",cancerCode), type = "clinical")
  #   write.table(tcga_clinical, file=tcga_clinical_file, quote=T, sep="\t", row.names = F, col.names = T)
  # }
  #
  #
  # tcga_clin_data <- read.table(tcga_clinical_file, sep="\t",header = T,stringsAsFactors = F)
  # tcga_clin_data$Tumor_Sample_Barcode <- tcga_clin_data$bcr_patient_barcode
  # tcga_maf <- read.maf(tcga_maf_file, clinicalData = tcga_clin_data)
  #return(list("MAFFilePath"=tcga_maf_file,"MAFObject"=tcga_maf))
  return(tcga_maf_file_list)
}


#' Function to extract the clinical annotations from TCGA
#' @description This function download and extract the clinical
#' annotations from TCGA.
#' @author Mayank Tandon, Ashish Jain
#' @param cancerCodes A character vector of TCGA cancer codes
#' @param outputFolder path of the file containing the clinical
#' annotations from TCGA
#' @param plotdata Flag to plot the annotations
#' @export
#' @return A list containing the TCGA clinical annotations
#'
#' @importFrom TCGAbiolinks GDCquery_clinic
#'
#' @examples
#' library(MAFDash)
#' cancerCodes <- "ACC"
#' #clinicalData <- getTCGAClinicalAnnotation(cancerCodes = cancerCodes)
#'
getTCGAClinicalAnnotation <- function(cancerCodes="ACC",outputFolder=tempdir(), plotdata=NULL) {
  cancerCodes <- ensurer::ensure_that(cancerCodes,
                                      !is.null(.) && (class(.) == "character") && all(. %in% TCGAbiolinks::getGDCprojects()$tumor),
                                      err_desc = "Please enter the correct TCGA cancer code. See TCGAbiolinks::getGDCprojects().")
  plotdata <- ensurer::ensure_that(plotdata,
                                   is.null(.) || (class(.) == "data.frame"),
                                   err_desc = "Please enter the data to be plotted in a correct format.")

  tcgaClinicalAnnotationDataList<-list()
  for(cancerCode in cancerCodes)
  {
    tcga_clinical_file=file.path(outputFolder,paste0("TCGA_",cancerCode,".clinical.txt"))
    if (! file.exists(tcga_clinical_file)) {
      if (!dir.exists(dirname(tcga_clinical_file))) {dir.create(dirname(tcga_clinical_file), recursive = TRUE)}
      tcga_clinical <- TCGAbiolinks::GDCquery_clinic(project = paste0("TCGA-",cancerCode), type = "clinical")
      write.table(tcga_clinical, file=tcga_clinical_file, quote=TRUE, sep="\t", row.names = FALSE, col.names = TRUE)
    }
    tcga_clin_data <- read.table(tcga_clinical_file, sep="\t",header = TRUE,stringsAsFactors = FALSE)
    tcga_clin_data$Tumor_Sample_Barcode <- tcga_clin_data$bcr_patient_barcode
    #tcga_clin_data <- tcga_maf_obj@clinical.data
    tcga_pheno_columns <- c("Tumor_Sample_Barcode","ajcc_pathologic_stage","age_at_diagnosis","gender","race","vital_status","tissue_or_organ_of_origin")
    matched_order=1:nrow(tcga_clin_data)
    if (!is.null(plotdata)) {
      matched_order=match(colnames(plotdata), tcga_clin_data$Tumor_Sample_Barcode, nomatch=0)
    }
    tcga_anno_data <- tcga_clin_data[matched_order,tcga_pheno_columns]
    cancerCode <- paste0(unique(tcga_clin_data$disease), collapse=",")
    tcga_anno_data$Dataset <- cancerCode
    tcgaClinicalAnnotationDataList[[cancerCode]]<-tcga_anno_data
    #anno_data <- tcga_anno_data
  }
  anno_data<-do.call("rbind",tcgaClinicalAnnotationDataList)
  stages=sort(unique(anno_data$ajcc_pathologic_stage))
  #stage_colors <- setNames(brewer.pal(n = length(stages), name = "Reds"), stages)
  stage_colors <- setNames(rainbow(length(stages)), stages)

  anno_data$age_at_diagnosis <- as.numeric(as.character(anno_data$age_at_diagnosis))
  age_range=round(range(anno_data$age_at_diagnosis, na.rm = TRUE),-1)
  age_color_length=10
  age_breaks=round(seq(age_range[1], age_range[2], length.out=age_color_length),0)
  age_color_vals=colorRampPalette(c("lightblue1","royalblue1","navy"))(age_color_length)
  age_colors=colorRamp2(age_breaks, age_color_vals)

  gender_colors=c(female="hotpink", male="cornflowerblue")

  races=sort(unique(anno_data$race))
  #race_colors <- setNames(rev(brewer.pal(n = length(races), name = "Set1")), races)
  race_colors <- setNames(rev(heat.colors(length(races))), races)

  statuses=sort(unique(anno_data$vital_status))
  vitstat_colors <- c(Alive="darkgreen",Dead="darkred")

  tissues=sort(unique(anno_data$tissue_or_organ_of_origin))
  #tissue_colors <- setNames(brewer.pal(n = length(tissues), name = "Dark2"), tissues)
  tissue_colors <- setNames(topo.colors(length(tissues)), tissues)

  # dataset_colors <- setNames(c("mediumorchid1","darkolivegreen1"),
  dataset_colors <- setNames(c("grey30","darkolivegreen1"),
                             c(cancerCode, "Other"))

  anno_colors <- setNames(list(stage_colors, age_colors, gender_colors, race_colors, vitstat_colors, tissue_colors, dataset_colors),
                          setdiff(colnames(anno_data),"Tumor_Sample_Barcode"))


  mycols <- which(!colnames(anno_data) %in% c("Tumor_Sample_Barcode"))
  anno_df <- anno_data[,mycols]
  myanno <- ComplexHeatmap::HeatmapAnnotation(df=anno_df,col = anno_colors)

  return(list(colorList=anno_colors, annodata=anno_data, HManno=myanno))

}

#' Makes reasonable colors for some TCGA clinical annoations
#' @description This will return a list of colors that can be used with TCGA clinical annotations
#' @author Mayank Tandon, Ashish Jain
#' @param ageRange The range of patient's age to generate color vectors
#' @export
#' @return A list containing the TCGA clinical annotations
#'
#' @examples
#' library(MAFDash)
#' colorList <- getTCGAClinicalColors()
getTCGAClinicalColors <- function(ageRange=c(0,100)) {
  #require(RColorBrewer)
  #suppressPackageStartupMessages(require(circlize))
  # tcga_pheno_columns <- c("Tumor_Sample_Barcode","ajcc_pathologic_stage","age_at_diagnosis","gender","race","vital_status","tissue_or_organ_of_origin")
  tcga_pheno_columns <- c("ajcc_pathologic_stage","age_at_diagnosis","gender","race","vital_status")

  # if (is.null(stages)) {
    stages <- paste("Stage",c("X",paste(rep(c("I","II","III","IV"), each=4),c("","A","B","C"), sep="")))
  # }
  #stage_colors <- setNames(colorRampPalette(brewer.pal(n = 5, name = "Reds"))(length(stages)), stages)
  stage_colors <- setNames(colorRampPalette(rainbow(5))(length(stages)), stages)

  # browser()
  ageRange=sort(ageRange)
  age_range=round(c(max(c(0,ageRange[1]-4)),ageRange[2]+4),-1)
  age_color_length=10
  age_breaks=round(seq(age_range[1], age_range[2], length.out=age_color_length),0)
  age_color_vals=colorRampPalette(c("lightblue1","royalblue1","navy"))(age_color_length)
  age_colors=colorRamp2(age_breaks, age_color_vals)

  gender_colors=c(female="hotpink", male="cornflowerblue")
  vitstat_colors <- c(Alive="darkgreen",Dead="darkred","Not Reported"="grey80")

  races=c("american indian or alaska native","asian","black or african american","native hawaiian or other pacific islander","white","not allowed to collect","not reported","other","unknown")
  #race_colors <- setNames(brewer.pal(n = length(races), name = "Set1"), races)
  race_colors <- setNames(heat.colors(length(races)), races)


  # tissues=sort(unique(anno_data$tissue_or_organ_of_origin))
  # tissue_colors <- setNames(brewer.pal(n = length(tissues), name = "Dark2"), tissues)

  # dataset_colors <- setNames(c("mediumorchid1","darkolivegreen1"),
  # dataset_colors <- setNames(c("grey30","darkolivegreen1"),
  #                            c(cancerCode, "Other"))

  anno_colors <- setNames(list(stage_colors, age_colors, gender_colors, race_colors, vitstat_colors),
                          tcga_pheno_columns)

  return(anno_colors)
}

Try the MAFDash package in your browser

Any scripts or data that you put into this service are public.

MAFDash documentation built on April 1, 2022, 9:05 a.m.