R/TCGA.R

Defines functions tcgaShorterBarcode extractTCGASampleType tcgabiolinks.get.450k.methylation get.TCGA.Project.Clinical get.TCGA.miRNAExpression get.TCGA.RNA.FPKM tcgabiolinks.get.junction.coverage tcgabiolinks.get.RNA.expression.log2tpm fpkm2tpm get.TCGA.sample.project.infomap

Documented in extractTCGASampleType fpkm2tpm get.TCGA.miRNAExpression get.TCGA.Project.Clinical get.TCGA.RNA.FPKM get.TCGA.sample.project.infomap tcgabiolinks.get.450k.methylation tcgabiolinks.get.junction.coverage tcgabiolinks.get.RNA.expression.log2tpm tcgaShorterBarcode

#' Get sample barcode and TCGA project map information
#'
#'
#' @param project A vector. TCGA-ACC, TCGA-COAD. See TCGAbiolinks::getGDCprojects()$project_id
#' @return A data.frame
#'  X1 proj samples
#'	1	1	TCGA-ACC	TCGA-OR-A5JZ
#'	2	1	TCGA-ACC	TCGA-OR-A5JK
#' @export
#'
#' @examples
get.TCGA.sample.project.infomap <- function(project=NA){

  library(TCGAbiolinks)

  # If not specify project, load default
  if (is.na(project)){
    project <- grep("TCGA", sort(getGDCprojects()$project_id),
                    value = TRUE)
  }


  df <- plyr::adply(project, .margins = 1, .fun = function(proj) {
    samples <- TCGAbiolinks:::getSubmitterID(proj)
    return(data.frame(proj, samples))
  })

  return(df)

}


#' Convert FPKM data fram to TPM
#'
#' @param fpkm.exp.df
#'
#' @return
#' @export
#'
#' @examples
fpkm2tpm <- function(fpkm.exp.df){

  fpkm2tpm <- apply(fpkm.exp.df, 2, function(fpkm){
      tpm <- exp(log(fpkm) - log(sum(fpkm,na.rm=T)) + log(1e6))
      tpm[which(is.na(tpm))] <- 0
      tpm
  })

  return(fpkm2tpm)
}



#' Download RAN expression by TCGAbiolinks
#'
#' @param project E.g. TCGA-LIHC
#' @param dir Raw data directory
#' @param remove.Raw If to remove raw data
#'
#' @return list(clinical = clinical, expression = expression.df, group = group)
#' Expression was log2 transformed.
#' Group is a data.frame including label, short name and sample barcode
#' Clinical is a list
#' @export
#'
#' @examples
tcgabiolinks.get.RNA.expression.log2tpm <- function(project, remove.Raw = FALSE, dir="~/rspace/GDCdata"){

  library(TCGAbiolinks)
  library(SummarizedExperiment)

  fpkm2tpm <- function(fpkm){
    tpm <- exp(log(fpkm) - log(sum(fpkm,na.rm=T)) + log(1e6))
    tpm[which(is.na(tpm))] <- 0
    return(tpm)
  }


  query <- TCGAbiolinks::GDCquery(project = project,
                                  data.category = "Transcriptome Profiling",
                                  workflow.type = "HTSeq - FPKM", # HTSeq - FPKM-UQ or HTSeq - Counts
                                  data.type = "Gene Expression Quantification",
                                  experimental.strategy = "RNA-Seq",
                                  legacy = FALSE
  )
  TCGAbiolinks::GDCdownload(query, directory = dir)
  project.data <- TCGAbiolinks::GDCprepare(query = query, directory = dir)



  # Expression
  expression.df <- SummarizedExperiment::assay(project.data)
  gene.information <- SummarizedExperiment::rowRanges(project.data)
  rm(project.data)
  # convert to tpm
  expression.df <- apply(expression.df, 2, fpkm2tpm)

  normal.sample <- TCGAbiolinks::TCGAquery_SampleTypes(barcode = colnames(expression.df),
                                                       typesample = "NT")
  tumor.sample <- TCGAbiolinks::TCGAquery_SampleTypes(barcode = colnames(expression.df),
                                                      typesample = "TP")

  group <- data.frame(
    Label = rep( c("Normal","Tumor"), c(length(normal.sample), length(tumor.sample))  ),
    Short.Name = substr(c(normal.sample,tumor.sample),1,12),
    Barcode = c(normal.sample,tumor.sample),
    stringsAsFactors = FALSE
  )

  # log2 transformation
  expression.df <- log2(expression.df[,c(normal.sample,tumor.sample)]+1)



  ## clinical
  query <- TCGAbiolinks::GDCquery(project = project,
                                  data.category = "Clinical",
                                  data.type = "Clinical Supplement",
                                  data.format = "BCR Biotab")
  TCGAbiolinks::GDCdownload(query, directory = dir)
  clinical <- TCGAbiolinks::GDCprepare(query, directory = dir)



  if(remove.Raw){
    file.remove( paste(dir,"/",project,sep="",collapse = "")  )
  }

  result <- list(clinical = clinical,
                 expression = expression.df,
                 group = group,
                 Gene.info = gene.information,
                 query = query
  )


}




#' Download junction data by TCGAbiolinks. Pls note, V2 tag and Illumina HiSeq platform were selected
#'
#' @param project E.g. TCGA-LIHC
#' @param dir Raw data directory
#' @param remove.Raw If to remove raw data
#' @param tags V2
#'
#' @return list(clinical = clinical, expression = expression.df, group = group)
#' Group is a data.frame including label, short name and sample barcode
#' Clinical is a list
#' @export
#'
#' @examples
tcgabiolinks.get.junction.coverage <- function(project, remove.Raw = FALSE, dir="~/rspace/GDCdata", tag = 'v2', platform = "Illumina HiSeq"){
  library(TCGAbiolinks)
  query <- TCGAbiolinks::GDCquery(project = project,
                                  legacy = TRUE,
                                  data.category = "Gene expression",
                                  data.type = "Exon junction quantification",
                                  experimental.strategy = "RNA-Seq",
                                  platform = platform

  )

  query[[1]][[1]] <- query[[1]][[1]][grep(tag,query[[1]][[1]]$tags), ]



  TCGAbiolinks::GDCdownload(query, directory = dir)
  project.data <- TCGAbiolinks::GDCprepare(query = query, directory = dir)



  # Expression
  expression.df <- SummarizedExperiment::assay(project.data)
  gene.information <- rowRanges(data)
  rm(project.data)

  normal.sample <- TCGAbiolinks::TCGAquery_SampleTypes(barcode = colnames(expression.df),
                                                       typesample = "NT")
  tumor.sample <- TCGAbiolinks::TCGAquery_SampleTypes(barcode = colnames(expression.df),
                                                      typesample = "TP")

  group <- data.frame(
    Label = rep( c("Normal","Tumor"), c(length(normal.sample), length(tumor.sample))  ),
    Short.Name = substr(c(normal.sample,tumor.sample),1,12),
    Barcode = c(normal.sample,tumor.sample),
    stringsAsFactors = FALSE
  )


  ## clinical
  query <- TCGAbiolinks::GDCquery(project = project,
                                  data.category = "Clinical",
                                  data.type = "Clinical Supplement",
                                  data.format = "BCR Biotab")
  TCGAbiolinks::GDCdownload(query, directory = dir)
  clinical <- TCGAbiolinks::GDCprepare(query, directory = dir)


  if(remove.Raw){
    file.remove( paste(dir,"/",project,sep="",collapse = "")  )
  }

  result <- list(clinical = clinical,
                 expression = expression.df,
                 group = group,
                 Gene.info = gene.information,
                 query = query
  )


}




#' Get TCGA RNA expression (RPKM) data
#'
#' @param tcga.project See ?RTCGA.miRNASeq::miRNASeq. For example: COAD
#' @param log2 If to perform log2 transformation
#'
#' @return
#' @export
#'
#' @examples
#' This function is similar to get.TCGA.miRNAExpression in Biomarker.R file.
#' Data source is illumina hiseq Level 3 RSEM normalized expression data. Data from 2015-11-01 snapshot.
#' The RNAseq gene expression level 3 data contains Reads per Kilobase per Million mapped reads (RPKM)
get.TCGA.RNA.FPKM <- function(tcga.project=NULL, rawCount=FALSE, log2=FALSE){

  if(is.null(tcga.project)){
    warning(?RTCGA.miRNASeq::miRNASeq)
    stop("Please specify a TCGA project. See ?RTCGA.miRNASeq::miRNASeq")
  }




  if (!require(RTCGA))BiocManager::install("RTCGA")
  if (!require(RTCGA.rnaseq))BiocManager::install("RTCGA.rnaseq")
  if (!require(RTCGA.clinical))BiocManager::install("RTCGA.clinical")

  # Expression
  ##-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###
  expression.df = get( paste0(tcga.project,".rnaseq") )

  row.names(expression.df) = expression.df$bcr_patient_barcode
  expression.df = expression.df[,-c(1)]

  patient_ids <- rownames(expression.df)

  # convert string to numeric
  expression.df <- apply(as.matrix.noquote(expression.df),2,as.numeric)

  patient_ids -> rownames(expression.df)

  if(log2){
    expression.df = t(log2(expression.df+1))
  }else{
    expression.df = t( expression.df )
  }

  # Clinical
  ##-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###
  # https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
  group_list <- ifelse(substr(patient_ids,14,15)=='01','Tumor','Normal')

  library(dplyr)
  if(any( ! substr(patient_ids,14,15) %in% c(01,11)  )){
    stop("It seems some samples are not primary tumor or normal samples")
  }


  meta <- get( paste0(tcga.project,".clinical") )
  meta$patient.bcr_patient_barcode <- stringr::str_to_upper( meta$patient.bcr_patient_barcode )

  meta <- meta[match(substr(patient_ids,1,12),
                     meta$patient.bcr_patient_barcode),]

  row.names(meta) <- patient_ids
  meta$Label = group_list

  res = list(meta=meta,
             expr=expression.df,
             label=group_list)

  res
}





#' Get TCGA miRNA expression data
#'
#' @param tcga.project See ?RTCGA.miRNASeq::miRNASeq. For example: COAD
#' @param rawCount If to download raw count data
#' @param CPM If to download CPM data
#' @param log2 If to perform log2 transformation
#'
#' @return
#' @export
#'
#' @examples
get.TCGA.miRNAExpression <- function(tcga.project=NULL, rawCount=FALSE, CPM=FALSE, log2=FALSE){

  if(is.null(tcga.project)){
    warning(?RTCGA.miRNASeq::miRNASeq)
    stop("Please specify a TCGA project. See ?RTCGA.miRNASeq::miRNASeq")
  }
  if(!rawCount & !CPM){
    stop("Please specify download rawCount or CPM. Should only set one as TRUE")
  }else if(rawCount & CPM){
    stop("Should only set one as TRUE. Raw Count or CPM?")
  }



  if (!require(RTCGA))BiocManager::install("RTCGA")
  if (!require(RTCGA.miRNASeq))BiocManager::install("RTCGA.miRNASeq")
  if (!require(RTCGA.clinical))BiocManager::install("RTCGA.clinical")

  # Expression
  ##-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###
  expression.df = get( paste0(tcga.project,".miRNASeq") )

  if(rawCount){
    col.ind <- grep("read_count", expression.df$miRNA_ID)
  }else if(CPM){
    col.ind <- grep("reads_per_million_miRNA_mapped", expression.df$miRNA_ID)
  }

  expression.df = expression.df[col.ind,-c(1,2)]

  patient_ids <- rownames(expression.df)

  # convert string to numeric
  expression.df <- apply(as.matrix.noquote(expression.df),2,as.numeric)

  patient_ids -> rownames(expression.df)

  library(dplyr)
  if(any( ! substr(patient_ids,14,15) %in% c("01","11")  )){
    warning("It seems some samples are not primary tumor or normal samples\nWe have removed\n")
    patient_ids = patient_ids[ substr(patient_ids,14,15) %in% c("01","11") ]
    expression.df = expression.df[patient_ids,]
  }


  if(log2){
    expression.df = t(log2(expression.df+1))
  }else{
    expression.df = t( expression.df )

  }

  # Clinical
  ##-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###-##-###-###-##-###
  # https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
  group_list <- ifelse(substr(patient_ids,14,15)=='01','Tumor','Normal')

  ind = order(group_list)
  group_list = group_list[ind]
  expression.df = expression.df[,ind]
  rm(ind)

  meta <- get( paste0(tcga.project,".clinical") )
  meta$patient.bcr_patient_barcode <- stringr::str_to_upper( meta$patient.bcr_patient_barcode )

  meta <- meta[match(substr(patient_ids,1,12),
                     meta$patient.bcr_patient_barcode),]

  row.names(meta) <- patient_ids
  meta$Label = group_list

  res = list(meta=meta,
             expr=expression.df,
             label=group_list)

  res
}


#' Get TCGA project clinical information
#'
#' @param project ESCA. TCGAbiolinks::getGDCprojects()$project_id without "TCGA_"
#' @param subtype if download subtype information
#' @param microsatellite if download microsatellite data
#'
#' @return
#' @export
#'
#' @examples
get.TCGA.Project.Clinical <- function(project, subtype=FASLE, microsatellite=FALSE){

  library(TCGAbiolinks)

  res = list()

  # 1 Clinical indexed data
  clinical = TCGAbiolinks::GDCquery_clinic(project = paste0(c("TCGA-",project), sep="", collapse = "" ), type = "clinical")
  res$clinical = clinical

  # 2 BCR Biotab: tsv files parsed from XML files
  query <- TCGAbiolinks::GDCquery(project = paste0(c("TCGA-",project), sep="", collapse = "" ),
                    data.category = "Clinical",
                    data.type = "Clinical Supplement",
                    data.format = "BCR Biotab")
  GDCdownload(query)
  clinical.BCRtab.all <- GDCprepare(query)
  res$clinical.BCRtab.all = clinical.BCRtab.all



  # 3 MSI-Mono-Dinucleotide Assay is performed to test a panel of four mononucleotide repeat loci
  if(microsatellite){
    query <- GDCquery(project = paste0(c("TCGA-",project), sep="", collapse = "" ),
                      data.category = "Other",
                      legacy = TRUE,
                      access = "open",
                      data.type = "Auxiliary test"  )
    GDCdownload(query)
    msi_results <- GDCprepare_clinic(query, "msi")

    res$msi = msi_results
  }

  if(subtype){
    subtype = TCGAbiolinks::TCGAquery_subtype(tumor = project)
    res$subtype.info = subtype
  }

  res

}


#' Download 450K methylation data
#'
#' @param project e.g. TCGA-COAD. See getGDCprojects()$project_id
#'
#' @return
#' @export
#'
#' @examples
tcgabiolinks.get.450k.methylation <- function(project){

  library(TCGAbiolinks)
  library(SummarizedExperiment)

  query <- GDCquery(project = project,
                    data.category = "DNA Methylation",
                    platform = "Illumina Human Methylation 450"
  )
  GDCdownload(query)
  met.dat <- GDCprepare(
    query = query,
    summarizedExperiment = TRUE
  )

  clin = data.frame( colData(met.dat) )

  met.dat <- assay(met.dat)

  list(clinical=clin, data=met.dat)

}



TCGA.ids = c("TCGA-3X-AAV9-01", "TCGA-3X-AAVA-01", "TCGA-3X-AAVE-01", "TCGA-4G-AAZO-01", "TCGA-4G-AAZT-01", "TCGA-W5-AA2G-01", "TCGA-W5-AA2I-01", "TCGA-W5-AA2I-11", "TCGA-W5-AA2O-01", "TCGA-W5-AA2Q-01", "TCGA-W5-AA2Q-11", "TCGA-W5-AA2R-01", "TCGA-W5-AA2R-11")


#' Extract TCGA sample types
#'
#' @param TCGA.ids
#'
#' @return
#' @export
#'
#' @examples
#' ids = c("TCGA-FV-A495-01", "TCGA-G3-A3CH-11", "TCGA-G3-A3CH-01", "TCGA-DD-A3A6-11", "TCGA-DD-A3A6-01", "TCGA-DD-A11A-11", "TCGA-DD-A11A-01", "TCGA-CC-A3MB-01", "TCGA-BC-A3KF-01")
#' loonR::extractTCGASampleType(ids)
extractTCGASampleType <- function(TCGA.ids){

  library(dplyr)
# https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes

  code.table = data.table::data.table(
        Code = c("01","02","03","04","05","06","07",
                 "08","09","10","11","12","13",
                 "14","15","16","20","40","50","60",
                 "61","99"),
        Definition = c("Primary Solid Tumor",
                       "Recurrent Solid Tumor",
                       "Primary Blood Derived Cancer - Peripheral Blood",
                       "Recurrent Blood Derived Cancer - Bone Marrow",
                       "Additional - New Primary","Metastatic",
                       "Additional Metastatic","Human Tumor Original Cells",
                       "Primary Blood Derived Cancer - Bone Marrow","Blood Derived Normal",
                       "Solid Tissue Normal","Buccal Cell Normal",
                       "EBV Immortalized Normal","Bone Marrow Normal",
                       "sample type 15","sample type 16",
                       "Control Analyte",
                       "Recurrent Blood Derived Cancer - Peripheral Blood","Cell Lines",
                       "Primary Xenograft Tissue",
                       "Cell Line Derived Xenograft Tissue","sample type 99"),
                 Short.Letter.Code = c("TP","TR","TB","TRBM","TAP","TM",
                       "TAM","THOC","TBM","NB","NT","NBC",
                       "NEBV","NBM","15SH","16SH","CELLC",
                       "TRB","CELL","XP","XCL","99SH")
               )

   code = substr(TCGA.ids, 14, 15)
   short.names = substr(TCGA.ids, 1, 12)

   Sample.table = data.frame(ID=TCGA.ids, Short=short.names, Code=code)

   g.res = loonR::vector.group( v = TCGA.ids, g = code )

   names(g.res) = code.table$Definition[match(names(g.res), code.table$Code)]

   if(sum("Solid Tissue Normal" %in%  names(g.res) )==1){
     paired.samples = intersect(
       substr(g.res$`Solid Tissue Normal`,1,12),
       substr(g.res$`Primary Solid Tumor`,1,12)
       )
     paired.normal.sample = g.res$`Solid Tissue Normal`[match(paired.samples, substr(g.res$`Solid Tissue Normal`,1,12))]
     paired.tumor.sample  = g.res$`Primary Solid Tumor`[match(paired.samples, substr(g.res$`Primary Solid Tumor`,1,12))]

     paried.sample.df = data.frame(Normal = paired.normal.sample,
                                Tumor = paired.tumor.sample,
                                Short = paired.samples)

   }else{
     paried.sample.df = "No paired samples"
   }

   res = list(Sample.table = Sample.table,
              Grouped = g.res,
              Code.description = code.table,
              Paired.samples = paried.sample.df)
   res

}

#' Get shorter TCGA ID
#'
#' @param ids
#'
#' @return
#' @export
#'
#' @examples
tcgaShorterBarcode <- function(ids){
  substr(ids, 1, 16)
}
ProfessionalFarmer/loonR documentation built on Oct. 9, 2024, 9:56 p.m.