R/readGistic.R

Defines functions FUN readGistic

Documented in readGistic

#' Read and summarize gistic output.
#' @description A little function to summarize gistic output files. Summarized output is returned as a list of tables.
#' @param gisticDir Directory containing GISTIC results. Default NULL. If provided all relevent files will be imported. Alternatively, below arguments can be used to import required files.
#' @param gisticAllLesionsFile All Lesions file generated by gistic. e.g; all_lesions.conf_XX.txt, where XX is the confidence level. Required. Default NULL.
#' @param gisticAmpGenesFile Amplification Genes file generated by gistic. e.g; amp_genes.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticDelGenesFile Deletion Genes file generated by gistic. e.g; del_genes.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticScoresFile scores.gistic file generated by gistic.
#' @param cnLevel level of CN changes to use. Can be 'all', 'deep' or 'shallow'. Default uses all i.e, genes with both 'shallow' or 'deep' CN changes
#' @param isTCGA Is the data from TCGA. Default FALSE.
#' @param verbose Default TRUE
#' @return A list of summarized data.
#' @export
#' @examples
#' all.lesions <- system.file("extdata", "all_lesions.conf_99.txt", package = "maftools")
#' amp.genes <- system.file("extdata", "amp_genes.conf_99.txt", package = "maftools")
#' del.genes <- system.file("extdata", "del_genes.conf_99.txt", package = "maftools")
#' scores.gistic <- system.file("extdata", "scores.gistic", package = "maftools")
#' laml.gistic = readGistic(gisticAllLesionsFile = all.lesions, gisticAmpGenesFile = amp.genes, gisticDelGenesFile = del.genes, gisticScoresFile = scores.gistic, isTCGA = TRUE)
#' @details
#' Requires output files generated from GISTIC. Gistic documentation can be found here ftp://ftp.broadinstitute.org/pub/GISTIC2.0/GISTICDocumentation_standalone.htm

readGistic = function(gisticDir = NULL, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL, gisticDelGenesFile = NULL, gisticScoresFile = NULL, cnLevel = 'all', isTCGA = FALSE, verbose = TRUE){

  if(!is.null(gisticDir)){
    lf = list.files(gisticDir)
    lf = lf[startsWith(lf,'all_lesions.conf_')]

    #Parse conf interval from all_lesions.conf_99.txt file
    conf = gsub(pattern = "[A-Z]|[[:punct:]]", replacement = "", x = basename(lf), ignore.case = TRUE)

    gisticAllLesionsFile <- file.path(gisticDir, paste0("all_lesions.conf_",conf,".txt"))
    gisticAmpGenesFile <- file.path(gisticDir, paste0("amp_genes.conf_",conf,".txt"))
    gisticDelGenesFile <- file.path(gisticDir, paste0("del_genes.conf_",conf,".txt"))
    gisticScoresFile <- file.path(gisticDir, "scores.gistic")

    if(!file.exists(gisticAllLesionsFile)){
      gisticAllLesionsFile = NULL
    }

    if(!file.exists(gisticAmpGenesFile)){
      gisticAmpGenesFile = NULL
    }

    if(!file.exists(gisticDelGenesFile)){
      gisticDelGenesFile = NULL
    }

    if(!file.exists(gisticScoresFile)){
      gisticScoresFile = NULL
    }

    #If exists read broad_significance_results.txt
    scoresBroad <- file.path(gisticDir, "broad_significance_results.txt")

    if (file.exists(scoresBroad)){
      broad = data.table::fread(scoresBroad, sep = "\t",
                                    header = TRUE, quote = "", stringsAsFactors = FALSE, skip = "#",
                                    na.strings = "")
    }else{
      broad = NA
    }
  }else{
    broad = NA
  }

  if(is.null(gisticAllLesionsFile)){
    stop('Misisng gisticAllLesionsFile file! Provide all_lesions.conf_xx.txt generated by Gistic')
  }

  if(is.null(gisticAmpGenesFile) & is.null(gisticDelGenesFile)){
    stop('Missing gistic genes files! Provide atleast one of amp_genes.conf_xx.txt or del_genes.conf_xx.txt file.')
  }

  if(is.null(gisticScoresFile)){
    stop('Misisng scores.gistic file!')
  }

  # get 99 from "all_lesions.conf_99.txt", get 95 from "all_lesions.conf_95.txt", etc.
  conf = gsub(pattern = "[A-Z]|[[:punct:]]", replacement = "", x = basename(gisticAllLesionsFile), ignore.case = TRUE)

  if(verbose){
    cat('-Processing Gistic files..\n')
  }

  all.lesions = data.table::fread(input = gisticAllLesionsFile,stringsAsFactors = FALSE, header = TRUE)

  #Remove an empty columns at the end of the table
  if(colnames(all.lesions)[ncol(all.lesions)] == paste('V', ncol(all.lesions), sep='')){
    all.lesions = all.lesions[,1:c(ncol(all.lesions)-1), with = FALSE]
  }
  #Rename columns and fix spaces and dirty underscores
  colnames(all.lesions)[c(1,2,3)] = c('Unique_Name', 'cytoband', 'Wide_Peak_Limits')
  all.lesions$Unique_Name = gsub(pattern = ' ', replacement = '_', x = all.lesions$Unique_Name)
  all.lesions$Unique_Name = gsub(pattern = '__', replacement = '_', x = all.lesions$Unique_Name)

  #Remove actual sn values and get wide peak limits
  all.lesions = all.lesions[!grep(pattern = 'CN_values', x = all.lesions[,Unique_Name]),]
  all.lesions[, Wide_Peak_Limits := sapply(strsplit(x = as.character(all.lesions[,Wide_Peak_Limits]), split = '(', fixed = TRUE), '[', 1)]

  #qvlaues for all wide peaks
  qval = all.lesions[,c(1, 2, 3, 6), with =FALSE]
  colnames(x = qval) = c('Unique_Name', 'Cytoband', 'Wide_Peak_Limits', 'qvalues')

  all.lesions = all.lesions[,c(1,2,3,10:ncol(all.lesions)), with =FALSE]

  #Generate an unique peak-id
  all.lesions$cytoband = paste(substr(x = all.lesions$Unique_Name, start = 1, stop = 3), all.lesions$cytoband, sep = '_')
  all.lesions$cytoband = paste(all.lesions$cytoband, all.lesions$Wide_Peak_Limits, sep = '_')
  qval$peakID = all.lesions[,cytoband]

  all.lesions[,Unique_Name := NULL]
  all.lesions[,Wide_Peak_Limits := NULL]

  cnSamples = colnames(all.lesions)[2:ncol(all.lesions)]

  all.lesions.melt = data.table::melt(all.lesions, id.vars = 'cytoband')

  cnLevel = match.arg(arg = cnLevel, choices = c("all", "deep", "shallow"))
  if(cnLevel == 'all'){
    all.lesions.melt = all.lesions.melt[value %in% c(1, 2)] #1 = shallow amp/del; 2 = deep amp/del
  }else if(cnLevel == 'deep'){
    if(nrow(all.lesions.melt[value %in% 2]) == 0){
      stop("No deep events found!")
    }
    all.lesions.melt = all.lesions.melt[value %in% 2] #1 = shallow amp/del; 2 = deep amp/del
  }else if(cnLevel == 'shallow'){
    if(nrow(all.lesions.melt[value %in% 1]) == 0){
      stop("No shallow events found!")
    }
    all.lesions.melt = all.lesions.melt[value %in% 1] #1 = shallow amp/del; 2 = deep amp/del
  }

  cnGenes = data.table::data.table()

  if(!is.null(gisticAmpGenesFile)){
    if(verbose){
      cat(paste0('--Processing ', basename(gisticAmpGenesFile), '\n'))
    }
    ampGenes = data.table::fread(input = gisticAmpGenesFile, stringsAsFactors = FALSE, header = TRUE)
    if(colnames(ampGenes)[ncol(ampGenes)] == paste('V', ncol(ampGenes), sep='')){
      ampGenes = ampGenes[,1:c(ncol(ampGenes)-1), with = FALSE]
    }
    wpb = as.character(ampGenes[3]) #wide peak boundaries
    wpb = wpb[2:length(wpb)]

    #we need data only from 4th row
    ampGenes = ampGenes[c(4:nrow(ampGenes)),]
    ampGenes$cytoband = gsub(pattern = ' ', replacement = '_', x = ampGenes$cytoband)
    colnames(ampGenes) = c( 'cytoband',paste0( 'Amp_', colnames(ampGenes)[2:length(colnames(ampGenes))], '_',wpb))

    #data.table::setDF(x = ampGenes)
    ampGenes = suppressWarnings(data.table::melt(ampGenes, id.vars = 'cytoband'))
    data.table::setDT(ampGenes)
    #Remove empty values
    ampGenes = ampGenes[!value %in% '']
    #ampGenes$value = sapply(strsplit(x = ampGenes$value, split = '|', fixed = TRUE), '[', 1)
    ampGenes = ampGenes[!grep(pattern = '|', x = ampGenes$value, fixed = TRUE)] #remove genes with ambiguous annotation
    ampGenes = ampGenes[,.(variable, value)]
    cnGenes = rbind(cnGenes, ampGenes)
  }

  if(!is.null(gisticDelGenesFile)){
    if(verbose){
      cat(paste0('--Processing ', basename(gisticDelGenesFile), '\n'))
    }
    delGenes = data.table::fread(input = gisticDelGenesFile, stringsAsFactors = FALSE, header = TRUE)
    if(colnames(delGenes)[ncol(delGenes)] == paste('V', ncol(delGenes), sep='')){
      delGenes = delGenes[,1:c(ncol(delGenes)-1), with = FALSE]
    }

    wpb = as.character(delGenes[3])
    wpb = wpb[2:length(wpb)]

    delGenes = delGenes[c(4:nrow(delGenes)),]
    delGenes$cytoband = gsub(pattern = ' ', replacement = '_', x = delGenes$cytoband)
    colnames(delGenes) = c( 'cytoband',paste0( 'Del_', colnames(delGenes)[2:length(colnames(delGenes))], '_',wpb))

    #data.table::setDF(x = delGenes)
    delGenes = suppressWarnings(data.table::melt(delGenes, id.vars = 'cytoband'))
    data.table::setDT(delGenes)

    delGenes = delGenes[!value %in% '']
    #delGenes$value = sapply(strsplit(x = delGenes$value, split = '|', fixed = TRUE), '[', 1)
    delGenes = delGenes[!grep(pattern = '|', x = delGenes$value, fixed = TRUE)]
    delGenes = delGenes[,.(variable, value)]
    cnGenes = rbind(cnGenes, delGenes)
  }


  if(verbose){
    cat(paste0('--Processing ', basename(gisticScoresFile), '\n'))
  }
  gis.scores = data.table::fread(input = gisticScoresFile)
  colnames(gis.scores) = c('Variant_Classification', 'Chromosome', 'Start_Position', 'End_Position', 'fdr', 'G_Score', 'Avg_amplitude', 'frequency')
  #gis.scores$Variant_Classification = ifelse(test = as.numeric(gis.scores$fdr) > fdrCutOff, yes = gis.scores$Variant_Classification, no = 'neutral')
  #gis.scores$Variant_Classification = factor(gis.scores$Variant_Classification, levels = c('neutral', 'Amp', 'Del'))

  #cnSamples = unique(x = as.character(all.lesions.melt$variable))

  if(verbose){
    cat('--Summarizing by samples\n')
  }
  cnDT = data.table::rbindlist( lapply(X = cnSamples, FUN = function(x){
    cytBands = all.lesions.melt[variable %in% x, cytoband]
    cytBandsGenes = cnGenes[variable %in% cytBands]
    cytBandsGenes[,TumorSampleBarcode := x]
  }))

  cnDT[, CN := substr(x = cnDT[,variable], start = 1, stop = 3)]
  cnDT = cnDT[,.(variable, value, TumorSampleBarcode, CN)]
  colnames(cnDT) = c('Cytoband', 'Hugo_Symbol', 'Tumor_Sample_Barcode', 'Variant_Classification')
  cnDT = suppressWarnings(cnDT[,Variant_Type := 'CNV'])

  if(isTCGA){
    cnDT$Tumor_Sample_Barcode = substr(x = cnDT$Tumor_Sample_Barcode, start = 1, stop = 12)
    cnSamples = substr(x = cnSamples, start = 1, stop = 12)
    cnSamples = unique(x = cnSamples)
  }

  qval$Unique_Name = gsub(pattern = 'Amplification_Peak', replacement = 'AP', x = qval$Unique_Name)
  qval$Unique_Name = gsub(pattern = 'Deletion_Peak', replacement = 'DP', x = qval$Unique_Name)
  qval[, Unique_Name := paste(Unique_Name, Cytoband, sep = ':')]
  cnDT$peakID = as.character(factor(x = cnDT[,Cytoband], levels = qval[,peakID], labels = qval[,Unique_Name]))
  qval[,peakID := NULL]
  cnDT[,Cytoband := NULL]
  colnames(cnDT)[5] = 'Cytoband'

  cnDT[, Tumor_Sample_Barcode := factor(x = Tumor_Sample_Barcode, levels = cnSamples)]
  cnDt.summary = summarizeGistic(gistic = cnDT)
  cnDt.mat = gisticMap(gistic = cnDT)

  cnDt.summary$cytoband.summary = merge(cnDt.summary$cytoband.summary, qval, by.x = 'Cytoband', by.y = 'Unique_Name', all.x = TRUE)
  colnames(cnDt.summary$cytoband.summary)[c(1, 5)] = c('Unique_Name', 'Cytoband')
  cnDt.summary$cytoband.summary = cnDt.summary$cytoband.summary[order(qvalues)]

  g = GISTIC(data = cnDT, cnv.summary = cnDt.summary$cnv.summary,
             cytoband.summary = cnDt.summary$cytoband.summary,
          gene.summary = cnDt.summary$gene.summary,
          cnMatrix = cnDt.mat$oncomat, numericMatrix = cnDt.mat$nummat,
          classCode = cnDt.mat$vc, summary = cnDt.summary$summary, gis.scores = gis.scores)

  attr(g, "broad") = broad
  attr(g, "conf") = conf

  return(g)
}
PoisonAlien/maftools documentation built on April 7, 2024, 2:49 a.m.