maftools: Summarize, Analyze and Visualize MAF Files

Documented in oncodrive

#' Detect cancer driver genes based on positional clustering of variants.
#'
#' @description Clusters variants based on their position to detect disease causing genes.
#' @details This is the re-implimentation of algorithm defined in OncodriveCLUST article. Concept is based on the fact that most of the variants in cancer causing genes are enriched at few specific loci (aka hotspots).
#' This method takes advantage of such positions to identify cancer genes. Cluster score of 1 means, a single hotspot hosts all observed variants. If you use this function, please cite OncodriveCLUST article.
#' @references Tamborero D, Gonzalez-Perez A and Lopez-Bigas N. OncodriveCLUST: exploiting the positional clustering of somatic mutations to identify cancer genes. Bioinformatics. 2013; doi: 10.1093/bioinformatics/btt395s
#' @param maf an \code{\link{MAF}} object generated by \code{\link{read.maf}}
#' @param AACol manually specify column name for amino acid changes. Default looks for field 'AAChange'
#' @param pvalMethod either zscore (default method for oncodriveCLUST), poisson or combined (uses lowest of the two pvalues).
#' @param nBgGenes minimum number of genes required to estimate background score. Default 100. Do not change this unless its necessary.
#' @param minMut minimum number of mutations required for a gene to be included in analysis. Default 5.
#' @param bgEstimate If FALSE skips background estimation from synonymous variants and uses predifined values estimated from COSMIC synonymous variants.
#' @param ignoreGenes Ignore these genes from analysis. Default NULL. Helpful in case data contains large number of variants belonging to polymorphic genes such as mucins and TTN.
#' @return data table of genes ordered according to p-values.
#' @seealso \code{\link{plotOncodrive}}
#' @examples
#'
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' laml.sig <- oncodrive(maf = laml, AACol = 'Protein_Change', minMut = 5)
#'
#' @export


oncodrive = function(maf, AACol = NULL, minMut = 5, pvalMethod = 'zscore', nBgGenes = 100, bgEstimate = TRUE, ignoreGenes = NULL){

  warning("Oncodrive has been superseeded by OncodriveCLUSTL. See http://bg.upf.edu/group/projects/oncodrive-clust.php")

  #Proetin Length source
  gl = system.file('extdata', 'prot_len.txt.gz', package = 'maftools')

  if(Sys.info()[['sysname']] == 'Windows'){
    gl.gz = gzfile(description = gl, open = 'r')
    gl <- suppressWarnings( data.table(read.csv( file = gl.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) )
    close(gl.gz)
  } else{
    gl = data.table::fread(cmd = paste('zcat <', gl), sep = '\t', stringsAsFactors = FALSE)
  }

  pval.options = c('zscore', 'poisson', 'combined')

  if(!pvalMethod %in% pval.options){
    stop('pvalMethod can only be either zscore, poisson or combined')
  }

  if(length(pvalMethod) > 1){
    stop('pvalMethod can only be either zscore, poisson or combined')
  }


  #syn variants for background
  syn.maf = maf@maf.silent
  #number of samples in maf
  numSamples = as.numeric(maf@summary[3,summary])
  #Perform clustering and calculate background scores.
  if(bgEstimate){
    if(nrow(syn.maf) == 0){
      message('No syn mutations found! Skipping background estimation. Using predefined values. (Mean = 0.279; SD = 0.13)')
      bg.mean = 0.279
      bg.sd = 0.13
    }else{
      message('Estimating background scores from synonymous variants..')
      syn.bg.scores = parse_prot(dat = syn.maf, AACol = AACol, gl, m = minMut, calBg = TRUE, nBg = nBgGenes)

      #If number of genes to calculate background scores is not enough, use predefined scores.
      if(is.null(syn.bg.scores)){
        message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)")
        bg.mean = 0.279
        bg.sd = 0.13
      }else {
        if(nrow(syn.bg.scores) < nBgGenes){
          message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)")
          bg.mean = 0.279
          bg.sd = 0.13
        }else{
          bg.mean = mean(syn.bg.scores$clusterScores)
          bg.sd = sd(syn.bg.scores$clusterScores)
          message(paste('Estimated background mean: ', bg.mean))
          message(paste('Estimated background SD: ', bg.sd))
        }
      }
    }
  }else{
    message("Using predefined values for background. (Mean = 0.279; SD = 0.13)")
    bg.mean = 0.279
    bg.sd = 0.13
  }

  #non-syn variants
  non.syn.maf = maf@data

  #Remove genes to ignore
  if(!is.null(ignoreGenes)){
    ignoreGenes.count = nrow(non.syn.maf[Hugo_Symbol %in% ignoreGenes])
    message(paste('Removed', ignoreGenes.count, 'variants belonging to', paste(ignoreGenes, collapse = ', ', sep=',')))
    non.syn.maf = non.syn.maf[!Hugo_Symbol %in% ignoreGenes]
  }

  #Perform clustering and calculate cluster scores for nonsyn variants.
  message('Estimating cluster scores from non-syn variants..')
  nonsyn.scores = parse_prot(dat = non.syn.maf, AACol = AACol, gl = gl, m = minMut, calBg = FALSE, nBg = nBgGenes)

  if(pvalMethod == 'combined'){
    message('Comapring with background model and estimating p-values..')
    nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd
    nonsyn.scores$tPval = 1- pnorm(nonsyn.scores$zscore)
    nonsyn.scores$tFdr = p.adjust(nonsyn.scores$tPval, method = 'fdr')

    nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
    nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]

    counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model
    nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model

    observed_mut_colIndex = which(colnames(nonsyn.scores) == 'total')
    expected_mut_colIndex = which(colnames(nonsyn.scores) == 'Expected')

    #Poisson test to caluclate difference (p-value)
    nonsyn.scores$poissonPval = apply(nonsyn.scores, 1, function(x) {
      poisson.test(as.numeric(x[observed_mut_colIndex]), as.numeric(x[expected_mut_colIndex]))$p.value
    })

    nonsyn.scores$poissonFdr = p.adjust(nonsyn.scores$poissonPval, method = 'fdr')
    nonsyn.scores = nonsyn.scores[order(poissonFdr)]

    nonsyn.scores$fdr = apply(nonsyn.scores[,.(tFdr, poissonFdr)], MARGIN = 1, FUN = min)

  } else if(pvalMethod == 'zscore'){
    #Oncodrive clust way of caluclating pvalues
    #Calculate z scores; compare it to bg scores and estimate z-score, pvalues, corrected pvalues (fdr) (assumes normal distribution)
    message('Comapring with background model and estimating p-values..')
    nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd
    nonsyn.scores$pval = 1- pnorm(nonsyn.scores$zscore)
    nonsyn.scores$fdr = p.adjust(nonsyn.scores$pval, method = 'fdr')

    nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
    nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]
    #nonsyn.scores[,fract_MutatedSamples := MutatedSamples/numSamples]
    nonsyn.scores = nonsyn.scores[order(fdr)]
  }else{
    #Assuming poisson distribution of mutation counts
    #Now model observed number of mutations as a function of number of clusters and protein length. Calculate expected number of events based on poisson distribution.
    nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
    nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]

    counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model
    nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model

    observed_mut_colIndex = which(colnames(nonsyn.scores) == 'total')
    expected_mut_colIndex = which(colnames(nonsyn.scores) == 'Expected')

    #Poisson test to caluclate difference (p-value)
    nonsyn.scores$pval = apply(nonsyn.scores, 1, function(x) {
      poisson.test(as.numeric(x[observed_mut_colIndex]), as.numeric(x[expected_mut_colIndex]))$p.value
    })

    nonsyn.scores$fdr = p.adjust(nonsyn.scores$pval, method = 'fdr')
    nonsyn.scores = nonsyn.scores[order(fdr)]
  }
  message('Done !')
  return(nonsyn.scores)
}