
Defines functions prepareMutSig

Documented in prepareMutSig

#' Prepares MAF file for MutSig analysis.
#' @description Corrects gene names for MutSig compatibility.
#' @param maf an \code{\link{MAF}} object generated by \code{\link{read.maf}}
#' @param fn basename for output file. If provided writes MAF to an output file with the given basename.
#' @details MutSig/MutSigCV is one of the most widely used programs for detecting driver genes. However, we have observed that the covariates files (gene.covariates.txt and exome_full192.coverage.txt) which are bundled with MutSig have non-standard gene names (non Hugo_Symbols).
#' This discrepancy between Hugo_Symbols in the MAF and non-Hugo_Symbols in the covariates files causes MutSig to ignore such genes. For example, KMT2D - a well known driver gene in Esophageal Carcinoma - is represented as MLL2 in MutSig covariates. This causes KMT2D to be ignored
#' from analysis and reported as an insignificant gene in MutSig results. This function attempts to correct such gene symbols using a manually curated list of gene names compatible with the MutSig covariates list.
#' @return returns a MAF with gene symbols corrected.
#' @examples
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' prepareMutSig(maf = laml)
#' @export

prepareMutSig = function(maf, fn = NULL){

  # Purpose: rename Hugo_Symbols in a MAF to the gene names used by the MutSig
  # covariate files, using the lookup table shipped in maftools' extdata.
  #   maf: object whose @data and @maf.silent slots hold the variant tables.
  #   fn:  optional output basename. Files are written only when fn is not
  #        NULL, and only when at least one symbol was converted.
  # Returns the combined variant table (a data.table) with converted symbols;
  # the original symbols are kept in the OG_Hugo_Symbol column.

  symbolFile = system.file('extdata', 'hugo_to_mutSigSymbol.txt.gz', package = 'maftools')
  if(!nzchar(symbolFile)){
    # system.file() returns "" when the file cannot be found.
    stop('Could not locate hugo_to_mutSigSymbol.txt.gz in the maftools installation.', call. = FALSE)
  }

  if(Sys.info()[['sysname']] == 'Windows'){
    # No zcat on Windows: read through a gz connection. The connection is
    # opened explicitly, so it has to be closed explicitly as well.
    con = gzfile(description = symbolFile, open = 'r')
    hugo.to.ms = tryCatch(
      suppressWarnings(data.table::data.table(utils::read.csv(file = con, header = TRUE, sep = '\t', stringsAsFactors = FALSE))),
      finally = close(con)
    )
  } else{
    # The command is handed to a shell, so quote the path (it may contain spaces).
    hugo.to.ms = data.table::fread(cmd = paste('zcat <', shQuote(symbolFile)), sep = '\t', stringsAsFactors = FALSE)
  }

  mut = maf@data

  # In case the user read the MAF without removing silent variants, remove them here.
  silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron",
             "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA", "De_novo_Start_InFrame", "De_novo_Start_OutOfFrame", "Start_Codon_Ins", "Start_Codon_SNP", "Stop_Codon_Del")

  mut = mut[!Variant_Classification %in% silent]

  mut = rbind(mut, maf@maf.silent, fill = TRUE)
  mut[,OG_Hugo_Symbol := Hugo_Symbol]

  # Split variants into those whose symbol has a MutSig synonym and the rest.
  mafToChange = mut[Hugo_Symbol %in% hugo.to.ms$Hugo_Symbol]
  mafToRetain = mut[!Hugo_Symbol %in% hugo.to.ms$Hugo_Symbol]

  if(nrow(mafToChange) == 0){
    message('No changes done. All gene symbols look okay!')
    return(mut)
  }

  genesToChange = unique(as.character(mafToChange[,Hugo_Symbol]))
  hc = hugo.to.ms[Hugo_Symbol %in% genesToChange]

  # Direct lookup of each symbol's synonym. Every symbol here is known to be
  # present in the table, so match() never yields NA for these rows.
  synonymIdx = match(as.character(mafToChange$Hugo_Symbol), hugo.to.ms$Hugo_Symbol)
  mafToChange$Hugo_Symbol = as.character(hugo.to.ms$MutSig_Synonym[synonymIdx])

  # Conversion summary: original symbol, MutSig synonym, number of variants.
  mc = mafToChange[,.N, Hugo_Symbol]
  conv.tbl = merge(hc, mc, by.x = 'MutSig_Synonym', by.y = 'Hugo_Symbol')
  conv.tbl = conv.tbl[,.(Hugo_Symbol, MutSig_Synonym, N)]
  conv.tbl = conv.tbl[order(N, decreasing = TRUE)]
  message('Converting gene names for ', nrow(mafToChange), ' variants', ' from ', nrow(conv.tbl), ' genes')
  mut = rbind(mafToRetain, mafToChange, fill = TRUE)

  if(!is.null(fn)){
    # Without this guard paste0(NULL, '.mutSig.maf') would silently create a
    # file literally named '.mutSig.maf' in the working directory.
    utils::write.table(x = mut, file = paste0(fn, '.mutSig.maf'), sep = '\t', quote = FALSE, row.names = FALSE)
    utils::write.table(x = conv.tbl, file = paste0(fn, '.correctedSymbols.tsv'), sep = '\t', quote = FALSE, row.names = FALSE)
  }
  message('Original symbols are preserved under column OG_Hugo_Symbol.')

  mut
}
thesushantpatil/maftools documentation built on May 18, 2020, 9:54 p.m.