maftools: Summarize, Analyze and Visualize MAF Files

Documented in annovarToMaf

#' Converts annovar annotations into MAF.
#'
#' @description Converts variant annotations from Annovar into a basic MAF.
#' @details Annovar is one of the most widely used Variant Annotation tools in Genomics. Annovar output is generally in a tabular format with various annotation columns.
#' This function converts such annovar output files into MAF. This function requires that annovar was run with gene based annotation as a first operation, before including
#' any filter or region based annotations. Please be aware that this function performs no transcript prioritization.
#'
#' e.g,
#' table_annovar.pl example/ex1.avinput humandb/ -buildver hg19 -out myanno -remove -protocol (\code{refGene}),cytoBand,dbnsfp30a -operation (\code{g}),r,f -nastring NA
#'
#' This function mainly uses gene based annotations for processing, rest of the annotation columns from input file will be attached to the end of the resulting MAF.
#' @param annovar input annovar annotation file. Can be vector of multiple files.
#' @param Center Center field in MAF file will be filled with this value. Default NA.
#' @param refBuild NCBI_Build field in MAF file will be filled with this value. Default hg19.
#' @param tsbCol column name containing Tumor_Sample_Barcode or sample names in input file.
#' @param table reference table used for gene-based annotations. Can be 'ensGene' or 'refGene'. Default 'refGene'
#' @param ens2hugo If `table` is `ensGene`, setting this argument to `TRUE` converts all ensemble IDs to hugo symbols.
#' @param basename If provided writes resulting MAF file to an output file.
#' @param sep field seperator for input file. Default tab seperated.
#' @param MAFobj If TRUE, returns results as an \code{\link{MAF}} object.
#' @param sampleAnno annotations associated with each sample/Tumor_Sample_Barcode in input annovar file. If provided it will be included in MAF object. Could be a text file or a data.frame. Ideally annotation would contain clinical data, survival information and other necessary features associated with samples. Default NULL.
#' @references Wang, K., Li, M. & Hakonarson, H. ANNOVAR: functional annotation of genetic variants from high-throughput sequencing data. Nucleic Acids Res 38, e164 (2010).
#' @return MAF table.
#' @examples
#' var.annovar <- system.file("extdata", "variants.hg19_multianno.txt", package = "maftools")
#' var.annovar.maf <- annovarToMaf(annovar = var.annovar, Center = 'CSI-NUS', refBuild = 'hg19',
#' tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene')
#' @export

annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL, table = 'refGene', ens2hugo = TRUE, basename = NULL , sep = '\t', MAFobj = FALSE, sampleAnno = NULL){

  start_time = proc.time()
  cat("-Reading annovar files\n")
  ann = lapply(annovar, data.table::fread, colClasses = 'character', sep = sep, stringsAsFactors = FALSE,
               fill = TRUE, header=TRUE, skip = "Chr")
  names(ann) = unlist(data.table::tstrsplit(x = basename(annovar), split = "\\.", keep = 1))
  ann = data.table::rbindlist(l = ann, fill = TRUE, idcol = "sample_id", use.names = TRUE)

  #Check to see if input file contains sample names
  if(is.null(tsbCol)){
    if(! 'Tumor_Sample_Barcode' %in% colnames(ann)){
      colnames(ann)[which(colnames(ann) == "sample_id")] = 'Tumor_Sample_Barcode'
      cat("--Tumor_Sample_Barcode column not found. Creating sample IDs from filenames\n")
    }
  }else{
    colnames(ann)[which(colnames(ann) == tsbCol)] = 'Tumor_Sample_Barcode'
  }

  #Table options. See here: http://annovar.openbioinformatics.org/en/latest/user-guide/download/ (not considering UCSC known genes options for now)
  table = match.arg(arg = table, choices = c('refGene', 'ensGene'))

  if(table == 'ensGene'){
    colnames(ann)[which(colnames(ann) == 'Func.ensGene')] = 'Func.refGene'
    colnames(ann)[which(colnames(ann) == 'Gene.ensGene')] = 'Gene.refGene'
    colnames(ann)[which(colnames(ann) == 'ExonicFunc.ensGene')] = 'ExonicFunc.refGene'
    colnames(ann)[which(colnames(ann) == 'AAChange.ensGene')] = 'AAChange.refGene'
    colnames(ann)[which(colnames(ann) == 'GeneDetail.ensGene')] = 'GeneDetail.refGene'
  }

    essential.col = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene',
                      'ExonicFunc.refGene', 'AAChange.refGene')

    #Change column names to standard names;
    for(i in 1:length(essential.col)){
      colId = suppressWarnings(grep(pattern = paste0('^', essential.col[i], '$'), x = colnames(ann), ignore.case = TRUE))
      if(length(colId) == 1){
        colnames(ann)[colId] = essential.col[i]
      }
     }

    if(length(essential.col[!essential.col %in% colnames(ann)]) > 0) {
      message('Available fields:')
      print(colnames(ann))
      message(paste0('Missing required field in input file: '))
      print(essential.col[!essential.col %in% colnames(ann)])
      stop()
    }

  #Center
  if(is.null(Center)){
    Center = NA
  }

    #If multiple genes are assigned, used the first entry (which correpsonds to closest gene)
    ann[, Hugo_Symbol := unlist(data.table::tstrsplit(Gene.refGene, split = ";", keep = 1))]

    #Annovar to MAF mappings (http://annovar.openbioinformatics.org/en/latest/user-guide/gene/)
    annovar_values = c(
      exonic = "RNA",
      splicing = "Splice_Site",
      UTR5 = "5'UTR",
      UTR3 = "3'UTR",
      intronic = "Intron",
      upstream = "5'Flank",
      downstream = "3'Flank",
      intergenic = "IGR",
      `frameshift insertion` = "Frame_Shift_Ins",
      `frameshift deletion` = "Frame_Shift_Del",
      `frameshift block substitution` = "Frameshift_INDEL",
      `frameshift substitution` = "Frameshift_INDEL",
      stopgain = "Nonsense_Mutation",
      stoploss = "Nonstop_Mutation",
      startloss = "Translation_Start_Site",
      startgain = "Unknown", #Can not properly map MAF classification. Setting it to Unknown
      `nonframeshift insertion` = "In_Frame_Ins",
      `nonframeshift deletion` = "In_Frame_Del",
      `nonframeshift block substitution` = "Inframe_INDEL",
      `nonframeshift substitution` = "Inframe_INDEL",
      `nonsynonymous SNV` = "Missense_Mutation",
      `synonymous SNV` = "Silent",
      unknown = "Unknown",
      ncRNA_exonic = "RNA",
      ncRNA_intronic = "RNA",
      ncRNA_UTR3 = "RNA",
      ncRNA_UTR5 = "RNA",
      ncRNA = "RNA",
      ncRNA_splicing = "RNA"
    )

    #Choose first entry for mixed annotations (e.g; exonic;splicing)
    ann[, Func.refGene := unlist(data.table::tstrsplit(x = as.character(Func.refGene), split = ";", keep = 1))]
    ann[, ExonicFunc.refGene := unlist(data.table::tstrsplit(x = as.character(ExonicFunc.refGene), split = ";", keep = 1))]
    ann$Variant_Classification = ifelse(is.na(ann$ExonicFunc.refGene), annovar_values[ann$Func.refGene], annovar_values[ann$ExonicFunc.refGene])

    if(nrow(ann[!is.na(AAChange.refGene)]) > 0){
      cat("--Extracting tx, exon, txchange and aa-change\n")
      ann_change = ann[!is.na(AAChange.refGene)]
      aa_change = unlist(data.table::tstrsplit(x = as.character(ann_change$AAChange.refGene),split = ',', fixed = TRUE, keep = 1))

      aa_tbl = lapply(aa_change, function(x){
        x = unlist(strsplit(x = x, split = ":", fixed = TRUE))

        if(length(x) == 5){
          #column contains the gene name, the transcript identifier, exon, tx-change, aa-change
          #If these entries are missing, fill them with NAs
          tx = x[2]
          exon = x[3]
          txChange = x[4]
          aaChange = x[5]
        }else{
          tx = NA
          exon = NA
          txChange = NA
          aaChange = NA
        }

        data.table::data.table(tx, exon, txChange, aaChange)
      })
      aa_tbl = data.table::rbindlist(l = aa_tbl)
      if(length(aa_change) != nrow(ann_change)){
        stop("Something went wrong while parsing aa-change")
      }
      ann_change = cbind(ann_change, aa_tbl)
      ann = data.table::rbindlist(l = list(ann_change, ann[is.na(AAChange.refGene)]), use.names = TRUE, fill = TRUE)
    }

    #Add Variant-type annotations based on difference between ref and alt alleles
    cat("-Adding Variant_Type\n")
    ann$Variant_Type = ifelse(endsWith(x = ann$Variant_Classification, suffix = "_Del"), yes = "DEL", no = NA)
    ann$Variant_Type = ifelse(endsWith(x = ann$Variant_Classification, suffix = "_Ins"), yes = "INS", no = ann$Variant_Type)
    ann$Variant_Type = ifelse(ann$Variant_Classification %in% c("Missense_Mutation", "Silent") , yes = "SNP", no = ann$Variant_Type)

    ann[,ref_alt_len := nchar(Ref) + nchar(Alt)]
    ann[,ref_alt_diff := nchar(Ref) - nchar(Alt)]
    ann$Variant_Type = ifelse(ann$ref_alt_diff < 0 , yes = "INS", no = ann$Variant_Type)
    ann$Variant_Type = ifelse(ann$ref_alt_diff > 0 , yes = "DEL", no = ann$Variant_Type)

     #For ambiguous variants such as DNPs, TNPs and ONPs
    if(nrow(ann[is.na(Variant_Type)]) > 0){
      ann[, ref_alt := paste(Ref, Alt, sep = ">")]
      ann_na = ann[is.na(Variant_Type)]
      ann_na$Variant_Type = ifelse(test = ann_na$ref_alt %in% paste("-", c("A", "T", "G", "C"), sep = ">"), yes = "INS",
                                   no = ifelse(test = ann_na$ref_alt %in% paste(c("A", "T", "G", "C"), "-", sep = ">"), yes = "DEL",
                                               no = ifelse(ann_na$ref_alt_len == 2, yes = "SNP", no = ifelse(test = ann_na$ref_alt_len == 4, yes = "DNP",
                                                                                                             no = "ONP"))))
      ann[, ref_alt := NULL]
      ann = data.table::rbindlist(l = list(ann[!is.na(Variant_Type)], ann_na), use.names = TRUE, fill = TRUE)
    }

  #Annovar ensGene doesn't provide HGNC gene symbols as Hugo_Symbol. We will change them manually.
  if(table == 'ensGene'){
    if(ens2hugo){
      ens = system.file('extdata', 'ensGenes.txt.gz', package = 'maftools')
      cat('-Converting Ensemble Gene IDs into HGNC gene symbols\n')
      ens = data.table::fread(file =  ens, sep = '\t', stringsAsFactors = FALSE)

      ann = merge(ann, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = TRUE)
      ann[,ens_id := Hugo_Symbol] #Backup original ids
      ann[,Hugo_Symbol := hgnc_symbol] #Add GHNC gene names
      ann[,Entrez_Gene_Id := Entrez] #Add entrez identifiers.
      cat('--Done. Original ensemble gene IDs are preserved under field name ens_id\n')
    }
  }
    ann[, ref_alt_diff := NULL]
    ann[, ref_alt_len := NULL]

    #MAF requires Hugo_Symbols to be set to Unknown in case of IGR mutations
    ann$Hugo_Symbol = ifelse(ann$Variant_Classification == "IGR", yes = "Unknown", no = ann$Hugo_Symbol)
    ann$Hugo_Symbol = ifelse(is.na(ann$Hugo_Symbol), yes = "Unknown", no = ann$Hugo_Symbol)

  #Re-roganize columns
  colnames(ann)[match(c("Chr", "Start", "End", "Ref", "Alt"), colnames(ann))] = c("Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2")
  ord1 = colnames(x = ann)[colnames(x = ann) %in% c("Hugo_Symbol", "Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode", "tx", "exon", "txChange", "aaChange")]
  ord2 = colnames(x = ann)[!colnames(x = ann) %in% c("Hugo_Symbol", "Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode", "tx", "exon", "txChange", "aaChange")]
  ann = ann[,c(ord1, ord2), with = FALSE]

  if(!is.null(basename)){
    data.table::fwrite(x = ann, file = paste(basename, 'maf', sep='.'), sep='\t')
  }

  cat("Finished in",data.table::timetaken(start_time),"\n")

  if(MAFobj){
    m = read.maf(maf = ann, clinicalData = sampleAnno, verbose = FALSE)
    return(m)
  }else{
    return(ann)
  }
}