R/subsetMaf.R

Defines functions subsetMaf

Documented in subsetMaf

#' Subset MAF objects
#'
#' @description Subsets MAF based on given conditions.
#' @param maf an MAF object generated by \code{\link{read.maf}}
#' @param tsb subset by these samples (Tumor Sample Barcodes)
#' @param genes subset by these genes
#' @param fields include only these fields along with necessary fields in the output
#' @param query query string. e.g, "Variant_Classification == 'Missense_Mutation'" returns only Missense variants.
#' @param clinQuery query by clinical variable.
#' @param ranges subset by ranges. data.frame with 3 column (chr, start, end). Overlaps are identified by  \code{\link{foverlaps}} function with arguments `type = within`, `mult = all`, `nomatch = NULL`
#' @param mult When multiple loci in `ranges` match to the variants maf, mult=. controls which values are returned - "all" , "first" (default) or "last". This value is passed to `mult` argument of \code{\link{foverlaps}}
#' @param mafObj returns output as MAF class \code{\link{MAF-class}}. Default TRUE
#' @param includeSyn Default TRUE, only applicable when mafObj = FALSE. If mafObj = TRUE, synonymous variants will be stored in a seperate slot of MAF object.
#' @param isTCGA Is input MAF file from TCGA source.
#' @param dropLevels Default TRUE.
#' @param restrictTo restrict subset operations to these. Can be 'all', 'cnv', or 'mutations'. Default 'all'. If 'cnv' or 'mutations', subset operations will only be applied on copy-number or mutation data respectively, while retaining other parts as is.
#' @return subset table or an object of class \code{\link{MAF-class}}
#' @seealso \code{\link{getFields}}
#' @examples
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' ##Select all Splice_Site mutations from DNMT3A and NPM1
#' subsetMaf(maf = laml, genes = c('DNMT3A', 'NPM1'),
#' query = "Variant_Classification == 'Splice_Site'")
#' ##Select all variants with VAF above 30%
#' subsetMaf(maf = laml, query = "i_TumorVAF_WU > 30")
#' ##Extract data for samples 'TCGA.AB.3009' and 'TCGA.AB.2933' but only include vaf filed.
#' subsetMaf(maf = laml, tsb = c('TCGA-AB-3009', 'TCGA-AB-2933'), fields = 'i_TumorVAF_WU')
#' ##Subset by ranges
#' ranges = data.frame(chr = c("2", "17"), start = c(25457000, 7571720), end = c(25458000, 7590868))
#' subsetMaf(laml, ranges = ranges)
#'
#' @export

subsetMaf = function(maf, tsb = NULL, genes = NULL, query = NULL, clinQuery = NULL, ranges = NULL, mult = "first", fields = NULL, mafObj = TRUE, includeSyn = TRUE, isTCGA = FALSE, dropLevels = TRUE, restrictTo = 'all'){

  if(all(c(is.null(tsb), is.null(genes), is.null(query), is.null(ranges), is.null(clinQuery)))){
    stop("Please provide sample names or genes or a query or ranges to subset by.")
  }

  restrictTo = match.arg(arg = restrictTo, choices = c("all", "cnv", "mutations"), several.ok = FALSE)

  #Synonymous variants
  maf.silent <- maf@maf.silent
  #Main data
  maf.dat <- maf@data
  #Annotations
  maf.anno <- data.table::copy(x = maf@clinical.data)

  if(!is.null(clinQuery)){
    if(!is.null(tsb)){
      warning("sample names provided to tsb argument will be over written by clinical query", immediate. = TRUE)
    }
    message("-subsetting by clinical data..")
    maf.anno = maf.anno[eval(parse(text = clinQuery))]

    tsb = unique(as.character(maf.anno[,Tumor_Sample_Barcode]))
    if(length(tsb) > 0){
      message(paste0('--', length(tsb)), " samples meet the clinical query")
    }else{
      if(all(c(is.null(query), is.null(genes)))){
        stop("--None of the samples meet the clinical query", call. = FALSE)
      }else{
        message("--None of the samples meet the clinical query")
        maf.anno <- data.table::copy(x = maf@clinical.data)
      }
      tsb = NULL
    }
  }

  if(restrictTo == "cnv"){
    maf.silent.rest = maf.silent[!Variant_Type %in% 'CNV']
    maf.silent = maf.silent[Variant_Type %in% 'CNV']

    maf.dat.rest = maf.dat[!Variant_Type %in% 'CNV']
    maf.dat = maf.dat[Variant_Type %in% 'CNV']
  }else if(restrictTo == "mutations"){
    maf.silent.rest = maf.silent[Variant_Type %in% 'CNV']
    maf.silent = maf.silent[!Variant_Type %in% 'CNV']

    maf.dat.rest = maf.dat[Variant_Type %in% 'CNV']
    maf.dat = maf.dat[!Variant_Type %in% 'CNV']
  }

  #Select
  if(!is.null(tsb)){
    #message("-subsetting by tumor sample barcodes..")
    tsb = as.character(tsb)
    if(isTCGA){
      tsb = substr(x = tsb, start = 1, stop = 12)
    }
    maf.dat = maf.dat[Tumor_Sample_Barcode %in% tsb,]
    maf.silent = maf.silent[Tumor_Sample_Barcode %in% tsb,]
  }

  if(!is.null(genes)){
    #message("-subsetting by genes..")
    genes = as.character(genes)
    maf.dat = maf.dat[Hugo_Symbol %in% genes, ]
    maf.silent = maf.silent[Hugo_Symbol %in% genes, ]
  }

  if(!is.null(query)){
    #message("-subsetting by query..")
    maf.dat = maf.dat[eval(parse(text=query))]
    maf.silent = maf.silent[eval(parse(text=query))]
  }

  default.fields = c('Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2','Variant_Classification', 'Variant_Type', 'Tumor_Sample_Barcode') #necessary fields.

  if(!is.null(fields)){
    default.fields = unique(c(default.fields, fields))

    if(length(default.fields[!default.fields %in% colnames(maf.dat)]) > 0){
      message("Missing fields. Ignoring them.. ")
      print(default.fields[!default.fields %in% colnames(maf.dat)])
      default.fields = default.fields[default.fields %in% colnames(maf.dat)]
    }

    maf.dat = maf.dat[,default.fields, with = FALSE]
    maf.silent = maf.silent[,default.fields, with = FALSE]

    if(restrictTo != "all"){
      maf.dat.rest = maf.dat.rest[,default.fields, with = FALSE]
      maf.silent.rest = maf.silent.rest[,default.fields, with = FALSE]
    }
  }


  if(restrictTo != "all"){
    maf.dat = rbind(maf.dat, maf.dat.rest, fill = TRUE, use.names = TRUE)
    maf.silent = rbind(maf.silent, maf.silent.rest, fill = TRUE, use.names = TRUE)
  }

  if(!is.null(ranges)){
    ranges = data.table::copy(x = ranges)
    colnames(ranges)[1:3] = c("Chromosome", "Start_Position", "End_Position")
    ranges$Chromosome = as.character(ranges$Chromosome)
    ranges$Start_Position = as.numeric(as.character(ranges$Start_Position))
    ranges$End_Position = as.numeric(as.character(ranges$End_Position))

    data.table::setDT(x = ranges)
    data.table::setkey(x = ranges, Chromosome, Start_Position, End_Position)

    maf.dat$Chromosome = as.character(maf.dat$Chromosome)
    maf.dat$Start_Position = as.numeric(as.character(maf.dat$Start_Position))
    maf.dat$End_Position = as.numeric(as.character(maf.dat$End_Position))

    maf.silent$Chromosome = as.character(maf.silent$Chromosome)
    maf.silent$Start_Position = as.numeric(as.character(maf.silent$Start_Position))
    maf.silent$End_Position = as.numeric(as.character(maf.silent$End_Position))

    maf.dat = data.table::foverlaps(x = maf.dat, y = ranges, type = "within", nomatch = NULL, verbose = FALSE, mult = mult)
    maf.silent = data.table::foverlaps(x = maf.silent, y = ranges, type = "within", nomatch = NULL, verbose = FALSE, mult = mult)
    message(paste0(nrow(maf.dat)+nrow(maf.silent), " variants within provided ranges"))
  }


  if(mafObj){

    if(dropLevels){
      maf.silent = droplevels.data.frame(maf.silent)
      maf.dat = droplevels.data.frame(maf.dat)
      maf.anno = droplevels.data.frame(maf.anno)
    }

    if(nrow(maf.dat) == 0){
      stop("Subsetting has resulted in zero non-synonymous variants!")
    }

    mafSummary = summarizeMaf(maf.dat, chatty = FALSE, anno = maf.anno)

    m = MAF(data = maf.dat, variants.per.sample = mafSummary$variants.per.sample, variant.type.summary = mafSummary$variant.type.summary,
              variant.classification.summary = mafSummary$variant.classification.summary, gene.summary = mafSummary$gene.summary,
              summary = mafSummary$summary, maf.silent = maf.silent, clinical.data = droplevels(mafSummary$sample.anno))

    return(m)
  }else{
    if(includeSyn){
      maf.dat = rbind(maf.dat, maf.silent, use.names = TRUE, fill = TRUE)
      if(dropLevels){
        maf.dat = droplevels.data.frame(x = maf.dat)
      }
      return(maf.dat)
    }else{
      if(dropLevels){
        maf.dat = droplevels.data.frame(x = maf.dat)
      }
      return(maf.dat)
    }
  }
}

Try the maftools package in your browser

Any scripts or data that you put into this service are public.

maftools documentation built on Feb. 6, 2021, 2 a.m.