R/inferHeterogeneityPlus.R

#' Clusters variants and Estimating Tumor Heterogeneity(TH) based on Variant Allele Frequencies (VAF).
#' @description takes output generated by read.maf and clusters variants to infer tumor heterogeneity. This function requires VAF for clustering and density estimation.
#' VAF can be on the scale 0-1 or 0-100. Optionally if copy number information is available, it can be provided as a segmented file (e.g, from Circular Binary Segmentation). Those variants in
#' copy number altered regions will be ignored.
#'
#' @details This function clusters variants based on VAF to estimate univariate density and cluster classification. There are two methods available
#' for clustering. Default using parametric finite mixture models and another method using nonparametric inifinite mixture models (Dirichlet process).
#'
#' @details \emph{Estimate the TH Indices}
#' @details The TH indices are based on diveristy indices or the taxonomic diveristy, see \href{https://cran.r-project.org/web/packages/vegan/vignettes/diversity-vegan.pdf}{vegan}  for details.
#' @details When index = "diveristy", the shannon and reverse simpson index are esimated.
#' @details When index = "taxonomic", the taxonomic diveristy(Delt) and taxonomic distinctness(Dstar) are estimated.
#'
#' @references Chris Fraley and Adrian E. Raftery (2002) Model-based Clustering, Discriminant Analysis and Density Estimation Journal of the American
#' Statistical Association 97:611-631
#'
#' Jara A, Hanson TE, Quintana FA, Muller P, Rosner GL. DPpackage: Bayesian Semi- and Nonparametric Modeling in R. Journal of statistical software. 2011;40(5):1-30.
#'
#' Olshen AB, Venkatraman ES, Lucito R, Wigler M. Circular binary segmentation for the analysis of array-based DNA copy number data. Biostatistics. 2004;5(4):557-72.
#'
#' @param maf an \code{\link{MAF}} object generated by \code{\link{read.maf}}
#' @param vafCol manually specify column name for vafs. Default looks for column 't_vaf'
#' @param tsb specify sample names (Tumor_Sample_Barcodes) for which clustering has to be done.
#' @param index the available methods for the diversity indices estimate. The parames should be "diversity" or "taxonomic", see \strong{Details} for more information.
#' @param bin_size divide the vaf into N(=10 default) bins.
#' @param dirichlet Deprecated! No longer supported. uses nonparametric dirichlet process for clustering. Default FALSE - uses finite mixture models.
#' @param minVaf filter low frequency variants. Low vaf variants maybe due to sequencing error. Default 0. (on the scale of 0 to 1)
#' @param maxVaf filter high frequency variants. High vaf variants maybe due to copy number alterations or impure tumor. Default 1. (on the scale of 0 to 1)
#' @param ignChr ignore these chromosomes from analysis. e.g, sex chromsomes chrX, chrY. Default NULL.
#' @param top if \code{tsb} is NULL, uses top n number of most mutated samples. Defaults to 5.
#' @param segFile path to CBS segmented copy number file. Column names should be Sample, Chromosome, Start, End, Num_Probes and Segment_Mean (log2 scale).
#' @param useSyn Use synonymous variants. Default FALSE.
#'
#' @return list of clustering tables, including:
#' @return clusterData: data of clustering and TH indices.
#' @return clusterMeans: means of clustering.
#' @return diveristy: summarize of TH indices.
#'
#' @author Qingjian Chen modified the code of \code{\link[maftools]{inferHeterogeneity}} and added the TH index. Email:\email{[email protected]@163.com}
#'
#' @examples
#' \dontrun{
#'
#' library(THindex)
#' library(maftools)
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' TCGA.ab.het <- inferHeterogeneityPlus(maf = laml, vafCol = 'i_TumorVAF_WU', index = "diversity")
#' print(TCGA.ab.het$diveristy)
#'
#'}
#' @seealso \code{\link[maftools]{plotClusters}}, \code{\link[maftools]{inferHeterogeneity}}
#'
#' @import data.table
#' @import maftools
#' @import vegan
#' @import tidyverse
#' @import magrittr
#' @import dplyr
#' @import tidyr
#' @import mclust
#' @export

inferHeterogeneityPlus = function (maf, tsb = NULL,index = "diversity", top = 5, vafCol = NULL, segFile = NULL, ignChr = NULL, minVaf = 0, maxVaf = 1, useSyn = FALSE, dirichlet = FALSE, bin_size = 10) {
  if (is.null(tsb)) {
    tsb = as.character(getSampleSummary(x = maf)[1:top, Tumor_Sample_Barcode])
  }

  INDICES <- c("diversity", "taxonomic")
  index <- match.arg(index, INDICES)

  dat.tsb = subsetMaf(maf = maf, tsb = tsb, includeSyn = useSyn,
                      mafObj = FALSE)
  if (nrow(dat.tsb) == 0) {
    stop(paste(tsb, "not found in MAF"))
  }
  onlyContigs = as.character(seq(1:22))
  dirichlet = FALSE
  if (!"t_vaf" %in% colnames(dat.tsb)) {
    if (is.null(vafCol)) {
      if (all(c("t_ref_count", "t_alt_count") %in%
              colnames(dat.tsb))) {
        message("t_vaf field is missing, but found t_ref_count & t_alt_count columns. Estimating vaf..")
        dat.tsb[, `:=`(t_vaf, t_alt_count/(t_ref_count +
                                             t_alt_count))]
      }
      else {
        print(colnames(dat.tsb))
        stop("t_vaf field is missing. Use vafCol to manually specify vaf column name.")
      }
    }
    else {
      colnames(dat.tsb)[which(colnames(dat.tsb) == vafCol)] = "t_vaf"
    }
  }
  if (!is.null(segFile)) {
    seg.dat = readSegs(segFile)
    seg.dat = seg.dat[Chromosome %in% onlyContigs]
    seg.dat = seg.dat[order(as.numeric(Chromosome))]
    setkey(x = seg.dat, Chromosome, Start_Position, End_Position)
    seg.tsbs = unique(seg.dat[, Sample])
    if (length(seg.tsbs[!seg.tsbs %in% tsb]) > 0) {
      message("CN data for following samples not found. Ignoring them ..")
      print(seg.tsbs[!seg.tsbs %in% tsb])
      seg.tsbs = seg.tsbs[seg.tsbs %in% tsb]
    }
    if (length(seg.tsbs) > 0) {
      seg.dat = seg.dat[Sample %in% seg.tsbs]
    }
    else {
      stop("None of the provided samples have copy number data.")
    }
  }
  clust.dat = c()
  dat.tsb = dat.tsb[, .(Hugo_Symbol, Chromosome, Start_Position,
                        End_Position, Tumor_Sample_Barcode, t_vaf)]
  dat.tsb$Chromosome = as.character(dat.tsb$Chromosome)
  dat.tsb$t_vaf = as.numeric(as.character(dat.tsb$t_vaf))
  dat.tsb = dat.tsb[order(Chromosome)]
  if (max(dat.tsb$t_vaf, na.rm = TRUE) > 1) {
    dat.tsb$t_vaf = dat.tsb$t_vaf/100
  }
  if (!is.null(ignChr)) {
    dat.tsb = dat.tsb[!Chromosome %in% ignChr]
  }
  dat.tsb = dat.tsb[t_vaf > minVaf][t_vaf < maxVaf]
  dat.tsb$Chromosome = gsub(pattern = "chr", replacement = "",
                            x = dat.tsb$Chromosome, fixed = TRUE)
  dat.tsb = suppressWarnings(dat.tsb[order(as.numeric(Chromosome))])
  for (i in 1:length(tsb)) {
    message(paste("Processing ", tsb[i], "..",
                  sep = ""))
    tsb.dat = dat.tsb[Tumor_Sample_Barcode %in% tsb[i]]
    tsb.dat = tsb.dat[!is.na(tsb.dat$t_vaf), ]
    if (nrow(tsb.dat) < 3) {
      message("Too few mutations for clustering. Skipping..")
    }
    else {
      tempCheck = 0
      if (!is.null(segFile)) {
        if (tsb[i] %in% seg.tsbs) {
          seg = seg.dat[Sample %in% tsb[i]]
          seg.res = filterCopyNumber(seg, tsb.dat, tempCheck,
                                     tsb[i])
          tsb.dat = seg.res[[1]]
          tsb.dat.cn.vars = seg.res[[2]]
          tempCheck = seg.res[[3]]
        }
      }
      if (dirichlet) {
        tsb.dat = dirichletClusters(tsb.dat)
      }
      else {
        tsb.cluster = mclust::densityMclust(tsb.dat[,
                                                    t_vaf], G = 1:7, verbose = FALSE)
        tsb.dat$cluster = as.character(tsb.cluster$classification)
        abs.med.dev = abs(tsb.dat[, t_vaf] - median(tsb.dat[,
                                                            t_vaf]))
        pat.mad = median(abs.med.dev) * 100
        pat.math = pat.mad * 1.4826/median(tsb.dat[,
                                                   t_vaf])
        tsb.dat$MATH = pat.math
        tsb.dat$MedianAbsoluteDeviation = pat.mad

        diveristy = diversity_partition( data = tsb.dat, method = index, minVaf = minVaf, maxVaf = maxVaf, bin_size = bin_size )$index

        tsb.dat[,names(diveristy[1])] = diveristy[1]
        tsb.dat[,names(diveristy[2])] = diveristy[2]

      }
      tsb.dat = refineClusters(clusters = tsb.dat)
      if (tempCheck == 1) {
        tsb.dat = rbind(tsb.dat, tsb.dat.cn.vars, fill = TRUE)
      }
      clust.dat = rbind(clust.dat, tsb.dat, fill = TRUE)
    }
  }
  clust.dat.mean = clust.dat[, mean(t_vaf), by = .(Tumor_Sample_Barcode,
                                                   cluster)]
  colnames(clust.dat.mean)[ncol(clust.dat.mean)] = "meanVaf"

  if(index == "diversity"){
    diveristy = unique( clust.dat[, .(Tumor_Sample_Barcode, MATH, MedianAbsoluteDeviation, shannon, simpson)])
  }else{
    diveristy = unique( clust.dat[, .(Tumor_Sample_Barcode, MATH, MedianAbsoluteDeviation, Delt, Dstar)])
  }

  return(list(clusterData = clust.dat, clusterMeans = clust.dat.mean, diveristy = diveristy))

}
qingjian1991/THindex documentation built on Jan. 1, 2020, 12:55 a.m.