R/inferDriverDominceScore.R

#' Infer the Driver Dominances Scores for Patients and Genes.
#'
#'
#' @param maf an \code{\link{MAF}} object generated by \code{\link{read.maf}}
#' @param vafCol manually specify column name for vafs. Default looks for column 't_vaf'
#' @param tsb specify sample names (Tumor_Sample_Barcodes) for which clustering has to be done.
#' @param driver The driver genes list(Hugo_Symbol), defaults setting the \strong{topGenes}
#' @param topGenes if \code{driver} is NULL, uses top n number of most frequently mutated genes. Defaults to 10.
#' @param minVaf filter low frequency variants. Low vaf variants maybe due to sequencing error. Default 0. (on the scale of 0 to 1)
#' @param maxVaf filter high frequency variants. High vaf variants maybe due to copy number alterations or impure tumor. Default 1. (on the scale of 0 to 1)
#' @param ignChr ignore these chromosomes from analysis. e.g, sex chromsomes chrX, chrY. Default NULL.
#' @param segFile path to CBS segmented copy number file. Column names should be Sample, Chromosome, Start, End, Num_Probes and Segment_Mean (log2 scale).
#' @param useSyn Use synonymous variants. Default FALSE.

#' @import data.table
#' @import vegan
#' @import magrittr
#' @export

inferDriverDominanceScore = function (maf, tsb = NULL, driver =NULL , topGenes = 10, vafCol = NULL, segFile = NULL, ignChr = NULL, minVaf = 0, maxVaf = 1, useSyn = FALSE ) {

  if (is.null(tsb)) {
    tsb = as.character(maftools::getSampleSummary(x = maf)[, Tumor_Sample_Barcode])
  }

  dat.tsb = maftools::subsetMaf(maf = maf, tsb = tsb, includeSyn = useSyn,
                      mafObj = FALSE)

  if (nrow(dat.tsb) == 0) {
    stop(paste(tsb, "not found in MAF"))
  }
  onlyContigs = as.character(seq(1:22))

  if (!"t_vaf" %in% colnames(dat.tsb)) {
    if (is.null(vafCol)) {
      if (all(c("t_ref_count", "t_alt_count") %in%
              colnames(dat.tsb))) {
        message("t_vaf field is missing, but found t_ref_count & t_alt_count columns. Estimating vaf..")
        dat.tsb[, `:=`(t_vaf, t_alt_count/(t_ref_count +
                                             t_alt_count))]
      }
      else {
        print(colnames(dat.tsb))
        stop("t_vaf field is missing. Use vafCol to manually specify vaf column name.")
      }
    }
    else {
      colnames(dat.tsb)[which(colnames(dat.tsb) == vafCol)] = "t_vaf"
    }
  }

  if (!is.null(segFile)) {
    seg.dat = maftools:::readSegs(segFile)
    seg.dat = seg.dat[Chromosome %in% onlyContigs]
    seg.dat = seg.dat[order(as.numeric(Chromosome))]
    setkey(x = seg.dat, Chromosome, Start_Position, End_Position)
    seg.tsbs = unique(seg.dat[, Sample])
    if (length(seg.tsbs[!seg.tsbs %in% tsb]) > 0) {
      message("CN data for following samples not found. Ignoring them ..")
      print(seg.tsbs[!seg.tsbs %in% tsb])
      seg.tsbs = seg.tsbs[seg.tsbs %in% tsb]
    }
    if (length(seg.tsbs) > 0) {
      seg.dat = seg.dat[Sample %in% seg.tsbs]
    }
    else {
      stop("None of the provided samples have copy number data.")
    }
  }
  clust.dat = c()
  dat.tsb = dat.tsb[, .(Hugo_Symbol, Chromosome, Start_Position,
                        End_Position, Tumor_Sample_Barcode, t_vaf)]
  dat.tsb$Chromosome = as.character(dat.tsb$Chromosome)
  dat.tsb$t_vaf = as.numeric(as.character(dat.tsb$t_vaf))
  dat.tsb = dat.tsb[order(Chromosome)]
  if (max(dat.tsb$t_vaf, na.rm = TRUE) > 1) {
    dat.tsb$t_vaf = dat.tsb$t_vaf/100
  }
  if (!is.null(ignChr)) {
    dat.tsb = dat.tsb[!Chromosome %in% ignChr]
  }
  dat.tsb = dat.tsb[t_vaf > minVaf][t_vaf < maxVaf]
  dat.tsb$Chromosome = gsub(pattern = "chr", replacement = "",
                            x = dat.tsb$Chromosome, fixed = TRUE)
  dat.tsb = suppressWarnings(dat.tsb[order(as.numeric(Chromosome))])

  if( is.null(driver) ){
    driver = as.character(maftools:::getGeneSummary(x = maf)[1:topGenes , Hugo_Symbol])
  }

  dat.tsb = dat.tsb[ Hugo_Symbol %in% driver]

  dat.tsb1 = dat.tsb[, .(Hugo_Symbol, Tumor_Sample_Barcode)] %>%
    group_by(Hugo_Symbol, Tumor_Sample_Barcode) %>%
    summarise( count = n() ) %>%
    reshape2::dcast( Hugo_Symbol ~ Tumor_Sample_Barcode, fill = 0) %>%
    tibble::column_to_rownames(var = "Hugo_Symbol")

  dat.tsb.RowSums = rowSums(dat.tsb1, na.rm = T) #Mutated Genes
  dat.tsb.ColSums = colSums(dat.tsb1, na.rm = T) #Mutated Samples.

  #Genes Dominances.
  GeneDominances = dat.tsb1 %>%
    apply( 2, function(x) x/sum(x, na.rm = T)) %>%
    apply(1 , sum, na.rm = T)/dat.tsb.RowSums

  GeneDominancesSummary = maftools:::getGeneSummary(x = maf)[Hugo_Symbol %in% driver] %>%
    mutate(DominanceScores = GeneDominances) %>%
    mutate(Occurence = AlteredSamples/length(tsb) )

  #Patients Dominances
  PatientsDominances = dat.tsb1 * GeneDominances
  PatientsDominances = apply(PatientsDominances, 2, sum, na.rm = T)/dat.tsb.ColSums
  PatientsDominancesSummary = data.frame( Tumor_Sample_Barcode = names(PatientsDominances),
                                   DominanceScores = PatientsDominances
                                   )

  return( list(dat.tsb1 = dat.tsb1,
              GeneDominancesSummary = GeneDominancesSummary,
              PatientsDominancesSummary= PatientsDominancesSummary) )

}
qingjian1991/THindex documentation built on April 19, 2022, 1:16 p.m.