# R/extractSignatures.R
#
# Defines functions: extractSignatures
#
# Documented in: extractSignatures

#' Extract mutational signatures from trinucleotide context.
#'
#' @description Decompose a matrix of 96 substitution classes into \code{n} signatures.
#'
#' @details This function decomposes a non-negative matrix into \code{n} signatures by calling \code{NMF::nmf} with a fixed seed (no \code{method} is passed, so the NMF default algorithm is used).
#' The input is transposed before factorization, so the basis holds the signatures and the coefficients hold the per-sample contributions.
#' Substitution classes with zero mutations across all samples are reported via \code{message} but are not altered.
#'
#' @param mat Input object generated by \code{\link{trinucleotideMatrix}}. Its \code{nmf_matrix} element (a matrix of dimension nx96) is used.
#' @param n decompose matrix into n signatures. Must be supplied as a single positive number: although the default is NULL, this function does not estimate the rank itself and stops with an error when \code{n} is NULL.
#' @param plotBestFitRes Default FALSE. Currently not used by this function; kept for backward compatibility.
#' @param parallel Default 4. Number of cores to use (passed to \code{NMF::nmf} as \code{.opt = "P<parallel>"}). Set to NULL to call \code{NMF::nmf} without this option.
#' @param pConstant A small positive value to add to the matrix. Use it ONLY if the functions throws an \code{non-conformable arrays} error
#' @return a list with four elements: \code{signatures} (basis matrix with every signature scaled to sum to 1), \code{contributions} (coefficients scaled to sum to 1 within each sample), \code{nmfObj} (the object returned by \code{NMF::nmf}) and \code{contributions_abs} (unscaled coefficients).
#' @examples
#' \dontrun{
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' laml.tnm <- trinucleotideMatrix(maf = laml, ref_genome = 'BSgenome.Hsapiens.UCSC.hg19', prefix = 'chr',
#' add = TRUE, useSyn = TRUE)
#' library("NMF")
#' laml.sign <- extractSignatures(mat = laml.tnm, plotBestFitRes = FALSE, n = 2, pConstant = 0.01)
#' }
#' @seealso \code{\link{trinucleotideMatrix}} \code{\link{plotSignatures}} \code{\link{compareSignatures}}
#' @export
extractSignatures = function(mat, n = NULL, plotBestFitRes = FALSE, parallel = 4, pConstant = NULL){

  start_time = proc.time()

  #Validate arguments up front so that bad input fails with a clear message
  #instead of an obscure error from inside NMF (or from `if(n == 1)` on NULL).
  if(is.null(n) || !is.numeric(n) || length(n) != 1L || is.na(n) || n < 1){
    stop("n must be a single positive number giving the number of signatures to extract")
  }

  nmf_matrix = mat$nmf_matrix
  if(is.null(nmf_matrix)){
    stop("mat must contain an 'nmf_matrix' element")
  }

  #To avoid error due to non-conformable arrays
  if(!is.null(pConstant)){
    if(!is.numeric(pConstant) || length(pConstant) != 1L || is.na(pConstant) || pConstant <= 0){
      stop("pConstant must be > 0")
    }
  }

  #transpose matrix: rows become substitution classes, columns become samples
  mat = t(nmf_matrix)

  #Report substitution classes without a single mutation. They are left
  #untouched; this happens when sample size is low or in cancers with low mutation rate.
  zeroMutClass = names(which(rowSums(mat) == 0))
  if(length(zeroMutClass) > 0){
    message('-Found zero mutations for conversions:')
    for(temp in zeroMutClass){
      message(paste0("  ", temp))
    }
  }

  if(!is.null(pConstant)){
    mat = mat + pConstant
  }

  #Notes:
  #Available methods for nmf decompositions are 'brunet', 'lee', 'ls-nmf', 'nsNMF', 'offset'.
  #But based 21 breast cancer signatures data, default brunet seems to be working close to the results.
  #Sticking with default for now.

  message(paste0('-Running NMF for factorization rank: ', n))
  if(!is.null(parallel)){
    conv.mat.nmf = NMF::nmf(x = mat, rank = n, .opt = paste0('P', parallel), seed = 123456)
  }else{
    conv.mat.nmf = NMF::nmf(x = mat, rank = n, seed = 123456)
  }

  #Signatures: scale every basis column so that it sums to 1.
  #sweep() keeps the matrix shape whatever the rank is.
  w = NMF::basis(conv.mat.nmf)
  w = sweep(w, 2, colSums(w), "/")
  colnames(w) = paste('Signature', seq_len(ncol(w)), sep = '_')

  #Contribution
  h_abs = NMF::coef(conv.mat.nmf)
  colnames(h_abs) = colnames(mat) #correct colnames (seems to be missing with low mutation load)

  #Scale contributions (coefs) so that they sum to 1 within each sample.
  #For a single signature this gives 100% per sample (NaN where the
  #absolute contribution is 0), without needing a special case.
  h = sweep(h_abs, 2, colSums(h_abs), "/")
  rownames(h_abs) = rownames(h) = paste('Signature', seq_len(nrow(h)), sep = '_')

  message("-Finished in ", data.table::timetaken(start_time))
  return(list(signatures = w, contributions = h, nmfObj = conv.mat.nmf, contributions_abs = h_abs))
}
# PoisonAlien/maftools documentation built on April 7, 2024, 2:49 a.m.