R/SignatureFitLib.R

Defines functions plot_SignatureFit_withBootstrap exposureDistributionBarplot export_SignatureFit_withBootstrap_to_JSON plotExposures objSimAnnelaingFunction startSolution bleedingFilterKLD bleedingFilter SignatureFit_withBootstrap_Analysis SignatureFit_withBootstrap SignatureFit

Documented in export_SignatureFit_withBootstrap_to_JSON exposureDistributionBarplot plot_SignatureFit_withBootstrap SignatureFit SignatureFit_withBootstrap SignatureFit_withBootstrap_Analysis

#' Mutational Signatures Fit (deprecated)
#'
#' This function is deprecated. You can still use it, but we advise to use the function Fit instead,
#' which provides a unified interface for basic signature fit with/without bootstrap.
#' Fit a given set of mutational signatures into mutational catalogues to extimate the activty/exposure of each of the given signatures in the catalogues.
#'
#' @param cat catalogue matrix, patients as columns, channels as rows
#' @param signature_data_matrix signatures, signatures as columns, channels as rows
#' @param method KLD or NNLS or SA
#' @param bf_method bleeding filter method, one of KLD or CosSim, only if bleeding filter is used (alpha>-1)
#' @param alpha set alpha to -1 to avoid Bleeding Filter
#' @param doRound round the exposures to the closest integer
#' @param verbose use FALSE to suppress messages
#' @param n_sa_iter set max Simulated Annealing iterations if method==SA
#' @param showDeprecated set to FALSE to switch off the deprecated warning messsage
#' @return returns the activities/exposures of the signatures in the given sample
#' @keywords mutational signatures fit
#' @export
#' @examples
#' res.exp <- SignatureFit(catalogues,signature_data_matrix)
SignatureFit <- function(cat, #catalogue, patients as columns, channels as rows
                         signature_data_matrix,  #signatures, signatures as columns, channels as rows
                         method = "KLD", #KLD or NNLS or SA
                         bf_method = "CosSim", #KLD or CosSim
                         alpha = -1, #set alpha to -1 to avoid Bleeding Filter
                         doRound = FALSE, #round the exposures to the closest integer
                         verbose = TRUE, #use FALSE to suppress messages
                         n_sa_iter = 500,
                         showDeprecated = TRUE){
  if(showDeprecated) message("[warning SignatureFit] SignatureFit is deprecated, please use Fit instead with useBootstrap=FALSE. You can turn off this warning with showDeprecated=FALSE")

  if(method=="KLD"){
    if(verbose) message("[info SignatureFit] SignatureFit, objective function: KLD")

    #Fit using the given signatures
    nnlm_res <- NNLM::nnlm(as.matrix(signature_data_matrix),as.matrix(cat),loss = "mkl",method = "lee")
    fit_KLD <- KLD(cat,as.matrix(signature_data_matrix) %*% nnlm_res$coefficients)
    exposures <- nnlm_res$coefficients
    if(verbose) message("[info SignatureFit] Optimisation terminated, KLD=",fit_KLD)
    if(alpha >= 0){ #negative alpha skips Bleeding Filter
      #apply bleeding filter
      if(bf_method=="CosSim"){
        if(verbose) message("[info SignatureFit] Applying Bleeding Filter (Cosine Similarity) with alpha=",alpha*100,"%")
      }else if (bf_method=="KLD"){
        if(verbose) message("[info SignatureFit] Applying Bleeding Filter (KLD) with alpha=",alpha*100,"%")
      }
      for(i in 1:ncol(nnlm_res$coefficients)){
        if(bf_method=="CosSim"){
          exposures[,i] <- bleedingFilter(e = nnlm_res$coefficients[,i],
                                             sig = signature_data_matrix,
                                             sample = cat[,i],
                                             alpha = alpha)
        }else if (bf_method=="KLD"){
          exposures[,i] <- bleedingFilterKLD(e = nnlm_res$coefficients[,i],
                                             sig = signature_data_matrix,
                                             sample = cat[,i],
                                             alpha = alpha)
        }
      }
      fit_KLD_afterBF <- KLD(cat,as.matrix(signature_data_matrix) %*% exposures)
      if(verbose) message("[info SignatureFit] New fit after Bleeding Filter, KLD=",fit_KLD_afterBF," (",sprintf("%.3f",(fit_KLD_afterBF - fit_KLD)/fit_KLD*100),"% increase)")
    }
  }else if(method=="SA"){
    # library(GenSA)
    # library(foreach)
    # library(doParallel)
    # library(doMC)

    doParallel::registerDoParallel(4)

    if(verbose) message("[info SignatureFit] SignatureFit, objective function: SA")

    sig <- signature_data_matrix


    #---- copy from Sandro's code below
    exp_list <- foreach::foreach(j=1:ncol(cat)) %dopar% {
      curr_cat <- as.numeric(cat[,j])
      nmut <- sum(curr_cat)
      exp_tmp <- rep(0, ncol(sig))
      names(exp_tmp) <- names(sig)
      if(nmut>0){
        if(verbose) message("[info SignatureFit] Analyzing " ,  j, " of ", ncol(cat), " sample name ", colnames(cat[j]))

        ## Compute the start solution for the sim. annealing
        ss <- startSolution(sig, curr_cat)

        ## Compute the exposure by using the sim. annealing
        out <- GenSA::GenSA(par=as.numeric(ss), lower=rep(0, ncol(sig)), upper=rep(nmut, ncol(sig)), fn=objSimAnnelaingFunction, control=list(maxit=n_sa_iter), xsample=curr_cat, xsignature=sig)

        exp_tmp <- as.numeric(out$par)
        if(sum(exp_tmp)==0){
          exp_tmp <- ss
        }else{
          exp_tmp <- (exp_tmp/sum(exp_tmp))*nmut
        }
        names(exp_tmp) <- names(ss)

        ## Apply the bleeding Filter to remove the 'unnecessary' signatures
        if(bf_method=="CosSim"){
          if(verbose) message("[info SignatureFit] Applying Bleeding Filter (Cosine Similarity) with alpha=",alpha*100,"%")
          exp_tmp <-  bleedingFilter(exp_tmp, sig, curr_cat, alpha)
        }else if (bf_method=="KLD"){
          if(verbose) message("[info SignatureFit] Applying Bleeding Filter (KLD) with alpha=",alpha*100,"%")
          exp_tmp <-  bleedingFilterKLD(exp_tmp, sig, curr_cat, alpha)
        }

      }
      exp_tmp
    }

    exposures <- matrix(unlist(exp_list), ncol=ncol(sig), byrow=T)
    rownames(exposures) <- colnames(cat)
    colnames(exposures) <- colnames(sig)
    exposures <- t(exposures)
    #----- end of Sandro's code



  }else if(method=="NNLS"){
    if(verbose) message("[info SignatureFit] SignatureFit, method: NNLS")

    exposures <- matrix(NA,ncol = ncol(cat),nrow = ncol(signature_data_matrix))
    colnames(exposures) <- colnames(cat)
    rownames(exposures) <- colnames(signature_data_matrix)
    for (i in 1:ncol(cat)){
      q <- as.vector(cat[,i])
      exp_NNLS <- nnls::nnls(as.matrix(signature_data_matrix),q)
      exposures[,i] <- exp_NNLS$x
    }

    fit_KLD <- KLD(cat,as.matrix(signature_data_matrix) %*% exposures)
    if(verbose) message("[info SignatureFit] Optimisation terminated, KLD=",fit_KLD)

    if(alpha >= 0){ #negative alpha skips Bleeding Filter
      #apply bleeding filter
      if(bf_method=="CosSim"){
        if(verbose) message("[info SignatureFit] Applying Bleeding Filter (Cosine Similarity) with alpha=",alpha*100,"%")
      }else if (bf_method=="KLD"){
        if(verbose) message("[info SignatureFit] Applying Bleeding Filter (KLD) with alpha=",alpha*100,"%")
      }
      for (i in 1:ncol(exposures)){
        if(bf_method=="CosSim"){
          exposures[,i] <- bleedingFilter(e = exposures[,i],
                                          sig = signature_data_matrix,
                                          sample = cat[,i],
                                          alpha = alpha)
        }else if (bf_method=="KLD"){
          exposures[,i] <- bleedingFilterKLD(e = exposures[,i],
                                             sig = signature_data_matrix,
                                             sample = cat[,i],
                                             alpha = alpha)
        }
      }
    }
    fit_KLD_afterBF <- KLD(cat,as.matrix(signature_data_matrix) %*% exposures)
    if(verbose) message("[info SignatureFit] New fit after Bleeding Filter, KLD=",fit_KLD_afterBF," (",sprintf("%.3f",(fit_KLD_afterBF - fit_KLD)/fit_KLD*100),"% increase)")
  }else{
    stop("[error SignatureFit] Unknown method specified.")
  }
  exposures[exposures < .Machine$double.eps] <- 0
  if(doRound) exposures <- round(exposures)
  return(exposures)
}



#' Mutational Signatures Fit with Bootstrap (deprecated)
#'
#' This function is deprecated. You can still use it, but we advise to use the function Fit instead,
#' which provides a unified interface for basic signature fit with/without bootstrap.
#' Fit a given set of mutational signatures into mutational catalogues to extimate the
#' activty/exposure of each of the given signatures in the catalogues. Implementation
#' of method similar to Huang et al. 2017, Detecting presence of mutational signatures with
#' confidence, which uses a bootstrap apporach to calculate the empirical probability
#' of an exposure to be larger or equal to a given threshold (i.e. 5% of mutations of a sample).
#' This probability can be used to decide which exposures to remove from the initial fit,
#' thus increasing the sparsity of the exposures.
#'
#' @param cat catalogue matrix, patients as columns, channels as rows
#' @param signature_data_matrix signatures, signatures as columns, channels as rows
#' @param nboot number of bootstraps to use, more bootstraps more accurate results
#' @param exposureFilterType use either fixedThreshold or giniScaledThreshold. When using fixedThreshold, exposures will be removed based on a fixed percentage with respect to the total number of mutations (threshold_percent will be used). When using giniScaledThreshold each signature will used a different threshold calculated as (1-Gini(signature))*giniThresholdScaling
#' @param threshold_percent threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered. Set it to -1 to deactivate.
#' @param threshold_nmuts threshold in number of mutations in a sample, only exposures larger than threshold are considered.Set it to -1 to deactivate.
#' @param giniThresholdScaling scaling factor for the threshold type giniScaledThreshold, which is based on the Gini score of a signature. The threshold is computed as (1-Gini(signature))*giniThresholdScaling, and will be used as a percentage of mutations in a sample that the exposure of "signature" need to be larger than. Set it to -1 to deactivate.
#' @param giniThresholdScaling_nmuts scaling factor for the threshold type giniScaledThreshold, which is based on the Gini score of a signature. The threshold is computed as (1-Gini(signature))*giniThresholdScaling_nmuts, and will be used as number of mutations in a sample that the exposure of "signature" need to be larger than. Set to -1 to deactivate.
#' @param threshold_p.value p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower or equal than the threshold
#' @param method KLD or NNLS or SA
#' @param bf_method bleeding filter method, one of KLD or CosSim, only if bleeding filter is used (alpha>-1)
#' @param alpha set alpha to -1 to avoid Bleeding Filter
#' @param doRound round the exposures to the closest integer
#' @param nparallel to use parallel specify >1
#' @param verbose use FALSE to suppress messages
#' @param n_sa_iter set max Simulated Annealing iterations if method==SA
#' @param randomSeed set an integer random seed
#' @param showDeprecated set to FALSE to switch off the deprecated warning messsage
#' @return returns the activities/exposures of the signatures in the given sample and other information, such as p-values and exposures of individual bootstrap runs.
#' @keywords mutational signatures fit
#' @references Huang, X., Wojtowicz, D., & Przytycka, T. M. (2017). Detecting Presence Of Mutational Signatures In Cancer With Confidence. bioRxiv, (October). https://doi.org/10.1101/132597
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(catalogues,signature_data_matrix)
SignatureFit_withBootstrap <- function(cat, #catalogue, patients as columns, channels as rows
                                       signature_data_matrix, #signatures, signatures as columns, channels as rows
                                       nboot = 100, #number of bootstraps to use, more bootstraps more accurate results
                                       exposureFilterType = "fixedThreshold", # or "giniScaledThreshold"
                                       giniThresholdScaling = 10,
                                       giniThresholdScaling_nmuts = -1,
                                       threshold_percent = 5, #threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
                                       threshold_nmuts = -1, #threshold in number of mutations in a sample, only exposures larger than threshold are considered
                                       threshold_p.value = 0.05, #p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
                                       method = "KLD", #KLD or SA, just don't use SA or you will wait forever, especially with many bootstraps. SA is ~1000 times slower than KLD or NNLS
                                       bf_method = "CosSim", #KLD or CosSim, only used if alpha != -1
                                       alpha = -1, #set alpha to -1 to avoid Bleeding Filter
                                       verbose=TRUE, #use FALSE to suppress messages
                                       doRound = FALSE, #round the exposures to the closest integer
                                       nparallel=1, #to use parallel specify >1
                                       n_sa_iter = 500, #only used if  method = "SA"
                                       randomSeed = NULL,
                                       showDeprecated = TRUE){
  if(showDeprecated) message("[warning SignatureFit_withBootstrap] SignatureFit_withBootstrap is deprecated, please use Fit instead with useBootstrap=TRUE. You can turn off this warning with showDeprecated=FALSE")
  if(!is.null(randomSeed)){
    set.seed(randomSeed)
  }

  if (nparallel > 1){
    doParallel::registerDoParallel(nparallel)
    if(!is.null(randomSeed)){
      doRNG::registerDoRNG(randomSeed)
    }
    boot_list <- foreach::foreach(j=1:nboot) %dopar% {
      bootcat <- generateRandMuts(cat,verbose = FALSE)
      SignatureFit(bootcat,signature_data_matrix,method,bf_method,alpha,verbose=verbose,doRound = doRound,n_sa_iter=n_sa_iter,showDeprecated = F)
    }
  }else{
    boot_list <- list()
    for(i in 1:nboot){
      bootcat <- generateRandMuts(cat,verbose = FALSE)
      boot_list[[i]] <- SignatureFit(bootcat,signature_data_matrix,method,bf_method,alpha,verbose=verbose,doRound = doRound,n_sa_iter=n_sa_iter,showDeprecated = F)
    }
  }

  samples_list <- list()
  for(i in 1:ncol(cat)) {
    samples_list[[colnames(cat)[i]]] <- matrix(NA,ncol = nboot,nrow = ncol(signature_data_matrix))
    colnames(samples_list[[colnames(cat)[i]]]) <- 1:nboot
    row.names(samples_list[[colnames(cat)[i]]]) <- colnames(signature_data_matrix)
  }
  for(i in 1:nboot){
    for(j in 1:ncol(cat)){
      samples_list[[colnames(cat)[j]]][,i] <- boot_list[[i]][,j]
    }
  }
  E_median_notfiltered <- matrix(NA,nrow = ncol(signature_data_matrix),ncol = ncol(cat))
  E_median_filtered <- matrix(NA,nrow = ncol(signature_data_matrix),ncol = ncol(cat))
  E_p.values <- matrix(NA,nrow = ncol(signature_data_matrix),ncol = ncol(cat))
  colnames(E_median_notfiltered) <- colnames(cat)
  row.names(E_median_notfiltered) <- colnames(signature_data_matrix)
  colnames(E_median_filtered) <- colnames(cat)
  row.names(E_median_filtered) <- colnames(signature_data_matrix)
  colnames(E_p.values) <- colnames(cat)
  row.names(E_p.values) <- colnames(signature_data_matrix)
  #KLD error vector
  KLD_samples <- c()

  for(i in 1:ncol(cat)) {
    if(sum(cat[,i])>0){
      boots_perc <- samples_list[[colnames(cat)[i]]]/matrix(sum(cat[,i]),byrow = TRUE,nrow = nrow(samples_list[[colnames(cat)[i]]]),ncol = ncol(samples_list[[colnames(cat)[i]]]))*100
      boots_nmuts <- samples_list[[colnames(cat)[i]]]

      if(exposureFilterType=="giniScaledThreshold"){
        sigInvGini <- 1 - apply(signature_data_matrix,2,giniCoeff)
        giniThresholdPerc <- giniThresholdScaling*sigInvGini
        giniThresholdNMUTS <- giniThresholdScaling_nmuts*sigInvGini
        # set to zero differently for each signature
        p.values <- c()
        for(j in 1:length(giniThresholdPerc)) {
          pselection <- rep(FALSE,length(boots_perc[j,]))
          if(giniThresholdScaling >= 0) pselection <- pselection | boots_perc[j,]<=giniThresholdPerc[j]
          if(giniThresholdScaling_nmuts >= 0) pselection <- pselection | boots_nmuts[j,]<=giniThresholdNMUTS[j]
          p.values <- c(p.values,sum(pselection)/nboot)
        }
        names(p.values) <- colnames(signature_data_matrix)
      }else if(exposureFilterType=="fixedThreshold"){
        pselection <- matrix(FALSE,ncol = ncol(boots_perc),nrow = nrow(boots_perc),
                             dimnames = list(rownames(boots_perc),colnames(boots_perc)))
        if(threshold_percent >= 0) pselection <- pselection | boots_perc <= threshold_percent
        if(threshold_nmuts >= 0) pselection <- pselection | boots_nmuts <= threshold_nmuts
        p.values <- apply(pselection,1,sum)/nboot
        names(p.values) <- colnames(signature_data_matrix)
      }


      median_mut <- apply(samples_list[[colnames(cat)[i]]],1,median)
      E_median_notfiltered[,i] <- median_mut
      E_p.values[,i] <- p.values

      # median_mut_perc <- median_mut/sum(median_mut)*100
      # median_mut_perc[p.values > threshold_p.value] <- 0
      # median_mut <- median_mut_perc/100*sum(cat[,i])
      median_mut[p.values > threshold_p.value] <- 0
      E_median_filtered[,i] <- median_mut
      KLD_samples <- c(KLD_samples,KLD(cat[,i,drop=FALSE],as.matrix(signature_data_matrix) %*% E_median_filtered[,i,drop=FALSE]))
    }else{
      E_median_notfiltered[,i] <- 0
      E_p.values[,i] <- NA
      E_median_filtered[,i] <- 0
      KLD_samples <- c(KLD_samples,0)
    }
  }
  names(KLD_samples) <- colnames(cat)

  #unassigned mutations
  reconstructed_with_median <- as.matrix(signature_data_matrix) %*% as.matrix(E_median_filtered)
  unassigned_muts <- sapply(1:ncol(reconstructed_with_median),function(i) sum(cat[,i,drop=FALSE]) - sum(reconstructed_with_median[,i,drop=FALSE]))
  names(unassigned_muts) <- colnames(cat)

  res <- list()
  res$signature_data_matrix <- signature_data_matrix
  res$cat <- cat
  res$E_median_filtered <- E_median_filtered
  res$E_p.values <- E_p.values
  res$samples_list <- samples_list
  res$boot_list <- boot_list
  res$KLD_samples <- KLD_samples
  #need metadata
  res$exposureFilterType <- exposureFilterType
  res$threshold_percent <- threshold_percent
  res$threshold_nmuts <- threshold_nmuts
  res$giniThresholdScaling <- giniThresholdScaling
  res$giniThresholdScaling_nmuts <- giniThresholdScaling_nmuts
  res$threshold_p.value <- threshold_p.value
  res$nboots <- nboot
  res$method <- method
  res$unassigned_muts <- unassigned_muts
  res$alpha <- alpha
  res$bf_method <- bf_method
  res$n_sa_iter <- n_sa_iter
  return(res)
}

#' Mutational Signatures Fit with Bootstrap Analysis
#'
#' This function is a wrapper for the function SignatureFit_withBootstrap_Analysis, which
#' produces several plots for each sample in the catalogues cat.
#' Fit a given set of mutational signatures into mutational catalogues to extimate the
#' activty/exposure of each of the given signatures in the catalogues. Implementation
#' of method similar to Huang 2017, Detecting presence of mutational signatures with
#' confidence, which uses a bootstrap apporach to calculate the empirical probability
#' of an exposure to be larger or equal to a given threshold (i.e. 5% of mutations of a sample).
#' This probability can be used to decide which exposures to remove from the initial fit,
#' thus increasing the sparsity of the exposures.
#' Note that SignatureFit_withBootstrap_Analysis will save the results of SignatureFit_withBootstrap
#' in the outdir directory using the R save() function. If SignatureFit_withBootstrap_Analysis
#' is rerun with the same setting, the saved file will be loaded to avoid rerunning the Signature Fit
#' and figures will be replotted.
#'
#' @param outdir output directory for the analysis, remember to add '/' at the end
#' @param cat catalogue matrix, patients as columns, channels as rows
#' @param signature_data_matrix signatures, signatures as columns, channels as rows
#' @param nboot number of bootstraps to use, more bootstraps more accurate results
#' @param type_of_mutations either "subs", "rearr" or "generic"
#' @param threshold_percent threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
#' @param threshold_p.value p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
#' @param method KLD or NNLS or SA
#' @param bf_method bleeding filter method, one of KLD or CosSim, only if bleeding filter is used (alpha>-1)
#' @param alpha set alpha to -1 to avoid Bleeding Filter
#' @param doRound round the exposures to the closest integer
#' @param nparallel to use parallel specify >1
#' @param verbose use FALSE to suppress messages
#' @param n_sa_iter set max Simulated Annealing iterations if method==SA
#' @return returns the activities/exposures of the signatures in the given sample and other information, such as p-values and exposures of individual bootstrap runs.
#' @keywords mutational signatures fit
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap_Analysis(catalogues,signature_data_matrix)
SignatureFit_withBootstrap_Analysis <- function(outdir, #output directory for the analysis, remember to add '/' at the end
                                                cat, #catalogue, patients as columns, channels as rows
                                       signature_data_matrix, #signatures, signatures as columns, channels as rows
                                       nboot = 100, #number of bootstraps to use, more bootstraps more accurate results
                                       type_of_mutations="subs", #use one of c("subs","rearr","generic","dnv")
                                       threshold_percent = 5, #threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
                                       threshold_p.value = 0.05, #p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
                                       method = "KLD", #KLD or SA, just don't use SA or you will wait forever, expecially with many bootstraps. SA is ~1000 times slower than KLD or NNLS
                                       bf_method = "CosSim", #KLD or CosSim, only used if alpha != -1
                                       alpha = -1, #set alpha to -1 to avoid Bleeding Filter
                                       doRound = FALSE, #round the exposures to the closest integer
                                       nparallel=1, #to use parallel specify >1
                                       n_sa_iter = 500){  #only used if  method = "SA"

  dir.create(outdir,recursive = TRUE,showWarnings = FALSE)

  #begin by computing the sigfit bootstrap
  file_store <- paste0(outdir,"SigFit_withBootstrap_Summary_m",method,"_bfm",bf_method,"_alpha",alpha,"_tr",threshold_percent,"_p",threshold_p.value,".rData")
  if(file.exists(file_store)){
    load(file_store)
    message("[info SignatureFit_withBootstrap_Analysis] Bootstrap Signature Fits loaded from file")
  }else{
    res <- SignatureFit_withBootstrap(cat = cat,
                                          signature_data_matrix = signature_data_matrix,
                                          nboot = nboot,
                                          threshold_percent = threshold_percent,
                                          threshold_p.value = threshold_p.value,
                                          method = method,
                                          bf_method = bf_method,
                                          alpha = alpha,
                                          doRound = doRound,
                                          nparallel = nparallel,
                                          n_sa_iter = n_sa_iter,
                                          verbose = FALSE)
    save(file = file_store,res,nboot)
  }

  plot_SignatureFit_withBootstrap(outdir,res,type_of_mutations)

  return(res)
}

## This Function removes the 'unnecesary' signatures
#optimise w.r.t. cosine similarity
#alpha is the max cosine similarity that can be lost
bleedingFilter <- function(e, sig,  sample, alpha){
  ## Compute the cosine similarity between the current solution and the catalogue
  sim_smpl <- as.matrix(sig) %*% e
  val <- cos_sim(sample, sim_smpl)

  e_orig <- e
  delta <- 0


  while(delta<=alpha && length(which(e>0))>1){

    pos <- which(e>0)
    e <- e[pos]
    sig <- sig[,pos]
    sim_m <- matrix(0, ncol(sig), ncol(sig))
    colnames(sim_m) <- names(e)
    rownames(sim_m) <- names(e)

    ## Move mutations across each pair of signatures and estimate the cosine similarity of the new solution
    for(i in 1:ncol(sig)){
      for(j in 1:ncol(sig)){
        if(i!=j){
          e2 <- e
          e2[j] <- e2[j]+e2[i]
          e2[i] <- 0
          sim_smpl <- as.matrix(sig) %*% e2
          sim_m[i,j] <- cos_sim(sample, sim_smpl)
        }
      }
    }

    ## Extract the minimum delta
    delta <- val-max(sim_m)

    # If delta <= alpha accept the new solution
    if(delta<=alpha){
      pos <-  which(sim_m==max(sim_m), arr.ind=T)
      e[pos[1,2]] <- e[pos[1,2]]+e[pos[1,1]]
      e[pos[1,1]] <- 0
    }
  }

  e_orig[] <- 0
  e_orig[names(e)] <- e

  return(e_orig)
}

## This Function removes the 'unnecesary' signatures
#optimise w.r.t. KLD
#alpha is the max ratio of KLD that can be lost (e.g. 0.01 is 1% of original KLD)
bleedingFilterKLD <- function(e, sig,  sample, alpha){
  ## Compute the cosine similarity between the current solution and the catalogue
  sim_smpl <- as.matrix(sig) %*% e
  val <- KLD(sample, sim_smpl)
  alpha <- alpha*val

  e_orig <- e
  delta <- 0


  while(delta<=alpha && length(which(e>0))>1){

    pos <- which(e>0)
    e <- e[pos]
    sig <- sig[,pos]
    sim_m <- matrix(0, ncol(sig), ncol(sig))
    colnames(sim_m) <- names(e)
    rownames(sim_m) <- names(e)

    ## Move mutations across each pair of signatures and estimate the cosine similarity of the new solution
    for(i in 1:ncol(sig)){
      for(j in 1:ncol(sig)){
        if(i!=j){
          e2 <- e
          e2[j] <- e2[j]+e2[i]
          e2[i] <- 0
          sim_smpl <- as.matrix(sig) %*% e2
          sim_m[i,j] <- KLD(sample, sim_smpl)
        }
      }
    }
    sim_m <- sim_m + diag(nrow(sim_m))*1e6
    ## Extract the minimum delta
    delta <- min(sim_m)-val

    # If delta <= alpha accept the new solution
    if(delta<=alpha){
      pos <-  which(sim_m==min(sim_m), arr.ind=T)
      e[pos[1,2]] <- e[pos[1,2]]+e[pos[1,1]]
      e[pos[1,1]] <- 0
    }
  }

  e_orig[] <- 0
  e_orig[names(e)] <- e

  return(e_orig)
}


## Function to generate the start solution for the sim. annelaing
startSolution <- function(sig, cat){
  summ <- apply(sig, 1, sum)
  out <- (sig/summ)*cat
  pos <- which(is.nan(as.matrix(out)), arr.ind=T)
  if(length(pos)>0){
    out[pos] <- 0
  }
  return(apply(out, 2, sum))
}

## The Objective Function fot the simulated annelaing
objSimAnnelaingFunction <- function(x, xsample, xsignature){
  sim_smpl <- rep(0, length(xsample))
  for(i in 1:ncol(xsignature)){
    sim_smpl  <- sim_smpl+(xsignature[,i]*x[i])
  }
  sum(abs(xsample-sim_smpl))
}

#' @export
plotExposures <- function(exposures,
                          cossimModelVScatalogue = NULL,
                          output_file=NULL){

  plotMatrix(dataMatrix = exposures,
             output_file = output_file,
             ndigitsafterzero = 0,
             sideVector = cossimModelVScatalogue,
             sideVectorLabel = "cosine similarity to catalogue")
}

#' Export Signature Fit with bootstrap to JSON
#'
#' Given a res file obtained from the SignatureFit_withBootstrap or
#' SignatureFit_withBootstrap_Analysis function, export it to a set of
#' JSON files that can be used for web visualisation
#'
#' @param outdir output directory where the output JSON files should be saved, remember to put "/" at the end of the folder name.
#' @param res R object obtained from the SignatureFit_withBootstrap or SignatureFit_withBootstrap_Analysis function
#' @keywords mutational signatures fit JSON
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(cat = rnd_matrix,
#'                   signature_data_matrix = cosmic30,
#'                   nboot = 5,
#'                   threshold_percent = 0.1,
#'                   threshold_p.value = 0.1)
#' export_SignatureFit_withBootstrap_to_JSON("JSON_out/",res)
export_SignatureFit_withBootstrap_to_JSON <- function(outdir,res){

  dir.create(outdir,showWarnings = FALSE,recursive = TRUE)
  #plot consensus exposures file with metadata
  consensus_file <- paste0(outdir,"consensus.json")
  sink(file = consensus_file)
  cat("{\n")
  cat("\t\"nboots\": ",res$nboots,",\n",sep = "")
  cat("\t\"threshold_percent\": ",res$threshold_percent,",\n",sep = "")
  cat("\t\"threshold_p.value\": ",res$threshold_p.value,",\n",sep = "")
  cat("\t\"method\": \"",res$method,"\",\n",sep = "")
  cat("\t\"consensus\": {\n",sep = "")

  for(i in 1:ncol(res$E_median_filtered)){
    sname <- colnames(res$E_median_filtered)[i]
    cat("\t\t\"",sname,"\": {\n",sep = "")

    for(j in 1:nrow(res$E_median_filtered)){
      rname <- rownames(res$E_median_filtered)[j]
      cat("\t\t\t\"",rname,"\": ",res$E_median_filtered[j,i],sep = "")
      if(j<nrow(res$E_median_filtered)){
        cat(",\n")
      }else{
        cat("\n")
      }
    }

    cat("\t\t}")
    if(i<ncol(res$E_median_filtered)){
      cat(",\n")
    }else{
      cat("\n")
    }
  }

  cat("\t}\n")
  cat("}\n")
  sink()

  #plot bootstraps exposures file
  boot_file <- paste0(outdir,"bootstraps.json")
  sink(file = boot_file)
  cat("[\n")

  for (b in 1:length(res$boot_list)){
    data_mat <- res$boot_list[[b]]

    cat("\t{\n")

    for(i in 1:ncol(data_mat)){
      sname <- colnames(data_mat)[i]
      cat("\t\t\"",sname,"\": {\n",sep = "")

      for(j in 1:nrow(data_mat)){
        rname <- rownames(data_mat)[j]
        cat("\t\t\t\"",rname,"\": ",data_mat[j,i],sep = "")
        if(j<nrow(data_mat)){
          cat(",\n")
        }else{
          cat("\n")
        }
      }

      cat("\t\t}")
      if(i<ncol(data_mat)){
        cat(",\n")
      }else{
        cat("\n")
      }
    }

    cat("\t}")
    if(b<length(res$boot_list)){
      cat(",\n")
    }else{
      cat("\n")
    }

  }
  cat("]\n")
  sink()

  #plot correlation of exposures file for each sample
  for (s in 1:ncol(res$E_median_filtered)){
    if (nrow(res$samples_list[[s]])>1){
      sname <- colnames(res$E_median_filtered)[s]
      data_mat <- cor(t(res$samples_list[[s]]),method = "spearman")
      data_mat[is.na(data_mat)] <- 0
      data_mat[row(data_mat)+(ncol(data_mat)-col(data_mat))>=ncol(data_mat)] <- 0

      cor_file <- paste0(outdir,sname,"_correlation.tsv")
      sink(file = cor_file)
      cat("Signature")
      for (j in colnames(data_mat)) cat("\t",j,sep = "")
      cat("\n")

      for(i in 1:nrow(data_mat)){
        cat(row.names(data_mat)[i])
        for (j in 1:ncol(data_mat)) cat("\t",data_mat[i,j],sep = "")
        cat("\n")
      }

      sink()
    }
  }

}

#' Distribution of Signatures in Samples
#'
#' Given a catalogue of samples and an exposures table, compute the relative amount of each signature
#' in each sample and the unassigned mutations. Also cluster the samples with hierarchical clustering
#' with average linkage and order the samples according to the clustering. Optionally, plot to file.
#'
#' @param fileout if specified, generate a plot, otherwise no plot is generated, use extension png or jpg
#' @param catalogue original catalogue, channels as rows and samples as columns
#' @param exposures exposures/activities of signatures in each sample. Signatures as rows, samples as columns
#' @keywords unexplained samples
#' @return list with to objects: the matrix of the distribution of the signatures in the samples and the hierarchical clustering object
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(cat = catalogue,
#'                   signature_data_matrix = cosmic30,
#'                   nboot = 5,
#'                   threshold_percent = 0.1,
#'                   threshold_p.value = 0.1)
#' distribution_object <- exposureDistributionBarplot(catalogue=catalogue,
#'                   exposures=res$E_median_filtered)
exposureDistributionBarplot <- function(fileout=NULL,catalogue,exposures){
  
  if(!is.null(fileout)) plottype <- substr(fileout,nchar(fileout)-2,nchar(fileout))
  
  total_mut_in_exp <- apply(exposures,2,sum)
  total_mut_in_cat <- apply(catalogue,2,sum)
  unassigned_mut <- total_mut_in_cat - total_mut_in_exp
  unassigned_mut[unassigned_mut < 0] <- 0
  plot_matrix <- rbind(exposures,unassigned_mut)
  plot_matrix <- plot_matrix/matrix(rep(total_mut_in_cat,nrow(plot_matrix)),byrow = TRUE,nrow = nrow(plot_matrix))*100
  #cluster samples
  d <- dist(t(plot_matrix), method = "euclidean") # distance matrix
  fit <- hclust(d, method="average")
  #order according to clustering
  plot_matrix <- plot_matrix[,fit$order]
  kelly_colors <- c('F2F3F4', '222222', 'F3C300', '875692', 'F38400', 'A1CAF1', 'BE0032',
                    'C2B280', '848482', '008856', 'E68FAC', '0067A5', 'F99379', '604E97',
                    'F6A600', 'B3446C', 'DCD300', '882D17', '8DB600', '654522', 'E25822', '2B3D26','CCCCCC','CCCCCC','CCCCCC')
  kelly_colors <- paste0("#",kelly_colors)
  if(!is.null(fileout)){
    doPlot <- TRUE
    if(plottype=="jpg"){
      jpeg(filename = fileout,width = max(1800,200+ncol(exposures)*3),height = 1000,res = 200)
    }else if(plottype=="png"){
      png(filename = fileout,width = max(1800,200+ncol(exposures)*3),height = 1000,res = 200)
    }else{
      message("[warning exposureDistributionBarplot] unsupported plot file extension ",plottype,". Plot not generated. Please use jpg or png.")
      doPlot <- FALSE
    }
    if(doPlot){
      par(mar=c(4,4,3,5),mgp=c(1.5,0.5,0))
      barplot(plot_matrix,col = c(kelly_colors[1:nrow(exposures)],"grey"),
              border = NA,ylab = "% of mutation",xlab = "samples",xaxt="n",space=0)
      legend(x="topright",title = "Signatures",legend = c(rownames(exposures),"other"),fill = c(kelly_colors[1:nrow(exposures)],"grey"),bty = "n",inset = c(-0.08,0),xpd = TRUE)
      dev.off()
    }
  }
  res <- list()
  res$distribution_matrix <- plot_matrix
  res$clustering_object <- fit
  return(res)
}



#' plot_SignatureFit_withBootstrap
#'
#' Generate plots from the output of SignatureFit_withBootstrap.
#'
#' @param outdir output directory for the plot
#' @param boostrapFit_res output of SignatureFit_withBootstrap
#' @param type_of_mutations type of mutations: subs, rearr, dnv, or generic
#' @export
plot_SignatureFit_withBootstrap <- function(outdir,
                                          boostrapFit_res,
                                          type_of_mutations){

  dir.create(outdir,showWarnings = F,recursive = T)

  res <- boostrapFit_res

  #function to draw a legend for the heatmap of the correlation matrix
  draw_legend <- function(col,xl,xr,yb,yt){
    par(xpd=TRUE)
    rect(xl,yb,xr,yt)
    rect(
      xl,
      head(seq(yb,yt,(yt-yb)/length(col)),-1),
      xr,
      tail(seq(yb,yt,(yt-yb)/length(col)),-1),
      col=col,border = NA
    )
    text(x = 1.2, y = yt,labels = "1")
    text(x = 1.2, y = (yt-yb)/2,labels = "0")
    text(x = 1.2, y = yb,labels = "-1")
    par(xpd=FALSE)
  }

  reconstructed_with_median <- as.matrix(res$signature_data_matrix) %*% res$E_median_filtered


  #plot and save exposures
  sums_exp <- apply(res$cat, 2, sum)
  exposures <- rbind(res$E_median_filtered,res$unassigned_muts)
  rownames(exposures)[nrow(exposures)] <- "unassigned"
  denominator <- matrix(sums_exp,nrow = nrow(exposures),ncol = ncol(exposures),byrow = TRUE)
  exposuresProp <- (exposures/denominator*100)
  # case of empty catalogues
  exposuresProp[,sums_exp==0] <- 0

  file_table_exp <- paste0(outdir,"SigFit_withBootstrap_Exposures_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,".tsv")
  file_plot_exp <- paste0(outdir,"SigFit_withBootstrap_Exposures_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,".pdf")
  file_plot_expProp <- paste0(outdir,"SigFit_withBootstrap_Exposures_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,"_prop.pdf")

  plotExposures(exposures = exposures,output_file = file_plot_exp)
  plotExposures(exposures = exposuresProp,output_file = file_plot_expProp)
  write.table(exposures,file = file_table_exp,
              sep = "\t",col.names = TRUE,row.names = TRUE,quote = FALSE)

  #provide a series of plots for each sample
  rows_ordered_from_best <- order(res$KLD_samples)
  plot_nrows <- 2
  plot_ncol <- 4
  nplots <- plot_nrows*plot_ncol
  howmanyplots <- ncol(res$cat)
  plotsdir <- paste0(outdir,"SigFit_withBootstrap_Summary_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,"/")
  dir.create(plotsdir,recursive = TRUE,showWarnings = FALSE)
  for(p in 1:howmanyplots){
    current_samples <- p
    png(filename = paste0(plotsdir,"sigfit_bootstrap_",p,"of",howmanyplots,".png"),
         width = 640*(plot_ncol),
         height = 480*plot_nrows,
         res = 150)
    par(mfrow=c(plot_nrows,plot_ncol))
    for(i in current_samples){
      fitIsEmpty <- sum(res$E_median_filtered[,i])==0
      unassigned_mut <- sprintf("%.2f",(sum(res$cat[,i,drop=FALSE]) - sum(reconstructed_with_median[,i,drop=FALSE]))/sum(res$cat[,i,drop=FALSE])*100)
      percentdiff <- sprintf("%.2f",sum(abs(res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE]))/sum(res$cat[,i,drop=FALSE])*100)
      cos_sim <- sprintf("%.2f",cos_sim(res$cat[,i,drop=FALSE],reconstructed_with_median[,i,drop=FALSE]))
      if(type_of_mutations=="subs"){
        #1 original
        plotSubsSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(6,3,5,2))
        if(!fitIsEmpty){
          #2 reconstructed
          plotSubsSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(6,3,5,2))
          #3 difference
          plotSubsSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(6,3,5,2),plot_sum = FALSE)
        }
      }else if(type_of_mutations=="rearr"){
        #1 original
        plotRearrSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(12,3,5,2))
        if(!fitIsEmpty){
          #2 reconstructed
          plotRearrSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(12,3,5,2))
          #3 difference
          plotRearrSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(12,3,5,2),plot_sum = FALSE)
        }
      }else if(type_of_mutations=="generic"){
        #1 original
        plotGenericSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(6,3,5,2))
        if(!fitIsEmpty){
          #2 reconstructed
          plotGenericSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(6,3,5,2))
          #3 difference
          plotGenericSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(6,3,5,2),plot_sum = FALSE)
        }
      }else if(type_of_mutations=="dnv"){
        #1 original
        plotDNVSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(6,3,5,2))
        if(!fitIsEmpty){
          #2 reconstructed
          plotDNVSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(6,3,5,2))
          #3 difference
          plotDNVSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(6,3,5,2),plot_sum = FALSE)
        }
      }
      if(!fitIsEmpty){
        #4 bootstraps
        par(mar=c(6,4,5,2))
        boxplot(t(res$samples_list[[i]]),las=3,cex.axes=0.9,
                ylab="n mutations",
                ylim=c(0,max(res$samples_list[[i]])),
                main=paste0("Exposures, of ",colnames(res$E_median_filtered)[i],"\nthreshold=",res$threshold_percent,"%, p-value=",res$threshold_p.value,", n=",res$nboot))
        points(1:length(res$E_median_filtered[,i,drop=FALSE]),res$E_median_filtered[,i,drop=FALSE],col="red")
        abline(h=res$threshold_percent/100*sum(res$cat[,i,drop=FALSE]),col="green")
        legend(x="topleft",legend = c("consensus exposures"),col = "red",pch = 1,cex = 0.9,bty = "n",inset = c(0,-0.14),xpd = TRUE)
        legend(x="topright",legend = c("threshold"),col = "green",lty = 1,cex = 0.9,bty = "n",inset = c(0,-0.14),xpd = TRUE)
        if(ncol(res$signature_data_matrix)>1){
          #5 top correlated signatures
          res.cor <- suppressWarnings(cor(t(res$samples_list[[i]]),method = "spearman"))
          res.cor_triangular <- res.cor
          res.cor_triangular[row(res.cor)+(ncol(res.cor)-col(res.cor))>=ncol(res.cor)] <- 0
          res.cor_triangular_label <- matrix(sprintf("%0.2f",res.cor_triangular),nrow = nrow(res.cor_triangular))
          res.cor_triangular_label[row(res.cor)+(ncol(res.cor)-col(res.cor))>=ncol(res.cor)] <- ""
          par(mar=c(6,8,5,6))
          par(xpd=FALSE)
          col<- colorRampPalette(c("blue", "white", "red"))(51)
          image(res.cor_triangular,col = col,zlim = c(-1,1), axes=F,main="Exposures Correlation (spearman)")
          extrabit <- 1/(ncol(res$signature_data_matrix)-1)/2
          abline(h=seq(0-extrabit,1+extrabit,length.out = ncol(res$signature_data_matrix)+1),col="grey",lty=2)
          abline(v=seq(0-extrabit,1+extrabit,length.out = ncol(res$signature_data_matrix)+1),col="grey",lty=2)
          axis(2,at = seq(0,1,length.out = ncol(res$signature_data_matrix)),labels = colnames(res$signature_data_matrix),las=1,cex.lab=0.8)
          axis(1,at = seq(0,1,length.out = ncol(res$signature_data_matrix)),labels = colnames(res$signature_data_matrix),las=2,cex.lab=0.8)
          draw_legend(col,1.25,1.3,0,1)

          #6 some correlation plots
          vals <- res.cor_triangular[order(abs(res.cor_triangular),decreasing = TRUE)]
          for (j in 1:(nplots-5)){
            pos <- which(vals[j]==res.cor_triangular,arr.ind = TRUE)
            mainpar <- paste0("Exposures across bootstraps, n=",res$nboot,"\nspearman correlation ",sprintf("%.2f",vals[j]))
            plot(res$samples_list[[i]][pos[1],],res$samples_list[[i]][pos[2],],
                 xlab = colnames(res$signature_data_matrix)[pos[1]],
                 ylab = colnames(res$signature_data_matrix)[pos[2]],
                 main=mainpar,col="blue",pch = 16)

          }
        }
      }
    }
    dev.off()
  }

}
Nik-Zainal-Group/signature.tools.lib documentation built on April 13, 2025, 5:50 p.m.