#' Mutational Signatures Fit
#'
#' Fit a given set of mutational signatures into mutational catalogues to estimate the activity/exposure of each of the given signatures in the catalogues.
#'
#' @param cat catalogue matrix, patients as columns, channels as rows
#' @param signature_data_matrix signatures, signatures as columns, channels as rows
#' @param method KLD or NNLS or SA
#' @param bf_method bleeding filter method, one of KLD or CosSim, only used if the Bleeding Filter is enabled (alpha >= 0)
#' @param alpha set alpha to -1 to avoid Bleeding Filter
#' @param doRound round the exposures to the closest integer
#' @param verbose use FALSE to suppress messages
#' @param n_sa_iter set max Simulated Annealing iterations if method==SA
#' @return returns the activities/exposures of the signatures in the given samples
#' @keywords mutational signatures fit
#' @export
#' @examples
#' res.exp <- SignatureFit(catalogues,signature_data_matrix)
SignatureFit <- function(cat, #catalogue, patients as columns, channels as rows
signature_data_matrix, #signatures, signatures as columns, channels as rows
method = "KLD", #KLD or NNLS or SA
bf_method = "CosSim", #KLD or CosSim
alpha = -1, #set alpha to -1 to avoid Bleeding Filter
doRound = TRUE, #round the exposures to the closest integer
verbose = TRUE, #use FALSE to suppress messages
n_sa_iter = 500){
if(method=="KLD"){
if(verbose) message("SignatureFit, objective function: KLD")
#Fit using the given signatures
nnlm_res <- NNLM::nnlm(as.matrix(signature_data_matrix),as.matrix(cat),loss = "mkl",method = "lee")
fit_KLD <- KLD(cat,as.matrix(signature_data_matrix) %*% nnlm_res$coefficients)
exposures <- nnlm_res$coefficients
if(verbose) message("Optimisation terminated, KLD=",fit_KLD)
if(alpha >= 0){ #negative alpha skips Bleeding Filter
#apply bleeding filter
if(bf_method=="CosSim"){
if(verbose) message("Applying Bleeding Filter (Cosine Similarity) with alpha=",alpha*100,"%")
}else if (bf_method=="KLD"){
if(verbose) message("Applying Bleeding Filter (KLD) with alpha=",alpha*100,"%")
}
for(i in 1:ncol(nnlm_res$coefficients)){
if(bf_method=="CosSim"){
exposures[,i] <- bleedingFilter(e = nnlm_res$coefficients[,i],
sig = signature_data_matrix,
sample = cat[,i],
alpha = alpha)
}else if (bf_method=="KLD"){
exposures[,i] <- bleedingFilterKLD(e = nnlm_res$coefficients[,i],
sig = signature_data_matrix,
sample = cat[,i],
alpha = alpha)
}
}
fit_KLD_afterBF <- KLD(cat,as.matrix(signature_data_matrix) %*% exposures)
if(verbose) message("New fit after Bleeding Filter, KLD=",fit_KLD_afterBF," (",sprintf("%.3f",(fit_KLD_afterBF - fit_KLD)/fit_KLD*100),"% increase)")
}
}else if(method=="SA"){
# library(GenSA)
# library(foreach)
# library(doParallel)
# library(doMC)
doParallel::registerDoParallel(4)
if(verbose) message("SignatureFit, objective function: SA")
sig <- signature_data_matrix
#---- copy from Sandro's code below
exp_list <- foreach::foreach(j=1:ncol(cat)) %dopar% {
curr_cat <- as.numeric(cat[,j])
nmut <- sum(curr_cat)
exp_tmp <- rep(0, ncol(sig))
names(exp_tmp) <- names(sig)
if(nmut>0){
if(verbose) message("Analyzing " , j, " of ", ncol(cat), " sample name ", colnames(cat[j]))
## Compute the start solution for the sim. annealing
ss <- startSolution(sig, curr_cat)
## Compute the exposure by using the sim. annealing
out <- GenSA::GenSA(par=as.numeric(ss), lower=rep(0, ncol(sig)), upper=rep(nmut, ncol(sig)), fn=objSimAnnelaingFunction, control=list(maxit=n_sa_iter), xsample=curr_cat, xsignature=sig)
exp_tmp <- as.numeric(out$par)
if(sum(exp_tmp)==0){
exp_tmp <- ss
}else{
exp_tmp <- (exp_tmp/sum(exp_tmp))*nmut
}
names(exp_tmp) <- names(ss)
## Apply the bleeding Filter to remove the 'unnecessary' signatures
if(bf_method=="CosSim"){
if(verbose) message("Applying Bleeding Filter (Cosine Similarity) with alpha=",alpha*100,"%")
exp_tmp <- bleedingFilter(exp_tmp, sig, curr_cat, alpha)
}else if (bf_method=="KLD"){
if(verbose) message("Applying Bleeding Filter (KLD) with alpha=",alpha*100,"%")
exp_tmp <- bleedingFilterKLD(exp_tmp, sig, curr_cat, alpha)
}
}
exp_tmp
}
exposures <- matrix(unlist(exp_list), ncol=ncol(sig), byrow=T)
rownames(exposures) <- colnames(cat)
colnames(exposures) <- colnames(sig)
exposures <- t(exposures)
#----- end of Sandro's code
}else if(method=="NNLS"){
if(verbose) message("SignatureFit, method: NNLS")
exposures <- matrix(NA,ncol = ncol(cat),nrow = ncol(signature_data_matrix))
colnames(exposures) <- colnames(cat)
rownames(exposures) <- colnames(signature_data_matrix)
for (i in 1:ncol(cat)){
q <- as.vector(cat[,i])
exp_NNLS <- nnls::nnls(as.matrix(signature_data_matrix),q)
exposures[,i] <- exp_NNLS$x
}
fit_KLD <- KLD(cat,as.matrix(signature_data_matrix) %*% exposures)
if(verbose) message("Optimisation terminated, KLD=",fit_KLD)
if(alpha >= 0){ #negative alpha skips Bleeding Filter
#apply bleeding filter
if(bf_method=="CosSim"){
if(verbose) message("Applying Bleeding Filter (Cosine Similarity) with alpha=",alpha*100,"%")
}else if (bf_method=="KLD"){
if(verbose) message("Applying Bleeding Filter (KLD) with alpha=",alpha*100,"%")
}
for (i in 1:ncol(exposures)){
if(bf_method=="CosSim"){
exposures[,i] <- bleedingFilter(e = exposures[,i],
sig = signature_data_matrix,
sample = cat[,i],
alpha = alpha)
}else if (bf_method=="KLD"){
exposures[,i] <- bleedingFilterKLD(e = exposures[,i],
sig = signature_data_matrix,
sample = cat[,i],
alpha = alpha)
}
}
fit_KLD_afterBF <- KLD(cat,as.matrix(signature_data_matrix) %*% exposures)
if(verbose) message("New fit after Bleeding Filter, KLD=",fit_KLD_afterBF," (",sprintf("%.3f",(fit_KLD_afterBF - fit_KLD)/fit_KLD*100),"% increase)")
}
}else{
stop("SignatureFit error. Unknown method specified.")
}
exposures[exposures < .Machine$double.eps] <- 0
if(doRound) exposures <- round(exposures)
return(exposures)
}
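## Usage sketch (not run): a minimal, hypothetical example of fitting two toy
## signatures to a single synthetic catalogue with the NNLS objective and the
## Bleeding Filter disabled. Channel names, signature names and counts below
## are invented purely for illustration.
# toy_sigs <- matrix(c(0.7, 0.2, 0.1,
#                      0.1, 0.3, 0.6),
#                    nrow = 3, ncol = 2,
#                    dimnames = list(paste0("channel", 1:3), c("SigA", "SigB")))
# toy_cat <- matrix(c(80, 30, 40), nrow = 3, ncol = 1,
#                   dimnames = list(paste0("channel", 1:3), "sample1"))
# toy_exposures <- SignatureFit(cat = toy_cat,
#                               signature_data_matrix = toy_sigs,
#                               method = "NNLS",
#                               alpha = -1,    #skip the Bleeding Filter
#                               verbose = FALSE)
# #toy_exposures is a 2x1 matrix of mutation counts assigned to SigA and SigB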
#' Mutational Signatures Fit with Bootstrap
#'
#' Fit a given set of mutational signatures into mutational catalogues to estimate the
#' activity/exposure of each of the given signatures in the catalogues. Implementation
#' of a method similar to Huang et al. 2017, Detecting presence of mutational signatures with
#' confidence, which uses a bootstrap approach to calculate the empirical probability
#' that an exposure is greater than or equal to a given threshold (e.g. 5% of the mutations in a sample).
#' This probability can be used to decide which exposures to remove from the initial fit,
#' thus increasing the sparsity of the exposures.
#'
#' @param cat catalogue matrix, patients as columns, channels as rows
#' @param signature_data_matrix signatures, signatures as columns, channels as rows
#' @param nboot number of bootstraps to use, more bootstraps give more accurate results
#' @param threshold_percent threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
#' @param threshold_p.value p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
#' @param method KLD or NNLS or SA
#' @param bf_method bleeding filter method, one of KLD or CosSim, only used if the Bleeding Filter is enabled (alpha >= 0)
#' @param alpha set alpha to -1 to avoid Bleeding Filter
#' @param doRound round the exposures to the closest integer
#' @param nparallel number of parallel workers, set to a value greater than 1 to run the bootstraps in parallel
#' @param verbose use FALSE to suppress messages
#' @param n_sa_iter set max Simulated Annealing iterations if method==SA
#' @return returns the activities/exposures of the signatures in the given samples and other information, such as p-values and exposures of individual bootstrap runs.
#' @keywords mutational signatures fit
#' @references Huang, X., Wojtowicz, D., & Przytycka, T. M. (2017). Detecting Presence Of Mutational Signatures In Cancer With Confidence. bioRxiv, (October). https://doi.org/10.1101/132597
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(catalogues,signature_data_matrix)
SignatureFit_withBootstrap <- function(cat, #catalogue, patients as columns, channels as rows
signature_data_matrix, #signatures, signatures as columns, channels as rows
nboot = 100, #number of bootstraps to use, more bootstraps more accurate results
threshold_percent = 5, #threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
threshold_p.value = 0.05, #p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
method = "KLD", #KLD or SA, just don't use SA or you will wait forever, expecially with many bootstraps. SA is ~1000 times slower than KLD or NNLS
bf_method = "CosSim", #KLD or CosSim, only used if alpha != -1
alpha = -1, #set alpha to -1 to avoid Bleeding Filter
verbose=TRUE, #use FALSE to suppress messages
doRound = TRUE, #round the exposures to the closest integer
nparallel=1, #to use parallel specify >1
n_sa_iter = 500){ #only used if method = "SA"
if (nparallel > 1){
# library(foreach)
# library(doParallel)
# library(doMC)
doParallel::registerDoParallel(nparallel)
boot_list <- foreach::foreach(j=1:nboot) %dopar% {
bootcat <- generateRandMuts(cat)
SignatureFit(bootcat,signature_data_matrix,method,bf_method,alpha,verbose=verbose,doRound = doRound,n_sa_iter=n_sa_iter)
}
}else{
boot_list <- list()
for(i in 1:nboot){
bootcat <- generateRandMuts(cat)
boot_list[[i]] <- SignatureFit(bootcat,signature_data_matrix,method,bf_method,alpha,verbose=verbose,doRound = doRound,n_sa_iter=n_sa_iter)
}
}
samples_list <- list()
for(i in 1:ncol(cat)) {
samples_list[[i]] <- matrix(NA,ncol = nboot,nrow = ncol(signature_data_matrix))
colnames(samples_list[[i]]) <- 1:nboot
row.names(samples_list[[i]]) <- colnames(signature_data_matrix)
}
for(i in 1:nboot){
for(j in 1:ncol(cat)){
samples_list[[j]][,i] <- boot_list[[i]][,j]
}
}
E_median_notfiltered <- matrix(NA,nrow = ncol(signature_data_matrix),ncol = ncol(cat))
E_median_filtered <- matrix(NA,nrow = ncol(signature_data_matrix),ncol = ncol(cat))
E_p.values <- matrix(NA,nrow = ncol(signature_data_matrix),ncol = ncol(cat))
colnames(E_median_notfiltered) <- colnames(cat)
row.names(E_median_notfiltered) <- colnames(signature_data_matrix)
colnames(E_median_filtered) <- colnames(cat)
row.names(E_median_filtered) <- colnames(signature_data_matrix)
colnames(E_p.values) <- colnames(cat)
row.names(E_p.values) <- colnames(signature_data_matrix)
#KLD error vector
KLD_samples <- c()
for(i in 1:ncol(cat)) {
boots_perc <- samples_list[[i]]/matrix(apply(samples_list[[i]],2,sum),byrow = TRUE,nrow = nrow(samples_list[[i]]),ncol = ncol(samples_list[[i]]))*100
p.values <- apply(boots_perc <= threshold_percent,1,sum)/nboot
median_mut <- apply(samples_list[[i]],1,median)
E_median_notfiltered[,i] <- median_mut
E_p.values[,i] <- p.values
median_mut_perc <- median_mut/sum(median_mut)*100
# plot(median_mut_perc)
# abline(h=5)
median_mut_perc[p.values > threshold_p.value] <- 0
#optional rescaling below (currently disabled); without it, the unassigned mutations behave like a residual
# median_mut_perc <- median_mut_perc/sum(median_mut_perc)*100
median_mut <- median_mut_perc/100*sum(cat[,i])
# boxplot(t(samples_list[[1]]))
# points(1:10,E[,1],col="red")
# points(1:10,median_mut,col="green")
E_median_filtered[,i] <- median_mut
KLD_samples <- c(KLD_samples,KLD(cat[,i,drop=FALSE],as.matrix(signature_data_matrix) %*% E_median_filtered[,i,drop=FALSE]))
}
names(KLD_samples) <- colnames(cat)
#unassigned mutations
reconstructed_with_median <- round(as.matrix(signature_data_matrix) %*% as.matrix(E_median_filtered))
unassigned_muts <- sapply(1:ncol(reconstructed_with_median),function(i) (sum(cat[,i,drop=FALSE]) - sum(reconstructed_with_median[,i,drop=FALSE]))/sum(cat[,i,drop=FALSE])*100)
names(unassigned_muts) <- colnames(cat)
res <- list()
res$signature_data_matrix <- signature_data_matrix
res$cat <- cat
res$E_median_filtered <- E_median_filtered
res$E_p.values <- E_p.values
res$samples_list <- samples_list
res$boot_list <- boot_list
res$KLD_samples <- KLD_samples
#need metadata
res$threshold_percent <- threshold_percent
res$threshold_p.value <- threshold_p.value
res$nboots <- nboot
res$method <- method
res$unassigned_muts <- unassigned_muts
res$alpha <- alpha
res$bf_method <- bf_method
res$n_sa_iter <- n_sa_iter
return(res)
}
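## Usage sketch (not run): a hypothetical illustration of how the bootstrap
## output can be inspected. Object names (catalogues, cosmic30) are
## placeholders for a catalogue and a signature matrix with matching channels.
# res <- SignatureFit_withBootstrap(cat = catalogues,
#                                   signature_data_matrix = cosmic30,
#                                   nboot = 100,
#                                   threshold_percent = 5,
#                                   threshold_p.value = 0.05)
# res$E_median_filtered #consensus exposures: median across bootstraps, set to
#                       #zero where P(exposure < threshold) > threshold_p.value
# res$E_p.values        #empirical probability that each exposure is below
#                       #threshold_percent of the sample's mutations
# res$samples_list[[1]] #signatures x bootstraps exposure matrix for sample 1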
#' Mutational Signatures Fit with Bootstrap Analysis
#'
#' This function is a wrapper for the function SignatureFit_withBootstrap, and in addition
#' produces several plots for each sample in the catalogues cat.
#' Fit a given set of mutational signatures into mutational catalogues to estimate the
#' activity/exposure of each of the given signatures in the catalogues. Implementation
#' of a method similar to Huang et al. 2017, Detecting presence of mutational signatures with
#' confidence, which uses a bootstrap approach to calculate the empirical probability
#' that an exposure is greater than or equal to a given threshold (e.g. 5% of the mutations in a sample).
#' This probability can be used to decide which exposures to remove from the initial fit,
#' thus increasing the sparsity of the exposures.
#' Note that SignatureFit_withBootstrap_Analysis will save the results of SignatureFit_withBootstrap
#' in the outdir directory using the R save() function. If SignatureFit_withBootstrap_Analysis
#' is rerun with the same settings, the saved file will be loaded to avoid rerunning the Signature Fit
#' and figures will be replotted.
#'
#' @param outdir output directory for the analysis, remember to add '/' at the end
#' @param cat catalogue matrix, patients as columns, channels as rows
#' @param signature_data_matrix signatures, signatures as columns, channels as rows
#' @param nboot number of bootstraps to use, more bootstraps give more accurate results
#' @param type_of_mutations either "subs", "rearr" or "generic"
#' @param threshold_percent threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
#' @param threshold_p.value p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
#' @param method KLD or NNLS or SA
#' @param bf_method bleeding filter method, one of KLD or CosSim, only used if the Bleeding Filter is enabled (alpha >= 0)
#' @param alpha set alpha to -1 to avoid Bleeding Filter
#' @param doRound round the exposures to the closest integer
#' @param nparallel number of parallel workers, set to a value greater than 1 to run the bootstraps in parallel
#' @param verbose use FALSE to suppress messages
#' @param n_sa_iter set max Simulated Annealing iterations if method==SA
#' @return returns the activities/exposures of the signatures in the given samples and other information, such as p-values and exposures of individual bootstrap runs.
#' @keywords mutational signatures fit
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap_Analysis("signature_fit_analysis/",catalogues,signature_data_matrix)
SignatureFit_withBootstrap_Analysis <- function(outdir, #output directory for the analysis, remember to add '/' at the end
cat, #catalogue, patients as columns, channels as rows
signature_data_matrix, #signatures, signatures as columns, channels as rows
nboot = 100, #number of bootstraps to use, more bootstraps more accurate results
type_of_mutations="subs", #use one of c("subs","rearr","generic")
threshold_percent = 5, #threshold in percentage of total mutations in a sample, only exposures larger than threshold are considered
threshold_p.value = 0.05, #p-value to determine whether an exposure is above the threshold_percent. In other words, this is the empirical probability that the exposure is lower than the threshold
method = "KLD", #KLD or SA, just don't use SA or you will wait forever, expecially with many bootstraps. SA is ~1000 times slower than KLD or NNLS
bf_method = "CosSim", #KLD or CosSim, only used if alpha != -1
alpha = -1, #set alpha to -1 to avoid Bleeding Filter
doRound = TRUE, #round the exposures to the closest integer
nparallel=1, #to use parallel specify >1
n_sa_iter = 500){ #only used if method = "SA"
# outdir <- "../results/sigfitbootstraptests/"
dir.create(outdir,recursive = TRUE,showWarnings = FALSE)
#begin by computing the sigfit bootstrap
file_store <- paste0(outdir,"SigFit_withBootstrap_Summary_m",method,"_bfm",bf_method,"_alpha",alpha,"_tr",threshold_percent,"_p",threshold_p.value,".rData")
if(file.exists(file_store)){
load(file_store)
message("Bootstrap Signature Fits loaded from file")
}else{
res <- SignatureFit_withBootstrap(cat = cat,
signature_data_matrix = signature_data_matrix,
nboot = nboot,
threshold_percent = threshold_percent,
threshold_p.value = threshold_p.value,
method = method,
bf_method = bf_method,
alpha = alpha,
doRound = doRound,
nparallel = nparallel,
n_sa_iter = n_sa_iter,
verbose = FALSE)
save(file = file_store,res,nboot)
}
plot_SignatureFit_withBootstrap(outdir,res,type_of_mutations)
return(res)
}
## This function removes the 'unnecessary' signatures
#optimise w.r.t. cosine similarity
#alpha is the max cosine similarity that can be lost
bleedingFilter <- function(e, sig, sample, alpha){
## Compute the cosine similarity between the current solution and the catalogue
#sim_smpl <- computeSimSample(e, sample, sig)
sim_smpl <- as.matrix(sig) %*% e
val <- cos.sim(sample, sim_smpl)
e_orig <- e
delta <- 0
while(delta<=alpha && length(which(e>0))>1){
pos <- which(e>0)
e <- e[pos]
sig <- sig[,pos]
sim_m <- matrix(0, ncol(sig), ncol(sig))
colnames(sim_m) <- names(e)
rownames(sim_m) <- names(e)
## Move mutations across each pair of signatures and estimate the cosine similarity of the new solution
for(i in 1:ncol(sig)){
for(j in 1:ncol(sig)){
if(i!=j){
e2 <- e
e2[j] <- e2[j]+e2[i]
e2[i] <- 0
#sim_smpl <- computeSimSample(e2, sample, sig)
sim_smpl <- as.matrix(sig) %*% e2
sim_m[i,j] <- cos.sim(sample, sim_smpl)
}
}
}
## Extract the minimum delta
delta <- val-max(sim_m)
# If delta <= alpha accept the new solution
if(delta<=alpha){
pos <- which(sim_m==max(sim_m), arr.ind=T)
e[pos[1,2]] <- e[pos[1,2]]+e[pos[1,1]]
e[pos[1,1]] <- 0
}
}
e_orig[] <- 0
e_orig[names(e)] <- e
return(e_orig)
}
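## Worked sketch (not run): how the Bleeding Filter behaves on a hypothetical
## exposure vector; all objects and numbers below are invented for illustration.
## At each iteration the filter tries moving all mutations of one active
## signature onto another, keeps the merge whose cosine similarity against the
## sample is highest, and stops once the drop from the original solution's
## cosine similarity exceeds alpha.
# toy_e <- c(SigA = 95, SigB = 3, SigC = 2) #small exposures may "bleed" away
# toy_filtered <- bleedingFilter(e = toy_e,
#                                sig = toy_sigs,   #channels x 3 signatures
#                                sample = toy_cat,  #one catalogue column
#                                alpha = 0.01)      #max cosine similarity loss
# #toy_filtered keeps the names of toy_e, with bled-out signatures set to 0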
## This function removes the 'unnecessary' signatures
#optimise w.r.t. KLD
#alpha is the max relative increase in KLD that is allowed (e.g. 0.01 allows a 1% increase over the original KLD)
bleedingFilterKLD <- function(e, sig, sample, alpha){
## Compute the KLD between the current solution and the catalogue
#sim_smpl <- computeSimSample(e, sample, sig)
sim_smpl <- as.matrix(sig) %*% e
val <- KLD(sample, sim_smpl)
alpha <- alpha*val
e_orig <- e
delta <- 0
while(delta<=alpha && length(which(e>0))>1){
pos <- which(e>0)
e <- e[pos]
sig <- sig[,pos]
sim_m <- matrix(0, ncol(sig), ncol(sig))
colnames(sim_m) <- names(e)
rownames(sim_m) <- names(e)
## Move mutations across each pair of signatures and estimate the KLD of the new solution
for(i in 1:ncol(sig)){
for(j in 1:ncol(sig)){
if(i!=j){
e2 <- e
e2[j] <- e2[j]+e2[i]
e2[i] <- 0
#sim_smpl <- computeSimSample(e2, sample, sig)
sim_smpl <- as.matrix(sig) %*% e2
sim_m[i,j] <- KLD(sample, sim_smpl)
}
}
}
sim_m <- sim_m + diag(nrow(sim_m))*1e6
## Extract the minimum delta
delta <- min(sim_m)-val
# If delta <= alpha accept the new solution
if(delta<=alpha){
pos <- which(sim_m==min(sim_m), arr.ind=T)
e[pos[1,2]] <- e[pos[1,2]]+e[pos[1,1]]
e[pos[1,1]] <- 0
}
}
e_orig[] <- 0
e_orig[names(e)] <- e
return(e_orig)
}
## Function to generate the start solution for the simulated annealing
startSolution <- function(sig, cat){
summ <- apply(sig, 1, sum)
out <- (sig/summ)*cat
pos <- which(is.nan(as.matrix(out)), arr.ind=T)
if(length(pos)>0){
out[pos] <- 0
}
return(apply(out, 2, sum))
}
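## Illustration: startSolution distributes the mutation count of each channel
## across the signatures proportionally to their weight on that channel, then
## sums per signature. Hypothetical numbers for a single channel k:
# #if SigA has weight 0.8 and SigB weight 0.2 on channel k, and the sample has
# #10 mutations on channel k, that channel contributes
# # (0.8 / (0.8 + 0.2)) * 10   # 8 mutations to the starting exposure of SigA
# # (0.2 / (0.8 + 0.2)) * 10   # 2 mutations to the starting exposure of SigB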
## The objective function for the simulated annealing: sum of absolute deviations
objSimAnnelaingFunction <- function(x, xsample, xsignature){
sim_smpl <- rep(0, length(xsample))
for(i in 1:ncol(xsignature)){
sim_smpl <- sim_smpl+(xsignature[,i]*x[i])
}
sum(abs(xsample-sim_smpl))
}
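## Note: the objective above is the sum of absolute deviations (L1 distance)
## between the observed catalogue and the reconstruction xsignature %*% x.
## A vectorised equivalent (assuming xsignature is coercible to a matrix):
# sum(abs(xsample - as.vector(as.matrix(xsignature) %*% x)))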
## Given the exposure and the probability matrix compute the Catalogue
# computeSimSample <- function(x, xsample, xsignature){
# sim_smpl <- rep(0, length(xsample))
# for(i in 1:ncol(xsignature)){
# sim_smpl <- sim_smpl+(xsignature[,i]*x[i])
# }
# return(sim_smpl)
# }
#' @export
plot.exposures <- function(exposures,output_file,dpi=300){
# library("ggplot2")
# Set up the vectors
signatures.names <- colnames(exposures)
sample.names <- row.names(exposures)
# Create the data frame
df <- expand.grid(sample.names,signatures.names)
df$value <- unlist(as.data.frame(exposures))
df$labels <- sprintf("%.0f", df$value)
df$labels[df$value==0] <- ""
#Plot the Data (500+150*nsamples)x1200
g <- ggplot2::ggplot(df, ggplot2::aes(Var1, Var2)) + ggplot2::geom_point(ggplot2::aes(size = value), colour = "green") + ggplot2::theme_bw() + ggplot2::xlab("") + ggplot2::ylab("")
g <- g + ggplot2::scale_size_continuous(range=c(0,10)) + ggplot2::geom_text(ggplot2::aes(label = labels))
g <- g + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, size=14),
axis.text.y = ggplot2::element_text(vjust = 1, size=14))
w <- (500+150*length(sample.names))/dpi
h <- (500+150*length(signatures.names))/dpi
ggplot2::ggsave(filename = output_file,plot = g,dpi = dpi,height = h,width = w,limitsize = FALSE)
}
#' Export Signature Fit with bootstrap to JSON
#'
#' Given a res file obtained from the SignatureFit_withBootstrap or
#' SignatureFit_withBootstrap_Analysis function, export it to a set of
#' JSON files that can be used for web visualisation
#'
#' @param outdir output directory where the output JSON files should be saved, remember to put "/" at the end of the folder name.
#' @param res R object obtained from the SignatureFit_withBootstrap or SignatureFit_withBootstrap_Analysis function
#' @keywords mutational signatures fit JSON
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(cat = rnd_matrix,
#' signature_data_matrix = cosmic30,
#' nboot = 5,
#' threshold_percent = 0.1,
#' threshold_p.value = 0.1)
#' export_SignatureFit_withBootstrap_to_JSON("JSON_out/",res)
export_SignatureFit_withBootstrap_to_JSON <- function(outdir,res){
dir.create(outdir,showWarnings = FALSE,recursive = TRUE)
#write the consensus exposures file with metadata
consensus_file <- paste0(outdir,"consensus.json")
sink(file = consensus_file)
cat("{\n")
cat("\t\"nboots\": ",res$nboots,",\n",sep = "")
cat("\t\"threshold_percent\": ",res$threshold_percent,",\n",sep = "")
cat("\t\"threshold_p.value\": ",res$threshold_p.value,",\n",sep = "")
cat("\t\"method\": \"",res$method,"\",\n",sep = "")
cat("\t\"consensus\": {\n",sep = "")
for(i in 1:ncol(res$E_median_filtered)){
sname <- colnames(res$E_median_filtered)[i]
cat("\t\t\"",sname,"\": {\n",sep = "")
for(j in 1:nrow(res$E_median_filtered)){
rname <- rownames(res$E_median_filtered)[j]
cat("\t\t\t\"",rname,"\": ",res$E_median_filtered[j,i],sep = "")
if(j<nrow(res$E_median_filtered)){
cat(",\n")
}else{
cat("\n")
}
}
cat("\t\t}")
if(i<ncol(res$E_median_filtered)){
cat(",\n")
}else{
cat("\n")
}
}
cat("\t}\n")
cat("}\n")
sink()
#write the bootstrap exposures file
boot_file <- paste0(outdir,"bootstraps.json")
sink(file = boot_file)
cat("[\n")
for (b in 1:length(res$boot_list)){
data_mat <- res$boot_list[[b]]
cat("\t{\n")
for(i in 1:ncol(data_mat)){
sname <- colnames(data_mat)[i]
cat("\t\t\"",sname,"\": {\n",sep = "")
for(j in 1:nrow(data_mat)){
rname <- rownames(data_mat)[j]
cat("\t\t\t\"",rname,"\": ",data_mat[j,i],sep = "")
if(j<nrow(data_mat)){
cat(",\n")
}else{
cat("\n")
}
}
cat("\t\t}")
if(i<ncol(data_mat)){
cat(",\n")
}else{
cat("\n")
}
}
cat("\t}")
if(b<length(res$boot_list)){
cat(",\n")
}else{
cat("\n")
}
}
cat("]\n")
sink()
#write the correlation of exposures file for each sample
for (s in 1:ncol(res$E_median_filtered)){
if (nrow(res$samples_list[[s]])>1){
sname <- colnames(res$E_median_filtered)[s]
data_mat <- cor(t(res$samples_list[[s]]),method = "spearman")
data_mat[is.na(data_mat)] <- 0
data_mat[row(data_mat)+(ncol(data_mat)-col(data_mat))>=ncol(data_mat)] <- 0
cor_file <- paste0(outdir,sname,"_correlation.tsv")
sink(file = cor_file)
cat("Signature")
for (j in colnames(data_mat)) cat("\t",j,sep = "")
cat("\n")
for(i in 1:nrow(data_mat)){
cat(row.names(data_mat)[i])
for (j in 1:ncol(data_mat)) cat("\t",data_mat[i,j],sep = "")
cat("\n")
}
sink()
}
}
}
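## Alternative sketch (not run, not used by this function): the consensus
## exposures could also be serialised with the jsonlite package instead of
## writing JSON by hand with sink()/cat(). Assumes jsonlite is installed;
## object names below are placeholders.
# mat <- res$E_median_filtered
# consensus <- lapply(colnames(mat), function(s) as.list(mat[,s]))
# names(consensus) <- colnames(mat)
# out <- list(nboots = res$nboots,
#             threshold_percent = res$threshold_percent,
#             threshold_p.value = res$threshold_p.value,
#             method = res$method,
#             consensus = consensus)
# writeLines(jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE),
#            paste0(outdir,"consensus_jsonlite.json"))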
#' Distribution of Signatures in Samples
#'
#' Given a catalogue of samples and an exposures table, compute the relative amount of each signature
#' in each sample and the unassigned mutations. Also cluster the samples with hierarchical clustering
#' with average linkage and order the samples according to the clustering. Optionally, plot to file.
#'
#' @param fileout if specified, generate a plot, otherwise no plot is generated
#' @param catalogue original catalogue, channels as rows and samples as columns
#' @param exposures exposures/activities of signatures in each sample. Signatures as rows, samples as columns
#' @keywords unexplained samples
#' @return list with two objects: the matrix of the distribution of the signatures in the samples and the hierarchical clustering object
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(cat = catalogue,
#' signature_data_matrix = cosmic30,
#' nboot = 5,
#' threshold_percent = 0.1,
#' threshold_p.value = 0.1)
#' distribution_object <- exposureDistributionBarplot(catalogue=catalogue,
#' exposures=res$E_median_filtered)
exposureDistributionBarplot <- function(fileout=NULL,catalogue,exposures){
total_mut_in_exp <- apply(exposures,2,sum)
total_mut_in_cat <- apply(catalogue,2,sum)
unassigned_mut <- total_mut_in_cat - total_mut_in_exp
unassigned_mut[unassigned_mut < 0] <- 0
plot_matrix <- rbind(exposures,unassigned_mut)
plot_matrix <- plot_matrix/matrix(rep(total_mut_in_cat,nrow(plot_matrix)),byrow = TRUE,nrow = nrow(plot_matrix))*100
#cluster samples
d <- dist(t(plot_matrix), method = "euclidean") # distance matrix
fit <- hclust(d, method="average")
#order according to clustering
plot_matrix <- plot_matrix[,fit$order]
kelly_colors <- c('F2F3F4', '222222', 'F3C300', '875692', 'F38400', 'A1CAF1', 'BE0032',
'C2B280', '848482', '008856', 'E68FAC', '0067A5', 'F99379', '604E97',
'F6A600', 'B3446C', 'DCD300', '882D17', '8DB600', '654522', 'E25822', '2B3D26','CCCCCC','CCCCCC','CCCCCC')
kelly_colors <- paste0("#",kelly_colors)
if(!is.null(fileout)){
jpeg(filename = fileout,width = max(1800,200+ncol(exposures)*3),height = 1000,res = 200)
par(mar=c(4,4,3,5),mgp=c(1.5,0.5,0))
barplot(plot_matrix,col = c(kelly_colors[1:nrow(exposures)],"grey"),
border = NA,ylab = "% of mutations",xlab = "samples",xaxt="n",space=0)
legend(x="topright",title = "Signatures",legend = c(rownames(exposures),"other"),fill = c(kelly_colors[1:nrow(exposures)],"grey"),bty = "n",inset = c(-0.08,0),xpd = TRUE)
dev.off()
}
res <- list()
res$distribution_matrix <- plot_matrix
res$clustering_object <- fit
return(res)
}
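## Usage sketch (not run): hypothetical call plotting the per-sample signature
## distribution to file; object names (catalogue, exposures) are placeholders
## with matching sample columns.
# dist_res <- exposureDistributionBarplot(fileout = "exposure_distribution.jpg",
#                                         catalogue = catalogue,
#                                         exposures = exposures)
# dist_res$distribution_matrix #signatures (+ unassigned) x samples, in percent
# dist_res$clustering_object   #hclust object (average linkage, Euclidean)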
#' Estimate samples not fully explained by signature fit
#'
#' Given a catalogue of samples, signatures and exposures, compute the sum of the absolute deviations (SAD)
#' between the original catalogue and the reconstructed samples (i.e. signatures x exposures) and
#' normalise this sum by the total number of mutations in the sample. Then, for each sample, compare its
#' normalised SAD to the normalised SAD of the other samples and check if it is significantly different. In practice,
#' a p-value is computed fitting a gaussian distribution to the other samples.
#'
#' @param fileout if specified, generate a plot, otherwise no plot is generated
#' @param catalogue original catalogue, channels as rows and samples as columns
#' @param sigs mutational signatures used for fitting, channels as rows, signatures as columns
#' @param exposures exposures/activities of signatures in each sample. Signatures as rows, samples as columns
#' @param pvalue_threshold threshold for statistical significance
#' @keywords unexplained samples
#' @return table of unexplained samples
#' @export
#' @examples
#' res <- SignatureFit_withBootstrap(cat = catalogue,
#' signature_data_matrix = cosmic30,
#' nboot = 5,
#' threshold_percent = 0.1,
#' threshold_p.value = 0.1)
#' s_table <- unexplainedSamples(catalogue=catalogue,
#' sigs=cosmic30,
#' exposures=res$E_median_filtered)
unexplainedSamples <- function(fileout=NULL,catalogue,sigs,exposures,pvalue_threshold=0.01){
reconstructed <- as.matrix(sigs) %*% as.matrix(exposures)
#look at various metrics, normalised by number of mutation
norm_SAD <- c()
for (i in 1:ncol(catalogue)){
norm_SAD <- c(norm_SAD,sum(abs(reconstructed[,i] - catalogue[,i]))/sum(catalogue[,i]))
}
pval_norm_SAD <- c()
for (i in 1:length(norm_SAD)){
m <- mean(norm_SAD[-i])
s <- sd(norm_SAD[-i])
pval_norm_SAD <- c(pval_norm_SAD,1 - pnorm(norm_SAD[i],mean = m,sd = s,lower.tail = TRUE))
}
which_significant <- which(pval_norm_SAD<pvalue_threshold)
#plot the outliers
if(!is.null(fileout)){
jpeg(filename = fileout,width = 1200,height = 800,res = 200)
par(mar=c(4,4,5,2))
plot(norm_SAD,
col=rgb(0.5,0.5,0.5,0.5),
pch = 16, ylab = "SAD/n. mutations",xlab = "samples",main = paste0("Normalised Sum of Absolute Deviations (SAD)\n between original and reconstructed catalogue"))
points(which_significant,norm_SAD[which_significant],col="red",pch = 16)
legend(x="topleft",legend = c(paste0("significantly higher (p-value<",pvalue_threshold,")")),col = "red",pch = 16,cex = 0.9,bty = "n",inset = c(0,-0.14),xpd = TRUE)
dev.off()
}
#also write the outliers
res <- data.frame(index=which_significant,sample=colnames(catalogue)[which_significant],normSAD=norm_SAD[which_significant])
return(res)
}
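## Usage sketch (not run): hypothetical call flagging samples whose normalised
## sum of absolute deviations (SAD) is significantly higher than that of the
## other samples; object names (catalogue, cosmic30, res) are placeholders.
# poor_fits <- unexplainedSamples(fileout = "unexplained_samples.jpg",
#                                 catalogue = catalogue,
#                                 sigs = cosmic30,
#                                 exposures = res$E_median_filtered,
#                                 pvalue_threshold = 0.01)
# #poor_fits is a data.frame with columns index, sample and normSAD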
#' plot_SignatureFit_withBootstrap
#'
#' Generate plots from the output of SignatureFit_withBootstrap.
#'
#' @param outdir output directory for the plot
#' @param boostrapFit_res output of SignatureFit_withBootstrap
#' @param type_of_mutations type of mutations: subs, rearr, or generic
#' @export
plot_SignatureFit_withBootstrap <- function(outdir,
boostrapFit_res,
type_of_mutations){
res <- boostrapFit_res
#function to draw a legend for the heatmap of the correlation matrix
draw_legend <- function(col,xl,xr,yb,yt){
par(xpd=TRUE)
rect(xl,yb,xr,yt)
rect(
xl,
head(seq(yb,yt,(yt-yb)/length(col)),-1),
xr,
tail(seq(yb,yt,(yt-yb)/length(col)),-1),
col=col,border = NA
)
text(x = 1.2, y = yt,labels = "1")
text(x = 1.2, y = (yt+yb)/2,labels = "0")
text(x = 1.2, y = yb,labels = "-1")
par(xpd=FALSE)
}
reconstructed_with_median <- round(as.matrix(res$signature_data_matrix) %*% res$E_median_filtered)
#provide a series of plots for each sample
#plot_nrows <- ncol(cat)
rows_ordered_from_best <- order(res$KLD_samples)
plot_nrows <- 2
plot_ncol <- 4
nplots <- plot_nrows*plot_ncol
howmanyplots <- ncol(res$cat)
plotsdir <- paste0(outdir,"SigFit_withBootstrap_Summary_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,"/")
dir.create(plotsdir,recursive = TRUE,showWarnings = FALSE)
for(p in 1:howmanyplots){
current_samples <- p
jpeg(filename = paste0(plotsdir,"sigfit_bootstrap_",p,"of",howmanyplots,".jpg"),
width = 640*(plot_ncol),
height = 480*plot_nrows,
res = 150)
par(mfrow=c(plot_nrows,plot_ncol))
for(i in current_samples){
unassigned_mut <- sprintf("%.2f",(sum(res$cat[,i,drop=FALSE]) - sum(reconstructed_with_median[,i,drop=FALSE]))/sum(res$cat[,i,drop=FALSE])*100)
percentdiff <- sprintf("%.2f",sum(abs(res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE]))/sum(res$cat[,i,drop=FALSE])*100)
cos_sim <- sprintf("%.2f",cos.sim(res$cat[,i,drop=FALSE],reconstructed_with_median[,i,drop=FALSE]))
if(type_of_mutations=="subs"){
#1 original
plotSubsSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(6,3,5,2))
if(sum(res$cat[,i,drop=FALSE])>0){
#2 reconstructed
plotSubsSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(6,3,5,2))
#3 difference
#plotSubsSignatures(signature_data_matrix = cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference, ",percentdiff,"%"),mar=c(6,3,5,2))
plotSubsSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(6,3,5,2),plot_sum = FALSE)
}
}else if(type_of_mutations=="rearr"){
#1 original
plotRearrSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(12,3,5,2))
if(sum(res$cat[,i,drop=FALSE])>0){
#2 reconstructed
plotRearrSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(12,3,5,2))
#3 difference
#plotRearrSignatures(signature_data_matrix = cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference, ",percentdiff,"%"),mar=c(12,3,5,2))
plotRearrSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(12,3,5,2),plot_sum = FALSE)
}
}else if(type_of_mutations=="generic"){
#1 original
plotGenericSignatures(signature_data_matrix = res$cat[,i,drop=FALSE],add_to_titles = "Catalogue",mar=c(6,3,5,2))
if(sum(res$cat[,i,drop=FALSE])>0){
#2 reconstructed
plotGenericSignatures(signature_data_matrix = reconstructed_with_median[,i,drop=FALSE],add_to_titles = "Model",mar=c(6,3,5,2))
#3 difference
#plotGenericSignatures(signature_data_matrix = cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference, ",percentdiff,"%"),mar=c(6,3,5,2))
plotGenericSignatures(signature_data_matrix = res$cat[,i,drop=FALSE] - reconstructed_with_median[,i,drop=FALSE],add_to_titles = paste0("Difference\n(CosSim ",cos_sim,", Unassigned ",unassigned_mut,"%)"),mar=c(6,3,5,2),plot_sum = FALSE)
}
}
if(sum(res$cat[,i,drop=FALSE])>0){
#4 bootstraps
par(mar=c(6,4,5,2))
boxplot(t(res$samples_list[[i]]),las=3,cex.axis=0.9,
ylab="n mutations",
ylim=c(0,max(res$samples_list[[i]])),
main=paste0("Exposures, of ",colnames(res$E_median_filtered)[i],"\nthreshold=",res$threshold_percent,"%, p-value=",res$threshold_p.value,", n=",res$nboot))
points(1:length(res$E_median_filtered[,i,drop=FALSE]),res$E_median_filtered[,i,drop=FALSE],col="red")
abline(h=res$threshold_percent/100*sum(res$cat[,i,drop=FALSE]),col="green")
legend(x="topleft",legend = c("consensus exposures"),col = "red",pch = 1,cex = 0.9,bty = "n",inset = c(0,-0.14),xpd = TRUE)
legend(x="topright",legend = c("threshold"),col = "green",lty = 1,cex = 0.9,bty = "n",inset = c(0,-0.14),xpd = TRUE)
if(ncol(res$signature_data_matrix)>1){
#5 top correlated signatures
res.cor <- suppressWarnings(cor(t(res$samples_list[[i]]),method = "spearman"))
res.cor_triangular <- res.cor
res.cor_triangular[row(res.cor)+(ncol(res.cor)-col(res.cor))>=ncol(res.cor)] <- 0
res.cor_triangular_label <- matrix(sprintf("%0.2f",res.cor_triangular),nrow = nrow(res.cor_triangular))
res.cor_triangular_label[row(res.cor)+(ncol(res.cor)-col(res.cor))>=ncol(res.cor)] <- ""
par(mar=c(6,8,5,6))
par(xpd=FALSE)
col<- colorRampPalette(c("blue", "white", "red"))(51)
image(res.cor_triangular,col = col,zlim = c(-1,1), axes=F,main="Exposures Correlation (spearman)")
extrabit <- 1/(ncol(res$signature_data_matrix)-1)/2
abline(h=seq(0-extrabit,1+extrabit,length.out = ncol(res$signature_data_matrix)+1),col="grey",lty=2)
abline(v=seq(0-extrabit,1+extrabit,length.out = ncol(res$signature_data_matrix)+1),col="grey",lty=2)
axis(2,at = seq(0,1,length.out = ncol(res$signature_data_matrix)),labels = colnames(res$signature_data_matrix),las=1,cex.lab=0.8)
axis(1,at = seq(0,1,length.out = ncol(res$signature_data_matrix)),labels = colnames(res$signature_data_matrix),las=2,cex.lab=0.8)
draw_legend(col,1.25,1.3,0,1)
#6 some correlation plots
#pos <- which(max(abs(res.cor_triangular))==abs(res.cor_triangular),arr.ind = TRUE)
vals <- res.cor_triangular[order(abs(res.cor_triangular),decreasing = TRUE)]
for (j in 1:(nplots-5)){
pos <- which(vals[j]==res.cor_triangular,arr.ind = TRUE)
mainpar <- paste0("Exposures across bootstraps, n=",res$nboot,"\nspearman correlation ",sprintf("%.2f",vals[j]))
plot(res$samples_list[[i]][pos[1],],res$samples_list[[i]][pos[2],],
xlab = colnames(res$signature_data_matrix)[pos[1]],
ylab = colnames(res$signature_data_matrix)[pos[2]],
# ylim = c(0,max(res$samples_list[[i]][pos[2],])),
# xlim = c(0,max(res$samples_list[[i]][pos[1],]))
main=mainpar,col="blue",pch = 16)
}
#sig.pca <- prcomp(t(res$samples_list[[i]]),center = TRUE,scale. = TRUE)
}
}
}
dev.off()
}
# res$E_median_filtered[,i,drop=FALSE]
#also plot and save exposures
sums_exp <- apply(res$cat, 2, sum)
exposures <- res$E_median_filtered
denominator <- matrix(sums_exp,nrow = nrow(exposures),ncol = ncol(exposures),byrow = TRUE)
file_table_exp <- paste0(outdir,"SigFit_withBootstrap_Exposures_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,".tsv")
file_plot_exp <- paste0(outdir,"SigFit_withBootstrap_Exposures_m",res$method,"_bfm",res$bf_method,"_alpha",res$alpha,"_tr",res$threshold_percent,"_p",res$threshold_p.value,".jpg")
plot.CosSimMatrix(as.data.frame((exposures/denominator*100)),output_file = file_plot_exp)
write.table(exposures,file = file_table_exp,
sep = "\t",col.names = TRUE,row.names = TRUE,quote = FALSE)
}