#Fork of SigHunter from Sandro Morganella 2017
#Andrea Degasperi, ad923@cam.ac.uk
#' Mutational Signatures Extraction
#'
#' Perform signature extraction, by applying NMF to the input matrix. Multiple NMF runs and bootstrapping is used for robustness, followed by clustering of the solutions. A range of number of signatures to be used is required.
#'
#' @param cat matrix with samples as columns and channels as rows
#' @param outFilePath path where the extraction output files should go. Remember to add "/" at the end of the path
#' @param blacklist list of samples (column names) to ignore
#' @param nrepeats how many runs for each bootstrap (if filterBestOfEachBootstrap=TRUE with default params, only at most 10 runs within 0.1 percent of best will be considered, so nrepeats should be at least 10)
#' @param nboots how many bootstrapped catalogues to use
#' @param clusteringMethod choose among {"HC","PAM","MC"}, hierarchical clustering (HC), partitioning around the medoids (PAM) and matched clustering (MC)
#' @param completeLinkageFlag if clusteringMethod="HC", use complete linkage instead of default average linkage
#' @param useMaxMatching if clusteringMethod="MC", use the assignment problem algorithm (match with max similarity) instead of the stable matching algorithm (any stable match)
#' @param filterBestOfEachBootstrap if TRUE only at most filterBest_nmaxtokeep of the nrepeats runs that are within filterBest_RTOL*best from the best are kept
#' @param filterBest_RTOL relative tolerance from best fit to consider a run as good as the best, RTOL=0.001 is recommended
#' @param filterBest_nmaxtokeep max number of runs that should be kept that are within the relative tolerance from the best
#' @param nparallel how many processing units to use
#' @param nsig list of number of signatures to try
#' @param mut_thr threshold of mutations to remove empty/almost empty rows and columns
#' @param type_of_extraction choose among {"subs","rearr","generic"}
#' @param project give a name to your project
#' @param parallel set to TRUE to use parallel computation (Recommended)
#' @param nmfmethod choose among {"brunet","lee","nsNMF"}, this choice will be passed to the NMF::nmf function
#' @param removeDuplicatesInCatalogue remove 0.99 cos sim similar samples
#' @param normaliseCatalogue scale samples to sum to 1
#' @param plotCatalogue also plot the catalogue, this may crash the library if the catalogue is too big, should work up to ~300 samples
#' @param plotResultsFromAllClusteringMethods if TRUE, all clustering methods are used and results are reported and plotted for all of them. If FALSE, only the requested clustering is reported
#' @return result files will be available in the outFilePath directory
#' @keywords signature extraction
#' @export
#' @examples
#' n_row <- 96
#' n_col <- 50
#' rnd_matrix <- round(matrix(runif(n_row*n_col,min = 0,max = 50),nrow = n_row,ncol = n_col))
#' colnames(rnd_matrix) <- paste0("C",1:n_col)
#' row.names(rnd_matrix) <- paste0("R",1:n_row)
#' SignatureExtraction(cat = rnd_matrix,
#' outFilePath = paste0("extraction_test_subs/"),
#' nrepeats = 10,
#' nboots = 2,
#' nparallel = 2,
#' nsig = 2:3,
#' mut_thr = 0,
#' type_of_extraction = "subs",
#' project = "test",
#' parallel = TRUE,
#' nmfmethod = "brunet")
SignatureExtraction <- function(cat, # matrix with samples as columns and channels as rows
                                outFilePath, # path where the extraction output files should go. Remember to add "/" at the end of the path
                                blacklist = c(), # list of samples (column names) to ignore
                                nrepeats = 10, # how many NMF runs for each bootstrap (if filterBestOfEachBootstrap=TRUE with default params, only at most 10 runs within 0.1% of best will be considered, so nrepeats should be at least 10)
                                nboots = 20, # how many bootstrapped catalogues to use
                                clusteringMethod = "PAM", # one of {"HC","PAM","MC"}: hierarchical clustering (HC), partitioning around the medoids (PAM), matched clustering (MC)
                                completeLinkageFlag = FALSE, # if clusteringMethod="HC", use complete linkage instead of default average linkage
                                useMaxMatching = TRUE, # if clusteringMethod="MC", use the assignment problem algorithm (max similarity match) instead of the stable matching algorithm
                                filterBestOfEachBootstrap = TRUE, # if TRUE only at most filterBest_nmaxtokeep of the nrepeats runs that are within filterBest_RTOL*best from the best are kept
                                filterBest_RTOL = 0.001, # relative tolerance from best fit to consider a run as good as the best
                                filterBest_nmaxtokeep = 10, # max number of runs that should be kept that are within the relative tolerance from the best
                                nparallel = 1, # how many processing units to use
                                nsig = c(3:15), # range of numbers of signatures to try
                                mut_thr = 0, # threshold of mutations to remove empty/almost empty rows and columns
                                type_of_extraction = "subs", # one of {"subs","rearr","generic"}
                                project = "extraction", # give a name to your project
                                parallel = FALSE, # set to TRUE to use parallel computation (recommended)
                                nmfmethod = "brunet", # one of {"brunet","lee","nsNMF"}, passed to NMF::nmf
                                removeDuplicatesInCatalogue = FALSE, # remove 0.99 cos sim similar samples
                                normaliseCatalogue = FALSE, # scale samples to sum to 1
                                plotCatalogue = FALSE, # also plot the catalogue; may crash for very large catalogues (works up to ~300 samples)
                                plotResultsFromAllClusteringMethods = TRUE) { # if TRUE, all clustering methods are reported and plotted; if FALSE, only the requested one
  library(NMF)
  # BUGFIX: %dopar% was used unqualified but foreach may not be attached
  # (doMC::registerDoMC does not attach it); bind the operator locally.
  `%dopar%` <- foreach::`%dopar%`
  # tmp for debug: remember the requested output path so it can be restored
  # at the start of each ns iteration when the completeLinkage debug mode is on
  if (completeLinkageFlag) tmp_outFilePath <- outFilePath
  # remove blacklisted samples
  if (length(blacklist) > 0) {
    cat <- cat[, setdiff(colnames(cat), blacklist)]
  }
  group <- project
  doMC::registerDoMC(nparallel)
  dir.create(outFilePath, showWarnings = FALSE, recursive = TRUE)
  message("\n------------- START COMPUTATION ------------\n")
  ## Read the catalogue
  message("[STEP 1]: Reading and Preprocessing the Catalogue...", appendLF = FALSE)
  if (type_of_extraction == "subs") cat <- sortCatalogue(cat)
  if (removeDuplicatesInCatalogue) {
    cat <- removeSimilarCatalogueSamples(cat)
  }
  # Save catalogue used
  nsamples <- ncol(cat)
  if (plotCatalogue) {
    cat_file <- paste0(outFilePath, "CatalogueUsedAfterPreprocessing_plot_", project, ".jpg")
    if (type_of_extraction == "subs") {
      plotSubsSignatures(cat, cat_file, plot_sum = TRUE, overall_title = paste0("Catalogue - ", project, " (", nsamples, " samples)"))
    } else if (type_of_extraction == "rearr") {
      plotRearrSignatures(cat, cat_file, plot_sum = TRUE, overall_title = paste0("Catalogue - ", project, " (", nsamples, " samples)"))
    } else if (type_of_extraction == "generic") {
      plotGenericSignatures(cat, cat_file, plot_sum = TRUE, overall_title = paste0("Catalogue - ", project, " (", nsamples, " samples)"))
    }
  }
  cat_file <- paste0(outFilePath, "CatalogueUsedAfterPreprocessing_plot_", group, ".txt")
  write.table(cat, file = cat_file,
              sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
  nrow_cat <- nrow(cat)
  # remove channels if necessary (channel removal itself is currently disabled,
  # so channelsRemoved stays FALSE unless the commented call below is restored)
  all_rows_cat <- cat
  # cat <- preprocessCatalgue(cat, mut_thr)
  channelsRemoved <- FALSE
  if (nrow(all_rows_cat) > nrow(cat)) channelsRemoved <- TRUE
  # normalised catalogue for computing replicates later
  ncat <- cat
  if (normaliseCatalogue) ncat <- normaliseSamples(cat)
  # NMF rank cannot exceed the number of samples, so drop infeasible requests.
  # BUGFIX: the original subset used the scalar condition
  # nsig[length(nsig)] <= ncol(cat), which kept either all ranks or none.
  if (any(nsig > ncol(cat))) {
    nsig <- nsig[nsig <= ncol(cat)]
  }
  message("DONE")
  message("\t> ", ncol(cat), " Samples and ", nrow_cat, " Mutations Correctly Processed")
  if (mut_thr > 0) {
    message("\t> ", nrow_cat - nrow(cat), " Mutation(s) Channel(s) Removed (<", mut_thr, ")")
  }
  # NOTE(review): nmuts and sample_names appear unused below — kept for
  # compatibility in case downstream/debug code relies on them.
  nmuts <- apply(cat, 2, sum)
  sample_names <- colnames(cat)
  ###### Find Signature ######
  message("\n[STEP 2]: Finding Signatures")
  ## Iterate Until Convergence (currently a single pass: complete is set TRUE
  ## unconditionally at the end of the loop body)
  complete <- FALSE
  round <- 1
  solutions <- matrix(0, 0, 4)
  colnames(solutions) <- c("Round", "Optim_Nsig", "Estimate", "NumUncorrelated")
  prev_num_uncorr <- ncol(cat)
  while (complete == FALSE) {
    message("\t> Running round ", round)
    ## Create Directory (portable replacement for system("mkdir -p ..."))
    outDir <- paste(outFilePath, "round_", round, "/", sep = "")
    dir.create(outDir, showWarnings = FALSE, recursive = TRUE)
    err <- c()
    cos_sim <- c()
    average_corr_smpls <- c()
    sil <- c()
    # additional metrics, one entry appended per ns in nsig
    ave.RMSE <- c()
    sd.RMSE <- c()
    ave.KLD <- c()
    sd.KLD <- c()
    ave.RMSE.orig <- c()
    sd.RMSE.orig <- c()
    ave.KLD.orig <- c()
    sd.KLD.orig <- c()
    ave.SilWid.hclust <- c()
    ave.SilWid.PAM <- c()
    ave.SilWid.MC <- c()
    cophenetic.corr.hclust <- c()
    min.MinWCCS.hclust <- c()
    min.MinWCCS.PAM <- c()
    min.MinWCCS.MC <- c()
    max.MaxBCCS.hclust <- c()
    max.MaxBCCS.PAM <- c()
    max.MaxBCCS.MC <- c()
    mmcs_pam <- c()
    mmcs_hclust <- c()
    mmcs_MC <- c()
    ## Run Hunter on the specified range of signatures
    for (ns in nsig) {
      # tmp for debug
      if (completeLinkageFlag) outFilePath <- tmp_outFilePath
      outNsDir <- paste(outDir, "sig_", ns, "/", sep = "")
      dir.create(outNsDir, showWarnings = FALSE, recursive = TRUE)
      message("\n> Running for ", ns, " Signatures (", nboots, " bootstraps)")
      strt <- Sys.time()
      #---------Bootstraps start here
      # Define smoothing matrix S in case nsNMF is used
      if (nmfmethod == "nsNMF") S <- diag(ns) * 0.5 + matrix(1, nrow = ns, ncol = ns) * 0.5 / ns
      # bootstraps file: cache of the NMF solutions for this (ns, nboots) pair
      bootstraps_file <- paste0(outFilePath, "bootstraps_", group, "_ns", ns, "_nboots", nboots, ".Rdata")
      if (file.exists(bootstraps_file)) {
        load(bootstraps_file)
        message("bootstraps file loaded")
      } else {
        ## Collect the solutions: exposures (e_boot) and signatures (p_boot)
        ## are accumulated column-wise, ns columns per kept NMF run
        e_boot <- matrix(0, ncol(cat), 0)
        p_boot <- matrix(0, nrow(cat), 0)
        err_boot <- c()
        boot_tracker <- c() # which bootstrap each kept signature column came from
        boot_cat <- list()
        ## Run NMF on nboots bootstrapped catalogues
        if (parallel == TRUE) { ## Parallel
          nseq <- ceiling(nboots / nparallel)
          # first generate and store the bootstrapped catalogues
          # (nseq*nparallel may exceed nboots; the excess is trimmed below)
          for (i in seq_len(nseq * nparallel)) {
            boot_cat[[i]] <- generateRandMuts(cat)
            if (normaliseCatalogue) boot_cat[[i]] <- normaliseSamples(boot_cat[[i]])
          }
          for (tt in seq_len(nseq)) {
            message(".", appendLF = FALSE)
            boots_list <- foreach::foreach(i = 1:nparallel) %dopar% {
              rnd_cat <- boot_cat[[(tt - 1) * nparallel + i]]
              rnd_cat <- preprocessCatalgue(rnd_cat, mut_thr) # remove channels with 0 mutations
              NMF::nmf(rnd_cat, rank = ns, nrun = nrepeats, method = nmfmethod, .options = "k-p")
            }
            # Extract data immediately to save memory
            for (i in seq_along(boots_list)) {
              nmf_res <- boots_list[[i]]
              best_residual <- NMF::residuals(nmf_res)
              residuals_list <- c()
              if (length(nmf_res) == 1) {
                residuals_list <- NMF::residuals(nmf_res)
              } else {
                for (j in seq_along(nmf_res@.Data)) {
                  # avoid numerical error (minimum cannot be less than 0)
                  residuals_list <- c(residuals_list, max(0, NMF::residuals(nmf_res@.Data[[j]])))
                }
              }
              if (filterBestOfEachBootstrap) {
                # keep only the runs whose residual is within RTOL of the best
                runsToChooseFrom <- which(best_residual * (1 + filterBest_RTOL) >= residuals_list)
                # take at most filterBest_nmaxtokeep
                if (length(runsToChooseFrom) > filterBest_nmaxtokeep) {
                  runsToChooseFrom <- sample(runsToChooseFrom, filterBest_nmaxtokeep)
                }
              } else {
                # just take all the runs
                runsToChooseFrom <- 1:length(nmf_res)
              }
              countRuns <- 1
              for (j in seq_len(length(nmf_res))) {
                if (j %in% runsToChooseFrom) {
                  if (length(nmf_res) == 1) {
                    # add back any channels removed during preprocessing (zero rows)
                    bbb <- NMF::basis(nmf_res)
                    coln <- paste0("b", i, "r", j, "s", 1:ns)
                    p_boot_tmp <- matrix(0, nrow = nrow(cat), ncol = ns, dimnames = list(rownames(cat), coln))
                    p_boot_tmp[rownames(bbb), ] <- bbb
                    if (nmfmethod == "nsNMF") {
                      p_boot <- cbind(p_boot, p_boot_tmp %*% S)
                    } else {
                      p_boot <- cbind(p_boot, p_boot_tmp)
                    }
                    e_boot <- cbind(e_boot, t(NMF::coef(nmf_res)))
                    err_boot <- c(err_boot, NMF::residuals(nmf_res))
                  } else {
                    # add back any channels removed during preprocessing (zero rows)
                    bbb <- NMF::basis(nmf_res@.Data[[j]])
                    coln <- paste0("b", i, "r", j, "s", 1:ns)
                    p_boot_tmp <- matrix(0, nrow = nrow(cat), ncol = ns, dimnames = list(rownames(cat), coln))
                    p_boot_tmp[rownames(bbb), ] <- bbb
                    if (nmfmethod == "nsNMF") {
                      p_boot <- cbind(p_boot, p_boot_tmp %*% S)
                    } else {
                      p_boot <- cbind(p_boot, p_boot_tmp)
                    }
                    e_boot <- cbind(e_boot, t(NMF::coef(nmf_res@.Data[[j]])))
                    err_boot <- c(err_boot, NMF::residuals(nmf_res@.Data[[j]]))
                  }
                  boot_tracker <- c(boot_tracker, rep((tt - 1) * nparallel + i, ns))
                  countRuns <- countRuns + 1
                }
              }
              rm(nmf_res)
              gc()
            }
            rm(boots_list)
            gc()
          }
          # Trim so that only the first nboots bootstraps are kept
          # (the parallel chunks may have produced more than requested)
          selection_of_boots <- boot_tracker <= nboots
          selection_of_boots_runs <- boot_tracker[seq(1, length(boot_tracker), ns)] <= nboots
          p_boot <- p_boot[, selection_of_boots]
          e_boot <- e_boot[, selection_of_boots]
          err_boot <- err_boot[selection_of_boots_runs]
          boot_tracker <- boot_tracker[selection_of_boots]
        } else { ## No Parallel
          for (i in seq_len(nboots)) {
            message(".", appendLF = FALSE)
            boot_cat[[i]] <- generateRandMuts(cat)
            if (normaliseCatalogue) boot_cat[[i]] <- normaliseSamples(boot_cat[[i]])
            rnd_cat <- preprocessCatalgue(boot_cat[[i]], mut_thr) # remove channels with 0 mutations
            nmf_res <- NMF::nmf(rnd_cat, rank = ns, nrun = nrepeats, method = nmfmethod, .options = "k-p")
            # extract immediately to save memory
            best_residual <- NMF::residuals(nmf_res)
            residuals_list <- c()
            if (length(nmf_res) == 1) {
              residuals_list <- NMF::residuals(nmf_res)
            } else {
              for (j in seq_along(nmf_res@.Data)) {
                # avoid numerical error (minimum cannot be less than 0)
                residuals_list <- c(residuals_list, max(0, NMF::residuals(nmf_res@.Data[[j]])))
              }
            }
            if (filterBestOfEachBootstrap) {
              # keep only the runs whose residual is within RTOL of the best
              runsToChooseFrom <- which(best_residual * (1 + filterBest_RTOL) >= residuals_list)
              # take at most filterBest_nmaxtokeep
              if (length(runsToChooseFrom) > filterBest_nmaxtokeep) {
                runsToChooseFrom <- sample(runsToChooseFrom, filterBest_nmaxtokeep)
              }
            } else {
              # just take all the runs
              runsToChooseFrom <- 1:length(nmf_res)
            }
            countRuns <- 1
            for (j in seq_len(length(nmf_res))) {
              if (j %in% runsToChooseFrom) {
                if (length(nmf_res) == 1) {
                  # add back any channels removed during preprocessing (zero rows)
                  bbb <- NMF::basis(nmf_res)
                  coln <- paste0("b", i, "r", j, "s", 1:ns)
                  p_boot_tmp <- matrix(0, nrow = nrow(cat), ncol = ns, dimnames = list(rownames(cat), coln))
                  p_boot_tmp[rownames(bbb), ] <- bbb
                  if (nmfmethod == "nsNMF") {
                    p_boot <- cbind(p_boot, p_boot_tmp %*% S)
                  } else {
                    p_boot <- cbind(p_boot, p_boot_tmp)
                  }
                  e_boot <- cbind(e_boot, t(NMF::coef(nmf_res)))
                  err_boot <- c(err_boot, NMF::residuals(nmf_res))
                } else {
                  # add back any channels removed during preprocessing (zero rows)
                  bbb <- NMF::basis(nmf_res@.Data[[j]])
                  coln <- paste0("b", i, "r", j, "s", 1:ns)
                  p_boot_tmp <- matrix(0, nrow = nrow(cat), ncol = ns, dimnames = list(rownames(cat), coln))
                  p_boot_tmp[rownames(bbb), ] <- bbb
                  if (nmfmethod == "nsNMF") {
                    p_boot <- cbind(p_boot, p_boot_tmp %*% S)
                  } else {
                    p_boot <- cbind(p_boot, p_boot_tmp)
                  }
                  e_boot <- cbind(e_boot, t(NMF::coef(nmf_res@.Data[[j]])))
                  err_boot <- c(err_boot, NMF::residuals(nmf_res@.Data[[j]]))
                }
                boot_tracker <- c(boot_tracker, rep(i, ns))
                countRuns <- countRuns + 1
              }
            }
            rm(nmf_res)
            gc()
          }
        }
        # how many solutions were saved? (ns columns of p_boot per kept run)
        saved_nmf_runs <- ncol(p_boot) / ns
        save(file = bootstraps_file, ns, nboots, nrepeats, e_boot, p_boot, err_boot, cat, all_rows_cat, saved_nmf_runs, boot_tracker, boot_cat)
      }
      # now add back the missing channels and align rows with the full catalogue
      if (channelsRemoved) {
        lmissing <- setdiff(rownames(all_rows_cat), rownames(cat))
        nmissing <- length(lmissing)
        newrows <- matrix(0, nrow = nmissing, ncol = ncol(p_boot))
        colnames(newrows) <- colnames(p_boot)
        rownames(newrows) <- lmissing
        p_boot <- rbind(p_boot, newrows)[rownames(all_rows_cat), ]
      }
      # label columns as sigX_runY so clustering output is traceable
      colnames_p <- c()
      sigs_p <- 1:ns
      for (i in seq_len(saved_nmf_runs)) {
        colnames_p <- c(colnames_p, paste(rep("sig", ns), sigs_p, rep("_run", ns), rep(i, ns), sep = ""))
      }
      colnames(p_boot) <- colnames_p
      names(boot_tracker) <- colnames_p
      # Load the (cosine-)distance matrix if it already exists
      distMatrix_file <- paste0(outFilePath, "distMatrix_", group, "_ns", ns, "_nboots", nboots, ".Rdata")
      if (file.exists(distMatrix_file)) {
        load(distMatrix_file)
        message("distance matrix file loaded")
      } else {
        distMatrix <- 1 - computeCorrelation_parallel(p_boot, nparallel = nparallel, parallel = TRUE)
        save(file = distMatrix_file, distMatrix)
      }
      # change prefix (debug)
      if (completeLinkageFlag) outNsDir <- paste0(outNsDir, "completeLinkage")
      # Cluster the ns*saved_nmf_runs candidate signatures into ns clusters;
      # degenerate cases (single cluster and/or single run) are handled explicitly
      if (ns > 1 & saved_nmf_runs > 1) {
        # Hierarchical clustering
        if (completeLinkageFlag) {
          fit_clust <- stats::hclust(as.dist(distMatrix), method = "complete")
        } else {
          fit_clust <- stats::hclust(as.dist(distMatrix), method = "average")
        }
        # Hierarchical clustering partitioning
        cut_res <- stats::cutree(fit_clust, k = ns)
        # PAM partitioning
        clustering_result <- cluster::pam(as.dist(distMatrix), ns)
        cut_res.PAM <- clustering_result$clustering
        # Matched Clustering
        mc_file <- paste0(outNsDir, "matchedClustering_", group, "_ns", ns, "_nboots", nboots, ".Rdata")
        if (file.exists(mc_file)) {
          load(mc_file)
          message("matched clustering result loaded from file")
        } else {
          cut_res_MC <- matchedClustering(distMatrix, ns, maxMatch = useMaxMatching, parallel = TRUE, nparallel = nparallel)
          save(file = mc_file, cut_res_MC)
        }
        # get medoids
        medoids_hclust <- findMedoidsHclust(distMatrix, cut_res)
        medoids_MC <- findMedoidsHclust(distMatrix, cut_res_MC)
        medoids_PAM <- clustering_result$medoids
      } else if (ns == 1 & saved_nmf_runs > 1) {
        # one signature: everything falls in a single cluster
        cut_res <- rep(1, ncol(distMatrix))
        names(cut_res) <- colnames(distMatrix)
        cut_res.PAM <- cut_res
        cut_res_MC <- cut_res
        medoids_hclust <- findMedoidsHclust(distMatrix, cut_res)
        medoids_MC <- medoids_hclust
        medoids_PAM <- medoids_hclust
      } else if (ns > 1 & saved_nmf_runs == 1) {
        # one run: each signature is its own cluster
        cut_res <- seq(1, ncol(distMatrix))
        names(cut_res) <- colnames(distMatrix)
        cut_res.PAM <- cut_res
        cut_res_MC <- cut_res
        medoids_hclust <- findMedoidsHclust(distMatrix, cut_res)
        medoids_MC <- medoids_hclust
        medoids_PAM <- medoids_hclust
      } else if (ns == 1 & saved_nmf_runs == 1) {
        cut_res <- 1
        names(cut_res) <- colnames(distMatrix)
        cut_res.PAM <- cut_res
        cut_res_MC <- cut_res
        medoids_hclust <- colnames(distMatrix)
        medoids_MC <- medoids_hclust
        medoids_PAM <- medoids_hclust
      }
      # column-normalise candidate signatures (each sums to 1)
      norm_p_boot <- p_boot / matrix(data = rep(apply(p_boot, 2, sum), nrow(p_boot)), nrow = nrow(p_boot), byrow = TRUE)
      # set up per-cluster mean and sd of the signatures, for each clustering method
      mean_signatures <- list()
      sd_signatures <- list()
      for (cm in c("HC", "PAM", "MC")) {
        mean_signatures[[cm]] <- matrix(NA, nrow = nrow(p_boot), ncol = ns)
        sd_signatures[[cm]] <- matrix(NA, nrow = nrow(p_boot), ncol = ns)
        colnames(mean_signatures[[cm]]) <- paste0("S", 1:ns)
        colnames(sd_signatures[[cm]]) <- paste0("S", 1:ns)
        row.names(mean_signatures[[cm]]) <- row.names(p_boot)
        row.names(sd_signatures[[cm]]) <- row.names(p_boot)
      }
      for (cm in c("HC", "PAM", "MC")) {
        if (cm == "HC") {
          partitions <- cut_res
        } else if (cm == "PAM") {
          partitions <- cut_res.PAM
        } else if (cm == "MC") {
          partitions <- cut_res_MC
        }
        for (nsi in 1:ns) {
          pboot_dim <- dim(norm_p_boot[, partitions == nsi, drop = FALSE])
          if (pboot_dim[2] == 1) {
            # singleton cluster: the mean is the signature itself, sd is 0
            mean_signatures[[cm]][, nsi] <- norm_p_boot[, partitions == nsi]
            sd_signatures[[cm]][, nsi] <- 0
          } else {
            mean_signatures[[cm]][, nsi] <- apply(norm_p_boot[, partitions == nsi], 1, mean)
            sd_signatures[[cm]][, nsi] <- apply(norm_p_boot[, partitions == nsi], 1, sd)
            sd_signatures[[cm]][is.na(sd_signatures[[cm]][, nsi]), nsi] <- 0
          }
        }
      }
      if (ns > 1 & saved_nmf_runs > 1) {
        # Compute Silhouettes
        sil_hclust <- summary(cluster::silhouette(cut_res, as.dist(distMatrix)))
        # BUGFIX: the PAM silhouette was computed from the hierarchical
        # partition (cut_res) instead of the PAM partition
        sil_pam <- summary(cluster::silhouette(cut_res.PAM, as.dist(distMatrix)))
        sil_MC <- summary(cluster::silhouette(cut_res_MC, as.dist(distMatrix)))
        plotWithinClusterSilWidth(sil_hclust, sil_pam, sil_MC, outNsDir, group, ns, nboots)
        # compute within cluster cosine similarity
        cosSimPAM <- withinClusterCosSim(clustering_result$clustering, distMatrix, parallel = TRUE)
        cosSimHClust <- withinClusterCosSim(cut_res, distMatrix, parallel = TRUE)
        cosSimMC <- withinClusterCosSim(cut_res_MC, distMatrix, parallel = TRUE)
        plotWithinClusterCosSim(cosSimHClust, cosSimPAM, cosSimMC, outNsDir, group, ns, nboots)
        # compute cophenetic correlation
        coph <- cor(stats::cophenetic(fit_clust), as.dist(distMatrix))
        # Save metrics
        ave.SilWid.hclust <- c(ave.SilWid.hclust, sil_hclust$avg.width)
        ave.SilWid.PAM <- c(ave.SilWid.PAM, sil_pam$avg.width)
        ave.SilWid.MC <- c(ave.SilWid.MC, sil_MC$avg.width)
        cophenetic.corr.hclust <- c(cophenetic.corr.hclust, coph)
        additional_perf_file <- paste0(outNsDir, "Sigs_WithinClusterPerf_", group, "_ns", ns, "_nboots", nboots, ".rData")
        save(file = additional_perf_file, sil_pam, sil_hclust, sil_MC, cosSimPAM, cosSimHClust, cosSimMC)
        # Max Medoids Cosine Similarity (off-diagonal maximum)
        mmcs_pam <- c(mmcs_pam, max(medoids_cosSimMatrix(p_boot, medoids_PAM) - diag(ns)))
        mmcs_hclust <- c(mmcs_hclust, max(medoids_cosSimMatrix(p_boot, medoids_hclust) - diag(ns)))
        mmcs_MC <- c(mmcs_MC, max(medoids_cosSimMatrix(p_boot, medoids_MC) - diag(ns)))
      } else {
        # silhouette/cophenetic metrics are undefined in the degenerate cases
        ave.SilWid.hclust <- c(ave.SilWid.hclust, NA)
        ave.SilWid.PAM <- c(ave.SilWid.PAM, NA)
        ave.SilWid.MC <- c(ave.SilWid.MC, NA)
        cophenetic.corr.hclust <- c(cophenetic.corr.hclust, NA)
        mmcs_pam <- c(mmcs_pam, NA)
        mmcs_hclust <- c(mmcs_hclust, NA)
        mmcs_MC <- c(mmcs_MC, NA)
      }
      # reconstruction error of each kept run, vs its bootstrap catalogue
      # (RMSE/KLD) and vs the original catalogue (RMSE.orig/KLD.orig)
      rmse_list <- c()
      kld_list <- c()
      rmse_orig_list <- c()
      kld_orig_list <- c()
      for (i in seq_len(saved_nmf_runs)) {
        selection <- ((i - 1) * ns + 1):(i * ns)
        current_cat <- boot_cat[[boot_tracker[selection][1]]]
        if (channelsRemoved) {
          current_p <- p_boot[!row.names(p_boot) %in% lmissing, selection]
        } else {
          current_p <- p_boot[, selection]
        }
        current_e <- e_boot[, selection]
        reconstructed_cat <- current_p %*% t(current_e)
        rmse_list <- c(rmse_list, sqrt(sum((current_cat - reconstructed_cat)^2) / (dim(current_cat)[1] * dim(current_cat)[2])))
        kld_list <- c(kld_list, KLD(current_cat, reconstructed_cat))
        rmse_orig_list <- c(rmse_orig_list, sqrt(sum((ncat - reconstructed_cat)^2) / (dim(ncat)[1] * dim(ncat)[2])))
        kld_orig_list <- c(kld_orig_list, KLD(ncat, reconstructed_cat))
      }
      # Save errors
      ave.RMSE <- c(ave.RMSE, mean(rmse_list))
      sd.RMSE <- c(sd.RMSE, sd(rmse_list))
      ave.KLD <- c(ave.KLD, mean(kld_list))
      sd.KLD <- c(sd.KLD, sd(kld_list))
      ave.RMSE.orig <- c(ave.RMSE.orig, mean(rmse_orig_list))
      sd.RMSE.orig <- c(sd.RMSE.orig, sd(rmse_orig_list))
      ave.KLD.orig <- c(ave.KLD.orig, mean(kld_orig_list))
      sd.KLD.orig <- c(sd.KLD.orig, sd(kld_orig_list))
      # use requested medoids
      medoids_final <- medoids_hclust
      if (clusteringMethod == "PAM") medoids_final <- medoids_PAM
      if (clusteringMethod == "MC") medoids_final <- medoids_MC
      # helper: extract the medoid columns from p_boot and normalise each to sum 1
      normaliseMedoidColumns <- function(medoid_cols) {
        m <- p_boot[, medoid_cols, drop = FALSE]
        m / matrix(data = rep(apply(m, 2, sum), nrow(m)), nrow = nrow(m), byrow = TRUE)
      }
      # plot signatures and compute similarity to known signatures
      clustering_list <- ""
      clustering_list_tag <- clusteringMethod
      if (plotResultsFromAllClusteringMethods) clustering_list <- c("", "_HC", "_PAM", "_MC")
      if (plotResultsFromAllClusteringMethods) clustering_list_tag <- c(clusteringMethod, "HC", "PAM", "MC")
      for (cli in seq_along(clustering_list)) {
        cl <- clustering_list[cli]
        cl_tag <- clustering_list_tag[cli]
        signature_names <- paste0("S", 1:length(medoids_final))
        if (cl == "") {
          signature_data_matrix <- normaliseMedoidColumns(medoids_final)
        } else if (cl == "_HC") {
          signature_data_matrix <- normaliseMedoidColumns(medoids_hclust)
        } else if (cl == "_PAM") {
          signature_data_matrix <- normaliseMedoidColumns(medoids_PAM)
        } else if (cl == "_MC") {
          signature_data_matrix <- normaliseMedoidColumns(medoids_MC)
        }
        colnames(signature_data_matrix) <- signature_names
        row.names(signature_data_matrix) <- row.names(p_boot)
        subs_file <- paste0(outNsDir, "Sigs_plot_", group, "_ns", ns, "_nboots", nboots, cl, ".jpg")
        if (type_of_extraction == "subs") {
          plotSubsSignatures_withMeanSd(signature_data_matrix, mean_signatures[[cl_tag]], sd_signatures[[cl_tag]], subs_file, plot_sum = FALSE, overall_title = paste0("Medoids Signatures when extracting ", ns))
        } else if (type_of_extraction == "rearr") {
          plotRearrSignatures_withMeanSd(signature_data_matrix, mean_signatures[[cl_tag]], sd_signatures[[cl_tag]], subs_file, plot_sum = FALSE, overall_title = paste0("Medoids Signatures when extracting ", ns))
        } else if (type_of_extraction == "generic") {
          plotGenericSignatures_withMeanSd(signature_data_matrix, mean_signatures[[cl_tag]], sd_signatures[[cl_tag]], subs_file, plot_sum = FALSE, overall_title = paste0("Medoids Signatures when extracting ", ns))
        }
        subs_file <- paste0(outNsDir, "Sigs_plot_", group, "_ns", ns, "_nboots", nboots, cl, ".tsv")
        write.table(signature_data_matrix, file = subs_file,
                    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        # similarity to known signatures
        if (type_of_extraction == "subs") {
          # find the most similar COSMIC signatures
          res_cosmic <- findClosestCOSMIC30_withSimilarity(signature_data_matrix)
          res_cosmicComb <- findClosestCOSMIC30andCombinations_withSimilarity(signature_data_matrix)
          res_cosmic_table <- data.frame(res_cosmic, res_cosmicComb)
          cosmic_file <- paste0(outNsDir, "Sigs_cosmicSimilar_", group, "_ns", ns, "_nboots", nboots, cl, ".tsv")
          write.table(res_cosmic_table, file = cosmic_file,
                      sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        } else if (type_of_extraction == "rearr") {
          res_rearrBreast560 <- findClosestRearrSigsBreast560_withSimilarity(signature_data_matrix)
          res_rearrBreast560_table <- data.frame(res_rearrBreast560)
          res_rearrBreast560_file <- paste0(outNsDir, "Sigs_rearrBreast560Similar_", group, "_ns", ns, "_nboots", nboots, cl, ".tsv")
          write.table(res_rearrBreast560_table, file = res_rearrBreast560_file,
                      sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        }
      }
      if (ns > 1) {
        # More metrics
        # spread of the clusters: minimum within cluster cosine similarity (MinWCCS)
        MinWCCS.hclust <- minWithinClusterCosSim(clustering = cut_res, distMatrix = distMatrix, parallel = TRUE)
        names(MinWCCS.hclust) <- signature_names
        MinWCCS.MC <- minWithinClusterCosSim(clustering = cut_res_MC, distMatrix = distMatrix, parallel = TRUE)
        names(MinWCCS.MC) <- signature_names
        MinWCCS.PAM <- minWithinClusterCosSim(clustering = clustering_result$clustering, distMatrix = distMatrix, parallel = TRUE)
        names(MinWCCS.PAM) <- signature_names
        MinWCCS_file <- paste0(outNsDir, "MinWithinClusterCosSim_", group, "_ns", ns, "_nboots", nboots, ".tsv")
        write.table(data.frame(MinWCCS.hclust = MinWCCS.hclust, MinWCCS.PAM = MinWCCS.PAM, MinWCCS.MC = MinWCCS.MC), file = MinWCCS_file,
                    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        min.MinWCCS.hclust <- c(min.MinWCCS.hclust, min(MinWCCS.hclust))
        min.MinWCCS.PAM <- c(min.MinWCCS.PAM, min(MinWCCS.PAM))
        min.MinWCCS.MC <- c(min.MinWCCS.MC, min(MinWCCS.MC))
        # cluster neighbours: maximum between cluster cosine similarity (MaxBCCS)
        MaxBCCS.hclust <- maxBetweenClustersCosSim(clustering = cut_res, distMatrix = distMatrix, parallel = TRUE)
        colnames(MaxBCCS.hclust) <- signature_names
        row.names(MaxBCCS.hclust) <- signature_names
        MaxBCCS.MC <- maxBetweenClustersCosSim(clustering = cut_res_MC, distMatrix = distMatrix, parallel = TRUE)
        colnames(MaxBCCS.MC) <- signature_names
        row.names(MaxBCCS.MC) <- signature_names
        MaxBCCS.PAM <- maxBetweenClustersCosSim(clustering = clustering_result$clustering, distMatrix = distMatrix, parallel = TRUE)
        colnames(MaxBCCS.PAM) <- signature_names
        row.names(MaxBCCS.PAM) <- signature_names
        MaxBCCS.hclust_file <- paste0(outNsDir, "MaxBetweenClusterCosSim.hclust_", group, "_ns", ns, "_nboots", nboots, ".tsv")
        write.table(MaxBCCS.hclust, file = MaxBCCS.hclust_file,
                    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        MaxBCCS.PAM_file <- paste0(outNsDir, "MaxBetweenClusterCosSim.PAM_", group, "_ns", ns, "_nboots", nboots, ".tsv")
        write.table(MaxBCCS.PAM, file = MaxBCCS.PAM_file,
                    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        MaxBCCS.MC_file <- paste0(outNsDir, "MaxBetweenClusterCosSim.MC_", group, "_ns", ns, "_nboots", nboots, ".tsv")
        write.table(MaxBCCS.MC, file = MaxBCCS.MC_file,
                    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
        max.MaxBCCS.hclust <- c(max.MaxBCCS.hclust, max(MaxBCCS.hclust - diag(nrow(MaxBCCS.hclust))))
        max.MaxBCCS.PAM <- c(max.MaxBCCS.PAM, max(MaxBCCS.PAM - diag(nrow(MaxBCCS.PAM))))
        max.MaxBCCS.MC <- c(max.MaxBCCS.MC, max(MaxBCCS.MC - diag(nrow(MaxBCCS.MC))))
      } else if (ns == 1) {
        min.MinWCCS.hclust <- c(min.MinWCCS.hclust, NA)
        min.MinWCCS.PAM <- c(min.MinWCCS.PAM, NA)
        min.MinWCCS.MC <- c(min.MinWCCS.MC, NA)
        max.MaxBCCS.hclust <- c(max.MaxBCCS.hclust, NA)
        max.MaxBCCS.PAM <- c(max.MaxBCCS.PAM, NA)
        max.MaxBCCS.MC <- c(max.MaxBCCS.MC, NA)
      }
      #-------------------
      #----- clean this ns
      #-------------------
      rm(p_boot)
      rm(e_boot)
      rm(distMatrix)
      gc()
    }
    # Collect metrics (column numbers referenced by the whattoplot_* vectors below)
    overall_metrics <- data.frame(nsig, #1
                                  ave.RMSE, #2
                                  sd.RMSE, #3
                                  ave.RMSE.orig, #4
                                  sd.RMSE.orig, #5
                                  ave.KLD, #6
                                  sd.KLD, #7
                                  ave.KLD.orig, #8
                                  sd.KLD.orig, #9
                                  ave.SilWid.hclust, #10
                                  ave.SilWid.PAM, #11
                                  ave.SilWid.MC, #12
                                  cophenetic.corr.hclust, #13
                                  min.MinWCCS.hclust, #14
                                  min.MinWCCS.PAM, #15
                                  min.MinWCCS.MC, #16
                                  max.MaxBCCS.hclust, #17
                                  max.MaxBCCS.PAM, #18
                                  max.MaxBCCS.MC, #19
                                  mmcs_hclust, #20
                                  mmcs_pam, #21
                                  mmcs_MC) #22
    # write table with all the metrics
    overall_metrics_file <- paste0(outFilePath, "Sigs_OverallMetrics_", group, "_nboots", nboots, ".tsv")
    write.table(overall_metrics, file = overall_metrics_file,
                sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
    whattoplot_hclust <- c(10, 14, 17, 20, 13)
    whattoplot_PAM <- c(11, 15, 18, 21)
    whattoplot_MC <- c(12, 16, 19, 22)
    # plot requested medoids
    whattoplot_final <- whattoplot_hclust
    if (clusteringMethod == "PAM") whattoplot_final <- whattoplot_PAM
    if (clusteringMethod == "MC") whattoplot_final <- whattoplot_MC
    for (cl in clustering_list) {
      overall_metrics_file <- paste0(outFilePath, "Sigs_OverallMetrics_", group, "_nboots", nboots, cl, ".jpg")
      # select which metric columns to plot for this clustering method
      if (cl == "") {
        whattoplot <- whattoplot_final
      } else if (cl == "_HC") {
        whattoplot <- whattoplot_hclust
      } else if (cl == "_PAM") {
        whattoplot <- whattoplot_PAM
      } else if (cl == "_MC") {
        whattoplot <- whattoplot_MC
      }
      plotOverallMetrics(overall_metrics, whattoplot, overall_metrics_file, group, nboots, nmfmethod)
    }
    complete <- TRUE
  }
  message("\n\nResults can be found in ", outFilePath)
  message("\n------------- COMPUTATION SUCCESSFULLY COMPLETED ------------\n")
}
#############################################
# (removed: scraped webpage footer text that was not valid R code)