R/signatures_utils.R

Defines functions convertSigNamesFromOrganToRefSigs convertSigNamesFromOrganToRefSigs_subroutine convertExposuresFromOrganToRefSigs convertExposuresFromOrganToRefSigsIfAny getCOSMICSignatures getReferenceSignatures getOrganSignatures readTable writeTable RMSE normaliseSamples removeSimilarCatalogueSamples computeCorrelationOfTwoSetsOfSigs KLD findClosestRearrSigsBreast560_withSimilarity findClosestRearrSigsBreast560 findClosestCOSMIC30andCombinations_withSimilarity findClosestCOSMIC30_withSimilarity findClosestCOSMIC30andCombinations findClosestCOSMIC30 plotCosSimSignatures plotRearrSignatures_withMeanSd plotRearrSignatures plotSubsSignatures_withMeanSd plotSubsSignatures plotGenericSignatures_withMeanSd plotGenericSignatures computePropTooSimilar medoids_cosSimMatrix average_medoids_cosSim findMedoidsHclust plotOverallMetrics plotWithinClusterSilWidth plotWithinClusterCosSim plotHierarchicalCluster maxBetweenClustersCosSim minWithinClusterCosSim withinClusterCosSim cos_sim computeCorrelation_parallel computeCorrelation sortCatalogue preprocessCatalgue generateRandMuts

Documented in computeCorrelationOfTwoSetsOfSigs computeCorrelation_parallel convertExposuresFromOrganToRefSigs convertExposuresFromOrganToRefSigsIfAny cos_sim findClosestCOSMIC30 findClosestCOSMIC30andCombinations findClosestCOSMIC30andCombinations_withSimilarity findClosestCOSMIC30_withSimilarity getCOSMICSignatures getOrganSignatures getReferenceSignatures KLD plotCosSimSignatures plotGenericSignatures plotRearrSignatures plotSubsSignatures readTable RMSE sortCatalogue writeTable

#------------------------------------------------
#------- Auxiliary functions ---------------------
#------------------------------------------------

#' @importFrom foreach %dopar%
NULL

## Generate a random replicate of the cataloge
# This method guarantees the total number of signatures is unchanged
generateRandMuts <- function(x,
                             verbose=TRUE){
  #consider the following method as a replacement
  full_r <- matrix(nrow = dim(x)[1],ncol = dim(x)[2])
  colnames(full_r) <- colnames(x)
  row.names(full_r) <- row.names(x)
  for (i in 1:ncol(x)){
    if(sum(x[,i])>=1){
      samples <- sample(1:nrow(x),size = sum(x[,i]),prob = x[,i]/sum(x[,i]),replace = TRUE)
      r <- unlist(lapply(1:nrow(x),function(p) sum(samples==p)))
    }else{ #there are not enough mutations to sample
      if(verbose) message("[warning generateRandMuts] Cannot sample mutations for catalogue in column ",i,", number of mutations is less than 1. Returning the original catalogue.")
      r <- x[,i]
    }
    names(r) <- rownames(x)
    full_r[,i] <- r
  }
  return(full_r)
}

## Remove unused Rows and Columns from the Catalogue
## Remove Mutations with small numbers
preprocessCatalgue <- function(d, mut_thr){

  ## Remove Mutations
  nmut <- apply(d, 1, sum)
  nmut <- nmut/sum(nmut)
  pos <- which(nmut<=mut_thr)
  if(length(pos)>0){
    d <- d[-pos,]
  }
  return(d)
}

#' Sort 96-channel Substitution Catalogues
#'
#' This function sorts a matrix of 96-channel Substitution Catalogues,
#' so that the channels (rows) are in the correct order.
#' where each column is a catalogue and each row is a channel.
#' rownames should be set as the name of the channel in the format
#' 5' base[Normal base>Tumour base]3' base, for example A[C>A]A.
#'
#' @param cat catalogues matrix where each column is a catalogue and each row is a channel. Rownames should be set as the name of the channel in the format 5' base[Normal base>Tumour base]3' base, for example A[C>A]A.
#' @return ordered catalogue
#' @export
sortCatalogue <- function(cat){
  all_bp <- c("A", "C", "G", "T")
  pyr_muts <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
  nm <- c()
  for(m in pyr_muts){
    for(a in all_bp){
      for(b in all_bp){
        nm <- c(nm, paste(a, "[",m, "]", b, sep=""))
      }
    }
  }
  return(cat[nm,,drop=FALSE])

}



computeCorrelation <- function(x){
  if (ncol(x)==1){
    #if there is only one column, correlation matrix is 1
    return(matrix(1, 1, 1))
  }
  out <- matrix(NA, ncol(x), ncol(x))
  #diagonal is 1
  for(i in 1:ncol(x)){
    out[i,i] <- 1
  }
  colnames(out) <- colnames(x)
  rownames(out) <- colnames(x)
  for(i in 2:ncol(x)){
    for(j in 1:(i-1)){ #up to i-1, diag already set to 1
      #message(i, " ", j)
      out[i,j] <- cos_sim(as.numeric(x[,i]), as.numeric(x[,j]))
      out[j,i] <- out[i,j] #upper triangular is the same
    }
  }
  return(out)
}

#' Compute Correlation (parallel)
#'
#' Compute the correlation of a set of signatures/catalogues according to cosine similarity
#'
#' @param x catalogues/signature matrix where each column is a catalogue/signature and each row is a channel.
#' @param nparallel how many parallel processes to use
#' @param parallel set to TRUE in order to use parallel
#' @return correlation matrix
#' @export
computeCorrelation_parallel <- function(x,nparallel=1,parallel=FALSE){
  if (ncol(x)==1){
    #if there is only one column, correlation matrix is 1
    return(matrix(1, 1, 1))
  }
  out <- matrix(NA, ncol(x), ncol(x))
  #diagonal is 1
  for(i in 1:ncol(x)){
    out[i,i] <- 1
  }
  colnames(out) <- colnames(x)
  rownames(out) <- colnames(x)
  #correlation matrix is symmetric
  if (parallel){
    doParallel::registerDoParallel(nparallel)
    par_res <- foreach::foreach(i=2:ncol(x)) %dopar% {
      current_res <- c()
      for(j in 1:(i-1)){ #up to i-1, diag already set to 1
        #message(i, " ", j)
        current_res <- c(current_res,cos_sim(as.numeric(x[,i]), as.numeric(x[,j])))
      }
      current_res
    }
    for(i in 2:ncol(x)){
      out[i,1:(i-1)] <- par_res[[i-1]]
      out[1:(i-1),i] <- par_res[[i-1]]
    }
  }else{
    for(i in 2:ncol(x)){
      for(j in 1:(i-1)){ #up to i-1, diag already set to 1
        #message(i, " ", j)
        out[i,j] <- cos_sim(as.numeric(x[,i]), as.numeric(x[,j]))
        out[j,i] <- out[i,j] #upper triangular is the same
      }
    }
  }
  return(out)
}

#' Cosine Similarity
#'
#' Compute the cosine similarity between two vectors, using the formula sum(a*b)/sqrt(sum(a^2)*sum(b^2)).
#'
#' @param a first vector to compare
#' @param b second vector to compare
#' @return cosine similarity
#' @export
cos_sim <- function(a, b){
  return( sum(a*b)/sqrt(sum(a^2)*sum(b^2)) )
}

#function to compute the average within cluster cosine similarity,
#uses as input the distance matrix constructed as 1 - cosSimMatrix
withinClusterCosSim <- function(clustering,distMatrix,parallel){
  clusters <- unique(clustering)
  clusters <- clusters[order(clusters)]
  res <- c()
  if (parallel==TRUE){
    res <- foreach::foreach(i=clusters) %dopar%{
      samplesInCluster <- names(clustering[clustering==i])
      nSamples <- length(samplesInCluster)
      meanCosSim <- 0
      if(nSamples==1){
        meanCosSim <- 1
      }else{
        combinations <- nSamples*(nSamples - 1)/2
        #for efficiency, instead of summing lots of (1 - dist)
        #start from 1*combinations and subtract the dists
        sumCosSim <- combinations
        for(j in 1:(nSamples-1)){
          sumCosSim <- sumCosSim - sum(distMatrix[samplesInCluster[j],samplesInCluster[1:nSamples > j]])
        }
        meanCosSim <- sumCosSim/combinations
      }
      meanCosSim
    }
    res <- unlist(res)
  }else{
    for(i in clusters){
      samplesInCluster <- names(clustering[clustering==i])
      nSamples <- length(samplesInCluster)
      combinations <- nSamples*(nSamples - 1)/2
      #for efficiency, instead of summing lots of (1 - dist)
      #start from 1*combinations and subtract the dists
      sumCosSim <- combinations
      for(j in 1:(nSamples-1)){
        sumCosSim <- sumCosSim - sum(distMatrix[samplesInCluster[j],samplesInCluster[1:nSamples > j]])
      }
      res <- c(res,sumCosSim/combinations)
    }
  }
  return(res)
}

#function to compute the minimum within cluster cosine similarity,
#uses as input the distance matrix constructed as 1 - cosSimMatrix
minWithinClusterCosSim <- function(clustering,distMatrix,parallel){
  #find max distance and then do minCosSim <- 1 - maxDist
  clusters <- unique(clustering)
  clusters <- clusters[order(clusters)]
  res <- c()
  if (parallel==TRUE){
    res <- foreach::foreach(i=clusters) %dopar%{
      samplesInCluster <- names(clustering[clustering==i])
      nSamples <- length(samplesInCluster)
      maxDist <- 0
      if(nSamples==1){
        #maxDist <- 0
      }else{
        for(j in 1:(nSamples-1)){
          maxDist <- max(maxDist,max(distMatrix[samplesInCluster[j],samplesInCluster[1:nSamples > j]]))
        }
      }
      (1 - maxDist)
    }
    res <- unlist(res)
  }else{
    for(i in clusters){
      samplesInCluster <- names(clustering[clustering==i])
      nSamples <- length(samplesInCluster)
      maxDist <- 0
      if(nSamples==1){
        #maxDist <- 0
      }else{
        for(j in 1:(nSamples-1)){
          maxDist <- max(maxDist,max(distMatrix[samplesInCluster[j],samplesInCluster[1:nSamples > j]]))
        }
      }
      res <- c(res,1 - maxDist)
    }
  }
  return(res)
}

#function to compute the maximum between clusters cosine similarity,
#uses as input the distance matrix constructed as 1 - cosSimMatrix
maxBetweenClustersCosSim <- function(clustering,distMatrix,parallel){
  clusters <- unique(clustering)
  clusters <- clusters[order(clusters)]
  res <- matrix(1,nrow = length(clusters),ncol = length(clusters))
  colnames(res) <- clusters
  rownames(res) <- clusters
  if (parallel==TRUE){
    res_list <- foreach::foreach(c=1:(length(clusters) - 1)) %dopar%{
      i <- clusters[c]
      samplesInCluster <- names(clustering[clustering==i])
      res_table <- data.frame()
      for (d in (c+1):length(clusters)){
        j <- clusters[d]
        samplesInCluster2 <- names(clustering[clustering==j])
        ms <- 1 - min(distMatrix[samplesInCluster,samplesInCluster2])
        res_table <- rbind(res_table,data.frame(r=i,c=j,maxSim=ms))
      }
      res_table
    }

    for (i in 1:length(res_list)){
      currentTable <- res_list[[i]]
      for(j in 1:nrow(currentTable)){
        res[currentTable$r[j],currentTable$c[j]] <- currentTable$maxSim[j]
        res[currentTable$c[j],currentTable$r[j]] <- currentTable$maxSim[j]
      }
    }
  }else{
    for (c in 1:(length(clusters) - 1)){
      i <- clusters[c]
      samplesInCluster <- names(clustering[clustering==i])
      for (d in (c+1):length(clusters)){
        j <- clusters[d]
        samplesInCluster2 <- names(clustering[clustering==j])
        ms <- 1 - min(distMatrix[samplesInCluster,samplesInCluster2])
        res[i,j] <- ms
        res[j,i] <- ms
      }
    }
  }
  return(res)
}

plotHierarchicalCluster <- function(fit_clust,outFilePath,group,ns,nboots){
  output_file <- paste0(outFilePath,"Sigs_Cluster_Average_",group,"_ns",ns,"_nboots",nboots,".png")
  png(output_file,width = 12*nboots*ns,height = 500,res = 80)
  plot(fit_clust)
  abline(a=0.1,b=0,col="red")
  dev.off()
}


plotWithinClusterCosSim <- function(cosSimHClust,cosSimPAM,cosSimMC,outFilePath,group,ns,nboots){
  dists <- c(cosSimHClust,cosSimPAM,cosSimMC)
  clustermethod <- c(rep("hclust",length(cosSimHClust)),rep("pam",length(cosSimPAM)),rep("MC",length(cosSimMC)))
  output_file <- paste0(outFilePath,"Sigs_WithinClusterCosSim_",group,"_ns",ns,"_nboots",nboots,".png")
  png(output_file,width = 600,height = 500,res = 100)
  boxplot(dists ~ clustermethod, lwd = 2, ylab = 'mean Cosine Similarity',xlab = 'method',
          main = paste0("Within Cluster Cosine Similarity\n",group,", nSig=",ns))
  stripchart( dists ~ clustermethod, vertical = TRUE,
              method = "jitter", add = TRUE, pch = 20, col = 'blue')
  dev.off()
}

plotWithinClusterSilWidth <- function(sil_hclust,sil_pam,sil_MC,outFilePath,group,ns,nboots){
  dists <- c(sil_hclust$clus.avg.widths,sil_pam$clus.avg.widths,sil_MC$clus.avg.widths)
  clustermethod <- c(rep("hclust",length(sil_hclust$clus.avg.widths)),rep("pam",length(sil_pam$clus.avg.widths)),rep("MC",length(sil_pam$clus.avg.widths)))
  output_file <- paste0(outFilePath,"Sigs_WithinClusterSilWidth_",group,"_ns",ns,"_nboots",nboots,".png")
  png(output_file,width = 600,height = 500,res = 100)
  boxplot(dists ~ clustermethod, lwd = 2, ylab = 'mean Silhouette Width',xlab = 'method',
          main = paste0("Within Cluster Silhouette Width\n",group,", nSig=",ns))
  stripchart( dists ~ clustermethod, vertical = TRUE,
              method = "jitter", add = TRUE, pch = 20, col = 'blue')
  dev.off()
}


plotOverallMetrics <- function(overall_metrics,whattoplot,overall_metrics_file,group,nboots,nmfmethod){
  png(overall_metrics_file,width = 800,height = 500,res = 100)
  par(mar = c(5, 4, 4, 12),mgp = c(2.5,1,0))
  if(nmfmethod=="lee"){
    max_error <- max(overall_metrics$ave.RMSE,overall_metrics$ave.RMSE.orig)
    plot(overall_metrics$nsig,overall_metrics$ave.RMSE/max_error,type="l",
         ylab = "Score",xlab = "Number of extracted signatures",lwd = 2,
         ylim = c(min(min(overall_metrics$ave.RMSE/max_error,overall_metrics$ave.RMSE.orig/max_error),
                      min(overall_metrics[,whattoplot],na.rm = TRUE))*0.95,1),
         main = paste0("Overall Metrics\n(",group,", bootstraps=",nboots,")"))
    lines(overall_metrics$nsig,overall_metrics$ave.RMSE.orig/max_error,type="l",col="black",lwd = 2,lty = 2)
  }else{
    max_error <- max(overall_metrics$ave.KLD,overall_metrics$ave.KLD.orig)
    plot(overall_metrics$nsig,overall_metrics$ave.KLD/max_error,type="l",
         ylab = "Score",xlab = "Number of extracted signatures",lwd = 2,
         ylim = c(min(min(overall_metrics$ave.KLD/max_error,overall_metrics$ave.KLD.orig/max_error),
                      min(overall_metrics[,whattoplot],na.rm = TRUE))*0.95,1),
         main = paste0("Overall Metrics\n(",group,", bootstraps=",nboots,")"))
    lines(overall_metrics$nsig,overall_metrics$ave.KLD.orig/max_error,type="l",col="black",lwd = 2, lty = 2)
  }
  abline(h = 0.9,lty = 2)
  colours_list <- c("red","green","blue","purple","orange","brown","yellow")
  for(i in 1:length(whattoplot)){
    pos <- whattoplot[i]
    lines(overall_metrics$nsig,overall_metrics[,pos],type="l",col=colours_list[i],lwd = 2)
  }
  legend("right", c("norm.Error","norm.Error (orig. cat.)",colnames(overall_metrics)[whattoplot]), xpd = TRUE, horiz = FALSE, inset = c(-0.45,-0.4),lty = c(1,2,rep(1,length(whattoplot))),
         bty = "n", col = c("black","black",colours_list[1:length(whattoplot)]),lwd = 2, cex = 0.9)
  dev.off()
}

findMedoidsHclust <- function(distMatrix,cut_res){
  clusters <- unique(cut_res)
  clusters <- clusters[order(clusters)]
  medoids_hclust <- c()
  for (j in clusters){
    current_cluster <- cut_res[cut_res==j]
    current_cluster_names <- names(current_cluster)
    nInCluster <- length(current_cluster)
    if (nInCluster==1){
      medoids_hclust <- c(medoids_hclust,names(current_cluster))
    }else{
      for (p in 1:nInCluster){
        current_cluster[current_cluster_names[p]] <- sum(distMatrix[current_cluster_names[p],current_cluster_names[-p]])/(nInCluster-1)
      }
      medoids_hclust <- c(medoids_hclust,names(which.min(current_cluster)[1]))
    }
  }
  return(medoids_hclust)
}

average_medoids_cosSim <- function(distMatrix,medoids){
  #at least two clusters!
  nClusters <- length(medoids)
  medoidsNumber <- 1:nClusters
  nCombinations <- nClusters*(nClusters - 1)/2
  amcs <- 0
  for (j in (medoidsNumber-1)){
    amcs <- amcs + sum(distMatrix[medoids[j],medoids[medoidsNumber>j]])
  }
  amcs <- (nCombinations - amcs)/nCombinations
  return(amcs)
}

medoids_cosSimMatrix <- function(p_boot,medoids){
  return(computeCorrelation_parallel(p_boot[,medoids]))
}

computePropTooSimilar <- function(distMatrix,saved_nmf_runs,ns){
  countTooSimilar <- 0
  for (i in 1:length(saved_nmf_runs)){
    if (max(distMatrix[((i-1)*ns + 1):(ns*i),((i-1)*ns + 1):(ns*i)]) > 0.9) countTooSimilar <- countTooSimilar + 1
  }
  return(countTooSimilar/saved_nmf_runs)
}


#' Plot Generic Signatures or Catalogues
#'
#' Function to plot one or more signatures or catalogues with an arbitrary number of channels. Channel names will not be plotted and all channels will be plotted as bars of the same colour.
#'
#' @param signature_data_matrix matrix of signatures, signatures as columns and channels as rows
#' @param output_file set output file, should end with ".jpg", "png" or ".pdf". If output_file==null, output will not be to a file, but will still run the plot functions. The option output_file==null can be used to add this plot to a larger output file.
#' @param plot_sum whether the sum of the channels should be plotted. If plotting signatures this should be FALSE, but if plotting sample catalogues, this can be set to TRUE to display the number of mutations in each sample.
#' @param overall_title set the overall title of the plot
#' @param mar set the option par(mar=mar)
#' @param howManyInOnePage how many signatures or catalogues should be plotted on one page. Multiple pages are plotted if more signatures/catalogues to plot have been requested
#' @param ncolumns how many columns should be used to arrange the signatures/catalogues to plot
#' @export
plotGenericSignatures <- function(signature_data_matrix,
                                  output_file = NULL,
                                  plot_sum = TRUE,
                                  overall_title = "",
                                  add_to_titles = NULL,
                                  mar=NULL,
                                  howManyInOnePage=100,
                                  ncolumns=1){
  if(!is.null(output_file)) plottype <- substr(output_file,nchar(output_file)-2,nchar(output_file))
  npages <- ceiling(ncol(signature_data_matrix)/howManyInOnePage)
  if(!is.null(output_file)) rootoutput_file <- substr(output_file,1,nchar(output_file)-4)
  for(i in 1:npages){
    ifrom <- howManyInOnePage*(i-1) + 1
    ito <- min(ncol(signature_data_matrix),howManyInOnePage*i)
    tmpmatrix <- signature_data_matrix[,ifrom:ito,drop=F]
    if (!is.null(add_to_titles)) tmpadd <- add_to_titles[ifrom:ito]
    if(npages>1 & !is.null(output_file)) output_file <- paste0(rootoutput_file,"_",i,"of",npages,".",plottype)
    # now plot
    nplotrows <- ceiling(ncol(tmpmatrix)/ncolumns)
    if(!is.null(output_file)) {
      if(plottype=="jpg"){
        jpeg(output_file,width = ncolumns*800,height = nplotrows*400,res = 190)
      }else if(plottype=="png"){
        png(output_file,width = ncolumns*800,height = nplotrows*400,res = 190)
      }else if(plottype=="pdf"){
        pdf(output_file,width = ncolumns*8,height = nplotrows*4+0.5,pointsize = 26)
      }
      par(mfrow = c(nplotrows, ncolumns),omi=c(0,0,0.5,0),cex=0.7)
    }
    for (pos in 1:ncol(tmpmatrix)){
      par(mgp=c(1,1,0))
      if(is.null(mar)){
        par(mar=c(3,3.5,2,1))
      }else{
        par(mar=mar)
      }
      title <- colnames(tmpmatrix)[pos]
      if (!is.null(add_to_titles)) title <- paste0(title," ",tmpadd[pos])
      if (plot_sum) title <- paste0(title,"\n(",round(sum(tmpmatrix[,pos]))," mutations)")

      barplot(tmpmatrix[,pos],
              main = title,
              names.arg = c(rep("",nrow(tmpmatrix))),
              col="skyblue",
              beside = TRUE,
              xlab = "channels",
              cex.main = 0.9,
              cex.names = 1)
    }
    title(main = overall_title,outer = TRUE,cex.main = 1.5)
    if(!is.null(output_file)) dev.off()
  }
}

plotGenericSignatures_withMeanSd <- function(signature_data_matrix,
                                             mean_matrix,
                                             sd_matrix,
                                             output_file = NULL,
                                             plot_sum = TRUE,
                                             overall_title = "",
                                             add_to_titles = NULL,
                                             mar=NULL){
  if(!is.null(output_file)) plottype <- substr(output_file,nchar(output_file)-2,nchar(output_file))
  # rearr.colours <- c(rep("blue",16),rep("black",16),rep("red",16),rep("grey",16),rep("green",16),rep("pink",16))
  nplotrows <- ncol(signature_data_matrix)
  if(!is.null(output_file)) {
    if(plottype=="jpg"){
      jpeg(output_file,width = 2*800,height = nplotrows*400,res = 190)
    }else if(plottype=="png"){
      png(output_file,width = 2*800,height = nplotrows*400,res = 190)
    }else if (plottype=="pdf"){
      pdf(output_file,width = 2*8,height = nplotrows*4+0.5,pointsize = 26)
    }
  }
  par(mfrow = c(nplotrows, 2),omi=c(0,0,0.5,0))
  par(mgp=c(1,1,0))
  if(is.null(mar)){
    par(mar=c(3,3,2,1))
  }else{
    par(mar=mar)
  }
  for (pos in 1:ncol(signature_data_matrix)){
    ylimit <- c(0,max(signature_data_matrix[,pos],mean_matrix[,pos]+sd_matrix[,pos]))
    title <- colnames(signature_data_matrix)[pos]
    if (!is.null(add_to_titles)) title <- paste0(title," ",add_to_titles[pos])
    if (plot_sum) title <- paste0(title,"\n(",round(sum(signature_data_matrix[,pos]))," mutations)")
    barplot(signature_data_matrix[,pos],
            main = title,
            names.arg = c(rep("",nrow(signature_data_matrix))),
            col="skyblue",
            beside = TRUE,
            xlab = "channels",
            cex.main = 0.9,
            ylim = ylimit,
            cex.names = 1)
    barCenters <- barplot(mean_matrix[,pos],
                          main = "mean and sd of cluster",
                          names.arg = c(rep("",nrow(signature_data_matrix))),
                          col="skyblue",
                          beside = TRUE,
                          xlab = "channels",
                          cex.main = 0.9,
                          ylim = ylimit,
                          cex.names = 1)
    segments(barCenters, mean_matrix[,pos] - sd_matrix[,pos], barCenters,
             mean_matrix[,pos] + sd_matrix[,pos], lwd = 1.5)

    arrows(barCenters, mean_matrix[,pos] - sd_matrix[,pos], barCenters,
           mean_matrix[,pos] + sd_matrix[,pos], lwd = 1.5, angle = 90,
           code = 3, length = 0.05)
  }
  title(main = overall_title,outer = TRUE,cex.main = 1.5)
  if(!is.null(output_file)) dev.off()
}

#' Plot Substitution Signatures or Catalogues
#'
#' Function to plot one or more substitution signatures or catalogues.
#'
#' @param signature_data_matrix matrix of signatures, signatures as columns and channels as rows
#' @param output_file set output file, should end with ".jpg", ".png" or ".pdf". If output_file==null, output will not be to a file, but will still run the plot functions. The option output_file==null can be used to add this plot to a larger output file.
#' @param plot_sum whether the sum of the channels should be plotted. If plotting signatures this should be FALSE, but if plotting sample catalogues, this can be set to TRUE to display the number of mutations in each sample.
#' @param overall_title set the overall title of the plot
#' @param add_to_titles text to be added to the titles of each catalogue plot
#' @param mar set the option par(mar=mar)
#' @param howManyInOnePage how many signatures or catalogues should be plotted on one page. Multiple pages are plotted if more signatures/catalogues to plot have been requested
#' @param ncolumns how many columns should be used to arrange the signatures/catalogues to plot
#' @param textscaling scale the text of the plot, default is 1
#' @export
plotSubsSignatures <- function(signature_data_matrix,
                               output_file = NULL,
                               plot_sum = TRUE,
                               overall_title = "",
                               add_to_titles = NULL,
                               mar=NULL,
                               howManyInOnePage=100,
                               ncolumns=1,
                               textscaling=1){

  plotcolours <- c(rgb(5,195,239,maxColorValue = 255),
                   rgb(0,0,0,maxColorValue = 255),
                   rgb(230,47,41,maxColorValue = 255),
                   rgb(208,207,207,maxColorValue = 255),
                   rgb(169,212,108,maxColorValue = 255),
                   rgb(238,205,204,maxColorValue = 255))
  if(!is.null(output_file)) plottype <- substr(output_file,nchar(output_file)-2,nchar(output_file))
  rearr.colours <- c(rep(plotcolours[1],16),rep(plotcolours[2],16),rep(plotcolours[3],16),rep(plotcolours[4],16),rep(plotcolours[5],16),rep(plotcolours[6],16))
  npages <- ceiling(ncol(signature_data_matrix)/howManyInOnePage)
  if(!is.null(output_file)) rootoutput_file <- substr(output_file,1,nchar(output_file)-4)
  for(i in 1:npages){
    ifrom <- howManyInOnePage*(i-1) + 1
    ito <- min(ncol(signature_data_matrix),howManyInOnePage*i)
    tmpmatrix <- signature_data_matrix[,ifrom:ito,drop=F]
    if (!is.null(add_to_titles)) tmpadd <- add_to_titles[ifrom:ito]
    if(npages>1 & !is.null(output_file)) output_file <- paste0(rootoutput_file,"_",i,"of",npages,".",plottype)
    # now plot
    nplotrows <- ceiling(ncol(tmpmatrix)/ncolumns)
    if(!is.null(output_file)) {
      if(plottype=="jpg"){
        jpeg(output_file,width = ncolumns*800,height = nplotrows*300,res = 220)
      }else if(plottype=="png"){
        png(output_file,width = ncolumns*800,height = nplotrows*300,res = 220)
      }else if(plottype=="pdf"){
        pdf(output_file,width = ncolumns*8,height = nplotrows*3+0.5,pointsize = 26)
      }
      par(mfrow = c(nplotrows, ncolumns),omi=c(0,0,0.5,0),cex=0.7)
    }
    for (pos in 1:ncol(tmpmatrix)){
      if(is.null(mar)){
        par(mar=c(2,3.5,2,1))
      }else{
        par(mar=mar)
      }
      title <- colnames(tmpmatrix)[pos]
      if (!is.null(add_to_titles)) title <- paste0(title," ",tmpadd[pos])
      if (plot_sum) title <- paste0(title,"\n(",round(sum(tmpmatrix[,pos]))," SNVs)")
      muttypes <- c("C>A","C>G","C>T","T>A","T>C","T>G")
      xlabels <- rep("",96)
      barplot(tmpmatrix[,pos],
              main = title,
              cex.axis = textscaling,
              names.arg = xlabels,
              col=rearr.colours,
              beside = TRUE,
              las=2,
              cex.main = 0.9*textscaling,
              cex.names = 1*textscaling,border = NA,space = 0.2)
      par(xpd=TRUE)
      par(usr = c(0, 1, 0, 1))
      recttop <- -0.02
      rectbottom <- -0.16
      start1 <- 0.035
      gap <- 0.155
      rect(start1, rectbottom, start1+gap, recttop,col = plotcolours[1],border = NA)
      rect(start1+gap, rectbottom, start1+2*gap, recttop,col = plotcolours[2],border = NA)
      rect(start1+2*gap, rectbottom, start1+3*gap, recttop,col = plotcolours[3],border = NA)
      rect(start1+3*gap, rectbottom, start1+4*gap, recttop,col = plotcolours[4],border = NA)
      rect(start1+4*gap, rectbottom, start1+5*gap, recttop,col = plotcolours[5],border = NA)
      rect(start1+5*gap, rectbottom, start1+6*gap, recttop,col = plotcolours[6],border = NA)
      textposx <- 0.04+seq(8,88,16)/104
      text(x = textposx[1:3],y = -0.09,labels = muttypes[1:3],col = "white",font = 2,cex = textscaling)
      text(x = textposx[4:6],y = -0.09,labels = muttypes[4:6],col = "black",font = 2,cex = textscaling)
      par(xpd=FALSE)
    }
    title(main = overall_title,outer = TRUE,cex.main = 1.5*textscaling)
    if(!is.null(output_file)) dev.off()
  }
}

plotSubsSignatures_withMeanSd <- function(signature_data_matrix,
                                          mean_matrix,
                                          sd_matrix,
                                          output_file = NULL,
                                          plot_sum = TRUE,
                                          overall_title = "",
                                          add_to_titles = NULL,
                                          mar=NULL){

  plotcolours <- c(rgb(5,195,239,maxColorValue = 255),
                   rgb(0,0,0,maxColorValue = 255),
                   rgb(230,47,41,maxColorValue = 255),
                   rgb(208,207,207,maxColorValue = 255),
                   rgb(169,212,108,maxColorValue = 255),
                   rgb(238,205,204,maxColorValue = 255))
  if(!is.null(output_file)) plottype <- substr(output_file,nchar(output_file)-2,nchar(output_file))
  rearr.colours <- c(rep(plotcolours[1],16),rep(plotcolours[2],16),rep(plotcolours[3],16),rep(plotcolours[4],16),rep(plotcolours[5],16),rep(plotcolours[6],16))
  nplotrows <- ncol(signature_data_matrix)
  if(!is.null(output_file)) {
    if(plottype=="jpg"){
      jpeg(output_file,width = 2*800,height = nplotrows*300,res = 220)
    }else if(plottype=="png"){
      png(output_file,width = 2*800,height = nplotrows*300,res = 220)
    }else if(plottype=="pdf"){
      pdf(output_file,width = 2*8,height = nplotrows*3+0.5,pointsize = 26)
    }
  }
  par(mfrow = c(nplotrows, 2),omi=c(0,0,0.5,0),cex=0.7)
  if(is.null(mar)){
    par(mar=c(2,3,2,1))
  }else{
    par(mar=mar)
  }
  muttypes <- c("C>A","C>G","C>T","T>A","T>C","T>G")
  xlabels <- rep("",96)
  for (pos in 1:ncol(signature_data_matrix)){
    ylimit <- c(0,max(signature_data_matrix[,pos],mean_matrix[,pos]+sd_matrix[,pos]))
    title <- colnames(signature_data_matrix)[pos]
    if (!is.null(add_to_titles)) title <- paste0(title," ",add_to_titles[pos])
    if (plot_sum) title <- paste0(title,"\n(",round(sum(signature_data_matrix[,pos]))," SNVs)")
    barplot(signature_data_matrix[,pos],
            main = title,
            names.arg = xlabels,
            col=rearr.colours,
            beside = TRUE,
            ylim = ylimit,
            las=2,
            cex.main = 0.9,
            border = NA,
            cex.names = 1)
    par(xpd=TRUE)
    par(usr = c(0, 1, 0, 1))
    recttop <- -0.02
    rectbottom <- -0.16
    start1 <- 0.035
    gap <- 0.155
    rect(start1, rectbottom, start1+gap, recttop,col = plotcolours[1],border = NA)
    rect(start1+gap, rectbottom, start1+2*gap, recttop,col = plotcolours[2],border = NA)
    rect(start1+2*gap, rectbottom, start1+3*gap, recttop,col = plotcolours[3],border = NA)
    rect(start1+3*gap, rectbottom, start1+4*gap, recttop,col = plotcolours[4],border = NA)
    rect(start1+4*gap, rectbottom, start1+5*gap, recttop,col = plotcolours[5],border = NA)
    rect(start1+5*gap, rectbottom, start1+6*gap, recttop,col = plotcolours[6],border = NA)
    textposx <- 0.04+seq(8,88,16)/104
    text(x = textposx[1:3],y = -0.09,labels = muttypes[1:3],col = "white",font = 2)
    text(x = textposx[4:6],y = -0.09,labels = muttypes[4:6],col = "black",font = 2)
    par(xpd=FALSE)
    barCenters <- barplot(mean_matrix[,pos],
                          main = "mean and sd of cluster",
                          names.arg = xlabels,
                          col=rearr.colours,
                          beside = TRUE,
                          ylim = ylimit,
                          las=2,
                          cex.main = 0.9,
                          cex.names = 1,border = NA,space = 0.2)
    segments(barCenters, mean_matrix[,pos], barCenters,
             mean_matrix[,pos] + sd_matrix[,pos], lwd = 1)
    par(xpd=TRUE)
    par(usr = c(0, 1, 0, 1))
    recttop <- -0.02
    rectbottom <- -0.16
    start1 <- 0.035
    gap <- 0.155
    rect(start1, rectbottom, start1+gap, recttop,col = plotcolours[1],border = NA)
    rect(start1+gap, rectbottom, start1+2*gap, recttop,col = plotcolours[2],border = NA)
    rect(start1+2*gap, rectbottom, start1+3*gap, recttop,col = plotcolours[3],border = NA)
    rect(start1+3*gap, rectbottom, start1+4*gap, recttop,col = plotcolours[4],border = NA)
    rect(start1+4*gap, rectbottom, start1+5*gap, recttop,col = plotcolours[5],border = NA)
    rect(start1+5*gap, rectbottom, start1+6*gap, recttop,col = plotcolours[6],border = NA)
    textposx <- 0.04+seq(8,88,16)/104
    text(x = textposx[1:3],y = -0.09,labels = muttypes[1:3],col = "white",font = 2)
    text(x = textposx[4:6],y = -0.09,labels = muttypes[4:6],col = "black",font = 2)
    par(xpd=FALSE)
  }
  title(main = overall_title,outer = TRUE,cex.main = 1.5)
  if(!is.null(output_file)) dev.off()
}

#' Plot Rearrangement Signatures or Catalogues
#'
#' Function to plot one or more rearrangement signatures or catalogues.
#'
#' @param signature_data_matrix matrix of signatures, signatures as columns and channels as rows
#' @param output_file set output file, should end with ".jpg", ".png" or ".pdf". If output_file==null, output will not be to a file, but will still run the plot functions. The option output_file==null can be used to add this plot to a larger output file.
#' @param plot_sum whether the sum of the channels should be plotted. If plotting signatures this should be FALSE, but if plotting sample catalogues, this can be set to TRUE to display the number of mutations in each sample.
#' @param overall_title set the overall title of the plot
#' @param mar set the option par(mar=mar)
#' @param howManyInOnePage how many signatures or catalogues should be plotted on one page. Multiple pages are plotted if more signatures/catalogues to plot have been requested
#' @param ncolumns how many columns should be used to arrange the signatures/catalogues to plot
#' @param textscaling scale the text of the plot (default is 1)
#' @export
plotRearrSignatures <-function(signature_data_matrix,
                               output_file = NULL,
                               plot_sum = TRUE,
                               overall_title = "",
                               add_to_titles = NULL,
                               mar=NULL,
                               howManyInOnePage=100,
                               ncolumns=1,
                               textscaling = 1){
  #This function plots a set of signatures in a single file, three signatures for each row.
  if(!is.null(output_file)) plottype <- substr(output_file,nchar(output_file)-2,nchar(output_file))
  del_col = rgb(228,26,28, maxColorValue = 255)
  td_col = rgb(77,175,74, maxColorValue =255)
  inv_col  = rgb(55,126,184, maxColorValue = 255)
  transloc_col = rgb(152,78,163, maxColorValue =255)
  non_clust_col = rgb(240,240,240, maxColorValue =255)
  rearr.colours <- rep(c(rep(del_col,5),rep(td_col,5),rep(inv_col,5),transloc_col),2)
  npages <- ceiling(ncol(signature_data_matrix)/howManyInOnePage)
  if(!is.null(output_file)) rootoutput_file <- substr(output_file,1,nchar(output_file)-4)

  for(i in 1:npages){
    ifrom <- howManyInOnePage*(i-1) + 1
    ito <- min(ncol(signature_data_matrix),howManyInOnePage*i)
    tmpmatrix <- signature_data_matrix[,ifrom:ito,drop=F]
    if (!is.null(add_to_titles)) tmpadd <- add_to_titles[ifrom:ito]
    if(npages>1 & !is.null(output_file)) output_file <- paste0(rootoutput_file,"_",i,"of",npages,".",plottype)
    # now plot
    nplotrows <- ceiling(ncol(tmpmatrix)/ncolumns)
    if(!is.null(output_file)){
      if(plottype=="jpg"){
        jpeg(output_file,width = ncolumns*800,height = nplotrows*500,res = 220)
      }else if(plottype=="png"){
        png(output_file,width = ncolumns*800,height = nplotrows*500,res = 220)
      }else if(plottype=="pdf"){
        pdf(output_file,width = ncolumns*8,height = nplotrows*5+0.5,pointsize = 26)
      }
      par(mfrow = c(nplotrows, ncolumns),omi=c(0,0,0.5,0),cex=0.7)
    }
    sizes <- c("1-10Kb",
               "10-100Kb",
               "100Kb-1Mb",
               "1Mb-10Mb",
               ">10Mb")
    sizes_names <- c(rep(sizes,3),"",rep(sizes,3),"")
    for (pos in 1:ncol(tmpmatrix)){
      if(is.null(mar)){
        par(mar=c(8,3,2,1))
      }else{
        par(mar=mar)
      }
      title <- colnames(tmpmatrix)[pos]
      if (!is.null(add_to_titles)) title <- paste0(title," ",tmpadd[pos])
      if (plot_sum) title <- paste0(title,"\n(",round(sum(tmpmatrix[,pos]))," SVs)")
      pos <- barplot(tmpmatrix[,pos],
                     main = title,
                     names.arg = NA,
                     cex.axis = textscaling,
                     col=rearr.colours,
                     beside = FALSE,
                     las=2,
                     cex.names = 0.8*textscaling,
                     cex.main = 0.9*textscaling,
                     border = 0,
                     space = 0.1)
      #save old plot coordinates
      op <- par("usr")
      #set new coordinates
      par(usr = c(0, 1, 0, 1))
      #add graphics
      par(xpd=TRUE)
      start1 <- 0.035
      xsep = 0.145
      start1_text <- 0.11
      tr_size <- 0.03
      stop <- start1
      for(i in 1:2){
        start <- stop
        stop <- start + xsep
        rect(start, -0.14, stop, -0.02,col = del_col,lwd = 0,border = NA)
        text(x = start+0.5*xsep,y = -0.08,"del",col = "white",cex = textscaling)
        start <- stop
        stop <- start + xsep
        rect(start, -0.14, stop, -0.02,col = td_col,lwd = 0,border = NA)
        text(x = start+0.5*xsep,y = -0.08,"tds",col = "white",cex = textscaling)
        start <- stop
        stop <- start + xsep
        rect(start, -0.14, stop, -0.02,col = inv_col,lwd = 0,border = NA)
        text(x = start+0.5*xsep,y = -0.08,"inv",col = "white",cex = textscaling)
        start <- stop
        stop <- start + tr_size
        rect(start, -0.14, stop, -0.02,col = transloc_col,lwd = 0,border = NA)
        text(x = start+0.5*tr_size,y = -0.08,"tr",col = "white",cex = textscaling)
      }
      xsep2 <- 3*xsep+tr_size
      rect(start1, -0.26, start1+xsep2, -0.14,col = "black",lwd = 0,border = NA)
      text(x = start1+0.5*xsep2,y = -0.2,"clustered",col = "white",cex = textscaling)
      rect(start1+xsep2, -0.26, start1+2*xsep2, -0.14,col = non_clust_col,lwd = 0,border = NA)
      text(x = start1+1.5*xsep2,y = -0.2,"non-clustered",col = "black",cex = textscaling)

      # plot text
      tstep <- xsep/5
      tstart <- start1+tstep/2
      tpos <- seq(from = tstart,by = tstep,length.out = length(sizes_names))
      text(x = tpos,y = -0.3,adj = 1,labels = sizes_names,cex = 0.8*textscaling,srt = 90)
      
      #restore old coordinates
      par(usr = op)
    }
    title(main = overall_title,outer = TRUE,cex.main = 1.5*textscaling)
    if(!is.null(output_file)) dev.off()
  }
}


plotRearrSignatures_withMeanSd <-function(signature_data_matrix,
                                          mean_matrix,
                                          sd_matrix,
                                          output_file = NULL,
                                          plot_sum = TRUE,
                                          overall_title = "",
                                          add_to_titles = NULL,
                                          mar=NULL){
  #This function plots a set of signatures in a single file, three signatures for each row.
  if(!is.null(output_file)) plottype <- substr(output_file,nchar(output_file)-2,nchar(output_file))
  del_col = rgb(228,26,28, maxColorValue = 255)
  td_col = rgb(77,175,74, maxColorValue =255)
  inv_col  = rgb(55,126,184, maxColorValue = 255)
  transloc_col = rgb(152,78,163, maxColorValue =255)
  non_clust_col = rgb(240,240,240, maxColorValue =255)
  rearr.colours <- rep(c(rep(del_col,5),rep(td_col,5),rep(inv_col,5),transloc_col),2)
  nplotrows <- ncol(signature_data_matrix)
  if(!is.null(output_file)) {
    if(plottype=="jpg"){
      jpeg(output_file,width = 2*800,height = nplotrows*500,res = 220)
    }else if(plottype=="png"){
      png(output_file,width = 2*800,height = nplotrows*500,res = 220)
    }else if (plottype=="pdf"){
      pdf(output_file,width = 2*8,height = nplotrows*5+0.5,pointsize = 26)
    }
  }
  par(mfrow = c(nplotrows, 2),omi=c(0,0,0.5,0),cex=0.7)
  sizes <- c("1-10Kb",
             "10-100Kb",
             "100Kb-1Mb",
             "1Mb-10Mb",
             ">10Mb")
  sizes_names <- c(rep(sizes,3),"",rep(sizes,3),"")

  rearrAxis <- function(barCenters,sizes_names){
    axis(1,
         las=2,
         at=barCenters,
         lab=sizes_names,
         col = "transparent",
         line = 1,
         cex.axis = 0.8)
    #save old plot coordinates
    op <- par("usr")
    #set new coordinates
    par(usr = c(0, 1, 0, 1))
    #add graphics
    par(xpd=TRUE)
    start1 <- 0.035
    xsep = 0.145
    start1_text <- 0.11
    tr_size <- 0.03
    stop <- start1
    for(i in 1:2){
      start <- stop
      stop <- start + xsep
      rect(start, -0.14, stop, -0.02,col = del_col,lwd = 0,border = NA)
      text(x = start+0.5*xsep,y = -0.08,"del",col = "white")
      start <- stop
      stop <- start + xsep
      rect(start, -0.14, stop, -0.02,col = td_col,lwd = 0,border = NA)
      text(x = start+0.5*xsep,y = -0.08,"tds",col = "white")
      start <- stop
      stop <- start + xsep
      rect(start, -0.14, stop, -0.02,col = inv_col,lwd = 0,border = NA)
      text(x = start+0.5*xsep,y = -0.08,"inv",col = "white")
      start <- stop
      stop <- start + tr_size
      rect(start, -0.14, stop, -0.02,col = transloc_col,lwd = 0,border = NA)
      text(x = start+0.5*tr_size,y = -0.08,"tr",col = "white")
    }
    xsep2 <- 3*xsep+tr_size
    rect(start1, -0.26, start1+xsep2, -0.14,col = "black",lwd = 0,border = NA)
    text(x = start1+0.5*xsep2,y = -0.2,"clustered",col = "white")
    rect(start1+xsep2, -0.26, start1+2*xsep2, -0.14,col = non_clust_col,lwd = 0,border = NA)
    text(x = start1+1.5*xsep2,y = -0.2,"non-clustered",col = "black")

    #restore old coordinates
    par(usr = op)
  }

  for (pos in 1:ncol(signature_data_matrix)){
    ylimit <- c(0,max(signature_data_matrix[,pos],mean_matrix[,pos]+sd_matrix[,pos]))
    if(is.null(mar)){
      par(mar=c(8,3,2,1))
    }else{
      par(mar=mar)
    }
    title <- colnames(signature_data_matrix)[pos]
    if (!is.null(add_to_titles)) title <- paste0(title," ",add_to_titles[pos])
    if (plot_sum) title <- paste0(title,"\n(",round(sum(signature_data_matrix[,pos]))," SVs)")
    barCenters <- barplot(signature_data_matrix[,pos],
                          main = title,
                          names.arg = NA,
                          col=rearr.colours,
                          beside = FALSE,
                          las=2,
                          cex.names = 0.8,
                          border = 0,
                          ylim = ylimit,
                          cex.main = 0.9,
                          space = 0.1)
    rearrAxis(barCenters,sizes_names)
    barCenters <- barplot(mean_matrix[,pos],
                          main = "mean and sd of cluster",
                          names.arg = NA,
                          col=rearr.colours,
                          beside = FALSE,
                          ylim = ylimit,
                          cex.main = 0.9,
                          las=2,
                          border = 0,
                          space = 0.1)
    segments(barCenters, mean_matrix[,pos], barCenters,
             mean_matrix[,pos] + sd_matrix[,pos], lwd = 1.5)
    rearrAxis(barCenters,sizes_names)
  }
  title(main = overall_title,outer = TRUE,cex.main = 1.5)
  if(!is.null(output_file)) dev.off()
}


#' plotCosSimSignatures
#'
#' Plot a matrix of cosine similarities between two sets of signatures.
#'
#' @param sig1 matrix with signatures as columns
#' @param sig2 matrix with signatures as columns
#' @param output_file name of the output file (pdf), optional
#' @param cex.numbers scale the text used for the numbers in the matrix
#' @param circlesColBasic colour used for the circles
#' @param circlesColHighlight colour used for the circles that pass the thresholdMark
#' @export
plotCosSimSignatures <- function(sig1,sig2,output_file = NULL,
                                 thresholdMark = NULL,
                                 cex.numbers = 0.7,
                                 circlesColBasic = "#A1CAF1",
                                 circlesColHighlight = "#F6A600"){
  sigcor <- computeCorrelationOfTwoSetsOfSigs(sig1,sig2)
  plotMatrix(sigcor,output_file = output_file,
             thresholdMark = thresholdMark,
             cex.numbers = cex.numbers,
             circlesColBasic = circlesColBasic,
             circlesColHighlight = circlesColHighlight)
}

#' Find closest COSMIC30 signatures
#'
#' Compares a set of signatures to the COSMIC30 signatures and
#' returns the list of signatures identified. For example,
#' c("C1","C3","N1","C13","N2") means that Cosmic (C) signatures 1, 3 and 13 were found, while
#' signatures N1 and N2 are unknown signatures (N for not found), based on a similarity threshold (similarity >=threshold)
#' comparing to COSMIC30
#'
#' @param sigs matrix with 96-channel substitution signatures as columns
#' @param threshold cosine similarity threshold
#' @return the list of signatures identified
#' @export
findClosestCOSMIC30 <- function(sigs,threshold){
  #compute cos sim matrix
  cos_sim_df <- data.frame()
  for (s in colnames(sigs)){
    for(a in colnames(cosmic30)){
      cos_sim_df[s,a] <- cos_sim(sigs[,s],cosmic30[,a])
    }
  }
  max.sim <- apply(cos_sim_df,1,max)
  closestCosmic <- apply(cos_sim_df,1,which.max)
  notFound <- max.sim < threshold
  closestCosmic[notFound] <- paste0("N",1:sum(notFound))
  closestCosmic[!notFound] <- paste0("C",closestCosmic[!notFound])
  return(closestCosmic)
}

#automatically detect similarity with sum of two COSMIC30

#' Find closest COSMIC30 signatures or combination of COSMIC30
#'
#' Compares a set of signatures to the COSMIC30 signatures or the simple sum of all combinations of two COSMIC signatures and
#' returns the list of signatures identified. For example,
#' c("C1","C3","N1","C2+13","N2") means that Cosmic (C) signatures 1, 3 and 2+13 were found, while
#' signatures N1 and N2 are unknown signatures (N for not found), based on a similarity threshold (similarity >=threshold)
#' comparing to COSMIC30 or the sum of two COSMIC30 sigs.
#'
#' @param sigs matrix with 96-channel substitution signatures as columns
#' @param threshold cosine similarity threshold
#' @return the list of signatures identified
#' @export
findClosestCOSMIC30andCombinations <- function(sigs,threshold){
  ncols30 <- length(cosmic30)
  colnames(cosmic30) <- 1:ncols30
  for(i in 1:(ncols30-1)){
    for(j in (i+1):ncols30){
      #message(paste0(i,"+",j))
      cosmic30[,paste0(i,"+",j)] <- (cosmic30[,i]+cosmic30[,j])/sum(cosmic30[,i]+cosmic30[,j])
    }
  }
  #compute cos sim matrix
  cos_sim_df <- data.frame()
  for (s in colnames(sigs)){
    for(a in colnames(cosmic30)){
      cos_sim_df[s,a] <- cos_sim(sigs[,s],cosmic30[,a])
    }
  }
  max.sim <- apply(cos_sim_df,1,max)
  closestCosmic <- colnames(cosmic30)[apply(cos_sim_df,1,which.max)]
  notFound <- max.sim < threshold
  closestCosmic[notFound] <- paste0("N",1:sum(notFound))
  closestCosmic[!notFound] <- paste0("C",closestCosmic[!notFound])
  return(closestCosmic)
}


#' Find closest COSMIC30 signatures
#'
#' Compares a set of signatures to the COSMIC30 signatures and
#' returns the list of signatures identified and the corresponding similarity. For example,
#' list(cosmic = c("C1","C3","C13"),cos_sim = c(0.94,0.85,0.7))
#' means that Cosmic (C) signatures 1, 3 and 13 were found, while
#' the corrsponding similarities to those signatures are 0.94, 0.85 and 0.7
#'
#' @param sigs matrix with 96-channel substitution signatures as columns
#' @return the list of signatures identified and corresponding similarities
#' @export
findClosestCOSMIC30_withSimilarity <- function(sigs){
  #compute cos sim matrix
  cos_sim_df <- data.frame()
  for (s in colnames(sigs)){
    for(a in colnames(cosmic30)){
      cos_sim_df[s,a] <- cos_sim(sigs[,s],cosmic30[,a])
    }
  }
  max.sim <- apply(cos_sim_df,1,max)
  closestCosmic <- apply(cos_sim_df,1,which.max)
  closestCosmic <- paste0("C",closestCosmic)
  res <- list()
  res[["cosmic"]] <- closestCosmic
  res[["cos.sim"]] <- max.sim
  return(res)
}

#' Find closest COSMIC30 signatures or combination of COSMIC30
#'
#' Compares a set of signatures to the COSMIC30 signatures or the simple sum of all combinations of two COSMIC signatures and
#' returns the list of signatures identified and the corresponding similarity. For example,
#' list(cosmic = c("C1","C3","C2+C13"),cos_sim = c(0.94,0.85,0.7))
#' means that Cosmic (C) signatures 1, 3 and 2+13 were found, while
#' the corrsponding similarities to those signatures are 0.94, 0.85 and 0.7
#'
#' @param sigs matrix with 96-channel substitution signatures as columns
#' @return the list of signatures identified and corresponding similarities
#' @export
findClosestCOSMIC30andCombinations_withSimilarity <- function(sigs){
  ncols30 <- length(cosmic30)
  colnames(cosmic30) <- 1:ncols30
  for(i in 1:(ncols30-1)){
    for(j in (i+1):ncols30){
      #message(paste0(i,"+",j))
      cosmic30[,paste0(i,"+",j)] <- (cosmic30[,i]+cosmic30[,j])/sum(cosmic30[,i]+cosmic30[,j])
    }
  }
  #compute cos sim matrix
  cos_sim_df <- data.frame()
  for (s in colnames(sigs)){
    for(a in colnames(cosmic30)){
      cos_sim_df[s,a] <- cos_sim(sigs[,s],cosmic30[,a])
    }
  }
  max.sim <- apply(cos_sim_df,1,max)
  closestCosmic <- colnames(cosmic30)[apply(cos_sim_df,1,which.max)]
  closestCosmic <- paste0("C",closestCosmic)
  res <- list()
  res[["cosmic"]] <- closestCosmic
  res[["cos.sim"]] <- max.sim
  return(res)
}

#returns the list of signatures identified. For example,
#c("R1","R3","N1","R5","N2")
#means that Rearrangement (R) signatures 1, 3 and 5 were found, while
#signatures N1 and N2 are unknown signatures (N for not found), based on the fact
#that no similarity >=threshold was found with the Rearr Sigs from Breast 560 study
findClosestRearrSigsBreast560 <- function(sigs,threshold){
  #compute cos sim matrix
  cos_sim_df <- data.frame()
  for (s in colnames(sigs)){
    for(a in colnames(RS.Breast560)){
      cos_sim_df[s,a] <- cos_sim(sigs[,s],RS.Breast560[,a])
    }
  }
  max.sim <- apply(cos_sim_df,1,max)
  closestRS.Breast560 <- apply(cos_sim_df,1,which.max)
  notFound <- max.sim < threshold
  closestRS.Breast560[notFound] <- paste0("N",1:sum(notFound))
  closestRS.Breast560[!notFound] <- paste0("R",closestRS.Breast560[!notFound])
  return(closestRS.Breast560)
}

#returns the list of signatures identified and the corresponding similarity. For example,
#res$RS.Breast560 = c("R1","R3","R5")
#res$cos.sim = c(0.94,0.85,0.7)
#means that Rearrangement (R) signatures 1, 3 and 5 were found, while
#the corrsponding similarities to those signatures are 0.94, 0.85 and 0.7
findClosestRearrSigsBreast560_withSimilarity <- function(sigs){
  #compute cos sim matrix
  cos_sim_df <- data.frame()
  for (s in colnames(sigs)){
    for(a in colnames(RS.Breast560)){
      cos_sim_df[s,a] <- cos_sim(sigs[,s],RS.Breast560[,a])
    }
  }
  max.sim <- apply(cos_sim_df,1,max)
  closestRS.Breast560 <- apply(cos_sim_df,1,which.max)
  closestRS.Breast560 <- paste0("R",closestRS.Breast560)
  res <- list()
  res[["RS.Breast560"]] <- closestRS.Breast560
  res[["cos.sim"]] <- max.sim
  return(res)
}

#' KL-divergence
#'
#' Compute the Kullback-Leibler Divergence between two matrices. In order to compute the divergence, .Machine$double.eps is added to matrices zero entries.
#'
#' @param m1 original matrix
#' @param m2 matrix to be used to approximate m1
#' @return KL-Divergence
#' @export
KLD <- function(m1,m2){
  m1[m1==0] <- .Machine$double.eps
  m2[m2==0] <- .Machine$double.eps
  return(sum(m1*(log(m1)-log(m2)) - m1 + m2))
}

#samples/sigantures are ararnged by columns

#' computeCorrelationOfTwoSetsOfSigs
#'
#' Compute the cosine similarity between two sets of signatures, which results in a cosine similarity matrix.
#'
#' @param sig1 matrix of signatures, with signatures as columns
#' @param sig2 matrix of signatures, with signatures as columns
#' @return cosine similarity matrix
#' @export
computeCorrelationOfTwoSetsOfSigs <- function(sigs1,sigs2){
  cos_sim_df <- data.frame()
  for (s in colnames(sigs1)){
    for(a in colnames(sigs2)){
      cos_sim_df[s,a] <- cos_sim(sigs1[,s],sigs2[,a])
    }
  }
  return(cos_sim_df)
}

removeSimilarCatalogueSamples <- function(cat,cosSimThreshold = 0.99){
  catCosSimCorr <- computeCorrelation(cat) - diag(ncol(cat))
  newcat <- data.frame(cat[,1],row.names = row.names(cat))
  colnames(newcat) = colnames(cat)[1]
  #as you are building the new catalogue, add samples only if they have cos sim
  #less than cosSimThreshold w.r.t. samples in the new catalgue
  for (j in 2:ncol(cat)){
    cossimres <- catCosSimCorr[j,colnames(newcat)]
    if(!any(cossimres>cosSimThreshold)){
      newcat <- cbind(newcat,cat[,j])
      colnames(newcat)[ncol(newcat)] <- colnames(cat)[j]
    }
  }
  return(newcat)
}

normaliseSamples <- function(cat){
  return(cat/matrix(rep(apply(cat,2,sum),nrow(cat)),nrow = nrow(cat),byrow = TRUE))
}

#' RMSE
#'
#' Function to compute the root mean squared error between two matrices.
#'
#' @param m1 first matrix to compare
#' @param m2 second matrix to compare
#' @return root mean squared error
#' @export
RMSE <- function(m1,m2){
  sqrt(sum((m1-m2)^2)/(ncol(m1)*nrow(m1)))
}

#' writeTable
#'
#' Utility function for simple write table with the following parameters: (sep = "\\t",quote = FALSE,row.names = TRUE,col.names = TRUE).
#'
#' @param t R table or matrix
#' @param file name of the output plain text file
#' @export
writeTable <- function(t,file,row.names=TRUE){
  write.table(t,file = file,sep = "\t",quote = FALSE,row.names = row.names,col.names = TRUE)
}

#' readTable
#'
#' Utility function for simple read table with the following parameters: (sep = "\\t",check.names = FALSE,header = TRUE,stringsAsFactors = FALSE).
#'
#' @param file name of the plain text file to read
#' @export
readTable <- function(file){
  read.table(file = file,sep = "\t",check.names = FALSE,header = TRUE,stringsAsFactors = FALSE)
}


#' getOrganSignatures
#'
#' This function returns the organ-specific signatures for a given organ and mutation type as defined in Degasperi et al. 2020 Nat Cancer paper.
#'
#' @param typemut either subs, DNV or rearr
#' @param organ one of the following: "Biliary", "Bladder", "Bone_SoftTissue", "Breast", "Cervix", "CNS", "Colorectal", "Esophagus", "Head_neck", "Kidney", "Liver", "Lung", "Lymphoid", "Ovary", "Pancreas", "Prostate", "Skin", "Stomach", "Uterus"
#' @param version version "1" includes subs or rearr (ICGC cohort) organ-specific signatures from Degasperi et al. 2020, while version "2" includes the improved subs organ-specific signatures from ICGC as well as Hartwig and GEL, and the new DNV signatures. Set to "latest" to get the latest signature available for a given mutation type.
#' @param cohort for version 1 signatures only ICGC cohort is available, while for version 2 signatures ICGC, Hartwig and GEL cohort can be requested. Use "best" to get the most appropriate cohort for a given organ.
#' @return organ-specific signatures matrix
#' @references A. Degasperi, T. D. Amarante, J. Czarnecki, S. Shooter, X. Zou, D. Glodzik, S. Morganella, A. S. Nanda, C. Badja, G. Koh, S. E. Momen, I. Georgakopoulos-Soares, J. M. L. Dias, J. Young, Y. Memari, H. Davies, S. Nik-Zainal. A practical framework and online tool for mutational signature analyses show intertissue variation and driver dependencies, Nature Cancer, https://doi.org/10.1038/s43018-020-0027-5, 2020.
#' @export
getOrganSignatures <- function(organ,version="latest",cohort="best",typemut="subs",verbose=FALSE){
  sigs <- NULL
  organsAvail <- NULL
  if(typemut=="subs" & version=="1" & (cohort=="best" | cohort=="ICGC")){
    # find available organs
    organsAvail <- unique(sapply(colnames(all_organ_sigs_subs), function(x) {
      b <- strsplit(x,split = "_")[[1]]
      return(paste(b[1:(length(b)-1)],collapse = "_"))
    },USE.NAMES = F))
    if(organ %in% organsAvail) sigs <- all_organ_sigs_subs[,colnames(all_organ_sigs_subs)[grep(pattern = paste0("^",organ),colnames(all_organ_sigs_subs))]]
  }else if(typemut=="rearr" & (version=="1" | version=="latest") & (cohort=="best" | cohort=="ICGC")){
    organsAvail <- unique(sapply(colnames(all_organ_sigs_rearr), function(x) {
      b <- strsplit(x,split = "_")[[1]]
      return(paste(b[1:(length(b)-1)],collapse = "_"))
    },USE.NAMES = F))
    if(organ %in% organsAvail) sigs <- all_organ_sigs_rearr[,colnames(all_organ_sigs_rearr)[grep(pattern = paste0("^",organ),colnames(all_organ_sigs_rearr))]]
  }else if(typemut=="DNV" & (version=="2" | version=="latest")){
    cohortToUse <- cohort
    if(cohortToUse=="best") cohortToUse <- "GEL"
    signames <- colnames(organSignaturesDBSv1.01)[grepl(colnames(organSignaturesDBSv1.01),pattern = cohortToUse)]
    if(length(signames)>0){
      organsAvail <- unique(sapply(signames, function(x) {
        a <- strsplit(x,split = "-")[[1]]
        b <- strsplit(a[2],split = "_")[[1]]
        pos <- which(b %in% c("common","rare"))
        return(paste(b[1:(pos-1)],collapse = "_"))
      },USE.NAMES = F))
    }
    if(organ %in% organsAvail) sigs <- organSignaturesDBSv1.01[,grepl(colnames(organSignaturesDBSv1.01),pattern = paste0(cohortToUse,"-",organ)),drop=F]
  }else if(typemut=="subs" & (version=="2" | version=="latest")){
    cohortToUse <- cohort
    if(cohortToUse=="best") {
      cohortToUse <- "GEL"
      if(organ=="Esophagus" | organ=="Head_neck") cohortToUse <- "ICGC"
    }
    signames <- colnames(organSignaturesSBSv2.03)[grepl(colnames(organSignaturesSBSv2.03),pattern = cohortToUse)]
    if(length(signames)>0){
      organsAvail <- unique(sapply(signames, function(x) {
        a <- strsplit(x,split = "-")[[1]]
        b <- strsplit(a[2],split = "_")[[1]]
        pos <- which(b %in% c("common","rare"))
        return(paste(b[1:(pos-1)],collapse = "_"))
      },USE.NAMES = F))
    }
    if(organ %in% organsAvail) sigs <- organSignaturesSBSv2.03[,grepl(colnames(organSignaturesSBSv2.03),pattern = paste0(cohortToUse,"-",organ)),drop=F]
    # need the actual organsAvail for the "best" cohort, which is GEL all organs + ICGC Esophagus and Head_neck
    if(cohort=="best"){
      signames <- colnames(organSignaturesSBSv2.03)[grepl(colnames(organSignaturesSBSv2.03),pattern = "GEL")]
      organsAvail <- unique(sapply(signames, function(x) {
        a <- strsplit(x,split = "-")[[1]]
        b <- strsplit(a[2],split = "_")[[1]]
        pos <- which(b %in% c("common","rare"))
        return(paste(b[1:(pos-1)],collapse = "_"))
      },USE.NAMES = F))
      organsAvail <- c(organsAvail,"Esophagus","Head_neck")
    }
  }
  if(is.null(sigs)){
    message("[warning getOrganSignatures] Organ ",organ, " not available for mutation type ",typemut, ", version ",version, " and cohort ",cohort,".")
    if(verbose) message("[warning getOrganSignatures] retrived sigs is NULL.")
  }else{
    if(ncol(sigs)==0){
      message("[warning getOrganSignatures] Organ ",organ, " not available for mutation type ",typemut, ", version ",version, " and cohort ",cohort,".")
      if(verbose) message("[warning getOrganSignatures] retrived sigs is an empty table, set to NULL.")
      sigs <- NULL
    }
  }
  if(is.null(sigs) & !is.null(organsAvail)){
    message("[warning getOrganSignatures] Organ ",organ, " not available, but you can try one of the following for your specified cohort ",cohort,": ",paste(organsAvail,collapse = ", "))
  }
  return(sigs)
}


#' getReferenceSignatures
#'
#' This function returns the reference signatures for a given mutation type as defined in Degasperi et al. 2020 Nat Cancer paper,
#' and Degasperi et al. 2022 Science paper.
#'
#' @param typemut either subs, DNV or rearr
#' @param version version "1" includes subs or rearr (ICGC cohort) reference signatures from Degasperi et al. 2020, while version "2" includes the improved subs reference signatures and the DNV signatures from Degasperi et al. 2022. Set to "latest" to get the latest signature available for a given mutation type.
#' @return reference signatures matrix
#' @references A. Degasperi, T. D. Amarante, J. Czarnecki, S. Shooter, X. Zou, D. Glodzik, S. Morganella, A. S. Nanda, C. Badja, G. Koh, S. E. Momen, I. Georgakopoulos-Soares, J. M. L. Dias, J. Young, Y. Memari, H. Davies, S. Nik-Zainal. A practical framework and online tool for mutational signature analyses show intertissue variation and driver dependencies, Nature Cancer, https://doi.org/10.1038/s43018-020-0027-5, 2020.
#' @references A. Degasperi, X. Zou, T. D. Amarante, ..., H. Davies, Genomics England Research Consortium, S. Nik-Zainal. Substitution mutational signatures in whole-genome-sequenced cancers in the UK population. Science, 2022.
#' @export
getReferenceSignatures <- function(version="latest",typemut="subs",verbose = TRUE){
  sigs <- NULL
  if(typemut=="subs" & version=="1"){
    sigs <- RefSigv1_subs
  }else if(typemut=="rearr" & (version=="1" | version=="latest")){
    sigs <- RefSigv1_rearr
  }else if(typemut=="DNV" & (version=="2" | version=="1" | version=="latest")){
    sigs <- referenceSignaturesDBSv1.01
  }else if(typemut=="subs" & (version=="2" | version=="latest")){
    sigs <- referenceSignaturesSBSv2.03
  }
  if(is.null(sigs)){
    message("[warning getReferenceSignatures] Reference signatures not available for mutation type ",typemut, ", version ",version, ".")
  }else{
    if(ncol(sigs)==0 & verbose) message("[warning getReferenceSignatures] Reference signatures not available for mutation type ",typemut, ", version ",version, ".")
  }
  return(sigs)
}


#' getCOSMICSignatures
#'
#' This function returns the COSMIC signatures for a given mutation type.
#'
#' @param typemut either subs, DNV or rearr
#' @param version this is either "latest", which is v3.2 for SBS and DBS, or it is possible to specify "2" in combination with typemut="subs" to obtain the old COSMIC 30 signatures. For rearrangements, this function only returns the 6 breast cancer rearr signatures from Nik-Zainal et al. 2016.
#' @return reference signatures matrix
#' @export
getCOSMICSignatures <- function(version="latest",typemut="subs",verbose = TRUE){
  sigs <- NULL
  if(typemut=="subs" & version=="2"){
    sigs <- cosmic30
  }else if(typemut=="rearr" & version=="latest"){
    sigs <- RS.Breast560
  }else if(typemut=="DNV" & (version=="3.2" | version=="latest")){
    sigs <- COSMIC_v3.2_DBS_GRCh37
  }else if(typemut=="subs" & (version=="3.2" | version=="latest")){
    sigs <- COSMIC_v3.2_SBS_GRCh37
  }
  if(is.null(sigs)){
    message("[warning getCOSMICSignatures] COSMIC signatures not available for mutation type ",typemut, ", version ",version, ".")
  }else{
    if(ncol(sigs)==0 & verbose) message("[warning getCOSMICSignatures] COSMIC signatures not available for mutation type ",typemut, ", version ",version, ".")
  }
  return(sigs)
}


# # store before converting
# exposures_organSpecific_subs <- exposures_subs
# # if RefSigv2 was used, there may be some SBS sigs in the names
# sbssigs <- rownames(exposures_subs)[grepl(rownames(exposures_subs),pattern = "^SBS")]
# if(length(sbssigs)>0){
#   exposures_subs_toconvert <- exposures_subs[setdiff(rownames(exposures_subs),sbssigs),,drop=F]
#   exposures_subs_sbs <- exposures_subs[sbssigs,,drop=F]
#   exposures_subs_converted <- convertExposuresFromOrganToRefSigs(exposures_subs_toconvert,typemut = "subs")
#   exposures_subs_converted <- exposures_subs_converted[apply(exposures_subs_converted, 1,sum)>0,,drop=FALSE]
#   exposures_subs <- rbind(exposures_subs_converted,exposures_subs_sbs)
# }else{
#   exposures_subs <- convertExposuresFromOrganToRefSigs(exposures_subs,typemut = "subs")
#   exposures_subs <- exposures_subs[apply(exposures_subs, 1,sum)>0,,drop=FALSE]
# }

#' convertExposuresFromOrganToRefSigsIfAny
#'
#' This function inspects an exposure matrix, and converts organ-specific signature exposures into reference signature exposures if it finds any.
#' If the exposure matrix contains reference signature exposures to start with, then reference signature exposures are merged if possible.
#' The function will detect the version of the signatures automatically.
#'
#' @param typemut either subs, DNV or rearr
#' @param expMatrix exposures matrix obatined from fitting organ-specific signatures
#' @return exposure matrix with converted signatures, or unaltered exposure matrix if organ-specific signatures where not found
#' @references A. Degasperi, T. D. Amarante, J. Czarnecki, S. Shooter, X. Zou, D. Glodzik, S. Morganella, A. S. Nanda, C. Badja, G. Koh, S. E. Momen, I. Georgakopoulos-Soares, J. M. L. Dias, J. Young, Y. Memari, H. Davies, S. Nik-Zainal. A practical framework and online tool for mutational signature analyses show intertissue variation and driver dependencies, Nature Cancer, https://doi.org/10.1038/s43018-020-0027-5, 2020.
#' @export
convertExposuresFromOrganToRefSigsIfAny <- function(expMatrix,typemut="subs"){
  # check for organ-specific signatures and pick the correct conversion matrix
  organSpecificSignatures <- NULL
  conversionMatrix <- NULL
  if(typemut=="subs"){
    if(any(rownames(expMatrix) %in% rownames(conversion_matrix_subs))){
      organSpecificSignatures <- intersect(rownames(expMatrix),rownames(conversion_matrix_subs))
      conversionMatrix <- conversion_matrix_subs
    }else if(any(rownames(expMatrix) %in% rownames(conversionMatrixSBSv2.03))){
      organSpecificSignatures <- intersect(rownames(expMatrix),rownames(conversionMatrixSBSv2.03))
      conversionMatrix <- conversionMatrixSBSv2.03
    }
  }else if(typemut=="rearr"){
    if(any(rownames(expMatrix) %in% rownames(conversion_matrix_rearr))){
      organSpecificSignatures <- intersect(rownames(expMatrix),rownames(conversion_matrix_rearr))
      conversionMatrix <- conversion_matrix_rearr
    }
  }else if(typemut=="DNV"){
    if(any(rownames(expMatrix) %in% rownames(conversionMatrixDBSv1.01))){
      organSpecificSignatures <- intersect(rownames(expMatrix),rownames(conversionMatrixDBSv1.01))
      conversionMatrix <- conversionMatrixDBSv1.01
    }
  }
  
  # if no organ-specific signatures were found, that's great, just end early
  if(is.null(organSpecificSignatures)) return(expMatrix)
  
  # now, let's say that we found something, we need to convert
  exposures_to_convert <- expMatrix[organSpecificSignatures,,drop=F]
  exposures_converted <- t(conversionMatrix[rownames(exposures_to_convert),]) %*% as.matrix(exposures_to_convert)
  exposures_converted <- exposures_converted[apply(exposures_converted, 1,sum)>0,,drop=FALSE]
  
  # now let's see if there are some leftover signatures to append
  if(nrow(expMatrix)>length(organSpecificSignatures)){
    leftoversigs <- setdiff(rownames(expMatrix),organSpecificSignatures)
    # check if some converted refsigs overlap with leftover signatures
    overlapsigs <- intersect(leftoversigs,rownames(exposures_converted))
    if(length(overlapsigs)>0){
      exposures_converted[overlapsigs,] <- exposures_converted[overlapsigs,] + expMatrix[overlapsigs,]
      leftoversigs <- setdiff(leftoversigs,overlapsigs)
    }
    # now add everything together
    if(length(leftoversigs)>0){
      exposures_converted <- rbind(exposures_converted,expMatrix[leftoversigs,])
    }
  }
  
  return(exposures_converted)
}

#' convertExposuresFromOrganToRefSigs
#'
#' This function converts the exposures matrix obtained from fitting organ-specific signatures into reference signatures exposures.
#' The function will detect the version of the signatures automatically.
#'
#' @param typemut either subs, DNV or rearr
#' @param expMatrix exposures matrix obtained from fitting organ-specific signatures, with samples as columns and organ-specific signature names as row names. Must contain only organ-specific signature exposures
#' @return exposure matrix converted in reference signatures exposures, or NULL if expMatrix doesn't contain only organ-specific signatures or typemut is unknown
#' @references A. Degasperi, T. D. Amarante, J. Czarnecki, S. Shooter, X. Zou, D. Glodzik, S. Morganella, A. S. Nanda, C. Badja, G. Koh, S. E. Momen, I. Georgakopoulos-Soares, J. M. L. Dias, J. Young, Y. Memari, H. Davies, S. Nik-Zainal. A practical framework and online tool for mutational signature analyses show intertissue variation and driver dependencies, Nature Cancer, https://doi.org/10.1038/s43018-020-0027-5, 2020.
#' @export
convertExposuresFromOrganToRefSigs <- function(expMatrix,typemut="subs"){
  exposures <- NULL
  if(typemut=="subs"){
    if(all(rownames(expMatrix) %in% rownames(conversion_matrix_subs))){
      exposures <- t(conversion_matrix_subs[rownames(expMatrix),]) %*% as.matrix(expMatrix)
    }else if(all(rownames(expMatrix) %in% rownames(conversionMatrixSBSv2.03))){
      exposures <- t(conversionMatrixSBSv2.03[rownames(expMatrix),]) %*% as.matrix(expMatrix)
    }
  }else if(typemut=="rearr"){
    exposures <- t(conversion_matrix_rearr[rownames(expMatrix),]) %*% as.matrix(expMatrix)
  }else if(typemut=="DNV"){
    exposures <- t(conversionMatrixDBSv1.01[rownames(expMatrix),]) %*% as.matrix(expMatrix)
  }
  return(exposures)
}

convertSigNamesFromOrganToRefSigs_subroutine <- function(sigNames,typemut="subs"){
  convertedNames <- NULL
  if(typemut=="subs"){
    if(all(sigNames %in% rownames(conversion_matrix_subs))){
      convertedNames <- colnames(conversion_matrix_subs)[apply(conversion_matrix_subs[sigNames,],2,sum)>0]
    }else if(all(sigNames %in% rownames(conversionMatrixSBSv2.03))){
      convertedNames <- colnames(conversionMatrixSBSv2.03)[apply(conversionMatrixSBSv2.03[sigNames,],2,sum)>0]
    }
  }else if(typemut=="rearr"){
    convertedNames <- colnames(conversion_matrix_rearr)[apply(conversion_matrix_rearr[sigNames,],2,sum)>0]
  }else if(typemut=="DNV"){
    convertedNames <- colnames(conversionMatrixDBSv1.01)[apply(conversionMatrixDBSv1.01[sigNames,],2,sum)>0]
  }
  return(convertedNames)
}

convertSigNamesFromOrganToRefSigs <- function(sigNames,typemut="subs",mixedOnly=FALSE){
  # alternatively just convert the mixed signatures
  finalNames <- NULL
  if(mixedOnly){
    finalNames <- sigNames
    isMixed <- grepl(finalNames,pattern = "[+]")
    if(any(isMixed)){
      mixedSigs <- finalNames[isMixed]
      finalNames <- setdiff(finalNames,mixedSigs)
      finalNamesRefSig <- convertSigNamesFromOrganToRefSigs_subroutine(finalNames,typemut)
      mixedSigsRefSig <- convertSigNamesFromOrganToRefSigs_subroutine(mixedSigs,typemut)
      mixedSigsRefSig <- setdiff(mixedSigsRefSig,finalNamesRefSig)
      finalNames <- c(finalNames,mixedSigsRefSig)
    }
  }else{
    finalNames <- convertSigNamesFromOrganToRefSigs_subroutine(sigNames,typemut)
  }
  return(finalNames)
}
Nik-Zainal-Group/signature.tools.lib documentation built on April 13, 2025, 5:50 p.m.