#' Assess/evaluate multiple summarized runs on one dataset by one computational approach.
#'
#' Summarize results from each computational approach in \code{resultPath}/\code{run.names}
#' (generated by running a computational approach),
#' combine them into \code{resultPath}.
#'
#' @param datasetName Name of the dataset. (e.g. "S.0.1.Rsq.0.1").
#' Usually, it is has the same name as \code{basename(top.dir)}.
#'
#' @param toolName Name of computational approach. (e.g. "SigProExtractor")
#'
#' @param resultPath Path expected to have multiple result folders each
#' named as \code{run.names} (e.g. "seed.1").
#' The example \code{resultPath} is
#' S.0.1.Rsq.0.1/sp.sp/ExtrAttr/hdp.results/ in old folder structure, or
#' 3a.Original_output_K_unspecified/hdp/S.0.1.Rsq.0.1 in new folder structure.
#'
#' @param run.names A character vector records the list of directories
#' which are under \code{resultPath} and contain results of computational approach,
#' and a \code{summary} folder generated by \code{\link{SummarizeSigOneExtrAttrSubdir}}.
#'
#' @return A list contain values of measures measures in multiple runs: \itemize{
#' \item $averCosSim Average cosine similarity. Only similarities between
#' TP sigs and extracted sigs most similar to them.
#' \item $truePos True Positives(TP): Reference signatures which are
#' active in the spectra, and extracted.
#' \item $falseNeg False Negatives(FN): Reference signatures not extracted.
#' \item $falsePos False Positives(FP): Signatures wrongly extracted,
#' not resembling any reference signatures.
#' \item $TPR True positive rate (TPR, Sensitivity): TP / (TP + FN)
#' \item $PPV Positive predictive value (PPV, Precision): TP / (FP + TP)
#' \item $cosSim Cosine similarity between each of the reference signatures,
#' and its most similar extracted signature.
#' \item $AggManhattanDist (if exposures of signatures were inferred) Scaled
#' Manhattan distance between ground-truth and inferred
#' exposures to each of the reference signatures.
#' }
#' This list also contains \code{mean} and \code{sd}, and other
#' statistics of these measures in \itemize{
#' \item $fivenum - summary generated by \code{\link[stats]{fivenum}} -
#' columns of this table refer to Tukey's five number summary
#' for each extraction measure across all runs: \itemize{
#'
#' \item \code{min} - minimum
#' \item \code{lower-hinge} - first quartile. Serve as the lower-hinge
#' of the box-whisker plot.
#' \item \code{median} - median of measure across all runs.
#' \item \code{upper-hinge} - third quartile. Serve as the upper-hinge
#' of the box-whisker plot.
#' \item \code{max} - maximum
#'
#' }
#'
#' \item $fivenumMD - Tukey's five number summary for aggregately-scaled
#' Manhattan distance.
#' \item $meanSD - mean and standard deviation for extraction measures.
#' \item $meanSDMD - mean and standard deviation for aggregately-scaled
#' Manhattan distance.
#' }
#'
#' @details Also writes multiple files into folder \code{resultPath}.
#'
#' @importFrom utils write.csv capture.output sessionInfo
#'
#' @importFrom rlang .data
#'
#' @export
SummarizeMultiRuns <-
function(datasetName,
toolName,
resultPath,
run.names){
# Initialization ----------------------------------------------------------
indexes <- c("averCosSim","falseNeg","falsePos",
"truePos","TPR","PPV")
for(index in indexes) assign(index,numeric(0))
cosSim <- list()
# Combine extraction measures of multi runs on same data set --------------
for(runName in run.names){
# Directories of one run. Should include a sub-directory "summary"
runDir <- paste0(resultPath,"/",runName)
summaryDir <- paste0(runDir,"/summary")
sigAnalysisFile <- paste0(summaryDir,"/sigAnalysis.RDa")
# Add sigAnalysis <- NULL to please the Rcheck
sigAnalysis <- NULL
load(file = sigAnalysisFile)
# Names of reference signatures
gtSigNames <- colnames(sigAnalysis$ref.sigs)
# Number of TP, TN and FP sigs
falseNeg <- c(falseNeg, sigAnalysis$FN)
falsePos <- c(falsePos, sigAnalysis$FP)
truePos <- c(truePos, sigAnalysis$TP)
# Average cosine similarity
# only best matches with cosSim > thres (0.9 by default) are counted
averCosSim <- c(averCosSim, sigAnalysis$averCosSim)
# TPR (True positive rate) and PPV (Positive predictive value)
TPR <- c(TPR, sigAnalysis$TPR)
PPV <- c(PPV, sigAnalysis$PPV)
# Best cosine similarity between each reference signature
# and its MOST SIMILAR extracted signature
for(gtSigName in gtSigNames) {
if(is.null(cosSim[[gtSigName]]))
cosSim[[gtSigName]] <- numeric(0)
cosSim[[gtSigName]] <-
c(cosSim[[gtSigName]], sigAnalysis$cosSim[gtSigName])
}
}
# Save vectors and lists for combined summary of multi runs ---------------
# Name every vector by run names (e.g. "seed.1")
names(averCosSim) <- run.names
names(falseNeg) <- run.names
names(falsePos) <- run.names
names(truePos) <- run.names
names(TPR) <- run.names
names(PPV) <- run.names
for (gtSigName in gtSigNames) {
names(cosSim[[gtSigName]]) <- run.names
}
multiRun <- list()
# Save name of the spectra dataset and.signature analysis tool
multiRun$datasetName <- datasetName
multiRun$toolName <- toolName
# Save extraction measures on multiple runs
multiRun$averCosSim <- averCosSim
multiRun$falseNeg <- falseNeg
multiRun$falsePos <- falsePos
multiRun$truePos <- truePos
multiRun$TPR <- TPR
multiRun$PPV <- PPV
# Save best cosine similarity for each reference signature
multiRun$cosSim <- cosSim
# Calculate mean and SD for indexes of signature extraction ---------------
multiRun$meanSD <- matrix(nrow = 6, ncol = 2)
indexes <- c("averCosSim","falseNeg","falsePos",
"truePos","TPR","PPV")
rownames(multiRun$meanSD) <- indexes
colnames(multiRun$meanSD) <- c("mean","stdev")
for(index in indexes){
currentMean <- mean(multiRun[[index]])
currentStdev <- stats::sd(multiRun[[index]])
multiRun$meanSD[index,] <- c(currentMean, currentStdev)
}
# Calculate fivenums for signature extraction -----------------------------
multiRun$fivenum <- matrix(nrow = 6, ncol = 5)
indexes <- c("averCosSim","falseNeg","falsePos",
"truePos","TPR","PPV")
rownames(multiRun$fivenum) <- indexes
colnames(multiRun$fivenum) <- c("min","lower-hinge","median","upper-hinge","max")
for(index in indexes){
currentFiveNum <- stats::fivenum(multiRun[[index]])
multiRun$fivenum[index,] <- currentFiveNum
}
# Plot boxplot + beeswarm plot for signature extraction -------------------
if(FALSE){
titles <- c("averCosSim" = "Average cosine similarity",
"falseNeg" = "False negatives",
"falsePos" = "False positives",
"truePos" = "True positives",
"TPR" = "True positive rate (sensitivity)",
"PPV" = "Positive predictive value (PPV)")
subtitles <- c("averCosSim" = "",
"falseNeg" = "Number of reference signatures not extracted",
"falsePos" = "Number of signatures extracted, but different from reference signatures",
"truePos" = "Number of reference signatures extracted",
"TPR" = "#True Positives / (#True Positives + #False Negatives)",
"PPV" = "#True Positives / (#True Positives + #False Positives)")
# ggplot2 boxplot + beeswarm plot
ggplotList <- list()
for(index in indexes){
indexNum <- which(index == indexes)
ggplotList[[index]] <- ggplot2::ggplot(
data.frame(value = multiRun[[index]],
indexName = index),
ggplot2::aes(x = .data$indexName, y = .data$value))
ggplotList[[index]] <- ggplotList[[index]] +
ggplot2::ggtitle(titles[indexNum],subtitle = subtitles[indexNum])
ggplotList[[index]] <- ggplotList[[index]] +
ggplot2::geom_boxplot() +
ggbeeswarm::geom_quasirandom(groupOnX = TRUE, size = 0.3) +
# Restrict the decimal numbers of values of indexes to be 2
ggplot2::scale_y_continuous(labels =function(x) sprintf("%.2f", x))
}
for(gtSigName in gtSigNames){
ggplotList[[gtSigName]] <- ggplot2::ggplot(
data.frame(value = multiRun$cosSim[[gtSigName]],
gtSigName = gtSigName),
ggplot2::aes(x = .data$gtSigName, y = .data$value))
ggplotList[[gtSigName]] <- ggplotList[[gtSigName]] +
ggplot2::ggtitle(label = paste0("Cosine similarity to signature ",gtSigName),
subtitle = paste0("Considers all extracted signatures resembling ", gtSigName))
ggplotList[[gtSigName]] <- ggplotList[[gtSigName]] +
ggplot2::geom_boxplot() +
ggbeeswarm::geom_quasirandom(groupOnX = TRUE, size = 0.3) +
# Restrict the decimal numbers of values of indexes to be 2
ggplot2::scale_y_continuous(labels =function(x) sprintf("%.2f", x))
}
# Print high-resolution extraction indexes into one png file
# Only include extraction index plots
# in tempPlotList.
if(FALSE){
tempPlotList <- list()
for(index in indexes){
tempPlotList[[index]] <- ggplotList[[index]]
}
suppressMessages(
ggplot2::ggsave(
filename = paste0(resultPath,"/boxplot.extraction.png"),
plot = ggpubr::ggarrange(plotlist = tempPlotList),
device = "png",
dpi = 1000,
limitsize = FALSE
)
)
}
# Print extraction indexes into one pdf file
grDevices::pdf(paste0(resultPath,"/boxplot.extraction.measures.pdf"), pointsize = 1)
for (index in indexes) print(ggplotList[[index]])
grDevices::dev.off()
# Print high-resolution extraction indexes into one png file
# Only include one-signature cosine similarity plots
# in tempPlotList.
if(FALSE){
tempPlotList <- list()
for(gtSigName in gtSigNames){
tempPlotList[[gtSigName]] <- ggplotList[[gtSigName]]
}
suppressMessages(
ggplot2::ggsave(
filename = paste0(resultPath,"/boxplot.onesig.cossim.png"),
plot = ggpubr::ggarrange(plotlist = tempPlotList),
device = "png",
dpi = 1000,
limitsize = FALSE
)
)
}
# Print extraction indexes into one pdf file
grDevices::pdf(paste0(resultPath,"/boxplot.onesig.cossim.pdf"), pointsize = 1)
for (gtSigName in gtSigNames) print(ggplotList[[gtSigName]])
grDevices::dev.off()
}
# Check if result summary files "exposureDiff.Rda" exists for all runs ----
exposureFlag <- TRUE
for(runName in run.names){
runDir <- paste0(resultPath,"/",runName)
summaryDir <- paste0(runDir,"/summary")
exposureDiffFile <- paste0(summaryDir,"/exposureDiff.RDa")
if(!file.exists(exposureDiffFile)){
exposureFlag <- FALSE
#message("Skip summarizing scaled Manhattan distance...\n")
break
}
}
# Summarize aggregated scaled Manhattan distance --------------------------
if(exposureFlag){
# Read scaled aggregated Manhattan distances in multiple runs
AggManhattanDist <- matrix(nrow = length(gtSigNames), ncol = length(run.names))
rownames(AggManhattanDist) <- gtSigNames
colnames(AggManhattanDist) <- run.names
for(runName in run.names){
runDir <- paste0(resultPath,"/",runName)
summaryDir <- paste0(runDir,"/summary")
exposureDiffFile <- paste0(summaryDir,"/exposureDiff.RDa")
# Add exposureDiff <- NULL to please the R check
exposureDiff <- NULL
load(file = exposureDiffFile)
AggManhattanDist[gtSigNames,runName] <- exposureDiff$aggregated[gtSigNames,"Scaled.Aggregated.Manhattan.distance"]
}
multiRun$AggManhattanDist <- AggManhattanDist
# Calculate mean and SD for aggregated Manhattan distance
meanSDAggMD <- matrix(nrow = length(gtSigNames), ncol = 2)
rownames(meanSDAggMD) <- gtSigNames
colnames(meanSDAggMD) <- c("mean","stdev")
for(gtSigName in gtSigNames){
meanSDAggMD[gtSigName,"mean"] <- mean(AggManhattanDist[gtSigName,])
meanSDAggMD[gtSigName,"stdev"] <- stats::sd(AggManhattanDist[gtSigName,])
}
multiRun$meanSDAggMD <- meanSDAggMD
# Calculate fivenums for exposure inference Scaled Manhattan distance
multiRun$fivenumAggMD <- matrix(nrow = length(gtSigNames), ncol = 5)
rownames(multiRun$fivenumAggMD) <- gtSigNames
colnames(multiRun$fivenumAggMD) <- c("min","lower-hinge","median","upper-hinge","max")
for(gtSigName in gtSigNames){
multiRun$fivenumAggMD[gtSigName,] <- stats::fivenum(AggManhattanDist[gtSigName,])
}
}
# Summarize Manhattan distance scaled for each tumor ----------------------
# only if exposureFlag is TRUE.
if(exposureFlag){
if(TRUE){
# Read scaled aggregated Manhattan distances in multiple runs
meanSepMD <- matrix(nrow = length(gtSigNames), ncol = length(run.names))
sdSepMD <- matrix(nrow = length(gtSigNames), ncol = length(run.names))
rownames(meanSepMD) <- gtSigNames
colnames(meanSepMD) <- run.names
rownames(sdSepMD) <- gtSigNames
colnames(sdSepMD) <- run.names
for(runName in run.names){
runDir <- paste0(resultPath,"/",runName)
summaryDir <- paste0(runDir,"/summary")
exposureDiffFile <- paste0(summaryDir,"/exposureDiff.RDa")
# Add exposureDiff <- NULL to please the R check
exposureDiff <- NULL
load(file = exposureDiffFile)
for(gtSigName in gtSigNames){
meanSepMD[gtSigName,runName] <- mean(exposureDiff$separated[[gtSigName]][,"Scaled.Manhattan.distance"])
sdSepMD[gtSigName,runName] <- stats::sd(exposureDiff$separated[[gtSigName]][,"Scaled.Manhattan.distance"])
}
}
multiRun$meanSepMD <- meanSepMD
multiRun$sdSepMD <- sdSepMD
}
}
# Save data and results ---------------------------------------------------
save(multiRun,file = paste0(resultPath,"/multiRun.RDa"))
write.csv(x = multiRun$meanSD,
file = paste0(resultPath,"/meanSD.csv"))
write.csv(x = multiRun$fivenum,
file = paste0(resultPath,"/fivenum.csv"))
if(exposureFlag){
write.csv(x = multiRun$AggManhattanDist,
file = paste0(resultPath,"/Scaled.Aggregated.ManhattanDist.csv"))
write.csv(x = multiRun$meanSDAggMD,
file = paste0(resultPath,"/meanSD.Scaled.Aggregated.Manhattan.dist.csv"))
write.csv(x = multiRun$fivenumAggMD,
file = paste0(resultPath,"/fivenum.Scaled.Aggregated.Manhattan.dist.csv"))
write.csv(x = multiRun$meanSepMD,
file = paste0(resultPath,"/mean.of.sep.Scaled.Manhattan.dist.csv"))
write.csv(x = multiRun$sdSepMD,
file = paste0(resultPath,"/stdev.of.sep.Scaled.Manhattan.dist.csv"))
}
invisible(multiRun)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.