R/SummarizeMultiRuns.R

Defines functions SummarizeMultiRuns

Documented in SummarizeMultiRuns

#' Assess/evaluate multiple summarized runs on one dataset by one computational approach.
#'
#' Summarize results from each computational approach in \code{resultPath}/\code{run.names}
#' (generated by running a computational approach),
#' combine them into \code{resultPath}.
#'
#' @param datasetName Name of the dataset. (e.g. "S.0.1.Rsq.0.1").
#' Usually, it is has the same name as \code{basename(top.dir)}.
#'
#' @param toolName Name of computational approach. (e.g. "SigProExtractor")
#'
#' @param resultPath Path expected to have multiple result folders each
#' named as \code{run.names} (e.g. "seed.1").
#' The example \code{resultPath} is
#' S.0.1.Rsq.0.1/sp.sp/ExtrAttr/hdp.results/ in old folder structure, or
#' 3a.Original_output_K_unspecified/hdp/S.0.1.Rsq.0.1 in new folder structure.
#'
#' @param run.names A character vector records the list of directories
#' which are under \code{resultPath} and contain results of computational approach,
#' and a \code{summary} folder generated by \code{\link{SummarizeSigOneExtrAttrSubdir}}.
#'
#' @return A list contain values of measures measures in multiple runs: \itemize{
#' \item $averCosSim Average cosine similarity. Only similarities between
#'   TP sigs and extracted sigs most similar to them.
#' \item $truePos True Positives(TP): Reference signatures which are
#'   active in the spectra, and extracted.
#' \item $falseNeg False Negatives(FN): Reference signatures not extracted.
#' \item $falsePos False Positives(FP): Signatures wrongly extracted,
#'   not resembling any reference signatures.
#' \item $TPR True positive rate (TPR, Sensitivity): TP / (TP + FN)
#' \item $PPV Positive predictive value (PPV, Precision): TP / (FP + TP)
#' \item $cosSim Cosine similarity between each of the reference signatures,
#'    and its most similar extracted signature.
#' \item $AggManhattanDist (if exposures of signatures were inferred) Scaled
#'   Manhattan distance between ground-truth and inferred
#'   exposures to each of the reference signatures.
#' }
#' This list also contains \code{mean} and \code{sd}, and other
#' statistics of these measures in \itemize{
#' \item $fivenum - summary generated by \code{\link[stats]{fivenum}} -
#'   columns of this table refer to Tukey's five number summary
#'   for each extraction measure across all runs: \itemize{
#'
#' \item \code{min} - minimum
#' \item \code{lower-hinge} - first quartile. Serve as the lower-hinge
#'  of the box-whisker plot.
#' \item \code{median} - median of measure across all runs.
#' \item \code{upper-hinge} - third quartile. Serve as the upper-hinge
#'   of the box-whisker plot.
#' \item \code{max} - maximum
#'
#' }
#'
#' \item $fivenumMD - Tukey's five number summary for aggregately-scaled
#'   Manhattan distance.
#' \item $meanSD - mean and standard deviation for extraction measures.
#' \item $meanSDMD - mean and standard deviation for aggregately-scaled
#'   Manhattan distance.
#' }
#'
#' @details Also writes multiple files into folder \code{resultPath}.
#'
#' @importFrom utils write.csv capture.output sessionInfo
#'
#' @importFrom rlang .data
#'
#' @export
SummarizeMultiRuns <-
  function(datasetName,
           toolName,
           resultPath,
           run.names){



    # Initialization ----------------------------------------------------------
    indexes <- c("averCosSim","falseNeg","falsePos",
                 "truePos","TPR","PPV")
    for(index in indexes) assign(index,numeric(0))
    cosSim <- list()



    # Combine extraction measures of multi runs on same data set --------------
    for(runName in run.names){

      # Directories of one run. Should include a sub-directory "summary"
      runDir <- paste0(resultPath,"/",runName)
      summaryDir <- paste0(runDir,"/summary")
      sigAnalysisFile <- paste0(summaryDir,"/sigAnalysis.RDa")
      # Add sigAnalysis <- NULL to please the Rcheck
      sigAnalysis <- NULL
      load(file = sigAnalysisFile)

      # Names of reference signatures
      gtSigNames <- colnames(sigAnalysis$ref.sigs)

      # Number of TP, TN and FP sigs
      falseNeg <- c(falseNeg, sigAnalysis$FN)
      falsePos <- c(falsePos, sigAnalysis$FP)
      truePos <- c(truePos, sigAnalysis$TP)

      # Average cosine similarity
      # only best matches with cosSim > thres (0.9 by default) are counted
      averCosSim <- c(averCosSim, sigAnalysis$averCosSim)

      # TPR (True positive rate) and PPV (Positive predictive value)
      TPR <- c(TPR, sigAnalysis$TPR)
      PPV <- c(PPV, sigAnalysis$PPV)

      # Best cosine similarity between each reference signature
      # and its MOST SIMILAR extracted signature
      for(gtSigName in gtSigNames) {
        if(is.null(cosSim[[gtSigName]]))
          cosSim[[gtSigName]] <- numeric(0)
        cosSim[[gtSigName]] <-
          c(cosSim[[gtSigName]], sigAnalysis$cosSim[gtSigName])
      }
    }



    # Save vectors and lists for combined summary of multi runs ---------------

    # Name every vector by run names (e.g. "seed.1")
    names(averCosSim) <- run.names
    names(falseNeg) <- run.names
    names(falsePos) <- run.names
    names(truePos) <- run.names
    names(TPR) <- run.names
    names(PPV) <- run.names
    for (gtSigName in gtSigNames) {
        names(cosSim[[gtSigName]]) <- run.names
    }

    multiRun <- list()
    # Save name of the spectra dataset and.signature analysis tool
    multiRun$datasetName <- datasetName
    multiRun$toolName <- toolName
    # Save extraction measures on multiple runs
    multiRun$averCosSim <- averCosSim
    multiRun$falseNeg <- falseNeg
    multiRun$falsePos <- falsePos
    multiRun$truePos <- truePos
    multiRun$TPR <- TPR
    multiRun$PPV <- PPV
    # Save best cosine similarity for each reference signature
    multiRun$cosSim <- cosSim



    # Calculate mean and SD for indexes of signature extraction ---------------
    multiRun$meanSD <- matrix(nrow = 6, ncol = 2)
    indexes <- c("averCosSim","falseNeg","falsePos",
                 "truePos","TPR","PPV")
    rownames(multiRun$meanSD) <- indexes
    colnames(multiRun$meanSD) <- c("mean","stdev")
    for(index in indexes){
      currentMean <- mean(multiRun[[index]])
      currentStdev <- stats::sd(multiRun[[index]])
      multiRun$meanSD[index,] <- c(currentMean, currentStdev)
    }



    # Calculate fivenums for signature extraction -----------------------------
    multiRun$fivenum <- matrix(nrow = 6, ncol = 5)
    indexes <- c("averCosSim","falseNeg","falsePos",
                 "truePos","TPR","PPV")
    rownames(multiRun$fivenum) <- indexes
    colnames(multiRun$fivenum) <- c("min","lower-hinge","median","upper-hinge","max")
    for(index in indexes){
      currentFiveNum <- stats::fivenum(multiRun[[index]])
      multiRun$fivenum[index,] <- currentFiveNum
    }



    # Plot boxplot + beeswarm plot for signature extraction -------------------
    if(FALSE){

      titles <- c("averCosSim" = "Average cosine similarity",
                  "falseNeg" = "False negatives",
                  "falsePos" = "False positives",
                  "truePos" = "True positives",
                  "TPR" = "True positive rate (sensitivity)",
                  "PPV" = "Positive predictive value (PPV)")
      subtitles <- c("averCosSim" = "",
                     "falseNeg" = "Number of reference signatures not extracted",
                     "falsePos" = "Number of signatures extracted, but different from reference signatures",
                     "truePos" = "Number of reference signatures extracted",
                     "TPR" = "#True Positives / (#True Positives + #False Negatives)",
                     "PPV" = "#True Positives / (#True Positives + #False Positives)")

      # ggplot2 boxplot + beeswarm plot
      ggplotList <- list()
      for(index in indexes){
        indexNum <- which(index == indexes)
        ggplotList[[index]] <- ggplot2::ggplot(
          data.frame(value = multiRun[[index]],
                     indexName = index),
          ggplot2::aes(x = .data$indexName, y = .data$value))
        ggplotList[[index]] <- ggplotList[[index]] +
          ggplot2::ggtitle(titles[indexNum],subtitle = subtitles[indexNum])
        ggplotList[[index]] <- ggplotList[[index]] +
          ggplot2::geom_boxplot() +
          ggbeeswarm::geom_quasirandom(groupOnX = TRUE, size = 0.3) +
          # Restrict the decimal numbers of values of indexes to be 2
          ggplot2::scale_y_continuous(labels =function(x) sprintf("%.2f", x))
      }
      for(gtSigName in gtSigNames){
        ggplotList[[gtSigName]] <- ggplot2::ggplot(
          data.frame(value = multiRun$cosSim[[gtSigName]],
                     gtSigName = gtSigName),
          ggplot2::aes(x = .data$gtSigName, y = .data$value))
        ggplotList[[gtSigName]] <- ggplotList[[gtSigName]] +
          ggplot2::ggtitle(label = paste0("Cosine similarity to signature ",gtSigName),
                           subtitle = paste0("Considers all extracted signatures resembling ", gtSigName))
        ggplotList[[gtSigName]] <- ggplotList[[gtSigName]] +
          ggplot2::geom_boxplot() +
          ggbeeswarm::geom_quasirandom(groupOnX = TRUE, size = 0.3) +
          # Restrict the decimal numbers of values of indexes to be 2
          ggplot2::scale_y_continuous(labels =function(x) sprintf("%.2f", x))
      }

      # Print high-resolution extraction indexes into one png file
      # Only include extraction index plots
      # in tempPlotList.
      if(FALSE){
        tempPlotList <- list()
        for(index in indexes){
          tempPlotList[[index]] <- ggplotList[[index]]
        }
        suppressMessages(
          ggplot2::ggsave(
            filename = paste0(resultPath,"/boxplot.extraction.png"),
            plot = ggpubr::ggarrange(plotlist = tempPlotList),
            device = "png",
            dpi = 1000,
            limitsize = FALSE
          )
        )
      }

      # Print extraction indexes into one pdf file
      grDevices::pdf(paste0(resultPath,"/boxplot.extraction.measures.pdf"), pointsize = 1)
      for (index in indexes) print(ggplotList[[index]])
      grDevices::dev.off()


      # Print high-resolution extraction indexes into one png file
      # Only include one-signature cosine similarity plots
      # in tempPlotList.
      if(FALSE){
        tempPlotList <- list()
        for(gtSigName in gtSigNames){
          tempPlotList[[gtSigName]] <- ggplotList[[gtSigName]]
        }
        suppressMessages(
          ggplot2::ggsave(
            filename = paste0(resultPath,"/boxplot.onesig.cossim.png"),
            plot = ggpubr::ggarrange(plotlist = tempPlotList),
            device = "png",
            dpi = 1000,
            limitsize = FALSE
          )
        )
      }

      # Print extraction indexes into one pdf file
      grDevices::pdf(paste0(resultPath,"/boxplot.onesig.cossim.pdf"), pointsize = 1)
      for (gtSigName in gtSigNames) print(ggplotList[[gtSigName]])
      grDevices::dev.off()
    }




    # Check if result summary files "exposureDiff.Rda" exists for all runs ----
    exposureFlag <- TRUE
    for(runName in run.names){
      runDir <- paste0(resultPath,"/",runName)
      summaryDir <- paste0(runDir,"/summary")
      exposureDiffFile <- paste0(summaryDir,"/exposureDiff.RDa")
      if(!file.exists(exposureDiffFile)){
        exposureFlag <- FALSE
        #message("Skip summarizing scaled Manhattan distance...\n")
        break
      }
    }



    # Summarize aggregated scaled Manhattan distance --------------------------
    if(exposureFlag){
      # Read scaled aggregated Manhattan distances in multiple runs
      AggManhattanDist <- matrix(nrow = length(gtSigNames), ncol = length(run.names))
      rownames(AggManhattanDist) <- gtSigNames
      colnames(AggManhattanDist) <- run.names
      for(runName in run.names){
        runDir <- paste0(resultPath,"/",runName)
        summaryDir <- paste0(runDir,"/summary")
        exposureDiffFile <- paste0(summaryDir,"/exposureDiff.RDa")
        # Add exposureDiff <- NULL to please the R check
        exposureDiff <- NULL
        load(file = exposureDiffFile)
        AggManhattanDist[gtSigNames,runName] <- exposureDiff$aggregated[gtSigNames,"Scaled.Aggregated.Manhattan.distance"]
      }
      multiRun$AggManhattanDist <- AggManhattanDist

      # Calculate mean and SD for aggregated  Manhattan distance
      meanSDAggMD <- matrix(nrow = length(gtSigNames), ncol = 2)
      rownames(meanSDAggMD) <- gtSigNames
      colnames(meanSDAggMD) <- c("mean","stdev")
      for(gtSigName in gtSigNames){
        meanSDAggMD[gtSigName,"mean"] <- mean(AggManhattanDist[gtSigName,])
        meanSDAggMD[gtSigName,"stdev"] <- stats::sd(AggManhattanDist[gtSigName,])
      }
      multiRun$meanSDAggMD <- meanSDAggMD

      # Calculate fivenums for exposure inference Scaled Manhattan distance
      multiRun$fivenumAggMD <- matrix(nrow = length(gtSigNames), ncol = 5)
      rownames(multiRun$fivenumAggMD) <- gtSigNames
      colnames(multiRun$fivenumAggMD) <- c("min","lower-hinge","median","upper-hinge","max")
      for(gtSigName in gtSigNames){
        multiRun$fivenumAggMD[gtSigName,] <- stats::fivenum(AggManhattanDist[gtSigName,])
      }
    }



    # Summarize Manhattan distance scaled for each tumor ----------------------
    # only if exposureFlag is TRUE.
    if(exposureFlag){

      if(TRUE){
        # Read scaled aggregated Manhattan distances in multiple runs
        meanSepMD <- matrix(nrow = length(gtSigNames), ncol = length(run.names))
        sdSepMD <- matrix(nrow = length(gtSigNames), ncol = length(run.names))
        rownames(meanSepMD) <- gtSigNames
        colnames(meanSepMD) <- run.names
        rownames(sdSepMD) <- gtSigNames
        colnames(sdSepMD) <- run.names

        for(runName in run.names){
          runDir <- paste0(resultPath,"/",runName)
          summaryDir <- paste0(runDir,"/summary")
          exposureDiffFile <- paste0(summaryDir,"/exposureDiff.RDa")
          # Add exposureDiff <- NULL to please the R check
          exposureDiff <- NULL
          load(file = exposureDiffFile)
          for(gtSigName in gtSigNames){
            meanSepMD[gtSigName,runName] <- mean(exposureDiff$separated[[gtSigName]][,"Scaled.Manhattan.distance"])
            sdSepMD[gtSigName,runName] <- stats::sd(exposureDiff$separated[[gtSigName]][,"Scaled.Manhattan.distance"])
          }
        }
        multiRun$meanSepMD <- meanSepMD
        multiRun$sdSepMD <- sdSepMD
      }

    }



    # Save data and results ---------------------------------------------------
    save(multiRun,file = paste0(resultPath,"/multiRun.RDa"))
    write.csv(x = multiRun$meanSD,
              file = paste0(resultPath,"/meanSD.csv"))
    write.csv(x = multiRun$fivenum,
              file = paste0(resultPath,"/fivenum.csv"))
    if(exposureFlag){
      write.csv(x = multiRun$AggManhattanDist,
                file = paste0(resultPath,"/Scaled.Aggregated.ManhattanDist.csv"))
      write.csv(x = multiRun$meanSDAggMD,
                file = paste0(resultPath,"/meanSD.Scaled.Aggregated.Manhattan.dist.csv"))
      write.csv(x = multiRun$fivenumAggMD,
                file = paste0(resultPath,"/fivenum.Scaled.Aggregated.Manhattan.dist.csv"))
      write.csv(x = multiRun$meanSepMD,
                file = paste0(resultPath,"/mean.of.sep.Scaled.Manhattan.dist.csv"))
      write.csv(x = multiRun$sdSepMD,
                file = paste0(resultPath,"/stdev.of.sep.Scaled.Manhattan.dist.csv"))
    }
    invisible(multiRun)
  }
WuyangFF95/SynSigEval documentation built on Sept. 18, 2022, 11:41 a.m.