# data-raw/deprecated.codes/SummarizeMultiToolsOneDataset.R

#' Combine results for a single dataset, from different computational approaches.
#'
#' Load the per-approach summary \code{multiRun.RDa} stored in
#' \code{third.level.dir}/\code{tool.dirnames}
#' (generated by \code{\link{SummarizeMultiRuns}}), and
#' combine the summaries of all approaches into \code{third.level.dir}.
#'
#' @param third.level.dir Third level path distinguishing de novo extraction
#' + attribution packages from attribution-only packages.
#' Examples:
#' \code{top.dir}/sp.sp/ExtrAttr/
#' \code{top.dir}/sa.sa/Attr/
#'
#' Here, \code{top.dir} refers to a top-level directory which contains the
#' full information of a synthetic dataset. (e.g. \code{syn.2.7a.7b.abst.v8})
#' This code depends on a conventional directory structure documented
#' elsewhere.
#'
#' @param toolNames Names of computational approaches (e.g. "SigProExtractor").
#' Must be non-empty and have the same length and order as
#' \code{tool.dirnames}.
#'
#' @param tool.dirnames Names of the directories under \code{third.level.dir},
#' one per entry of \code{toolNames}. Each directory must contain the file
#' \code{multiRun.RDa} holding an object named \code{multiRun}, as
#' generated by \code{\link{SummarizeMultiRuns}}.
#' Examples:
#' \code{"signeR.results"} (Under \code{third.level.dir} "ExtrAttr")
#' \code{"deconstructSigs.results"} (Under \code{third.level.dir} "Attr")
#'
#' @param datasetGroup Numeric or character value specifying the group
#' the dataset belongs to. Required; it is stored in column
#' \code{datasetGroup} of every combined data.frame.
#' E.g. For SBS1-SBS5 correlated datasets, we can consider slope as the group:
#' \code{"slope=0.1"}, \code{"slope=0.5"}, \code{"slope=1"}, ...
#'
#' @param datasetGroupName Meaning or label of \code{datasetGroup}.
#' E.g. For SBS1-SBS5 correlated datasets, we can consider
#' \code{"SBS1:SBS5 mutation count ratio"}
#' as the label of the \code{datasetGroup} slope.
#'
#' @param datasetSubGroup Optional. Numeric or character value differentiating
#' datasets within each group.
#' E.g. For SBS1-SBS5 correlated datasets, we can consider Pearson's R^2
#' as the subgroup: \code{"Rsq=0.1"}, \code{"Rsq=0.2"}, ...
#' Default: \code{NULL}, in which case the combined data.frames have no
#' \code{datasetSubGroup} column.
#'
#' @param datasetSubGroupName Optional. Meaning or label of \code{datasetSubGroup}.
#' E.g. For SBS1-SBS5 correlated datasets, we can consider \code{"Pearson's R squared"}
#' as the label of the \code{datasetSubGroup} Pearson's R^2.
#'
#' @return Invisibly, the list \code{multiTools}. It contains:
#' \itemize{
#' \item \code{averCosSim}, \code{falseNeg}, \code{falsePos}, \code{truePos},
#' \code{TPR}, \code{PPV}: one data.frame per measure with columns
#' \code{seed}, \code{value}, \code{toolName}, \code{datasetName},
#' \code{datasetGroup} (and \code{datasetSubGroup} when provided); rows of
#' all approaches are stacked.
#' \item \code{gtSigNames}: names of \code{multiRun$cosSim} of the last
#' approach processed.
#' \item \code{cosSim}: list with one data.frame (same columns as above) per
#' ground-truth signature.
#' \item \code{AggManhattanDist}, \code{meanSepMD}, \code{sdSepMD}: same
#' layout as \code{cosSim}; present only if the corresponding element
#' exists in at least one \code{multiRun}.
#' \item \code{combMeanSD}, \code{combMeanSDAggMD}: the \code{meanSD} /
#' \code{meanSDAggMD} tables of all approaches bound column-wise, with
#' column names prefixed by the approach directory name.
#' \item \code{datasetName}, \code{datasetGroupName}, \code{datasetSubGroupName}.
#' }
#'
#' Meaning of the measures:
#' Cosine similarity
#' True Positives(TP): Ground-truth signatures which are active in
#' the spectra, and extracted.
#' False Negatives(FN): Ground-truth signatures not extracted.
#' False Positives(FP): Signatures wrongly extracted, not resembling
#' any ground-truth signatures.
#' True positive rate (TPR, Sensitivity): TP / (TP + FN)
#' Positive predictive value (PPV, Precision): TP / (FP + TP)
#'
#' @details This function writes the following files under
#' \code{third.level.dir}: \code{multiTools.RDa},
#' \code{combined.meanSD.csv}, and, when the corresponding data exist,
#' \code{combined.meanSD.Aggregated.Manhattan.dist.csv},
#' \code{mean.of.sep.Scaled.Manhattan.dist.csv} and
#' \code{stdev.of.sep.Scaled.Manhattan.dist.csv}.
#'
#' An error is raised if the approaches were not run on the same dataset
#' (i.e. \code{multiRun$datasetName} differs between approaches).
#'
#' @importFrom utils write.csv capture.output sessionInfo
#'
#' @export
#'
SummarizeMultiToolsOneDataset <- function(
  third.level.dir,
  toolNames,
  tool.dirnames,
  datasetGroup,
  datasetGroupName,
  datasetSubGroup = NULL,
  datasetSubGroupName = NULL){

  if(length(toolNames) == 0){
    stop("toolNames must contain at least one computational approach.\n")
  }
  if(length(toolNames) != length(tool.dirnames)){
    stop("toolNames and tool.dirnames must have the same length.\n")
  }

  ## Build the long-format table of one measure for one approach.
  ## data.frame() silently drops NULL arguments, so the datasetSubGroup
  ## column is only present when datasetSubGroup was provided.
  makeRecords <- function(seed, value, toolName, datasetName){
    records <- data.frame(seed = seed,
                          value = value,
                          toolName = toolName,
                          datasetName = datasetName,
                          datasetGroup = datasetGroup,
                          datasetSubGroup = datasetSubGroup,
                          stringsAsFactors = FALSE)
    rownames(records) <- NULL
    records
  }

  ## Stack the records of the current approach below those
  ## already collected from previous approaches.
  appendRecords <- function(existing, records){
    if(is.null(existing)){
      existing <- data.frame()
    }
    rbind(existing, records)
  }

  multiTools <- list()
  combMeanSD <- NULL
  combMeanSDAggMD <- NULL
  ## Must be initialized outside the loop, otherwise the
  ## same-dataset check below can never be triggered.
  datasetName <- NULL

  ## Measures holding one value per run.
  indexes <- c("averCosSim","falseNeg","falsePos",
               "truePos","TPR","PPV")
  ## Measures stored as matrices indexed by
  ## [ground-truth signature, run].
  perSigMatrixMeasures <- c("AggManhattanDist","meanSepMD","sdSepMD")

  for(toolNumber in seq_along(toolNames)){
    toolName <- toolNames[toolNumber]
    toolDirName <- tool.dirnames[toolNumber]
    toolPath <- file.path(third.level.dir,toolDirName)

    ## Load into a private environment so that objects stored in the
    ## .RDa file cannot overwrite local variables of this function.
    loadEnv <- new.env()
    load(file.path(toolPath,"multiRun.RDa"), envir = loadEnv)
    multiRun <- loadEnv[["multiRun"]]
    if(is.null(multiRun)){
      stop("Object 'multiRun' not found in ",
           file.path(toolPath,"multiRun.RDa"), "\n")
    }

    if(!is.null(datasetName) &&
       !identical(as.character(datasetName),
                  as.character(multiRun[["datasetName"]]))){
      stop("Must provide results of different approaches on the SAME dataset.\n")
    }
    datasetName <- multiRun[["datasetName"]]

    ## Combine multi-runs and multi-tools for each measure
    for(index in indexes){
      records <- makeRecords(seed = names(multiRun[[index]]),
                             value = multiRun[[index]],
                             toolName = toolName,
                             datasetName = multiRun[["datasetName"]])
      multiTools[[index]] <- appendRecords(multiTools[[index]], records)
    }

    ## meanSD contains mean and standard deviation
    ## for each extraction measure.
    ## Exact matching ([[ ]]) is required here: multiRun$meanSD would
    ## partially match meanSDAggMD if meanSD were absent.
    meanSD <- multiRun[["meanSD"]]
    if(!is.null(meanSD)){
      colnames(meanSD) <- paste0(toolDirName,".", colnames(meanSD))
      ## cbind() ignores NULL, so this also covers the first approach.
      combMeanSD <- cbind(combMeanSD,meanSD)
    }

    ## Combine multi-runs and multi-tools for:
    ## $cosSim - cosine similarity to each ground-truth signature
    gtSigNames <- names(multiRun[["cosSim"]])
    multiTools$gtSigNames <- gtSigNames
    if(is.null(multiTools[["cosSim"]])) multiTools[["cosSim"]] <- list()
    for(gtSigName in gtSigNames){
      sigValues <- multiRun[["cosSim"]][[gtSigName]]
      records <- makeRecords(seed = names(sigValues),
                             value = sigValues,
                             toolName = toolName,
                             datasetName = multiRun[["datasetName"]])
      multiTools[["cosSim"]][[gtSigName]] <-
        appendRecords(multiTools[["cosSim"]][[gtSigName]], records)
    }

    ## Combine multi-runs and multi-tools for
    ## - aggregated scaled Manhattan distance (AggManhattanDist),
    ## - mean of scaled Manhattan distance for each tumor (meanSepMD),
    ## - standard deviation of scaled Manhattan distance
    ##   for each tumor (sdSepMD).
    ## A measure absent from multiRun is skipped.
    for(measure in perSigMatrixMeasures){
      measureMat <- multiRun[[measure]]
      if(is.null(measureMat)) next
      if(is.null(multiTools[[measure]])) multiTools[[measure]] <- list()
      for(gtSigName in gtSigNames){
        records <- makeRecords(seed = colnames(measureMat),
                               value = measureMat[gtSigName,],
                               toolName = toolName,
                               datasetName = multiRun[["datasetName"]])
        multiTools[[measure]][[gtSigName]] <-
          appendRecords(multiTools[[measure]][[gtSigName]], records)
      }
    }

    ## meanSDAggMD contains mean and standard deviation
    ## for aggregated Scaled Manhattan distance between ground-truth exposures
    ## and inferred exposures for each ground-truth signature.
    ## Only considered when AggManhattanDist is available.
    if(!is.null(multiRun[["AggManhattanDist"]])){
      meanSDAggMD <- multiRun[["meanSDAggMD"]]
      if(!is.null(meanSDAggMD)){
        colnames(meanSDAggMD) <- paste0(toolDirName,".", colnames(meanSDAggMD))
        combMeanSDAggMD <- cbind(combMeanSDAggMD,meanSDAggMD)
      }
    }
  }

  ## Assigning NULL leaves the list unchanged, so the optional
  ## elements only appear when data were actually collected.
  multiTools$combMeanSD <- combMeanSD
  multiTools$combMeanSDAggMD <- combMeanSDAggMD
  multiTools$datasetName <- datasetName
  multiTools$datasetGroupName <- datasetGroupName
  multiTools$datasetSubGroupName <- datasetSubGroupName

  save(multiTools,file = file.path(third.level.dir,"multiTools.RDa"))
  write.csv(x = multiTools$combMeanSD,
            file = file.path(third.level.dir,"combined.meanSD.csv"))
  if(!is.null(multiTools$combMeanSDAggMD)){
    write.csv(x = multiTools$combMeanSDAggMD,
              file = file.path(third.level.dir,"combined.meanSD.Aggregated.Manhattan.dist.csv"))
  }
  if(!is.null(multiTools$meanSepMD)){
    write.csv(x = multiTools$meanSepMD,
              file = file.path(third.level.dir,"mean.of.sep.Scaled.Manhattan.dist.csv"))
  }
  if(!is.null(multiTools$sdSepMD)){
    write.csv(x = multiTools$sdSepMD,
              file = file.path(third.level.dir,"stdev.of.sep.Scaled.Manhattan.dist.csv"))
  }
  invisible(multiTools)
}
# WuyangFF95/SynSigEval documentation built on Sept. 18, 2022, 11:41 a.m.