R/SignatureAnalyzerInteraction.R

Defines functions SignatureAnalyzer4MatchedCatalogs SAMultiRunOneCatalog SignatureAnalyzerOneRun RunSignatureAnalyzerOnFile SourceSignatureAnalyzerCode FixSASigNames

Documented in FixSASigNames RunSignatureAnalyzerOnFile SAMultiRunOneCatalog SignatureAnalyzer4MatchedCatalogs SignatureAnalyzerOneRun SourceSignatureAnalyzerCode

# SignatureAnalyzerInteraction.R


#' Standardize SignatureAnalyzer signature names
#'
#' For example, change \code{BI_COMPOSITE_SNV_SBS83_P}
#' to \code{BI_COMPOSITE_SBS83_P}
#'
#' This is necessary because
#' for COMPOSITE signatures we rbind coordinated
#' "SNV", "DNP", and "INDEL" signatures.
#'
#' @param sig.names Vector of signature names
#'
#' @return Vector of signatures names with "_SNV" removed.
#'
#' @export

FixSASigNames <- function(sig.names) {
  return(gsub("_SNV_", "_", sig.names, fixed = TRUE))
}



#' Source SignatureAnalyzer Codes.
#'
#' @param signatureanalyzer.code.dir The directory which stores
#' SignatureAnalyzer program files. It must include a folder
#' named \code{INPUT_SignatureAnalyzer} and a R script named
#' \code{SignatureAnalyer.PCAWG.function.R}

SourceSignatureAnalyzerCode <-
  function(signatureanalyzer.code.dir) {
    if (!dir.exists(signatureanalyzer.code.dir)) {
      stop("SignatureAnalyzer code directory, ",
           signatureanalyzer.code.dir,
           " does not exist; getwd() == ", getwd())
    }
    #here <- getwd()
    #setwd(signatureanalyzer.code.dir)

    folderFullPath <- paste0(signatureanalyzer.code.dir,"/INPUT_SignatureAnalyzer/")
    assign("INPUT",
           folderFullPath,
           envir = envSA)
    if(!dir.exists(folderFullPath))
      dir.create(folderFullPath, recursive = T)


    suppressWarnings(
      suppressPackageStartupMessages(
        # Store the SA functions into envSA.
        sys.source(
          paste0(signatureanalyzer.code.dir,"/SignatureAnalyzer.PCAWG.function.R"),
          envir = envSA)
      )
    )
    #setwd(here) # This is necessary because the caller
    # as specified input and output locations
    # relative to here.
  }

#' Run SignatureAnalyzer on a file containing a catalog AFTER the
#' SignatureAnalyzer code has been source'ed.
#'
#' Normally, please call \code{\link{SignatureAnalyzerOneRun}}
#' instead of this function.
#'
#' @param input.catalog File containing input catalog.  Columns are
#' samples (tumors), rows are signatures.  SignatureAnalyzer does
#' not care about the row names (I think) TODO(Steve): check this.
#'
#' @param out.dir Directory that will be created for the output;
#' abort if it already exits.  Log files will be in
#' \code{paste0(out.dir, "/tmp")}.
#'
#' @param input.exposures A file with the synthetic exposures used to generate
#' \code{input.catalog}; if provided here,
#' this is copied over to the output directory
#' for downstream analysis.
#'
#' @param maxK The maximum number of signatures to consider
#' extracting.
#'
#' @param tol Controls when SignatureAnalyzer will terminate
#' its search; \code{tol} was 1.e-05 for the PCAWG7 analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#'  files generated by SignatureAnalyzer.
#'
#' @param overwrite If TRUE, overwrite existing output
#'
#' @return A list with the following elements:
#' \enumerate{
#'  \item \code{signatures.W} The raw signature matrix, *including*
#'        columns of all zeros.
#'  \item \code{exposures.H} The raw exposure matrix, *excluding*
#'        rows of all zeros. The matrix
#'        product of the non-zero columns of \code{signatures.w}
#'        and \code{exposures.H} approximates the input spectrum
#'        matrix.
#'  \item \code{likelihood} The likelihood as returned by
#'  SignatureAnalyzer.
#'  \item \code{evidence} -1 * the posterior probability
#'  as returned by SignatureAnalyzer.
#'  \item \code{relevance} One for each column of the \code{signatures.W},
#'  as returned by SignatureAnalyzer.
#'  \item \code{error} A measure of reconstruction error (?) as
#'  returned by SignatureAnalyzer
#'  \item \code{normalized.sigs} The non-0 columns of \code{signatures.W}
#'  normalized so that each column sum is 1.
#' }
#'
#' @details Creates several files in \code{out.dir}:
#' \enumerate{
#' \item \code{sa.output.sigs.csv} Normalized signatures (no all-0 signatures,
#' column sums all 0)
#' \item \code{sa.output.raw.exp.csv} Raw exposures (attributions)
#' \item \code{sa.output.exp.csv} Same as \code{sa.output.raw.exp.csv}
#' \item \code{sa.output.other.data.csv}, contains a summary of important
#' information, including the number of signatures extracted.
#' \item \code{input.syn.exp.csv} Optional, a copy of \code{input.exposures},
#' if it was provided.
#' }
#'
#' @export
#'
#' @importFrom utils capture.output

RunSignatureAnalyzerOnFile <-
  function(input.catalog,
           out.dir,
           input.exposures = NULL,
           maxK = 30,
           tol = 1e-7,
           test.only = FALSE,
           delete.tmp.files = TRUE,
           overwrite = FALSE) {

    syn.data <- ICAMS::ReadCatalog(input.catalog,
                                   strict = FALSE)

    if (test.only) syn.data <- syn.data[ , 1:10]

    if (dir.exists(out.dir)) {
      if (!overwrite) stop(out.dir, " already exits")
    } else {
      dir.create(out.dir, recursive = TRUE)
    }
    # TEMPORARY is a global required by SignatureAnalyzer
    assign("TEMPORARY",paste0(out.dir, "/tmp/"),envir = envSA)
    if (dir.exists(envSA$TEMPORARY)) {
      if (!overwrite) stop("Directory ", envSA$TEMPORARY, " already exists")
    } else {
      dir.create(envSA$TEMPORARY, recursive = TRUE)
    }

    suppressWarnings(
      # The suppressed warnings relate to some deprecated
      # plotting options used by SignatureAnalyzer.
      capture.output(
        # BayesNMF.L1W.L2H is defined by the statement
        # source("SignatureAnalyzer.PCAWG.function.R") above.
        # BayesNMF.L1W.L2H will output:
        # [[1]]: extracted signatures
        # [[2]]: RAW-inferred exposures (not finalized)
        # [[3]]: likelihood -
        # [[4]]: evidence -
        # [[5]]: relevance -
        # [[6]]: error -
        out.data <-
          envSA$BayesNMF.L1W.L2H(syn.data, 200000, 10, 5, tol, maxK, maxK, 1),
        file = paste0(envSA$TEMPORARY, "captured.output.txt")))

    # out.data[[1]] has raw extracted signatures; the sum of each signature != 1
    sigs <- out.data[[1]]
    sigs.to.use <- which(colSums(sigs) > 1 )
    stopifnot(length(sigs.to.use) > 0)
    sigs <- sigs[ , sigs.to.use, drop = FALSE]
    # Normalize the mutational signatures so that the sum of all channels equal
    # to 1.
    sigs <- apply(sigs, 2, function(x) x/sum(x))
    # Change the names of extracted signatures to W1, W2,...
    new.names <- paste0("W.", 1:ncol(sigs))
    colnames(sigs) <- new.names

    # exp.raw is not the exposure SignatureAnalyzer is supposed to eventually
    # output. A separate function to be called on the output from the current
    # function will compuate fine-tuned inferred exposures.
    exp.raw <- out.data[[2]]
    exp.raw <- exp.raw[sigs.to.use, , drop = FALSE]
    rownames(exp.raw) <- new.names
    out.data[[2]] <- exp.raw

    out.data[[7]] <- sigs # Normalized sigs

    names(out.data) <- c("signatures.W", "exposures.H",
                         "likelihood", "evidence",
                         "relevance", "error",
                         "normalized.sigs")

    # Convert normalized signatures to ICAMS-format.
    sigs <- ICAMS::as.catalog(sigs, catalog.type = "counts.signature")

    # Output ICAMS-formatted signature catalog.
    ICAMS::WriteCatalog(sigs,
                        paste0(out.dir, "/sa.output.sigs.csv"))

    # Output raw attributions / exposures
    SynSigGen::WriteExposure(exp.raw,
                  file = paste0(out.dir, "/sa.output.raw.exp.csv"))

    # Output exposure attribution in mutation counts
    exp.normalized <- exp.raw
    for(ii in 1:ncol(exp.normalized)){
      exp.normalized[,ii] <- exp.normalized[,ii] / sum(exp.normalized[,ii])
      exp.normalized[,ii] <- exp.normalized[,ii] * sum(syn.data[,ii])
    }

    SynSigGen::WriteExposure(exp.normalized,
                  file = paste0(out.dir, "/sa.output.exp.csv"))

    if (!is.null(input.exposures)) {
      SynSigGen::WriteExposure(
        SynSigGen::ReadExposure(input.exposures),
        file = paste0(out.dir, "/input.syn.exp.csv"))
    }

    other.data <- paste0(out.dir, "/sa.output.other.data.csv")

    cat("num.sigs,", ncol(sigs), "\n", sep = "", file = other.data)
    cat("likelihood,", out.data$likelihood, "\n",
        sep = "", file = other.data, append = TRUE)
    cat("evidence,", out.data$evidence, "\n",
        sep = "", file = other.data, append = TRUE)
    cat("relevance,",
        paste(out.data$relevance, collapse = ","), "\n",
        sep = "", file = other.data, append = TRUE)
    cat("error,", out.data$error, "\n",
        sep = "", file = other.data, append = TRUE)

    if (delete.tmp.files) unlink(envSA$TEMPORARY, recursive = TRUE)

    invisible(out.data)
  }


#' Source SignatureAnalyzer and run it once on a single data set
#' and put results in specified location.
#'
#' @param signatureanalyzer.code.dir The directory holding the
#' SignatureAnalyzer code.
#'
#' @param seedNumber Specify the pseudo-random seed number
#' used to run SignatureAnalyzer. Setting seed can make the
#' attribution of SignatureAnalyzer repeatable.
#' If NULL, this function will not specify seed number.
#' Default: NULL.
#'
#' @param verbose If \code{TRUE}, then print various messages.
#'
#' @inheritParams RunSignatureAnalyzerOnFile
#'
#' @return A list with the following elements:
#' \enumerate{
#'  \item \code{signatures.W} The raw signature matrix, *including*
#'        columns of all zeros.
#'  \item \code{exposures.H} The raw exposure matrix, *excluding*
#'        rows of all zeros. The matrix
#'        product of the non-zero columns of \code{signatures.w}
#'        and \code{exposures.H} approximates the input spectrum
#'        matrix.
#'  \item \code{likelihood} The likelihood as returned by
#'  SignatureAnalyzer.
#'  \item \code{evidence} -1 * the posterior probability
#'  as returned by SignatureAnalyzer.
#'  \item \code{relevance} One for each column of the \code{signatures.W},
#'  as returned by SignatureAnalyzer.
#'  \item \code{error} A measure of reconstruction error (?) as
#'  returned by SignatureAnalyzer
#'  \item \code{normalized.sigs} The non-0 columns of \code{signatures.W}
#'  normalized so that each column sum is 1.
#' }
#'
#' @details Creates several files in \code{out.dir}:
#' \enumerate{
#' \item \code{sa.output.sigs.csv} Normalized signatures (no all-0 signatures,
#' column sums all 0)
#' \item \code{sa.output.raw.exp.csv} Raw exposures (attributions)
#' \item \code{sa.output.exp.csv} Same as \code{sa.output.raw.exp.csv}
#' \item \code{sa.output.other.data.csv}, contains a summary of important
#' information, including the number of signatures extracted.
#' \item \code{input.syn.exp.csv} Optional, a copy of \code{input.exposures},
#' if it was provided.
#' }
#'
SignatureAnalyzerOneRun <-
  function(signatureanalyzer.code.dir,
           input.catalog,
           out.dir,
           seedNumber = NULL,
           input.exposures = NULL,
           maxK = 30,
           tol = 1e-7,
           test.only = FALSE,
           delete.tmp.files = TRUE,
           verbose = 0,
           overwrite = FALSE)
{
  # Specify seeds, if given
  if(!is.null(seedNumber)){
    set.seed(seedNumber)
    seedInUse <- .Random.seed  # Save the seed used so that we can restore the pseudorandom series
    RNGInUse <- RNGkind() # Save the random number generator (RNG) used
  }

  options(warn = 0)
  SourceSignatureAnalyzerCode(signatureanalyzer.code.dir)

  if (verbose)
    cat("Running SignatureAnalyzerOneRun in", out.dir, "\n")
  retval <-
    RunSignatureAnalyzerOnFile(
      input.catalog = input.catalog,
      out.dir = out.dir,
      input.exposures = input.exposures,
      maxK = maxK,
      tol = tol,
      test.only = test.only,
      delete.tmp.files = delete.tmp.files,
      overwrite = overwrite)

  invisible(retval)
}

#' Run SignatureAnalyzer many times on one catalog and put results
#' in specified location.
#'
#' @param num.runs The number of times run SignatureAnalyzer on each
#' catalog (matrix of mutational spectra).
#'
#' @param signatureanalyzer.code.dir The directory holding the
#' SignatureAnalyzer code.
#'
#' @param input.catalog The catalog to analyze.
#'
#' @param out.dir Root of directory tree that will contain the
#' results.
#'
#' @param maxK The maximum number of signatures to consider
#' extracting.
#'
#' @param tol Controls when SignatureAnalyzer will terminate
#' its search; \code{tol} was 1.e-05 for the PCAWG7 analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#'  files generated by SignatureAnalyzer.
#'
#' @param overwrite If TRUE overwrite previous results in same directory tree.
#'
#' @param mc.cores Number of cores to use for each SignatureAnalyzer run.
#' \code{mclapply}; ignored on Windows.
#' The total cores used simultaneously = \code{num.runs} * \code{mc.cores}.
#'
#' @param verbose If TRUE cat a message regarding progress.
#'
#' @param seed If not \code{NULL} call
#' \code{RNGkind(kind = "L'Ecuyer-CMRG"); set.seed(seed)}.
#'
#' @importFrom parallel mclapply
#'
#' @export
SAMultiRunOneCatalog <-
  function(num.runs,
           signatureanalyzer.code.dir,
           input.catalog,
           out.dir,
           maxK = 30,
           tol = 1e-7,
           test.only = FALSE,
           delete.tmp.files = TRUE,
           overwrite = FALSE,
           mc.cores = 1,
           verbose = FALSE,
           seed = NULL) {

    if (!dir.exists(out.dir)) {
     if (!dir.create(out.dir, recursive = TRUE)) {
       stop("Failed to create ", out.dir,
            "when getwd() == ", getwd())
     }
    }
    else warning(out.dir, "exists, overwriting")

    if (!file.exists(input.catalog)) {
      stop("Input catalog ", input.catalog,
           " does not exist when getwd() = ", getwd())
    }

    if (!is.null(seed)) {
      RNGkind(kind = "L'Ecuyer-CMRG")
      set.seed(seed)
    }

    RunOneIndex <- function(i) {
      out.dir2 <- paste0(out.dir, "/sa.run.", i)
      signature.analyzer.output <-
        SignatureAnalyzerOneRun(
          signatureanalyzer.code.dir = signatureanalyzer.code.dir,
          input.catalog = input.catalog,
          out.dir = out.dir2,
          maxK = maxK,
          tol = tol,
          test.only = test.only,
          delete.tmp.files = delete.tmp.files,
          overwrite = overwrite)
      attr(signature.analyzer.output, "message") <-
        paste0("Success for ", out.dir2)
      return(signature.analyzer.output)
    }

    mc.cores.to.use <-
      ifelse(Sys.info()["sysname"] == "Windows", 1, mc.cores)

    if (verbose) {
      cat("Using", mc.cores.to.use, " cores\n")
    }

    mc.output <-
      mclapply(1:num.runs, FUN = RunOneIndex, mc.cores = mc.cores.to.use)

    capture.output(
      print(mc.output),
      file = paste0(out.dir, "/verbose.txt"))

    attr(mc.output, "wd") <- getwd()
    attr(mc.output, "out.dir") <- out.dir
    attr(mc.output, "mc.cores") <- mc.cores.to.use
    return(mc.output)
  }

#' Run SignatureAnalyzer on 4 coordinated data sets and put results
#' in specified location.
#'
#' @details The 4 coordinated data sets are
#'
#' \enumerate{
#' \item \code{sa.sa.96}
#'
#' \item \code{sp.sp}
#'
#' \item \code{sa.sa.COMPOSITE}
#'
#' \item \code{sp.sa.COMPOSITE}
#'
#' }
#' which are described elsewhere.
#'
#'
#' @param num.runs Number of SignatureAnalyzer runs per data set.
#'
#' @param signatureanalyzer.code.dir The directory holding the
#' SignatureAnalyzer code.
#'
#' @param dir.root Root of directory tree that contains the
#' input data and to which the results will be written.
#'
#' @param maxK The maximum number of signatures to consider
#' extracting.
#'
#' @param tol Controls when SignatureAnalyzer will terminate
#' its search; \code{tol} was 1.e-05 for the PCAWG7 analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#'  files generated by SignatureAnalyzer.
#'
#' @param slice Vector of integers from 1:4. Only run on the
#' corresponding data set (see Details).
#'
#' @param overwrite if TRUE overwrite preexisting results.
#'
#' @param mc.cores The number of cores to use with \code{mclapply};
#' automatically overridden to 1 on Windows.
#'
#' @export
#'
#' @importFrom ICAMS WriteCatalog ReadCatalog

SignatureAnalyzer4MatchedCatalogs <-
  function(
    num.runs = 20,
    signatureanalyzer.code.dir,
    dir.root,
    maxK = 30,
    tol = 1e-7,
    test.only = FALSE,
    delete.tmp.files = TRUE,
    slice = 1:4,
    overwrite = FALSE,
    mc.cores = 1) {

    if (!dir.exists(dir.root)) stop(dir.root, "does not exist")

    subdirs <- c("sa.sa.96", "sp.sp", "sa.sa.COMPOSITE", "sp.sa.COMPOSITE")

    tmp.fn <-
      #function(subdir, read.fn, write.fn)
      function(subdir) {
      retval1 <-
        SAMultiRunOneCatalog(
          num.runs = num.runs,
          signatureanalyzer.code.dir = signatureanalyzer.code.dir,
          input.catalog = paste0(dir.root, "/", subdir, "/ground.truth.syn.catalog.csv"),
          out.dir = paste0(dir.root, "/", subdir, "/sa.results/"),
          maxK = maxK,
          tol = tol,
          test.only = test.only,
          delete.tmp.files = delete.tmp.files,
          overwrite = overwrite,
          mc.cores = mc.cores)
      return(retval1)
    }

  retval2 <-
    mapply(tmp.fn, subdirs[slice])

  invisible(retval2)
  }
WuyangFF95/SynSigRun documentation built on Oct. 7, 2022, 1:16 p.m.