# SignatureAnalyzerInteraction.R
#' Standardize SignatureAnalyzer signature names
#'
#' For example, change \code{BI_COMPOSITE_SNV_SBS83_P}
#' to \code{BI_COMPOSITE_SBS83_P}
#'
#' This is necessary because
#' for COMPOSITE signatures we rbind coordinated
#' "SNV", "DNP", and "INDEL" signatures.
#'
#' @param sig.names Vector of signature names
#'
#' @return Vector of signatures names with "_SNV" removed.
#'
#' @export
FixSASigNames <- function(sig.names) {
return(gsub("_SNV_", "_", sig.names, fixed = TRUE))
}
#' Source SignatureAnalyzer Codes.
#'
#' @param signatureanalyzer.code.dir The directory which stores
#' SignatureAnalyzer program files. It must include a folder
#' named \code{INPUT_SignatureAnalyzer} and a R script named
#' \code{SignatureAnalyer.PCAWG.function.R}
SourceSignatureAnalyzerCode <-
function(signatureanalyzer.code.dir) {
if (!dir.exists(signatureanalyzer.code.dir)) {
stop("SignatureAnalyzer code directory, ",
signatureanalyzer.code.dir,
" does not exist; getwd() == ", getwd())
}
#here <- getwd()
#setwd(signatureanalyzer.code.dir)
folderFullPath <- paste0(signatureanalyzer.code.dir,"/INPUT_SignatureAnalyzer/")
assign("INPUT",
folderFullPath,
envir = envSA)
if(!dir.exists(folderFullPath))
dir.create(folderFullPath, recursive = T)
suppressWarnings(
suppressPackageStartupMessages(
# Store the SA functions into envSA.
sys.source(
paste0(signatureanalyzer.code.dir,"/SignatureAnalyzer.PCAWG.function.R"),
envir = envSA)
)
)
#setwd(here) # This is necessary because the caller
# as specified input and output locations
# relative to here.
}
#' Run SignatureAnalyzer on a file containing a catalog AFTER the
#' SignatureAnalyzer code has been source'ed.
#'
#' Normally, please call \code{\link{SignatureAnalyzerOneRun}}
#' instead of this function.
#'
#' @param input.catalog File containing input catalog. Columns are
#' samples (tumors), rows are signatures. SignatureAnalyzer does
#' not care about the row names (I think) TODO(Steve): check this.
#'
#' @param out.dir Directory that will be created for the output;
#' abort if it already exits. Log files will be in
#' \code{paste0(out.dir, "/tmp")}.
#'
#' @param input.exposures A file with the synthetic exposures used to generate
#' \code{input.catalog}; if provided here,
#' this is copied over to the output directory
#' for downstream analysis.
#'
#' @param maxK The maximum number of signatures to consider
#' extracting.
#'
#' @param tol Controls when SignatureAnalyzer will terminate
#' its search; \code{tol} was 1.e-05 for the PCAWG7 analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#' files generated by SignatureAnalyzer.
#'
#' @param overwrite If TRUE, overwrite existing output
#'
#' @return A list with the following elements:
#' \enumerate{
#' \item \code{signatures.W} The raw signature matrix, *including*
#' columns of all zeros.
#' \item \code{exposures.H} The raw exposure matrix, *excluding*
#' rows of all zeros. The matrix
#' product of the non-zero columns of \code{signatures.w}
#' and \code{exposures.H} approximates the input spectrum
#' matrix.
#' \item \code{likelihood} The likelihood as returned by
#' SignatureAnalyzer.
#' \item \code{evidence} -1 * the posterior probability
#' as returned by SignatureAnalyzer.
#' \item \code{relevance} One for each column of the \code{signatures.W},
#' as returned by SignatureAnalyzer.
#' \item \code{error} A measure of reconstruction error (?) as
#' returned by SignatureAnalyzer
#' \item \code{normalized.sigs} The non-0 columns of \code{signatures.W}
#' normalized so that each column sum is 1.
#' }
#'
#' @details Creates several files in \code{out.dir}:
#' \enumerate{
#' \item \code{sa.output.sigs.csv} Normalized signatures (no all-0 signatures,
#' column sums all 0)
#' \item \code{sa.output.raw.exp.csv} Raw exposures (attributions)
#' \item \code{sa.output.exp.csv} Same as \code{sa.output.raw.exp.csv}
#' \item \code{sa.output.other.data.csv}, contains a summary of important
#' information, including the number of signatures extracted.
#' \item \code{input.syn.exp.csv} Optional, a copy of \code{input.exposures},
#' if it was provided.
#' }
#'
#' @export
#'
#' @importFrom utils capture.output
RunSignatureAnalyzerOnFile <-
function(input.catalog,
out.dir,
input.exposures = NULL,
maxK = 30,
tol = 1e-7,
test.only = FALSE,
delete.tmp.files = TRUE,
overwrite = FALSE) {
syn.data <- ICAMS::ReadCatalog(input.catalog,
strict = FALSE)
if (test.only) syn.data <- syn.data[ , 1:10]
if (dir.exists(out.dir)) {
if (!overwrite) stop(out.dir, " already exits")
} else {
dir.create(out.dir, recursive = TRUE)
}
# TEMPORARY is a global required by SignatureAnalyzer
assign("TEMPORARY",paste0(out.dir, "/tmp/"),envir = envSA)
if (dir.exists(envSA$TEMPORARY)) {
if (!overwrite) stop("Directory ", envSA$TEMPORARY, " already exists")
} else {
dir.create(envSA$TEMPORARY, recursive = TRUE)
}
suppressWarnings(
# The suppressed warnings relate to some deprecated
# plotting options used by SignatureAnalyzer.
capture.output(
# BayesNMF.L1W.L2H is defined by the statement
# source("SignatureAnalyzer.PCAWG.function.R") above.
# BayesNMF.L1W.L2H will output:
# [[1]]: extracted signatures
# [[2]]: RAW-inferred exposures (not finalized)
# [[3]]: likelihood -
# [[4]]: evidence -
# [[5]]: relevance -
# [[6]]: error -
out.data <-
envSA$BayesNMF.L1W.L2H(syn.data, 200000, 10, 5, tol, maxK, maxK, 1),
file = paste0(envSA$TEMPORARY, "captured.output.txt")))
# out.data[[1]] has raw extracted signatures; the sum of each signature != 1
sigs <- out.data[[1]]
sigs.to.use <- which(colSums(sigs) > 1 )
stopifnot(length(sigs.to.use) > 0)
sigs <- sigs[ , sigs.to.use, drop = FALSE]
# Normalize the mutational signatures so that the sum of all channels equal
# to 1.
sigs <- apply(sigs, 2, function(x) x/sum(x))
# Change the names of extracted signatures to W1, W2,...
new.names <- paste0("W.", 1:ncol(sigs))
colnames(sigs) <- new.names
# exp.raw is not the exposure SignatureAnalyzer is supposed to eventually
# output. A separate function to be called on the output from the current
# function will compuate fine-tuned inferred exposures.
exp.raw <- out.data[[2]]
exp.raw <- exp.raw[sigs.to.use, , drop = FALSE]
rownames(exp.raw) <- new.names
out.data[[2]] <- exp.raw
out.data[[7]] <- sigs # Normalized sigs
names(out.data) <- c("signatures.W", "exposures.H",
"likelihood", "evidence",
"relevance", "error",
"normalized.sigs")
# Convert normalized signatures to ICAMS-format.
sigs <- ICAMS::as.catalog(sigs, catalog.type = "counts.signature")
# Output ICAMS-formatted signature catalog.
ICAMS::WriteCatalog(sigs,
paste0(out.dir, "/sa.output.sigs.csv"))
# Output raw attributions / exposures
SynSigGen::WriteExposure(exp.raw,
file = paste0(out.dir, "/sa.output.raw.exp.csv"))
# Output exposure attribution in mutation counts
exp.normalized <- exp.raw
for(ii in 1:ncol(exp.normalized)){
exp.normalized[,ii] <- exp.normalized[,ii] / sum(exp.normalized[,ii])
exp.normalized[,ii] <- exp.normalized[,ii] * sum(syn.data[,ii])
}
SynSigGen::WriteExposure(exp.normalized,
file = paste0(out.dir, "/sa.output.exp.csv"))
if (!is.null(input.exposures)) {
SynSigGen::WriteExposure(
SynSigGen::ReadExposure(input.exposures),
file = paste0(out.dir, "/input.syn.exp.csv"))
}
other.data <- paste0(out.dir, "/sa.output.other.data.csv")
cat("num.sigs,", ncol(sigs), "\n", sep = "", file = other.data)
cat("likelihood,", out.data$likelihood, "\n",
sep = "", file = other.data, append = TRUE)
cat("evidence,", out.data$evidence, "\n",
sep = "", file = other.data, append = TRUE)
cat("relevance,",
paste(out.data$relevance, collapse = ","), "\n",
sep = "", file = other.data, append = TRUE)
cat("error,", out.data$error, "\n",
sep = "", file = other.data, append = TRUE)
if (delete.tmp.files) unlink(envSA$TEMPORARY, recursive = TRUE)
invisible(out.data)
}
#' Source SignatureAnalyzer and run it once on a single data set
#' and put results in specified location.
#'
#' @param signatureanalyzer.code.dir The directory holding the
#' SignatureAnalyzer code.
#'
#' @param seedNumber Specify the pseudo-random seed number
#' used to run SignatureAnalyzer. Setting seed can make the
#' attribution of SignatureAnalyzer repeatable.
#' If NULL, this function will not specify seed number.
#' Default: NULL.
#'
#' @param verbose If \code{TRUE}, then print various messages.
#'
#' @inheritParams RunSignatureAnalyzerOnFile
#'
#' @return A list with the following elements:
#' \enumerate{
#' \item \code{signatures.W} The raw signature matrix, *including*
#' columns of all zeros.
#' \item \code{exposures.H} The raw exposure matrix, *excluding*
#' rows of all zeros. The matrix
#' product of the non-zero columns of \code{signatures.w}
#' and \code{exposures.H} approximates the input spectrum
#' matrix.
#' \item \code{likelihood} The likelihood as returned by
#' SignatureAnalyzer.
#' \item \code{evidence} -1 * the posterior probability
#' as returned by SignatureAnalyzer.
#' \item \code{relevance} One for each column of the \code{signatures.W},
#' as returned by SignatureAnalyzer.
#' \item \code{error} A measure of reconstruction error (?) as
#' returned by SignatureAnalyzer
#' \item \code{normalized.sigs} The non-0 columns of \code{signatures.W}
#' normalized so that each column sum is 1.
#' }
#'
#' @details Creates several files in \code{out.dir}:
#' \enumerate{
#' \item \code{sa.output.sigs.csv} Normalized signatures (no all-0 signatures,
#' column sums all 0)
#' \item \code{sa.output.raw.exp.csv} Raw exposures (attributions)
#' \item \code{sa.output.exp.csv} Same as \code{sa.output.raw.exp.csv}
#' \item \code{sa.output.other.data.csv}, contains a summary of important
#' information, including the number of signatures extracted.
#' \item \code{input.syn.exp.csv} Optional, a copy of \code{input.exposures},
#' if it was provided.
#' }
#'
SignatureAnalyzerOneRun <-
function(signatureanalyzer.code.dir,
input.catalog,
out.dir,
seedNumber = NULL,
input.exposures = NULL,
maxK = 30,
tol = 1e-7,
test.only = FALSE,
delete.tmp.files = TRUE,
verbose = 0,
overwrite = FALSE)
{
# Specify seeds, if given
if(!is.null(seedNumber)){
set.seed(seedNumber)
seedInUse <- .Random.seed # Save the seed used so that we can restore the pseudorandom series
RNGInUse <- RNGkind() # Save the random number generator (RNG) used
}
options(warn = 0)
SourceSignatureAnalyzerCode(signatureanalyzer.code.dir)
if (verbose)
cat("Running SignatureAnalyzerOneRun in", out.dir, "\n")
retval <-
RunSignatureAnalyzerOnFile(
input.catalog = input.catalog,
out.dir = out.dir,
input.exposures = input.exposures,
maxK = maxK,
tol = tol,
test.only = test.only,
delete.tmp.files = delete.tmp.files,
overwrite = overwrite)
invisible(retval)
}
#' Run SignatureAnalyzer many times on one catalog and put results
#' in specified location.
#'
#' @param num.runs The number of times run SignatureAnalyzer on each
#' catalog (matrix of mutational spectra).
#'
#' @param signatureanalyzer.code.dir The directory holding the
#' SignatureAnalyzer code.
#'
#' @param input.catalog The catalog to analyze.
#'
#' @param out.dir Root of directory tree that will contain the
#' results.
#'
#' @param maxK The maximum number of signatures to consider
#' extracting.
#'
#' @param tol Controls when SignatureAnalyzer will terminate
#' its search; \code{tol} was 1.e-05 for the PCAWG7 analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#' files generated by SignatureAnalyzer.
#'
#' @param overwrite If TRUE overwrite previous results in same directory tree.
#'
#' @param mc.cores Number of cores to use for each SignatureAnalyzer run.
#' \code{mclapply}; ignored on Windows.
#' The total cores used simultaneously = \code{num.runs} * \code{mc.cores}.
#'
#' @param verbose If TRUE cat a message regarding progress.
#'
#' @param seed If not \code{NULL} call
#' \code{RNGkind(kind = "L'Ecuyer-CMRG"); set.seed(seed)}.
#'
#' @importFrom parallel mclapply
#'
#' @export
SAMultiRunOneCatalog <-
function(num.runs,
signatureanalyzer.code.dir,
input.catalog,
out.dir,
maxK = 30,
tol = 1e-7,
test.only = FALSE,
delete.tmp.files = TRUE,
overwrite = FALSE,
mc.cores = 1,
verbose = FALSE,
seed = NULL) {
if (!dir.exists(out.dir)) {
if (!dir.create(out.dir, recursive = TRUE)) {
stop("Failed to create ", out.dir,
"when getwd() == ", getwd())
}
}
else warning(out.dir, "exists, overwriting")
if (!file.exists(input.catalog)) {
stop("Input catalog ", input.catalog,
" does not exist when getwd() = ", getwd())
}
if (!is.null(seed)) {
RNGkind(kind = "L'Ecuyer-CMRG")
set.seed(seed)
}
RunOneIndex <- function(i) {
out.dir2 <- paste0(out.dir, "/sa.run.", i)
signature.analyzer.output <-
SignatureAnalyzerOneRun(
signatureanalyzer.code.dir = signatureanalyzer.code.dir,
input.catalog = input.catalog,
out.dir = out.dir2,
maxK = maxK,
tol = tol,
test.only = test.only,
delete.tmp.files = delete.tmp.files,
overwrite = overwrite)
attr(signature.analyzer.output, "message") <-
paste0("Success for ", out.dir2)
return(signature.analyzer.output)
}
mc.cores.to.use <-
ifelse(Sys.info()["sysname"] == "Windows", 1, mc.cores)
if (verbose) {
cat("Using", mc.cores.to.use, " cores\n")
}
mc.output <-
mclapply(1:num.runs, FUN = RunOneIndex, mc.cores = mc.cores.to.use)
capture.output(
print(mc.output),
file = paste0(out.dir, "/verbose.txt"))
attr(mc.output, "wd") <- getwd()
attr(mc.output, "out.dir") <- out.dir
attr(mc.output, "mc.cores") <- mc.cores.to.use
return(mc.output)
}
#' Run SignatureAnalyzer on 4 coordinated data sets and put results
#' in specified location.
#'
#' @details The 4 coordinated data sets are
#'
#' \enumerate{
#' \item \code{sa.sa.96}
#'
#' \item \code{sp.sp}
#'
#' \item \code{sa.sa.COMPOSITE}
#'
#' \item \code{sp.sa.COMPOSITE}
#'
#' }
#' which are described elsewhere.
#'
#'
#' @param num.runs Number of SignatureAnalyzer runs per data set.
#'
#' @param signatureanalyzer.code.dir The directory holding the
#' SignatureAnalyzer code.
#'
#' @param dir.root Root of directory tree that contains the
#' input data and to which the results will be written.
#'
#' @param maxK The maximum number of signatures to consider
#' extracting.
#'
#' @param tol Controls when SignatureAnalyzer will terminate
#' its search; \code{tol} was 1.e-05 for the PCAWG7 analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#' files generated by SignatureAnalyzer.
#'
#' @param slice Vector of integers from 1:4. Only run on the
#' corresponding data set (see Details).
#'
#' @param overwrite if TRUE overwrite preexisting results.
#'
#' @param mc.cores The number of cores to use with \code{mclapply};
#' automatically overridden to 1 on Windows.
#'
#' @export
#'
#' @importFrom ICAMS WriteCatalog ReadCatalog
SignatureAnalyzer4MatchedCatalogs <-
function(
num.runs = 20,
signatureanalyzer.code.dir,
dir.root,
maxK = 30,
tol = 1e-7,
test.only = FALSE,
delete.tmp.files = TRUE,
slice = 1:4,
overwrite = FALSE,
mc.cores = 1) {
if (!dir.exists(dir.root)) stop(dir.root, "does not exist")
subdirs <- c("sa.sa.96", "sp.sp", "sa.sa.COMPOSITE", "sp.sa.COMPOSITE")
tmp.fn <-
#function(subdir, read.fn, write.fn)
function(subdir) {
retval1 <-
SAMultiRunOneCatalog(
num.runs = num.runs,
signatureanalyzer.code.dir = signatureanalyzer.code.dir,
input.catalog = paste0(dir.root, "/", subdir, "/ground.truth.syn.catalog.csv"),
out.dir = paste0(dir.root, "/", subdir, "/sa.results/"),
maxK = maxK,
tol = tol,
test.only = test.only,
delete.tmp.files = delete.tmp.files,
overwrite = overwrite,
mc.cores = mc.cores)
return(retval1)
}
retval2 <-
mapply(tmp.fn, subdirs[slice])
invisible(retval2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.