#' Run SignatureAnalyzer attribution on a catalog file and output by RunSignatureAnalyzerOnFile().
#'
#' Normally, please call \code{\link{SignatureAnalyzerOneRun}}
#' instead of this function.
#'
#' @param input.catalog File containing input catalog. Columns are
#' samples (tumors), rows are signatures. SignatureAnalyzer does
#' not care about the row names (I think) TODO(Steve): check this.
#'
#' @param read.catalog.function Function taking a file path as
#' its only argument and returning a catalog as a numeric matrix.
#'
#' @param extracted.signature.file A .csv file containing extracted
#' signatures. Normally, this file is named \code{"sa.output.sigs.csv"}
#' and is generated by function \code{RunSignatureAnalyzerOnFile()}.
#' It expects to have the same format as the \code{input.catalog},
#' thus it will be read by \code{read.catalog.function} too.
#'
#' @param raw.exposures.file A .csv file containing raw attributions
#' of exposures. Normally, this file is named \code{"sa.output.raw.exp.csv"}
#' and is generated by function \code{RunSignatureAnalyzerOnFile()}.
#'
#' @param out.dir Directory that will be created for the output;
#' abort if it already exits. Log files will be in
#' \code{paste0(out.dir, "/tmp")}.
#'
#' @param write.signature.function Function with first argument the
#' signatures generated by SignatureAnalyzer and second argument
#' the file to write to.
#'
#' @param input.exposures A file with the synthetic exposures used to generate
#' \code{input.catalog}; if provided here,
#' this is copied over to the output directory
#' for downstream analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#' files generated by SignatureAnalyzer.
#'
#' @param verbose If \code{TRUE} cat a message regarding progress.
#'
#' @param overwrite If TRUE, overwrite existing output.
#'
#' @return The final attribution matrix.
#' (i.e. \code{exp.fine.tuned})
#'
#' @details Save the final attribution of a catalog matrix into
#' a file named \code{"sa.output.fine.exp.csv"} under the
#' folder \code{out.dir}.
#'
#' @export
#'
#' @importFrom utils capture.output
RunSignatureAnalyzerAttribution <-
function(input.catalog,
read.catalog.function,
extracted.signature.file,
raw.exposures.file,
write.signature.function,
out.dir,
test.only = FALSE,
input.exposures = NULL,
delete.tmp.files = TRUE,
overwrite = FALSE,
verbose = FALSE) {
# TEMPORARY is a global required by SignatureAnalyzer
assign("TEMPORARY",paste0(out.dir, "/tmp/"),envir = envSA)
if (dir.exists(envSA$TEMPORARY)) {
if (!overwrite) stop("Directory ", envSA$TEMPORARY, " already exists")
} else {
dir.create(envSA$TEMPORARY)
}
# Load files required by fine-attribution step.
# Load catalog matrix file
# same as the file used in RunSignatureAnalyzerOnFile()
syn.data <- read.catalog.function(input.catalog,
strict = FALSE)
if (test.only) syn.data <- syn.data[ , 1:10]
if (dir.exists(out.dir)) {
if (!overwrite) stop(out.dir, " already exits")
} else {
dir.create(out.dir)
}
# Load extracted signatures.
sigs <- read.catalog.function(extracted.signature.file,
strict = FALSE)
# If signatures are un-normalized, Normalize them
sigs <- apply(sigs, 2, function(x) x/sum(x))
# Load RAW-inferred exposures required by fine-attribution step.
exp.raw <- SynSigGen::ReadExposure(raw.exposures.file)
if(FALSE){
exp.raw <- exp.raw[sigs.to.use, , drop = FALSE]
rownames(exp.raw) <- new.names
}
# 2 more steps of fine-tuned attribution.
# INPUT: extracted signatures (sigs),
# input catalog matrix (syn.data),
# and RAW-inferred exposures (exp.raw)
W0 <- sigs # Extracted signatures
H0 <- exp.raw # initially inferred exposures
V0 <- syn.data # Catalog matrix for all tumors
K0 <- ncol(W0) # Number of signatures
# Z0 is a signature allowance matrix for the whole dataset.
# rows refer to signatures, columns refer to tumor samples.
# If you specify an entry as 0,
# then you prohibit a signature to be active in this tumor sample.
# Here, we simply select Z0 as all-1 matrix
# and therefore any signature is allowed to be present in any tumor.
Z0 <- array(1,dim=c(K0,ncol(V0))) # signature indicator matrix Z (0 = not allowed, 1 = allowed); all elements initialized by one.
colnames(Z0) <- colnames(H0)
rownames(Z0) <- rownames(H0)
# The original SignatureAnalyzer code attributes tumors of different tumor types separately
# TODO(Wuyang): Add separation of tumor types later.
ttype <- rep("SBS1.SBS5.correlated",ncol(V0))
ttype.unique <- unique(ttype)
n.ttype.unique <- length(ttype.unique)
a0 <- 10 ### default parameter
phi <- 1.5 ### default parameter
for (i in 1:n.ttype.unique) {
#### First step: determine a set of optimal signatures best explaining the observed mutations in each tumor type ("cohort").
#### The automatic relevance determination technique was applied to the H matrix only, while keeping signatures (W0) frozen.
cohort <- ttype.unique[i] ## each cohort is composed of tumor catalogs with the SAME tumor type.
W1 <- W0
V1 <- as.matrix(V0[,ttype==cohort])
Z1 <- Z0[,match(colnames(V1),colnames(Z0),nomatch=0)] ## If a signature is not active in a tumor type in initial attribution,
## it is no more allowed in the fine-tuned attribution step.
H1 <- Z1*H0[,match(colnames(V1),colnames(Z0),nomatch=0)]
lambda <- rep(1+(sqrt((a0-1)*(a0-2)*mean(V1,na.rm=T)/K0))/(nrow(V1) + ncol(V1) + a0 + 1)*2,K0)
res0 <- envSA$BayesNMF.L1.KL.fixed_W.Z(as.matrix(V1),as.matrix(W1),as.matrix(H1),as.matrix(Z1),lambda,2000000,a0,1.e-07,1/phi)
H2 <- res0[[2]] ## Attribution from second step
colnames(H2) <- colnames(V1) ## The colnames of H2 (exposure attribution matrix) are the names of tumor samples,
## it should be the same as the colnames of V1 (catalog matrix for tumors whose tumor type is cohort=ttype.unique[i])
rownames(H2) <- colnames(W0) ## The rownames of H2 (exposure attribution matrix) are the names of signatures,
## it should be the same as the colnames of W0 (signature matrix extracted in the previous section)
#### Second step: determine a sample-level exposure attribution using selected sigantures in the first step.
index.H2 <- rowSums(H2)>1 ### identify only active signatures in the cohort
Z2 <- Z1
Z2[!index.H2,] <- 0 ### only selected signatures in the first step are allowed + the original contraints on the signature availability from Z1.
for (j in 1:ncol(H2)) {
tmpH <- rep(0,ncol(W0))
if (sum(V1[,j])>=5) {
lambda <- 1+(sqrt((a0-1)*(a0-2)*mean(V1,na.rm=T)/K0))/(nrow(V1) + ncol(V1) + a0 + 1)*2
res <- envSA$BayesNMF.L1.KL.fixed_W.Z.sample(as.matrix(V1[,j]),W0,as.matrix(H2[,j]),as.matrix(Z2[,j]),lambda,1000000,a0,1.e-07,1) ## Precise attribution
tmpH <- res[[2]]
}
if (j==1) {
H3 <- tmpH
} else {
H3 <- cbind(H3,tmpH)
if (verbose) cat(j,'\n')
}
}
colnames(H3) <- colnames(V1)
rownames(H3) <- colnames(W0)
if (i==1) {
H2.all <- H2
H3.all <- H3
} else {
H2.all <- cbind(H2.all,H2) ## Attribution result from step 2
H3.all <- cbind(H3.all,H3) ## Attribution result from step 3
}
}
exp.fine.tuned <- H3.all ### fine-tuned attributions of signature exposures
# Output fine-tuned attributions
SynSigGen::WriteExposure(
exp.fine.tuned,
file = paste0(out.dir, "/sa.output.fine.exp.csv"))
# Copy the input exposure into "out.dir",
# if filepath "input.exposures" is provided.
if (!is.null(input.exposures)) {
SynSigGen::WriteExposure(
SynSigGen::ReadExposure(input.exposures),
file = paste0(out.dir, "/input.syn.exp.csv"))
}
# Delete temporary files if flag delete.tmp.files == TRUE
if (delete.tmp.files) unlink(TEMPORARY, recursive = TRUE)
invisible(exp.fine.tuned)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.