#' Run hdp extraction and attribution on a spectra catalog file
#'
#' @param input.catalog File containing a spectra catalog
#' in \code{\link[ICAMS]{ICAMS}} format.
#'
#' @param out.dir Directory that will be created for the output;
#' abort if it already exits. Log files will be in
#' \code{paste0(out.dir, "/tmp")}.
#'
#' @param CPU.cores Number of CPUs to use in running
#' \code{\link[hdp]{hdp_posterior}}.
#'
#' @param seedNumber Specify the pseudo-random seed number
#' used to run hdp. Setting seed can make the
#' attribution of hdp repeatable.
#' Default: 1.
#'
#' @param K.guess Suggested initial value of the number of
#' signatures, passed to \code{\link[hdp]{dp_activate}} as
#' \code{initcc}.
#'
#' @param multi.types A logical scalar or
#' a character vector.
#' If \code{FALSE}, hdp will regard all input spectra as one tumor type,
#' and will allocate them to one single dirichlet process node.
#'
#' If \code{TRUE}, hdp will infer tumor types based on the string before "::" in their names.
#' e.g. Tumor type for "SA.Syn.Ovary-AdenoCA::S.500" would be "SA.Syn.Ovary-AdenoCA"
#'
#' If it is a character vector, it should be a vector of case-sensitive tumor
#' types.
#' e.g. \code{c("SA.Syn.Ovary-AdenoCA", "SA.Syn.Ovary-AdenoCA", "SA.Syn.Kidney-RCC")}.
#'
#' @param remove.noise Whether to remove noise signature "hdp.0"? In normal cases scenarios,
#' only few mutations will be assigned to noise signature.
#'
#' For result visualization and assessment of \code{hdp} package, select \code{TRUE};
#' for diagnostic purposes, select \code{FALSE}.
#'
#' @param num.posterior Number of posterior sampling chains; can set to
#' 1 for testing.
#'
#' @param post.burnin Pass to \code{\link[hdp]{hdp_posterior}}
#' \code{burnin}.
#'
#' @param post.n Pass to \code{\link[hdp]{hdp_posterior}}
#' \code{n}.
#'
#' @param post.space Pass to \code{\link[hdp]{hdp_posterior}}
#' \code{space}.
#'
#' @param post.cpiter Pass to \code{\link[hdp]{hdp_posterior}}
#' \code{cpiter}.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' in \code{input.catalog}.
#'
#' @param overwrite If TRUE, overwrite existing output.
#'
#' @param verbose If \code{TRUE} then \code{message} progress information.
#'
#' @return The inferred exposure of \code{hdp}, invisibly.
#'
#' @details Creates several
#' files in \code{out.dir}. These are:
#' TODO(Steve): list the files
#'
#' TODO(Wuyang)
#'
#' @importFrom utils capture.output
#'
#' @export
RunhdpLessHier <-
function(input.catalog,
out.dir,
CPU.cores = 1,
seedNumber = 1,
K.guess,
multi.types = FALSE,
remove.noise = FALSE,
num.posterior = 4,
post.burnin = 4000,
post.n = 50,
post.space = 50,
post.cpiter = 3,
test.only = FALSE,
overwrite = FALSE,
verbose = TRUE) {
# Set seed
set.seed(seedNumber)
seedInUse <- .Random.seed # To document the seed used
RNGInUse <- RNGkind() # To document the random number generator (RNG) used
# Read in spectra data from input.catalog file
spectra <- ICAMS::ReadCatalog(input.catalog,strict = FALSE)
if (test.only) spectra <- spectra[ , 1:25]
# CPU.cores specifies number of CPU cores to use.
# CPU.cores will be capped at 30.
# If CPU.cores is not specified, CPU.cores will
# be equal to the minimum of 30 or (total cores)/2
# convSpectra: convert the ICAMS-formatted spectra catalog
# into a matrix which HDP accepts:
# 1. Remove the catalog related attributes in convSpectra
# 2. Transpose the catalog
convSpectra <- spectra
# hdp get confused if there are extra attributes
# or classes.
class(convSpectra) <- "matrix"
attr(convSpectra,"catalog.type") <- NULL
attr(convSpectra,"region") <- NULL
dimnames(convSpectra) <- dimnames(spectra) # check if really necessary
convSpectra <- as.data.frame(t(convSpectra))
number.channels <- nrow(spectra)
number.samples <- ncol(spectra)
# Create output directory
if (dir.exists(out.dir)) {
if (!overwrite) stop(out.dir, " already exits")
} else {
dir.create(out.dir, recursive = T)
}
if (verbose) {
message("number of Dirichlet process data clusters = ", K.guess)
}
# Run hdp main program.
# Step 1: initialize hdp object
{
# Allocate process index for hdp initialization.
# Each different index number refers to a dirichlet process
# for one tumor type.
if(multi.types == FALSE){ # All tumors belong to one tumor type (default)
num.tumor.types <- 1
process.index <- c(0,rep(1,number.samples))
} else if(multi.types == TRUE){
# There are multiple tumors in the sample.
# Tumor type will be inferred by the string before "::" in the column names.
# e.g. Tumor type for "SA.Syn.Ovary-AdenoCA::S.500" would be "SA.Syn.Ovary-AdenoCA"
tumor.types <- sapply(
colnames(spectra),
function(x) {strsplit(x,split = "::",fixed = T)[[1]][1]})
num.tumor.types <- length(unique(tumor.types))
# 0 refers to top grandparent DP node. All signatures are drawn from this node.
# Signature of each tumor type is drawn from a parent DP node (level 1).
# If a dataset has X tumor types, then we need to specify X level-1 nodes.
process.index <- c(0, rep(1,num.tumor.types))
# For every tumor of the 1st/2nd/3rd/... tumor type,
# we need to specify a level 2/3/4/... DP node for the tumor.
process.index <- c(process.index,1 + as.numeric(as.factor(tumor.types)))
# Something like
# c(0, 1, 1, 2, 2, 2, 3, 3)
# 0 is grandparent
# 1 is a parent of one type (there are 2 types)
# 2 indcates tumors of the first type
# 3 indicates tumors of second type
} else if (is.character(multi.types)){ # multi.types is a character vector recording tumor types
num.tumor.types <- length(unique(multi.types))
process.index <- c(0, rep(1,num.tumor.types))
process.index <- c(process.index, 1 + as.numeric(as.factor(multi.types)))
} else {
stop("Error. multi.types should be TRUE, FALSE, or a vector of tumor types for each tumor sample.\n")
}
# Specify ppindex as process.index,
# and cpindex (concentration parameter) as 1 + process.index
ppindex <- process.index
cpindex <- 1 + process.index
# Calculate the number of levels in the DP node tree.
dp.levels <- length(unique(ppindex))
# DP node of each level share two Dirichlet Hyperparameters:
# shape (alphaa) and rate (alphab).
# For mutational signature analysis purpose,
# alphaa and alphab are set as 1 for each level.
alphaa <- rep(1,dp.levels)
alphab <- rep(1,dp.levels)
# initialise hdp
if (verbose) message("calling hdp_init")
hdpObject <- hdp::hdp_init(ppindex = ppindex,
cpindex = cpindex,
hh = rep(1,number.channels),
alphaa = alphaa,
alphab = alphab)
# num.process is the number of samples plus number cancer types plus 1 (grandparent)
num.process <- hdp::numdp(hdpObject)
if (verbose) message("calling hdp_setdata")
if(num.tumor.types > 1){
hdpObject <- hdp::hdp_setdata(
hdpObject,
(1 + num.tumor.types + 1):num.process,
convSpectra)
} else{
hdpObject <- hdp::hdp_setdata(
hdpObject,
(1 + 1):num.process,
convSpectra)
}
if (verbose) message("calling dp_activate")
# dp_activate calls hdp:::stirling, which when
# called on a large number (e.g. > 200000), requires a great deal of
# memory (e.g. estimated 2 Terabyte for stirling(200000)).
hdpObject <- hdp::dp_activate(hdpObject,
1:num.process,
initcc = K.guess,
seed = seedNumber)
# Release the occupied RAM by dp_activate
gc()
}
# Step 2: run 4 independent sampling chains
{
if(CPU.cores == 1){ # debug
# Run four independent posterior sampling chains
chlist <- vector("list", 4) #4 is too much here!
for (i in seq(1,num.posterior)) {
if (verbose) message("calling hdp_posterior ", i)
chlist[[i]] <-
hdp::hdp_posterior(
hdpObject,
# The remaining values, except seed, are from the vignette; there
# are no defaults.
burnin = post.burnin,
n = post.n,
space = post.space,
cpiter = post.cpiter,
# Must choose a different seed for each of the 4 chains:
seed = (seedNumber + i * 10^6) )
}
} else {
f_posterior <- function(seed,hdpObject) {
if (verbose) message("calling hdp_posterior")
retval <- hdp::hdp_posterior(
hdpObject,
# The remaining values, except seed, are from the vignette; there
# are no defaults.
burnin = post.burnin,
n = post.n,
space = post.space,
cpiter = post.cpiter,
# Must choose a different seed for each of the 4 chains:
seed = seed )
return(retval)
}
chlist <- parallel::mclapply(
X = seedNumber + 10^6 * 1:4,
FUN = f_posterior,
hdpObject = hdpObject,
mc.cores = CPU.cores
)
}
# Generate the original multi_chain for the sample
if (verbose) message("calling hdp_multi_chain")
mut_example_multi <- hdp::hdp_multi_chain(chlist)
}
# Step 3: Plot the diagnostics of sampling chains.
{
# Plotting using hdp functions
# Plotting device on the server does not work
# Need to plot the file into a pdf
if (verbose) message("plotting to original_sample.pdf")
# Draw the DP oscillation plot for mut_example_multi(original_sample)
{
grDevices::pdf(file = paste0(out.dir,"/original_sample.pdf"))
graphics::par(mfrow=c(2,2), mar=c(4, 4, 2, 1))
p1 <- lapply(hdp::chains(mut_example_multi),
hdp::plot_lik, bty="L", start=500)
p2 <- lapply(hdp::chains(mut_example_multi),
hdp::plot_numcluster, bty="L")
grDevices::dev.off()
}
if (verbose) message("calling hdp_extract_components")
# Extract components(here is signature) with cosine.merge = 0.90 (default)
mut_example_multi_extracted <- hdp::hdp_extract_components(mut_example_multi)
mut_example_multi_extracted
# Generate a pdf for mut_example_multi_extracted
if(FALSE){
if (verbose) message("plotting to signature_hdp_embedded_func.pdf")
grDevices::pdf(
file = paste0(out.dir,"/signature_hdp_embedded_func.pdf"))
# Draw the DP oscillation plot for mut_example_multi_extracted
graphics::par(mfrow=c(2,2), mar=c(4, 4, 2, 1))
p1 <- lapply(hdp::chains(mut_example_multi_extracted),
hdp::plot_lik, bty="L", start=500)
p2 <- lapply(hdp::chains(mut_example_multi_extracted),
hdp::plot_numcluster, bty="L")
# Draw the computation size plot
graphics::par(mfrow=c(1,1), mar=c(5, 4, 4, 2))
hdp::plot_comp_size(mut_example_multi_extracted, bty="L")
# Close the PDF device so that the plots are exported to PDF
grDevices::dev.off()
}
}
# Step 4: Using hdp samples to extract signatures
{
if (verbose) message("calling hdp::comp_categ_distn")
# Calculate the mutation composition in each signature:
extractedSignatures <-
hdp::comp_categ_distn(mut_example_multi_extracted)$mean
dim(extractedSignatures)
# Add base context for extractedSignatures
extractedSignatures <- t(extractedSignatures)
rownames(extractedSignatures) <- rownames(spectra)
# Change signature names in extractedSignatures
# from "0","1","2" to "hdp.0","hdp.1","hdp.2"
colnames(extractedSignatures) <-
paste("hdp", colnames(extractedSignatures), sep = ".")
# Remove "hdp.0" (noise signature) if remove.noise == TRUE
flagRemoveHDP0 <- remove.noise
# Remove "hdp.0" (noise signature) anyway if it is a signature contains NA.
# Also remove "hdp.0" if all channels of hdp.0 equal to 0.
if(all(extractedSignatures[,"hdp.0"] == 0) | any(is.na(extractedSignatures[,"hdp.0"])))
flagRemoveHDP0 <- TRUE
if(flagRemoveHDP0){
sigToBeRemoved <- which(colnames(extractedSignatures) == "hdp.0")
if(length(sigToBeRemoved) == 1)
extractedSignatures <- extractedSignatures[,-(sigToBeRemoved),drop = FALSE]
}
# Convert extractedSignatures to ICAMS-formatted catalog.
extractedSignatures <- ICAMS::as.catalog(extractedSignatures,
region = "unknown",
catalog.type = "counts.signature")
if (verbose) message("calling ICAMS::WriteCatalog")
ICAMS::WriteCatalog(extractedSignatures,
paste0(out.dir,"/extracted.signatures.csv"))
}
# Step 5: Using hdp samples to attribute exposure counts.
{
# Calculate the exposure probability of each signature(component) for each tumor sample(posterior sample corresponding to a dirichlet process node):
# This is the probability distribution of signatures(components) for all tumor samples(DP nodes)
# exposureProbs proves to be the normalized signature exposure all tumor samples
if (verbose) message("Calling hdp::comp_dp_distn to generate exposure probability")
exposureProbs <- hdp::comp_dp_distn(mut_example_multi_extracted)$mean
dim(exposureProbs)
if(num.tumor.types > 1){
exposureProbs <- exposureProbs[(num.tumor.types + 2):dim(exposureProbs)[1],]
} else {
exposureProbs <- exposureProbs[2:dim(exposureProbs)[1],]
}
rownames(exposureProbs) <- rownames(convSpectra)[1:dim(exposureProbs)[1]]
# Remove NA or NULL "hdp.0" signature in exposureProbs matrix.
if(flagRemoveHDP0){
sigToBeRemoved <- which(colnames(exposureProbs) == "0")
exposureProbs <- exposureProbs[,-(sigToBeRemoved),drop = FALSE]
}
# Change signature names in exposureCounts
# from "0","1","2" to "hdp.0","hdp.1","hdp.2"
colnames(exposureProbs) <- colnames(extractedSignatures)
dim(exposureProbs)
# Transpose exposureProbs so that it conforms to SynSigEval format.
exposureProbs <- t(exposureProbs)
# Calculate signature exposure counts from signature exposure probability
# Unnormalized exposure counts = Normalized exposure probability * Total mutation count in a sample
sample_mutation_count <- apply(convSpectra,1,sum)
exposureCounts <- matrix(nrow = nrow(exposureProbs), ncol = ncol(exposureProbs))
dimnames(exposureCounts) <- dimnames(exposureProbs)
for (sample in seq(1,ncol(exposureProbs)))
exposureCounts[,sample] <- sample_mutation_count[[sample]] * exposureProbs[,sample]
if (verbose) message("Calling SynSigGen::WriteExposure() to write exposure counts")
SynSigGen::WriteExposure(exposureProbs,
paste0(out.dir,"/exposure.probs.csv"))
SynSigGen::WriteExposure(exposureCounts,
paste0(out.dir,"/inferred.exposures.csv"))
}
# Save seeds and session information
# for better reproducibility
capture.output(sessionInfo(), file = paste0(out.dir,"/sessionInfo.txt")) # Save session info
write(x = seedInUse, file = paste0(out.dir,"/seedInUse.txt")) # Save seed in use to a text file
write(x = RNGInUse, file = paste0(out.dir,"/RNGInUse.txt")) # Save seed in use to a text file
# Save signatures, exposures and sample chains
retval <- list(signature = extractedSignatures,
exposure = exposureCounts,
exposure.p = exposureProbs,
mut_example_multi = mut_example_multi,
mut_example_multi_extracted = mut_example_multi_extracted)
save(retval,file = paste0(out.dir,"/Runhdp.retval.RData"))
# Return a list of signatures and exposures, and sample chains
invisible(retval)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.