#options taken from the report subprogram
includeOptNames <- c("outdir", "prefix", "annot")
getSegmentOptions <- function(){
#this is not the actual complete list of options
#for segmentCLI: the complete list is a merge between
#the two lists 'getSegmentOptions()' and the elements in 'getReportOptions()'
#specified in 'includeOptNames'
opts1 <- list(
list(arg="--counts", type="character", required=TRUE, vectorial=TRUE,
help="Path to the count matrix. To train a model on multiple datasets,
the count matrices must be compatible, i.e. with the same marks and on the
same genomic regions. If this option is repeated the paths must be labelled,
example:
`--counts dataset1:path1/counts.txt --counts dataset2:path2/counts.txt`"),
list(arg="--regions", type="character", required=TRUE, parser=readRegions,
help="Path to the BED file with the genomic regions associated to
the count matrix. Only the first three fields of each line
will be considered. Typically each region comprises several bins
(for example, one region per chromosome)."),
list(arg="--nstates", type="integer", required=TRUE,
help="Number of states to use for training"),
list(arg="--model", type="character", parser=readModel,
help="Path to the file with the parameters of the HMM.
The model contains the initial parameters for learning.
If `--notrain` is set, no learning will take place and all
model parameters must be specified."),
list(arg="--notrain", flag=TRUE,
help="The provided model will be used 'as-is' without training."),
list(arg="--collapseInitP", type="logical", meta="true_or_false",
help="In case a model with multiple initial probabilities is provided,
should those probabilities be averaged? If you are not sure about what
this means, the default is fine (the default is `FALSE` in train mode
and `TRUE` in predict mode)"),
list(arg="--nthreads", type="integer", default=formals(segment)$nthreads,
help="Number of threads to be used"),
list(arg="--split4speed", type="logical", default=formals(segment)$split4speed,
help="Add artificial splits in the input regions to speed-up the
algorithm. This makes sense when the number of regions is small compared
to the number of threads. The artificial breaks are used only in the
training phase, not in the computation of the final state assignments."),
list(arg="--maxiter", type="integer", default=formals(segment)$maxiter,
help="Maximum number of iterations in train mode"),
list(arg="--save_rdata", flag=TRUE,
help="Save an R data archive with the results of the segmentation. To
see what they are, inside R, type `library(epicseg)` and `?segment`"))
opts2 <- makeOptions(getReportOptions())[includeOptNames]
c(opts1, opts2)
}
segmentCLI <- function(args, prog){
segmentOptions <- getSegmentOptions()
#parse the options
opt <- parseArgs(segmentOptions, args, prog)
#deal with the two scenarios for the 'counts' option
if (length(opt$counts)==1 && !grepl(":", opt$counts)){
opt$counts <- readCounts(opt$counts)
} else {
lp <- label_sc_path(opt$counts, unique.labels=TRUE)
opt$counts <- lapply(setNames(lp$path, lp$label), readCounts)
}
#split the options for 'segment' from the options for 'report'
roptnames <- intersect(includeOptNames, names(opt))
sopt <- opt[setdiff(names(opt), roptnames)]
ropt <- opt[roptnames]
#deal with 'save_rdata'
save_rdata <- !is.null(sopt$save_rdata)
sopt$save_rdata <- NULL
#call 'segment'
segmentation <- do.call(segment, sopt)
#set the computed options for 'report'
ropt$segments <- segmentation$segments
ropt$model <- segmentation$model
if (save_rdata) ropt$rdata <- segmentation
#call 'report'
cat("producing report\n")
do.call(report, ropt)
}
#these options are not available from the CLI and can be set in 'segment' with the ellipsis (...)
advancedOpts <- c("tol","verbose","nbtype","init","init.nlev", "rmin")
#' Learn a model and produce a segmentation
#'
#' @param counts Count matrix or list of count matrices matching with the
#' \code{regions} parameter. Each row of the matrix represents a mark and each
#' column a bin resulting from dividing the genomic regions into non-overlapping
#' bins of equal size. The rows of the matrix must be named with the
#' name of the marks and these names must be unique.
#' @param regions GRanges object containing the genomic regions of interest.
#' Each of these regions corresponds to a set of bins and each bin to a column
#' of the count matrix. The binsize is automatically derived by comparing
#' the columns of the count matrix with the width of the regions.
#' @param nstates Number of states to learn.
#' @param model A list with the parameters that describe
#' the HMM. Missing parameters will be learned, and the provided
#' parameters will be used as initial parameters for the learning
#' algorithm. If \code{train==FALSE} the parameter set must be
#' complete and no learning will take place.
#' @param notrain If FALSE, the parameters will be learned, otherwise
#' the provided parameters (with the \code{model} option) will be
#' used without learning to produce a segmentation.
#' @param collapseInitP In case a model with multiple initial probabilities
#' is provided, should those probabilities be averaged and reduced to
#' one initial probabilities vector? If you are not sure
#' about what this means, don't set this option.
#' @param nthreads number of threads used for learning
#' @param split4speed add artificial splits in the input regions to improve
#' the parallelism of the forward-backward algorithm. Usually the results change
#' very little and the algorithm runs considerably faster, if the number of
#' input regions is smaller than the number of threads. See \code{?kfoots} for
#' more details.
#' @param maxiter Maximum number of iterations for learning.
#' @param ... Advanced options for learning. Type
#' \code{epicseg:::advancedOpts} to see which options are allowed,
#' and type \code{?kfoots} to see what the options do.
#' @return A list with the following arguments:
#' \item{segments}{The segmentation as a GRanges object.
#' The slot \code{names} of this object contains a number from
#' 1 to nstates saying which state each segment belongs to.}
#' \item{model}{A list containing all the parameters of the model.}
#' \item{posteriors}{A matrix of size \code{nstates*ncol(counts)} containing the posterior
#' probability that a given datapoint is generated by the given state}
#' \item{states}{An integer vector of length \code{ncol(counts)} saying
#' which state each bin is associated to (using the posterior decoding
#' algorithm). This vector is used to create the \code{segments} argument.}
#' \item{viterbi}{Same as \code{states}, but using the viterbi algorithm.}
#' \item{loglik}{the log-likelihood of the whole dataset.}
#' @export
segment <- function(counts, regions, nstates=NULL, model=NULL, notrain=FALSE, collapseInitP=FALSE,
nthreads=1, split4speed=FALSE, maxiter=200, ...){
#REFORMAT KFOOTS OPTIONS
kfootsOpts <- list(...)
if (length(kfootsOpts)>0){
optNames <- names(kfootsOpts)
if (is.null(optNames) || any(is.na(optNames))) stop("the advanced options must be named")
if (!all(optNames %in% advancedOpts)) stop("check 'epicseg::advancedOpts' for the list of allowed advanced options")
}
kfootsOpts <- c(list(nthreads=nthreads, split4speed=split4speed, framework="HMM"), kfootsOpts)
#CHECK ARGUMENTS
if (!is.list(counts)) {
clist <- list(counts)
} else clist <- counts
validateCList(clist)
nmat <- length(clist)
if (nmat==0) stop("empty list of count matrices")
if (is.null(regions) || !inherits(regions, "GRanges")) stop("'regions' must be a GenomicRanges object")
#check consistency between regions and counts
binsize <- checkBinsize(regions, ncol(clist[[1]]))
if (all(width(regions) <= binsize)) stop("no region contains consecutive bins, cannot fit transition probabilities")
if (is.null(model) && notrain) stop("no model provided, training necessary")
if (!is.null(model)){
#if we can't figure out the number of states from the model
#we can completely discard it (it is an empty model)
dims <- validateModel(model, strict=!notrain)
if (is.null(dims$nstates)) model <- NULL
}
if (!is.null(model)) {
#figure out the number of states
if (!is.null(nstates) && nstates != dims$nstates) {
warning("inconsistent 'nstates' given, using the value provided in the model")
nstates <- dim$nstates
}
if (!is.null(model$marks) && !is.null(model$emisP)) model <- matchModelToCounts(model, clist[[1]])
if (!is.null(model$initP)){
#adapt the initPs to the current set of observations
nInits <- length(regions)*nmat
if (!collapseInitP && ncol(model$initP) != nInits){
warning("collapsing the initial probabilities for compatibility with the new set of observations")
collapseInitP <- TRUE
}
if (collapseInitP){
avgInitP <- rowMeans(model$initP)
model$initP <- matrix(avgInitP, nrow=length(avgInitP), ncol=nInits)
}
}
nstates <- model$nstates
if (is.null(model$emisP)) kfootsOpts$k <- nstates
else kfootsOpts$k <- model$emisP
kfootsOpts$trans <- model$transP
kfootsOpts$initP <- model$initP
} else if (!is.null(nstates)){
kfootsOpts$k <- nstates
} else stop("provide either an existing HMM or a desired number of states")
kfootsOpts$maxiter <- maxiter
#seqlens depend on the genomic regions
kfootsOpts$seqlens <- rep(width(regions)/binsize, nmat)
#pass the count matrix (join the elements of the list if necessary)
if (nmat==1){ kfootsOpts$counts <- clist[[1]]
} else {
message("concatenating count matrices")
kfootsOpts$counts <- bindCList(clist, nthreads)
}
#deal with the 'train' option
if (notrain){
kfootsOpts$maxiter <- 0
kfootsOpts$verbose <- FALSE
}
#CALL KFOOTS
tryCatch(
fit <- do.call(kfoots, kfootsOpts),
error=function(e) kfoots_error_handler(e$message, regions, binsize)
)
#REORDER STATES
if (is.null(model)) fit <- reorderFitStates(fit)
#MAKE MODELS
model <- list(nstates=length(fit$models), marks=rownames(clist[[1]]),
emisP=fit$models, transP=fit$trans, initP=fit$initP)
if (nmat > 1){
#format the data appropriately
nbinsSingle <- ncol(kfootsOpts$counts)/nmat
#posteriors
setDim_unsafe(fit$posteriors, c(nstates, nbinsSingle, nmat))
setDimnames_unsafe(fit$posteriors, list(NULL, NULL, names(clist)))
#states and viterbi
for (v in list(fit$clusters, fit$viterbi$vpath)){
setDim_unsafe(v, c(nbinsSingle, nmat))
setDimnames_unsafe(v, list(NULL, names(clist)))
}
#MAKE SEGMENTS as GRangesList
segms <- GRangesList(apply(fit$clusters, 2, statesToSegments, regions=regions))
} else {
#MAKE SEGMENTS as GRanges
segms <- statesToSegments(fit$clusters, regions)
}
list(segments=segms, model=model, posteriors=fit$posteriors, states=fit$clusters,
viterbi=fit$viterbi$vpath, loglik=fit$loglik)
}
kfoots_error_handler <- function(err_msg, regions, binsize) {
#deal with underflow error
uflowmsg <- "^Underflow error.*@([[:digit:]]*):([[:digit:]]*):([[:digit:]]*)@$"
if (!grepl(uflowmsg, err_msg)) stop(err_msg)
msg <- tryCatch({
# parse coordinates from error message
# we only take the 3rd, because there is no guarantee that
# the regions used by kfoots are the same as those in the
# 'regions' variable (because of the 'split4speed' option)
coords <- gsub(uflowmsg, "\\1 \\2 \\3", err_msg)
coords <- as.integer(strsplit(coords, split=" ")[[1]])
absbin <- coords[3]
# get the genomic range that caused the underflow
bin <- getBin(absbin, regions, binsize)
if (is.null(bin)) stop(err_msg)
paste0(err_msg, "\n Are the read counts near the bin ",
seqnames(bin), ":", int2str(start(bin)-1), "-",
int2str(end(bin)),
" abnormally high?\n Try to remove this region")
},
# this happens if we fail to parse the error message
# could be due to an older version of kfoots
error=function(e) stop(err_msg)
)
stop(msg)
}
getBin <- function(binidx, regions, binsize){
ws <- width(regions)/binsize
nbins <- sum(ws)
binidx <- ((binidx-1) %% nbins) + 1
breaks <- cumsum(c(1,ws))
starts <- breaks[1:(length(breaks)-1)]
ends <- breaks[2:length(breaks)]
regidx <- which(starts <= binidx & binidx < ends)[1]
if (is.na(regidx)) {return(NULL)}
chr <- seqnames(regions)[regidx]
beg <- start(regions)[regidx] + (binidx - starts[regidx])*binsize
end <- beg + binsize - 1
GRanges(seqnames=chr, IRanges(start=beg, end=end))
}
reorderFitStates <- function(fit){
nstates <- length(fit$models)
#determine a distance matrix between states
#in 'mps' rows are states and columns are histone marks
mps <- t(log(getMeanMatrix(fit$models) + 1))
dmat <- as.matrix(dist(mps, method="euclidean"))
#get a small weight path through all the states
path <- smallWeightHamiltonianPath(dmat)
if (length(path) != nstates) stop("something went wrong..")
#let's make state 1 the one with more counts
if (sum(mps[path[1],]) < sum(mps[path[nstates],])){
path <- rev(path)
}
#let's permute every object now
fit$models <- fit$models[path, drop=F]
fit$trans <- fit$trans[path, path, drop=F]
fit$initP <- fit$initP[path,, drop=F]
fit$posteriors <- fit$posteriors[path,, drop=F]
#compute inverse permutation
io <- integer(nstates); io[path] <- 1:nstates
fit$clusters <- io[fit$clusters]
fit$viterbi$vpath <- io[fit$viterbi$vpath]
fit
}
#METHODS TO VALIDATE AND SANITIZE ARGUMENTS
checkBinsize <- function(regions, nbins){
genomicSpan <- sum(as.numeric(width(regions)))
if (genomicSpan %% nbins != 0) stop("total genomic span is not a multiple of the number of bins")
binsize <- genomicSpan/nbins
if (any((width(regions) %% binsize) != 0)) stop("regions widths are not multiples of binsize")
binsize
}
#make sure that the rownames of counts
#are in the same order as the marks in the model
matchModelToCounts <- function(model, counts){
if (!setequal(model$marks, rownames(counts))) {
stop("provided models have different names than the rows of the 'counts' matrix")
}
mperm <- match(rownames(counts), model$marks)
#this function reorders the 'marks' and 'emisP' fields of the model
model <- reoderMarks(model, mperm)
model
}
#MISC
statesToSegments <- function(states, regions){
h <- statesToSegments_helper(regions, states)
GRanges(seqnames=h$chrs, IRanges(start=h$starts, end=h$ends, names=h$states))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.