# estimate.R, Copyright 2016,2017 Florian G. Pflug
#
# This file is part of gwpcR
#
# gwpcR is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# gwpcR is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with gwpcR. If not, see <http://www.gnu.org/licenses/>.
#' Parameter Estimation for PCR-Poisson Mixture
#'
#' Estimates the parameters \var{efficiency} and \var{lambda0} from a vector
#' of observed read counts per molecular family, or (depending on the estimation
#' method) the mean and variance of these observations. Supports arbitrary
#' detection thresholds and initial molecule counts, but estimation may be
#' considerably faster in the (unrealistic) case \var{threshold=0} than in the
#' general one.
#'
#' @param x a numeric vector containing the observed read counts per molecular
#' family, after removal of families below the detection threshold. The
#' vector must thus contain only whole numbers not smaller than \var{threshold}.
#' This parameter and the combination of parameters \var{mean} and \var{var}
#' are mutually exclusive.
#'
#' @param mean average number of observations per molecular family
#' \emph{computed over the unambiguously detected families}, i.e. over those
#' families which were observed at least \var{threshold} times. This parameter
#' and specifying an observation vector through parameter \var{x} are mutually
#' exclusive, and specifying \var{mean} and \var{var} instead of the full
#' observation vector is only possible for estimation method 'mom'.
#'
#' @param var variance of the number of observations per molecular
#' family, also \emph{computed over the unambiguously detected families},
#' i.e. over those families which were observed at least \var{threshold} times.
#' This parameter and specifying an observation vector through parameter \var{x}
#' are mutually exclusive, and specifying \var{mean} and \var{var} instead of
#' the full observation vector is only possible for estimation method 'mom'.
#'
#' @param n.umis the number of observed \acronym{UMI}s, used in the estimation
#' of \code{n.tot} (i.e. the total number of molecules/\acronym{UMI}s in the
#' original sample). See also the discussion of the parameter \code{loss}, and
#' the result value \code{n.tot}.
#'
#' @param method the estimation method to use, either 'mle' for \emph{maximum
#' likelihood estimation} or 'mom' for \emph{method of moments}. See Details.
#'
#' @param must.converge if set to \code{TRUE}, an error is reported if the
#' parameter estimation fails to converge. If \code{FALSE}, a warning is
#' reported instead.
#'
#' @inheritParams gwpcrpois
#'
#' @param loss an expression specifying how the loss, i.e. the percentage of
#' all molecules (or UMIs) that was not observed or was removed by the read
#' count threshold, is computed. In the simple case of each read count
#' observation representing a separate molecule, the default value \var{p0}
#' is correct -- the lost molecules are then simply those whose read count
#' lies below the specified \var{threshold}. In more complex scenarios, e.g.
#' if a single molecule produces a separate read count for each strand, which
#' are then either both rejected or both accepted, the additional rejection
#' cases must be considered by a custom loss expression.
#'
#' @param ctrl a list of settings controlling the estimation procedure.
#' Different estimation methods recognize different possible \var{ctrl}
#' settings, unrecognized settings are ignored without warning. See Details
#' for the settings relevant to each estimation method.
#'
#' @return A list containing the values
#'
#' \item{convergence}{flag indicating whether the estimation converged.
#' \var{0} indicates convergence.}
#'
#' \item{efficiency}{parameter estimate for \var{efficiency} (see
#' \link{gwpcrpois})}
#'
#' \item{lambda0}{parameter estimate for \var{lambda0} (see \link{gwpcrpois})}
#'
#' \item{p0}{probability of observing a read count less than the specified
#' \var{threshold}}
#'
#' \item{loss}{the estimation loss according to the specified loss expression,
#' i.e. percentage of molecules not observed or filtered out}
#'
#' \item{n.tot}{the estimated total number of molecules in the sample, i.e.
#' \code{n.umis / (1 - loss)}}
#'
#' \item{n.obs}{The length of the observation vector \var{x} used for parameter
#' estimation, or \code{NA} if \var{mean} and \var{var} were specified
#' directly.}
#'
#' \item{n.umis}{The number of observed molecules/\acronym{UMI}s specified in
#' the call to \code{gwpcrpois.est}}
#'
#' \item{threshold}{detection threshold specified in the call to
#' \code{gwpcrpois.est}}
#'
#' \item{molecules}{initial molecule count specified in the call to
#' \code{gwpcrpois.est}}
#'
#' @details The two available estimation methods, \emph{method of
#' moments} (\code{method='mom'}) and \emph{maximum likelihood}
#' (\code{method='mle'}) have different properties and accept different
#' \var{ctrl} parameters:
#' \describe{
#' \item{method of moments (\code{mom}):}{
#' For the (unrealistic) uncensored case, i.e. \var{threshold=0}, the
#' specified mean is the method-of-moments estimate for \var{lambda0}, and a
#' closed formula is used to compute \var{efficiency} from \var{mean} and
#' \var{var}. The \var{ctrl} argument is not used in this case.
#'
#' In case of a censored distribution, the sample mean is not a consistent
#' estimator for \var{lambda0} because the expectation of the censored
#' distribution is in general larger than \var{lambda0}. The sample variance
#' similarly deviates from the variance of the uncensored distribution.
#'
#' An iterative approach is used to find method-of-moment estimates in this
#' case. Initial estimates are computed as if \var{threshold} were zero. From
#' these the probability \var{pdetect} (i.e. \code{1-p0}) of detecting a particular family is
#' found, and used to correct for the biases in the sample mean and variance.
#' Then the parameter estimates are updated. This process continues until it
#' either converges or reaches the maximum allowed number of iterations. Both
#' termination criteria can be controlled via the \var{ctrl} parameter, which
#' is a list that can contain the following components: \describe{
#'
#' \item{maxit}{Maximum number of iterations. Defaults to 150.}
#'
#' \item{rel.tol}{Relative convergence tolerance. Applied to \var{efficiency},
#' \var{lambda0} and the detection probability \var{pdetect}. Defaults to
#' \code{1e-4}.}
#'
#' \item{abs.tol}{Absolute convergence tolerance. Only used as the tolerance
#' around zero, where the relative tolerance becomes meaningless. Defaults to
#' \code{1e-4}.}
#'
#' \item{trace}{Output estimates after each round}
#' }
#' }
#' \item{maximum likelihood (\code{mle}):}{
#' The parameters are estimated by maximizing the log-likelihood using
#' \code{\link{optim}}. The \var{ctrl} settings are passed through to
#' \code{\link{optim}}, except for \var{fnscale} and \var{parscale} which
#' are overwritten. The method of moments (method \code{'mom'}) estimate
#' is used as the starting point during likelihood optimization.
#' }
#' }
#'
#' @seealso \code{\link{gwpcrpois}}
#'
#' @export
gwpcrpois.est <- function(x=NULL, mean=NULL, var=NULL, n.umis=NULL, method="mom",
                          must.converge=TRUE, threshold=1, molecules=1, loss=expression(p0),
                          ctrl=list())
{
  # Validate arguments. Either a full observation vector x, or mean and var
  # (the latter only for method 'mom') must be supplied; the remaining
  # parameters control the estimation procedure.
  if (!is.null(x) && !is.numeric(x))
    stop('if specified, observation vector x must be a numeric vector')
  if (!is.null(mean) && (!is.numeric(mean) || (length(mean) != 1) || (mean < 0)))
    stop('if specified, mean must be a non-negative scalar')
  if (!is.null(var) && (!is.numeric(var) || (length(var) != 1) || (var < 0)))
    stop('if specified, var must be a non-negative scalar')
  # match.arg() already raises an error for anything other than 'mom' or
  # 'mle', so no additional check is required.
  method <- match.arg(method, c("mom", "mle"))
  if (!is.logical(must.converge) || (length(must.converge) != 1) || is.na(must.converge))
    stop('must.converge must be TRUE or FALSE')
  if (!is.numeric(threshold) || (length(threshold) != 1) || (threshold != floor(threshold)) || (threshold < 0))
    stop('threshold must be a non-negative integral scalar')
  if (!is.numeric(molecules) || (length(molecules) != 1) || (molecules != floor(molecules)) || (molecules < 1))
    stop('molecules must be a strictly positive integral scalar')
  if (!is.expression(loss) || (length(loss) != 1))
    stop('loss must be an expression')
  if (!is.list(ctrl))
    stop('ctrl must be a list')
  # Call either gwpcrpois.estimator.mom or gwpcrpois.estimator.mle
  r <- if (is.numeric(x)) {
    if (!is.null(mean) || !is.null(var))
      stop("either mean and var OR a vector of observations, but not both, must be specified")
    # x is a vector of per-UMI read counts (after threshold application).
    # NA/non-finite values must be tested *before* the floor() and threshold
    # comparisons -- those comparisons yield NA for NA inputs, which would
    # make the if() condition itself fail with an unrelated error instead of
    # reporting the intended message.
    if (any(is.na(x)) || any(!is.finite(x)) || any(x != floor(x)) || any(x < threshold))
      stop("observations in vector x must be whole numbers >= threshold")
    if (method == "mom")
      gwpcrpois.estimator.mom(mean=mean(x), var=var(x), threshold=threshold,
                              molecules=molecules, ctrl=ctrl)
    else if (method == "mle")
      gwpcrpois.estimator.mle(x, threshold=threshold, molecules=molecules, ctrl=ctrl)
  } else if (is.null(x) && is.numeric(mean) && is.numeric(var)) {
    if (method != "mom")
      stop("if mean and var instead of the full observation vector is specified, only method 'mom' is supported")
    gwpcrpois.estimator.mom(mean=mean, var=var, threshold=threshold,
                            molecules=molecules, ctrl=ctrl)
  } else {
    stop("either mean and var or a vector of observations must be specified")
  }
  # Handle non-convergence: raise an error or merely warn, depending on
  # must.converge. The message names the method actually used (the previous
  # message always claimed 'mom' even for method 'mle').
  if (r$convergence != 0) {
    if (must.converge)
      stop("gwpcrpois.est (method '", method, "') did not converge")
    else
      warning("gwpcrpois.est (method '", method, "') did not converge, returning best estimate so far")
  }
  # Evaluate the user-supplied loss expression with the estimator's result
  # list as environment, so it can refer to p0, efficiency, lambda0, etc.
  # Evaluation failures produce a warning and an NA loss.
  r$loss <- tryCatch(eval(loss, envir=r, enclos=parent.frame()),
                     error=function(e) { warning(conditionMessage(e)) ; NA})
  # Set n.obs, n.umis and n.tot = n.umis / (1 - loss)
  r$n.obs <- if (!is.null(x)) length(x) else NA
  r$n.umis <- if (!is.null(n.umis)) n.umis else if (!is.null(x)) length(x) else NA
  r$n.tot <- r$n.umis / (1 - r$loss)
  return(r)
}
#' Group-wise Parameter Estimation for PCR-Poisson Mixture
#'
#' Estimates the parameters \var{efficiency} and \var{lambda0} separately for each
#' \emph{group} of UMIs (i.e., for example separately for each gene, each exon, each
#' cell, ...). To reduce the noise of the raw group-specific parameters, the raw
#' group-specific parameters are \emph{shrunken} towards global estimates for
#' \var{efficiency} and \var{lambda0} that are specified
#'
#' @param formula Formula of the form \code{reads ~ key1 + key2 + ...},
#' where \var{reads} is the column containing the per-UMI read count,
#' and \var{key1}, \var{key2}, ... are the column(s) that uniquely identify
#' a group. If multiple read counts are observed per UMI (for example if a
#' protocol that yields strand-specific counts for the two strands of double-
#' stranded molecules is used), \code{c(reads1, reads2, ...)} can be used as
#' the left hand side to combine read counts from multiple columns. Note that
#' in this case, the default \var{loss} expression is probably not appropriate.
#' All the read counts in the columns listed in \var{reads} must be greater or
#' equal than \var{threshold}.
#'
#' @param data a \code{\link{data.frame}} or \code{\link{data.table}} with one
#' row per observed UMI. The required columns are determined by the \var{formula}.
#'
#' @param method the estimation method to use, either 'mle' for \emph{maximum
#' likelihood estimation} or 'mom' for \emph{method of moments}. See Details.
#'
#' @inheritParams gwpcrpois
#'
#' @param loss an expression specifying how the loss, i.e. the percentage of
#' all molecules (or UMIs) that was not observed or was removed by the read
#' count threshold, is computed. In the simple case of each read count
#' observation representing a separate molecule, the default value \var{p0}
#' is correct -- the lost molecules are then simply those whose read count
#' lies below the specified \var{threshold}. In more complex scenarios, e.g.
#' if a single molecule produces a separate read count for each strand, which
#' are then either both rejected or both accepted, the additional rejection
#' cases must be considered by a custom loss expression.
#'
#' @param ctrl a list of settings controlling the estimation procedure.
#' Different estimation methods recognize different possible \var{ctrl}
#' settings, unrecognized settings are ignored without warning. See
#' \code{\link{gwpcrpois.est}} for the \var{ctrl} settings affecting the raw
#' group-specific estimates, and the Details section for settings affecting
#' the shrinkage of those estimates.
#'
#' @details Apart from the \var{ctrl} parameters described in
#' \code{\link{gwpcrpois.est}}, the following settings can be modified
#' \describe{
#' \item{cores}{number of CPU cores to use to compute group-specific estimates.
#' If set to a value greater than 1, the \code{parallel} package must be
#' loaded.}
#'
#' \item{obs.min.ingroup}{how many observed per-UMI read counts a group must
#' contain for group-specific estimates to be computed and used. Default is 5.}
#'
#' \item{use.nonconv.globalest}{whether to use global estimates that didn't
#' fully converge (i.e. where \code{\link{gwpcrpois.est}} reported a value other
#' than \code{0} for \var{convergence}). Default is \code{FALSE}.}
#'
#' \item{use.nonconv.groupest}{whether to use group-specific estimates
#' that didn't fully converge (i.e. where \code{\link{gwpcrpois.est}} reported
#' a value other than \code{0} for \var{convergence}). Default is \code{FALSE}.}
#'
#' \item{include.mean.var}{whether to include columns with the group-wise
#' global mean and variance of readcounts in the output table. Default is
#' \code{FALSE}.}
#'
#' \item{verbose}{whether to output progress and diagnostic messages, default
#' is \code{FALSE}.}
#' }
#'
#' @return a \code{\linkS4class{data.table}} containing one row per group, and
#' which besides the group key column(s) as indicated by the \var{formula},
#' contains the following columns
#'
#' \item{efficiency}{the final group-specific estimate of the PCR efficiency}
#'
#' \item{lambda0}{the final group-specific estimate of the number of reads per
#' molecule}
#'
#' \item{loss}{the final group-specific estimate of the fraction of lost
#' molecules.}
#'
#' \item{n.tot}{the final group-specific estimate of the total (loss-corrected)
#' number of molecules.}
#'
#' @seealso \code{\link{gwpcrpois}}
#'
#' @export
gwpcrpois.groupest <- function(formula, data, method="mom", threshold=1, molecules=1,
                               loss=expression(p0), ctrl=list())
{
  # Validate ctrl *before* reading any settings from it
  if (!is.list(ctrl))
    stop('ctrl must be a list')
  # Get control parameters
  verbose <- as.logical(ctrl.get("verbose", FALSE))
  use.nonconv.globalest <- as.logical(ctrl.get("use.nonconv.globalest", FALSE))
  include.mean.var <- as.logical(ctrl.get("include.mean.var", FALSE))
  # inherits() is the correct class test here -- class() may return a vector
  # of length > 1, which breaks a direct `==` comparison inside if()
  if (!inherits(formula, "formula"))
    stop("formula must be a 'formula', see help(formula)")
  data <- as.data.table(data)
  # match.arg() already raises an error for anything other than 'mom' or 'mle'
  method <- match.arg(method, c("mom", "mle"))
  if (!is.numeric(threshold) || (length(threshold) != 1) || (threshold != floor(threshold)) || (threshold < 0))
    stop('threshold must be a non-negative integral scalar')
  if (!is.numeric(molecules) || (length(molecules) != 1) || (molecules != floor(molecules)) || (molecules < 1))
    stop('molecules must be a strictly positive integral scalar')
  if (!is.expression(loss) || (length(loss) != 1))
    stop('loss must be an expression')
  # Convert to "frame", i.e. a data.table with group key columns, "n.umis" and "count".
  frame <- gwpcrpois.groupest.frame(formula, data)
  if (nrow(frame[!is.finite(count) | (count < threshold)]) > 0)
    stop("data contains non-finite counts or counts below the threshold")
  # Compute global parameter estimates
  if (verbose)
    message("Computing overall parameter estimates")
  count.mean <- frame[, mean(count)]
  count.var <- frame[, var(count)]
  m.all <- gwpcrpois.est(frame$count, method=method, must.converge=!use.nonconv.globalest,
                         loss=loss, threshold=threshold, molecules=molecules, ctrl=ctrl)
  # Group data, add global estimate as columns for convenience
  frame.grp <- frame[, list(dummy=1), keyby=key(frame)][, dummy := NULL]
  if (include.mean.var)
    # Re-use the global mean/variance computed above instead of re-computing them
    frame.grp[, c("mean.all", "var.all") := list(count.mean, count.var) ]
  frame.grp[, c("efficiency.all", "lambda0.all", "loss.all") :=
              list(m.all$efficiency, m.all$lambda0, m.all$loss) ]
  # Estimate raw group-wise parameters efficiency.raw, lambda0.raw, loss.raw
  frame.grp <- gwpcrpois.groupest.raw(frame, frame.grp, method, threshold=threshold,
                                      molecules=molecules, loss=loss, ctrl)
  # Estimate between-groups and estimator variances
  frame.grp <- gwpcrpois.groupest.var(frame.grp, ctrl)
  # Shrink raw estimates towards the global estimates
  frame.grp <- gwpcrpois.groupest.shrink(frame.grp, ctrl)
  # Correct for unobserved UMIs, i.e. compute n.tot = n.umis / (1 - loss)
  if (verbose)
    message("Computing n.tot")
  frame.grp[, n.tot := ifelse(is.finite(loss), n.umis / (1 - loss), NA) ]
  return(frame.grp)
}
# ***************************************************************************************
# Method of Moments (MoM) Estimator
# ***************************************************************************************
# Method-of-moments estimator for the PCR-Poisson mixture parameters.
#
# Arguments:
#   mean, var  - sample mean and variance of the observed (possibly censored)
#                per-family read counts
#   threshold  - detection threshold; counts < threshold are unobservable
#   molecules  - initial number of molecules per family
#   ctrl       - list of control settings: rel.tol, abs.tol, maxit, trace,
#                and optionally initial (a list with p0, lambda0, efficiency)
#
# Returns a list with lambda0, efficiency, p0, convergence (0 = converged),
# threshold and molecules.
gwpcrpois.estimator.mom <- function(mean, var, threshold, molecules, ctrl) {
  # If the threshold is zero a closed-form solution exists; otherwise we
  # resort to an iterative numerical solution
  exact.solution <- (threshold == 0)
  # Look up a setting in ctrl, falling back to the given default if unset
  ctrl.get <- function(key, default) {
    r <- ctrl[[key]]
    if (!is.null(r))
      r
    else
      default
  }
  rel.tol <- as.numeric(ctrl.get("rel.tol", 1e-4))
  abs.tol <- as.numeric(ctrl.get("abs.tol", 1e-4))
  maxit <- as.integer(ctrl.get("maxit", 150))
  trace <- as.logical(ctrl.get("trace", FALSE))
  initial <- as.list(ctrl.get("initial", NULL))
  # Convergence test: values with |new| < abs.tol count as converged (a
  # relative tolerance is meaningless around zero); otherwise the relative
  # change from old to new must be below rel.tol
  within.tol <- function(old, new) {
    if (abs(new) < abs.tol)
      return(TRUE)
    else if (abs((new - old) / new) < rel.tol)
      return(TRUE)
    else
      return(FALSE)
  }
  # A random variable C distributed according to the PCR-Poisson mixture
  # with parameters E (efficiency) and lambda0 has mean
  #   E( C | E, lambda0 ) = lambda0,
  # and variance
  #   V( C | E, lambda0 ) = lambda0 + lambda0^2 * V( L | E ),
  # where L is distributed according to the PCR distribution with efficiency E.
  # A method-of-moments estimator for the parameters of C is thus to set
  #   lambda0 = Avg(C),
  # and to set
  #   v' = ( Var(C) - lambda0 ) / lambda0^2
  # and find E such that
  #   V( L | E ) = v',
  # which is what gwpcr.sd.inv does
  if (exact.solution || (length(initial) == 0)) {
    # Uncensored initial estimates as derived above; the max(..., 0) guards
    # against a negative variance excess, for which gwpcr.sd.inv's argument
    # would be NaN
    pdetect <- 1
    lambda0 <- mean
    efficiency <- gwpcr.sd.inv(sqrt(max((var - lambda0) / ( lambda0^2 ), 0)),
                               molecules=molecules)
  } else {
    # If the user specified an initial value, use that
    pdetect <- 1 - initial$p0
    lambda0 <- initial$lambda0
    efficiency <- initial$efficiency
  }
  # If the data is censored, i.e. if only counts >= threshold > 0 are
  # observed, the sample mean will over-estimate lambda0, and the sample
  # variance also won't reflect the uncensored distribution's variance.
  #
  # Let c_m be the expected sampling mean, i.e.
  #   c_m = E ( C | C >= TH, E, lambda0 ),
  # and v_m the expected sampling variance, i.e.
  #   v_m = V ( C | C >= TH, E, lambda0 ),
  # and set:
  #   m_0 = Sum c   * P( C=c | E, lambda0 ) for 0 <= c < TH,
  #   v_0 = Sum c^2 * P( C=c | E, lambda0 ) for 0 <= c < TH,
  #   p_0 = Sum       P( C=c | E, lambda0 ) for 0 <= c < TH,
  # then the following relationship holds:
  #   (A) E( C | E, lambda0 ) = lambda_0 = m_0 + (1 - p_0) * c_m,
  #   (B) V( C | E, lambda0 ) = v = v_0 + (1 - p_0) * ( v_m + c_m^2 ) - lambda_0^2.
  # Together with the relationship between v and E from above, i.e.
  #   (C) V( L | E ) = ( v - lambda0 ) / lambda0^2
  # this yields a system of equations in E and lambda with parameters
  # c_m, v_m and TH. The code below solves this iteratively by starting with
  # the estimates for lambda0 and E for TH=0 from above. (A) is then used
  # to update lambda0, (B) to compute a new v, and (C) to find the updated (E).
  #
  # Experiments showed that it is beneficial to use the updated lambda0 when
  # evaluating (B) and (C). However, for performance reasons, m_0, v_0 and p_0
  # are not re-evaluated immediately after updating lambda0; instead the
  # old values are used to update the efficiency. The values ARE then
  # re-computed during the next iteration.
  if (!exact.solution) {
    stop <- FALSE
    i <- 0
    if (trace)
      print(cbind(iteration=i, efficiency=efficiency,
                  lambda0=lambda0, pdetect=pdetect))
    while (!stop) {
      # Probability mass of the censored counts 0 .. threshold-1 under the
      # current parameter estimates (evaluated with threshold=0, i.e. the
      # uncensored distribution)
      x <- seq(from=0, to=threshold-1, by=1)
      d <- dgwpcrpois(x, efficiency=efficiency, lambda0=lambda0,
                      threshold=0, molecules=molecules)
      # Update detection probability 1-p_0, then lambda0 via (A), the
      # uncensored variance v via (B), and the efficiency via (C)
      pdetect.p <- (1 - sum(d))
      lambda0.p <- sum(x * d) + pdetect.p * mean
      v <- sum(x^2 * d) + pdetect.p * (var + mean^2) - lambda0.p^2
      efficiency.p <- gwpcr.sd.inv(sqrt(max((v - lambda0.p) / ( lambda0.p^2 ), 0)),
                                   molecules=molecules)
      i <- i + 1
      # Stop when all three quantities changed less than the tolerance, or
      # when the iteration limit is reached (convergence=1 signals failure)
      if (within.tol(lambda0, lambda0.p) && within.tol(efficiency, efficiency.p) &&
          within.tol(pdetect, pdetect.p)) {
        stop <- TRUE
        convergence <- 0
      } else if (i >= maxit) {
        stop <- TRUE
        convergence <- 1
      }
      lambda0 <- lambda0.p
      efficiency <- efficiency.p
      pdetect <- pdetect.p
      if (trace)
        print(cbind(iteration=i, efficiency=efficiency,
                    lambda0=lambda0, pdetect=pdetect))
    }
  } else {
    # Exact (uncensored) solution: every family is detected, no iteration
    pdetect <- 1
    convergence <- 0
  }
  if (trace)
    cat(paste0("Convergence: ", if (convergence == 0) "Yes" else "No"))
  # Return results
  return(list(lambda0=lambda0, efficiency=efficiency, p0=1-pdetect,
              convergence=convergence, threshold=threshold, molecules=molecules))
}
# ***************************************************************************************
# Maximum Likelihood (ML) Estimator
# ***************************************************************************************
# Maximum-likelihood estimator for the PCR-Poisson mixture parameters.
#
# Arguments:
#   c          - vector of observed per-family read counts (all >= threshold)
#   threshold  - detection threshold; counts < threshold are unobservable
#   molecules  - initial number of molecules per family
#   ctrl       - settings passed through to optim()'s control list (fnscale
#                and parscale are overwritten below); the entry 'mom' is
#                forwarded to the method-of-moments starting-point estimator
#
# Returns a list with lambda0, efficiency, p0, convergence (optim's
# convergence code, 0 = converged), threshold and molecules.
#
# Note: the parameter name 'c' shadows base::c only for value lookup; R's
# function-call lookup skips non-function bindings, so calls like c(0.9, 0.9)
# below still resolve to base::c.
gwpcrpois.estimator.mle <- function(c, threshold, molecules, ctrl) {
  # Since evaluating the PCR-Poisson mixture is slow, we optimize by evaluating it only
  # once for each unique observed count, and multiplying the log-likelihood with the
  # number of times that count was observed.
  v <- rle(sort(c))
  # Use method of moments estimates as initial parameters. Since we do the parameter
  # search with clamp.efficiency set to FALSE, we must take care to clamp it to the
  # range of efficiencies found in GWPCR here.
  # NOTE(review): ctrl.get is not defined locally here; presumably a
  # package-level helper that reads ctrl from the calling frame -- confirm
  mom <- gwpcrpois.estimator.mom(mean=mean(c), var=var(c), ctrl=ctrl.get('mom', list()),
                                 molecules=molecules, threshold=threshold)
  mom$efficiency <- pmin(pmax(GWPCR$efficiency[1], mom$efficiency), tail(GWPCR$efficiency,1))
  # Optimize likelihood
  # We set parscale such that on the par/parscale scale that optim uses, a
  # step of minus one away from the initial parameters corresponds to a decrease
  # of 10%. fnscale is set such that on the fn/fnscale scale of optim, such a step
  # corresponds approximately to a unit change of the goal function. Remember that
  # optim minimizes, so to maximize fnscale contains an additional factor "-1".
  logl <- function(p) {
    e <- p['efficiency']
    l <- p['lambda0']
    # If parameters are valid, compute log-likelihood, otherwise
    # return NA.
    if ((e >= 0) && (e <= 1) && (l > 0))
      sum(log(dgwpcrpois(c=v$values, efficiency=e, lambda0=l,
                         threshold=threshold, molecules=molecules)) * v$lengths)
    else
      as.numeric(NA)
  }
  p.init <- c(efficiency=mom$efficiency, lambda0=mom$lambda0)
  ctrl$fnscale <- -abs(logl(p.init) - logl(p.init * c(0.9, 0.9)))
  ctrl$parscale <- c(efficiency=p.init['efficiency']/10, lambda0=p.init['lambda0']/10)
  r <- optim(par=p.init, fn=logl, method="Nelder-Mead", control=ctrl)
  # Compute p0: the probability mass below the detection threshold under the
  # fitted (uncensored) distribution; zero when nothing is censored
  p0 <- if (threshold > 0)
    pgwpcrpois(threshold-1, efficiency=r$par['efficiency'], lambda0=r$par['lambda0'],
               threshold=0, molecules=molecules)
  else
    0
  # Return result (as.vector strips the names optim left on the parameters)
  list(lambda0=as.vector(r$par['lambda0']),
       efficiency=as.vector(r$par['efficiency']),
       p0=p0,
       convergence=r$convergence,
       threshold=threshold,
       molecules=molecules)
}
# ***************************************************************************************
# Group-wise Shrinkage Estimator
# ***************************************************************************************
# Data Normalization
# ---------------------------------------------------------------------------------------
gwpcrpois.groupest.frame <- function(formula, data) {
  # Normalize the input into a data.table with a single "count" column plus
  # the key column(s) defining the groups (e.g. the gene name; there can be
  # several, e.g. sample and gene), and "n.umis", the number of input rows
  # (i.e. UMIs) per group. If the LHS of the formula is an expression like
  # c(col1, col2), each input row contributes multiple count rows, which
  # allows read counts spread over several columns (e.g. plus- and
  # minus-strand counts) to be treated together.
  formula.terms <- terms(formula)
  key.cols <- labels(formula.terms)
  # attr(, "variables") is a call of the form list(lhs, rhs1, ...); its first
  # element is the symbol `list`, so the response sits at index response + 1
  response.expr <- attr(formula.terms, "variables")[[1 + attr(formula.terms, "response")]]
  data[, list(n.umis=.N, count=eval(response.expr)), keyby=key.cols]
}
# Raw Groupwise Estimator
# ---------------------------------------------------------------------------------------
# Raw (unshrunken) group-wise parameter estimation.
#
# Arguments:
#   frame     - data.table with group key column(s), "n.umis" and "count"
#               (one row per observation), as produced by
#               gwpcrpois.groupest.frame
#   frame.grp - keyed data.table with one row per group, to which the raw
#               estimate columns are added
#   method, threshold, molecules, loss - passed through to gwpcrpois.est
#   ctrl      - control settings: cores, use.nonconv.groupest,
#               obs.min.ingroup, include.mean.var, verbose
#
# Returns frame.grp extended by n.umis, n.obs, efficiency.raw, lambda0.raw
# and loss.raw (plus mean.raw/var.raw if include.mean.var is set).
gwpcrpois.groupest.raw <- function(frame, frame.grp, method, threshold, molecules, loss, ctrl)
{
  # Get control parameters
  # NOTE(review): ctrl.get is not defined in this function; presumably a
  # package-level helper that reads ctrl from the calling frame -- confirm
  cores <- as.numeric(ctrl.get("cores", 1))
  # Fit groups in parallel via parallel::mclapply if more than one core is
  # requested, otherwise use plain lapply
  my.lapply <- if (cores > 1) {
    function(...) { mclapply(..., mc.cores=cores, mc.preschedule=TRUE, mc.cleanup=TRUE) }
  } else {
    lapply
  }
  use.nonconv.groupest <- as.logical(ctrl.get("use.nonconv.groupest", FALSE))
  obs.min.ingroup <- as.integer(ctrl.get("obs.min.ingroup", 5))
  include.mean.var <- as.logical(ctrl.get("include.mean.var", FALSE))
  verbose <- as.logical(ctrl.get("verbose", FALSE))
  # Find columns of class "factor" in frame (needed for the level-pruning
  # workaround before rbindlist, see below)
  factor.columns <- Filter(function(n) { is.factor(frame[[n]]) }, colnames(frame))
  # Compute raw (group-wise) parameter estimates
  if (verbose)
    message("Computing group-wise parameter estimates on ", cores, " cores")
  group.key <- key(frame.grp)
  # Build one list element per group holding that group's key values; these
  # serve both as loop elements and as data.table row selectors below
  groups <- do.call(mapply, c(list(list), frame.grp[, ..group.key], list(SIMPLIFY=FALSE)))
  # Placeholder result used when a group is too small or its fit fails
  model.na <- list(lambda0=NA, efficiency=NA, p0=NA, loss=NA)
  frame.grp <- rbindlist(my.lapply(groups, function(k) {
    # Fetch observations for group
    r <- frame.grp[k,]
    stopifnot(nrow(r) == 1)
    # Fit model; estimation failures are reported on stderr and yield NA
    # estimates (model.na) instead of aborting the whole computation
    obs <- frame[k, count]
    m <- if (length(obs) >= obs.min.ingroup)
      tryCatch(gwpcrpois.est(obs, must.converge=!use.nonconv.groupest, loss=loss,
                             threshold=threshold, molecules=molecules, ctrl=ctrl),
               error=function(e) {
                 cat(paste0("Failed to estimate parameters for group (",
                            paste0(lapply(k, as.character), collapse=","),
                            "): ", conditionMessage(e), "\n"), file = stderr())
                 model.na })
    else
      model.na
    # Backwards-compatibility hack: Insert raw group mean and variance for method of moments
    if (include.mean.var)
      r[, c("mean.raw", "var.raw") := list(mean(obs), var(obs)) ]
    # Generate row
    r[, c("n.umis", "n.obs", "efficiency.raw", "lambda0.raw", "loss.raw") :=
        list(frame[k, n.umis[1] ], length(obs), m$efficiency, m$lambda0, m$loss)]
    # Remove superfluous labels from factor columns. Not doing so causes problems
    # in rbindlist -- it seems that it concatenates *all* the levels together first,
    # without checking for duplicates, and bails out if the resulting vector
    # is longer than 2^31 entries.
    if (length(factor.columns) > 0)
      r[, (factor.columns) := lapply(factor.columns, function(c) { factor(eval(as.name(c))) } )]
    # Collect rows
    r
  }))
  setkeyv(frame.grp, group.key)
  # Return grouped frame
  return(frame.grp)
}
# Variance Estimator
# ---------------------------------------------------------------------------------------
# Estimation of between-group and estimator (within-group) variances for the
# raw group-wise estimates of loss, efficiency and lambda0.
#
# Arguments:
#   frame.grp - data.table with one row per group containing loss.raw,
#               efficiency.raw, lambda0.raw and n.obs columns
#   ctrl      - control settings: verbose, var.est.distfree
#
# For each quantity X, columns X.grp.var (between-group variance v_g) and
# X.raw.var (estimator variance v_e / n.obs) are added to frame.grp, which is
# modified by reference and also returned.
gwpcrpois.groupest.var <- function(frame.grp, ctrl)
{
  # NOTE(review): ctrl.get is not defined in this function; presumably a
  # package-level helper that reads ctrl from the calling frame -- confirm
  verbose <- as.logical(ctrl.get("verbose", FALSE))
  var.est.distfree <- as.logical(ctrl.get("var.est.distfree", TRUE))
  # Estimate variance of loss^raw (and also lambda0^raw, efficiency^raw) as a function
  # of the number of observed UMIs n.
  #
  # This is the distribution-free version of the estimator. We assume the mean loss is
  # independent from n^obs. For a fixed n^obs, the expected value of (loss^raw - m)^2
  # is then v_g + v_e / n^obs. We find v_g and v_e by minimizing the squared deviation
  # of the (single-point) variance estimate (loss^raw - m)^2 and the predictor. Since
  # variances are generally bigger for small n^obs, minimizing an unweighted sum of
  # square deviations would allow loss estimates for small n^obs to influence the
  # estimation unduly strongly. We correct for this by weighting the squared deviations
  # with n^obs (which matches the expected drop in scale), and thus minimize the sum
  #
  #                                            2
  #   /                                      \
  #   | (loss^raw - m)^2 - v_g - v_e / n^obs |  * w(n^obs), where w(n) = n / (1 + n/W)
  #   \                                      /
  #
  # over all genes. m is the sample mean over all genes. We start the numerical
  # search with v_g = v_e = v/2, where v is the sampling variance of the loss.
  variance.estimates.distfree <- function(x.name) {
    # Parameter W controls the growth behaviour of weights. For small n.obs, the weights
    # equal n.obs, but as n.obs grows, weights eventually converge to W.
    W <- 100
    if (verbose)
      message("Estimating variances of ", x.name, " using the distribution-free algorithm")
    x <- parse(text=paste0(x.name, ".raw"))
    # Find least-squared estimates for v_g, v_e.
    # Note: inside the j-expression, x is first used (via eval) for the
    # squared deviations y, and only afterwards re-bound to the design matrix
    # passed to lsfit -- the order of these two statements matters.
    r <- frame.grp[is.finite(eval(x)) & (n.obs > 0), {
      y <- (eval(x) - mean(eval(x), na.rm=TRUE))^2
      x <- cbind(v_g=rep(1, .N), v_e=1/n.obs)
      w <- n.obs / (1 + n.obs / W)
      lsfit(x, y, wt=w, intercept=FALSE)$coefficients
    } ]
    if (all(r < 0)) {
      stop("both variance estimators are negative")
    } else if (r["v_g"] < 0) {
      warning("variance of ", x.name, " between groups estimated to be negative, setting to zero")
      # Between-gene variance estimate is negative. Assume between-gene variance
      # is negligible, and estimate only the estimator variance.
      r <- c(v_g=0, frame.grp[is.finite(eval(x)) & (n.obs > 0), {
        y <- (eval(x) - mean(eval(x), na.rm=TRUE))^2
        x <- cbind(v_e=1/n.obs)
        w <- n.obs
        lsfit(x, y, wt=w, intercept=FALSE)$coefficients["v_e"]
      } ])
    } else if (r["v_e"] < 0) {
      warning("estimator variance of ", x.name, " within groups estimated to be negative, setting to zero")
      # Estimate of estimator variance is negative. Assume estimator variance is
      # negligible, and estimate only the between-gene variance.
      r <- c(v_g=frame.grp[is.finite(eval(x)) & (n.obs > 0), var(eval(x), na.rm=TRUE)], v_e=0)
    }
    if (any(r < 0))
      stop("some variance estimators are negative")
    # Add columns (per-group estimator variance scales as v_e / n.obs)
    frame.grp[, paste0(x.name, ".grp.var") := r["v_g"]]
    frame.grp[, paste0(x.name, ".raw.var") := r["v_e"] / n.obs]
  }
  # Estimate variance of p_0^raw (and also lambda0^raw, efficiency^raw) as a function
  # of the number of observed UMIs n.
  #
  # We assume that p_0^raw | n ~ Beta( a(n), b(n) ), with a fixed expectation m and
  # n-dependent variance comprising a estimation error that decreases with 1/n and a
  # constant part that reflects the variance of the loss between groups,
  #
  #      raw
  #  V( p    | n) = v(n) = v  + v  / n.
  #      0                  g    e
  #
  # From the mean (assumed constant) and variance of the Beta distribution it follows that
  #
  #                                                         m * (1 - m)
  #  a(n) = m * f(n), b(n) = (1-m) * f(n)  where  f(n) =  ------------- - 1.
  #                                                              v
  #                                                               e
  #                                                        v  + ---
  #                                                         g    n
  #
  # For m we used the sample mean over all genes, and for v_g and v_e ML estimates
  # found using numerical optimization. We start the numerical search with
  # v_g = v_e = v/2, where v is the sampling variance of the loss over all genes (limited
  # to the possible range (0, m * (1 - m) ).
  #
  # efficiency^raw is handled similarly. For lambda0^raw, the beta distribution
  # is replaced by a normal distribution.
  variance.estimates.beta <- function(x.name) {
    if (verbose)
      message("Estimating variances of ", x.name, " assuming a beta distribution")
    x <- parse(text=paste0(x.name, ".raw"))
    # For the mean use the sampling mean, since we assume it's independent of n
    m <- frame.grp[, mean(eval(x), na.rm=TRUE) ]
    # Compute quantity to minimize, i.e. negative log-likelihood.
    # We contract "x" a bit towards 0.5 to ensure that all values are strictly
    # greater than 0 and less than 1 -- otherwise, the log-likelihood becomes -Inf.
    # M is the contraction factor, i.e. we move X to the interval [M/2, 1-M/2]
    M <- 1e-6
    logl <- function(p) {
      # Reject invalid parameters. Note that the beta variance is always <= m * (1-m).
      if (any(p <= 0) || (p["v_g"] >= m * (1-m)))
        return(NA)
      # Evaluate likelihoods. We restrict shape1,2 to >= 1 to ensure monomodality
      -sum(frame.grp[is.finite(eval(x)) & (n.obs > 0), {
        f <- pmax(m * (1-m) / (p["v_g"] + p["v_e"] / n.obs) - 1, 1/m, 1/(1-m))
        dbeta(M/2 + eval(x)*(1-M), shape1=m*f, shape2=(1-m)*f, log=TRUE)
      }])
    }
    # Optimize v_g and v_e. Initially, we split the total observed variance evenly
    # between v_g and v_e / n.obs for the smallest positive group size n.obs.
    v <- frame.grp[, min(var(eval(x), na.rm=TRUE), m * (1-m) ) ]
    n.min <- frame.grp[n.obs > 0, min(n.obs) ]
    r <- optim(fn=logl, par=c(v_g=v/2, v_e=n.min*v/2), method="Nelder-Mead")
    # Correct variances for the previous contraction
    v_g <- r$par["v_g"] / (1-M)^2
    v_e <- r$par["v_e"] / (1-M)^2
    # Add columns
    frame.grp[, paste0(x.name, ".raw.var") := v_e / n.obs]
    frame.grp[, paste0(x.name, ".grp.var") := v_g]
  }
  # Like variance.estimates.beta, but assumes a normal distribution for the
  # raw estimates (used for lambda0, which is not confined to [0, 1])
  variance.estimates.normal <- function(x.name) {
    if (verbose)
      message("Estimating variances of ", x.name, " assuming a normal distribution")
    x <- parse(text=paste0(x.name, ".raw"))
    m <- frame.grp[, mean(eval(x), na.rm=TRUE) ]
    v <- frame.grp[, var(eval(x), na.rm=TRUE) ]
    # Compute quantity to minimize, i.e. negative log-likelihood
    logl <- function(p) {
      # Reject invalid parameters
      if (any(p < 0))
        return(NA)
      # Evaluate likelihoods
      -sum(frame.grp[is.finite(eval(x)) & (n.obs > 0),
                     dnorm(eval(x), mean=m, sd=sqrt(p["v_g"] + p["v_e"] / n.obs), log=TRUE)
                     ])
    }
    r <- optim(fn=logl, par=c(v_g=v/2, v_e=v/2), method="Nelder-Mead")
    # Add columns
    frame.grp[, paste0(x.name, ".raw.var") := r$par["v_e"] / n.obs ]
    frame.grp[, paste0(x.name, ".grp.var") := r$par["v_g"] ]
  }
  # Dispatch on the selected algorithm; the distribution-free estimator is
  # the default (var.est.distfree defaults to TRUE above)
  if (var.est.distfree) {
    variance.estimates.distfree("loss")
    variance.estimates.distfree("efficiency")
    variance.estimates.distfree("lambda0")
  } else {
    variance.estimates.beta("loss")
    variance.estimates.beta("efficiency")
    variance.estimates.normal("lambda0")
  }
  return(frame.grp)
}
# Shrinkage Estimator
# ---------------------------------------------------------------------------------------
gwpcrpois.groupest.shrink <- function(frame.grp, ctrl)
{
  # NOTE(review): ctrl.get is presumably a package-level helper reading ctrl
  # from the calling frame -- confirm
  verbose <- as.logical(ctrl.get("verbose", FALSE))
  # Shrink the raw group-wise estimates of loss, efficiency and lambda0
  # towards the corresponding global estimates. Each final estimate is the
  # average of the global estimate X.all and the raw group estimate X.raw,
  # weighted with the *other* component's variance (X.raw.var respectively
  # X.grp.var), i.e.
  #
  #        X.all * X.raw.var + X.raw * X.grp.var
  #   X = ---------------------------------------
  #               X.raw.var + X.grp.var
  #
  # Groups whose shrunken estimate is not finite (e.g. because the raw
  # estimate is NA) fall back to the global estimate.
  if (verbose)
    message("Computing final parameter estimates")
  for (q in c("loss", "efficiency", "lambda0")) {
    q.all <- paste0(q, ".all")
    q.raw <- paste0(q, ".raw")
    q.raw.var <- paste0(q, ".raw.var")
    q.grp.var <- paste0(q, ".grp.var")
    frame.grp[, (q) := (get(q.all) * get(q.raw.var) + get(q.raw) * get(q.grp.var)) /
                       (get(q.raw.var) + get(q.grp.var)) ]
    # If the local estimate is NA, use the global one
    frame.grp[!is.finite(get(q)), (q) := get(q.all) ]
  }
  return(frame.grp)
}