R/meanVarCurve.R

Defines functions setPriorDf estimatePriorDf scaleMeanVarCurve estimateD0 inv.trigamma

Documented in estimateD0 estimatePriorDf inv.trigamma scaleMeanVarCurve setPriorDf

# Functions in this file are for assessing the goodness of fit of
# mean-variance curves and adjusting them accordingly.
#
# Last update: 2021-09-09


#' Inversion of Trigamma Function
#'
#' \code{inv.trigamma} implements the Newton iteration for solving, given
#' \code{x}, the equation for \code{y}: \code{\link[base]{trigamma}(y) = x}.
#' See appendix of the \code{limma} paper (see "References") for a theoretical
#' deduction of the method.
#'
#' @param x A positive numeric scalar.
#' @param eps The required precision of the solution.
#' @return The solution, which is also a positive numeric scalar.
#' @references Smyth, G.K., \emph{Linear models and empirical bayes methods for
#'     assessing differential expression in microarray experiments.} Stat Appl
#'     Genet Mol Biol, 2004. \strong{3}: p. Article3.
#' @seealso \code{\link[base]{trigamma}} for the trigamma function.
#' @export
#' @examples
#' x <- trigamma(1:6)
#' vapply(x, inv.trigamma, numeric(1))
#'
inv.trigamma <- function(x, eps = 1e-8) {
    if (x > 1e7) return(1 / sqrt(x))
    if (x < 1e-7) return(0.5 + 1 / x)
    y <- 0.5 + 1 / x
    temp <- trigamma(y)
    delta <- temp * (1 - temp / x) / psigamma(y, 2)
    while ((-delta / y) > eps) {
        y <- y + delta
        temp <- trigamma(y)
        delta <- temp * (1 - temp / x) / psigamma(y, 2)
    }
    y + delta
}


#' Workhorse Function for Estimating Number of Prior Degrees of Freedom
#'
#' \code{estimateD0} underlies other interface functions for assessing
#' the goodness of fit of an unadjusted mean-variance curve (or a set of
#' unadjusted mean-variance curves).
#'
#' For each \code{\link{bioCond}} object with replicate samples, a vector of
#' FZ statistics can be deduced from the unadjusted mean-variance curve
#' associated with it. More specifically, for each genomic interval in a
#' \code{bioCond} with replicate samples, its FZ statistic is defined to be
#' \eqn{log(t_hat / v0)}, where \eqn{t_hat} is the observed variance of signal
#' intensities of the interval, and \eqn{v0} is the interval's prior variance
#' read from the corresponding mean-variance curve.
#'
#' Theoretically, each FZ statistic follows a scaled Fisher's Z distribution
#' plus a constant (since the mean-variance curve is not adjusted yet), and we
#' can use the sample variance (plus a constant) of the FZ statistics
#' of each single \code{bioCond} to get an estimate of
#' \eqn{trigamma(d0 / 2)},
#' where \eqn{d0} is the number of prior degrees of freedom
#' (see also \code{\link[base]{trigamma}}).
#'
#' The final estimate of \eqn{trigamma(d0 / 2)} is a weighted mean of estimates
#' across \code{bioCond} objects, with the weights being their respective
#' numbers of genomic intervals minus 1 that
#' are used to deduce the FZ statistics.
#' This should be appropriate, as Fisher's Z distribution is roughly normal
#' (see also "References"). The weighted mean is similar to the pooled sample
#' variance in an ANOVA analysis.
#'
#' Finally, an estimate of \eqn{d0} can be obtained by taking the inverse of
#' \eqn{trigamma} function, which is achieved by applying Newton iteration
#' to it. Note that \eqn{d0} is considered to be infinite if the estimated
#' \eqn{trigamma(d0 / 2)} is less than or equal to 0.
#'
#' @param z A list of which each element is a vector of FZ statistics
#'     corresponding to a \code{\link{bioCond}} object (see also "Details").
#' @param m A vector of numbers of replicates in \code{bioCond}
#'     objects. Must correspond to \code{z} one by one in the same
#'     order.
#' @return The estimated number of prior degrees of freedom. Note that the
#'     function returns \code{NA} if there are not sufficient genomic intervals
#'     for estimating it.
#' @references
#' Smyth, G.K., \emph{Linear models and empirical bayes methods for
#' assessing differential expression in microarray experiments.} Stat Appl
#' Genet Mol Biol, 2004. \strong{3}: p. Article3.
#'
#' Tu, S., et al., \emph{MAnorm2 for quantitatively comparing groups of
#' ChIP-seq samples.} Genome Res, 2021. \strong{31}(1): p. 131-145.
#'
#' @seealso \code{\link{bioCond}} for creating a \code{bioCond} object;
#'     \code{\link{fitMeanVarCurve}} for fitting a mean-variance curve;
#'     \code{\link{estimatePriorDf}} for an interface to estimating the
#'     number of prior degrees of freedom on \code{bioCond} objects;
#'     \code{\link{varRatio}} for a description of variance ratio factor;
#'     \code{\link{scaleMeanVarCurve}} for estimating the variance ratio factor
#'     for adjusting a mean-variance curve (or a set of curves).
#'
#'     \code{\link{estimateD0Robust}} and \code{\link{scaleMeanVarCurveRobust}}
#'     for estimating number of prior degrees of freedom and variance ratio
#'     factor \emph{in a robust manner}, respectively.
#' @importFrom stats var
estimateD0 <- function(z, m) {
    weights <- numeric(length(z))
    vals <- numeric(length(z))
    flag <- TRUE
    for (i in 1:length(z)) {
        x <- z[[i]]
        x <- x[!(is.na(x) | is.infinite(x))]
        if (length(x) < 2) next
        flag <- FALSE
        weights[i] <- length(x) - 1
        vals[i] <- var(x) - trigamma((m[i] - 1) / 2)
    }
    if (flag) return(NA_real_)

    # Since Fisher's Z distribution is roughly normal, we weight the estimates
    # by their respective numbers of degrees of freedom, similar to the pooled
    # sample variance for normal distribution.
    temp <- sum(vals * weights) / sum(weights)
    if (temp <= 0) {
        d0 <- Inf
    } else {
        d0 <- inv.trigamma(temp) * 2
    }
    d0
}


#' Scale a Mean-Variance Curve
#'
#' \code{scaleMeanVarCurve} underlies other interface functions for estimating
#' the variance ratio factor of an unadjusted mean-variance curve (or a set of
#' unadjusted mean-variance curves).
#'
#' For each \code{\link{bioCond}} object with replicate samples, a vector of
#' FZ statistics can be deduced from the unadjusted mean-variance curve
#' associated with it. More specifically, for each genomic interval in a
#' \code{bioCond} with replicate samples, its FZ statistic is defined to be
#' \eqn{log(t_hat / v0)}, where \eqn{t_hat} is the observed variance of signal
#' intensities of the interval, and \eqn{v0} is the interval's prior variance
#' read from the corresponding mean-variance curve.
#'
#' Theoretically, each FZ statistic follows a scaled Fisher's Z distribution
#' plus a constant (since the mean-variance curve is not adjusted yet), and we
#' can use the sample mean (plus a constant that depends on the number of
#' prior degrees
#' of freedom) of the FZ statistics of each single \code{bioCond} to get
#' an estimate of log variance ratio factor.
#'
#' The final estimate of log variance ratio factor is a weighted mean of
#' estimates across \code{bioCond} objects, with the weights being their
#' respective numbers of genomic intervals that are used to calculate
#' FZ statistics.
#' This should be appropriate, as Fisher's Z distribution is roughly normal
#' (see also "References"). The weighted mean is actually a plain (unweighted)
#' mean across all the involved genomic intervals.
#'
#' Finally, we get an estimate of variance ratio factor by taking an
#' exponential.
#'
#' @param z A list of which each element is a vector of FZ statistics
#'     corresponding to a \code{\link{bioCond}} object (see also "Details").
#' @param m A vector of numbers of replicates in \code{bioCond}
#'     objects. Must correspond to \code{z} one by one in the same
#'     order.
#' @param d0 A positive real specifying the number of
#'     prior degrees of freedom of the
#'     mean-variance curve(s). \code{Inf} is allowed. Note that \code{d0} is
#'     typically estimated via \code{\link{estimateD0}}.
#' @return The estimated variance ratio factor for adjusting the mean-variance
#'     curve(s). Note that the function returns \code{NA} if there are not
#'     sufficient genomic intervals for estimating it.
#' @references
#' Smyth, G.K., \emph{Linear models and empirical bayes methods for
#' assessing differential expression in microarray experiments.} Stat Appl
#' Genet Mol Biol, 2004. \strong{3}: p. Article3.
#'
#' Tu, S., et al., \emph{MAnorm2 for quantitatively comparing groups of
#' ChIP-seq samples.} Genome Res, 2021. \strong{31}(1): p. 131-145.
#'
#' @seealso \code{\link{bioCond}} for creating a \code{bioCond} object;
#'     \code{\link{fitMeanVarCurve}} for fitting a mean-variance curve;
#'     \code{\link{varRatio}} for a formal description of variance ratio
#'     factor; \code{\link{estimateD0}} for estimating the number of prior
#'     degrees of freedom associated with a mean-variance curve (or a set
#'     of curves); \code{\link{estimatePriorDf}} for an interface to
#'     estimating the number of prior degrees of freedom on \code{bioCond}
#'     objects as well as adjusting their mean-variance curve(s) accordingly.
#'
#'     \code{\link{estimateD0Robust}} and \code{\link{scaleMeanVarCurveRobust}}
#'     for estimating number of prior degrees of freedom and variance ratio
#'     factor \emph{in a robust manner}, respectively.
scaleMeanVarCurve <- function(z, m, d0) {
    # Note the asymptotic behavior of digamma function.
    if (is.infinite(d0)) {
        offset <- -log(2)
    } else {
        offset <- digamma(d0 / 2) - log(d0)
    }

    weights <- numeric(length(z))
    vals <- numeric(length(z))
    flag <- TRUE
    for (i in 1:length(z)) {
        x <- z[[i]]
        x <- x[!(is.na(x) | is.infinite(x))]
        if (length(x) == 0) next
        flag <- FALSE
        weights[i] <- length(x)
        vals[i] <- mean(x) + offset - (digamma((m[i] - 1) / 2) - log(m[i] - 1))
    }
    if (flag) return(NA_real_)

    # Since Fisher's Z distribution is roughly normal, we weight the estimates
    # using the sample sizes associated with them.
    exp(sum(vals * weights) / sum(weights))
}


#' Assess the Goodness of Fit of Mean-Variance Curves
#'
#' Given a set of \code{\link{bioCond}} objects of which each has been
#' associated with a mean-variance curve, \code{estimatePriorDf} derives a
#' common number of
#' prior degrees of freedom assessing the overall goodness of fit of the
#' mean-variance curves and accordingly adjusts the variance ratio factor of
#' each of the \code{bioCond}s.
#'
#' \code{estimatePriorDf} borrows part of the modeling strategy implemented in
#' \code{limma} (see "References"). For each \code{\link{bioCond}} object, the
#' predicted variances from its mean-variance curve serve as the prior
#' variances associated with individual intervals.
#' The common number of prior degrees of freedom
#' of the supplied \code{bioCond}s
#' quantifies the confidence we have on the associated mean-variance curves.
#' Intuitively, the closer the observed mean-variance points are
#' to the curves, the more prior degrees of freedom there will be.
#' See \code{\link{estimateD0}} for technical details about the estimation of
#' number of prior degrees of freedom.
#'
#' According to the estimated number of prior degrees of freedom,
#' \code{estimatePriorDf}
#' separately adjusts the variance ratio factor of each \code{bioCond}.
#' Intrinsically, this process is to scale the mean-variance curve of each
#' \code{bioCond} so that it passes the "middle" of the observed mean-variance
#' points. See \code{\link{scaleMeanVarCurve}} for technical details of
#' scaling a mean-variance curve.
#'
#' ChIP-seq signals located in non-occupied intervals result primarily from
#' background noise, and are therefore associated with less data regularity
#' than signals in occupied intervals. Involving non-occupied intervals in the
#' estimation process may result in an under-estimated number of prior degrees
#' of freedom. Thus, the recommended usage is to set \code{occupy.only} to
#' \code{TRUE} (i.e., the default).
#'
#' In most cases, the estimation of number of prior degrees of freedom
#' is automatically
#' handled when fitting or setting a mean-variance curve, and you don't need to
#' call this function explicitly (see also \code{\link{fitMeanVarCurve}} and
#' \code{\link{setMeanVarCurve}}). See "Examples"
#' below for a practical application of this function. Note also that there is
#' a \emph{robust} version of this function that uses Winsorized statistics to
#' protect the estimation procedure against potential outliers (see
#' \code{\link{estimatePriorDfRobust}} for details).
#'
#' @param conds A list of \code{\link{bioCond}} objects, of which each has a
#'     \code{fit.info} field describing its mean-variance curve (see also
#'     \code{\link{fitMeanVarCurve}}).
#' @param occupy.only A logical scalar. If it is \code{TRUE} (default), only
#'     occupied intervals are used to estimate the number of
#'     prior degrees of freedom and adjust the variance ratio factors.
#'     Otherwise, all intervals are used (see also "Details").
#' @param return.d0 A logical scalar. If set to \code{TRUE}, the function
#'     simply returns the estimated number of prior degrees of freedom.
#' @param no.rep.rv A positive real specifying the variance ratio factor of
#'     those \code{bioCond}s without replicate samples, if any. By default,
#'     it's set to the geometric mean of variance ratio factors of the other
#'     \code{bioCond}s.
#' @param .call Never care about this argument.
#' @return By default, \code{estimatePriorDf} returns the argument list of
#'     \code{\link{bioCond}} objects,
#'     with the estimated number of prior degrees of
#'     freedom substituted for the \code{"df.prior"} component of each of them.
#'     Besides, their \code{"ratio.var"} components have been adjusted
#'     accordingly, and an attribute named \code{"no.rep.rv"} is added to the
#'     list if it's ever been used as the variance ratio factor of the
#'     \code{bioCond}s without replicate samples. A special case is that the
#'     estimated number of prior degrees of freedom is 0. In this case,
#'     \code{estimatePriorDf} won't adjust existing variance ratio factors,
#'     and you may want to use \code{\link{setPriorDfVarRatio}} to
#'     explicitly specify variance ratio factors.
#'
#'     If \code{return.d0} is set to \code{TRUE}, \code{estimatePriorDf} simply
#'     returns the estimated number of prior degrees of freedom.
#' @references
#' Smyth, G.K., \emph{Linear models and empirical bayes methods for
#' assessing differential expression in microarray experiments.} Stat Appl
#' Genet Mol Biol, 2004. \strong{3}: p. Article3.
#'
#' Tu, S., et al., \emph{MAnorm2 for quantitatively comparing groups of
#' ChIP-seq samples.} Genome Res, 2021. \strong{31}(1): p. 131-145.
#'
#' @seealso \code{\link{bioCond}} for creating a \code{bioCond} object;
#'     \code{\link{fitMeanVarCurve}} for fitting a mean-variance curve and
#'     using a \code{fit.info} field to characterize it;
#'     \code{\link{estimatePriorDfRobust}} for a \emph{robust} version of
#'     \code{estimatePriorDf};
#'     \code{\link{setPriorDf}} for setting the number of
#'     prior degrees of freedom and accordingly
#'     adjusting the variance ratio factors of a set of \code{bioCond}s;
#'     \code{\link[=diffTest.bioCond]{diffTest}} for calling differential
#'     intervals between two \code{bioCond} objects.
#' @export
#' @examples
#' data(H3K27Ac, package = "MAnorm2")
#' attr(H3K27Ac, "metaInfo")
#'
#' ## Fit a mean-variance curve treating each gender as a biological condition,
#' ## and each individual (i.e., cell line) a replicate.
#' \donttest{
#' # First perform the MA normalization and construct bioConds to represent
#' # individuals.
#' norm <- normalize(H3K27Ac, 4, 9)
#' norm <- normalize(norm, 5:6, 10:11)
#' norm <- normalize(norm, 7:8, 12:13)
#' conds <- list(GM12890 = bioCond(norm[4], norm[9], name = "GM12890"),
#'               GM12891 = bioCond(norm[5:6], norm[10:11], name = "GM12891"),
#'               GM12892 = bioCond(norm[7:8], norm[12:13], name = "GM12892"))
#' autosome <- !(H3K27Ac$chrom %in% c("chrX", "chrY"))
#' conds <- normBioCond(conds, common.peak.regions = autosome)
#'
#' # Group individuals into bioConds based on their genders.
#' female <- cmbBioCond(conds[c(1, 3)], name = "female")
#' male <- cmbBioCond(conds[2], name = "male")
#'
#' # The dependence of variance of ChIP-seq signal intensity across individuals
#' # on the mean signal intensity is typically not as regular as could be well
#' # modeled by an explicit parametric form. Better use the local regression to
#' # adaptively capture the mean-variance trend.
#' genders <- list(female = female, male = male)
#' genders1 <- fitMeanVarCurve(genders, method = "local", occupy.only = TRUE)
#' genders2 <- fitMeanVarCurve(genders, method = "local", occupy.only = FALSE)
#'
#' # Suppose the local regression is performed using only the occupied genomic
#' # intervals as input. Good chances are that the extrapolation algorithm
#' # implemented in the regression method will produce over-estimated variances
#' # for the non-occupied intervals.
#' plotMeanVarCurve(genders1, subset = "all")
#' plotMeanVarCurve(genders2, subset = "all")
#' plotMeanVarCurve(genders1, subset = "non-occupied")
#' plotMeanVarCurve(genders2, subset = "non-occupied")
#'
#' # On the other hand, applying the local regression on all genomic intervals
#' # may considerably reduce the estimated number of prior degrees of freedom
#' # associated with the fitted mean-variance curve, as ChIP-seq signals in the
#' # non-occupied intervals are generally of less data regularity compared with
#' # those in the occupied intervals.
#' summary(genders1$female)
#' summary(genders2$female)
#'
#' # To split the difference, fit the mean-variance curve on all genomic
#' # intervals and re-estimate the number of prior degrees of freedom using
#' # only the occupied intervals, which is also the most recommended strategy
#' # in practice.
#' genders3 <- estimatePriorDf(genders2, occupy.only = TRUE)
#' plotMeanVarCurve(genders3, subset = "all")
#' plotMeanVarCurve(genders3, subset = "non-occupied")
#' summary(genders3$female)
#' }
estimatePriorDf <- function(conds, occupy.only = TRUE, return.d0 = FALSE,
                            no.rep.rv = NULL, .call = TRUE) {
    # Check conds
    if (length(conds) == 0) {
        stop("conds mustn't be empty")
    }
    for (cond in conds) {
        if (!is(cond, "bioCond")) {
            stop("Objects in conds must be of the class \"bioCond\"")
        }
        if (is.null(cond$fit.info)) {
            stop("Missing the \"fit.info\" field for some object in conds")
        }
    }

    # Estimate the number of prior degrees of freedom
    n <- length(conds)
    z <- lapply(conds, function(cond){ "place holder" })
    m <- vapply(conds, function(cond){ ncol(cond$norm.signal) }, numeric(1))
    noRep <- m <= 1
    if (all(noRep)) {
        stop("None of the bioCond objects in conds contains replicate samples.
Cannot estimate the number of prior degrees of freedom")
    }
    for (i in 1:n) {
        if (noRep[i]) next
        cond <- conds[[i]]
        if (occupy.only) {
            f <- cond$occupancy
        } else {
            f <- TRUE
        }
        means <- cond$sample.mean[f]
        vars <- cond$sample.var[f]
        z[[i]] <- log(vars / cond$fit.info$predict(means))
    }
    d0 <- estimateD0(z[!noRep], m[!noRep])
    if (is.na(d0)) {
        stop("Too few genomic intervals to estimate the number of prior degrees of freedom")
    }
    if (return.d0) return(d0)
    temp <- match.call()
    for (i in 1:n) {
        conds[[i]]$fit.info$df.prior <- d0
        if (.call) conds[[i]]$fit.info$calls$estParam <- temp
    }

    # Adjust the variance ratio factors
    if (d0 == 0) {
        warning("Estimated number of prior degrees of freedom is 0.
Won't adjust the existing variance ratio factors.
Refer to setPriorDfVarRatio() if you want to explicitly specify variance ratio factors")
        return(conds)
    }
    ratio.vars <- numeric(n)
    for (i in 1:n) {
        if (noRep[i]) next
        ratio.vars[i] <- scaleMeanVarCurve(z[i], m[i], d0)
        if (is.na(ratio.vars[i])) {
            warning(gettextf(
"Too few genomic intervals in \"%s\" to estimate its variance ratio factor.
Use the one for no-replicate conditions", conds[[i]]$name))
            noRep[i] <- TRUE
        } else {
            conds[[i]]$fit.info$ratio.var <- ratio.vars[i]
        }
    }

    # Handle no-replicate conditions
    if (!any(noRep)) return(conds)
    if (is.null(no.rep.rv)) {
        if (all(noRep)) {
            stop(
"Cannot estimate the variance ratio factor for no-replicate conditions.
You may specify it explicitly")
        }
        temp <- ratio.vars[!noRep]
        no.rep.rv <- exp(mean(log(temp)))
    } else {
        no.rep.rv <- as.numeric(no.rep.rv)[1]
        if (no.rep.rv <= 0) stop("no.rep.rv must be a positive real")
        if (is.infinite(no.rep.rv)) stop("no.rep.rv is not allowed to be Inf")
    }
    for (i in (1:n)[noRep]) conds[[i]]$fit.info$ratio.var <- no.rep.rv
    attr(conds, "no.rep.rv") <- no.rep.rv
    conds
}


#' Set the Number of Prior Degrees of Freedom of Mean-Variance Curves
#'
#' Given a set of \code{\link{bioCond}} objects of which each has been
#' associated with a mean-variance curve, \code{setPriorDf} assigns a
#' common number of prior degrees of freedom to all the \code{bioCond}s
#' and accordingly adjusts their variance ratio factors.
#'
#' The specific behavior of this function is pretty much the same as
#' \code{\link{estimatePriorDf}}, except that
#' the number of prior degrees of freedom is
#' directly specified by users rather than estimated based on the observed
#' data. Refer to \code{\link{estimatePriorDf}} for more information.
#'
#' Note also that there is a \emph{robust} version of this function that uses
#' Winsorized statistics to derive variance ratio factors (see
#' \code{\link{setPriorDfRobust}} for details).
#'
#' @inheritParams estimatePriorDf
#' @param d0 A non-negative real specifying the number of prior degrees of
#'     freedom. \code{Inf} is allowed.
#' @param occupy.only A logical scalar. If it is \code{TRUE} (default), only
#'     occupied intervals are used to adjust the variance ratio factors.
#'     Otherwise, all intervals are used.
#' @return \code{setPriorDf} returns the argument list of
#'     \code{\link{bioCond}} objects, with the specified
#'     number of prior degrees of
#'     freedom substituted for the \code{"df.prior"} component of each of them.
#'     Besides, their \code{"ratio.var"} components have been adjusted
#'     accordingly, and an attribute named \code{"no.rep.rv"} is added to the
#'     list if it's ever been used as the variance ratio factor of the
#'     \code{bioCond}s without replicate samples.
#'
#'     To be noted, if the specified number of prior degrees of freedom is 0,
#'     \code{setPriorDf} won't adjust existing variance ratio factors.
#'     In this case, you may want to use \code{\link{setPriorDfVarRatio}} to
#'     explicitly specify variance ratio factors.
#' @seealso \code{\link{bioCond}} for creating a \code{bioCond} object;
#'     \code{\link{fitMeanVarCurve}} for fitting a mean-variance curve and
#'     using a \code{fit.info} field to characterize it;
#'     \code{\link{estimatePriorDf}} for estimating the number of prior
#'     degrees of freedom and adjusting the variance ratio factors of a set of
#'     \code{bioCond}s;
#'     \code{\link{setPriorDfRobust}} for a \emph{robust} version of
#'     \code{setPriorDf};
#'     \code{\link[=diffTest.bioCond]{diffTest}} for calling
#'     differential intervals between two \code{bioCond} objects.
#' @export
#' @examples
#' data(H3K27Ac, package = "MAnorm2")
#' attr(H3K27Ac, "metaInfo")
#'
#' ## Fit a mean-variance curve for the GM12892 cell line (i.e., individual)
#' ## and set the number of prior degrees of freedom of the curve to Inf.
#'
#' # Perform the MA normalization and construct a bioCond to represent GM12892.
#' norm <- normalize(H3K27Ac, 7:8, 12:13)
#' GM12892 <- bioCond(norm[7:8], norm[12:13], name = "GM12892")
#'
#' # Variations in ChIP-seq signals across biological replicates of a cell line
#' # are generally of a low level, and typically their relationship with the
#' # mean signal intensities could be well modeled by the presumed parametric
#' # form.
#' GM12892 <- fitMeanVarCurve(list(GM12892), method = "parametric",
#'                            occupy.only = TRUE, init.coef = c(0.1, 10))[[1]]
#'
#' # In the vast majority of cases for modeling biological replicates of cell
#' # lines, the associated variance structure is so regular that variances of
#' # individual genomic intervals could be reliably estimated by fully
#' # depending on the mean-variance curve.
#' GM12892_2 <- setPriorDf(list(GM12892), Inf, occupy.only = TRUE)[[1]]
#'
#' # The resulting model makes few differences from the original one, though.
#' # This is because MAnorm2 will adaptively deduce a large number of prior
#' # degrees of freedom for the mean-variance curve if the underlying variance
#' # structure is of high regularity. In practice, we recommend leaving the
#' # specification of prior df to the estimation method implemented in MAnorm2
#' # all the time.
#' summary(GM12892)
#' summary(GM12892_2)
#'
setPriorDf <- function(conds, d0, occupy.only = TRUE,
                       no.rep.rv = NULL, .call = TRUE) {
    # Set the number of prior degrees of freedom
    if (length(conds) == 0) return(conds)
    for (cond in conds) {
        if (!is(cond, "bioCond")) {
            stop("Objects in conds must be of the class \"bioCond\"")
        }
        if (is.null(cond$fit.info)) {
            stop("Missing the \"fit.info\" field for some object in conds")
        }
    }
    d0 <- as.numeric(d0)[1]
    if (d0 < 0) stop("d0 must be a non-negative real")
    n <- length(conds)
    temp <- match.call()
    for (i in 1:n) {
        conds[[i]]$fit.info$df.prior <- d0
        if (.call) conds[[i]]$fit.info$calls$estParam <- temp
    }

    # Adjust the variance ratio factors
    if (d0 == 0) {
        warning("Specified number of prior degrees of freedom is 0.
Won't adjust the existing variance ratio factors.
Refer to setPriorDfVarRatio() if you want to explicitly specify variance ratio factors")
        return(conds)
    }
    m <- vapply(conds, function(cond){ ncol(cond$norm.signal) }, numeric(1))
    noRep <- m <= 1
    ratio.vars <- numeric(n)
    for (i in 1:n) {
        if (noRep[i]) next
        cond <- conds[[i]]
        if (occupy.only) {
            f <- cond$occupancy
        } else {
            f <- TRUE
        }
        means <- cond$sample.mean[f]
        vars <- cond$sample.var[f]
        z <- log(vars / cond$fit.info$predict(means))
        ratio.vars[i] <- scaleMeanVarCurve(list(z), m[i], d0)
        if (is.na(ratio.vars[i])) {
            warning(gettextf(
"Too few genomic intervals in \"%s\" to estimate its variance ratio factor.
Use the one for no-replicate conditions", cond$name))
            noRep[i] <- TRUE
        } else {
            conds[[i]]$fit.info$ratio.var <- ratio.vars[i]
        }
    }

    # Handle no-replicate conditions
    if (!any(noRep)) return(conds)
    if (is.null(no.rep.rv)) {
        if (all(noRep)) {
            stop(
"Cannot estimate the variance ratio factor for no-replicate conditions.
You may specify it explicitly")
        }
        temp <- ratio.vars[!noRep]
        no.rep.rv <- exp(mean(log(temp)))
    } else {
        no.rep.rv <- as.numeric(no.rep.rv)[1]
        if (no.rep.rv <= 0) stop("no.rep.rv must be a positive real")
        if (is.infinite(no.rep.rv)) stop("no.rep.rv is not allowed to be Inf")
    }
    for (i in (1:n)[noRep]) conds[[i]]$fit.info$ratio.var <- no.rep.rv
    attr(conds, "no.rep.rv") <- no.rep.rv
    conds
}

Try the MAnorm2 package in your browser

Any scripts or data that you put into this service are public.

MAnorm2 documentation built on Oct. 29, 2022, 1:12 a.m.