meffil: Efficient algorithms for DNA methylation

Documented in meffil.normalize.samples

#' Normalize Infinium HumanMethylation450 BeadChips
#'
#' Normalize a set of samples using their normalized quality control objects.
#'
#' @param norm.objects The list or sublist of \code{\link{meffil.normalize.quantiles}()}.
#' @param pseudo Value to add to the denominator to make the methylation
#' estimate more stable when calculating methylation levels (Default: 100).
#' @param just.beta If \code{TRUE}, then return just the normalized methylation levels; otherwise,
#' return the normalized methylated and unmethylated matrices (Default: TRUE).
#' @param remove.poor.signal Set methylation values for poorly detected probes
#' to missing  (Default: \code{FALSE}). Poor signal was 
#' identified during QC by \code{\link{meffil.qc}()}
#' as signal that failed to pass the 
#' detection p-value threshold (\code{detection.threshold})
#' or bead threshold (\code{bead.threshold}).
#' @param dup.fun Function to collapse duplicate probes 
#' (EPIC v2 has over 5000 duplicated probes). If NULL, then
#' duplicates are not collapsed (Default: median).
#' @param cpglist.remove Optional list of CpGs to exclude from final output
#' @param verbose If \code{TRUE}, then detailed status messages are printed during execution (Default: \code{FALSE}).
#' @param gds.filename If not \code{NULL} (default), then saves the output to a GDS (Genomic Data Structure).
#' This is for cases where the output is too large to fit into main memory.
#' The GDS option assumes that argument \code{just.beta == TRUE}.
#' @param ... Arguments passed to \code{\link[parallel]{mclapply}()}.
#' @return If \code{just.beta == TRUE}, the normalized matrix of 
#' methylation levels between between 0 and 1
#' equal to methylated signal/(methylated + unmethylated signal + pseudo).
#' Otherwise, a list containing two matrices, the normalized methylated and unmethylated signals.
#' If \code{gds.filename} is not \code{NULL}, then the output is saved to the GDS file
#' rather than retained in memory and returned to the caller.
#' The library 'gdsfmt' must be installed in this case.
#' 
#' @export
meffil.normalize.samples <- function(norm.objects, 
                                     pseudo=100,
                                     just.beta=T,
                                     cpglist.remove=NULL,
                                     remove.poor.signal=F,
                                     dup.fun=function(x) median(x,na.rm=T),
                                     max.bytes=2^30-1, ## maximum number of bytes for mclapply
                                     gds.filename=NULL,
                                     verbose=F,
                                     ...) {
    stopifnot(length(norm.objects) >= 2)
    stopifnot(all(sapply(norm.objects, is.normalized.object)))
    stopifnot(is.null(gds.filename) || just.beta)

    if (length(unique(sapply(norm.objects, function(object) object$featureset))) > 1)
        stop(paste("Heterogeneous microarray formats included without a common featureset.",
                   "Need to set the 'featureset' argument when creating QC objects."))

    featureset <- norm.objects[[1]]$featureset
    
    if (is.null(featureset)) { ## backwards compatibility
        featureset <- "450k"
    }   

    sites <- meffil.get.sites(featureset)
    if(!is.null(cpglist.remove))
        sites <- setdiff(sites, cpglist.remove)

    dups <- NULL
    if (!is.null(dup.fun)) 
        dups <- identify.dups(sites)

    if (is.null(gds.filename)) {
        ret <- mcsapply.safe(
            norm.objects,
            FUN=function(object) {
                mu <- meffil.normalize.sample(object, remove.poor.signal=remove.poor.signal, verbose=verbose)
                m.idx <- match(sites, names(mu$M))
                u.idx <- match(sites, names(mu$U))
                if (just.beta)
                    get.beta(unname(mu$M[m.idx]), unname(mu$U[u.idx]), pseudo)
                else
                    c(unname(mu$M[u.idx]), unname(mu$U[u.idx]))
            },
            ...,
            max.bytes=max.bytes)

        if (!just.beta) {
            ret <- list(M=ret[1:length(sites),],
                        U=ret[(length(sites)+1):nrow(ret),])
            dimnames(ret$M) <- dimnames(ret$U) <- list(sites, names(norm.objects))
            if (length(dups) > 0) {
                ret$M <- collapse.dups(ret$M, dups, dup.fun)
                ret$U <- collapse.dups(ret$U, dups, dup.fun)
            }
        }
        else {
            dimnames(ret) <- list(sites, names(norm.objects))
            if (length(dups) > 0)
                ret <- collapse.dups(ret, dups, dup.fun)
        }
        ret
    } else {
        require(gdsfmt)
        mcsapply.to.gds(
            norm.objects,
            FUN=function(object) {
                mu <- meffil.normalize.sample(object, remove.poor.signal=remove.poor.signal, verbose=verbose)
                m.idx <- match(sites, names(mu$M))
                u.idx <- match(sites, names(mu$U))
                ret <- get.beta(unname(mu$M[m.idx]), unname(mu$U[u.idx]), pseudo)
                names(ret) <- sites
                if (length(dups) > 0)
                    ret <- collapse.dups(ret, dups, dup.fun)
                ret
            },
            ...,
            gds.filename=gds.filename,
            storage="float64",
            max.bytes=max.bytes)
        gds.filename
    }
}

# We use \code{\link[parallel]{mclapply}()} to reduce running time by taking advantage of the fact
# that each sample can be normalized independently of the others.
# Unfortuantely \code{\link[parallel]{mclapply}()} has two important limitations.
# The size in memory of the list returned may be at most 2Gb otherwise
# \code{\link[parallel]{mclapply}()} fails with the following error message:
#    Error in sendMaster(try(lapply(X = S, FUN = FUN, ...), silent = TRUE)) :
#    long vectors not supported ...
# 
# A non-elegant solution to this problem is to guess the size of each element
# in the returned list and then apply \code{\link[parallel]{mclapply}()} sequentially to a sequence
# appropriately sized input subsets.
# A solution for \code{lapply} is to allocate the final object (e.g. a matrix)
# prior to calling \code{lapply} and then populate the object during
# the call to \code{lapply} using the global assignment operator '<<-'.
# Unfortunately this is not a solution for \code{\link[parallel]{mclapply}()}
# because \code{\link[parallel]{mclapply}()} immediately
# duplicates the object, applies any modifications to the duplicate
# and then deletes it prior to completion losing all modifications.
# I'm not sure why the duplicate is not copied onto the original.
# This is a solution if the object is a \code{\link{bigmemory::big.matrix}}.
# However, we tried this but encountered random errors.
# Sometimes the function completed without incident but other times,
# with the same data, entire columns of the output matrix would be NA,
# implying that the meffil.normalize.sample() function failed.
# However, no errors were generated (tested with a tryCatch).
# It seems that mclapply and big.matrix do not play well together all the time.
# We have replaced this with the less elegant approach implemented in mcsapply().