R/Spectra-functions.R

Defines functions .has_mz_each .has_mz combineSpectra .concatenate_spectra .combine_spectra .compare_spectra_self .compare_spectra_chunk .check_ms_level applyProcessing .lapply .peaksapply .apply_processing_queue addProcessing .valid_processing_queue

Documented in addProcessing applyProcessing combineSpectra

#' @include hidden_aliases.R
NULL

.valid_processing_queue <- function(x) {
    if (length(x) && !all(vapply1l(x, inherits, "ProcessingStep")))
        return("'processingQueue' should only contain ProcessingStep objects.")
    NULL
}

#' @export addProcessing
#'
#' @rdname Spectra
addProcessing <- function(object, FUN, ...) {
    if (missing(FUN))
        return(object)
    object@processingQueue <- c(object@processingQueue,
                                list(ProcessingStep(FUN, ARGS = list(...))))
    validObject(object)
    object
}

#' @description
#'
#' Apply the lazy evaluation processing queue to the peaks (i.e. m/z, intensity
#' matrix) and return the result.
#'
#' @param x `list` of peak `matrix`.
#'
#' @param msLevel `integer` with the MS level of each spectrum. Length has to
#'     match `length(x)`.
#'
#' @param centroided `logical` whether spectra are centroided. Length has to
#'     match `length(x)`.
#'
#' @param queue `list` of `ProcessStep` elements.
#'
#' @return `list` of peak `matrix` elements.
#'
#' @author Johannes Rainer
#'
#' @noRd
.apply_processing_queue <- function(x, msLevel, centroided, queue = NULL) {
    if (length(queue)) {
        for (i in seq_along(x)) {
            for (pStep in queue) {
                x[[i]] <- executeProcessingStep(pStep, x[[i]],
                                                spectrumMsLevel = msLevel[i],
                                                centroided = centroided[i])
            }
        }
    }
    x
}

#' @title Apply arbitrary functions and processing queue to peaks matrices
#'
#' @description
#'
#' This function applies the processing queue and an arbitrary function to
#' the peaks matrix of each spectrum of the `Spectra` object `object`.
#'
#' @param object `Spectra` object.
#'
#' @param FUN optional function to be applied to the peaks matrix. The peaks
#'     matrix will be passed to the first argument of the function. The function
#'     should also have arguments `...`.
#'
#' @param ... optional additional arguments to `FUN`.
#'
#' @param f `factor` or `vector` that can be coerced to one defining how the
#'     data should be split for parallel processing.
#'
#' @param BPPARAM parallel processing setup.
#'
#' @return `list` of `matrix` with the peaks.
#'
#' @author Johannes Rainer
#'
#' @importMethodsFrom BiocParallel bplapply
#'
#' @noRd
.peaksapply <- function(object, FUN = NULL, ..., f = dataStorage(object),
                        BPPARAM = bpparam()) {
    len <- length(object)
    if (!len)
        return(list())
    if (length(f) != len)
        stop("Length of 'f' has to match 'length(object)' (", len, ")")
    if (!is.factor(f))
        f <- factor(f)
    pqueue <- object@processingQueue
    if (!is.null(FUN))
        pqueue <- c(pqueue, ProcessingStep(FUN, ARGS = list(...)))
    if (length(levels(f)) > 1 || length(pqueue)) {
        res <- bplapply(split(object@backend, f), function(z, queue) {
            .apply_processing_queue(peaksData(z), msLevel(z),
                                    centroided(z), queue = queue)
        }, queue = pqueue, BPPARAM = BPPARAM)
        unsplit(res, f = f, drop = TRUE)
    } else peaksData(object@backend)
}


#' @title Apply a function to subsets of Spectra
#'
#' @description
#'
#' Applying a function to a Spectra eventually splitting it into chunks for
#' parallel/serial processing. By default `x` is splitted by `dataStorage`
#' applying the function `FUN` to each chunk. This has the advantage of
#' possibly running calculations in parallel **and** requiring, depending on
#' the backend, less memory (e.g. in case of an on-disk backend only the m/z
#' and intensity values for the current chunk has to be loaded into memory).
#'
#' While being partially redundant to the `.peaksapply` function, this function
#' allows the function `FUN` to get access to ALL spectra variables.
#'
#' Note that results are returned as generated by `split`, i.e. in this order
#' and as a `list` of values. Eventually use `unlist` later.
#'
#' @param x `Spectra` object.
#'
#' @param FUN `function` to be applied to each spectrum.
#'
#' @param f factor to split `x` into chunks for parallel processing. Defaults
#'     to splitting `x` by `dataStorage`.
#'
#' @param ... Additional parameters for `FUN`.
#'
#' @param BPPARAM Optional settings for `BiocParallel`-based parallel
#'     processing. See [bpparam()] for more informations and options.
#'
#' @return A `list` with the result of `FUN`.
#'
#' @author Johannes Rainer
#'
#' @importFrom BiocParallel SerialParam
#'
#' @noRd
.lapply <- function(x, FUN, f = as.factor(dataStorage(x)), ...,
                    BPPARAM = SerialParam()) {
    bplapply(split(x, f), FUN = FUN, ..., BPPARAM = BPPARAM)
}

#' @export applyProcessing
#'
#' @rdname Spectra
applyProcessing <- function(object, f = dataStorage(object),
                            BPPARAM = bpparam(), ...) {
    if (!length(object@processingQueue))
        return(object)
    if (isReadOnly(object@backend))
        stop(class(object@backend), " is read-only. 'applyProcessing' works ",
             "only with backends that support writing data.")
    if (!is.factor(f))
        f <- factor(f, levels = unique(f))
    if (length(f) != length(object))
        stop("length 'f' has to be equal to the length of 'object' (",
             length(object), ")")
    bknds <- bplapply(split(object@backend, f = f), function(z, queue) {
        peaksData(z) <- .apply_processing_queue(peaksData(z), msLevel(z),
                                                centroided(z), queue)
        z
    }, queue = object@processingQueue, BPPARAM = BPPARAM)
    bknds <- backendMerge(bknds)
    if (is.unsorted(f))
        bknds <- bknds[order(unlist(split(seq_along(bknds), f),
                                    use.names = FALSE))]
    object@backend <- bknds
    object@processing <- .logging(object@processing,
                                  "Applied processing queue with ",
                                  length(object@processingQueue),
                                  " steps")
    object@processingQueue <- list()
    object
}

#' @description
#'
#' Simple helper function to test parameter msLevel. Returns `TRUE` if parameter
#' is OK, `FALSE` if a warning is thrown and throws an error if it is not
#' a numeric.
#'
#' @noRd
.check_ms_level <- function(object, msLevel) {
    if (!length(object))
        return(TRUE)
    if (!is.numeric(msLevel))
        stop("'msLevel' must be numeric")
    if (!any(msLevel(object) %in% msLevel)) {
        warning("Specified MS levels ", paste0(msLevel, collapse = ","),
                " not available in 'object'")
        FALSE
    } else TRUE
}

#' @title Spectrum comparison
#'
#' @description
#'
#' Compare each spectrum in `x` with each spectrum in `y` with function `FUN`.
#' Mapping between the peaks in both spectra can be defined with the `MAPFUN`
#' function. This function *realizes* the peak matrices in chunks to balance
#' between performance (high `chunkSize`) and low memory demand
#' (small `chunkSize`).
#'
#' @note
#'
#' Results might be slightly different if `x` and `y` are switched, because of
#' the matching of the peaks by their m/z between spectra. Matching is performed
#' by applying a `tolerance` and `ppm` to the second spectrum (i.e. from `y`)
#' and, especially for `ppm`, the maximal accepted difference depend thus on
#' the m/z values from the second spectrum.
#'
#' @param x [Spectra()]
#'
#' @param y [Spectra()]
#'
#' @param FUN `function` to compare the (m/z matched) intensities from each
#'     spectrum in `x` with those in `y`.
#'
#' @param MAPFUN `function` to map peaks between the two compared spectra. See
#'     [joinPeaks()].
#'
#' @param tolerance `numeric(1)` allowing to define a constant maximal accepted
#'     difference between m/z values for peaks to be matched.
#'
#' @param ppm `numeric(1)` allowing to define a relative, m/z-dependent,
#'     maximal accepted difference between m/z values for peaks to be matched.
#'
#' @param chunkSize `integer(1)` defining the number of peak matrices that
#'     should be realized (i.e. loaded into memory) in each iteration. Large
#'     `chunkSize` will result in faster processing, small `chunkSize` in
#'     lower memory demand. Defaults to `chunkSize = 10000`.
#'
#' @param ... additional parameters passed to `FUN` and `MAPFUN`.
#'
#' @return
#'
#' `matrix` with number of rows equal to length of `x` and number of columns
#' equal `length(y)`.
#'
#' @importFrom stats cor
#'
#' @examples
#'
#' library(Spectra)
#' fl <- system.file("TripleTOF-SWATH/PestMix1_SWATH.mzML", package = "msdata")
#' sps <- Spectra(fl, source = MsBackendMzR())
#'
#' sps <- sps[1:10]
#' res <- .compare_spectra_chunk(sps, sps, ppm = 20)
#'
#' res <- .compare_spectra_chunk(sps[1], sps, FUN = correlateSpectra)
#' res <- .compare_spectra_chunk(sps, sps[1], FUN = correlateSpectra)
#'
#' @noRd
.compare_spectra_chunk <- function(x, y = NULL, MAPFUN = joinPeaks,
                                 tolerance = 0, ppm = 20,
                                 FUN = ndotproduct, chunkSize = 10000, ...) {
    nx <- length(x)
    ny <- length(y)
    if (nx <= chunkSize && ny <= chunkSize) {
        mat <- .peaks_compare(.peaksapply(x), .peaksapply(y), MAPFUN = MAPFUN,
                              tolerance = tolerance, ppm = ppm,
                              FUN = FUN, ...)
        dimnames(mat) <- list(spectraNames(x), spectraNames(y))
        return(mat)
    }

    x_idx <- seq_len(nx)
    x_chunks <- split(x_idx, ceiling(x_idx / chunkSize))
    rm(x_idx)

    y_idx <- seq_len(ny)
    y_chunks <- split(y_idx, ceiling(y_idx / chunkSize))
    rm(y_idx)

    mat <- matrix(NA_real_, nrow = nx, ncol = ny,
                  dimnames = list(spectraNames(x), spectraNames(y)))
    for (x_chunk in x_chunks) {
        x_buff <- .peaksapply(x[x_chunk])
        for (y_chunk in y_chunks) {
            mat[x_chunk, y_chunk] <- .peaks_compare(
                x_buff, .peaksapply(y[y_chunk]),
                MAPFUN = MAPFUN, tolerance = tolerance, ppm = ppm,
                FUN = FUN, ...)
        }
    }
    mat
}

#' @description
#'
#' Compare each spectrum in `x` with each other spectrum in `x`. Makes use of
#' `combn` to avoid calculating combinations twice.
#'
#' @inheritParams .compare_spectra
#'
#' @importFrom utils combn
#'
#' @importFrom MsCoreUtils vapply1d
#' @examples
#'
#' res <- .compare_spectra(sps, sps)
#' res_2 <- .compare_spectra_self(sps)
#' all.equal(res, res_2)    # comparison x[1], x[2] != x[2], x[1]
#' all.equal(res[!lower.tri(res)], res_2[!lower.tri(res_2)])
#'
#' @noRd
.compare_spectra_self <- function(x, MAPFUN = joinPeaks, tolerance = 0,
                                  ppm = 20, FUN = ndotproduct, ...) {
    nx <- length(x)
    m <- matrix(NA_real_, nrow = nx, ncol = nx,
                  dimnames = list(spectraNames(x), spectraNames(x)))

    cb <- which(lower.tri(m, diag = TRUE), arr.ind = TRUE)
    for (i in seq_len(nrow(cb))) {
        cur <- cb[i, 2L]
        if (i == 1L || cb[i - 1L, 2L] != cur)
            py <- px <- peaksData(x[cur])[[1L]]
        else
            py <- peaksData(x[cb[i, 1L]])[[1L]]
        map <- MAPFUN(px, py, tolerance = tolerance, ppm = ppm, ...)
        m[cb[i, 1L], cur] <- m[cur, cb[i, 1L]] <-
            FUN(map[[1L]], map[[2L]], ...)
    }
    m
}

#' @description
#'
#' Combine a `Spectra` of length `n` to a `Spectra` of length `1`.
#' Takes all *spectra variables* from the first spectrum and combines the
#' peaks into a single peaks matrix.
#'
#' @note
#'
#' If the backend of the input `Spectra` is *read-only* we're first converting
#' that to a `MsBackendDataFrame` to support replacing peak data.
#'
#' @param x `Spectra`, ideally from a single `$dataStorage`.
#'
#' @param f `factor` to group spectra in `x`. It is supposed that input checks
#'     have been performed beforehand.
#'
#' @param FUN `function` to be applied to the list of peak matrices.
#'
#' @return `Spectra` of length 1. If the input `Spectra` uses a read-only
#'     backend that is changed to a `MsBackendDataFrame`.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#'
#' spd <- DataFrame(msLevel = c(2L, 2L, 2L), rtime = c(1, 2, 3))
#' spd$mz <- list(c(12, 14, 45, 56), c(14.1, 34, 56.1), c(12.1, 14.15, 34.1))
#' spd$intensity <- list(c(10, 20, 30, 40), c(11, 21, 31), c(12, 22, 32))
#'
#' sps <- Spectra(spd)
#'
#' res <- .combine_spectra(sps)
#' res$mz
#' res$intensity
#'
#' res <- .combine_spectra(sps, FUN = combinePeaks, tolerance = 0.1)
#' res$mz
#' res$intensity
.combine_spectra <- function(x, f = x$dataStorage, FUN = combinePeaks, ...) {
    if (!is.factor(f))
        f <- factor(f)
    else f <- droplevels(f)
    x_new <- x[match(levels(f), f)]
    if (isReadOnly(x_new@backend))
        x_new <- setBackend(x_new, MsBackendDataFrame())
    peaksData(x_new@backend) <- lapply(
        split(.peaksapply(x, BPPARAM = SerialParam()), f = f), FUN = FUN, ...)
    x_new@processingQueue <- list()
    validObject(x_new)
    x_new
}

#' Utility to concatenate a `list` of `Spectra` objects into a single `Spectra`.
#' This function is called in `c`.
#'
#' @param x `list` of `Spectra` objects.
#'
#' @return `Spectra`.
#'
#' @author Johannes Rainer
#'
#' @noRd
.concatenate_spectra <- function(x) {
    cls <- vapply1c(x, class)
    if (any(cls != "Spectra"))
        stop("Can only concatenate 'Spectra' objects")
    pqs <- lapply(x, function(z) z@processingQueue)
    ## For now we stop if there is any of the processingQueues not empty. Later
    ## we could even test if they are similar, and if so, merge.
    if (any(lengths(pqs)))
        stop("Can not concatenate 'Spectra' objects with non-empty ",
             "processing queue")
    metad <- do.call(c, lapply(x, function(z) z@metadata))
    procs <- unique(unlist(lapply(x, function(z) z@processing)))
    object <- new(
        "Spectra", metadata = metad,
        backend = backendMerge(lapply(x, function(z) z@backend)),
        processing = c(procs, paste0("Merge ", length(x),
                                     " Spectra into one [", date(), "]"))
    )
    validObject(object)
    object
}

#' @export combineSpectra
#'
#' @rdname Spectra
combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage,
                           FUN = combinePeaks, ..., BPPARAM = bpparam()) {
    if (!is.factor(f))
        f <- factor(f, levels = unique(f))
    if (!is.factor(p))
        p <- factor(p, levels = unique(p))
    if (length(f) != length(x) || length(p) != length(x))
        stop("length of 'f' and 'p' have to match length of 'x'")
    if (isReadOnly(x@backend))
        message("Backend of the input object is read-only, will change that",
                " to an 'MsBackendDataFrame'")
    if (nlevels(p) > 1) {
        ## We split the workload by storage file. This ensures memory efficiency
        ## for file-based backends.
        res <- bpmapply(FUN = .combine_spectra, split(x, p), split(f, p),
                        MoreArgs = list(FUN = FUN, ...), BPPARAM = BPPARAM)
        .concatenate_spectra(res)
    } else
        .combine_spectra(x, f = f, FUN = FUN, ...)
}

#' @description
#'
#' Internal function to check if any (or all) of the provided `mz` values are
#' in the spectras' m/z.
#'
#' @param x `Spectra` object
#'
#' @param mz `numeric` of m/z value(s) to check in each spectrum of `x`.
#'
#' @param tolarance `numeric(1)` with the tolerance.
#'
#' @param ppm `numeric(1)` with the ppm.
#'
#' @param condFun `function` such as `any` or `all`.
#'
#' @param parallel `BiocParallel` parameter object.
#'
#' @return `logical` same length than `x`.
#'
#' @author Johannes Rainer
#'
#' @importFrom MsCoreUtils common
#'
#' @noRd
.has_mz <- function(x, mz = numeric(), tolerance = 0, ppm = 20, condFun = any,
                    parallel = SerialParam()) {
    mzs <- mz(x, BPPARAM = parallel)
    vapply(mzs, FUN = function(z)
        condFun(common(mz, z, tolerance = tolerance, ppm = ppm)), logical(1))
}

#' @description
#'
#' Same as `.has_mz` only that a different `mz` is used for each spectrum in
#' `x`. Length of `mz` is thus expected to be equal to length of `x`.
#'
#' @param mz `numeric` **same length as `x`**.
#'
#' @author Johannes Rainer
#'
#' @noRd
.has_mz_each <- function(x, mz = numeric(), tolerance = 0, ppm = 20,
                         parallel = SerialParam()) {
    mzs <- mz(x, BPPARAM = parallel)
    mapply(mzs, mz, FUN = function(z, y) {
        if (is.na(y))
            NA # if m/z is NA is better to return NA instead of FALSE
        else common(y, z, tolerance = tolerance, ppm = ppm)
    })
}

Try the Spectra package in your browser

Any scripts or data that you put into this service are public.

Spectra documentation built on Nov. 27, 2020, 2 a.m.