R/functions-utils.R

Defines functions valueCount2ScanIndex useOriginalCode .copy_env .createProfileMatrix .featureIDs weightedMeanAroundApex plotMsData rla rowRla .rect_overlap .ppm_range .insertColumn .update_feature_definitions .rbind_fill .match_closest .reduce groupOverlaps

Documented in groupOverlaps plotMsData rla rowRla useOriginalCode

## Unsorted utility functions.
#' @include DataClasses.R

############################################################
## valueCount2ScanIndex
##
#' @title Create index vector for internal C calls
#'
#' @description Simple helper function that converts the number of values
#'     per scan/spectrum to an integer vector that can be passed to the base
#'     xcms functions/downstream C functions.
#'
#' @param valCount Numeric vector representing the number of values per
#'     spectrum.
#'
#' @return An integer vector with the index (0-based) in the mz or intensity
#'     vectors indicating the start of a spectrum.
#'
#' @author Johannes Rainer
#'
#' @noRd
valueCount2ScanIndex <- function(valCount){
    ## Convert into 0 based.
    valCount <- cumsum(valCount)
    return(as.integer(c(0, valCount[-length(valCount)])))
}

############################################################
## useOriginalCode
##
## Simple function allowing the user to enable using the orignal
## code instead of the new implementations.
## This sets options.
##
#' @title Enable usage of old xcms code
#'
#' @description
#'
#' This function allows to enable the usage of old, partially deprecated
#' code from xcms by setting a corresponding global option. See details
#' for functions affected.
#'
#' @note
#'
#' For parallel processing using the SOCKS method (e.g. by [SnowParam()] on
#' Windows computers) this option might not be passed to the individual R
#' processes performing the calculations. In such cases it is suggested to
#' specify the option manually and system-wide by adding the line
#' `options(XCMSuseOriginalCode = TRUE)` in a file called *.Rprofile* in the
#' folder in which new R processes are started (usually the user's
#' home directory; to ensure that the option is correctly read add a new line
#' to the file too). See also [Startup] from the base R documentation on how to
#' specify system-wide options for R.
#'
#' Usage of old code is strongly dicouraged. This function is thought
#' to be used mainly in the transition phase from xcms to xcms version 3.
#'
#' @details
#'
#' The functions/methods that are affected by this option are:
#'
#' - [do_findChromPeaks_matchedFilter]: use the original
#'   code that iteratively creates a subset of the binned (profile)
#'   matrix. This is helpful for computers with limited memory or
#'   matchedFilter settings with a very small bin size.
#' - [getPeaks]
#'
#' @param x `logical(1)` to specify whether or not original
#'     old code should be used in corresponding functions. If not provided the
#'     function simply returns the value of the global option.
#'
#' @return `logical(1)` indicating whether old code is being used.
#'
#' @md
#'
#' @author Johannes Rainer
useOriginalCode <- function(x) {
    if (missing(x)) {
        res <- options()$XCMSuseOriginalCode
        if (is.null(res))
            return(FALSE)
        return(res)
    }
    if (!is.logical(x))
        stop("'x' has to be logical.")
    options(XCMSuseOriginalCode = x[1])
    return(options()$XCMSuseOriginalCode)
}

## .getOriginalFunction <- function(x) {
##     if (!any(names(.ORIGINAL_FUNCTIONS)))
## }
## .ORIGINAL_FUNCTIONS <- c(
##     matchedFilter = ".matchedFilter_orig"
## )

#' @title Copy the content from an environment to another one
#'
#' @description This function copies the content of an environment into another
#'     one.
#'
#' @param env environment from which to copy.
#'
#' @param inheritLocks logical(1) whether the locking status should be copied
#'     too.
#'
#' @return an env.
#'
#' @noRd
.copy_env <- function(env, inheritLocks = FALSE) {
    ## new_e <- new.env(parent = emptyenv())
    ## eNames <- ls(env, all.names = TRUE)
    ## if (length(eNames) > 0) {
    ##     for (eN in eNames) {
    ##         new_e[[eN]] <- env[[eN]]
    ##     }
    ## }
    new_e <- as.environment(as.list(env, all.names = TRUE))
    if (inheritLocks) {
        if (environmentIsLocked(env))
            lockEnvironment(new_e)
    }
    return(new_e)
}

## #' Simulates the \code{findRange} function.
## #' @noRd
## findRangeR <- function(x, values) {
##     start <- min(which(x >= values[1]))
##     end <- max(which(x <= values[2]))
##     return(c(start, end))
## }

############################################################
## .createProfileMatrix
#' @title Create the profile matrix
#'
#' @description This function creates a \emph{profile} matrix, i.e. a rt times
#'     m/z matrix of aggregated intensity values with values aggregated within
#'     bins along the m/z dimension.
#'
#' @details This is somewhat the successor function for the deprecated
#'     \code{profBin} methods (\code{profBinM}, \code{profBinLinM},
#'     \code{profBinLinBaseM} and \code{profIntLin}).
#'
#' @param mz Numeric representing the m/z values across all scans/spectra.
#'
#' @param int Numeric representing the intensity values across all
#'     scans/spectra.
#'
#' @param valsPerSpect Numeric representing the number of measurements for each
#'     scan/spectrum.
#'
#' @param method A character string specifying the profile matrix generation
#'     method. Allowed are \code{"bin"}, \code{"binlin"},
#'     \code{"binlinbase"} and \code{"intlin"}.
#'
#' @param step Numeric specifying the size of the m/z bins.
#'
#' @param baselevel Numeric specifying the base value.
#'
#' @param basespace Numeric.
#'
#' @param mzrange. numeric(2) optionally specifying the mz value range
#'     for binning. This is to adopt the old profStepPad<- method used for
#'     obiwarp retention time correction that did the binning from
#'     whole-number limits.
#'
#' @param returnBreaks logical(1): hack to return the breaks of the bins.
#'     Setting this to TRUE causes the function to return a \code{list} with
#'     elements \code{"$profMat"} and \code{"breaks"}.
#'
#' @param baseValue numeric(1) defining the value to be returned if no signal
#'     was found in the corresponding bin. Defaults to 0 for backward
#'     compatibility.
#'
#' @noRd
.createProfileMatrix <- function(mz, int, valsPerSpect,
                                 method, step = 0.1, baselevel = NULL,
                                 basespace = NULL,
                                 mzrange. = NULL,
                                 returnBreaks = FALSE,
                                 baseValue = 0) {
    profMeths <- c("bin", "binlin", "binlinbase", "intlin")
    names(profMeths) <- c("none", "lin", "linbase", "intlin")
    method <- match.arg(method, profMeths)
    impute <- names(profMeths)[profMeths == method]
    brks <- NULL

    if (length(mzrange.) != 2) {
        mrange <- range(mz, na.rm = TRUE)
        mzrange. <- c(floor(mrange[1] / step) * step,
                      ceiling(mrange[2] / step) * step)
    }
    mass <- seq(mzrange.[1], mzrange.[2], by = step)
    mlength <- length(mass)
    ## Calculate the "real" bin size; old xcms code oddity that that's different
    ## from step.
    bin_size <- (mass[mlength] - mass[1]) / (mlength - 1)
    ## Define the breaks.
    toIdx <- cumsum(valsPerSpect)
    fromIdx <- c(1L, toIdx[-length(toIdx)] + 1L)
    shiftBy <- TRUE
    binFromX <- min(mass)
    binToX <- max(mass)
    brks <- breaks_on_nBins(fromX = binFromX, toX = binToX,
                            nBins = mlength, shiftByHalfBinSize = TRUE)
    ## for profIntLinM we have to use the old code.
    if (impute == "intlin") {
        profFun <- "profIntLinM"
        profp <- list()
        scanindex <- valueCount2ScanIndex(valsPerSpect)
        buf <- do.call(profFun, args = list(mz, int,
                                            scanindex, mlength,
                                            mass[1], mass[mlength],
                                            TRUE))
    } else {
        ## Binning the data.
        binRes <- binYonX(mz, int,
                          breaks = brks,
                          fromIdx = fromIdx,
                          toIdx = toIdx,
                          baseValue = ifelse(impute == "none", yes = baseValue,
                                             no = NA),
                          sortedX = TRUE,
                          returnIndex = FALSE,
                          returnX = FALSE
                          )
        if (length(toIdx) == 1)
            binRes <- list(binRes)
        ## Missing value imputation.
        if (impute == "linbase") {
            ## need arguments distance and baseValue.
            if (length(basespace) > 0) {
                if (!is.numeric(basespace))
                    stop("'basespace' has to be numeric!")
                distance <- floor(basespace[1] / bin_size)
            } else {
                distance <- floor(0.075 / bin_size)
            }
            if (length(baselevel) > 0) {
                if (!is.numeric(baselevel))
                    stop("'baselevel' has to be numeric!")
                baseValue <- baselevel
            } else {
                baseValue <- min(int, na.rm = TRUE) / 2
            }
        } else {
            distance <- 0
            baseValue <- 0
        }
        if (method == "none") {
            ## binVals <- lapply(binRes, function(z) z$y)
            binVals <- binRes
        } else {
            binVals <- lapply(binRes, function(z) {
                imputeLinInterpol(z$y, method = impute, distance = distance,
                                  noInterpolAtEnds = TRUE,
                                  baseValue = baseValue)
            })
        }
        buf <- base::do.call(cbind, binVals)
    }
    if (returnBreaks)
        buf <- list(profMat = buf, breaks = brks)
    buf
}

#' @description This function creates arbitrary IDs for features.
#'
#' @param prefix character(1) with the prefix to be added to the ID.
#'
#' @param x integer(1) with the number of IDs that should be generated.
#'
#' @noRd
.featureIDs <- function(x, prefix = "FT", from = 1L) {
    sprintf(paste0(prefix, "%0", ceiling(log10(x + 1L)), "d"),
            seq(from = from, length.out = x))
}

#' @title Weighted mean around maximum
#'
#' @description Calculate a weighted mean of the values around the value with
#'     the largest weight. \code{x} could e.g. be mz values and \code{w} the
#'     corresponding intensity values.
#'
#' @param x \code{numeric} vector from which the weighted mean should be
#'     calculated.
#'
#' @param w \code{numeric} of same length than \code{x} with the weights.
#'
#' @param i \code{integer(1)} defining the number of data points left and right
#'     of the index with the largest weight that should be considered for the
#'     weighted mean calculation.
#'
#' @return The weighted mean value.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#'
#' mz <- c(124.0796, 124.0812, 124.0828, 124.0843, 124.0859, 124.0875,
#'     124.0890, 124.0906, 124.0922, 124.0938, 124.0953, 124.0969)
#' ints <- c(10193.8, 28438.0, 56987.6, 85107.6, 102531.6, 104262.6,
#'     89528.8, 61741.2, 33485.8, 14146.6, 5192.2, 1630.2)
#'
#' plot(mz, ints)
#'
#' ## What would be found by the max:
#' abline(v = mz[which.max(ints)], col = "grey")
#' ## What does the weighted mean around apex return:
#' abline(v = weightedMeanAroundApex(mz, ints, i = 2), col = "blue")
weightedMeanAroundApex <- function(x, w = rep(1, length(x)), i = 1) {
    max_idx <- which.max(w)
    seq_idx <- max(1, max_idx - i):min(length(x), max_idx + i)
    weighted.mean(x[seq_idx], w[seq_idx])
}

#' @title DEPRECATED: Create a plot that combines a XIC and a mz/rt 2D plot for one sample
#'
#' @description
#'
#' **UPDATE**: please use `plot(x, type = "XIC")` from the `MSnbase` package
#' instead. See examples below.
#'
#' The `plotMsData` creates a plot that combines an (base peak )
#' extracted ion chromatogram on top (rt against intensity) and a plot of
#' rt against m/z values at the bottom.
#'
#' @param x `data.frame` such as returned by the [extractMsData()] function.
#'     Only a single `data.frame` is supported.
#'
#' @param main `character(1)` specifying the title.
#'
#' @param cex `numeric(1)` defining the size of points. Passed directly to the
#'     `plot` function.
#'
#' @param mfrow `numeric(2)` defining the plot layout. This will be passed
#'     directly to `par(mfrow = mfrow)`. See `par` for more information. Setting
#'     `mfrow = NULL` avoids calling `par(mfrow = mfrow)` hence allowing to
#'     pre-define the plot layout.
#'
#' @param grid.color a color definition for the grid line (or `NA` to skip
#'     creating them).
#'
#' @param colramp a *color ramp palette* to be used to color the data points
#'     based on their intensity. See argument `col.regions` in
#'     [lattice::level.colors] documentation.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' ## Read two files from the faahKO package
#' library(faahKO)
#' library(magrittr)
#' cdfs <- dir(system.file("cdf", package = "faahKO"), full.names = TRUE,
#'     recursive = TRUE)[1:2]
#' raw_data <- readMSData(cdfs, mode = "onDisk")
#'
#' ## Subset the object to a rt and mz range and plot the data.
#' raw_data %>%
#'     filterRt(rt = c(2700, 2900)) %>%
#'     filterMz(mz = c(334.9, 335.1)) %>%
#'     plot(type = "XIC")
plotMsData <- function(x, main = "", cex = 1, mfrow = c(2, 1),
                       grid.color = "lightgrey",
                       colramp = colorRampPalette(
                           rev(brewer.pal(9, "YlGnBu")))) {
    .Deprecated(msg = paste0("'plotMsData' is deprecated. Please use ",
                             "'plot(x, type = \"XIC\") instead."))
    if (length(mfrow) == 2)
        par(mfrow = mfrow)
    par(mar = c(0, 4, 2, 1))
    x_split <- split(x$i, f = x$rt)
    ints <- unlist(lapply(x_split, function(z) max(z)))
    brks <- do.breaks(range(x$i), nint = 256)
    cols <- level.colors(ints, at = brks, col.regions = colramp)
    plot(as.numeric(names(ints)), ints, main = main, xlab = "", xaxt = "n",
         ylab = "", las = 2, pch = 21, bg = cols, col = "grey", cex = cex)
    mtext(side = 4, line = 0, "intensity", cex = par("cex.lab"))
    grid(col = grid.color)
    par(mar = c(3.5, 4, 0, 1))
    cols <- level.colors(x$i, at = brks, col.regions = colramp)
    plot(x$rt, x$mz, main = "", pch = 21, bg = cols, col = "grey",
         xlab = "", ylab = "", yaxt = "n", cex = cex)
    axis(side = 2, las = 2)
    grid(col = grid.color)
    mtext(side = 1, line = 2.5, "retention time", cex = par("cex.lab"))
    mtext(side = 4, line = 0, "mz", cex = par("cex.lab"))
}

#' @title Calculate relative log abundances
#'
#' `rla` calculates the relative log abundances (RLA, see reference) on a
#' `numeric` vector.
#'
#' @details The RLA is defines as the (log) abundance of an analyte relative
#'     to the median across all abundances of the same group.
#'
#' @param x `numeric` (for `rla`) or `matrix` (for `rowRla`) with the
#'     abundances (in natural scale) on which the RLA should be calculated.
#'
#' @param group `factor`, `numeric` or `character` with the same length
#'     than `x` that groups values in `x`. If omitted all values are considered
#'     to be from the same group.
#'
#' @param log.transform `logical(1)` whether `x` should be log2 transformed.
#'     Set to `log.transform = FALSE` if `x` is already in log scale.
#'
#' @return `numeric` of the same length than `x` (for `rla`) or `matrix` with
#'     the same dimensions than `x` (for `rowRla`).
#'
#' @rdname rla
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @references
#'
#' De Livera AM, Dias DA, De Souza D, Rupasinghe T, Pyke J, Tull D, Roessner U,
#' McConville M, Speed TP. Normalizing and integrating metabolomics data.
#' *Anal Chem* 2012 Dec 18;84(24):10768-76.
#'
#' @examples
#'
#' x <- c(3, 4, 5, 1, 2, 3, 7, 8, 9)
#'
#' grp <- c(1, 1, 1, 2, 2, 2, 3, 3, 3)
#'
#' rla(x, grp)
rla <- function(x, group, log.transform = TRUE) {
    if (missing(group))
        group <- rep_len(1, length(x))
    if (length(x) != length(group))
        stop("length of 'x' has to match length of 'group'")
    if (!is.factor(group))
	group <- factor(group, levels = unique(group))
    ## Calculate group medians.
    if (log.transform)
        x <- log2(x)
    grp_meds <- unlist(lapply(split(x, group), median, na.rm = TRUE))
    x - grp_meds[group]
}

#' `rowRla` calculates row-wise RLAs.
#'
#' @rdname rla
#'
#' @md
rowRla <- function(x, group, log.transform = TRUE) {
    t(apply(x, MARGIN = 1, rla, group = group, log.transform = log.transform))
}

#' @title Identify rectangles overlapping in a two-dimensional space
#'
#' @description
#'
#' `.rect_overlap` identifies rectangles overlapping in a two dimensional
#' space.
#'
#' @return `list` with indices of overlapping elements.
#'
#' @noRd
#'
#' @author Johannes Rainer
.rect_overlap <- function(xleft, xright, ybottom, ytop) {
    if (missing(xleft) | missing(xright) | missing(ybottom) | missing(ytop))
        stop("'xleft', 'xright', 'ybottom' and 'ytop' are required parameters")
    if (length(unique(c(length(xleft), length(xright), length(ybottom),
                        length(ytop)))) != 1)
        stop("'xleft', 'xright', 'ybottom' and 'ytop' have to have the same",
             " length")
    .overlap <- function(x1, x2, xs1, xs2) {
        x1 <= xs2 & x2 >= xs1
    }
    nr <- length(xleft)
    ovlap <- vector("list", nr)
    ## Calculate overlap of any element with any other. Need only to compare
    ## element i with i:length.
    for (i in seq_len(nr)) {
        other_idx <- i:nr
        do_ovlap <- .overlap(xleft[i], xright[i],
                             xleft[other_idx], xright[other_idx]) &
            .overlap(ybottom[i], ytop[i],
                     ybottom[other_idx], ytop[other_idx])
        ovlap[[i]] <- other_idx[do_ovlap]
    }
    ovlap <- ovlap[lengths(ovlap) > 1]
    ovlap_merged <- list()
    ovlap_remain <- ovlap
    ## Combine grouped features if the have features in common
    while (length(ovlap_remain)) {
        current <- ovlap_remain[[1]]
        ovlap_remain <- ovlap_remain[-1]
        if (length(ovlap_remain) > 0) {
            ## Check if we have any overlap with any other merged group
            also_here <- vapply(ovlap_remain, function(z) any(z %in% current),
                                logical(1), USE.NAMES = FALSE)
            ## Join them with current.
            current <- sort(unique(c(current, unlist(ovlap_remain[also_here]))))
            ovlap_remain <- ovlap_remain[!also_here]
        }
        ## Check if current is in any of the already joined ones, if so, merge
        also_here <- which(vapply(ovlap_merged, function(z) any(z %in% current),
                                  logical(1), USE.NAMES = FALSE))
        if (length(also_here)) {
            ovlap_merged[[also_here[1]]] <-
                sort(unique(c(unlist(ovlap_merged[also_here]), current)))
            ## In case also remove all others - shouldn't really happen...
            if (length(also_here) > 1)
                ovlap_merged <- ovlap_merged[-also_here[-1]]
        } else
            ovlap_merged[[length(ovlap_merged) + 1]] <- current
    }
    ovlap_merged
}

#' Calculate a range of values adding a part per million to it. The minimum
#' will be the minimum - ppm/2, the maximum the maximum + ppm/2
#'
#' @param x `numeric`
#'
#' @param ppm `numeric(1)`
#'
#' @return `numeric(2)` with the range +/- ppm
#'
#' @noRd
#'
#' @author Johannes Rainer
.ppm_range <- function(x, ppm = 0) {
    x <- range(x)
    x[1] <- x[1] - x[1] * ppm / 2e6
    x[2] <- x[2] + x[2] * ppm / 2e6
    x
}

#' Simple helper to insert column(s) in a matrix.
#'
#' @param x `matrix`
#'
#' @param pos `integer()` with positions (columns) where a column should be
#'     inserted in `x`.
#'
#' @param val `vector` or `list` with the elements to insert.
#'
#' @return `matrix`
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#'
#' mat <- matrix(1:100, ncol = 5)
#'
#' ## Insert a column at position 3, containing a single value.
#' .insertColumn(mat, pos = 3, 5)
#'
#' ## Insert columns at positions 2 and 4 containing the same sequence of
#' ## values
#' .insertColumn(mat, c(2, 4), list(101:120))
.insertColumn <- function(x, pos = integer(), val = NULL) {
    if (length(pos)) {
        if (length(val) == 1)
            val <- rep(val, length(pos))
        if (length(val) != length(pos))
            stop("length of 'pos' and 'val' have to match")
    }
    for (i in seq_along(pos)) {
        if (pos[i] == 1) {
            x <- cbind(val[[i]], x)
        } else {
            if (pos[i] == ncol(x))
                x <- cbind(x, val[[i]])
            else
                x <- cbind(x[, 1:(pos[i]-1)], val[[i]], x[, pos[i]:ncol(x)])
        }
    }
    x
}

#' helper to subset featureDefinitions based on provided chrom peak names and
#' update the peakidx.
#'
#' @param x `DataFrame` with feature definitions (such as returned by
#'     `featureDefinitions(object)`.
#'
#' @param original_names `character` with the original rownames (peak IDs) of
#'     the `chromPeaks` matrix **before** subsetting.
#'
#' @param subset_names `character` with the rownames (peak IDs) of the
#'     `chromPeaks` matrix **after** subsetting.
#'
#' @return updated feature definitions `DataFrame`.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
.update_feature_definitions <- function(x, original_names, subset_names) {
    x$peakidx <- lapply(x$peakidx, function(z) {
        idx <- base::match(original_names[z], subset_names)
        idx[!is.na(idx)]
    })
    x[lengths(x$peakidx) > 0, ]
}

#' @description
#'
#' Combine `matrix` or `data.frame`s adding eventually missing columns filling
#' them with `NA`s.
#'
#' @param x `matrix` or `data.frame`.
#'
#' @param y `matrix` or `data.frame`.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.rbind_fill <- function(x, y) {
    cnx <- colnames(x)
    cny <- colnames(y)
    cn <- union(cnx, cny)
    mis_col <- setdiff(cn, colnames(x))
    for (mc in mis_col) {
        if (is.factor(y[, mc]))
            x <- cbind(x, tmp = as.factor(NA))
        else
            x <- cbind(x, tmp = as(NA, class(y[, mc])))
    }
    colnames(x) <- c(cnx, mis_col)
    mis_col <- setdiff(cn, colnames(y))
    for (mc in mis_col) {
        if (is.factor(x[, mc]))
            y <- cbind(y, tmp = as.factor(NA))
        else
            y <- cbind(y, tmp = as(NA, class(x[, mc])))
    }
    colnames(y) <- c(cny, mis_col)
    rbind(x, y[, colnames(x)])
}

#' @title Match closest values between vectors
#'
#' @description
#'
#' Match values in `x` to their closests counterpart in `y` if their difference
#' is smaller than `maxDiff`. which is by defaul `min(mean(diff(x)), mean(diff(y)))`.
#'
#' @param x `numeric` of values to find closest matches in `y`.
#'
#' @param y `numeric` of values to match against.
#'
#' @return `integer` with the indices in `y` where `x` matches. An `NA` is
#'     reported if for a value in `x` no value in `y` with a difference smaller
#'     than `maxDiff` can be found.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#'
#' a <- 1:10
#' b <- c(3.1, 3.2, 4.3, 7.8)
#'
#' .match_closest(b, a)
#'
#' a <- c(1, 4, 7, 10)
#' b <- c(2.2, 2.3, 2.4, 2.5)
#' .match_closest(a, b)
#'
#' a <- c(1, 2.11, 3, 4, 5)
#' .match_closest(a, b)
#'
#' a <- c(1, 1.5, 2, 2.5, 3, 3.5, 4)
#' b <- c(1.7, 2.3, 3)
#' .match_closest(a, b)
#'
#' .match_closest(b, a)
.match_closest <- function(x, y, maxDiff = min(mean(diff(x)), mean(diff(y)))) {
    vapply(x, function(a) {
        diffs <- abs(y - a)
        idx <- intersect(which(diffs <= maxDiff), which.min(diffs))
        if (length(idx))
            idx
        else NA_integer_
    }, integer(1))
}

#' @description
#'
#' Similar to the `IRanges::reduce` method, this function *joins* overlapping
#' ranges (e.g. m/z ranges or retention time ranges) to create unique and
#' disjoined (i.e. not overlapping) ranges.
#'
#' @param start `numeric` with start positions.
#'
#' @param end `numeric` with end positions.
#'
#' @return `matrix` with two columns containing the start and end values for
#'     the disjoined ranges. Note that the ranges are increasingly ordered.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
#'
#' @examples
#'
#' mzmin <- c(2, 3, 4, 7)
#' mzmax <- c(2.5, 3.5, 4.2, 7.6)
#' .reduce(mzmin, mzmax)
#' .reduce(mzmin - 0.1, mzmax + 0.1)
#' .reduce(mzmin - 0.5, mzmax + 0.5)
.reduce <- function(start, end) {
    if (!length(start))
        return(matrix(ncol = 2, nrow = 0,
                      dimnames = list(NULL, c("start", "end"))))
    if (length(start) == 1) {
        return(cbind(start, end))
    }
    idx <- order(start, end)
    start <- start[idx]
    end <- end[idx]
    new_start <- new_end <- numeric(length(start))
    current_slice <- 1
    new_start[current_slice] <- start[1]
    new_end[current_slice] <- end[1]
    for (i in 2:length(start)) {
        if (start[i] <= new_end[current_slice]) {
            if (end[i] > new_end[current_slice])
                new_end[current_slice] <- end[i]
        } else {
            current_slice <- current_slice + 1
            new_start[current_slice] <- start[i]
            new_end[current_slice] <- end[i]
        }
    }
    idx <- 1:current_slice
    cbind(start = new_start[idx], end = new_end[idx])
}

#' @title Group overlapping ranges
#'
#' @description
#'
#' `groupOverlaps` identifies overlapping ranges in the input data and groups
#' them by returning their indices in `xmin` `xmax`.
#'
#' @param xmin `numeric` (same length than `xmax`) with the lower boundary of
#'     the range.
#'
#' @param xmax `numeric` (same length than `xmin`) with the upper boundary of
#'     the range.
#'
#' @return `list` with the indices of grouped elements.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' x <- c(2, 12, 34.2, 12.4)
#' y <- c(3, 16, 35, 36)
#'
#' groupOverlaps(x, y)
groupOverlaps <- function(xmin, xmax) {
    tolerance <- sqrt(.Machine$double.eps)
    reduced_ranges <- .reduce(xmin, xmax)
    res <- vector("list", nrow(reduced_ranges))
    for (i in seq_along(res)) {
        res[[i]] <- which(xmin >= reduced_ranges[i, 1] - tolerance &
                          xmax <= reduced_ranges[i, 2] + tolerance)
    }
    res
}
yclement/xcms documentation built on April 10, 2020, 12:08 a.m.