xcms: LC-MS and GC-MS Data Analysis

Documented in adjustRtimePeakGroups applyAdjustedRtime exportMetaboAnalyst featureSummary highlightChromPeaks isCalibrated overlappingFeatures

#' @include DataClasses.R functions-utils.R

#' @description
#'
#' Takes a XCMSnExp and drops ProcessHistory steps from the
#' `@.processHistory` slot matching the provided type.
#'
#' @param num which should be dropped? If \code{-1} all matching will be dropped,
#'     otherwise just the most recent num.
#'
#' @return The XCMSnExp input object with selected ProcessHistory steps dropped.
#'
#' @noRd
dropProcessHistories <- function(x, type, num = -1) {
    x@.processHistory <- dropProcessHistoriesList(processHistory(x),
                                                  type = type, num = num)
    return(x)
}

#' Drop a generic processing history step based on the provided function name
#'
#' @noRd
dropGenericProcessHistory <- function(x, fun) {
    x@.processHistory <- dropGenericProcessHistoryList(processHistory(x), fun)
    x
}

#' Convert an XCMSnExp to an xcmsSet.
#'
#' @noRd
.XCMSnExp2xcmsSet <- function(from) {
    xs <- new("xcmsSet")
    if (hasChromPeaks(from)) {
        if (any(chromPeakData(from)$ms_level) > 1)
            stop("Coercing from an ", class(from)[1L],
                 " with results on MS levels > 1 is not supported.")
        xs@peaks <- chromPeaks(from)
    }
    ## @groups <- part of featureDefinitions
    ## @groupidx <- featureDefinitions(x)$peakidx
    if (hasFeatures(from)){
        fgs <- featureDefinitions(from)
        xs@groups <- S4Vectors::as.matrix(fgs[,names(fgs)!="peakidx"])
        rownames(xs@groups) <- NULL
        xs@groupidx <- fgs$peakidx
    }
    ## @rt combination from rtime(x) and adjustedRtime(x)
    rts <- list()
    ## Ensure we're getting the raw rt
    rts$raw <- split(rtime(from, adjusted = FALSE), fromFile(from))
    if (hasAdjustedRtime(from))
        rts$corrected <- split(rtime(from, adjusted = TRUE), fromFile(from))
    else
        rts$corrected <- rts$raw
    xs@rt <- rts

    ## @phenoData
    if (inherits(from, "XcmsExperiment"))
        pd <- as.data.frame(sampleData(from))
    else  pd <- pData(from)
    if (nrow(pd) != length(fileNames(from))) {
        pd <- data.frame(file_name = basename(fileNames(from)))
        rownames(pd) <- pd$file_name
    }
    xs@phenoData <- pd
    ## @filepaths
    xs@filepaths <- fileNames(from)

    ## @profinfo (list)
    profMethod <- "bin"
    profStep <- 0.1
    profParam <- list()
    ## If we've got any MatchedFilterParam we can take the values from there
    ph <- processHistory(from, type = .PROCSTEP.PEAK.DETECTION)
    if (length(ph)) {
        if (is(ph[[1]], "XProcessHistory")) {
            prm <- processParam(ph[[1]])
            if (is(prm, "MatchedFilterParam")) {
                if (impute(prm) != "none") {
                    profMethod <- paste0("bin", impute(prm))
                    profStep <- binSize(prm)
                    if (profMethod == "binlinbase")
                        warning("Optional parameters for 'binlinbase' can not be",
                                " converted yet, using default parameters.")
                ## distance
                ## baseValue
                }
            }
        }
    }
    profinfo(xs) <- c(list(method = profMethod, step = profStep), profParam)

    ## @mslevel <- msLevel?
    xs@mslevel <- 1L

    ## @scanrange
    if (inherits(from, "XcmsExperiment"))
        xs@scanrange <- range(scanIndex(spectra(from)))
    else xs@scanrange <- range(scanIndex(from))

    ## .processHistory: just take the processHistory as is.
    xs@.processHistory <- processHistory(from)

    ## Implement later or skip:
    ## @polarity (character), has to be "positive" or "negative"; not clear how
    ## and if this is used at all.

    ## @filled ... not yet.
    if (any(chromPeakData(from)$is_filled)) {
        fld <- which(chromPeakData(from)$is_filled)
        xs@filled <- as.integer(fld)
    }
    ## @dataCorrection (numeric) ? in xcmsSet function, if lockMassFreq.
    ## @progressInfo skip
    ## @progressCallback skip
    if (!any(colnames(xs@phenoData) == "class"))
        message("Note: you might want to set/adjust the",
                " 'sampclass' of the returned xcmSet object",
                " before proceeding with the analysis.")
    if (validObject(xs))
        return(xs)
}

.XCMSnExp2SummarizedExperiment <- function(x, ...) {
    if (!hasFeatures(x))
        stop("No correspondence analysis results present. Please run ",
             "groupChromPeaks first.")
    SummarizedExperiment(assays = list(raw = featureValues(x, ...)),
                         rowData = featureDefinitions(x),
                         colData = pData(x),
                         metadata = processHistory(x))
}

#' @description
#'
#' Extract a \code{data.frame} of retention time, mz and intensity
#' values from each file/sample in the provided rt-mz range.
#'
#' @note
#'
#' Ideally, \code{x} should be an \code{OnDiskMSnExp} object as subsetting
#' of a \code{XCMSnExp} object is more costly (removing of preprocessing
#' results, restoring data etc). If retention times reported in the
#' featureData are replaced by adjusted retention times, these are set
#' in the Spectrum objects as retention time.
#'
#' @param x An \code{OnDiskMSnExp} object.
#'
#' @param rt \code{numeric(2)} with the retention time range from which the
#'     data should be extracted.
#'
#' @param mz \code{numeric(2)} with the mz range.
#'
#' @param msLevel \code{integer} defining the MS level(s) to which the data
#'     should be restricted prior to data extraction.
#'
#' @return
#'
#' A \code{list} with length equal to the number of files and
#' each element being a \code{data.frame} with the extracted values.
#'
#' @noRd
#'
#' @author Johannes Rainer
.extractMsData <- function(x, rt, mz, msLevel = 1L) {
    if (!missing(rt)) {
        rt <- range(rt, na.rm = TRUE)
        if (length(rt) != 2)
            stop("'rt' has to be a numeric of length 2!")
    }
    if (!missing(mz)) {
        mz <- range(mz, na.rm = TRUE)
        if (length(mz) != 2)
            stop("'mz' has to be a numeric of length 2!")
        fmzr <- mz
    } else fmzr <- c(0, 0)
    ## Subset the object based on MS level rt and mz range.
    subs <- filterMz(filterRt(filterMsLevel(x, msLevel = msLevel), rt = rt),
                     mz = mz)
    if (length(subs) == 0) {
        ## Return a list with empty data.frames
        empty_df <- data.frame(rt = numeric(), mz = numeric(), i = integer())
        return(lapply(1:length(fileNames(x)), FUN = function(z){empty_df}))
    }
    suppressWarnings(
        dfs <- spectrapply(subs, FUN = function(z) {
            if (!z@peaksCount)
                return(data.frame(rt = numeric(), mz = numeric(),
                                  i = integer()))
            data.frame(rt = rep_len(z@rt, length(z@mz)),
                       mz = z@mz, i = z@intensity)
        })
    )
    fns <- fileNames(x)
    fromF <- base::match(fileNames(subs), fns)

    ## Now I want to rbind the spectrum data frames per file
    L <- split(dfs, f = fromFile(subs))
    L <- lapply(L, do.call, what = rbind)
    ## Put them into a vector same length that we have files.
    res <- vector(mode = "list", length = length(fns))
    res[fromF] <- L
    return(res)
}

#' @description
#'
#' Integrates the intensities for chromatograpic peak(s). This is
#' supposed to be called by the fillChromPeaks method.
#'
#' @note This reads the full data first and does the subsetting later in R.
#'
#' @param object An \code{XCMSnExp} object representing a single sample.
#'
#' @param peakArea A \code{matrix} with the peak definition, i.e.
#'     \code{"rtmin"}, \code{"rtmax"}, \code{"mzmin"} and \code{"mzmax"}.
#'
#' @param sample_idx \code{integer(1)} with the index of the sample in the
#'     object.
#'
#' @param mzCenterFun Name of the function to be used to calculate the mz value.
#'     Defaults to \code{weighted.mean}, i.e. the intensity weighted mean mz.
#'
#' @param cn \code{character} with the names of the result matrix.
#'
#' @return
#'
#' A \code{matrix} with at least columns \code{"mz"}, \code{"rt"},
#' \code{"into"} and \code{"maxo"} with the by intensity weighted mean of
#' mz, rt or the maximal intensity in the area, the integrated signal in
#' the area and the maximal signal in the area.
#'
#' @noRd
.getChromPeakData <- function(object, peakArea, sample_idx,
                              mzCenterFun = "weighted.mean",
                              cn = c("mz", "rt", "into", "maxo", "sample"),
                              msLevel = 1L) {
    if (length(fileNames(object)) != 1)
        stop("'object' should be an XCMSnExp for a single file!")
    ncols <- length(cn)
    res <- matrix(ncol = ncols, nrow = nrow(peakArea))
    colnames(res) <- cn
    res[, "sample"] <- sample_idx
    res[, c("mzmin", "mzmax")] <- peakArea[, c("mzmin", "mzmax")]
    ## Load the data
    message("Requesting ", nrow(res), " peaks from ",
            basename(fileNames(object)), " ... ", appendLF = FALSE)
    object <- filterRt(
        object, rt = range(peakArea[, c("rtmin", "rtmax")]) + c(-2, 2))
    object <- filterMsLevel(object, msLevel)
    if (!length(object)) {
        message("FAIL: no MS level ", msLevel, " data available.")
        return(res)
    }
    spctr <- spectra(object, BPPARAM = SerialParam())
    mzs <- lapply(spctr, function(z) z@mz)
    valsPerSpect <- lengths(mzs)
    ints <- unlist(lapply(spctr, function(z) z@intensity), use.names = FALSE)
    rm(spctr)
    mzs <- unlist(mzs, use.names = FALSE)
    mzs_range <- range(mzs)
    rtim <- rtime(object)
    rtim_range <- range(rtim)
    for (i in seq_len(nrow(res))) {
        rtr <- peakArea[i, c("rtmin", "rtmax")]
        mzr <- peakArea[i, c("mzmin", "mzmax")]
        ## If the rt range is completely out; additional fix for #267
        if (rtr[2] < rtim_range[1] | rtr[1] > rtim_range[2] |
            mzr[2] < mzs_range[1] | mzr[1] > mzs_range[2]) {
            res[i, ] <- rep(NA_real_, ncols)
            next
        }
        ## Ensure that the mz and rt region is within the range of the data.
        rtr[1] <- max(rtr[1], rtim_range[1])
        rtr[2] <- min(rtr[2], rtim_range[2])
        mzr[1] <- max(mzr[1], mzs_range[1])
        mzr[2] <- min(mzr[2], mzs_range[2])
        mtx <- .rawMat(mz = mzs, int = ints, scantime = rtim,
                       valsPerSpect = valsPerSpect, rtrange = rtr,
                       mzrange = mzr)
        if (length(mtx)) {
            if (any(!is.na(mtx[, 3]))) {
                ## How to calculate the area: (1)sum of all intensities / (2)by
                ## the number of data points (REAL ones, considering also NAs)
                ## and multiplied with the (3)rt width.
                ## (1) sum(mtx[, 3], na.rm = TRUE)
                ## (2) sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1 ; if we used
                ## nrow(mtx) here, which would correspond to the non-NA
                ## intensities within the rt range we don't get the same results
                ## as e.g. centWave. Using max(1, ... to avoid getting Inf in
                ## case the signal is based on a single data point.
                ## (3) rtr[2] - rtr[1]
                res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE) *
                    ((rtr[2] - rtr[1]) /
                     max(1, (sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1)))
                maxi <- which.max(mtx[, 3])
                res[i, c("rt", "maxo")] <- mtx[maxi[1], c(1, 3)]
                res[i, c("rtmin", "rtmax")] <- rtr
                ## Calculate the intensity weighted mean mz
                meanMz <- do.call(mzCenterFun, list(mtx[, 2], mtx[, 3]))
                if (is.na(meanMz)) meanMz <- mtx[maxi[1], 2]
                res[i, "mz"] <- meanMz

                if ("beta_cor" %in% cn) {
                    res[i, c("beta_cor", "beta_snr")] <- .get_beta_values(
                        mtx[, 3L], mtx[, 1L])
                }
            } else {
                res[i, ] <- rep(NA_real_, ncols)
            }
        } else {
            res[i, ] <- rep(NA_real_, ncols)
        }
    }
    message("got ", sum(!is.na(res[, "into"])), ".")
    res
}

#' @description Same as getChromPeakData, just without retention time.
#'
#' @note
#'
#' The mz and maxo are however estimated differently than for the
#' getChromPeakData: mz is the mz closest to the median mz of the feature and
#' maxo its intensity.
#'
#' @noRd
.getMSWPeakData <- function(object, peakArea, sample_idx,
                              cn = c("mz", "rt", "into", "maxo", "sample")) {
    if (length(fileNames(object)) != 1)
        stop("'object' should be an XCMSnExp for a single file!")
    ncols <- length(cn)
    res <- matrix(ncol = ncols, nrow = nrow(peakArea))
    colnames(res) <- cn
    res[, "sample"] <- sample_idx
    res[, "rt"] <- -1
    res[, "rtmin"] <- -1
    res[, "rtmax"] <- -1
    res[, c("mzmin", "mzmax")] <- peakArea[, c("mzmin", "mzmax")]
    ## Load the data
    message("Requesting ", nrow(res), " peaks from ",
             basename(fileNames(object)), " ... ", appendLF = FALSE)
    spctr <- spectra(object, BPPARAM = SerialParam())
    mzs <- lapply(spctr, mz)
    valsPerSpect <- lengths(mzs)
    ints <- unlist(lapply(spctr, intensity), use.names = FALSE)
    rm(spctr)
    mzs <- unlist(mzs, use.names = FALSE)
    for (i in 1:nrow(res)) {
        mz_area <- which(mzs >= peakArea[i, "mzmin"] &
                         mzs <= peakArea[i, "mzmax"])
        ## Alternative version from original code: but this can also pick up
        ## mzs from outside of the range! See also comments on issue #130
        ## mz_area <- seq(which.min(abs(mzs - peakArea[i, "mzmin"])),
        ##                which.min(abs(mzs - peakArea[i, "mzmax"])))
        mtx <- cbind(time = -1, mz = mzs[mz_area], intensity = ints[mz_area])
        ## mtx <- xcms:::.rawMat(mz = mzs, int = ints, scantime = rtime(object),
        ##                valsPerSpect = valsPerSpect,
        ##                mzrange = peakArea[i, c("mzmin", "mzmax")])
        if (length(mz_area)) {
            if (!all(is.na(mtx[, 3]))) {
                ## How to calculate the area: (1)sum of all intensities
                res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE)
                ## Get the index of the mz value(s) closest to the mzmed of the
                ## feature
                mzDiff <- abs(mtx[, 2] - peakArea[i, "mzmed"])
                mz_idx <- which(mzDiff == min(mzDiff))
                ## Now get the one with the highest intensity.
                maxi <- mz_idx[which.max(mtx[mz_idx, 3])]
                ## Return these.
                res[i, c("mz", "maxo")] <- mtx[maxi, 2:3]
                ## ## mz should be the weighted mean!
                ## res[i, c("mz", "maxo")] <- c(weighted.mean(mtx[, 2], mtx[, 3]),
                ##                              mtx[maxi[1], 3])
            } else {
                res[i, ] <- rep(NA_real_, ncols)
            }
        } else {
            res[i, ] <- rep(NA_real_, ncols)
        }
    }
    message("got ", sum(!is.na(res[, "into"])), ".")
    return(res)
}
## The same version as above, but the maxo is the maximum signal of the peak,
## and the mz the intensity weighted mean mz.
.getMSWPeakData2 <- function(object, peakArea, sample_idx,
                              cn = c("mz", "rt", "into", "maxo", "sample")) {
    if (length(fileNames(object)) != 1)
        stop("'object' should be an XCMSnExp for a single file!")
    ncols <- length(cn)
    res <- matrix(ncol = ncols, nrow = nrow(peakArea))
    colnames(res) <- cn
    res[, "sample"] <- sample_idx
    res[, "rt"] <- -1
    res[, "rtmin"] <- -1
    res[, "rtmax"] <- -1
    res[, c("mzmin", "mzmax")] <- peakArea[, c("mzmin", "mzmax")]
    ## Load the data
    message("Reguesting ", nrow(res), " peaks from ",
             basename(fileNames(object)), " ... ", appendLF = FALSE)
    spctr <- spectra(object, BPPARAM = SerialParam())
    mzs <- lapply(spctr, mz)
    valsPerSpect <- lengths(mzs)
    ints <- unlist(lapply(spctr, intensity), use.names = FALSE)
    rm(spctr)
    mzs <- unlist(mzs, use.names = FALSE)
    for (i in 1:nrow(res)) {
        mtx <- .rawMat(mz = mzs, int = ints, scantime = rtime(object),
                       valsPerSpect = valsPerSpect,
                       mzrange = peakArea[i, c("mzmin", "mzmax")])
        if (length(mtx)) {
            if (!all(is.na(mtx[, 3]))) {
                ## How to calculate the area: (1)sum of all intensities
                res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE)
                res[i, c("mz", "maxo")] <- c(weighted.mean(mtx[, 2], mtx[, 3]),
                                             max(mtx[, 3], na.rm = TRUE))
            } else {
                res[i, ] <- rep(NA_real_, ncols)
            }
        } else {
            res[i, ] <- rep(NA_real_, ncols)
        }
    }
    message("got ", sum(!is.na(res[, "into"])), ".")
    return(res)
}

## Same as .getChromPeakData but for matchedFilter, i.e. using the profile
## matrix instead of the original signal.
.getChromPeakData_matchedFilter <- function(object, peakArea, sample_idx,
                                            mzCenterFun = "weighted.mean",
                                            param = MatchedFilterParam(),
                                            cn = c("mz", "rt", "into", "maxo",
                                                   "sample"), msLevel = 1L) {
    if (length(fileNames(object)) != 1)
        stop("'object' should be an XCMSnExp for a single file!")
    ncols <- length(cn)
    res <- matrix(ncol = ncols, nrow = nrow(peakArea))
    colnames(res) <- cn
    res[, "sample"] <- sample_idx
    res[, c("mzmin", "mzmax")] <-
        peakArea[, c("mzmin", "mzmax")]
    ## Load the data
    message("Requesting ", nrow(res), " peaks from ",
            basename(fileNames(object)), " ... ", appendLF = FALSE)
    object <- filterRt(
        object, rt = range(peakArea[, c("rtmin", "rtmax")]) + c(-2, 2))
    object <- filterMsLevel(object, msLevel)
    if (!length(object)) {
        message("FAIL: no data available for the requested m/z - rt range.")
        return(res)
    }
    spctr <- spectra(object, BPPARAM = SerialParam())
    ## Issue #653: empty spectra is not supported; this is the same fix applied
    ## to matchedFilter peak detection.
    spctr <- lapply(spctr, function(z) {
        if (!length(z@mz)) {
            z@mz <- 0.0
            z@intensity <- 0.0
        }
        z
    })
    mzs <- lapply(spctr, mz)
    vps <- lengths(mzs)
    ints <- unlist(lapply(spctr, intensity), use.names = FALSE)
    rm(spctr)
    mzs <- unlist(mzs, use.names = FALSE)
    rtim <- rtime(object)
    rtim_range <- range(rtim)
    ## Now, if we do have "distance" defined it get's tricky:
    basespc <- NULL
    if (length(distance(param)) > 0) {
        mass <- seq(floor(min(mzs) / binSize(param)) * binSize(param),
                    ceiling(max(mzs) / binSize(param)) * binSize(param),
                    by = binSize(param))
        bin_size <- (max(mass) - min(mass)) / (length(mass) - 1)
        basespc <- distance(param) * bin_size
    }
    ## Create the profile matrix:
    pMat <- .createProfileMatrix(mz = mzs, int = ints, valsPerSpect = vps,
                                 method = .impute2method(param),
                                 step = binSize(param),
                                 baselevel = baseValue(param),
                                 basespace = basespc,
                                 returnBreaks = TRUE,
                                 baseValue = NA)  # We want to return NA not 0
                                        # if nothing was found
    brks <- pMat$breaks
    pMat <- pMat$profMat  ## rows are masses, cols are retention times/scans.
    bin_size <- diff(brks[1:2])
    bin_half <- bin_size / 2
    ## Calculate the mean mass per bin using the breaks used for the binning.
    mass <- brks[-length(brks)] + bin_half ## midpoint for the breaks
    mass_range <- range(mass)

    for (i in seq_len(nrow(res))) {
        rtr <- peakArea[i, c("rtmin", "rtmax")]
        mzr <- peakArea[i, c("mzmin", "mzmax")]
        ## Ensure that the rt region is within the rtrange of the data.
        rtr[1] <- max(rtr[1], rtim_range[1])
        rtr[2] <- min(rtr[2], rtim_range[2])
        mzr[1] <- max(mzr[1], mass_range[1])
        mzr[2] <- min(mzr[2], mass_range[2])
        ## Get the index of rt in rtim that are within the rt range rtr
        ## range_rt <- c(min(which(rtim >= rtr[1])), max(which(rtim <= rtr[2])))
        ## range_rt <- c(which.min(abs(rtim - rtr[1])),
        ##               which.min(abs(rtim - rtr[2])))
        range_rt <- findRange(rtim, rtr, TRUE)
        idx_rt <- range_rt[1]:range_rt[2]
        ## Get the index of the mz in the data that are within the mz range.
        ##range_mz <- c(min(which(brks >= mzr[1])) - 1, max(which(brks <= mzr[2])))
        range_mz <- findRange(mass, c(mzr[1] - bin_half, mzr[2] + bin_half),
                              TRUE)
        idx_mz <- range_mz[1]:range_mz[2]

        if (length(idx_mz) > 0 & length(idx_rt) > 0) {
            intMat <- pMat[idx_mz, idx_rt, drop = FALSE]
            is_na <- is.na(intMat)
            if (all(is_na)) {
                res[i, ] <- rep(NA_real_, ncols)
                next
            }
            intMat_0 <- intMat
            intMat_0[is.na(intMat)] <- 0
            ## Calculate the mean mz value using a intensity weighted mean.
            mz_ints <- rowSums(intMat, na.rm = TRUE)
            ## mz_ints <- Biobase::rowMax(intMat)
            if (length(mz_ints) != length(idx_mz)) {
                ## Take from original code...
                warning("weighted.mean: x and weights have to have same length!")
                mz_ints <- rep(1, length(idx_mz))
            }
            ## mean mz: intensity weighted mean
            mean_mz <- weighted.mean(mass[idx_mz], mz_ints, na.rm = TRUE)
            if (is.nan(mean_mz) || is.na(mean_mz))
                mean_mz <- mean(mzr, na.rm = TRUE)
            res[i, "mz"] <- mean_mz
            ## mean rt: position of the maximum intensity (along rt.)
            rt_ints <- colMax(intMat_0, na.rm = TRUE)
            res[i, c("rt", "rtmin", "rtmax")] <-
                c(rtim[idx_rt][which.max(rt_ints)], rtr)
            ## maxo
            res[i, "maxo"] <- max(rt_ints, na.rm = TRUE)
            ## into
            rt_width <- diff(rtim[range_rt])/diff(range_rt)
            res[i, "into"] <- rt_width * sum(rt_ints, na.rm = TRUE)
        } else
            res[i, ] <- rep(NA_real_, ncols)
    }
    message("got ", sum(!is.na(res[, "into"])), ".")
    return(res)
}

.hasFilledPeaks <- function(object) {
    hasChromPeaks(object) & any(chromPeakData(object)$is_filled, na.rm = TRUE)
}

#' @description
#'
#' Simple helper function to extract the peakidx column from the
#' featureDefinitions DataFrame. The function ensures that the names of the
#' returned list correspond to the rownames of the DataFrame
#'
#' @noRd
.peakIndex <- function(object) {
    if (is.data.frame(object) || inherits(object, "DataFrame")) {
        idxs <- object$peakidx
        names(idxs) <- rownames(object)
    } else {
        if (!hasFeatures(object))
            stop("No feature definitions present. ",
                 "Please run 'groupChromPeaks' first.")
        idxs <- featureDefinitions(object)$peakidx
        names(idxs) <- rownames(featureDefinitions(object))
    }
    idxs
}

#' @rdname adjustRtime
adjustRtimePeakGroups <- function(object, param = PeakGroupsParam(),
                                  msLevel = 1L) {
    if (!(inherits(object, "XCMSnExp") | inherits(object, "XcmsExperiment")))
        stop("'object' has to be an 'XCMSnExp' or 'XcmsExperiment' object.")
    if (!hasFeatures(object))
        stop("No features present. Please run 'groupChromPeaks' first.")
    if (hasAdjustedRtime(object))
        warning("Alignment/retention time correction was already performed, ",
                "returning a matrix with adjusted retention times.")
    subs <- subset(param)
    if (!length(subs))
        subs <- seq_along(fileNames(object))
    nSamples <- length(subs)
    missingSample <- nSamples - (nSamples * minFraction(param))
    pkGrp <- .getPeakGroupsRtMatrix(
        peaks = chromPeaks(object, msLevel = msLevel),
        peakIndex = .peakIndex(
            .update_feature_definitions(
                featureDefinitions(object), rownames(chromPeaks(object)),
                rownames(chromPeaks(object, msLevel = msLevel)))),
        sampleIndex = subs,
        missingSample = missingSample,
        extraPeaks = extraPeaks(param)
    )
    colnames(pkGrp) <- basename(fileNames(object))[subs]
    pkGrp
}

.plotChromPeakDensity <- function(object, mz, rt, param, simulate = TRUE,
                                 col = "#00000080", xlab = "retention time",
                                 ylab = "sample", xlim = range(rt),
                                 main = NULL, type = c("any", "within",
                                                       "apex_within"), ...) {
    .Deprecated(
        msg = paste0("Use of 'plotChromPeakDensity' on 'XCMSnExp' is",
                     "discouraged. Please extract chromatographic ",
                     "data first and call 'plotChromPeakDensity' ",
                     "directly on the 'XChromatograms' object. See ",
                     "?XChromatograms, section 'Correspondence ",
                     "analysis' for more details."))
    type <- match.arg(type)
    if (missing(object))
        stop("Required parameter 'object' is missing")
    if (!is(object, "XCMSnExp"))
        stop("'object' must be an XCMSnExp object")
    if (!hasChromPeaks(object))
        stop("No chromatographic peaks present in 'object'")
    if (missing(mz))
        mz <- c(-Inf, Inf)
    if (missing(rt))
        rt <- range(rtime(object))
    if (!simulate & !hasFeatures(object)) {
        warning("Falling back to 'simulate = TRUE' because no correspondence ",
                "results are present")
        simulate <- TRUE
    }
    if (missing(param)) {
        ## Use the internal parameter - if available.
        if (hasFeatures(object)) {
            ph <- processHistory(object, type = .PROCSTEP.PEAK.GROUPING)
            param <- processParam(ph[[length(ph)]])
        } else {
            param = PeakDensityParam(
                sampleGroups = rep(1, length(fileNames(object))))
        }
    }
    if (!is(param, "PeakDensityParam"))
        stop("'param' has to be a 'PeakDensityParam'")
    mz <- range(mz)
    rt <- range(rt)
    ## Get all the data we require.
    nsamples <- length(fileNames(object))
    if (length(col) != nsamples)
        col <- rep_len(col[1], nsamples)
    pks <- chromPeaks(object, mz = mz, rt = rt, type = type, msLevel = 1L)
    if (nrow(pks)) {
        ## Extract parameters from the param object
        bw = bw(param)
        ## Ensure the density is calculated exactly as it would in the real
        ## case.
        full_rt_range <- range(chromPeaks(object)[, "rt"])
        dens_from <- full_rt_range[1] - 3 * bw
        dens_to <- full_rt_range[2] + 3 * bw
        ## That's Jan Stanstrup's fix (issue #161).
        densN <- max(512, 2 * 2^(ceiling(log2(diff(full_rt_range) / (bw / 2)))))
        sample_groups <- sampleGroups(param)
        if (length(sample_groups) == 0)
            sample_groups <- rep(1, nsamples)
        if (length(sample_groups) != nsamples)
            stop("If provided, the 'sampleGroups' parameter in the 'param' ",
                 "class has to have the same length than there are samples ",
                 "in 'object'")
        sample_groups_table <- table(sample_groups)
        dens <- density(pks[, "rt"], bw = bw, from = dens_from, to = dens_to,
                        n = densN)
        yl <- c(0, max(dens$y))
        ypos <- seq(from = yl[1], to = yl[2], length.out = nsamples)
        if (is.null(main))
            main <- paste0(format(mz, digits = 7), collapse = " - ")
        ## Plot the peaks as points.
        ## Fix assignment of point types for each sample.
        dots <- list(...)
        if (any(names(dots) == "bg")) {
            bg <- dots$bg
            if (length(bg) != nsamples)
                bg <- rep_len(bg[1], nsamples)
            dots$bg <- bg[pks[, "sample"]]
        }
        if (any(names(dots) == "pch")) {
            pch <- dots$pch
            if (length(pch) != nsamples)
                pch <- rep_len(pch[1], nsamples)
            dots$pch <- pch[pks[, "sample"]]
        }
        do.call("plot", args = c(list(x = pks[, "rt"],
                                      y = ypos[pks[, "sample"]], xlim = xlim,
                                      col = col[pks[, "sample"]], xlab = xlab,
                                      yaxt = "n", ylab = ylab,
                                      main = main, ylim = yl), dots))
        axis(side = 2, at = ypos, labels = 1:nsamples)
        points(x = dens$x, y = dens$y, type = "l")
        ## Estimate what would be combined to a feature
        ## Code is taken from do_groupChromPeaks_density
        dens_max <- max(dens$y)
        dens_y <- dens$y
        if (simulate) {
            snum <- 0
            while(dens_y[max_y <- which.max(dens_y)] > dens_max / 20 &&
                  snum < maxFeatures(param)) {
                      feat_range <- descendMin(dens_y, max_y)
                      dens_y[feat_range[1]:feat_range[2]] <- 0
                      feat_idx <- which(pks[, "rt"] >= dens$x[feat_range[1]] &
                                        pks[, "rt"] <= dens$x[feat_range[2]])
                      tt <- table(sample_groups[pks[feat_idx, "sample"]])
                      if (!any(tt / sample_groups_table[names(tt)] >=
                               minFraction(param) & tt >= minSamples(param)))
                          next
                      rect(xleft = min(pks[feat_idx, "rt"]), ybottom = yl[1],
                           xright = max(pks[feat_idx, "rt"]), ytop = yl[2],
                           border = "#00000040", col = "#00000020")
                  }
        } else {
            ## Plot all features in the region.
            fts <- featureDefinitions(object, mz = mz, rt = rt, type = type)
            if (nrow(fts)) {
                rect(xleft = fts$rtmin, xright = fts$rtmax,
                     ybottom = rep(yl[1], nrow(fts)),
                     ytop = rep(yl[2], nrow(fts)),
                     border = "#00000040", col = "#00000020")
                abline(v = fts$rtmed, col = "#00000040", lty = 2)
            }
        }
    } else {
        plot(3, 3, pch = NA, xlim = rt, xlab = xlab,
             main = paste0(format(mz, digits = 7), collapse = " - "))
    }
}

#' @title Add definition of chromatographic peaks to an extracted chromatogram
#'     plot
#'
#' @description
#'
#' The \code{highlightChromPeaks} function adds chromatographic
#' peak definitions to an existing plot, such as one created by the
#' \code{plot} method on a \code{\link{Chromatogram}} or
#' \code{\link{MChromatograms}} object.
#'
#' @param x For \code{highlightChromPeaks}: \code{XCMSnExp} object with the
#'     detected peaks.
#'
#' @param rt For \code{highlightChromPeaks}: \code{numeric(2)} with the
#'     retention time range from which peaks should be extracted and plotted.
#'
#' @param mz \code{numeric(2)} with the mz range from which the peaks should
#'     be extracted and plotted.
#'
#' @param peakIds \code{character} defining the IDs (i.e. rownames of the peak
#'     in the \code{chromPeaks} table) of the chromatographic peaks to be
#'     highlighted in a plot.
#'
#' @param border colors to be used to color the border of the rectangles/peaks.
#'     Has to be equal to the number of samples in \code{x}.
#'
#' @param lwd \code{numeric(1)} defining the width of the line/border.
#'
#' @param col For \code{highlightChromPeaks}: color to be used to fill the
#'     rectangle (if \code{type = "rect"}) or the peak
#'     (for \code{type = "polygon"}).
#'
#' @param type the plotting type. See \code{\link{plot}} in base grapics for
#'     more details.
#'     For \code{highlightChromPeaks}: \code{character(1)} defining how the peak
#'     should be highlighted: \code{type = "rect"} draws a rectangle
#'     representing the peak definition, \code{type = "point"} indicates a
#'     chromatographic peak with a single point at the position of the peak's
#'     \code{"rt"} and \code{"maxo"} and \code{type = "polygon"} will highlight
#'     the peak shape. For \code{type = "polygon"} the color of the border and
#'     area can be defined with parameters \code{"border"} and \code{"col"},
#'     respectively.
#'
#' @param whichPeaks \code{character(1)} specifying how peaks are called to be
#'     located within the region defined by \code{mz} and \code{rt}. Can be
#'     one of \code{"any"}, \code{"within"}, and \code{"apex_within"} for all
#'     peaks that are even partially overlapping the region, peaks that are
#'     completely within the region, and peaks for which the apex is within
#'     the region. This parameter is passed to the \code{type} argument of the
#'     \code{\link{chromPeaks}} function. See related documentation for more
#'     information and examples.
#'
#' @param ... additional parameters to the \code{\link{matplot}} or \code{plot}
#'     function.
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' ## Load a test data set with detected peaks
#' library(MSnbase)
#' data(faahko_sub)
#' ## Update the path to the files for the local system
#' dirname(faahko_sub) <- system.file("cdf/KO", package = "faahKO")
#'
#' ## Disable parallel processing for this example
#' register(SerialParam())
#'
#' ## Extract the ion chromatogram for one chromatographic peak in the data.
#' chrs <- chromatogram(faahko_sub, rt = c(2700, 2900), mz = 335)
#'
#' plot(chrs)
#'
#' ## Extract chromatographic peaks for the mz/rt range (if any).
#' chromPeaks(faahko_sub, rt = c(2700, 2900), mz = 335)
#'
#' ## Highlight the chromatographic peaks in the area
#' ## Show the peak definition with a rectangle
#' highlightChromPeaks(faahko_sub, rt = c(2700, 2900), mz = 335)
#'
#' ## Color the actual peak
#' highlightChromPeaks(faahko_sub, rt = c(2700, 2900), mz = 335,
#'     col = c("#ff000020", "#00ff0020"), type = "polygon")
highlightChromPeaks <- function(x, rt, mz, peakIds = character(),
                                border = rep("00000040", length(fileNames(x))),
                                lwd = 1, col = NA,
                                type = c("rect", "point", "polygon"),
                                whichPeaks = c("any", "within", "apex_within"),
                                ...) {
    type <- match.arg(type)
    msLevel <- 1L
    whichPeaks <- match.arg(whichPeaks)
    n_samples <- length(fileNames(x))
    if (!hasChromPeaks(x))
        stop("'x' does not contain detected peaks")
    if (length(peakIds)) {
        if (!missing(rt) | !missing(mz))
            warning("Ignoring 'rt' and 'mz' because peakIds were provided")
        if (!all(peakIds %in% rownames(chromPeaks(x, msLevel = msLevel))))
            stop("'peakIds' do not match rownames of 'chromPeaks(x)'")
        rt <- range(chromPeaks(x)[peakIds, c("rtmin", "rtmax")])
        mz <- range(chromPeaks(x)[peakIds, c("mzmin", "mzmax")])
    }
    if (missing(rt))
        rt <- c(-Inf, Inf)
    if (missing(mz))
        mz <- c(-Inf, Inf)
    if (!is(x, "XCMSnExp"))
        stop("'x' has to be a XCMSnExp object")
    if (length(peakIds))
        pks <- chromPeaks(x)[peakIds, , drop = FALSE]
    else pks <- chromPeaks(x, rt = rt, mz = mz, ppm = 0, type = whichPeaks,
                           msLevel = msLevel)
    if (length(col) != n_samples)
        col <- rep(col[1], n_samples)
    if (length(border) != n_samples)
        border <- rep(border[1], n_samples)
    if (length(pks)) {
        switch(type,
               rect = rect(xleft = pks[, "rtmin"], xright = pks[, "rtmax"],
                           ybottom = rep(0, nrow(pks)), ytop = pks[, "maxo"],
                           border = border[pks[, "sample"]], lwd = lwd,
                           col = col[pks[, "sample"]]),
               point = {
                   ## Fix assignment of point types for each sample.
                   dots <- list(...)
                   if (any(names(dots) == "bg")) {
                       bg <- dots$bg
                       if (length(bg) != n_samples)
                           bg <- rep_len(bg[1], n_samples)
                       dots$bg <- bg[pks[, "sample"]]
                   }
                   if (any(names(dots) == "pch")) {
                       pch <- dots$pch
                       if (length(pch) != n_samples)
                           pch <- rep_len(pch[1], n_samples)
                       dots$pch <- pch[pks[, "sample"]]
                   }
                   if (any(is.na(col)))
                       col <- border
                       ## Draw a point at the position defined by the "rt" column
                       do.call("points",
                               args = c(list(x = pks[, "rt"],
                                             y = pks[, "maxo"],
                                             col = col[pks[, "sample"]]), dots))
               },
               polygon = {
                   if (nrow(pks)) {
                       chrs <- chromatogram(
                           x, rt = range(pks[, c("rtmin", "rtmax")]), mz = mz)
                       pks <- pks[order(pks[, "maxo"], decreasing = TRUE), ,
                                  drop = FALSE]
                       for (j in seq_len(nrow(pks))) {
                           i <- pks[j, "sample"]
                           chr <- filterRt(chrs[1, i],
                                           rt = pks[j, c("rtmin", "rtmax")])
                           xs <- rtime(chr)
                           xs <- c(xs, xs[length(xs)], xs[1])
                           ys <- c(intensity(chr), 0, 0)
                           nona <- !is.na(ys)
                           polygon(xs[nona], ys[nona], border = border[i],
                                   col = col[i])
                       }
                   }
               })
    }
}

#' @rdname calibrate-calibrant-mass
#'
#' @description
#'
#' The `isCalibrated` function returns `TRUE` if chromatographic
#' peaks of the [XCMSnExp] object `x` were calibrated and `FALSE` otherwise.
#'
#' @md
isCalibrated <- function(object) {
    if (length(processHistory(object, type = .PROCSTEP.CALIBRATION)))
        TRUE
    else
        FALSE
}

#' @title Replace raw with adjusted retention times
#'
#' @description
#'
#' Replaces the raw retention times with the adjusted retention
#' time or returns the object unchanged if none are present.
#'
#' @details
#'
#' Adjusted retention times are stored *in parallel* to the adjusted
#' retention times in `XCMSnExp` or `XcmsExperiment` objects. The
#' `applyAdjustedRtime` replaces the raw (original) retention times with the
#' adjusted retention times.
#'
#' @note
#'
#' Replacing the raw retention times with adjusted retention times
#' disables the possibility to restore raw retention times using the
#' [dropAdjustedRtime()] method. This function does **not** remove the
#' retention time processing step with the settings of the alignment from
#' the [processHistory()] of the `object` to ensure that the processing
#' history is preserved.
#'
#' @param object An [XCMSnExp] or [XcmsExperiment] object.
#'
#' @md
#'
#' @return
#'
#' An `XCMSnExp` or `XcmsExperiment` object with the raw (original) retention
#' times being replaced with the adjusted retention time.
#'
#' @author Johannes Rainer
#'
#' @seealso [adjustRtime()] for the function to perform the alignment (retention
#'     time correction).
#'
#'     [adjustedRtime()] for the method to extract adjusted retention times from
#'     an [XCMSnExp] object.
#'
#'     [dropAdjustedRtime] for the method to delete alignment results and to
#'     restore the raw retention times.
#'
#' @examples
#'
#' ## Load a test data set with detected peaks
#' library(MSnbase)
#' data(faahko_sub)
#' ## Update the path to the files for the local system
#' dirname(faahko_sub) <- system.file("cdf/KO", package = "faahKO")
#'
#' ## Disable parallel processing for this example
#' register(SerialParam())
#'
#' xod <- adjustRtime(faahko_sub, param = ObiwarpParam())
#'
#' hasAdjustedRtime(xod)
#'
#' ## Replace raw retention times with adjusted retention times.
#' xod <- applyAdjustedRtime(xod)
#'
#' ## No adjusted retention times present
#' hasAdjustedRtime(xod)
#'
#' ## Raw retention times have been replaced with adjusted retention times
#' plot(split(rtime(faahko_sub), fromFile(faahko_sub))[[1]] -
#'     split(rtime(xod), fromFile(xod))[[1]], type = "l")
#'
#' ## And the process history still contains the settings for the alignment
#' processHistory(xod)
applyAdjustedRtime <- function(object) {
    if (inherits(object, "XCMSnExp")) {
        if (!hasAdjustedRtime(object))
            return(object)
        ## Replace retention times.
        fData(object)$retentionTime <- rtime(object, adjusted = TRUE)
        ## Copy the data
        newFd <- new("MsFeatureData")
        newFd@.xData <- .copy_env(object@msFeatureData)
        rm(list = "adjustedRtime", envir = newFd)
        object@msFeatureData <- newFd
        object
    } else if (inherits(object, "XcmsExperiment")) {
        if (!hasAdjustedRtime(object))
            return(object)
        rtime(object@spectra) <- object@spectra$rtime_adjusted
        svs <- unique(c(spectraVariables(object@spectra), "mz", "intensity"))
        object@spectra <- selectSpectraVariables(
            object@spectra, svs[svs != "rtime_adjusted"])
        object
    } else stop("'object' has to be either an 'XCMSnExp' or ",
                "'XcmsExperiment' object")
}

## Find mz ranges with multiple peaks per sample.
## Use the density distribution for that? with a bandwidth = 0.001, check
## density method for that...


.concatenate_XCMSnExp <- function(...) {
    x <- list(...)
    if (length(x) == 0)
        return(NULL)
    if (length(x) == 1)
        return(x[[1]])
    ## Check that all are XCMSnExp objects.
    if (!all(unlist(lapply(x, function(z) is(z, "XCMSnExp")))))
        stop("All passed objects should be 'XCMSnExp' objects")
    new_x <- as(.concatenate_OnDiskMSnExp(...), "XCMSnExp")
    ## If any of the XCMSnExp has alignment results or detected features drop
    ## them!
    x <- lapply(x, function(z) {
        if (hasAdjustedRtime(z)) {
            z <- dropAdjustedRtime(z)
            warning("Adjusted retention times found, had to drop them.")
        }
        if (hasFeatures(z)) {
            z <- dropFeatureDefinitions(z)
            warning("Feature definitions found, had to drop them.")
        }
        z
    })
    ## Combine peaks
    fls <- lapply(x, fileNames)
    startidx <- cumsum(lengths(fls))
    pks <- lapply(x, chromPeaks)
    procH <- lapply(x, processHistory)
    for (i in 2:length(fls)) {
        pks[[i]][, "sample"] <- pks[[i]][, "sample"] + startidx[i - 1]
        procH[[i]] <- lapply(procH[[i]], function(z) {
            z@fileIndex <- as.integer(z@fileIndex + startidx[i - 1])
            z
            })
    }
    pks <- do.call(rbind, pks)
    rownames(pks) <- NULL
    new_x@.processHistory <- unlist(procH)
    chromPeaks(new_x) <- pks
    rm(pks)
    cpd <- do.call(rbind, lapply(x, chromPeakData))
    rownames(cpd) <- rownames(chromPeaks(new_x))
    chromPeakData(new_x) <- cpd
    validObject(new_x)
    new_x
}

#' @title Simple feature summaries
#'
#' @description
#'
#' Simple function to calculate feature summaries. These include counts and
#' percentages of samples in which a chromatographic peak is present for each
#' feature and counts and percentages of samples in which more than one
#' chromatographic peak was annotated to the feature. Also relative standard
#' deviations (RSD) are calculated for the integrated peak areas per feature
#' across samples. For `perSampleCounts = TRUE` also the individual
#' chromatographic peak counts per sample are returned.
#'
#' @param x [XcmsExperiment()] or [XCMSnExp()] object with correspondence
#'     results.
#'
#' @param group `numeric`, `logical`, `character` or `factor` with the same
#'     length than `x` has samples to aggregate counts by the groups defined
#'     in `group`.
#'
#' @param perSampleCounts `logical(1)` whether feature wise individual peak
#'     counts per sample should be returned too.
#'
#' @param method `character` passed to the [featureValues()] function. See
#'     respective help page for more information.
#'
#' @param skipFilled `logical(1)` whether filled-in peaks should be excluded
#'     (default) or included in the summary calculation.
#'
#' @return
#'
#' `matrix` with one row per feature and columns:
#'
#' - `"count"`: the total number of samples in which a peak was found.
#' - `"perc"`: the percentage of samples in which a peak was found.
#' - `"multi_count"`: the total number of samples in which more than one peak
#'   was assigned to the feature.
#' - `"multi_perc"`: the percentage of those samples in which a peak was found,
#'   that have also multiple peaks annotated to the feature. Example: for a
#'   feature, at least one peak was detected in 50 samples. In 5 of them 2 peaks
#'   were assigned to the feature. `"multi_perc"` is in this case 10%.
#' - `"rsd"`: relative standard deviation (coefficient of variation) of the
#'   integrated peak area of the feature's peaks.
#' - The same 4 columns are repeated for each unique element (level) in `group`
#'   if `group` was provided.
#'
#' If `perSampleCounts = TRUE` also one column for each sample is returned
#' with the peak counts per sample.
#'
#' @author Johannes Rainer
featureSummary <- function(x, group, perSampleCounts = FALSE,
                           method = "maxint", skipFilled = TRUE) {
    if (!(is(x, "XCMSnExp") | inherits(x, "XcmsExperiment")))
        stop("'x' is expected to be an 'XcmsExperiment' or 'XCMSnExp' object")
    if (!hasFeatures(x))
        stop("No feature definitions found in 'x'. Please perform first a ",
             "correspondence analysis with the 'groupChromPeaks' function.")
    if (!missing(group)) {
        if (length(group) != length(fileNames(x)))
            stop("length of 'group' does not match the number of ",
                 "samples in 'x'")
    }
    if (skipFilled && .hasFilledPeaks(x))
        x <- dropFilledChromPeaks(x)
    ## First determine the number of peaks per sample
    smpls <- seq_along(fileNames(x))
    pks_per_sample <- lapply(featureDefinitions(x)$peakidx, function(z)
        as.numeric(table(factor(chromPeaks(x)[z, "sample"],
                                levels = smpls))))
    pks_per_sample <- do.call(rbind, pks_per_sample)
    rownames(pks_per_sample) <- rownames(featureDefinitions(x))
    sum_fun <- function(z) {
        cnt <- sum(z > 0)
        cnt_multi <- sum(z > 1)
        sm <- c(count = cnt,
                 perc = cnt * 100 / length(z),
                 multi_count = cnt_multi,
                 multi_perc = cnt_multi * 100 / cnt
                 )
        sm[is.na(sm)] <- 0
        sm
    }
    fts_sum <- t(apply(pks_per_sample, MARGIN = 1, sum_fun))
    ## Calculate RSD
    rsd <- function(z) {sd(z, na.rm = TRUE) / abs(mean(z, na.rm = TRUE))}
    fvals <- featureValues(x, value = "into", method = method)
    fts_sum <- cbind(fts_sum, rsd = apply(fvals, MARGIN = 1, rsd))
    if (!missing(group)) {
        per_group <- lapply(unique(group), function(grp) {
            idx <- which(group == grp)
            res <- cbind(t(apply(pks_per_sample[, idx, drop = FALSE],
                                 MARGIN = 1, sum_fun)),
                         rsd = apply(fvals[, idx, drop = FALSE], MARGIN = 1,
                                     rsd))
            colnames(res) <- paste0(grp, "_", colnames(res))
            res
        })
        fts_sum <- cbind(fts_sum, do.call(cbind, per_group))
    }
    if (perSampleCounts) {
        colnames(pks_per_sample) <- basename(fileNames(x))
        fts_sum <- cbind(fts_sum, pks_per_sample)
    }
    fts_sum
}

#' @title Identify overlapping features
#'
#' @description
#'
#' `overlappingFeatures` identifies features that are overlapping or close in
#' the m/z - rt space.
#'
#' @param x [XcmsExperiment()] or [XCMSnExp()] object with the features.
#'
#' @param expandMz `numeric(1)` with the value to expand each feature (on each
#'     side) in m/z dimension before identifying overlapping features.
#'     The resulting `"mzmin"` for the feature is thus `mzmin - expandMz` and
#'     the `"mzmax"` `mzmax + expandMz`.
#'
#' @param expandRt `numeric(1)` with the value to expand each feature (on each
#'     side) in retention time dimension before identifying overlapping
#'     features. The resulting `"rtmin"` for the
#'     feature is thus `rtmin - expandRt` and the `"rtmax"` `rtmax + expandRt`.
#'
#' @param ppm `numeric(1)` to grow the m/z width of the feature by a relative
#'     value: `mzmin - mzmin * ppm / 2e6`, `mzmax + mzmax * ppm / 2e6`. Each
#'     feature is thus expanded in m/z dimension by ppm/2 on each side before
#'     identifying overlapping features.
#'
#' @return `list` with indices of features (in [featureDefinitions()]) that
#'     are overlapping.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' ## Load a test data set with detected peaks
#' library(MSnbase)
#' data(faahko_sub)
#' ## Update the path to the files for the local system
#' dirname(faahko_sub) <- system.file("cdf/KO", package = "faahKO")
#'
#' ## Disable parallel processing for this example
#' register(SerialParam())
#'
#' ## Correspondence analysis
#' xdata <- groupChromPeaks(faahko_sub, param = PeakDensityParam(sampleGroups = c(1, 1, 1)))
#'
#' ## Identify overlapping features
#' overlappingFeatures(xdata)
#'
#' ## Identify features that are separated on retention time by less than
#' ## 2 minutes
#' overlappingFeatures(xdata, expandRt = 60)
overlappingFeatures <- function(x, expandMz = 0, expandRt = 0, ppm = 0) {
    if (!(is(x, "XCMSnExp") | is(x, "XcmsExperiment")))
        stop("'x' is expected to be an 'XcmsExperiment' or 'XCMSnExp' object")
    if (!hasFeatures(x))
        stop("No feature definitions found in 'x'. Please perform first a ",
             "correspondence analysis with the 'groupChromPeaks' function.")
    xl <- featureDefinitions(x)$rtmin
    xr <- featureDefinitions(x)$rtmax
    yb <- featureDefinitions(x)$mzmin
    yt <- featureDefinitions(x)$mzmax
    ## Expand them?
    if (expandMz != 0) {
        yb <- yb - expandMz
        yt <- yt + expandMz
    }
    if (ppm != 0) {
        yb <- yb - yb * ppm / 2e6
        yt <- yt + yt * ppm / 2e6
    }
    if (expandRt != 0) {
        rng <- range(rtime(x))
        xl <- xl - expandRt
        xl[xl < rng[1]] <- rng[1]
        xr <- xr + expandRt
        xr[xr > rng[2]] <- rng[2]
    }
    .rect_overlap(xleft = xl, xright = xr, ybottom = yb, ytop = yt)
}


#' @title Export data for use in MetaboAnalyst
#'
#' @description
#'
#' Export the feature table for further analysis in the MetaboAnalyst
#' software (or the `MetaboAnalystR` R package).
#'
#' @param x [XCMSnExp] object with identified chromatographic peaks grouped
#'     across samples.
#'
#' @param file `character(1)` defining the file name. If not specified, the
#'     `matrix` with the content is returned.
#'
#' @param label either `character(1)` specifying the phenodata column in `x`
#'     defining the sample grouping or a vector with the same length than
#'     samples in `x` defining the group assignment of the samples.
#'
#' @param value `character(1)` specifying the value to be returned for each
#'     feature. See [featureValues()] for more details.
#'
#' @param digits `integer(1)` defining the number of significant digits to be
#'     used for numeric. The default `NULL` uses `getOption("digits")`. See
#'     [format()] for more information.
#'
#' @param groupnames `logical(1)` whether row names of the resulting matrix
#'     should be the feature IDs (`groupnames = FALSE`; default) or IDs that
#'     are composed of the m/z and retention time of the features (in the
#'     format `M<m/z>T<rt>` (`groupnames = TRUE`). See help of the [groupnames]
#'     function for details.
#'
#' @param ... additional parameters to be passed to the [featureValues()]
#'     function.
#'
#' @return If `file` is not specified, the function returns the `matrix` in
#'     the format supported by MetaboAnalyst.
#'
#' @export
#'
#' @author Johannes Rainer
#'
#' @md
exportMetaboAnalyst <- function(x, file = NULL, label,
                                value = "into", digits = NULL,
                                groupnames = FALSE, ...) {
    if (!is(x, "XCMSnExp"))
        stop("'x' is supposed to be an XCMSnExp object")
    fv <- featureValues(x, value = value, ...)
    nas <- is.na(fv)
    fv <- format(fv, trim = TRUE, digits = digits)
    fv[nas] <- NA
    if (missing(label))
        stop("Please provide the group assignment of the samples with the ",
             "'label' parameter")
    if (groupnames)
        rownames(fv) <- groupnames(x)
    if (length(label) == 1) {
        if (any(colnames(pData(x)) == label))
            label <- as.character(pData(x)[, label])
        else
            stop("No pheno data column \"", label, "\" found. If length ",
                 "'label' is 1 it is expected to specify the column in ",
                 "the object's phenodata data.frame containing the",
                 " group assignment.")
    }
    if (length(label) != ncol(fv))
        stop("Length of 'label' has to match the number of samples (",
             ncol(fv), ").")
    fv <- rbind(Sample = colnames(fv),
                Label = as.character(label),
                fv)
    if (!is.null(file))
        write.table(fv, file = file, sep = ",",
                    qmethod = "double", col.names = FALSE, row.names = TRUE)
    else
        fv
}

#' @description
#'
#' Identifies for all peaks of an `XCMSnExp` object MS2 spectra and returns
#' them. Identification is performed separately (but parallel) for each file.
#'
#' @author Johannes Rainer
#'
#' @noRd
ms2_mspectrum_for_all_peaks <- function(x, expandRt = 0, expandMz = 0,
                                  ppm = 0, method = c("all",
                                                      "closest_rt",
                                                      "closest_mz",
                                                      "signal"),
                                  skipFilled = FALSE, subset = NULL,
                                  BPPARAM = bpparam()) {
    ## DEPRECATE THIS IN BIOC3.14
    method <- match.arg(method)
    pks <- chromPeaks(x)
    if (ppm != 0)
        ppm <- pks[, "mz"] * ppm / 1e6
    if (expandMz != 0 || length(ppm) > 1) {
        pks[, "mzmin"] <- pks[, "mzmin"] - expandMz - ppm
        pks[, "mzmax"] <- pks[, "mzmax"] + expandMz + ppm
    }
    if (expandRt != 0) {
        pks[, "rtmin"] <- pks[, "rtmin"] - expandRt
        pks[, "rtmax"] <- pks[, "rtmax"] + expandRt
    }
    if (length(subset)) {
        if (!(min(subset) >= 1 && max(subset) <= nrow(pks)))
            stop("If 'subset' is defined it has to be >= 1 and <= ",
                 nrow(pks), ".")
        not_subset <- rep(TRUE, nrow(pks))
        not_subset[subset] <- FALSE
        pks[not_subset, "mz"] <- NA
        rm(not_subset)
    }
    if (skipFilled && any(chromPeakData(x)$is_filled))
        pks[chromPeakData(x)$is_filled, ] <- NA
    ## Split data per file
    file_factor <- factor(pks[, "sample"])
    peak_ids <- rownames(pks)
    pks <- split.data.frame(pks, f = file_factor)
    x <- .split_by_file2(
        x, msLevel. = 2L, subsetFeatureData = FALSE)[as.integer(levels(file_factor))]
    res <- bpmapply(ms2_mspectrum_for_peaks_from_file, x, pks,
                    MoreArgs = list(method = method), SIMPLIFY = FALSE,
                    USE.NAMES = FALSE, BPPARAM = BPPARAM)
    res <- unsplit(res, file_factor)
    names(res) <- peak_ids
    res
}

#' @description
#'
#' Identify for all chromatographic peaks of a single file MS2 spectra with
#' an precursor m/z within the peak's m/z and retention time within the peak's
#' retention time width and return them.
#'
#' @note
#'
#' If needed, the m/z and rt width of the peaks should be increased previously.
#' No MS2 spectra are identified for peaks with an `"mz"` of `NA`, thus, to
#' skip identification for some (e.g. filled-in) peaks their value in the
#' `"mz"` column of `pks` should be set to `NA` before passing the parameter
#' to the function.
#'
#' @param x `OnDiskMSnExp` with (only) MS2 spectra of a single file.
#'
#' @param pks `matrix` with chromatographic peaks of a single file.
#'
#' @param method `character` defining the method to optionally select a single
#'     MS2 spectrum for the peak.
#'
#' @return `list` with length equal to the number of rows of `pks` with
#'     `Spectrum2` objects
#'
#' @author Johannes Rainer
#'
#' @noRd
ms2_mspectrum_for_peaks_from_file <- function(x, pks, method = c("all",
                                                                 "closest_rt",
                                                                 "closest_mz",
                                                                 "signal")) {
    ## DEPRECATE THIS IN BIOC3.14
    res <- vector(mode = "list", nrow(pks))
    if (nrow(pks) == 0 || !any(msLevel(x) == 2))
        return(res)
    method <- match.arg(method)
    fromFile <- as.integer(pks[1, "sample"])
    sps <- spectra(x)
    pmz <- precursorMz(x)
    rtm <- rtime(x)
    for (i in 1:nrow(pks)) {
        if (is.na(pks[i, "mz"]))
            next
        idx <- which(pmz >= pks[i, "mzmin"] & pmz <= pks[i, "mzmax"] &
                     rtm >= pks[i, "rtmin"] & rtm <= pks[i, "rtmax"])
        if (length(idx)) {
            if (length(idx) > 1 & method != "all") {
                if (method == "closest_rt")
                    idx <- idx[order(abs(rtm[idx] - pks[i, "rt"]))][1]
                if (method == "closest_mz")
                    idx <- idx[order(abs(pmz[idx] - pks[i, "mz"]))][1]
                if (method == "signal") {
                    sps_sub <- sps[idx]
                    ints <- vapply(sps_sub, function(z) sum(intensity(z)),
                                   numeric(1))
                    idx <- idx[order(abs(ints - pks[i, "maxo"]))][1]
                }
                if (method == "largest_tic") {
                    sps_sub <- sps[idx]
                    ints <- vapply(sps_sub, function(z) sum(intensity(z)),
                                   numeric(1))
                    idx <- idx[order(ints, decreasing = TRUE)][1L]
                }
                if (method == "largest_bpi") {
                    sps_sub <- sps[idx]
                    ints <- vapply(sps_sub, function(z) max(intensity(z)),
                                   numeric(1))
                    idx <- idx[order(ints, decreasing = TRUE)][1L]
                }
            }
            res[[i]] <- lapply(sps[idx], function(z) {
                z@fromFile = fromFile
                z
            })
        }
    }
    names(res) <- rownames(pks)
    res
}

#' given an XCMSnExp this function identifies for each MS1 chromatographic
#' peak all MS2 spectra with a precursor m/z within the peak region and returns
#' a `Spectra` object with all of these spectra.
#'
#' @return a `list`, same length than there are `chromPeaks` with a `Spectra`
#'     in each, if found.
#'
#' @noRd
.spectra_for_peaks <- function(x, method = c("all", "closest_rt",
                                             "closest_mz", "signal",
                                             "largest_tic", "largest_bpi"),
                               msLevel = 2L, expandRt = 0, expandMz = 0,
                               ppm = 0, skipFilled = FALSE,
                               peaks = character()) {
    if (is(x, "XCMSnExp") && hasAdjustedRtime(x))
        fData(x)$retentionTime <- rtime(x)
    ## from_msl <- 1L
    method <- match.arg(method)
    if (msLevel == 1L && method %in% c("closest_mz", "signal")) {
        warning("method = \"closest_mz\" and method = \"signal\" are not",
                " supported for msLevel = 1. Changing to method = \"all\".")
        method <- "all"
    }
    pks <- as.data.frame(chromPeaks(x))[, c("mz", "mzmin", "mzmax", "rt",
                                            "rtmin", "rtmax", "maxo", "sample")]
    if (ppm != 0)
        expandMz <- expandMz + pks$mz * ppm / 1e6
    if (expandMz[1L] != 0) {
        pks$mzmin <- pks$mzmin - expandMz
        pks$mzmax <- pks$mzmax + expandMz
    }
    if (expandRt != 0) {
        pks$rtmin <- pks$rtmin - expandRt
        pks$rtmax <- pks$rtmax + expandRt
    }
    if (length(peaks)) {
        peaks <- .i2index(peaks, rownames(pks), "peaks")
        keep <- rep(FALSE, nrow(pks))
        keep[peaks] <- TRUE
    } else {
        keep <- rep(TRUE, nrow(pks))
        if (skipFilled && any(chromPeakData(x)$is_filled))
            keep <- !chromPeakData(x)$is_filled
        ## if (any(chromPeakData(x)$ms_level))
        ##     keep <- keep & chromPeakData(x)$ms_level == from_msl
    }
    ## maybe subset by peak ID.
    fns <- fileNames(x)
    res <- vector("list", length(keep))
    be <- new("MsBackendMzR")
    sps <- new("Spectra")
    for (i in which(keep)) {
        sel <- .fdata(x)$msLevel == msLevel &
               .fdata(x)$retentionTime >= pks$rtmin[i] &
               .fdata(x)$retentionTime <= pks$rtmax[i] &
               .fdata(x)$fileIdx == pks$sample[i]
        if (msLevel > 1L)
            sel <- sel &
                   .fdata(x)$precursorMZ >= pks$mzmin[i] &
                   .fdata(x)$precursorMZ <= pks$mzmax[i]
        fd <- .fdata(x)[which(sel), ]
        if (nrow(fd)) {
            fd$peak_index <- i
            fd$peak_id <- rownames(pks)[i]
            sp <- .fData2MsBackendMzR(fd, fns, be)
            sp <- switch(
                method,
                all = sp,
                closest_rt = {
                    sp[which.min(abs(pks$rt[i] - rtime(sp)))[1L]]
                },
                closest_mz = {
                    sp[which.min(abs(pks$mz[i] - precursorMz(sp)))[1L]]
                },
                signal = {
                    ints <- vapply(intensity(sp), sum, numeric(1))
                    sp[which.min(abs(ints - pks$maxo[i]))[1L]]
                },
                largest_tic = {
                    ints <- vapply(intensity(sp), sum, numeric(1))
                    sp[which.max(ints)[1L]]
                },
                largest_bpi = {
                    ints <- vapply(intensity(sp), max, numeric(1))
                    sp[which.max(ints)[1L]]
                })
            slot(sps, "backend", check = FALSE) <- sp
            res[[i]] <- sps
        }
    }
    names(res) <- rownames(pks)
    if (length(peaks))
        res[peaks]
    else res
}

#' For information and details see featureSpectra
#'
#' @noRd
ms2_mspectrum_for_features <- function(x, expandRt = 0, expandMz = 0, ppm = 0,
                                       skipFilled = FALSE, ...) {
    idxs <- featureDefinitions(x)$peakidx
    sp_pks <- ms2_mspectrum_for_all_peaks(x, expandRt = expandRt,
                                          expandMz = expandMz, ppm = ppm,
                                          skipFilled = skipFilled,
                                          subset = unique(unlist(idxs)), ...)
    res <- lapply(idxs, function(z) unlist(sp_pks[z]))
    names(res) <- rownames(featureDefinitions(x))
    res
}

#' get spectra per feature:
#' 1) call chromPeakSpectra to get spectra for all peaks (as List)
#' 2) iterate over features to extract all
#' @noRd
.spectra_for_features <- function(x, msLevel = 2L, expandRt = 0,
                                  expandMz = 0, ppm = 0, skipFilled = TRUE,
                                  features = character(), ...) {
    fids <- rownames(featureDefinitions(x))
    idx <- featureDefinitions(x)$peakidx
    if (length(features)) {
        findex <- .i2index(features, fids, "features")
        fids <- fids[findex]
        idx <- idx[findex]
    }
    ## Get spectra for all peaks of these features
    pkidx <- sort(unique(unlist(idx, use.names = FALSE)))
    peak_sp <- vector("list", nrow(chromPeaks(x)))
    peak_sp[pkidx] <- .spectra_for_peaks(
        x, msLevel = msLevel, expandRt = expandRt, expandMz = expandMz,
        ppm = ppm, skipFilled = skipFilled, peaks = pkidx, ...)
    res <- lapply(seq_along(fids), function(i) {
        z <- peak_sp[idx[[i]]]
        if (any(lengths(z))) {
            z <- Spectra::concatenateSpectra(z)
            z@backend@spectraData <- cbind(z@backend@spectraData,
                                           DataFrame(feature_id = fids[i]))
            z@processing <- character()
            z
        }
    })
    names(res) <- fids
    res
}

#' @description
#'
#' \code{hasFilledChromPeaks}: whether filled-in peaks are present or not.
#'
#' @rdname XCMSnExp-class
setMethod("hasFilledChromPeaks", "XCMSnExp", function(object) {
    .hasFilledPeaks(object)
})

#' Process the results from a peak detection in SWATH pockets.
#'
#' @param x `list` of `XCMSnExp` objects.
#'
#' @param msf `MsFeatureData` of the original object
#'
#' @param fileNames `character` with the file names of the original object. This
#'     is required to ensure that column `"sample"` in the chrom peaks matrix
#'     contains the correct indices.
#'
#' @return `MsFeatureData` with the `chromPeaks` and `chromPeakData` updated.
#'
#' @author Johannes Rainer
#'
#' @noRd
.swath_collect_chrom_peaks <- function(x, msf, fileNames) {
    pks <- do.call(rbind, lapply(x, function(z) {
        suppressWarnings(cpks <- .chromPeaks(z))
        if (!is.null(cpks) && nrow(cpks))
            cpks[, "sample"] <- match(fileNames(z)[cpks[, "sample"]], fileNames)
        cpks
    }))
    cpd <- do.call(rbind, lapply(x, function(z) {
        if (hasChromPeaks(z) && nrow(chromPeakData(z))) {
            ret <- chromPeakData(z)
            target_mz <- isolationWindowTargetMz(z)[1]
            ret$isolationWindow <- .fdata(z)$isolationWindow[1]
            ret$isolationWindowTargetMZ <- target_mz
            ret$isolationWindowLowerMz <-
                target_mz - .fdata(z)$isolationWindowLowerOffset[1]
            ret$isolationWindowUpperMz <-
                target_mz + .fdata(z)$isolationWindowUpperOffset[1]
            ret
        } else DataFrame()
    }))
    if (!nrow(cpd))
        return(msf)
    if (hasChromPeaks(msf)) {
        idx_start <- max(nrow(chromPeaks(msf)),
                         as.numeric(sub("CP", "", rownames(chromPeaks(msf)))))
        rownames(pks) <- rownames(cpd) <- .featureIDs(nrow(pks),
                                                      from = idx_start + 1,
                                                      prefix = "CP")
        chromPeaks(msf) <- .rbind_fill(chromPeaks(msf), pks)
        chromPeakData(msf) <- .rbind_fill(chromPeakData(msf), cpd)
    } else {
        rownames(pks) <- rownames(cpd) <- .featureIDs(nrow(pks), prefix = "CP")
        chromPeaks(msf) <- pks
        chromPeakData(msf) <- cpd
    }
    msf
}

#' This function *overwrites* the `MSnbase` .plot_XIC function by adding also
#' a rectangle with the identified chromatographic peak.
#'
#' @param x `XCMSnExp` object with identifie chromatographic peaks.
#'
#' @param peakCol color for the border of the rectangle.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
.plot_XIC <- function(x, peakCol = "#00000060", ...) {
    peakCol <- peakCol[1]
    x <- filterMsLevel(x, 1L)
    if (!length(x))
        stop("No MS1 data available")
    pks <- chromPeaks(x)
    fls <- basename(fileNames(x))
    x <- .split_by_file2(x)
    x <- lapply(x, as, "data.frame")
    ## Check if we are greedy and plot a too large area
    if (any(unlist(lapply(x, nrow)) > 20000))
        warning("The MS area to be plotted seems rather large. It is suggested",
                " to restrict the data first using 'filterRt' and 'filterMz'. ",
                "See also ?chromatogram and ?Chromatogram for more efficient ",
                "functions to plot a total ion chromatogram or base peak ",
                "chromatogram.",
                immediate = TRUE, call = FALSE)
    ## Define the layout.
    dots <- list(...)
    if (any(names(dots) == "layout")) {
        if (!is.null(dots$layout))
            layout(layout)
        dots$layout <- NULL
    } else
        layout(MSnbase:::.vertical_sub_layout(length(x)))
    for (i in seq_along(x)) {
        do.call(MSnbase:::.plotXIC,
                c(list(x = x[[i]], main = fls[i], layout = NULL), dots))
        pks_current <- pks[pks[, "sample"] == i, , drop = FALSE]
        if (length(pks_current) && nrow(pks_current)) {
            do.call(rect, c(list(xleft = pks_current[, "rtmin"],
                                 ybottom = pks_current[, "mzmin"],
                                 xright = pks_current[, "rtmax"],
                                 ytop = pks_current[, "mzmax"],
                                 border = peakCol), dots))
        }
    }
}

#' @title Group chromatographic peaks based on m/z or retention time
#'
#' @description
#'
#' Group chromatographic peaks if they are overlapping on m/z (independently
#' of their retention time) or *vice versa*.
#'
#' @param x `matrix` with columns `"mzmin"` and `"mzmax"` or `"rtmin"` and
#'     `"rtmax"`.
#'
#' @param min_col
#'
#' @param max_col `character(1)` with the name of the column with the upper
#'     range (e.g. `"mzmax"` or `"rtmax"`).
#'
#' @param min_col `character(1)` with the name of the column with the lower
#'     range (e.g. `"mzmin"` or `"rtmin"`).
#'
#' @param expand `numeric(1)` defining a constant value by which each e.g. m/z
#'     range is supposed to be expanded. Note that the mz range will be expanded
#'     by `expandMz` in both dimensions (i.e. `"mzmin"` - `expandMz` and
#'     `"mzmax"` + `expandMz`.
#'
#' @param ppm `numeric(1)` defining an m/z relative value by which the m/z range
#'     should be expanded.
#'
#' @note
#'
#' `x` is supposed to be a `chromPeaks` matrix of a single file, otherwise we're
#' grouping chromatographic peaks across samples.
#'
#' Note also that **each** peak gets expanded by `expandMz`, thus
#' peaks differing by `2 * expand` will be overlapping. As an example: m/z max
#' of one peak is 12.2, m/z min of another one is 12.4, if `expand = 0.1` is
#' used the m/z max of the first peak will be 12.3 and the m/z min of the second
#' one 12.3, thus both are considered *overlapping*.
#'
#' @author Johannes Rainer
#'
#' @return `list` with rownames (chromatographic peak IDs) of peak groups.
#'
#' @noRd
#'
#' @examples
#'
#' mat <- cbind(rtmin = c(10, 13, 16, 18), rtmax = c(12, 15, 17, 20),
#'     mzmin = c(2, 3, 4, 7), mzmax = c(2.5, 3.5, 4.2, 7.6))
#' rownames(mat) <- c("a", "b", "c", "d")
#' .group_overlapping_peaks(mat)
#'
#' .group_overlapping_peaks(mat, expand = 1)
#'
#' .group_overlapping_peaks(mat, expand = 0.25)
.group_overlapping_peaks <- function(x, min_col = "mzmin", max_col = "mzmax",
                                     expand = 0, ppm = 0) {
    x[, min_col] <- x[, min_col] - expand - x[, min_col] * ppm / 1e6
    x[, max_col] <- x[, max_col] + expand + x[, max_col] * ppm / 1e6
    reduced_ranges <- .reduce(x[, min_col], x[, max_col])
    res <- vector("list", nrow(reduced_ranges))
    tolerance <- sqrt(.Machine$double.eps)
    for (i in seq_along(res)) {
        res[[i]] <- rownames(x)[
            x[, min_col] >= reduced_ranges[i, 1] - tolerance &
            x[, max_col] <= reduced_ranges[i, 2] + tolerance
        ]
    }
    res
}

#' @description
#'
#' Identify chromatographic peaks overlapping in m/z dimension and being close
#' on retention time to combine them if they fulfill the additional criteria:
#' intensity at `"rtmax"` for the first chromatographic peak is
#' `> prop * "maxo"` (`"maxo"` being the maximal intensity of the first peak)
#' **and** intensity at `"rtmin"` for the second chromatographic peak is
#' `> prop * "maxo"` of the second peak.
#'
#' @details
#'
#' The function first identifies chromatographic peaks within the same sample
#' that are overlapping on their m/z range. The m/z range can be expanded with
#' parameter `expandMz` or `ppm`. Note that both the upper and lower m/z is
#' expanded by these resulting in m/z ranges that are of size *original m/z
#' range* `+ 2 * expandMz`.
#'
#' All peaks are first ordered by theyr `"mzmin"` and subsequently expanded by
#' `expandMz` and `ppm`. Peaks are grouped if their expanded m/z ranges
#' (`"mzmin" - expandMz - ppm("mzmin")` to `"mzmax + expandMz + ppm("mzmax")`)
#' and rt ranges (`"rtmin" - expandRt` to `"rtmax" + expandRt`) are overlapping.
#'
#' For overlapping peak-candidates a chromatogram is extracted, with the
#' m/z range being the range of the individual chromatographic peak's m/z range
#' expanded by `expandMz` and `ppm` (on both sides). This is to avoid data
#' points in between peaks being `NA`.
#'
#' @param x `XCMSnExp` object with chromatographic peaks of a **single** file or
#'     an `OnDiskMSnExp` object, in which case parameters `pks` and `pkd` have
#'     to be provided.
#'
#' @param pks `chromPeaks` matrix.
#'
#' @param pkd `chromPeakData` data frame.
#'
#' @param sample_index `integer(1)` representing the index of the sample in the
#'     original object. To be used in column `"sample"` of the new peaks.
#'
#' @param expandRt `numeric(1)` defining by how many seconds the retention time
#'     window is expanded on both sides to check for overlapping peaks.
#'
#' @param expandMz `numeric(1)` constant value by which the m/z range of each
#'     chromatographic peak should be expanded (on both sides!) to check for
#'     overlapping peaks.
#'
#' @param ppm `numeric(1)` defining a m/z relative value (in parts per million)
#'     by which the m/z range of each chromatographic peak should be expanded
#'     to check for overlapping peaks.
#'
#' @param minProp `numeric(1)` between `0` and `1` representing the proporion
#'     of intensity to be required for peaks to be joined. See description for
#'     more details. The default (`minProp = 0.75`) means that peaks are only
#'     joined if the signal half way between then is larger 75% of the smallest
#'     of the two peak's `"maxo"` (maximal intensity at peak apex).
#'
#' @return `list` with element `"chromPeaks"`, that contains the peaks `matrix`
#'     containing newly merged peaks and original peaks if they could not be
#'     merged and `"chromPeakData"` that represents the `DataFrame` with the
#'     corresponding metadata information. The merged peaks will have a row
#'     name of `NA`.
#'
#' @author Johannes Rainer, Mar Garcia-Aloy
#'
#' @md
#'
#' @noRd
#'
#' @examples
#'
#' library(MSnbase)
#' xd <- readMSData(system.file('cdf/KO/ko15.CDF', package = "faahKO"),
#'     mode = "onDisk")
#' xd <- findChromPeaks(xd, param = CentWaveParam())
#'
#' xchr <- chromatogram(xd, mz = c(-0.5, 0.5) + 305.1)
#' plot(xchr)
#'
#' res <- xcms:::.merge_neighboring_peaks(xd, expandRt = 4)
#'
#' res_sub <- res[res[, "mz"] >= 305.05 & res[, "mz"] <= 305.15, ]
#' rect(res_sub[, "rtmin"], 0, res_sub[, "rtmax"], res_sub[, "maxo"],
#'     border = "red")
#'
#' xchr <- chromatogram(xd, mz = c(-0.5, 0.5) + 496.2)
#' plot(xchr)
#'
#' res <- xcms:::.merge_neighboring_peaks(xd, expandRt = 4)
#'
#' res_sub <- res[res[, "mz"] >= 496.15 & res[, "mz"] <= 496.25, ]
#' rect(res_sub[, "rtmin"], 0, res_sub[, "rtmax"], res_sub[, "maxo"],
#'     border = "red")
.merge_neighboring_peaks <- function(x, pks = chromPeaks(x),
                                     pkd = chromPeakData(x),
                                     expandRt = 2, expandMz = 0, ppm = 10,
                                     minProp = 0.75) {
    pks <- force(pks)
    pkd <- force(pkd)
    if (is.null(rownames(pks)))
        stop("Chromatographic peak IDs are required.")
    ms_level <- unique(pkd$ms_level)
    if (length(ms_level) != 1)
        stop("Got chromatographic peaks from different MS levels.", call. = FALSE)
    ## drop_peaks <- rep(FALSE, nrow(pks))
    ## names(drop_peaks) <- rownames(pks)
    cands <- .define_merge_candidates(pks, expandMz, ppm, expandRt)
    if (!length(cands))
        return(list(chromPeaks = pks, chromPeakData = pkd))
    chr_def_mat <- cands[[1L]]
    pk_groups <- cands[[2L]]
    message("Evaluating ", length(pk_groups), " peaks in file ",
            basename(fileNames(x)), " for merging ... ", appendLF = FALSE)
    chrs <- chromatogram(x, mz = chr_def_mat[, c(1, 2)],
                         rt = chr_def_mat[, c(3, 4)],
                         msLevel = ms_level, aggregationFun = "sum")
    ## Now proceed to process them.
    res_list <- pkd_list <- vector("list", length(pk_groups))
    for (i in seq_along(pk_groups)) {
        pk_group <- pk_groups[[i]]
        res <- .chrom_merge_neighboring_peaks(
            chrs[i, 1], pks = pks[pk_group, , drop = FALSE],
            extractROWS(pkd, pk_group), diffRt = 2 * expandRt,
            minProp = minProp)
        ## drop_peaks[pk_group[!pk_group %in% rownames(res$chromPeaks)]] <- TRUE
        res_list[[i]] <- res$chromPeaks
        pkd_list[[i]] <- res$chromPeakData
    }
    pks_new <- do.call(rbind, res_list)
    pks_new[, "sample"] <- pks[1, "sample"]
    pkd_new <- do.call(rbind, pkd_list)
    ## drop peaks that were candidates, but that were not returned (i.e.
    ## were either merged or dropped)
    keep <- which(!(rownames(pks) %in% setdiff(unlist(pk_groups,
                                                      use.names = FALSE),
                                               rownames(pks_new))))
    pks <- pks[keep, , drop = FALSE]
    pkd <- extractROWS(pkd, keep)
    ## add merged peaks
    idx_new <- is.na(rownames(pks_new))
    message("OK")
    list(chromPeaks = rbind(pks, pks_new[idx_new, , drop = FALSE]),
         chromPeakData = rbind(pkd, extractROWS(pkd_new, idx_new)))
}

.define_merge_candidates <- function(pks, expandMz, ppm, expandRt) {
    mz_groups <- .group_overlapping_peaks(pks, expand = expandMz, ppm = ppm)
    mz_groups <- mz_groups[lengths(mz_groups) > 1]
    if (!length(mz_groups))
        return(list())
    ## Defining merge candidates
    pk_groups <- list()
    chr_def_mat <- list()
    current_group <- 1
    ppme <- ppm * 1e-6
    for (i in seq_along(mz_groups)) {
        rt_groups <- .group_overlapping_peaks(
            pks[mz_groups[[i]], , drop = FALSE],
            expand = expandRt, min_col = "rtmin",
            max_col = "rtmax")
        rt_groups <- rt_groups[lengths(rt_groups) > 1]
        for (j in seq_along(rt_groups)) {
            rt_group <- rt_groups[[j]]
            pk_groups[[current_group]] <- rt_group
            pks_sub <- pks[rt_group, ]
            mzr_sub <- range(pks_sub[, c("mzmin", "mzmax")])
            ## Expand the mz range in a similar fashion than used for checking
            ## if chrom peaks are overlapping. This fixes an issue with very
            ## low intensities in between two peaks, that tend to have shifted
            ## m/z value (because their intensities are so low).
            chr_def_mat[[current_group]] <-
                c(mzr_sub[1L] - expandMz - mzr_sub[1L] * ppme,
                  mzr_sub[2L] + expandMz + mzr_sub[2L] * ppme,
                  range(pks_sub[, c("rtmin", "rtmax")]))
            current_group <- current_group + 1
        }
    }
    if (!length(chr_def_mat))
        return(list())
    list(do.call(rbind, chr_def_mat), pk_groups)
}

.filter_file_XCMSnExp <- function(object, file,
                                  keepAdjustedRtime = hasAdjustedRtime(object),
                                  keepFeatures = FALSE) {
    if (missing(file)) return(object)
    if (is.character(file))
        file <- base::match(file, basename(fileNames(object)))
    ## This will not work if we want to get the files in a different
    ## order (i.e. c(3, 1, 2, 5))
    file <- base::sort(unique(file))
    ## Error checking - seems that's not performed downstream.
    if (!all(file %in% seq_along(fileNames(object))))
        stop("'file' has to be within 1 and the number of files in object.")
    has_features <- hasFeatures(object)
    has_chrom_peaks <- hasChromPeaks(object)
    has_adj_rt <- hasAdjustedRtime(object)
    ph <- processHistory(object)
    if (has_features && !keepFeatures) {
        has_features <- FALSE
        ph <- dropProcessHistoriesList(ph, .PROCSTEP.PEAK.GROUPING, num = 1)
    }
    ## Extracting all the XCMSnExp data from the object.
    newFd <- new("MsFeatureData")
    ## Subset original data:
    nobject <- as(filterFile(as(object, "OnDiskMSnExp"), file = file),
                 "XCMSnExp")
    if (has_adj_rt)
        adjustedRtime(newFd) <- adjustedRtime(object, bySample = TRUE)[file]
    if (has_chrom_peaks) {
        pks <- chromPeaks(object)
        idx <- pks[, "sample"] %in% file
        pks <- pks[idx, , drop = FALSE]
        pks[, "sample"] <- match(pks[, "sample"], file)
        if (has_features) {
            featureDefinitions(newFd) <- .update_feature_definitions(
                featureDefinitions(object),
                original_names = rownames(chromPeaks(object)),
                subset_names = rownames(pks))
        }
        chromPeaks(newFd) <- pks
        chromPeakData(newFd) <- extractROWS(chromPeakData(object), idx)
    }
    if (hasAdjustedRtime(newFd) && !keepAdjustedRtime)
        newFd <- dropAdjustedRtime(newFd, rtime(nobject, bySample = TRUE,
                                                adjusted = FALSE))
    lockEnvironment(newFd, bindings = TRUE)
    nobject@msFeatureData <- newFd
    nobject@.processHistory <- ph
    nobject
}

#' @param x `XCMSnExp` object of a single file.
#'
#' @param nValues `integer(1)` defining the number of values that have to be above
#'     threshold.
#'
#' @param threshold `numeric(1)` with the threshold value.
#'
#' @param msLevel `integer(1)` with the MS level.
#'
#' @return `integer` with the index of the peaks that should be retained.
#'
#' @author Johannes Rainer
#'
#' @noRd
.chrom_peaks_above_threshold <- function(x, nValues = 1L, threshold = 0,
                                         msLevel = 1L) {
    if (length(msLevel) > 1)
        stop("Currently only filtering of a single MS level at a ",
             "time is supported")
    x <- applyAdjustedRtime(x)
    chrs <- chromatogram(
        filterMsLevel(as(x, "OnDiskMSnExp"), msLevel = msLevel),
        rt = chromPeaks(x, msLevel = msLevel)[, c("rtmin", "rtmax")],
        mz = chromPeaks(x, msLevel = msLevel)[, c("mzmin", "mzmax")],
        msLevel = msLevel)
    keep <- vapply(chrs@.Data, function(z) {
        sum(z@intensity >= threshold, na.rm = TRUE) >= nValues
    }, logical(1))
    keep_all <- rep(TRUE, nrow(chromPeaks(x)))
    keep_all[chromPeakData(x)$ms_level == msLevel] <- keep
    keep_all
}

.subset_feature_definitions <- function(x, mz, rt, ppm, type) {
    if (length(rt) && nrow(x)) {
        rt <- range(rt)
        keep <- switch(type,
                       any = which(x$rtmin <= rt[2L] &
                                   x$rtmax >= rt[1L]),
                       within = which(x$rtmin >= rt[1L] &
                                      x$rtmax <= rt[2L]),
                       apex_within = which(x$rtmed >= rt[1L] &
                                           x$rtmed <= rt[2L]))
        x <- x[keep, , drop = FALSE]
    }
    if (length(mz) && nrow(x)) {
        mz <- range(mz)
        if (is.finite(mz[1L]))
            mz[1L] <- mz[1L] - mz[1L] * ppm / 1e6
        if (is.finite(mz[2L]))
            mz[2L] <- mz[2L] + mz[2L] * ppm / 1e6
        keep <- switch(type,
                       any = which(x$mzmin <= mz[2L] &
                                   x$mzmax >= mz[1L]),
                       within = which(x$mzmin >= mz[1L] &
                                      x$mzmax <= mz[2L]),
                       apex_within = which(x$mzmed >= mz[1L] &
                                           x$mzmed <= mz[2L]))
        x <- x[keep, , drop = FALSE]
    }
    x
}