R/components-nontarget.R

#' @include main.R
#' @include components.R
#' @include components-set.R
#' @include feature_groups-set.R
NULL

#' Components class for homologous series.
#'
#' This class is derived from \code{\link{components}} and is used to store
#' results from unsupervised homolog detection with the \pkg{nontarget}
#' package.
#'
#' Objects from this class are generated by
#' \code{\link{generateComponentsNontarget}}
#'
#' @slot homol A \code{list} with \code{homol} objects for each replicate group
#'   as returned by \code{\link[nontarget]{homol.search}}
#'
#' @references \insertRef{Loos2017}{patRoon} \cr\cr \addCitations{enviPat}{1}
#'
#' @seealso \code{\link{components}} and \code{\link{generateComponents}}
#'
#' @export
componentsNT <- setClass("componentsNT", slots = c(homol = "list"), contains = "components")

setMethod("collapseComponents", "componentsNT", function(obj)
{
    obj@components <- lapply(obj@components, function(cmp)
    {
        keep <- c("rt", "mz", "group", "rGroup")
        cmp <- cmp[, keep, with = FALSE]
        cmp[, rGroup := paste0(unique(rGroup), collapse = ","), by = "group"]
        return(unique(cmp, by = "group"))
    })
    return(obj)
})

#' @describeIn componentsNT Plots an interactive network graph for linked
#'   homologous series (\emph{i.e.} series with (partial) overlap which could
#'   not be merged). The resulting graph can be browsed interactively and allows
#'   quick inspection of series which may be related. The graph is constructed
#'   with the \pkg{\link{igraph}} package and rendered with
#'   \pkg{\link{visNetwork}}.
#'
#' @param obj The \code{componentsNT} object to plot.
#' @param onlyLinked If \code{TRUE} then only components with links are plotted.
#'
#' @template plotGraph
#' 
#' @references \addCitations{igraph}{2}
#' 
#' @export
setMethod("plotGraph", "componentsNT", function(obj, onlyLinked = TRUE, width = NULL, height = NULL)
{
    checkmate::assertFlag(onlyLinked)
    
    if (length(obj) == 0)
        stop("No components to plot", call. = FALSE)
    
    cInfo <- componentInfo(obj)
    cTable <- componentTable(obj)

    hsFGroups <- sapply(cTable, function(cmp) paste0(cmp$group, collapse = ", "))
    hsRGroups <- sapply(cTable, function(cmp) paste0(unique(cmp$rGroup), collapse = ", "))
    titles <- sprintf("<b>%s</b> (RT: %.2f; m/z: %.4f; #%d)<br>fGroups: <i>%s</i><br>rGroups: <i>%s</i>",
                      names(obj), cInfo$ret_increment, cInfo$mz_increment, cInfo$size, hsFGroups, hsRGroups)

    makeGraph(obj, onlyLinked, titles, width, height)
})

#' Componentization of homologous series with nontarget
#'
#' Uses \href{https://cran.r-project.org/web/packages/nontarget/index.html}{the nontarget R package} to generate
#' components by unsupervised detection of homologous series.
#'
#' @templateVar algo nontarget
#' @templateVar do generate components
#' @templateVar generic generateComponents
#' @templateVar algoParam nontarget
#' @template algo_generator
#'
#' @details In the first step the \code{\link[nontarget]{homol.search}} function is used to detect all homologous series
#'   within each replicate group (analyses within each replicate group are averaged prior to detection). Then,
#'   homologous series across replicate groups are merged in case of full overlap or when merging of partial overlapping
#'   series causes no conflicts.
#'
#' @param rtRange A numeric vector containing the minimum and maximum retention time (in seconds) between homologues.
#'   Series are always considered from low to high \emph{m/z}, thus, a negative minimum retention time allows detection
#'   of homologous series with increasing \emph{m/z} and decreasing retention times. These values set the \code{minrt}
#'   and \code{maxrt} arguments of \code{\link[nontarget]{homol.search}}.
#' @param mzRange A numeric vector specifying the minimum and maximum \emph{m/z} increment of a homologous series. Sets
#'   the \code{minmz} and \code{maxmz} arguments of \code{\link[nontarget]{homol.search}}.
#' @param elements A character vector with elements to be considered for detection of repeating units. Sets the
#'   \code{elements} argument of \code{\link[nontarget]{homol.search}} function.
#' @param rtDev Maximum retention time deviation. Sets the \code{rttol} to \code{\link[nontarget]{homol.search}}.
#' @param absMzDevLink Maximum absolute \emph{m/z} deviation when linking series. This should usually be a bit higher
#'   than \code{absMzDev} to ensure proper linkage.
#' @param traceHack Currently \code{\link[nontarget]{homol.search}} does not work with \R \samp{>3.3.3}. This flag, which is
#'   enabled by default on these R versions, implements a (messy) workaround
#'   (\href{https://github.com/blosloos/nontarget/issues/6}{more details here}).
#'
#' @templateVar ion TRUE
#' @templateVar absMzDev \code{mztol} argument to \code{\link[nontarget]{homol.search}}
#' @templateVar myDots Any further arguments passed to \code{\link[nontarget]{homol.search}}.
#' @template compon_algo-args
#'
#' @inheritParams generateComponents
#'
#' @return The generated comnponents are returned as an object from the \code{\link{componentsNT}} class.
#'
#' @templateVar class componentsNTSet
#' @template compon_gen-sets-merged
#'
#' @section Sets workflows: The output class supports additional methods such as \code{plotGraph}.
#'
#' @references \insertRef{Loos2017}{patRoon} \cr\cr \addCitations{enviPat}{1}
#'
#' @templateVar what generateComponentsNontarget
#' @template main-rd-method
#' @export
setMethod("generateComponentsNontarget", "featureGroups", function(fGroups, ionization = NULL, rtRange = c(-120, 120),
                                                                   mzRange = c(5, 120), elements = c("C", "H", "O"),
                                                                   rtDev = 30, absMzDev = 0.002,
                                                                   absMzDevLink = absMzDev * 2, 
                                                                   traceHack = all(R.Version()[c("major", "minor")] >= c(3, 4)),
                                                                   ...)
{
    checkPackage("nontarget", "https://github.com/blosloos/nontarget")
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(fGroups, "featureGroups", add = ac)
    ionization <- checkAndGetIonization(ionization, fGroups, add = ac)
    checkmate::assertNumeric(rtRange, finite = TRUE, any.missing = FALSE, len = 2, add = ac)
    checkmate::assertNumeric(mzRange, lower = 0, finite = TRUE, any.missing = FALSE, len = 2, add = ac)
    checkmate::assertCharacter(elements, min.chars = 1, any.missing = FALSE, min.len = 1, add = ac)
    aapply(checkmate::assertNumber, . ~ rtDev + absMzDev + absMzDevLink, lower = 0, finite = TRUE, fixed = list(add = ac))
    checkmate::assertFlag(traceHack, add = ac)
    checkmate::reportAssertions(ac)

    if (length(fGroups) == 0)
        return(componentsNT(homol = list(), componentInfo = data.table(), components = list(),
                            algorithm = "nontarget"))

    extraArgs <- list(...)
    hash <- makeHash(fGroups, ionization, rtRange, mzRange, elements, rtDev, absMzDev, absMzDevLink, extraArgs)
    cd <- loadCacheData("componentsNontarget", hash)
    if (!is.null(cd))
        return(cd)

    gTable <- groupTable(fGroups)
    gInfo <- groupInfo(fGroups)
    anaInfo <- analysisInfo(fGroups)

    utils::data(isotopes, package = "enviPat")

    # adduct.search() and homol.search() call stop() when nothing is found (arg!) --> use tryCatch

    # For now just stick with homologues as there is no easy way to do adduct/isotopes over multiple analyses
    # find homologous series for each replicate group
    rGroups <- unique(anaInfo$group)
    groupTablesRG <- sapply(rGroups, function(rg)
    {
        fGrpRep <- replicateGroupFilter(fGroups, rg, verbose = FALSE)
        return(if (length(fGrpRep) == 0) NULL else as.data.table(fGrpRep, average = TRUE))
    }, simplify = FALSE)

    homArgs <- c(list(isotopes = isotopes, elements = elements, minmz = mzRange[1],
                      maxmz = mzRange[2], minrt = rtRange[1], maxrt = rtRange[2], ppm = FALSE,
                      mztol = absMzDev, rttol = rtDev),
                 extraArgs)
    
    homList <- sapply(rGroups, function(rg)
    {
        gt <- groupTablesRG[[rg]][, c("mz", rg, "ret"), with = FALSE] # convert to nontarget peaklist format
        
        if (is.null(gt))
            return(NULL) # rep group doesn't have feature groups
        
        if (traceHack)
            suppressMessages(trace(nontarget::homol.search, function() {})) # put in dummy trace
        
        # UNDONE: find some way to differentiate between actual errors?
        hom <- tryCatch(do.call(nontarget::homol.search, c(list(gt), homArgs)),
                        error = function(e) FALSE)
        
        if (traceHack)
            suppressMessages(untrace(nontarget::homol.search))
        
        if (is.logical(hom))
            return(NULL) # no results
        
        return(hom)
    }, simplify = FALSE)
    homList <- homList[!sapply(homList, is.null)]
    
    compTab <- rbindlist(mapply(homList, names(homList), SIMPLIFY = FALSE, FUN = function(hom, rg)
    {
        stopifnot(identical(hom[[3]][["HS IDs"]], hom[[3]][["HS cluster"]])) # what is the difference!?
        
        homTab <- as.data.table(hom[[3]])
        
        gNames <- groupTablesRG[[rg]]$group
        
        # peak IDs are now numeric indices of a subset of the original feature
        # groups. Replace them by group names to allow easy comparison.
        homTab[, groups := list(sapply(`peak IDs`, function(pids)
        {
            ginds <- as.integer(unlist(strsplit(pids, ",")))
            ginds <- ginds[order(gInfo[gNames[ginds], "mzs"])] # ensure they are from low to high m/z
            return(list(gNames[ginds]))
        }, simplify = FALSE))]
        
        homTab[, c("HS IDs", "peak IDs") := NULL]
        homTab[, (rg) := list(groups)]
        
        return(homTab)
    }), fill = TRUE)

    if (nrow(compTab) == 0)
        return(componentsNT(componentInfo = data.table(), components = list(), algorithm = "nontarget"))

    # check which series should be linked
    compTab[, links := list(list(integer()))]

    for (r in seq_len(nrow(compTab)))
    {
        series <- compTab[["groups"]][[r]][[1]]
        links <- integer(0)
        for (ro in seq_len(nrow(compTab)))
        {
            if (r == ro)
                next

            if (abs(compTab[["m/z increment"]][r] - compTab[["m/z increment"]][ro]) > absMzDev ||
                abs(compTab[["RT increment"]][r] - compTab[["RT increment"]][ro]) > rtDev)
                next # different series

            otherSeries <- compTab[["groups"]][[ro]][[1]]

            if (sum(series %in% otherSeries) < 1) # UNDONE: minimum overlap?
                next

            # check for conflicts: groups that are not present in both but with
            # m/z values close or equal to those groups that are present
            missingG <- setdiff(series, otherSeries)
            missingOtherG <- setdiff(otherSeries, series)

            mzSame <- function(mz1, mz2) numLTE(abs(mz1 - mz2), absMzDevLink)
            
            if (any(sapply(missingG, function(mg) any(mzSame(gInfo[mg, "mzs"], gInfo[otherSeries, "mzs"])))) ||
                any(sapply(missingOtherG, function(mg) any(mzSame(gInfo[mg, "mzs"], gInfo[series, "mzs"])))))
                next
            
            links <- c(links, ro)
        }

        set(compTab, r, "links", list(list(links)))
    }
    
    presentRGroups <- rGroups[sapply(rGroups, function(rg) !is.null(compTab[[rg]]))]

    # merge any pairs of series that are just pointing to each other: they are
    # either exactly the same or otherwise subsets of each other that can be
    # merged without conflicts

    compTab[, keep := TRUE]
    for (r in seq_len(nrow(compTab)))
    {
        if (compTab[["keep"]][r] == FALSE)
            next # already merged

        links <- compTab[["links"]][[r]]
        
        for (other in links)
        {
            otherLinks <- compTab[["links"]][[other]]
            # linkage is equal?
            if (length(links) == length(otherLinks) && all(otherLinks == r | otherLinks %in% links))
            {
                # merge groups
                mGroups <- union(compTab[["groups"]][[r]][[1]], compTab[["groups"]][[other]][[1]])
                mGroups <- mGroups[order(gInfo[mGroups, "mzs"])] # make sure order stays correct
                set(compTab, r, "groups", list(list(list(mGroups))))
                
                # mark presence
                lc <- compTab[other, presentRGroups, with = FALSE]
                for (rg in presentRGroups)
                {
                    if (!is.null(lc[[rg]][[1]]))
                        set(compTab, r, rg, list(list(lc[[rg]][[1]]))) # need to rewrap it in a list?
                }
                
                # update statistics
                indBoth <- c(r, other)
                set(compTab, r, "m/z increment", mean(compTab[["m/z increment"]][indBoth]))
                set(compTab, r, "RT increment", mean(compTab[["RT increment"]][indBoth]))
                set(compTab, r, "min. RT in series", min(compTab[["min. RT in series"]][indBoth]))
                set(compTab, r, "max. RT in series", max(compTab[["max. RT in series"]][indBoth]))
                set(compTab, r, "max.-min. RT", compTab[["max. RT in series"]][r] - compTab[["min. RT in series"]][r])
                
                set(compTab, other, "keep", FALSE) # remove other
                set(compTab, r, "links", list(list(setdiff(links, other)))) # unlink
            }
        }
    }

    # clearout merged series
    compTab[, IDs := seq_len(.N)] # store current row order
    compTab <- compTab[keep == TRUE]
    # update links
    for (r in seq_len(nrow(compTab)))
    {
        links <- compTab[["links"]][[r]]
        if (length(links) > 0)
        {
            # no sapply: this may return integer(0) for removed links and therefore make it a list
            # --> use lapply and convert from list afterwards
            newLinks <- lapply(links, function(l) compTab[IDs == l, which = TRUE])
            newLinks <- newLinks[lengths(newLinks) > 0] # length will be zero if linked series was removed
            newLinks <- unlist(newLinks)
            if (is.null(newLinks)) # no links anymore?
                newLinks <- integer()
            set(compTab, r, "links", list(list(newLinks)))
        }
    }
    compTab[, c("keep", "IDs") := NULL]

    # split all rows in list with tables containing groups per row
    comps <- lapply(seq_len(nrow(compTab)), function(cmpi)
    {
        allGroups <- compTab[["groups"]][[cmpi]][[1]]
        homSeries <- seq_along(allGroups)
        ret <- rbindlist(lapply(presentRGroups, function(rg)
        {
            grp <- compTab[[rg]][[cmpi]][[1]]
            if (!is.null(grp))
                return(data.table(rt = gInfo[grp, "rts"], mz = gInfo[grp, "mzs"], group = grp,
                                  hsnr = match(grp, allGroups), rGroup = rg,
                                  intensity = groupTablesRG[[rg]][group %in% grp, get(rg)]))
            return(NULL)
        }), fill = TRUE)
        setorderv(ret, c("hsnr"))
    })
    cNames <- paste0("CMP", seq_along(comps))
    names(comps) <- cNames

    cInfo <- copy(compTab)
    cInfo[, c("groups", "HS cluster") := NULL]
    setnames(cInfo,
             c("m/z increment", "RT increment", "min. RT in series", "max. RT in series", "max.-min. RT"),
             c("mz_increment", "ret_increment", "ret_min", "ret_max", "ret_range"))
    cInfo[, name := names(comps)]
    cInfo[, size := sapply(comps, nrow)]

    # convert from fgroup lists to logical presence
    for (rg in presentRGroups)
        set(cInfo, j = rg, value = !sapply(cInfo[[rg]], is.null))
    
    # Convert numeric links to component names. This also keeps links alive
    # after subsetting
    cInfo[, links := lapply(links, function(l) cNames[l])]

    ret <- componentsNT(homol = homList, componentInfo = cInfo, components = comps,
                        algorithm = "nontarget")
    saveCacheData("componentsNontarget", ret, hash)

    return(ret)
})

#' @rdname generateComponentsNontarget
#' @export
setMethod("generateComponentsNontarget", "featureGroupsSet", function(fGroups, ionization = NULL, ...)
{
    return(generateComponentsSet(fGroups, ionization, generateComponentsNontarget, setIonization = TRUE, ...,
                                 classGenerator = componentsNTSet))
})
rickhelmus/patRoon documentation built on April 25, 2024, 8:15 a.m.