R/feature_annotations.R

#' @include main.R
#' @include workflow-step.R
NULL

#' Base feature annotations class
#'
#' Holds information for all feature group annotations.
#'
#' This class stores annotation data for feature groups, such as molecular formulae, SMILES identifiers, compound names
#' etc. The class of objects that are generated by formula and compound annotation (\code{\link{generateFormulas}} and
#' \code{\link{generateCompounds}}) are based on this class.
#'
#' @param obj,x \code{featureAnnotations} object to be accessed
#' @param \dots For the \code{"["} operator: ignored.
#'
#'   For \code{delete}: passed to the function specified as \code{j}.
#'   
#'   Others: Any further (and unique) \code{featureAnnotations} objects.
#' @param OM For \code{as.data.table}: if set to \code{TRUE} several columns with information relevant for organic
#'   matter (OM) characterization will be added (e.g. elemental ratios, classification). This will also make sure that
#'   \code{countElements} contains at least C, H, N, O, P and S.
#'
#'   For \code{filter}: If \code{TRUE} then several filters are applied to exclude unlikely formula candidates present
#'   in organic matter (OM). See Source section for details.
#' @param labels A \code{character} with names to use for labelling. If \code{NULL} labels are automatically generated.
#'
#' @seealso \code{\link{formulas-class}} and \code{\link{compounds-class}}
#'
#' @slot groupAnnotations A \code{list} with for each annotated feature group a \code{data.table} with annotation data.
#'   Use the \code{annotations} method for access.
#' @slot scoreTypes A \code{character} with all the score types present in this object.
#' @slot scoreRanges The minimum and maximum score values of all candidates for each feature group. Used for
#'   normalization.
#'
#' @templateVar seli feature groups
#' @templateVar selOrderi groupNames()
#' @templateVar del TRUE
#' @templateVar deli feature groups
#' @templateVar delj candidates
#' @templateVar deljtype numeric indices (rows)
#' @templateVar delfwhat feature group
#' @templateVar delfa the annotation table (a \code{data.table}), the feature group name
#' @templateVar delfr the candidate indices (rows) to be removed (specified as an \code{integer} or \code{logical} vector)
#' @templateVar dollarOpName feature group
#' @template sub_sel_del-args
#'
#' @templateVar normParam normalizeScores
#' @templateVar excludeParam excludeNormScores
#' @template norm-args
#'
#' @section Source: Calculation of the aromaticity index (AI) and related double bond equivalents (DBE_AI) is performed
#'   as described in Koch 2015. Formula classification is performed by the rules described in Abdulla 2013. Filtering of
#'   OM related molecules is performed as described in Koch 2006 and Kujawinski 2006. (see references).
#'
#' @references \insertRef{Koch2015}{patRoon} \cr\cr \insertRef{Abdulla2013}{patRoon} \cr\cr
#'   \insertRef{Koch2006}{patRoon} \cr\cr \insertRef{Kujawinski2006}{patRoon}
#'
#' @seealso The derived \code{\link{formulas}} and \code{\link{compounds}} classes.
#'
#' @templateVar class featureAnnotations
#' @template class-hierarchy
#'
#' @export
featureAnnotations <- setClass("featureAnnotations",
                               slots = c(groupAnnotations = "list", scoreTypes = "character", scoreRanges = "list"),
                               contains = c("workflowStep", "VIRTUAL"))

setMethod("initialize", "featureAnnotations", function(.Object, ...)
{
    .Object <- callNextMethod(.Object, ...)
    .Object@groupAnnotations <- makeEmptyListNamed(.Object@groupAnnotations)
    return(.Object)
})

setMethod("groupNamesResults", "featureAnnotations", function(obj) groupNames(obj))

#' @describeIn featureAnnotations Accessor for the \code{groupAnnotations} slot.
#' @export
setMethod("annotations", "featureAnnotations", function(obj) obj@groupAnnotations)

#' @templateVar class featureAnnotations
#' @templateVar what feature groups
#' @template strmethod
#' @export
setMethod("groupNames", "featureAnnotations", function(obj) names(obj@groupAnnotations))

#' @describeIn featureAnnotations Obtain total number of candidates.
#' @export
setMethod("length", "featureAnnotations", function(x) if (length(x@groupAnnotations) > 0) sum(sapply(x@groupAnnotations, nrow)) else 0)

#' @describeIn featureAnnotations Subset on feature groups.
#' @export
setMethod("[", c("featureAnnotations", "ANY", "missing", "missing"), function(x, i, ...)
{
    if (!missing(i))
    {
        i <- assertSubsetArgAndToChr(i, groupNames(x))
        x <- delete(x, setdiff(groupNames(x), i))
    }
    
    return(x)
})

#' @describeIn featureAnnotations Extracts annotation data for a feature group.
#' @export
setMethod("[[", c("featureAnnotations", "ANY", "missing"), function(x, i, j)
{
    assertExtractArg(i)
    return(x@groupAnnotations[[i]])
})

#' @describeIn featureAnnotations Extracts annotation data for a feature group.
#' @export
setMethod("$", "featureAnnotations", function(x, name)
{
    eval(substitute(x@groupAnnotations$NAME_ARG, list(NAME_ARG = name)))
})

#' @describeIn featureAnnotations Generates a table with all annotation data for each feature group and other
#'   information such as element counts.
#'
#' @param fragments If \code{TRUE} then information on annotated fragments will be included. Automatically set to
#'   \code{TRUE} if \code{countFragElements} is set.
#' @param countElements,countFragElements A \code{character} vector with elements that should be counted for each
#'   candidate's formula. For instance, \code{c("C", "H")} adds columns for both carbon and hydrogen amounts of each
#'   formula. Note that the neutral formula (\code{neutral_formula} column) is used to count elements of non-fragmented
#'   formulae, whereas the charged formula of fragments (\code{ion_formula} column in \code{fragInfo} data) is used for
#'   fragments. Set to \code{NULL} to not count any elements.
#'
#' @template as_data_table-args
#'
#' @return \code{as.data.table} returns a \code{\link{data.table}}.
#'
#' @export
setMethod("as.data.table", "featureAnnotations", function(x, fGroups = NULL, fragments = FALSE, countElements = NULL,
                                                          countFragElements = NULL, OM = FALSE,
                                                          normalizeScores = "none",
                                                          excludeNormScores = defaultExclNormScores(x))
{
    # NOTE: keep args in sync with formulas/compounds methods
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(fGroups, "featureGroups", null.ok = TRUE, add = ac)
    checkmate::assertCharacter(countElements, min.chars = 1, any.missing = FALSE, null.ok = TRUE, add = ac)
    checkmate::assertCharacter(countFragElements, min.chars = 1, any.missing = FALSE, null.ok = TRUE, add = ac)
    checkmate::assertFlag(OM, add = ac)
    checkmate::assertFlag(fragments, add = ac)
    assertNormalizationMethod(normalizeScores, add = ac)
    checkmate::assertCharacter(excludeNormScores, min.chars = 1, null.ok = TRUE, add = ac)
    checkmate::reportAssertions(ac)
    
    if (!is.null(countFragElements))
        fragments <- TRUE
    
    mcn <- mergedConsensusNames(x)
    annTable <- annotations(x)
    if (normalizeScores != "none")
    {
        annTable <- Map(annTable, x@scoreRanges, f = normalizeAnnScores,
                        MoreArgs = list(scoreCols = annScoreNames(x, TRUE), mConsNames = mcn, normalizeScores == "minmax",
                                        exclude = excludeNormScores))
    }
    
    if (fragments)
    {
        ret <- rbindlist(lapply(annTable, function(ct)
        {
            ct <- copy(ct)
            ct[, row := seq_len(nrow(ct))]
            
            fragTab <- rbindlist(ct$fragInfo, idcol = "row", fill = TRUE)
            fragTab[, PLID := NULL]
            cnames <- setdiff(names(fragTab), "row")
            setnames(fragTab, cnames, paste0("frag_", cnames))
            
            return(merge(ct, fragTab, by = "row", all.x = TRUE)[, -"row"])
        }), idcol = "group", fill = TRUE)
    }
    else
        ret <- rbindlist(annTable, idcol = "group", fill = TRUE)
    
    if (!is.null(fGroups))
    {
        ret[, c("ret", "group_mz") := groupInfo(fGroups)[group, c("rts", "mzs")]]
        setcolorder(ret, c("group", "ret", "group_mz"))
    }
    
    ret <- addElementInfoToAnnTable(ret, countElements, countFragElements, OM, TRUE)
    
    if (!is.null(ret[["fragInfo"]]))
        return(ret[, -"fragInfo"]) # not there if empty results
    
    return(ret)
})

#' @templateVar where featureAnnotations
#' @templateVar what annotations
#' @template delete
#' @export
setMethod("delete", "featureAnnotations", function(obj, i = NULL, j = NULL, ...)
{
    # NOTE: this is ~ a c/p from the features method
    
    ac <- checkmate::makeAssertCollection()
    i <- assertDeleteArgAndToChr(i, groupNames(obj), add = ac)
    checkmate::assert(
        checkmate::checkIntegerish(j, any.missing = FALSE, null.ok = TRUE),
        checkmate::checkFunction(j, null.ok = TRUE),
        .var.name = "j"
    )
    checkmate::reportAssertions(ac)
    
    if (length(i) == 0 || (!is.null(j) && length(j) == 0))
        return(obj) # nothing to remove...
    
    # UNDONE: NULL for i and j will remove all?
    
    # i = NULL; j = vector: remove from all groups
    # i = vector; j = NULL: remove specified groups
    # j = function: remove specific results from given groups (or all if i=NULL)
    
    if (!is.function(j))
    {
        if (is.null(j))
            obj@groupAnnotations <- obj@groupAnnotations[setdiff(groupNames(obj), i)]
        else
        {
            obj@groupAnnotations[i] <- lapply(obj@groupAnnotations[i], function(at)
            {
                inds <- j[j <= nrow(at)]
                return(if (length(inds) > 0) at[-inds] else at)
            })
        }
    }
    else
    {
        obj@groupAnnotations[i] <- Map(obj@groupAnnotations[i], i, f = function(at, grp)
        {
            rm <- j(at, grp, ...)
            if (is.logical(rm))
                return(at[!rm])
            return(at[setdiff(seq_len(nrow(at)), rm)])
        })
    }
    
    obj@groupAnnotations <- pruneList(obj@groupAnnotations, checkZeroRows = TRUE)
    
    obj@scoreRanges <- obj@scoreRanges[names(obj@groupAnnotations)]

    return(obj)
})

#' @describeIn featureAnnotations Provides rule based filtering for feature group annotations. Useful to eliminate
#'   unlikely candidates and speed up further processing.
#'
#' @param minExplainedPeaks Minimum number of explained peaks. Set to \code{NULL} to ignore.
#' @param scoreLimits Filter results by their scores. Should be a named \code{list} that contains two-sized numeric
#'   vectors with the minimum/maximum value of a score (use \code{-Inf}/\code{Inf} for no limits). The names of each
#'   element should follow the name column of the table returned by \code{\link{formulaScorings}$name} and
#'   \code{\link{compoundScorings}()$name}. For instance, \code{scoreLimits=list(numberPatents=c(10, Inf))} specifies
#'   that \code{numberPatents} should be at least \samp{10}. Note that a result without a specified scoring is never
#'   removed. If a score term exists multiple times, \emph{i.e.} due to a consensus, then a candidate is kept if at
#'   least one of the terms falls within the range. Set to \code{NULL} to skip this filter.
#' @param topMost Only keep a maximum of \code{topMost} candidates with highest score (or least highest if
#'   \code{negate=TRUE}). Set to \code{NULL} to ignore.
#' @param negate If \code{TRUE} then filters are applied in opposite manner.
#'
#' @template element-args
#'
#' @return \code{filter} returns a filtered \code{featureAnnotations} object.
#'
#' @export
setMethod("filter", "featureAnnotations", function(obj, minExplainedPeaks = NULL, scoreLimits = NULL, elements = NULL,
                                                   fragElements = NULL, lossElements = NULL, topMost = NULL, OM = FALSE,
                                                   negate = FALSE)
{
    ac <- checkmate::makeAssertCollection()
    aapply(checkmate::assertCount, . ~ minExplainedPeaks + topMost, positive = c(FALSE, TRUE),
           null.ok = TRUE, fixed = list(add = ac))
    assertScoreRange(scoreLimits, NULL, add = ac)
    aapply(checkmate::assertCharacter, . ~ elements + fragElements + lossElements,
           min.chars = 1, min.len = 1, null.ok = TRUE, fixed = list(add = ac))
    aapply(checkmate::assertFlag, . ~ OM + negate, fixed = list(add = ac))
    checkmate::reportAssertions(ac)
    
    cat("Filtering annotations... ")
    
    mConsNames <- mergedConsensusNames(obj)
    
    if (!is.null(minExplainedPeaks))
        scoreLimits <- modifyList(if (is.null(scoreLimits)) list() else scoreLimits,
                                  list(explainedPeaks = c(minExplainedPeaks, Inf)))
    mark <- if (negate) function(x) !x else function(x) x
    
    oldn <- length(obj)
    obj <- delete(obj, j = function(annTable, ...)
    {
        annTable <- copy(annTable)
        annTable[, keep := TRUE]
        
        if (!is.null(scoreLimits))
        {
            for (sc in names(scoreLimits))
            {
                cols <- getAllMergedConsCols(sc, names(annTable), mConsNames)
                if (length(cols) == 0)
                    next
                
                annTable[keep == TRUE, keep :=
                             mark(do.call(pmax, c(.SD, list(na.rm = TRUE))) >= scoreLimits[[sc]][1] &
                                      do.call(pmin, c(.SD, list(na.rm = TRUE))) <= scoreLimits[[sc]][2]),
                         .SDcols = cols]
            }
        }
        
        if (!is.null(elements))
            annTable[keep == TRUE, keep := sapply(neutral_formula, checkFormula, elements, negate)]
        
        if (!is.null(fragElements) || !is.null(lossElements))
        {
            annTable[keep == TRUE, keep := sapply(fragInfo, function(fi)
            {
                if (nrow(fi) == 0)
                    return(FALSE)
                if (!is.null(fragElements) && !any(sapply(fi$ion_formula, checkFormula, fragElements, negate)))
                    return(FALSE)
                if (!is.null(lossElements) && !any(sapply(fi$neutral_loss, checkFormula, lossElements, negate)))
                    return(FALSE)
                return(TRUE)
            })]
        }
        
        if (OM)
        {
            annTable <- addElementInfoToAnnTable(annTable, NULL, NULL, OM = TRUE, classify = FALSE)
            annTable[keep == TRUE, keep := mark( 
                         # rules from Kujawinski & Behn, 2006 (10.1021/ac0600306)
                         H >= 1/3 * C &
                         H <= ((2 * C) + N + 2) &
                         (H + N) %% 2 == 0 &
                         N <= C &
                         O <= C &
                         P <= 2 &
                         S <= 2 &
                         
                         # rules from Koch & dittmar 2006 (10.1002/rcm.2386)
                         sapply(DBE_AI, checkmate::checkInt) &
                         HC <= 2.2 &
                         OC <= 1.2 &
                         NC <= 0.5)]
        }
        
        return(!annTable$keep)
    })

    if (!is.null(topMost))
    {
        if (negate)
            obj <- delete(obj, j = function(at, ...) seq_len(nrow(at)) <= (nrow(at) - topMost))
        else
            obj <- delete(obj, j = function(at, ...) seq_len(nrow(at)) > topMost)
    }
    
    newn <- length(obj)
    printf("Done! Filtered %d (%.2f%%) annotations. Remaining: %d\n", oldn - newn, if (oldn == 0) 0 else (1-(newn/oldn))*100, newn)
    return(obj)
})

#' @describeIn featureAnnotations plots a Venn diagram (using \pkg{\link{VennDiagram}}) outlining unique and shared
#'   candidates of up to five different \code{featureAnnotations} objects.
#'
#' @param vennArgs A \code{list} with further arguments passed to \pkg{VennDiagram} plotting functions. Set to
#'   \code{NULL} to ignore.
#'
#' @template plotvenn-ret
#'
#' @export
setMethod("plotVenn", "featureAnnotations", function(obj, ..., labels = NULL, vennArgs = NULL)
{
    allFeatAnnotations <- c(list(obj), list(...))
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertList(allFeatAnnotations, types = "featureAnnotations", min.len = 2, any.missing = FALSE,
                          unique = TRUE, .var.name = "...", add = ac)
    checkmate::assertCharacter(labels, min.chars = 1, len = length(allFeatAnnotations), null.ok = TRUE, add = ac)
    checkmate::assertList(vennArgs, names = "unique", null.ok = TRUE, add = ac)
    checkmate::reportAssertions(ac)
    
    if (is.null(labels))
        labels <- make.unique(sapply(allFeatAnnotations, algorithm))
    if (is.null(vennArgs))
        vennArgs <- list()
    
    allAnnTabs <- lapply(allFeatAnnotations, as.data.table)
    do.call(makeVennPlot, c(list(allAnnTabs, labels, lengths(allFeatAnnotations), function(obj1, obj2)
    {
        if (length(obj1) == 0 || length(obj2) == 0)
            return(data.table())
        fintersect(obj1[, c("group", "UID")], obj2[, c("group", "UID")])
    }, nrow), vennArgs))
})

#' @describeIn featureAnnotations plots an UpSet diagram (using the \code{\link[UpSetR]{upset}} function) outlining
#'   unique and shared candidates between different \code{featureAnnotations} objects.
#' @templateVar withArgs TRUE
#' @template plotUpSet
#' @export
setMethod("plotUpSet", "featureAnnotations", function(obj, ..., labels = NULL, nsets = length(list(...)) + 1,
                                                      nintersects = NA, upsetArgs = NULL)
{
    allFeatAnnotations <- c(list(obj), list(...))
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertList(allFeatAnnotations, types = "featureAnnotations", min.len = 2, any.missing = FALSE,
                          unique = TRUE, .var.name = "...", add = ac)
    checkmate::assertCharacter(labels, min.chars = 1, len = length(allFeatAnnotations), null.ok = TRUE, add = ac)
    checkmate::assertList(upsetArgs, names = "unique", null.ok = TRUE, add = ac)
    checkmate::assertCount(nsets, positive = TRUE)
    checkmate::assertCount(nintersects, positive = TRUE, na.ok = TRUE)
    checkmate::reportAssertions(ac)
    
    if (is.null(labels))
        labels <- make.unique(sapply(allFeatAnnotations, algorithm))
    
    allAnnTabs <- mapply(allFeatAnnotations, labels, SIMPLIFY = FALSE, FUN = function(f, l)
    {
        ret <- as.data.table(f)
        if (nrow(ret) == 0)
            ret <- data.table(group = character(), UID = character())
        ret <- unique(ret[, c("group", "UID")])[, (l) := 1]
    })
    
    annTab <- Reduce(function(f1, f2)
    {
        merge(f1, f2, by = c("group", "UID"), all = TRUE)
    }, allAnnTabs)
    
    annTab <- annTab[, labels, with = FALSE]
    for (j in seq_along(annTab))
        set(annTab, which(is.na(annTab[[j]])), j, 0)
    
    if (sum(sapply(annTab, function(x) any(x>0))) < 2)
        stop("Need at least two non-empty objects to plot")
    
    do.call(UpSetR::upset, c(list(annTab, nsets = nsets, nintersects = nintersects), upsetArgs))
})

setMethod("annScoreNames", "featureAnnotations", function(obj, onlyNums)
{
    # NOTE: onlyNums ignored here, mainly for MF
    if (length(obj@scoreRanges) == 0)
        return(character())
    return(unique(unlist(lapply(obj@scoreRanges, names))))
})

setMethod("prepareConsensusLabels", "featureAnnotations", function(obj, ..., labels)
{
    if (is.null(labels))
        labels <- sapply(list(obj, ...), algorithm)
    
    # in case names are (still) duplicated
    labels <- make.unique(labels)
    
    return(labels)
})
rickhelmus/patRoon documentation built on April 25, 2024, 8:15 a.m.