qmtools: Quantitative Metabolomics Data Processing Tools

Documented in clusterFeatures

##' Feature clustering
##'
##' Function to cluster LC-MS features according to their retention time and
##' intensity correlation across samples with a
##' \linkS4class{SummarizedExperiment}.
##'
##' For soft ionization methods (e.g., LC/ESI-MS) commonly used in metabolomics,
##' one or more ions could be generated from an individual compound upon
##' ionization. The redundancy of feature data needs to be addressed since we
##' typically interested in compounds rather than different ion species. This
##' function attempts to identify a group of features from the same compound
##' with the following steps:
##'
##' 1. Features are grouped by their retention times to identify co-eluting
##' compounds.
##'
##' 2. For each retention time-based group, features are further clustered by
##' patterns of the intensity correlations across samples to identify a subset
##' of features from the same compound.
##'
##' The retention time-based grouping is performed using either a hierarchical
##' clustering via [hclust] or the methods available in the \pkg{MsFeatures}
##' package via [MsFeatures::groupClosest] and [MsFeatures::groupConsecutive].
##' For the \code{rt_grouping} = "hclust", by default, complete-linkage
##' clustering is conducted using the Manhattan distance (i.e., difference in
##' retention times) where the distance between two clusters is defined as the
##' difference in retention times between the farthest pair of elements in the
##' two clusters. Group memberships are assigned by specifying the cut height
##' for the distance metric. Other linkage methods can be specified with
##' \code{hclust_linkage}. Please refer to \code{?hclust} for details. For the
##' "closest" and "consecutive", please refer to
##' \code{?MsFeatures::groupClosest} and \code{?MsFeatures::groupConsecutive}
##' for the details of algorithms.
##'
##' For the correlation-based grouping, \code{cor_grouping} = "connected"
##' creates a undirected graph using feature correlations as an adjacency matrix
##' (i.e., correlations serve as edge weights). The edges whose weights are
##' below the cut-off specified by `cor_cut` will be removed from the graph,
##' separating features into several disconnected subgroups. Features in the
##' same subgroup will be assigned to the same feature cluster. For the
##' "louvain", the function further applies the Louvain algorithm to the graph
##' in order to identify densely connected features via
##' [igraph::cluster_louvain]. For the "SimilarityMatrix",
##' [MsFeatures::groupSimilarityMatrix] is used for feature grouping. Please
##' refer to \code{?MsFeatures::groupSimilarityMatrix} for the details of
##' algorithm.
##'
##' @param x A \linkS4class{SummarizedExperiment} object.
##' @param i A string or integer value specifying which assay values to use.
##' @param rtime_var A string specifying the name of variable containing a
##'   numeric vector of retention times in `rowData(x)`.
##' @param rt_cut A numeric value specifying a cut-off for the retention-time
##'   based feature grouping.
##' @param cor_cut A numeric value specifying a cut-off for the
##'   correlation-based feature grouping.
##' @param rt_grouping A string specifying which method to use for the
##'   retention-time based feature grouping.
##' @param cor_grouping A string specifying which method to use for the
##'   correlation-based feature grouping.
##' @param cor_use A string specifying which method to compute correlations in
##'   the presence of missing values. Refer to \code{?cor} for details.
##' @param cor_method A string specifying which correlation coefficient is to be
##'   computed. See \code{?cor} for details.
##' @param log2 A logical specifying whether feature intensities need to be
##'   log2-transformed before calculating a correlation matrix.
##' @param hclust_linkage A string specifying the linkage method to be used when
##'   \code{rt_grouping} is "hclust".
##' @return A \linkS4class{SummarizedExperiment} object with the grouping
##'   results added to columns "rtime_group" (initial grouping on retention
##'   times) and "feature_group" in its `rowData`.
##'
##' @references
##'
##' Johannes Rainer (2022). MsFeatures: Functionality for Mass Spectrometry
##' Features. R package version 1.3.0.
##' 'https://github.com/RforMassSpectrometry/MsFeatures
##'
##' Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Etienne Lefebvre:
##' Fast unfolding of communities in large networks. J. Stat. Mech. (2008)
##' P10008
##'
##' Csardi G, Nepusz T: The igraph software package for complex network
##' research, InterJournal, Complex Systems 1695. 2006. https://igraph.org
##'
##' @seealso
##'
##' See [hclust], [cutree], [MsFeatures::groupClosest],
##' [MsFeatures::groupConsecutive], [MsFeatures::groupSimilarityMatrix], and
##' [igraph::cluster_louvain] for the underlying functions that do work.
##'
##' See [plotRTgroup] to visualize the grouping result.
##'
##' @examples
##'
##' data(faahko_se)
##'
##' se <- clusterFeatures(faahko_se, i = "knn_vsn", rtime_var = "rtmed")
##' rowData(se)[, c("rtmed", "rtime_group", "feature_group")]
##'
##' @export
clusterFeatures <- function(x, i, rtime_var = "rtime",
                            rt_cut = 10, cor_cut = 0.7,
                            rt_grouping = c("hclust", "closest", "consecutive"),
                            cor_grouping = c("louvain", "SimilarityMatrix",
                                             "connected", "none"),
                            cor_use = c("everything", "all.obs",
                                        "complete.obs", "na.or.complete",
                                        "pairwise.complete.obs"),
                            cor_method = c("pearson", "kendall", "spearman"),
                            log2 = FALSE,
                            hclust_linkage = "complete") {
    if (!is(x, "SummarizedExperiment")) {
        stop("`x` must be a SummarizedExperiment object.")
    }
    if (!any(colnames(rowData(x)) == rtime_var)) {
        stop("Variable `", rtime_var, "` is not found in `rowData(x)`." )
    }
    rt_grouping <- match.arg(rt_grouping)
    cor_grouping <- match.arg(cor_grouping)
    rts <- rowData(x)[, rtime_var]
    rt_clus <- .groupRT(rts, method = rt_grouping, cut = rt_cut,
                        hclust_linkage = hclust_linkage)
    rt_clus <- .format_padding(rt_clus)
    rtime_group <- factor(paste0("FG.", rt_clus),
                          levels = unique(paste0("FG.", rt_clus)))
    rowData(x)$rtime_group <- rtime_group

    if (cor_grouping == "none") {
        x
    } else {
        m <- assay(x, i)
        if (log2) {
            m <- log2(m)
        }
        ml <- split.data.frame(m, rtime_group)
        res <- lapply(ml, function(x) {
            .groupCorr(x, use = cor_use, method = cor_method,
                       type = cor_grouping, cut = cor_cut)
        })
        cor_clus <- unsplit(res, f = rtime_group)
        cor_clus <- .format_padding(cor_clus)
        rowData(x)$feature_group <- paste(rtime_group, cor_clus, sep = ".")
        x
    }
}


.groupRT <- function(x, method = c("hclust", "closest", "consecutive"),
                     cut = 10, hclust_linkage = "complete") {
    method <- match.arg(method)
    if (anyNA(x)) {
        stop("`x` must have no missing values.")
    }
    switch(
        method,
        hclust = cutree(hclust(dist(x, "manhattan"), hclust_linkage), h = cut),
        closest = {
            .verify_package("MsFeatures")
            clus <- MsFeatures::groupClosest(x, maxDiff = cut)
            .relabel(clus)
        },
        consecutive = {
            .verify_package("MsFeatures")
            clus <- MsFeatures::groupConsecutive(x, maxDiff = cut)
            .relabel(clus)
        }
    )
}

.groupCorr <- function(x, use = c("everything", "all.obs",
                                  "complete.obs", "na.or.complete",
                                  "pairwise.complete.obs"),
                       method = c("pearson", "kendall", "spearman"),
                       type = c("louvain", "connected", "SimilarityMatrix"),
                       cut = 0.7
                       ) {
    use <- match.arg(use)
    ## method <- match.arg(method) ## handled with cor function
    type <- match.arg(type)

    ## Calculate a correlation matrix
    m <- cor(t(x), use = use, method = method)
    if (anyNA(m)) {
        stop("A correlation matrix contains NA. ",
             "Please check `x` and consider imputation.")
    }

    switch(
        type,
        connected = {
            g <- igraph::graph_from_adjacency_matrix(
                             m, mode = "lower", weighted = TRUE, diag = FALSE
                         )
            g <- igraph::delete_edges(g, which(igraph::E(g)$weight < cut))
            igraph::components(g)$membership
        },
        louvain = {
            g <- igraph::graph_from_adjacency_matrix(
                             m, mode = "lower", weighted = TRUE, diag = FALSE
                         )
            g <- igraph::delete_edges(g, which(igraph::E(g)$weight < cut))
            res <- igraph::cluster_louvain(g)
            igraph::membership(res)
        },
        SimilarityMatrix = {
            res <- MsFeatures::groupSimilarityMatrix(m, threshold = cut)
            .relabel(res)
        }
    )
}

.relabel <- function(x) {
    v <- rle(x)
    newc <- as.integer(factor(v$values, levels = unique(v$values)))
    rep(newc, times = v$lengths)
}

.format_padding <- function(x) {
    padding_width <- nchar(max(x))
    x <- formatC(x, width = padding_width, format = "d", flag = 0)
}