##' Feature clustering
##'
##' Function to cluster LC-MS features according to their retention time and
##' intensity correlation across samples with a
##' \linkS4class{SummarizedExperiment}.
##'
##' For soft ionization methods (e.g., LC/ESI-MS) commonly used in metabolomics,
##' one or more ions could be generated from an individual compound upon
##' ionization. The redundancy of feature data needs to be addressed since we
##' typically interested in compounds rather than different ion species. This
##' function attempts to identify a group of features from the same compound
##' with the following steps:
##'
##' 1. Features are grouped by their retention times to identify co-eluting
##' compounds.
##'
##' 2. For each retention time-based group, features are further clustered by
##' patterns of the intensity correlations across samples to identify a subset
##' of features from the same compound.
##'
##' The retention time-based grouping is performed using either a hierarchical
##' clustering via [hclust] or the methods available in the \pkg{MsFeatures}
##' package via [MsFeatures::groupClosest] and [MsFeatures::groupConsecutive].
##' For the \code{rt_grouping} = "hclust", by default, complete-linkage
##' clustering is conducted using the Manhattan distance (i.e., difference in
##' retention times) where the distance between two clusters is defined as the
##' difference in retention times between the farthest pair of elements in the
##' two clusters. Group memberships are assigned by specifying the cut height
##' for the distance metric. Other linkage methods can be specified with
##' \code{hclust_linkage}. Please refer to \code{?hclust} for details. For the
##' "closest" and "consecutive", please refer to
##' \code{?MsFeatures::groupClosest} and \code{?MsFeatures::groupConsecutive}
##' for the details of algorithms.
##'
##' For the correlation-based grouping, \code{cor_grouping} = "connected"
##' creates a undirected graph using feature correlations as an adjacency matrix
##' (i.e., correlations serve as edge weights). The edges whose weights are
##' below the cut-off specified by `cor_cut` will be removed from the graph,
##' separating features into several disconnected subgroups. Features in the
##' same subgroup will be assigned to the same feature cluster. For the
##' "louvain", the function further applies the Louvain algorithm to the graph
##' in order to identify densely connected features via
##' [igraph::cluster_louvain]. For the "SimilarityMatrix",
##' [MsFeatures::groupSimilarityMatrix] is used for feature grouping. Please
##' refer to \code{?MsFeatures::groupSimilarityMatrix} for the details of
##' algorithm.
##'
##' @param x A \linkS4class{SummarizedExperiment} object.
##' @param i A string or integer value specifying which assay values to use.
##' @param rtime_var A string specifying the name of variable containing a
##' numeric vector of retention times in `rowData(x)`.
##' @param rt_cut A numeric value specifying a cut-off for the retention-time
##' based feature grouping.
##' @param cor_cut A numeric value specifying a cut-off for the
##' correlation-based feature grouping.
##' @param rt_grouping A string specifying which method to use for the
##' retention-time based feature grouping.
##' @param cor_grouping A string specifying which method to use for the
##' correlation-based feature grouping.
##' @param cor_use A string specifying which method to compute correlations in
##' the presence of missing values. Refer to \code{?cor} for details.
##' @param cor_method A string specifying which correlation coefficient is to be
##' computed. See \code{?cor} for details.
##' @param log2 A logical specifying whether feature intensities need to be
##' log2-transformed before calculating a correlation matrix.
##' @param hclust_linkage A string specifying the linkage method to be used when
##' \code{rt_grouping} is "hclust".
##' @return A \linkS4class{SummarizedExperiment} object with the grouping
##' results added to columns "rtime_group" (initial grouping on retention
##' times) and "feature_group" in its `rowData`.
##'
##' @references
##'
##' Johannes Rainer (2022). MsFeatures: Functionality for Mass Spectrometry
##' Features. R package version 1.3.0.
##' 'https://github.com/RforMassSpectrometry/MsFeatures
##'
##' Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Etienne Lefebvre:
##' Fast unfolding of communities in large networks. J. Stat. Mech. (2008)
##' P10008
##'
##' Csardi G, Nepusz T: The igraph software package for complex network
##' research, InterJournal, Complex Systems 1695. 2006. https://igraph.org
##'
##' @seealso
##'
##' See [hclust], [cutree], [MsFeatures::groupClosest],
##' [MsFeatures::groupConsecutive], [MsFeatures::groupSimilarityMatrix], and
##' [igraph::cluster_louvain] for the underlying functions that do work.
##'
##' See [plotRTgroup] to visualize the grouping result.
##'
##' @examples
##'
##' data(faahko_se)
##'
##' se <- clusterFeatures(faahko_se, i = "knn_vsn", rtime_var = "rtmed")
##' rowData(se)[, c("rtmed", "rtime_group", "feature_group")]
##'
##' @export
clusterFeatures <- function(x, i, rtime_var = "rtime",
rt_cut = 10, cor_cut = 0.7,
rt_grouping = c("hclust", "closest", "consecutive"),
cor_grouping = c("louvain", "SimilarityMatrix",
"connected", "none"),
cor_use = c("everything", "all.obs",
"complete.obs", "na.or.complete",
"pairwise.complete.obs"),
cor_method = c("pearson", "kendall", "spearman"),
log2 = FALSE,
hclust_linkage = "complete") {
if (!is(x, "SummarizedExperiment")) {
stop("`x` must be a SummarizedExperiment object.")
}
if (!any(colnames(rowData(x)) == rtime_var)) {
stop("Variable `", rtime_var, "` is not found in `rowData(x)`." )
}
rt_grouping <- match.arg(rt_grouping)
cor_grouping <- match.arg(cor_grouping)
rts <- rowData(x)[, rtime_var]
rt_clus <- .groupRT(rts, method = rt_grouping, cut = rt_cut,
hclust_linkage = hclust_linkage)
rt_clus <- .format_padding(rt_clus)
rtime_group <- factor(paste0("FG.", rt_clus),
levels = unique(paste0("FG.", rt_clus)))
rowData(x)$rtime_group <- rtime_group
if (cor_grouping == "none") {
x
} else {
m <- assay(x, i)
if (log2) {
m <- log2(m)
}
ml <- split.data.frame(m, rtime_group)
res <- lapply(ml, function(x) {
.groupCorr(x, use = cor_use, method = cor_method,
type = cor_grouping, cut = cor_cut)
})
cor_clus <- unsplit(res, f = rtime_group)
cor_clus <- .format_padding(cor_clus)
rowData(x)$feature_group <- paste(rtime_group, cor_clus, sep = ".")
x
}
}
.groupRT <- function(x, method = c("hclust", "closest", "consecutive"),
cut = 10, hclust_linkage = "complete") {
method <- match.arg(method)
if (anyNA(x)) {
stop("`x` must have no missing values.")
}
switch(
method,
hclust = cutree(hclust(dist(x, "manhattan"), hclust_linkage), h = cut),
closest = {
.verify_package("MsFeatures")
clus <- MsFeatures::groupClosest(x, maxDiff = cut)
.relabel(clus)
},
consecutive = {
.verify_package("MsFeatures")
clus <- MsFeatures::groupConsecutive(x, maxDiff = cut)
.relabel(clus)
}
)
}
.groupCorr <- function(x, use = c("everything", "all.obs",
"complete.obs", "na.or.complete",
"pairwise.complete.obs"),
method = c("pearson", "kendall", "spearman"),
type = c("louvain", "connected", "SimilarityMatrix"),
cut = 0.7
) {
use <- match.arg(use)
## method <- match.arg(method) ## handled with cor function
type <- match.arg(type)
## Calculate a correlation matrix
m <- cor(t(x), use = use, method = method)
if (anyNA(m)) {
stop("A correlation matrix contains NA. ",
"Please check `x` and consider imputation.")
}
switch(
type,
connected = {
g <- igraph::graph_from_adjacency_matrix(
m, mode = "lower", weighted = TRUE, diag = FALSE
)
g <- igraph::delete_edges(g, which(igraph::E(g)$weight < cut))
igraph::components(g)$membership
},
louvain = {
g <- igraph::graph_from_adjacency_matrix(
m, mode = "lower", weighted = TRUE, diag = FALSE
)
g <- igraph::delete_edges(g, which(igraph::E(g)$weight < cut))
res <- igraph::cluster_louvain(g)
igraph::membership(res)
},
SimilarityMatrix = {
res <- MsFeatures::groupSimilarityMatrix(m, threshold = cut)
.relabel(res)
}
)
}
.relabel <- function(x) {
v <- rle(x)
newc <- as.integer(factor(v$values, levels = unique(v$values)))
rep(newc, times = v$lengths)
}
.format_padding <- function(x) {
padding_width <- nchar(max(x))
x <- formatC(x, width = padding_width, format = "d", flag = 0)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.