#' Cluster cells using Louvain/Leiden community detection
#'
#' Unsupervised clustering is a common step in many workflows. This function takes
#' a matrix as input, clusters the columns using Louvain/Leiden community
#' detection, and returns the clusterassignments. In addition to clusters this function
#' calculates partitions, which represent superclusters of the Louvain/Leiden communities
#' that are found using a kNN pruning method.
#'
#' @param obj input matrix
#' @param pd phenodata - a dataframe with relevant column metadata
#' @param reduction_method The dimensionality reduction method upon which to
#' base clustering. Options are "UMAP", "tSNE", "PCA" and "LSI".
#' @param k Integer number of nearest neighbors to use when creating the k
#' nearest neighbor graph for Louvain/Leiden clustering. k is related to the
#' resolution of the clustering result, a bigger k will result in lower
#' resolution and vice versa. Default is 20.
#' @param cluster_method String indicating the clustering method to use.
#' Options are "louvain" or "leiden". Default is "leiden". Resolution
#' parameter is ignored if set to "louvain".
#' @param num_iter Integer number of iterations used for Louvain/Leiden
#' clustering. The clustering result giving the largest modularity score will
#' be used as the final clustering result. Default is 1. Note that if
#' num_iter is greater than 1, the random_seed argument will be ignored
#' for the louvain method.
#' @param partition_qval Numeric, the q-value cutoff to determine when to
#' partition. Default is 0.05.
#' @param weight A logical argument to determine whether or not to use Jaccard
#' coefficients for two nearest neighbors (based on the overlapping of their
#' kNN) as the weight used for Louvain clustering. Default is FALSE.
#' @param resolution Parameter that controls the resolution of clustering. If
#' NULL (Default), the parameter is determined automatically.
#' @param random_seed The seed used by the random number generator in
#' louvain-igraph package. This argument will be ignored if num_iter is
#' larger than 1.
#' @param verbose A logic flag to determine whether or not we should print the
#' run details.
#' @param ... Additional arguments passed to the leidenbase package.
#'
#' @return a list containing 1) the clusters, and 2) the partitions.
#' @references Rodriguez, A., & Laio, A. (2014). Clustering by fast search and
#' find of density peaks. Science, 344(6191), 1492-1496.
#' doi:10.1126/science.1242072
#' @references Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte,
#' Etienne Lefebvre: Fast unfolding of communities in large networks.
#' J. Stat. Mech. (2008) P10008
#' @references V. A. Traag and L. Waltman and N. J. van Eck: From Louvain
#' to Leiden: guaranteeing well-connected communities. Scientific Reports,
#' 9(1) (2019). doi: 10.1038/s41598-019-41695-z.
#' @references Jacob H. Levine and et. al. Data-Driven Phenotypic Dissection of
#' AML Reveals Progenitor-like Cells that Correlate with Prognosis.
#' Cell, 2015.
#' @useDynLib seqGlue, .registration = TRUE
#' @export
#'
#'
cluster<-function (obj, pd, reduction_method = c("UMAP", "tSNE", "PCA", "LSI",
"Aligned"), k = 20, cluster_method = c("leiden", "louvain"),
num_iter = 2, partition_qval = 0.05, weight = FALSE, resolution = NULL,
random_seed = NULL, verbose = F, ...)
{
cluster_method<-match.arg(cluster_method)
reduced_dim_res <- obj
if (is.null(random_seed)) {
random_seed <- sample.int(.Machine$integer.max, 1)
}
if (verbose)
message("Running ", cluster_method, " clustering algorithm ...")
if (cluster_method == "louvain") {
cluster_result <- louvain_clustering(data = reduced_dim_res,
pd = pd, k = k, weight = weight, num_iter = num_iter,
random_seed = random_seed, verbose = verbose, ...)
if (length(unique(cluster_result$optim_res$membership)) >
1) {
cluster_graph_res <- compute_partitions(cluster_result$g,
cluster_result$optim_res, partition_qval, verbose)
partitions <- igraph::components(cluster_graph_res$cluster_g)$membership[cluster_result$optim_res$membership]
names(partitions) <- row.names(reduced_dim_res)
partitions <- as.factor(partitions)
}
else {
partitions <- rep(1, nrow(pd))
}
clusters <- factor(igraph::membership(cluster_result$optim_res))
list(cluster_result = cluster_result, partitions = partitions, clusters = clusters)
}
else if (cluster_method == "leiden") {
#cds <- add_citation(cds, "leiden")
cluster_result <- leiden_clustering(data = reduced_dim_res,
pd = pd, k = k, weight = weight, num_iter = num_iter,
resolution_parameter = resolution, random_seed = random_seed,
verbose = verbose, ...)
if (length(unique(cluster_result$optim_res$membership)) >
1) {
cluster_graph_res <- compute_partitions(cluster_result$g,
cluster_result$optim_res, partition_qval, verbose)
partitions <- igraph::components(cluster_graph_res$cluster_g)$membership[cluster_result$optim_res$membership]
names(partitions) <- row.names(reduced_dim_res)
partitions <- as.factor(partitions)
}
else {
partitions <- rep(1, nrow(pd))
}
clusters <- factor(igraph::membership(cluster_result$optim_res))
list(cluster_result = cluster_result,
partitions = partitions, clusters = clusters)
}
}
louvain_clustering<-function (data, pd, k = 20, weight = F, louvain_iter = 1, random_seed = 0L,
verbose = F, ...)
{
extra_arguments <- list(...)
cell_names <- row.names(pd)
if (!identical(cell_names, row.names(pd)))
stop("Phenotype and row name from the data doesn't match")
if (is.data.frame(data))
data <- as.matrix(data)
if (!is.matrix(data))
stop("Wrong input data, should be a data frame of matrix!")
if (k < 1) {
stop("k must be a positive integer!")
}
else if (k > nrow(data) - 2) {
k <- nrow(data) - 2
warning(paste("RANN counts the point itself, k must be smaller than\nthe",
"total number of points - 1 (all other points) - 1",
"(itself)!"))
}
if (verbose) {
message("Run kNN based graph clustering starts:", "\n",
" -Input data of ", nrow(data), " rows and ", ncol(data),
" columns", "\n", " -k is set to ", k)
message(" Finding nearest neighbors...")
}
t1 <- system.time(tmp <- RANN::nn2(data, data, k + 1, searchtype = "standard"))
neighborMatrix <- tmp[[1]][, -1]
distMatrix <- tmp[[2]][, -1]
if (verbose)
message("DONE. Run time: ", t1[3], "s\n", " Compute jaccard coefficient between nearest-neighbor sets ...")
t2 <- system.time(links <- jaccard_coeff(neighborMatrix,
weight))
if (verbose)
message("DONE. Run time:", t2[3], "s\n", " Build undirected graph from the weighted links ...")
links <- links[links[, 1] > 0, ]
relations <- as.data.frame(links)
colnames(relations) <- c("from", "to", "weight")
relations$from <- cell_names[relations$from]
relations$to <- cell_names[relations$to]
t3 <- system.time(g <- igraph::graph.data.frame(relations,
directed = FALSE))
if (verbose)
message("DONE ~", t3[3], "s\n Run louvain clustering on the graph ...\n")
t_start <- Sys.time()
Qp <- -1
optim_res <- NULL
best_max_resolution <- "No resolution"
if (louvain_iter >= 2) {
random_seed <- NULL
}
for (iter in 1:louvain_iter) {
if (verbose) {
cat("Running louvain iteration ", iter, "...\n")
}
Q <- igraph::cluster_louvain(g)
if (is.null(optim_res)) {
Qp <- max(Q$modularity)
optim_res <- Q
}
else {
Qt <- max(Q$modularity)
if (Qt > Qp) {
optim_res <- Q
Qp <- Qt
}
}
}
if (verbose)
message("Maximal modularity is ", Qp, "; corresponding resolution is ",
best_max_resolution)
t_end <- Sys.time()
if (verbose) {
message("\nRun kNN based graph clustering DONE, totally takes ",
t_end - t_start, " s.")
cat(" -Number of clusters:", length(unique(igraph::membership(optim_res))),
"\n")
}
if (igraph::vcount(g) < 3000) {
coord <- NULL
edge_links <- NULL
}
else {
coord <- NULL
edge_links <- NULL
}
igraph::V(g)$names <- as.character(igraph::V(g))
return(list(g = g, relations = relations, distMatrix = distMatrix,
coord = coord, edge_links = edge_links, optim_res = optim_res))
}
leiden_clustering <-function (data, pd, k = 20, weight = NULL, num_iter = 2, resolution_parameter = 1e-04,
random_seed = NULL, verbose = FALSE, ...)
{
extra_arguments <- list(...)
if ("partition_type" %in% names(extra_arguments))
partition_type <- extra_arguments[["partition_type"]]
else partition_type <- "CPMVertexPartition"
if ("initial_membership" %in% names(extra_arguments))
initial_membership <- extra_arguments[["initial_membership"]]
else initial_membership <- NULL
if ("weights" %in% names(extra_arguments))
edge_weights <- extra_arguments[["weights"]]
else edge_weights <- NULL
if ("node_sizes" %in% names(extra_arguments))
node_sizes <- extra_arguments[["node_sizes"]]
else node_sizes <- NULL
if (partition_type %in% c("ModularityVertexPartition", "SignificanceVertexPartition",
"SurpriseVertexPartition")) {
resolution_parameter = NA
}
else if (is.null(resolution_parameter)) {
resolution_parameter = 1e-04
}
if (is.null(num_iter))
num_iter = 2
if (random_seed == 0L)
random_seed = NULL
cell_names <- row.names(pd)
if (!identical(cell_names, row.names(pd)))
stop("Phenotype and row name from the data don't match")
if (is.data.frame(data))
data <- as.matrix(data)
if (!is.matrix(data))
stop("Wrong input data, should be a data frame of matrix!")
if (k < 1) {
stop("k must be a positive integer!")
}
else if (k > nrow(data) - 2) {
k <- nrow(data) - 2
warning(paste("RANN counts the point itself, k must be smaller than\nthe",
"total number of points - 1 (all other points) - 1",
"(itself)!"))
}
if (verbose) {
message("Run kNN based graph clustering starts:", "\n",
" -Input data of ", nrow(data), " rows and ", ncol(data),
" columns", "\n", " -k is set to ", k)
message(" Finding nearest neighbors...")
}
t1 <- system.time(tmp <- RANN::nn2(data, data, k + 1, searchtype = "standard"))
neighborMatrix <- tmp[[1]][, -1]
distMatrix <- tmp[[2]][, -1]
if (verbose)
message(" DONE. Run time: ", t1[3], "s")
if (verbose)
message(" Compute jaccard coefficient between nearest-neighbor sets ...")
t2 <- system.time(links <- jaccard_coeff(neighborMatrix,
weight))
if (verbose)
message(" DONE. Run time: ", t2[3], "s")
if (verbose)
message(" Build undirected graph from the weighted links ...")
links <- links[links[, 1] > 0, ]
relations <- as.data.frame(links)
colnames(relations) <- c("from", "to", "weight")
relations$from <- cell_names[relations$from]
relations$to <- cell_names[relations$to]
t3 <- system.time(g <- igraph::graph.data.frame(relations,
directed = FALSE))
if (verbose)
message(" DONE. Run time: ", t3[3], "s")
if (verbose)
message(" Run leiden clustering ...")
t_start <- Sys.time()
if (verbose) {
table_results <- data.frame(resolution_parameter = double(),
quality = double(), modularity = double(), significance = double(),
number_clusters = integer())
}
best_modularity <- -1
best_result <- NULL
best_resolution_parameter <- "No resolution"
for (i in 1:length(resolution_parameter)) {
cur_resolution_parameter <- resolution_parameter[i]
cluster_result <- leidenbase::leiden_find_partition(g,
partition_type = partition_type, initial_membership = initial_membership,
edge_weights = edge_weights, node_sizes = node_sizes,
seed = random_seed, resolution_parameter = cur_resolution_parameter,
num_iter = num_iter, verbose = verbose)
quality <- cluster_result[["quality"]]
modularity <- cluster_result[["modularity"]]
significance <- cluster_result[["significance"]]
if (verbose)
table_results <- rbind(table_results, data.frame(resolution_parameter = cur_resolution_parameter,
quality = quality, modularity = modularity, significance = significance,
cluster_count = max(cluster_result[["membership"]])))
if (verbose)
message(" Current resolution is ", cur_resolution_parameter,
"; Modularity is ", modularity, "; Quality is ",
quality, "; Significance is ", significance,
"; Number of clusters is ", max(cluster_result[["membership"]]))
if (modularity > best_modularity) {
best_result <- cluster_result
best_resolution_parameter <- cur_resolution_parameter
best_modularity <- modularity
}
if (is.null(best_result)) {
best_result <- cluster_result
best_resolution_parameter <- NULL
best_modularity <- cluster_result[["modularity"]]
}
}
t_end <- Sys.time()
if (verbose) {
message(" Done. Run time: ", t_end - t_start, "s\n")
message(" Clustering statistics")
selected <- vector(mode = "character", length = length(resolution_parameter))
for (irespar in 1:length(resolution_parameter)) {
if (identical(table_results[["resolution_parameter"]][irespar],
best_resolution_parameter))
selected[irespar] <- "*"
else selected[irespar] <- " "
}
print(cbind(` ` = " ", table_results, selected), row.names = FALSE)
message()
message(" Cell counts by cluster")
membership <- best_result[["membership"]]
membership_frequency <- stats::aggregate(data.frame(cell_count = membership),
list(cluster = membership), length)
membership_frequency <- cbind(` ` = " ", membership_frequency,
cell_fraction = sprintf("%.3f", membership_frequency[["cell_count"]]/sum(membership_frequency[["cell_count"]])))
print(membership_frequency, row.names = FALSE)
message()
message(" Maximal modularity is ", best_modularity,
" for resolution parameter ", best_resolution_parameter)
message("\n Run kNN based graph clustering DONE.\n -Number of clusters: ",
max(best_result[["membership"]]))
}
if (igraph::vcount(g) < 3000) {
coord <- NULL
edge_links <- NULL
}
else {
coord <- NULL
edge_links <- NULL
}
igraph::V(g)$names <- as.character(igraph::V(g))
out_result <- list(membership = best_result[["membership"]],
modularity = best_result[["modularity"]])
names(out_result$membership) = cell_names
return(list(g = g, relations = relations, distMatrix = distMatrix,
coord = coord, edge_links = edge_links, optim_res = out_result))
}
compute_partitions<-function (g, optim_res, qval_thresh = 0.05, verbose = FALSE)
{
cell_membership <- as.factor(igraph::membership(optim_res))
membership_matrix <- Matrix::sparse.model.matrix(~cell_membership +
0)
num_links <- Matrix::t(membership_matrix) %*% igraph::as_adjacency_matrix(g) %*%
membership_matrix
diag(num_links) <- 0
louvain_modules <- levels(cell_membership)
edges_per_module <- Matrix::rowSums(num_links)
total_edges <- sum(num_links)
theta <- (as.matrix(edges_per_module)/total_edges) %*% Matrix::t(edges_per_module/total_edges)
var_null_num_links <- theta * (1 - theta)/total_edges
num_links_ij <- num_links/total_edges - theta
cluster_mat <- pnorm_over_mat(as.matrix(num_links_ij), var_null_num_links)
num_links <- num_links_ij/total_edges
cluster_mat <- matrix(stats::p.adjust(cluster_mat), nrow = length(louvain_modules),
ncol = length(louvain_modules))
sig_links <- as.matrix(num_links)
sig_links[cluster_mat > qval_thresh] = 0
diag(sig_links) <- 0
cluster_g <- igraph::graph_from_adjacency_matrix(sig_links,
weighted = T, mode = "undirected")
list(cluster_g = cluster_g, num_links = num_links, cluster_mat = cluster_mat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.