#' @importFrom grDevices dev.off png
#' @importFrom methods is
#' @importFrom stats median model.matrix quantile
NULL
#' Find Differentiated Clusters
#'
#' @param object
#' @param ...
#'
#' @return
#' @rdname find_differentiated_clusters
#' @export find_differentiated_clusters
#'
#' @examples
#' if(requireNamespace("Seurat", quietly=TRUE)){
#' # Find differentiated clusters in Seurat object using
#' # edgeR_pseudobulk_LRT function
#' data("Seu")
#'
#' DA = find_differentiated_clusters(
#' Seu,
#' differential_function = differential_edgeR_pseudobulk_LRT,
#' logFC.th = log2(1.5),
#' qval.th = 0.01,
#' by = "seurat_clusters",
#' limit = 5,
#' cluster_of_origin = "Alpha",
#' min_frac_cell_assigned = 0.1,
#' verbose = TRUE
#' )
#'
#' # Summary of differential genes per cluster
#' head(DA$diffmat_n)
#'
#' # Differential analysis
#' head(DA$res)
#'
#' # Did the clustering pass the minimum percent of cell assigned threshold ?
#' print(DA$passing_min_pct_cell_assigned)
#'
#' # Find differentiated clusters in Seurat object using Seurat function
#' data("Seu")
#'
#' DA = find_differentiated_clusters(
#' Seu,
#' differential_function = differential_Seurat,
#' logFC.th = log2(1.5),
#' qval.th = 0.01,
#' by = "seurat_clusters",
#' limit = 5,
#' cluster_of_origin = "Alpha",
#' min_frac_cell_assigned = 0.1,
#' verbose = TRUE
#' )
#'
#' # Find differentiated clusters in Seurat object using Seurat function,
#' # passing additional arguments to differential_Seurat and thus
#' # Seurat::FindAllMarkers funtion.
#' data("Seu")
#'
#' DA = find_differentiated_clusters(
#' Seu,
#' differential_function = differential_Seurat,
#' logFC.th = log2(1.5),
#' qval.th = 0.01,
#' by = "seurat_clusters",
#' limit = 5,
#' cluster_of_origin = "Alpha",
#' min_frac_cell_assigned = 0.1,
#' verbose = TRUE,
#' test.use = "roc" # additional argument
#' )
#' }
#' # Find differentiated clusters in SingleCellExperiment object using
#' # differential_ChromSCape function.
#' if(requireNamespace("ChromSCape", quietly=TRUE)){
#' data("scExp")
#' scExp = ChromSCape::find_clusters_louvain_scExp(scExp,
#' resolution = 0.1)
#' DA = find_differentiated_clusters(
#' scExp,
#' differential_function = differential_ChromSCape,
#' logFC.th = log2(5),
#' qval.th = 0.01,
#' by = "IDcluster",
#' limit = 5,
#' cluster_of_origin = "Alpha",
#' min_frac_cell_assigned = 0.1,
#' verbose = TRUE,
#' )
#' }
find_differentiated_clusters <- function(object, ...) {
UseMethod(generic = 'find_differentiated_clusters', object = object)
}
#' iterative_differential_clustering
#'
#' @param object
#' @param ...
#'
#' @return
#' @rdname iterative_differential_clustering
#' @export iterative_differential_clustering
#'
#' @examples
#' # Clustering of Seurat scRNA object (Paired-Tag)
#' if(requireNamespace("Seurat", quietly=TRUE)){
#'
#' data("Seu", package = "IDclust")
#' set.seed(47)
#' Seu = iterative_differential_clustering(Seu, saving = FALSE, plotting =FALSE,
#' logFC.th = 0.2, qval.th = 0.1)
#'
#' }
#'
#' # Clustering of scExp scH3K27ac object (Paired-Tag)
#' if(requireNamespace("ChromSCape", quietly=TRUE)){
#'
#' data("scExp", package = "IDclust")
#' set.seed(47)
#' scExp = iterative_differential_clustering(scExp, saving = FALSE, plotting =FALSE,
#' logFC.th = 0.5, qval.th = 0.01)
#'
#' }
iterative_differential_clustering <- function(object, ...) {
UseMethod(generic = 'iterative_differential_clustering', object = object)
}
#' @details Find significantly differential features between the given set
#' of clusters (within the 'IDcluster' column of the SingleCellExperiment).
#' For each cluster, if enough differences are found, mark the cluster as a
#' 'true' subcluster and gives it the alias 'cluster_of_origin_cluster'.
#' The function will use by default [ChromSCape::differential_activation()]
#' function to define differential features.
#'
#' @param object A SingleCellExperiment with 'IDcluster' column filled with
#' cluster assignations.
#' @param differential_function A function that take in entry a
#' SingleCellExperiment object and parameters passed in ... and returns a
#' data.frame containing the significantly differential features for each
#' cluster.
#' @param by A character specifying the name of the metadata column referencing
#' the clusters.
#' @param logFC.th A numeric specifying the log2 fold change of activation
#' above/below which a feature is considered as significantly differential
#' passed to the differential_function.
#' @param qval.th A numeric specifying the adjusted p-value below
#' which a feature is considered as significantly differential passed to the
#' differential_function.
#' @param min_frac_cell_assigned A numeric between 0 and 1 specifying the
#' minimum percentage of the total cells in the SingleCellExperiment object that
#' needs to be assigned. If a lower proportion is assigned, all cells are
#' assigned to the cluster of origin.
#' @param limit An integer specifying the minimum number of features required
#' for a subcluster to be called 'true' subcluster.
#' @param FP_linear_model Optional. A linear model (see [stats::lm()]) of
#' the number of false positive expected for a given cluster size. The [lm_list]
#' list of linear models present in this package gives default values accross
#' multiple binsizes. (See [calculate_FDR_scEpigenomics]).
#' @param cluster_of_origin A character specifying the name of the cluster of
#' origin that will be concatenated before the name of true subclusters.
#' @param min_cluster_size An integer specifying the minimum number of cells
#' in a cluster to consider it as a 'true' subcluster.
#' @param verbose A logical specifying wether to print.
#' @param ... Additional parameters passed to the differential_function. See
#' [differential_ChromSCape()] for more information on additional
#' parameters for the default function.
#'
#' @seealso [differential_ChromSCape()]
#'
#' @return A list containing :
#' * "diffmat_n" - A data.frame containing the number of differential regions
#' foun per cluster and the new assignations of the subclusters.
#' * "res" - A data.frame containing the differential analysis.
#' * "passing_min_pct_cell_assigned" - A boolean indicating if enough cells
#' were assigned
#'
#' @import dplyr
#' @export
#' @rdname find_differentiated_clusters
#' @exportS3Method find_differentiated_clusters default
find_differentiated_clusters.default <- function(
object,
differential_function = differential_ChromSCape,
by = "IDcluster",
logFC.th = log2(1.5),
qval.th = 0.01,
min_frac_cell_assigned = 0.1,
limit = 5,
FP_linear_model = NULL,
cluster_of_origin = "Alpha",
min_cluster_size = 30,
verbose = TRUE,
...){
cluster_u = names(sort(table(object[[by]])))
diffmat_n = data.frame(n_differential = rep(0,length(cluster_u)),
cluster_of_origin = cluster_of_origin,
subcluster = cluster_u,
true_subcluster = cluster_u)
res = differential_function(object, by = by, logFC.th = logFC.th, qval.th = qval.th, ...)
gc()
n_cell_assigned = 0
for(i in 1:(length(cluster_u))){
cluster = cluster_u[i]
group_cells = colnames(object)[object[[by]] == cluster]
if(length(group_cells) >= min_cluster_size){
res. = res[which(res$cluster == cluster),]
diffmat_n$n_differential[i] = nrow(res.)
if(verbose) cat(cluster," - found", diffmat_n$n_differential[i], "enriched features.\n")
if(is.null(FP_linear_model)){
if(diffmat_n$n_differential[i] < limit){
if(verbose) cat(cluster, " cluster has less than", limit, "enriched features.\nAssigning the cells to cluster of origin.\n")
diffmat_n$true_subcluster[i] = cluster_of_origin
} else{
n_cell_assigned = n_cell_assigned + length(group_cells)
diffmat_n$true_subcluster[i] = paste0(cluster_of_origin, "_", cluster)
}
} else{
cluster_size <- data.frame(ncells=length(group_cells))
conf = stats::predict(FP_linear_model, newdata = cluster_size, interval = 'confidence')
confidence_limit = conf[1,"upr"]
cat("Ncells = ",length(group_cells), " - Confidence = ", confidence_limit, "\n")
limit. = max(limit, confidence_limit)
if(diffmat_n$n_differential[i] < limit.){
if(verbose) cat(cluster, " cluster has less than", limit., "enriched features.\nAssigning the cells to cluster of origin.\n")
diffmat_n$true_subcluster[i] = cluster_of_origin
} else{
n_cell_assigned = n_cell_assigned + length(group_cells)
diffmat_n$true_subcluster[i] = paste0(cluster_of_origin, "_", cluster)
}
}
} else{
if(verbose) cat(cluster, " cluster has less than", min_cluster_size, " cells. \nAssigning the cells to cluster of origin.\n")
diffmat_n$true_subcluster[i] = cluster_of_origin
}
}
passing = TRUE
if(verbose) cat("Finished finding differences - ", n_cell_assigned/ ncol(object), " fraction of cells were assigned.\n")
if((n_cell_assigned/ ncol(object)) < min_frac_cell_assigned){
if(verbose) cat("Not enough cells were assigned - not clustering.\n")
passing = FALSE
}
out = list("diffmat_n" = diffmat_n, "res" = res, "passing_min_pct_cell_assigned" = passing)
return(out)
}
#' @description Main function of the IDclust package. Provided a
#' SingleCellExperiment pre-processed with ChromSCape, will find biologically
#' relevant clusters by iteratively re-clustering and re-processing clusters.
#' At each iteration, subclusters having enough significantly enriched features
#' compared to other subclusters are defined as 'true' subclusters. Others are
#' assigned to parent clusters. The algorithm will stop when no more 'true'
#' subclusters are found.
#'
#' This method ensure that each cluster found in this unsupervised way have
#' significant biological differences, based on the user defined thresholds.
#'
#' @details The default differential analysis used is the
#' [ChromSCape::differential_activation()] function. This function
#' compares the % of active cells in the cluster versus the rest of cells and
#' perform a Chi-squared test to calculate p-values.
#'
#'
#' @param object A SingleCellExperiment object.
#' @param output_dir The output directory in which to plot and save objects.
#' @param plotting A logical specifying wether to save the plots or not.
#' @param saving A logical specifying wether to save the data or not.
#' @param limit An integer specifying the minimum number of significantly
#' enriched / depleted features required in order for a subcluster to be called
#' a 'true' subcluster
#' @param dim_red The name of the slot to save the dimensionality reduction at
#' each step.
#' @param vizualization_dim_red The name of the slot used for plotting. Must be
#' a valid slot present in \code{reducedDimNames(object)}.
#' @param processing_function A function that re-process the subset of clusters
#' at each step. It msut take in entry a SingleCellExperiment object, \code{dim_red}
#' and \code{n_dims} parameters and returns a SingleCellExperiment containing a cell
#' embedding. See [processing_ChromSCape] for the default function.
#' @param differential_function A function that take in entry a
#' SingleCellExperiment object and parameters passed in ... and returns a
#' data.frame containing the significantly differential features for each
#' cluster.
#' @param logFC.th A numeric specifying the log2 fold change of activation
#' above/below which a feature is considered as significantly differential
#' passed to the differential_function.
#' @param qval.th A numeric specifying the adjusted p-value below
#' which a feature is considered as significantly differential passed to the
#' differential_function.
#' @param min.pct A numeric between 0 and 1 specifying the fraction of cells
#' active in a cluster for a feature to be defined as marker. Default to NULL,
#' if NULL, the 70th percentile of global activation is taken as minimal
#' percentage of activation for the differential analysis. Increasing this
#' value will decrease the number of differential features.
#' @param min_frac_cell_assigned A numeric between 0 and 1 specifying the
#' minimum percentage of the total cells in the SingleCellExperiment object that
#' needs to be assigned. If a lower proportion is assigned, all cells are
#' assigned to the cluster of origin.
#' @param starting.resolution A numeric specifying the resolution to use for the
#' Louvain clustering of the first iteration. It is recommended to set it quite
#' low in order to have few starting clusters.
#' @param starting.k An integer specifying the number of nearest neighbors to
#' use for the Louvain clustering of the first iteration. It is recommended
#' to set it quite high in order to have few starting clusters
#' @param resolution A numeric specifying the resolution to use for the Louvain
#' clustering at each iteration.
#' @param max_k An integer specifying the maximum number of nearest neighbors to use for
#' the Louvain clustering at each iteration. This k is reduced with the number of cells,
#' to a minimum of k = 5.
#' @param k_percent A numeric between 0 and 1 representing the fraction of cells
#' to calculate the k for the KNN graph calulation of clustering.
#' @param FP_linear_model Optional. A linear model (see [stats::lm()]) of
#' the number of false positive expected for a given cluster size. The [lm_list]
#' list of linear models present in this package gives default values accross
#' multiple binsizes. (See [calculate_FDR_scEpigenomics]).
#' @param n_dims An integer specifying the number of first dimensions to keep
#' in the dimensionality reduction step.
#' @param color Set of colors to use for the coloring of the clusters. This must
#' contains enough colors for each cluster (minimum 20 colors, but 100 colors
#' at least is recommended, based on the dataset).
#' @param swapExperiment A character specifying an alternative experiment
#' (see [SingleCellExperiment::altExp()]) to switch for differential analysis.
#' The processing will be done in the main experiment while the differential
#' analysis will be done in the alternative experiment.
#' @param force_initial_clustering A logical specifying wether to force the
#' initial number of cluster between 2 and 6. This is in order to avoid a too high
#' number of initial clusters which would be equivalent to a classical louvain
#' clustering.
#' @param verbose A logical specifying wether to print.
#'
#' @param ...
#'
#' @return The SingleCellExperiment object with the assignation of cells to clusters.
#' If saving is true, also saves list of differential analyses, differential
#' analyses summaries and embeddings for each re-clustered cluster. If runFDR is
#' TRUE, also saves the list of FDR for each re-clusterd cluster.
#'
#'
#' @importFrom Matrix Matrix rowSums
#' @importFrom qs qsave
#' @importFrom SummarizedExperiment assays
#' @importFrom grDevices colors
#' @importFrom SingleCellExperiment reducedDim
#' @import ggplot2
#' @import dplyr
#' @export
#' @rdname iterative_differential_clustering
#' @exportS3Method iterative_differential_clustering default
#'
iterative_differential_clustering.default <- function(
object,
output_dir = "./",
plotting = TRUE,
saving = TRUE,
n_dims = 50,
dim_red = "PCA",
vizualization_dim_red = "UMAP",
processing_function = processing_ChromSCape,
min.pct = NULL,
differential_function = differential_ChromSCape,
logFC.th = log2(2),
qval.th = 0.01,
min_frac_cell_assigned = 0.1,
limit = 5,
starting.k = 100,
starting.resolution = 0.1,
resolution = 0.1,
max_k = 50,
k_percent = 0.1,
FP_linear_model = NULL,
color = NULL,
d = 10,
swapExperiment = NULL,
force_initial_clustering = TRUE,
verbose = TRUE,
...
){
set.seed(47)
if(plotting == TRUE){
if(dir.exists(output_dir))
dir.create(file.path(output_dir, "iterations"), showWarnings = FALSE) else stop("output_dir is an invalid directory")
}
# For the first partition, try to find very low level clusters (e.g. low
# resolution, high number of neighbors)
if(force_initial_clustering){
nclust = 1
factor = 1
max_iter = 10
iter = 0
while( (nclust < 2 | nclust > 6) & iter < max_iter){
object. = ChromSCape::find_clusters_louvain_scExp(object,
k = starting.k,
resolution = factor * starting.resolution,
use.dimred = dim_red)
nclust = length(unique(object.$cell_cluster))
if(nclust < 2) factor = 1.15 * factor
if(nclust > 6) factor = 0.85 * factor
iter = iter + 1
}
if(iter > max_iter) {
warning("IDclust::iterative_differential_clustering - Didn't manage",
" to find more than 1 initial cluster...")
return()
}
} else {
object. = ChromSCape::find_clusters_louvain_scExp(object,
k = starting.k,
resolution = starting.resolution,
use.dimred = dim_red)
}
object$IDcluster = gsub("C","A", object.$cell_cluster)
rm(object.)
gc()
# Calculate the average % cells activated in a feature
# and return a level of activation based on a given decile
binmat = Matrix::Matrix((SummarizedExperiment::assays(object)$counts > 0) + 0, sparse = TRUE)
if(is.null(min.pct)) min.pct = quantile(Matrix::rowSums(binmat) / ncol(binmat), 0.7)
if(verbose) cat("The minimum percentage of activation is ", round(100 * min.pct, 3),"%.\n")
# Find initial differences (even if there are none, the initial clusters
# are always considered as true clusters).
if(verbose) cat("Initial round of clustering - limit of differential genes set to 0",
" for this first round only.\n")
if(length(swapExperiment)){
if(verbose) cat("Swicthing experiment for DA to ", swapExperiment, ".\n")
object = ChromSCape::swapAltExp_sameColData(object, alt = swapExperiment)
}
DA = find_differentiated_clusters(
object,
differential_function = differential_function,
by = "IDcluster",
logFC.th = logFC.th,
qval.th = qval.th,
min_frac_cell_assigned = min_frac_cell_assigned,
limit = 0,
FP_linear_model = FP_linear_model,
cluster_of_origin = "Alpha",
verbose = verbose,
min.pct = min.pct,
...)
if(length(swapExperiment)){
if(verbose) cat("Swicthing back for clustering to main Experiment.\n")
object = ChromSCape::getMainExperiment(object)
}
# Starting list of clusters to re-cluster
differential_summary_df = DA$diffmat_n
object$IDcluster = differential_summary_df$true_subcluster[match(object$IDcluster, differential_summary_df$subcluster)]
# Colors for the plot
if (is.null(color)){
color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = TRUE)]
color <- sample(color, 400)
} else {
stopifnot(is.character(color))
if(length(color) < 20) warning("IDclust::iterative_differential_clustering - ",
"The color vector might be too short.",
"Please precise more colors.")
}
object$cell_cluster_color = NULL
# List of embeddings
list_embeddings = list(SingleCellExperiment::reducedDim(object, dim_red))
names(list_embeddings)[1] = paste(unique(object$IDcluster), collapse = "_")
# List of marker features
res = DA$res
if(nrow(res) > 0) res$cluster_of_origin = "Alpha"
list_res = list(res)
names(list_res)[1] = "Alpha"
iteration = 0
gc()
# Run IDC until no more clusters can be re-clustered into meaningful clusters
while(iteration < nrow(differential_summary_df)){
if(plotting == TRUE){
# Plot each iteration of the algorithm
png(file.path(output_dir, "iterations", paste0("Iteration_",iteration,".png")), width = 1600, height = 1200, res = 200)
object. = object
object.$cell_cluster = gsub("Alpha_","",object.$IDcluster)
print(
ChromSCape::plot_reduced_dim_scExp(object., reduced_dim = vizualization_dim_red, color_by = "IDcluster",
downsample = 50000, size = 0.35, transparency = 0.75, annotate_clusters = TRUE) +
ggtitle(paste0("Initital Clustering")) + theme(legend.position = "none")
)
dev.off()
rm(object.)
}
iteration = iteration + 1
if(verbose) cat("Doing iteration number ", iteration,"...\n")
# Name of the cluster of origin
partition_cluster_of_origin = differential_summary_df$true_subcluster[iteration]
# Make sure that we do not re-cluster a 'parent cluster',
# e.g. cells that were unassigned and therefore assigned to the parent
# cluster
if(!(partition_cluster_of_origin %in% differential_summary_df$true_subcluster[duplicated(differential_summary_df$true_subcluster)])){
# Letter assigned to the 'partition depth' (e.g. A, B, C...)
partition_depth = which(LETTERS == substr(gsub(".*_","",partition_cluster_of_origin),1,1)) + 1
# Select only cells from the given cluster
object. = object[, which(object$IDcluster %in% partition_cluster_of_origin)]
if(verbose) cat("Re-calculating PCA and subclustering for cluster", partition_cluster_of_origin,
"with",ncol(object.),"cells.\n")
# Re-cluster a cluster only if there are more than 100 cells
if(ncol(object.) > max(10, n_dims)){
# Re-processing sub-cluster
object. = processing_function(object., n_dims = n_dims, dim_red = dim_red)
# Re-clustering sub-cluster
k = max(10, min(max_k, k_percent * ncol(object.))) # select a k according to the number of cells
if(verbose) cat("Reclustering with k =", k, "and resolution =", resolution, "\n")
object.. = ChromSCape::find_clusters_louvain_scExp(object., k = k, resolution = resolution,
use.dimred = dim_red)
object.$IDcluster <- paste0(LETTERS[partition_depth],gsub("C", "", object..$cell_cluster))
rm(object..)
clusters = object.$IDcluster
cluster_u = unique(clusters)
if(length(cluster_u) > 1 ){
if(verbose) cat("Found", length(cluster_u),"subclusters.\n")
if(verbose) print(table(clusters))
if(length(swapExperiment)){
if(verbose) cat("Swicthing experiment for DA to ", swapExperiment, ".\n")
object. = ChromSCape::swapAltExp_sameColData(object., alt = swapExperiment)
}
# Find differentiated clusters
DA = find_differentiated_clusters(object.,
differential_function = differential_function,
by = "IDcluster",
logFC.th = logFC.th,
qval.th = qval.th,
min_frac_cell_assigned = min_frac_cell_assigned,
limit = limit,
FP_linear_model = FP_linear_model,
cluster_of_origin = partition_cluster_of_origin,
verbose = verbose,
min.pct = min.pct,
...)
gc()
if(length(swapExperiment)){
if(verbose) cat("Swicthing back for clustering to main Experiment.\n")
object. = ChromSCape::getMainExperiment(object.)
}
# Retrieve DA results
diffmat_n = DA$diffmat_n
res = DA$res
res$cluster_of_origin = partition_cluster_of_origin[min(1,nrow(res))]
list_res[[partition_cluster_of_origin]] = res
list_embeddings[[partition_cluster_of_origin]] = SingleCellExperiment::reducedDim(object., dim_red)
# If more than 'min_frac_cell_assigned' of the cells were assigned
# to 'true' subclusters (with marker features)
if(!isFALSE(DA$passing_min_pct_cell_assigned)){
# Add the new sublclusters to the list of clusters
differential_summary_df = rbind(differential_summary_df, diffmat_n)
object.$IDcluster = diffmat_n$true_subcluster[match(object.$IDcluster, diffmat_n$subcluster)]
object$IDcluster[match(colnames(object.), colnames(object))] = object.$IDcluster
if(plotting == TRUE){
png(file.path(output_dir, paste0(partition_cluster_of_origin,"_true.png")), width = 1400, height = 1200, res = 200)
print(
ChromSCape::plot_reduced_dim_scExp(object., color_by = "IDcluster",annotate_clusters = F, reduced_dim = dim_red) +
ggtitle(paste0(partition_cluster_of_origin))
)
dev.off()
}
} else{
if(verbose) cat("Found 2 subcluster for", partition_cluster_of_origin," but with no difference. Maximum clustering reached...\n")
}
} else{
if(verbose) cat("Found only 1 subcluster for", partition_cluster_of_origin,". Maximum clustering reached...\n")
}
}
gc()
} else{
if(verbose) cat(partition_cluster_of_origin,"- This cluster was formed by 'unassigned' cells, not clustering it further...\n")
}
}
if(verbose) cat("\n\n\n##########################################################\nFinished !\nFound a total of", length(unique(object$IDcluster)),"clusters after",iteration ,"iterations.",
"\nThe average cluster size is ", floor(mean(table(object$IDcluster)))," and the median is",floor(median(table(object$IDcluster))),".",
"\nThe number of initital clusters not subclustered is ",length(grep("_", unique(object$IDcluster),invert = TRUE)),".",
"\n##########################################################\n")
## Saving results
if(saving){
# Table of differential features for each re-clustering
IDC_DA = do.call("rbind", list_res)
IDC_DA$IDcluster = paste0(IDC_DA$cluster_of_origin, "_", IDC_DA$cluster)
write.csv(IDC_DA, file = file.path(output_dir, "IDC_DA.csv"), quote = FALSE, row.names = FALSE)
# List of embedding of each re-clustered cluster
qs::qsave(list_embeddings, file.path(output_dir, "IDC_embeddings.qs"))
# Summary table of the number of differential features for each re-clustering
write.csv(differential_summary_df, file = file.path(output_dir, "IDC_summary.csv"), quote = FALSE, row.names = FALSE)
# Final SingleCellExperiment with the clusters found by IDC
qs::qsave(object, file.path(output_dir, "scExp_IDC.qs"))
}
return(object)
}
#' @param object A Seurat object containing scRNA dataset with 'IDcluster'
#' column.
#' @param differential_function A function that take in entry a
#' SingleCellExperiment object and parameters passed in ... and returns a
#' data.frame containing the significantly differential features for each
#' cluster. See [differential_edgeR_pseudobulk_LRT] for the default function.
#' @param by A character specifying the name of the metadata column referencing
#' the clusters.
#' @param logFC.th A numeric specifying the log2 fold change of activation
#' above/below which a feature is considered as significantly differential
#' passed to the differential_function.
#' @param qval.th A numeric specifying the adjusted p-value below
#' which a feature is considered as significantly differential passed to the
#' differential_function.
#' @param limit An integer specifying the minimum number of features required
#' for a subcluster to be called 'true' subcluster.
#' @param cluster_of_origin A character specifying the name of the cluster of
#' origin that will be concatenated before the name of true subclusters.
#' @param min_frac_cell_assigned A numeric between 0 and 1 specifying the
#' minimum percentage of the total cells in the SingleCellExperiment object that
#' needs to be assigned. If a lower proportion is assigned, all cells are
#' assigned to the cluster of origin.
#' @param min_cluster_size An integer specifying the minimum number of cells
#' in a cluster to consider it as a 'true' subcluster.
#' @param verbose A logical specifying wether to print.
#' @param ... Additional parameters passed to the differential_function. See
#' [differential_ChromSCape()] for more information on additional
#' parameters for the default function.
#'
#' @return
#' @export
#' @rdname find_differentiated_clusters
#' @exportS3Method find_differentiated_clusters Seurat
#'
find_differentiated_clusters.Seurat <- function(object,
differential_function = differential_edgeR_pseudobulk_LRT,
by = "IDcluster",
logFC.th = log2(1.5),
qval.th = 0.01,
limit = 5,
cluster_of_origin = "Alpha",
min_frac_cell_assigned = 0.1,
min_cluster_size = 30,
verbose = TRUE,
...
){
# If Seurat is a SingleCellExperiment or else, try convert it to Seurat
if(!is(object, "Seurat")) {
object = Seurat::as.Seurat(object)
if(is.null(object$IDcluster) & !is.null(object$IDcluster))
object$IDcluster = object$IDcluster else stop("Need 'IDcluster' or 'IDcluster', column.")
Seurat::Idents(object) = object$IDcluster
}
set.seed(47)
# Starting list of clusters to re-cluster
cluster_u = as.character(unique(object@meta.data[[by]]))
diffmat_n = data.frame(n_differential = rep(0,length(cluster_u)),
cluster_of_origin = cluster_of_origin,
subcluster = cluster_u,
true_subcluster = cluster_u)
res = differential_function(object,
by = by,
logFC.th = logFC.th,
qval.th = qval.th,
...)
n_cell_assigned = 0
for(i in seq_along(cluster_u)){
group_cells = colnames(object)[which(object@meta.data[[by]] %in% cluster_u[i])]
if(length(group_cells) >= min_cluster_size){
res. = res[which(res$cluster == cluster_u[i]),]
diffmat_n$n_differential[i] = nrow(res.)
if(verbose) cat(cluster_u[i],"- Found",diffmat_n$n_differential[i], "differential genes.\n")
if(diffmat_n$n_differential[i] < limit){
if(verbose) cat(cluster_u[i], " cluster has less than", limit, "enriched features.\nAssigning the cells to cluster of origin.\n")
diffmat_n$true_subcluster[i] = cluster_of_origin
} else{
n_cell_assigned = n_cell_assigned + length(group_cells)
diffmat_n$true_subcluster[i] = paste0(cluster_of_origin, "_", cluster_u[i])
}
} else{
if(verbose) cat(cluster_u[i], " cluster has less than", min_cluster_size, " cells. \nAssigning the cells to cluster of origin.\n")
diffmat_n$true_subcluster[i] = cluster_of_origin
}
}
passing = TRUE
if(verbose) cat("Finished finding differences - ", n_cell_assigned/ ncol(object), " fraction of cells were assigned.\n")
if((n_cell_assigned/ ncol(object)) < min_frac_cell_assigned){
if(verbose) cat("Not enough cells were assigned - not clustering.\n")
passing = FALSE
}
out = list("diffmat_n" = diffmat_n, "res" = res, "passing_min_pct_cell_assigned" = passing)
return(out)
}
#' @param object A Seurat object preprocessed with Seurat.
#' @param output_dir The output directory in which to plot and save objects.
#' @param plotting A logical specifying wether to save the plots or not.
#' @param saving A logical specifying wether to save the data or not.
#' @param limit An integer specifying the minimum number of significantly
#' enriched / depleted features required in order for a subcluster to be called
#' a 'true' subcluster
#' @param dim_red The name of the slot to save the dimensionality reduction at
#' each step in the \code{Seurat::Reductions(object)}.
#' @param vizualization_dim_red The name of the slot used for plotting. Must be
#' a valid slot present in \code{Seurat::Reductions(object)}.
#' @param processing_function A function that re-process the subset of clusters
#' at each step. It msut take in entry a Seurat object, \code{dim_red}
#' and \code{n_dims} parameters and returns a Seurat containing a cell
#' embedding. See [processing_Seurat] for the default function.
#' @param differential_function A function that take in entry a
#' SingleCellExperiment object and parameters passed in ... and returns a
#' data.frame containing the significantly differential features for each
#' cluster. See [differential_edgeR_pseudobulk_LRT] for the default function.
#' @param logFC.th A numeric specifying the log2 fold change of activation
#' above/below which a feature is considered as significantly differential
#' passed to the differential_function.
#' @param qval.th A numeric specifying the adjusted p-value below
#' which a feature is considered as significantly differential passed to the
#' differential_function.
#' @param min_frac_cell_assigned A numeric between 0 and 1 specifying the
#' minimum percentage of the total cells in the SingleCellExperiment object that
#' needs to be assigned. If a lower proportion is assigned, all cells are
#' assigned to the cluster of origin.
#' @param max_k An integer specifying the maximum number of nearest neighbors to use for
#' the Louvain clustering at each iteration. This k is reduced with the number of cells,
#' to a minimum of k = 5.
#' @param k_percent A numeric between 0 and 1 representing the fraction of cells
#' to calculate the k for the KNN graph calulation of clustering.
#' @param resolution A numeric specifying the resolution to use for the Louvain
#' clustering at each iteration.
#' @param starting.resolution A numeric specifying the resolution to use for the
#' Louvain clustering of the first iteration. It is recommended to set it quite
#' low in order to have few starting clusters.
#' @param n_dims An integer specifying the number of first dimensions to keep
#' in the dimensionality reduction step.
#' @param color Set of colors to use for the coloring of the clusters. This must
#' contains enough colors for each cluster (minimum 20 colors, but 100 colors
#' at least is recommended, based on the dataset).
#' @param force_initial_clustering A logical specifying wether to force the
#' initial number of cluster between 2 and 6. This is in order to avoid a too high
#' number of initial clusters which would be equivalent to a classical louvain
#' clustering.
#' @param verbose A logical specifying wether to print.
#' @param ... Additional parameters passed to the differential_function. See
#' [differential_edgeR_pseudobulk_LRT()] for more information on additional
#' parameters for the default function.
#'
#' @return The Seurat object with the assignation of cells to clusters.
#' If saving is true, also saves list of differential analyses, differential
#' analyses summaries and embeddings for each re-clustered cluster.
#'
#'
#' @import ggplot2
#' @import dplyr
#' @importFrom grDevices colors
#' @importFrom qs qsave
#' @export
#' @rdname iterative_differential_clustering
#' @exportS3Method iterative_differential_clustering Seurat
#'
iterative_differential_clustering.Seurat <- function(
object,
output_dir = "./",
plotting = TRUE,
saving = TRUE,
n_dims = 50,
dim_red = "pca",
vizualization_dim_red = "umap",
processing_function = processing_Seurat,
differential_function = differential_edgeR_pseudobulk_LRT,
logFC.th = log2(2),
qval.th = 0.01,
min_frac_cell_assigned = 0.1,
limit = 5,
starting.resolution = 0.1,
starting.k = 100,
resolution = 0.1,
max_k = 50,
k_percent = 0.1,
color = NULL,
force_initial_clustering = TRUE,
verbose = TRUE,
...
){
set.seed(47)
if(plotting == TRUE){
if(dir.exists(output_dir))
dir.create(file.path(output_dir, "iterations"), showWarnings = FALSE) else stop("output_dir is an invalid directory")
}
# Preprocessing the object & Running UMAP if needed
object = processing_function(object, n_dims = n_dims, dim_red = dim_red)
if(vizualization_dim_red == "umap") {
object <- FindNeighbors(object, dims = seq_len(n_dims))
object = Seurat::RunUMAP(object, dims = seq_len(n_dims), reduction = dim_red)
}
# Colors for the plot
if (is.null(color)){
color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = TRUE)]
color <- sample(color, 400)
} else {
stopifnot(is.character(color))
if(length(color) < 20) warning("IDclust::iterative_differential_clustering_scRNA - ",
"The color vector might be too short.",
"Please precise more colors.")
color = unname(color)
}
# For the first partition, try to find very low level clusters (e.g. low
# resolution, high number of neighbors)
object = Seurat::FindNeighbors(object, reduction = dim_red, k.param = starting.k, dims = 1:n_dims, verbose = FALSE)
if(force_initial_clustering){
nclust = 1
factor = 1
max_iter = 100
iter = 0
while( (nclust < 2 | nclust > 6) & iter < max_iter){
object = Seurat::FindClusters(object, algorithm = 2,
resolution = factor * starting.resolution,
random.seed = 47, verbose = FALSE)
nclust = length(unique(object$seurat_clusters))
if(nclust < 2) factor = 1.15 * factor
if(nclust > 6) factor = 0.85 * factor
iter = iter + 1
print(nclust)
}
if(iter >= max_iter) {
stop("IDclust::iterative_differential_clustering - Didn't manage",
" to find more than 1 initial cluster...")
}
} else {
object = Seurat::FindClusters(object, algorithm = 2, resolution = starting.resolution, random.seed = 47, verbose = FALSE)
}
object$IDcluster = object$seurat_clusters
object$IDcluster <- paste0("A",as.numeric(object$IDcluster))
Seurat::Idents(object) = object$IDcluster
# Starting list of clusters to re-cluster
cluster_u = unique(object$IDcluster)
# Find initial differences (even if there are none, the initial clusters
# are always considered as true clusters).
if(verbose) cat("Initial round of clustering - limit of differential genes set to 0",
" for this first round only.\n")
DA = find_differentiated_clusters(
object,
differential_function = differential_function,
by = "IDcluster",
logFC.th = logFC.th,
qval.th = qval.th,
min_frac_cell_assigned = min_frac_cell_assigned,
cluster_of_origin = "Alpha",
limit = 0,
verbose = verbose,
...)
# Starting list of clusters to re-cluster
differential_summary_df = DA$diffmat_n
object$IDcluster = differential_summary_df$true_subcluster[match(object$IDcluster, differential_summary_df$subcluster)]
# List of embeddings
list_embeddings = list(object@reductions[[dim_red]]@cell.embeddings )
names(list_embeddings)[1] = paste(unique(object$IDcluster), collapse = "_")
# List of differential analyses
res = DA$res
rownames(res) = NULL
if(nrow(res) > 0) res$cluster_of_origin = "Alpha"
list_res = list(res)
names(list_res)[1] = "Alpha"
iteration = 0
gc()
while(iteration < nrow(differential_summary_df)){
if(plotting == TRUE){
# Plot initial
png(file.path(output_dir, "iterations", paste0("Iteration_",iteration,".png")), width = 1600, height = 1200, res = 200)
object. = object
object.$IDcluster = gsub("Alpha_","",object.$IDcluster)
print(
Seurat::DimPlot(object, group.by = "IDcluster", reduction = vizualization_dim_red, cols = color)
)
dev.off()
}
iteration = iteration + 1
if(verbose) cat("Doing iteration number ", iteration,"...\n")
partition_cluster_of_origin = differential_summary_df$true_subcluster[iteration]
if(!(partition_cluster_of_origin %in% differential_summary_df$true_subcluster[duplicated(differential_summary_df$true_subcluster)])){
partition_depth = which(LETTERS == substr(gsub(".*_","",partition_cluster_of_origin),1,1)) + 1
# Select first cluster
object. = object[, which(object$IDcluster %in% partition_cluster_of_origin)]
if(verbose) cat("Re-calculating PCA and subclustering for cluster", partition_cluster_of_origin,".\n")
if(ncol(object.) > max(10, n_dims)){
# Re-processing sub-cluster
object. = processing_function(object., n_dims = n_dims, dim_red = dim_red)
# Re-clustering sub-cluster
k = max(10, min(max_k, k_percent * ncol(object.))) # select a k according to the number of cells
object. = Seurat::FindNeighbors(object., reduction = dim_red, k.param = k, verbose = FALSE)
object. = Seurat::FindClusters(object., algorithm = 2, resolution = resolution,
random.seed = 47, verbose = FALSE)
object.$IDcluster = paste0(LETTERS[partition_depth], as.numeric(object.$seurat_clusters))
Seurat::Idents(object.) = object.$IDcluster
clusters = object.$IDcluster
cluster_u = unique(clusters)
if(length(cluster_u) > 1 ){
if(verbose) cat("Found", length(cluster_u),"subclusters.\n")
if(verbose) print(table(clusters))
## Differential analysis
DA = find_differentiated_clusters(object.,
differential_function = differential_function,
by = "IDcluster",
logFC.th = logFC.th,
qval.th = qval.th,
min_frac_cell_assigned = min_frac_cell_assigned,
limit = limit,
cluster_of_origin = partition_cluster_of_origin,
verbose = verbose,
...)
# Retrieve DA results
diffmat_n = DA$diffmat_n
res = DA$res
rownames(res) = NULL
res$cluster_of_origin = partition_cluster_of_origin[min(1,nrow(res))]
list_res[[partition_cluster_of_origin]] = res
list_embeddings[[partition_cluster_of_origin]] = object.@reductions[[dim_red]]@cell.embeddings
# If more than 'min_frac_cell_assigned' of the cells were assigned
# to 'true' subclusters (with marker features)
if(!isFALSE(DA$passing_min_pct_cell_assigned)){
# Add the new sublclusters to the list of clusters
differential_summary_df = rbind(differential_summary_df, diffmat_n)
object.$IDcluster = diffmat_n$true_subcluster[match(object.$IDcluster, diffmat_n$subcluster)]
object$IDcluster[match(colnames(object.), colnames(object))] = object.$IDcluster
Seurat::Idents(object.) = object.$IDcluster
if(plotting == TRUE){
png(file.path(output_dir, paste0(partition_cluster_of_origin,"_true.png")), width = 1400, height = 1200, res = 200)
print(
Seurat::DimPlot(object., reduction = dim_red, cols = color) + ggtitle(paste0(partition_cluster_of_origin, " - true"))
)
dev.off()
}
} else{
if(verbose) cat("Found 2 subcluster for",
partition_cluster_of_origin," but with no difference. Maximum clustering reached...\n")
}
} else{
if(verbose) cat("Found only 1 subcluster for",
partition_cluster_of_origin,". Maximum clustering reached...\n")
}
}
gc()
} else{
if(verbose) cat(partition_cluster_of_origin,
"- This cluster was formed by 'unassigned' cells, not clustering it further...\n")
}
}
if(verbose) cat("\n\n\n##########################################################\nFinished !\nFound a total of", length(unique(object$IDcluster)),"clusters after",iteration ,"iterations.",
"\nThe average cluster size is ",floor(mean(table(object$IDcluster)))," and the median is",floor(median(table(object$IDcluster))),".",
"\nThe number of initital clusters not subclustered is ",length(grep("_", unique(object$IDcluster),invert = TRUE)),".",
"\n##########################################################\n")
## Saving results
if(saving){
# Table of differential features for each re-clustering
IDC_DA = do.call("rbind", list_res)
if(nrow(IDC_DA) > 0){
IDC_DA$IDcluster = paste0(IDC_DA$cluster_of_origin, "_", IDC_DA$cluster)
} else{
IDC_DA$IDcluster = character(0)
warning("There were no differential genes found between clusters, keeping first round only.")
}
write.csv(IDC_DA, file = file.path(output_dir, "IDC_DA.csv"), quote = FALSE, row.names = FALSE)
# List of embedding of each re-clustered cluster
qs::qsave(list_embeddings, file.path(output_dir, "IDC_embeddings.qs"))
# Summary table of the number of differential features for each re-clustering
write.csv(differential_summary_df, file = file.path(output_dir, "IDC_summary.csv"), quote = FALSE, row.names = FALSE)
# Final SingleCellExperiment with the clusters found by IDC
qs::qsave(object, file.path(output_dir, "Seu_IDC.qs"))
}
return(object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.