#' Plot RMSD-to-size plots for clustering result
#'
#' Plots a grid of RMSD-to-size plots per manually assigned population.
#' This is an experimental tool to assess dissimilarities between manual labelling of data and the result of automated clustering.
#'
#' RMSD (root mean square deviation) is a measure of the average variation in signal across all parameters.
#' A high RMSD will be seen in groups of points that span a large part of the high-dimensional space or are irregular in shape.
#' In the plotted grid, vertical axis shows RMSD and horizontal axis shows size.
#'
#' The RMSD-to-size ratio is computed for each manually labelled population, each cluster mapped to it (by the fixed-cluster scheme) and the union of all the clusters matched to it.
#' Properties of clusters can be investigated based on their position in the grid relative to the matched (reference) population.
#' We assume that
#'
#' * clusters in the bottom-left quadrant (lower RMSD and smaller in size than the reference population) correspond to real sub-populations that go beyond the resolution of the manual annotation and are valuable to explore,
#'
#' * clusters in the bottom-right quadrant (lower RMSD and bigger in size) correspond to clusters that identified some different population, which nonetheless overlaps with the identified population,
#'
#' * clusters to the right of the reference population (with about the same RMSD and bigger in size) correspond to clusters that contain the matched population, but also include some unlabelled or mislabelled cells from it that might need to be included also,
#'
#' * clusters in the top-right quadrant (with higher RMSD and bigger in size) are a result of under-clustering,
#'
#' * clusters in the top-left quadrant (with higher RMSD and smaller in size) are a result of over-clustering.
#'
#' Then for example, if a population mostly has clusters in the bottom-left quadrant matched to it, it seems the clustering identified smaller constituent populations that go beyond the resolution of manual labelling.
#' If a gating strategy has many unlabelled cells and we see clusters that are to the right of the reference population on the grid, these likely correspond to unlabelled cells that really belong to the reference.
#'
#' @param benchmark an object of class \code{Benchmark}, as generated by the constructor \code{Benchmark} and evaluated using \code{Evaluate.Benchmark}
#' @param idx.subpipeline integer value: index of sub-pipeline that includes a clustering step
#' @param idx.n_param integer value: index of *n*-parameter iteration if *n*-parameters were used. Default value is \code{NULL}
#' @param idx.run integer value: number of run to select if multiple (repeated) runs of clustering algorithm are available. Default value is \code{1}
#'
#' @export
PlotRMSDToSize <- function(
  benchmark,
  idx.subpipeline,
  idx.n_param = NULL,
  idx.run = 1
) {
  # Assembles one long data frame with three kinds of rows (labelled
  # populations, unions of matched clusters, individual matched clusters) and
  # returns a ggplot of RMSD against size, faceted by population.
  if (is.null(idx.run)) {
    stop('idx.run must be specified (or left as 1)')
  }
  .PlotClustering.ValidityChecks(environment())

  # RMSD and size per labelled population and per fixed-cluster match
  rmsd_real <- GetRMSDPerPopulation(
    benchmark, idx.subpipeline, idx.n_param, idx.run, match_type = 'real'
  )
  rmsd_fixed_cluster <- GetRMSDPerPopulation(
    benchmark, idx.subpipeline, idx.n_param, idx.run,
    match_type = 'fixed_cluster'
  )
  sizes_real <- GetPopulationSizes(benchmark)
  sizes_fixed_cluster <- GetMatchedClusterSizes(
    benchmark, idx.subpipeline, idx.n_param, idx.run,
    match_type = 'fixed_cluster'
  )

  # Reference rows: one per labelled population. `hintercept` is the position
  # on the horizontal (size) axis and `vintercept` the position on the
  # vertical (RMSD) axis; they drive the grey reference lines in each facet.
  df.real <-
    data.frame(
      Population   = names(rmsd_real),
      ClassType    = 'Labelled Population',
      ClusterIndex = NA,
      RMSD         = unlist(rmsd_real),
      Size         = unlist(sizes_real),
      vintercept   = unlist(rmsd_real),
      hintercept   = unlist(sizes_real),
      PointSize    = 4,
      Alpha        = 1.0,
      row.names    = NULL
    )

  # Flatten the per-population cluster values into parallel vectors.
  # NOTE(review): the four vectors below are filtered independently (non-NA
  # RMSD, non-zero size, name other than 'NA'); this assumes all three
  # criteria drop exactly the same entries -- confirm against the getters.
  pops <- unlist(
    purrr::map(
      names(rmsd_fixed_cluster),
      function(n) rep(n, times = sum(!is.na(rmsd_fixed_cluster[[n]])))
    )
  )
  rmsd_fc <- unlist(rmsd_fixed_cluster)
  rmsd_fc <- rmsd_fc[!is.na(rmsd_fc)]
  sizes_fc <- unlist(sizes_fixed_cluster)
  sizes_fc <- sizes_fc[sizes_fc != 0]
  cl_idcs <- unlist(purrr::map(rmsd_fixed_cluster, names))
  cl_idcs <- cl_idcs[cl_idcs != 'NA']
  cl_idcs <- as.numeric(cl_idcs)

  # Cluster assignment vector for the selected run
  cl <- GetClustering(
    benchmark, idx.subpipeline, idx.n_param,
    all_runs = TRUE, concatenate = TRUE
  )
  if (is.list(cl)) {
    # Fallback to run 1 is kept because the validity checks above receive this
    # environment and could, in principle, alter idx.run.
    cl <- cl[[if (is.null(idx.run)) 1 else idx.run]]
  }
  m <- GetLabelClusterMatching(
    benchmark, idx.subpipeline, idx.n_param, idx.run
  )$`Fixed Cluster`

  # Fix: the original passed an undefined symbol `b` here instead of the
  # `benchmark` argument.
  exprs <- GetExpressionMatrix(benchmark, concatenate = TRUE)

  # Populations with more than one matched cluster additionally get a row for
  # the union of all their matched clusters.
  multipops <- unique(pops[duplicated(pops)])
  df.multipops <- purrr::map(
    multipops,
    function(pop) {
      idcs <- cl %in% as.numeric(m$Cluster[m$Population == pop])
      data.frame(
        Population   = pop,
        ClassType    = 'Union of Matched Clusters',
        ClusterIndex = NA,
        RMSD         = rmsd_per_cluster(exprs, idcs),
        Size         = sum(idcs),
        vintercept   = NA,
        hintercept   = NA,
        PointSize    = 4,
        Alpha        = 0.7,
        row.names    = NULL
      )
    }
  )
  # do.call(rbind, list()) yields NULL, which the rbind() below ignores
  df.multipops <- do.call(rbind, df.multipops)

  # Drop the large intermediates before building the plot
  rm(exprs, cl)

  # One row per individual cluster matched to a population
  df.fixed_cluster <-
    data.frame(
      Population   = pops,
      ClassType    = 'Cluster Matched to Population',
      ClusterIndex = cl_idcs,
      RMSD         = rmsd_fc,
      Size         = sizes_fc,
      vintercept   = NA,
      hintercept   = NA,
      PointSize    = 3,
      Alpha        = 0.7,
      row.names    = NULL
    )

  data <- rbind(df.real, df.multipops, df.fixed_cluster)
  rownames(data) <- NULL

  nn <- GetNParameterIterationName(benchmark, idx.subpipeline, idx.n_param)

  # NOTE(review): size/alpha are passed as per-row vectors outside aes(); this
  # relies on the layer data keeping the row order of `data` -- confirm, or
  # map them inside aes() with identity scales.
  ggplot(data, aes(x = Size, y = RMSD, col = ClassType)) +
    geom_vline(aes(xintercept = hintercept), col = 'darkgrey') +
    geom_hline(aes(yintercept = vintercept), col = 'darkgrey') +
    geom_point(size = data$PointSize, alpha = data$Alpha) +
    geom_text(aes(label = ClusterIndex), size = 2.7, col = 'black') +
    facet_wrap(~ Population) +
    theme_grey() +
    ggtitle('RMSD/size ratios of labelled populations and matched clusters', nn)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.