knitr::opts_chunk$set(echo = TRUE, results = "hide", warning = FALSE, message = FALSE, tidy = TRUE)
This vignette demonstrates how to use basic functions in scNMF:
cluster.divisive
nnmf.cv
nnmf
cluster.dimplot
# library(remotes) # remotes::install_github("zdebruine/scNMF") library(scNMF)
Load the pbmc3k single cell dataset from the SeuratData
package and normalize with SCTransform on highly variable genes:
library(cowplot) library(Seurat) library(SeuratData) InstallData("pbmc3k") data("pbmc3k") pbmc3k <- FindVariableFeatures(pbmc3k, verbose = FALSE) pbmc3k <- NormalizeData(pbmc3k, verbose = FALSE)
pbmc3k
Run divisive clustering with a stopping criteria based on both the change in distance (how much more similar cells are on average to their assigned cluster than their opposing cluster) and a minimum number of cells:
pbmc3k <- cluster.divisive(pbmc3k, min.samples = 10, min.dist = 0.05, seed = 123, verbose = 3)
Or, simply specify a min.samples cutoff:
pbmc3k <- cluster.divisive(pbmc3k, min.samples = 10, seed = 123, verbose = 1, idents.prefix = "cells10_")
Use scNMF::cluster.dimplot
to project cluster centers onto a UMAP plot based on the average UMAP coordinates of all cells in that cluster:
# first generate UMAP coordinates from a PCA graph pbmc3k <- ScaleData(pbmc3k, verbose = FALSE) pbmc3k <- RunPCA(pbmc3k, npcs = 10, verbose = FALSE) pbmc3k <- RunUMAP(pbmc3k, dims = 1:10, verbose = FALSE) cluster.dimplot(pbmc3k, max.scale.size = 200)
Most cluster centers thoroughly sample PCA transcriptional space, but some clusters fall between clusters, indicating that there is additional variance between cells not represented by the PCA/UMAP projection.
To find the best rank for NMF, use scNMF::nnmf.cv
. This function takes as input a number of ranks to sample, and measures the robustness of factorization at each rank. Robustness is defined as the cosine angle between independent factorizations of non-overlapping halves of the dataset.
We will run nnmf.cv
with a split by genes (we have more sampling power with >10,000 genes compared to <200 clusters), set a seed to determine the indices for our split and make sure the same indices are used for all runs, and sample ranks of k between 5 and 25. Note that nnmf.cv pulls cluster centers from the "dclus" dimensional reduction object.
cv.row <- nnmf.cv(pbmc3k, byrow = TRUE, k = seq(4,20,1), seed = 123, n.starts = 5) p1 <- canyon.plot(cv.row) p2 <- canyon.plot(cv.row, line.collapse = TRUE) plot_grid( p1 + NoLegend() + ggtitle("Five random starts"), get_legend(p1), p2 + ggtitle("Average of five random starts"), ncol = 3, rel_widths = c(1,0.2,1))
We estimate the best rank to be around 9.
Note that this cross-validation requires substantial signal redundancy within the axis to be split. For example, if we cross-validate against columns there is no significant minima:
cv.col <- nnmf.cv(pbmc3k, byrow = FALSE, k = seq(4,20,1), seed = 123, n.starts = 3) p4 <- canyon.plot(cv.col) p5 <- canyon.plot(cv.col, line.collapse = TRUE) plot_grid( p4 + NoLegend() + ggtitle("Random starts (by columns)"), get_legend(p4), p5 + ggtitle("Average of random starts (by columns)"), ncol = 3, rel_widths = c(1,0.2,1))
Use the scNMF::runNMF
function to average the expression of leaf node clusters, run NMF at a given rank on these leaf nodes, project cells onto NMF coordinates, and update the Seurat object with the dimensional reduction result:
pbmc3k <- nnmf(pbmc3k, k = 9, reduction.use = "dclus")
pbmc3k <- RunUMAP(pbmc3k, reduction = "nmf", reduction.name = "umap.nmf", reduction.key = "umapnmf_", dims = 1:9, verbose = FALSE) DimPlot(pbmc3k, group.by = "leaf.idents", reduction = "umap.nmf", label = TRUE, repel = 5) + NoLegend()
Compare this to a Louvain graph-based clustering:
pbmc3k <- FindNeighbors(pbmc3k, dims = 1:10, reduction = "pca") pbmc3k <- FindClusters(pbmc3k, verbose = FALSE, resolution = 0.5) p1 <- DimPlot(pbmc3k, group.by = "leaf.idents", reduction = "umap", label = TRUE, repel = 5) + NoLegend() p2 <- DimPlot(pbmc3k, group.by = "seurat_clusters", reduction = "umap", label = TRUE, repel = 5) + NoLegend() p3 <- DimPlot(pbmc3k, group.by = "leaf.idents", reduction = "umap.nmf", label = TRUE, repel = 5) + NoLegend() p4 <- DimPlot(pbmc3k, group.by = "seurat_clusters", reduction = "umap.nmf", label = TRUE, repel = 5) + NoLegend() plot_grid(p1 + labs(title = "PCA coords, divisive clusters") + theme(plot.title = element_text(size=12)), p2 + labs(title = "PCA coords, graph-based clusters") + theme(plot.title = element_text(size=12)), p3 + labs(title = "NMF coords, divisive clusters") + theme(plot.title = element_text(size=12)), p4 + labs(title = "NMF coords, graph-based clusters") + theme(plot.title = element_text(size=12)), ncol = 2, nrow = 2)
We might also explore cluster center localizations on the PCA or NMF graphs with cluster.dimplot
:
pbmc3k <- FindClusters(pbmc3k, resolution = 25, verbose = FALSE) p5 <- cluster.dimplot(pbmc3k, ident = "cells10_leaf.idents", reduction.name = "umap", max.scale.size = 500, scale.breaks = c(10, 25, 50), bubble.color = "#DA4803", verbose = FALSE) p6 <- cluster.dimplot(pbmc3k, ident = "seurat_clusters", reduction.name = "umap", max.scale.size = 500, scale.breaks = c(10, 25, 50), bubble.color = "#0F68AC", verbose = FALSE) p7 <- cluster.dimplot(pbmc3k, ident = "cells10_leaf.idents", reduction.name = "umap.nmf", max.scale.size = 500, scale.breaks = c(10, 25, 50), bubble.color = "#DA4803", verbose = FALSE) p8 <- cluster.dimplot(pbmc3k, ident = "seurat_clusters", reduction.name = "umap.nmf", max.scale.size = 500, scale.breaks = c(10, 25, 50), bubble.color = "#0F68AC", verbose = FALSE) plot_grid(p5 + labs(title = "PCA coords, divisive clusters") + NoLegend() + theme(aspect.ratio = 0.9), p6 + labs(title = "PCA coords, graph-based clusters") + NoLegend() + theme(aspect.ratio = 0.9), get_legend(p5), p7 + labs(title = "NMF coords, divisive clusters") + NoLegend() + theme(aspect.ratio = 0.9), p8 + labs(title = "NMF coords, graph-based clusters") + NoLegend() + theme(aspect.ratio = 0.9), get_legend(p7), ncol = 3, nrow = 2, rel_widths = c(1,1,0.3))
This vignette was rendered on a Windows 10 desktop with 8 Gb of RAM and 6 threads in less than 10 minutes.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.