knitr::opts_chunk$set(echo = TRUE)
This vignette reproduces Figure 2 in the scNMF manuscript for the pbmc3k dataset, although any dataset can be used simply by loading it as seuratObj in the first code chunk below.
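For example, to use pbmc3k from SeuratData rather than a local .rds file, the load step could look like the following minimal sketch (this assumes SeuratData is installed, e.g. from GitHub, and that InstallData("pbmc3k") has already been run):

# Load pbmc3k from SeuratData instead of reading a local .rds file
library(Seurat)
library(SeuratData)
# InstallData("pbmc3k")   # run once
seuratObj <- LoadData("pbmc3k")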
Start by running the standard Seurat pipeline (normalize data %>% scale data %>% run PCA %>% choose dimensions %>% run UMAP). These steps are the most memory-intensive and, for larger datasets, may require more resources than a standard laptop provides.
library(scNMF)
library(reshape2)
library(Seurat)
# library(SeuratData)
# InstallData("pbmc3k")

seuratObj <- readRDS("moca7k.rds") # this file is available on Github in the "data" folder
seuratObj <- SCTransform(seuratObj)
seuratObj <- FindVariableFeatures(seuratObj, verbose = 1)
seuratObj <- ScaleData(seuratObj, verbose = 1)
seuratObj <- RunPCA(seuratObj, npcs = 30, verbose = 1)
seuratObj <- RunUMAP(seuratObj, dims = 1:30, verbose = FALSE)
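As an optional sanity check (standard Seurat functionality, not part of the figure), the UMAP embedding can be inspected before clustering:

# Optional: visualize the UMAP embedding computed above
DimPlot(seuratObj, reduction = "umap")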
Run cluster.divisive with several stopping criteria:
min.samples.arr <- c(100, 50, 10, 5, 5, 5)
min.dist.arr    <- c("NA", "NA", "NA", 0.15, 0.1, 0.05)
ident.prefixes  <- c("cells100", "cells25", "cells5", "dist150", "dist100", "dist50")
criteria        <- c("100 cells", "25 cells", "5 cells", "0.15 dist", "0.1 dist", "0.05 dist")

for (i in 1:length(ident.prefixes)) {
  min.dist.iter <- NULL
  if (min.dist.arr[i] != "NA") min.dist.iter <- min.dist.arr[i]
  message("\n\nRunning clustering, min.samples = ", min.samples.arr[i],
          ", min.dist = ", min.dist.arr[i], "\n")
  seuratObj <- cluster.divisive(
    seuratObj,
    min.samples = min.samples.arr[i],
    min.dist = min.dist.iter,
    seed = 123,
    verbose = 1,
    idents.prefix = ident.prefixes[i],
    details = TRUE,
    reduction.name = ident.prefixes[i]
  )
}
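As a quick check on a single run, the number of leaf clusters and their sizes can be read from the same reduction slots that the retrieval chunk below uses (a minimal sketch; the slot layout is taken from that chunk, not from any additional scNMF API):

# Number of leaf clusters found with the "cells100" stopping criterion
ncol(seuratObj@reductions[["cells100"]]@feature.loadings)
# Number of cells assigned to each leaf cluster
head(seuratObj@reductions[["cells100"]]@misc$cells.per.center)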
Now retrieve the averaged expression information for these clusters from the corresponding dimensional reduction objects in the Seurat object:
# get array of avgs
avgs <- list()
for (i in 1:length(ident.prefixes)) {
  avgs[[criteria[i]]] <- list(
    "centers" = seuratObj@reductions[[ident.prefixes[i]]]@feature.loadings,
    "cells.per.center" = seuratObj@reductions[[ident.prefixes[i]]]@misc$cells.per.center,
    "cos.dists" = seuratObj@reductions[[ident.prefixes[i]]]@misc$cos.dists,
    "cells.in.center" = seuratObj@reductions[[ident.prefixes[i]]]@misc$cells.in.center
  )
}
Count the number of clusters, the number of cells per cluster, and the sparsity of clusters for each clustering run.
# number of clusters for each clustering run
num_clusters <- reshape2::melt(unlist(lapply(avgs, function(x) ncol(x$centers))))

# number of cells per cluster for each clustering run
num_cells_per_cluster <- reshape2::melt(lapply(avgs, function(x) as.vector(x$cells.per.center)))

# sparsity of clusters for each clustering run (and sparsity of the input dataset for reference)
d <- GetAssayData(seuratObj)
calc.sparsity <- function(data) 1 - as.vector(apply(data, 2, function(x) length(which(x > 0)) / length(x)))
cluster_sparsity <- list()
if (ncol(d) > 5000) {
  # only use the first 5000 cells of large datasets to keep this step fast
  cluster_sparsity[["input cells"]] <- calc.sparsity(d[, 1:5000])
} else {
  cluster_sparsity[["input cells"]] <- calc.sparsity(d)
}
cluster_sparsity <- reshape2::melt(c(cluster_sparsity, lapply(avgs, function(x) calc.sparsity(x$centers))))
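To make the sparsity metric concrete: calc.sparsity() returns, for each column of a matrix, the fraction of entries that are zero. A toy example:

# Toy example: column 1 has 3 zeros out of 4 entries, column 2 has 2 zeros out of 4
m <- matrix(c(0, 2, 0, 0,
              1, 0, 3, 0), nrow = 4)
calc.sparsity(m) # 0.75 0.50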
Create ggplot2 barcharts and boxplots summarizing each of these metrics:
library(cowplot)
library(ggplot2)
library(dplyr)

# sample.colors <- c("#757575","#DA4803","#F26815","#FE8D3C","#0F68AC","#2B8CBE","#4EB3D4")
sample.colors <- c("#757575", "#FE8D3C", "#F26815", "#DA4803", "#4EB3D4", "#2B8CBE", "#0F68AC")

# ************************ Figure 2A
# barchart of the number of clusters for each clustering run
roundUpTo <- function(x, to) {to * (x %/% to + as.logical(x %% to))}
# ADJUST if needed based on dataset
scale.y.roundUpTo <- 500 # round up to the nearest 500
num_clusters[, "name"] <- gsub("_", " ", ident.prefixes)
num_clusters <- tibble(num_clusters) %>% mutate(row = row_number())
p_numclusters <- ggplot(data = num_clusters, aes(x = reorder(name, row), y = value, fill = reorder(name, row))) +
  geom_bar(stat = "identity", width = 0.8) +
  theme_classic() +
  xlab("Stopping criteria") +
  ylab("number of clusters") +
  scale_fill_manual(values = c(sample.colors[2:7])) +
  scale_y_continuous(limits = c(0, roundUpTo(max(num_clusters$value), scale.y.roundUpTo)), expand = c(0, 0)) +
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  NoLegend() +
  theme(aspect.ratio = 0.9)

# ************************ Figure 2B
# boxplot of the number of cells per cluster for each clustering run
dat <- tibble(num_cells_per_cluster) %>% mutate(row = row_number())
p_numcells <- ggplot(dat, aes(x = reorder(L1, row), y = value, fill = reorder(L1, row))) +
  geom_boxplot(outlier.color = "#252525", outlier.size = 0.8, outlier.alpha = 0.5) +
  theme_classic() +
  xlab("Stopping criteria") +
  ylab("cells per cluster") +
  scale_fill_manual(values = c(sample.colors[2:7])) +
  scale_y_continuous(trans = "log10") +
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  NoLegend() +
  theme(aspect.ratio = 0.9)

# ************************ Figure 2C
# boxplot of cluster center sparsity
cluster_sparsity <- tibble(cluster_sparsity) %>% mutate(row = row_number())
p_sparsity <- ggplot(cluster_sparsity, aes(x = reorder(L1, row), y = value, fill = reorder(L1, row))) +
  geom_boxplot(outlier.color = "#252525", outlier.size = 0.8, outlier.alpha = 0.5) +
  theme_classic() +
  xlab("Stopping criteria") +
  ylab("sparsity of centers") +
  scale_fill_manual(values = c(sample.colors)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  NoLegend() +
  theme(aspect.ratio = 0.9)

# ************************ Figure 2D
# cosine distance of cells to cluster centers, with an unclustered center for reference
# cosSparse() computes column-wise cosine similarity (available from the qlcMatrix package if not exported by scNMF)
object.dists <- list()
allcells.center <- rowMeans(d)
object.dists[["unclustered"]] <- 1 - cosSparse(d, cbind(allcells.center, allcells.center))[, 1]
for (i in 1:length(avgs)) object.dists[[ident.prefixes[i]]] <- avgs[[i]]$cos.dists
cos.dists <- tibble(reshape2::melt(object.dists)) %>% mutate(row = row_number())
p_cosdists <- ggplot(cos.dists, aes(x = reorder(L1, row), y = value, fill = reorder(L1, row))) +
  geom_boxplot(outlier.color = "#FFFFFF") +
  theme_classic() +
  xlab("Stopping criteria") +
  ylab("cell dists from centers") +
  scale_fill_manual(values = c(sample.colors)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  NoLegend() +
  theme(aspect.ratio = 0.9)
Use scNMF::cluster.dimplot to project the cluster centers onto the PCA-derived UMAP coordinates as bubbles overlaid on grayed-out cells:
# ************************ Figure 2E-J
p <- list()
for (i in 1:length(avgs)) {
  p[[i]] <- cluster.dimplot(
    seuratObj,
    centers = avgs[[i]],
    bubble.color = sample.colors[i + 1],
    cells.color = "#a0a0a0",
    max.scale.size = 4000,
    bubble.scale = "sqrt",
    scale.breaks = c(10, 100, 1000),
    bubble.alpha = 0.5
  ) +
    ggtitle(paste0("Stopping criteria: ", criteria[i])) +
    theme(aspect.ratio = 1)
}

fig_top_row <- plot_grid(
  p_numclusters, p_numcells, p_sparsity, p_cosdists,
  ncol = 4, labels = c("A", "B", "C", "D"), align = "hv"
)
fig_bubble_plots <- plot_grid(
  p[[1]] + NoLegend(), p[[2]] + NoLegend(), p[[3]] + NoLegend(), get_legend(p[[1]]),
  p[[4]] + NoLegend(), p[[5]] + NoLegend(), p[[6]] + NoLegend(), get_legend(p[[1]]),
  rel_widths = c(1, 1, 1, 0.3),
  labels = c("E", "F", "G", "", "H", "I", "J", ""),
  nrow = 2, ncol = 4
)
fig <- plot_grid(fig_top_row, fig_bubble_plots, nrow = 2, rel_heights = c(1, 2))
Save the figure to the working directory:
ggsave("fig2.png", plot = fig, path = "C:/Users/zacha/Desktop/scNMF", width = 10, height = 10, units = "in", dpi = "retina")
Note: a high-performance computing environment is highly recommended, although the pbmc3k dataset can easily be analyzed on a standard laptop. The scNMF::nnmf family of functions benefits significantly from multithreading, and cross-validation can be time-consuming without parallelization.
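As a rough illustration only (the threading interface of scNMF is not shown in this vignette, so no argument name is assumed here; consult the function documentation), you can check how many cores are available before launching the cross-validation runs:

# Check available cores; pass this to the appropriate threading argument, if any,
# of the nnmf/nnmf.cv calls (see ?nnmf.cv) -- the argument name is not shown in this vignette
library(parallel)
detectCores()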
To make Figure 3, we will use scNMF::nnmf.cv and scNMF::canyon.plot to find the optimal rank of factorization for a clustering scheme with a 5-cell minimum:
Run cross-validation with nnmf.cv, splitting by rows or by columns, using either a smart split or three random splits, and running on either the raw data or the cluster centers. We will use the compressed space calculated from the dist50 reduction.
max.k <- 50

# cross-validation on the raw data (smart splits by genes and by cells)
cv.rawdata.smart.row <- nnmf.cv(seuratObj, byrow = TRUE, k = seq(5, max.k, 2), smart.split = TRUE, seed = 123, reduction = "dist50")
cv.rawdata.smart.col <- nnmf.cv(seuratObj, byrow = FALSE, k = seq(5, max.k, 1), smart.split = TRUE, seed = 123, use.idents = FALSE)

# cross-validation on the cluster centers
A <- average.expression(seuratObj, ident = "X0.01_dist_leaf.idents")
cv.clusters.smart.row <- nnmf.cv(A, byrow = TRUE, k = seq(5, max.k, 1), smart.split = TRUE, seed = 123)
cv.clusters.row       <- nnmf.cv(A, byrow = TRUE, k = seq(5, max.k, 1), n.starts = 3, seed = 123)
cv.clusters.smart.col <- nnmf.cv(A, byrow = FALSE, k = seq(5, max.k, 1), smart.split = TRUE, seed = 123)
cv.clusters.col       <- nnmf.cv(A, byrow = FALSE, k = seq(5, max.k, 1), n.starts = 3, seed = 123)
scNMF::canyon.plot can easily be used to visualize the cross-validation results:
fig3a <- canyon.plot(cv.rawdata.smart.row, title = "Raw data\n(smart split by genes)")
fig3b <- canyon.plot(cv.clusters.smart.row, title = "Cluster centers\n(smart split by genes)")
fig3c <- canyon.plot(cv.clusters.row, title = "Cluster centers\n(random splits by genes)")
fig3d <- canyon.plot(cv.rawdata.smart.col, title = "Raw data\n(smart split by cells)")
fig3e <- canyon.plot(cv.clusters.smart.col, title = "Cluster centers\n(smart split by cells)")
fig3f <- canyon.plot(cv.clusters.col, title = "Cluster centers\n(random splits by cells)")

fig3 <- plot_grid(
  fig3a, fig3b, fig3c + NoLegend(), get_legend(fig3c),
  fig3d, fig3e, fig3f + NoLegend(), get_legend(fig3f),
  labels = c("A", "B", "C", "", "D", "E", "F", ""),
  nrow = 2, ncol = 4, rel_widths = c(1, 1, 1, 0.3)
)
fig3
Save the figure to the working directory:
ggsave("fig3.png", plot = fig3, path = "C:/Users/zacha/Desktop/scNMF", width = 10, height = 7, units = "in", dpi = "retina")