knitr::opts_chunk$set(echo = TRUE)

This vignette reproduces Figures 2 in the scNMF manuscript for the pbmc3k dataset, although any dataset can be used simply by loading it as seuratObj on line 24.

Preprocessing

Start off by running the standard Seurat pipeline (normalize data %>% scale data %>% run pca %>% choose dimensions for umap %>% run umap). These steps are the most memory intensive and for larger datasets may need to be run on more than a standard laptop.

library(scNMF)
library(reshape2)
library(Seurat)
# library(SeuratData)
# InstallData("pbmc3k")
seuratObj <- readRDS("moca7k.rds") # this file is available on Github in the "data" folder
seuratObj <- SCTransform(seuratObj)
seuratObj <- FindVariableFeatures(seuratObj, verbose = 1)
seuratObj <- ScaleData(seuratObj, verbose = 1)
seuratObj <- RunPCA(seuratObj, npcs = 30, verbose = 1)
seuratObj <- RunUMAP(seuratObj, dims = 1:30, verbose = FALSE)

Figure 2. Fast compression of transcriptional space with divisive clustering

Run cluster.divisive with several stopping criteria:

min.samples.arr <- c(100,50,10,5,5,5)
min.dist.arr <- c("NA","NA","NA",0.15,0.1,0.05)
ident.prefixes <- c("cells100", "cells25", "cells5", "dist150", "dist100", "dist50")
criteria <- c("100 cells", "25 cells", "5 cells", "0.15 dist", "0.1 dist", "0.05 dist")

for (i in 1:length(ident.prefixes)){

  min.dist.iter <- NULL
  if(min.dist.arr[i] != "NA") min.dist.iter <- min.dist.arr[i]

  message("\n\nRunning clustering, min.samples = ", min.samples.arr[i], ", min.dist = ", min.dist.arr[i],"\n")

  seuratObj <- cluster.divisive(
    seuratObj,
    min.samples = min.samples.arr[i],
    min.dist = min.dist.iter,
    seed = 123,
    verbose = 1,
    idents.prefix = ident.prefixes[i],
    details = TRUE,
    reduction.name = ident.prefixes[i]
  )
}

Now retrieve information about the averaged expression of these clusters from the respective dimensional reduction objects in the seurat object:

# get array of avgs
avgs <- list()
for(i in 1:length(ident.prefixes)){
  avgs[[criteria[i]]] <- list(
    "centers" = seuratObj@reductions[[ident.prefixes[i]]]@feature.loadings,
    "cells.per.center" = seuratObj@reductions[[ident.prefixes[i]]]@misc$cells.per.center,
    "cos.dists" = seuratObj@reductions[[ident.prefixes[i]]]@misc$cos.dists,
    "cells.in.center" = seuratObj@reductions[[ident.prefixes[i]]]@misc$cells.in.center
    )
}

Count the number of clusters, the number of cells per cluster, and the sparsity of clusters for each clustering run.

# number of clusters for each clustering run
num_clusters <- reshape2::melt(unlist(lapply(avgs, function(x) ncol(x$centers))))

# number of cells per cluster for each clustering run
library(reshape2)
num_cells_per_cluster <- reshape2::melt(lapply(avgs, function(x) as.vector(x$cells.per.center)))

# sparsity of clusters for each clustering run (and sparsity of the input dataset just for reference)
d <- GetAssayData(seuratObj)
calc.sparsity <- function(data) 1 - as.vector(apply(data, 2, function(x) length(which(x > 0))/length(x)))

cluster_sparsity <- list()
ifelse(ncol(d) > 5000,
  cluster_sparsity[["input cells"]] <- calc.sparsity(d[,1:5000]),
  cluster_sparsity[["input cells"]] <- calc.sparsity(d))
cluster_sparsity <- reshape2::melt(c(cluster_sparsity, lapply(avgs, function(x) calc.sparsity(x$centers))))

Create ggplot2 barcharts, violin plots, or density histograms for each of these information types:

library(cowplot)
library(ggplot2)
library(dplyr)

# sample.colors <- c("#757575","#DA4803","#F26815","#FE8D3C","#0F68AC","#2B8CBE","#4EB3D4")
sample.colors <- c("#757575","#FE8D3C","#F26815","#DA4803","#4EB3D4","#2B8CBE","#0F68AC")


# ************************ Figure 2A
# barchart of number of clusters for each object
roundUpTo <- function(x,to) {to*(x%/%to + as.logical(x%%to))}

# ADJUST if needed based on dataset
scale.y.roundUpTo <- 500 # round up to the nearest 500

num_clusters[,"name"] <- gsub("_", " ", ident.prefixes)
num_clusters <- tibble(num_clusters) %>% mutate(row = row_number())
p_numclusters <- ggplot(data = num_clusters, aes(x = reorder(name,row), y = value, fill = reorder(name, row))) + 
  geom_bar(stat = "identity", width = 0.8) + 
  theme_classic() + 
  xlab("Stopping critera") +
  ylab("number of clusters") + 
  scale_fill_manual(values = c(sample.colors[2:7])) + 
  scale_y_continuous(limits = c(0,roundUpTo(max(num_clusters$value),scale.y.roundUpTo)), expand = c(0,0)) +
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  NoLegend() +
  theme(aspect.ratio = 0.9)

# ************************ Figure 2B
# violin plot of number of cells per cluster for each object
dat <- tibble(num_cells_per_cluster) %>% mutate(row = row_number())
p_numcells <- ggplot(dat, aes(x=reorder(L1,row), y=value, fill = reorder(L1, row))) +
  geom_boxplot(outlier.color = "#252525", outlier.size = 0.8, outlier.alpha = 0.5) +
  theme_classic() + 
  xlab("Stopping critera") +
  ylab("cells per cluster") + 
  scale_fill_manual(values = c(sample.colors[2:7])) + 
  scale_y_continuous(trans="log10") + 
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  NoLegend() + 
  theme(aspect.ratio = 0.9)

# ************************ Figure 2C
# violin plot of sparsity of clusters
cluster_sparsity <- tibble(cluster_sparsity) %>% mutate(row = row_number())
p_sparsity <- ggplot(cluster_sparsity, aes(x=reorder(L1,row), y=value, fill = reorder(L1,row))) +
  geom_boxplot(outlier.color = "#252525", outlier.size = 0.8, outlier.alpha = 0.5) +
  theme_classic() + 
  xlab("Stopping critera") +
  ylab("sparsity of centers") + 
  scale_fill_manual(values = c(sample.colors)) + 
  scale_y_continuous(expand = c(0,0), limits = c(0,1)) + 
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  NoLegend() + 
  theme(aspect.ratio = 0.9)

# ************************ Figure 2D
# cosine distance of cells to cluster centers and an unclustered center for reference
object.dists <- list()
allcells.center <- rowMeans(d)
object.dists[["unclustered"]] <- 1 - cosSparse(d, cbind(allcells.center,allcells.center))[,1]
for(i in 1:length(avgs)) object.dists[[ident.prefixes[i]]] <- avgs[[i]]$cos.dists
cos.dists <- tibble(reshape2::melt(object.dists)) %>% mutate(row = row_number())
p_cosdists <- ggplot(cos.dists, aes(x=reorder(L1,row), y=value, fill = reorder(L1,row))) + 
  geom_boxplot(outlier.color = "#FFFFFF") +
  theme_classic() + 
  xlab("Stopping critera") +
  ylab("cell dists from centers") + 
  scale_fill_manual(values = c(sample.colors)) + 
  scale_y_continuous(expand = c(0,0), limits = c(0,1)) + 
  theme(axis.title.x = element_text(colour = "black"), axis.title.y = element_text(colour = "black")) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  NoLegend() + 
  theme(aspect.ratio = 0.9)

Use scNMF:: cluster.dimplot to project cluster centers onto UMAP coordinates from a PCA projection as bubbles over grayed-out cells:

# ************************ Figure 2E-I
p <- list()
for(i in 1:length(avgs))
  p[[i]] <- cluster.dimplot(seuratObj, centers = avgs[[i]], bubble.color = sample.colors[i+1], cells.color = "#a0a0a0", max.scale.size = 4000, bubble.scale = "sqrt", scale.breaks = c(10, 100, 1000), bubble.alpha = 0.5) +
  ggtitle(paste0("Stopping criteria: ", criteria[i])) +
  theme(aspect.ratio = 1)

fig_top_row <- plot_grid(
  p_numclusters, 
  p_numcells, 
  p_sparsity, 
  p_cosdists, 
  ncol = 4,
  labels = c("A","B","C","D"),
  align = "hv")

fig_bubble_plots <- plot_grid(
  p[[1]] + NoLegend(),
  p[[2]] + NoLegend(),
  p[[3]] + NoLegend(),
  get_legend(p[[1]]),
  p[[4]] + NoLegend(),
  p[[5]] + NoLegend(),
  p[[6]] + NoLegend(),
  get_legend(p[[1]]),
  rel_widths = c(1,1,1,0.3),
  labels = c("E","F","G","","H","I","J",""),
  nrow = 2,
  ncol = 4
)

fig <- plot_grid(
  fig_top_row,
  fig_bubble_plots,
  nrow = 2,
  rel_heights = c(1,2)
)

Save the figure to the working directory:

ggsave("fig2.png", plot = fig, path = "C:/Users/zacha/Desktop/scNMF", width = 10, height = 10, units = "in", dpi = "retina")

Figure 3. NMF rank determination by cross-validation against angualr similarity of independent factorizations

Note: Use of a high-performance computing environment is highly recommended, although the pbmc3k dataset can easily be analyzed on a standard laptop. scNMF::nnmf... functions benefit significantly from multithreading, and cross-validation can be time-consuming without parallelization.

Figure 3A-B. Canyon plot of NMF on dataset in compressed space

To make this figure, we will use scNMF::nnmf.cv, and scNMF::canyon.plot to find the optimal rank of factorization for a clustering scheme with 5 cells minimum:

Run cross-validation with nnmf.cv with splits by row or columns, a smart split or three random splits, and performed on the raw data or on the cluster centers.

We will use the compressed space calculated from the dist50 reduction.

max.k <- 50

canyon.plot(cv.clusters.smart.row)

cv.clusters.smart.col <- nnlm.cv(seuratObj, byrow = FALSE, k = seq(5,30,1), smart.split = TRUE, seed = 123, reduction = "dist50")
cv.clusters.col <- nnmf.cv(seuratObj, byrow = FALSE, k = seq(5,40,1), seed = 123, reduction = "dist50", n.starts = 3)
cv.clusters.smart.row <- nnmf.cv(seuratObj, byrow = TRUE, k = seq(5,40,1), smart.split = TRUE, seed = 123, reduction = "dist50")
cv.clusters.row <- nnmf.cv(seuratObj, byrow = TRUE, k = seq(5,40,1), seed = 123, reduction = "dist50", n.starts = 3)
canyon.plot(cv.clusters.smart.row)
canyon.plot(cv.clusters.smart.col)
canyon.plot(cv.clusters.col)


cv.clusters.row <- nnmf.cv(seuratObj, byrow = TRUE, k = seq(5,150,5), smart.split = TRUE, seed = 123, reduction = "dist50")


canyon.plot(cv.clusters.smart.row)


cv.rawdata.smart.row <- nnmf.cv(seuratObj, byrow = TRUE, k = seq(5,max.k,2), smart.split = TRUE, seed = 123, reduction = "dist50")
cv.rawdata.smart.col <- nnmf.cv(seuratObj, byrow = FALSE, k = seq(5,max.k,1), smart.split = TRUE, seed = 123, use.idents = FALSE)

A <- average.expression(seuratObj, ident = paste0("X0.01_dist_leaf.idents"))
cv.clusters.smart.row <- nnmf.cv(A, byrow = TRUE, k = seq(5,max.k,1), smart.split = TRUE, seed = 123)
cv.clusters.row <- nnmf.cv(A, byrow = TRUE, k = seq(5,max.k,1), n.starts = 3, seed = 123)
cv.clusters.smart.col <- nnmf.cv(A, byrow = FALSE, k = seq(5,max.k,1), smart.split = TRUE, seed = 123)
cv.clusters.col <- nnmf.cv(A, byrow = FALSE, k = seq(5,max.k,1), n.starts = 3, seed = 123)

scNMF::canyon.plot can easily be used to visualize cross-validation results.

fig3a <- canyon.plot(cv.rawdata.smart.row, title = "Raw data\n(smart split by genes)")
fig3b <- canyon.plot(cv.clusters.smart.row, title = "Cluster centers\n(smart split by genes)")
fig3c <- canyon.plot(cv.clusters.row, title = "Cluster centers\n(random splits by genes)")
fig3d <- canyon.plot(cv.rawdata.smart.col, title = "Raw data\n(smart split by cells)")
fig3e <- canyon.plot(cv.clusters.smart.col, title = "Cluster centers\n(smart split by cells)")
fig3f <- canyon.plot(cv.clusters.col, title = "Cluster centers\n(random splits by cells)")
fig3 <- plot_grid(
  fig3a,
  fig3b,
  fig3c + NoLegend(),
  get_legend(fig3c),
  fig3d,
  fig3e,
  fig3f + NoLegend(),
  get_legend(fig3f),
  labels = c("A","B","C","","D","E","F",""),
  nrow = 2,
  ncol = 4,
  rel_widths = c(1,1,1,0.3)
)
fig3

Save the figure to the working directory:

ggsave("fig3.png", plot = fig3, path = "C:/Users/zacha/Desktop/scNMF", width = 10, height = 7, units = "in", dpi = "retina")


zdebruine/scNMF documentation built on Jan. 1, 2021, 1:50 p.m.