knitr::opts_chunk$set(echo = TRUE) options(bitmapType="cairo")
This is a walkthrough comparing SWNE, UMAP, and t-SNE on a hematopoiesis dataset from the Mouse Cell Atlas. Based off of the analysis done by in Figure 2d of Becht, McInnes et al.
Load the required libraries and data. The Mouse Cell Atlas data as well as the UMAP, tSNE, and PCA reductions can be downloaded here courtesy of Becht, McInnes et al.
library(Matrix) library(swne) ## Load data load("~/swne/Data/Han/BM_BMcKit_PB_RData/xp.RData") load("~/swne/Data/Han/BM_BMcKit_PB_RData/g2.RData") load("~/swne/Data/Han/BM_BMcKit_PB_RData/cells_AUC.RData") ## Filter dataset w2 <- !is.na(g2) xp <- Matrix::t(xp[w2,]) load("~/swne/Data/Han/BM_BMcKit_PB_RData/pca_g2.RData") rownames(pca) <- colnames(xp)
Assign labels to cells using the classifier results from Becht, McInnes et al.
## Assign labels to cells lineages <- c("Multi Potential Progenitor", "Macrophage Lineage", "Neutrophil Lineage", "Erythrocyte Lineage", "B Cell Lineage", "T Cell Lineage", "NK Cell Lineage") cutoffs <- setNames(c(0.04,0.09,0.05,0.045,0.09,0.075,0.04), lineages) labels <- sapply(lineages, function(i) cells_AUC@assays$data[[1]][i,][w2] >= cutoffs[i]) labels <- apply(labels, 1, which) labels <- sapply(labels, function(x) { if(length(x) == 1) {x} else {0} }) labels[labels != 0] <- lineages[labels[labels != 0]] labels[labels == 0] <- NA names(labels) <- colnames(xp) labels <- factor(labels) labels <- plyr::revalue(labels, replace = c("Multi Potential Progenitor" = "MPP", "Macrophage Lineage" = "Macrophage", "Neutrophil Lineage" = "Neutrophil", "Erythrocyte Lineage" = "Erythrocyte", "B Cell Lineage" = "B Cell", "T Cell Lineage" = "T Cell", "NK Cell Lineage" = "NK Cell")) table(labels); paste("Cells with missing labels:", sum(is.na(labels)))
Make the t-SNE plot using the pre-computed t-SNE from Becht, McInnes et al.
## Set a seed to make sure the cluster colors are consistent plot.seed <- 312525 load("~/swne/Data/Han/BM_BMcKit_PB_RData/tsne_g2.RData") rownames(tsne) <- names(labels) PlotDims(tsne, sample.groups = labels, show.legend = F, show.axes = F, alpha.plot = 0.75, label.size = 4, pt.size = 0.5, seed = plot.seed, use.brewer.pal = T)
Make the UMAP plot using the pre-computed UMAP from Becht, McInnes et al.
load("~/swne/Data/Han/BM_BMcKit_PB_RData/umap_g2.RData") rownames(umap) <- names(labels) PlotDims(umap, sample.groups = labels, show.legend = F, show.axes = F, alpha.plot = 0.75, label.size = 4, pt.size = 0.5, seed = plot.seed, use.brewer.pal = T)
Filter lowly expressed genes and get gene variance info
norm.xp <- xp*1000 norm.xp <- FilterData(norm.xp, min.samples.frac = 2.5e-4, trim = 1e-4, min.nonzero.features = 0, max.sample.sum = Inf) var.df <- AdjustVariance(norm.xp, verbose = F, plot = F)
Stabilize gene variances with a log-transformation
norm.xp@x <- log(norm.xp@x + 1)
Select variable genes to use
n.genes <- 4e3 var.df <- var.df[order(var.df$lp),] var.genes <- rownames(var.df[1:n.genes,])
Run SWNE
n.cores <- 16 nmf.res <- RunNMF(norm.xp[var.genes,], k = 40, n.cores = n.cores, ica.fast = T) nmf.res$W <- ProjectFeatures(norm.xp, nmf.res$H, n.cores = n.cores) snn <- CalcSNN(t(pca), k = 30, prune.SNN = 0.0) knn <- CalcKNN(t(pca), k = 30) snn <- PruneSNN(snn, knn, qval.cutoff = 1e-3) swne.embedding <- EmbedSWNE(nmf.res$H, SNN = snn, alpha.exp = 1.25, snn.exp = 0.25, n_pull = 3, proj.method = "sammon") swne.embedding$H.coords$name <- ""
Embed some hematopoiesis marker genes and plot SWNE embedding
## Embed selected genes onto swne plot genes.embed <- c("Ms4a1", "Cd4", "Ly6g", "Fcgr1") swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed, n_pull = 3) ## SWNE plot PlotSWNE(swne.embedding, alpha.plot = 0.6, sample.groups = labels, do.label = T, label.size = 6, pt.size = 0.75, show.legend = F, seed = plot.seed, use.brewer.pal = T)
Next, we'll define some helper functions for quantitative benchmarking of these embeddings
library(FNN) library(proxy) ## Calculate approximate kNN for an embedding ComputeKNN <- function(emb, k) { my.knn <- FNN::get.knn(t(emb), k) n.cells <- ncol(emb) nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k - 1)]) j <- as.numeric(x = t(x = nn.ranked)) i <- ((1:length(x = j)) - 1) %/% k + 1 nn.matrix <- as(sparseMatrix(i = i, j = j, x = 1, dims = c(ncol(emb), ncol(emb))), "dgCMatrix") rownames(nn.matrix) <- colnames(emb) colnames(nn.matrix) <- colnames(emb) nn.matrix } ## Calculate Jaccard similarities CalcJaccard <- function(x,y) { a <- sum(x) b <- sum(y) c <- sum(x == 1 & y == 1) c/(a + b - c) } ## Calculate pairwise distances between centroids CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") { data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean))) return(proxy::dist(data.centroids, method = dist.method, by_rows = F)) }
Compute how well each embedding maintains the pairwise distances between cell types relative to the original gene expression space. This is meant to capture how well each embedding maintains the global structure of the data.
## Compile embeddings embeddings <- list(tsne = t(tsne), umap = t(umap)) swne.emb <- t(as.matrix(swne.embedding$sample.coords)) ## Compute cluster distance correlations label.cells <- names(labels[!is.na(labels)]) ref.dist <- CalcPairwiseDist(xp[,label.cells], labels[label.cells]) embeddings.cor <- sapply(embeddings, function(emb) { emb.dist <- CalcPairwiseDist(emb[,label.cells], labels[label.cells]) cor(ref.dist, emb.dist) }) ## Compare the SWNE embedding to the variance stabilized expression ## space to ensure we're comparing apples to apples norm.ref.dist <- CalcPairwiseDist(norm.xp[,label.cells], labels[label.cells]) swne.emb.dist <- CalcPairwiseDist(swne.emb[,label.cells], labels[label.cells]) embeddings.cor <- c(embeddings.cor, cor(norm.ref.dist, swne.emb.dist)) names(embeddings.cor) <- c("tsne", "umap", "swne")
Compute how well each embedding maintains the nearest neighbors of each cell relative to the original gene expression space. This is meant to capture how well each embedding maintains the local structure of the data.
# Calculate neighborhood fidelity n.neighbors <- 30 # ref.knn <- ComputeKNN(xp[,label.cells], k = n.neighbors) # norm.ref.knn <- ComputeKNN(norm.xp[,label.cells], k = n.neighbors) load("~/swne/Data/Han/BM_BMcKit_PB_RData/Han_hemato_ref_knn.RData") ## Load pre-computed kNN networks to save time. Computing the kNN networks in the original expression space can take quite a bit of time. ## Compute kNN for embeddings embeddings.knn <- lapply(embeddings, function(x) ComputeKNN(x, k = n.neighbors)) knn.simil <- sapply(embeddings.knn, function(knn.emb) { mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i]))) }) swne.knn <- ComputeKNN(swne.emb, k = n.neighbors) knn.simil <- c(knn.simil, mean(sapply(1:ncol(swne.knn), function(i) CalcJaccard(swne.knn[,i], ref.knn[,i])))) names(knn.simil) <- c("tsne", "umap", "swne")
Finally we'll plot the combined global/local structure results
library(ggplot2) library(ggrepel) scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings.cor)) ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) + theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) + xlab("Neighborhood Similarity") + ylab("Cluster Distance Correlation") + geom_text_repel(aes(x, y, label = name), size = 6.5) + xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.