knitr::opts_chunk$set(echo = TRUE)
options(bitmapType="cairo")

This is a walkthrough comparing SWNE, UMAP, and t-SNE on a hematopoiesis dataset from the Mouse Cell Atlas. Based off of the analysis done by in Figure 2d of Becht, McInnes et al.

Load the required libraries and data. The Mouse Cell Atlas data as well as the UMAP, tSNE, and PCA reductions can be downloaded here courtesy of Becht, McInnes et al.

library(Matrix)
library(swne)

## Load data
load("~/swne/Data/Han/BM_BMcKit_PB_RData/xp.RData")
load("~/swne/Data/Han/BM_BMcKit_PB_RData/g2.RData")
load("~/swne/Data/Han/BM_BMcKit_PB_RData/cells_AUC.RData")

## Filter dataset
w2 <- !is.na(g2)
xp <- Matrix::t(xp[w2,])

load("~/swne/Data/Han/BM_BMcKit_PB_RData/pca_g2.RData")
rownames(pca) <- colnames(xp)

Assign labels to cells using the classifier results from Becht, McInnes et al.

## Assign labels to cells
lineages <- c("Multi Potential Progenitor", "Macrophage Lineage", "Neutrophil Lineage",
              "Erythrocyte Lineage", "B Cell Lineage", "T Cell Lineage", "NK Cell Lineage")
cutoffs <- setNames(c(0.04,0.09,0.05,0.045,0.09,0.075,0.04), lineages)

labels <- sapply(lineages, function(i) cells_AUC@assays$data[[1]][i,][w2] >= cutoffs[i])
labels <- apply(labels, 1, which)
labels <- sapply(labels, function(x) { if(length(x) == 1) {x} else {0} })
labels[labels != 0] <- lineages[labels[labels != 0]]
labels[labels == 0] <- NA
names(labels) <- colnames(xp)
labels <- factor(labels)
labels <- plyr::revalue(labels, replace = c("Multi Potential Progenitor" = "MPP",
                                            "Macrophage Lineage" = "Macrophage",
                                            "Neutrophil Lineage" = "Neutrophil",
                                            "Erythrocyte Lineage" = "Erythrocyte",
                                            "B Cell Lineage" = "B Cell",
                                            "T Cell Lineage" = "T Cell",
                                            "NK Cell Lineage" = "NK Cell"))
table(labels); paste("Cells with missing labels:", sum(is.na(labels)))

Make the t-SNE plot using the pre-computed t-SNE from Becht, McInnes et al.

## Set a seed to make sure the cluster colors are consistent
plot.seed <- 312525

load("~/swne/Data/Han/BM_BMcKit_PB_RData/tsne_g2.RData")
rownames(tsne) <- names(labels)
PlotDims(tsne, sample.groups = labels, show.legend = F, show.axes = F,
         alpha.plot = 0.75, label.size = 4, pt.size = 0.5,
         seed = plot.seed, use.brewer.pal = T)

Make the UMAP plot using the pre-computed UMAP from Becht, McInnes et al.

load("~/swne/Data/Han/BM_BMcKit_PB_RData/umap_g2.RData")
rownames(umap) <- names(labels)
PlotDims(umap, sample.groups = labels, show.legend = F, show.axes = F,
         alpha.plot = 0.75, label.size = 4, pt.size = 0.5,
         seed = plot.seed, use.brewer.pal = T)

Filter lowly expressed genes and get gene variance info

norm.xp <- xp*1000
norm.xp <- FilterData(norm.xp, min.samples.frac = 2.5e-4, trim = 1e-4, min.nonzero.features = 0,
                      max.sample.sum = Inf)
var.df <- AdjustVariance(norm.xp, verbose = F, plot = F)

Stabilize gene variances with a log-transformation

norm.xp@x <- log(norm.xp@x + 1)

Select variable genes to use

n.genes <- 4e3
var.df <- var.df[order(var.df$lp),]
var.genes <- rownames(var.df[1:n.genes,])

Run SWNE

n.cores <- 16
nmf.res <- RunNMF(norm.xp[var.genes,], k = 40, n.cores = n.cores, ica.fast = T)
nmf.res$W <- ProjectFeatures(norm.xp, nmf.res$H, n.cores = n.cores)

snn <- CalcSNN(t(pca), k = 30, prune.SNN = 0.0)
knn <- CalcKNN(t(pca), k = 30)
snn <- PruneSNN(snn, knn, qval.cutoff = 1e-3)
swne.embedding <- EmbedSWNE(nmf.res$H, SNN = snn, alpha.exp = 1.25, snn.exp = 0.25, n_pull = 3, 
                            proj.method = "sammon")
swne.embedding$H.coords$name <- ""

Embed some hematopoiesis marker genes and plot SWNE embedding

## Embed selected genes onto swne plot
genes.embed <- c("Ms4a1", "Cd4", "Ly6g", "Fcgr1")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed,
                                n_pull = 3)

## SWNE plot
PlotSWNE(swne.embedding, alpha.plot = 0.6, sample.groups = labels, do.label = T,
         label.size = 6, pt.size = 0.75, show.legend = F, seed = plot.seed,
         use.brewer.pal = T)

Next, we'll define some helper functions for quantitative benchmarking of these embeddings

library(FNN)
library(proxy)

## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
  my.knn <- FNN::get.knn(t(emb), k)

  n.cells <- ncol(emb)
  nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k - 1)])

  j <- as.numeric(x = t(x = nn.ranked))
  i <- ((1:length(x = j)) - 1) %/% k + 1
  nn.matrix <- as(sparseMatrix(i = i, j = j, x = 1, dims = c(ncol(emb), ncol(emb))), "dgCMatrix")
  rownames(nn.matrix) <- colnames(emb)
  colnames(nn.matrix) <- colnames(emb)

  nn.matrix
}


## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
  a <- sum(x)
  b <- sum(y)
  c <- sum(x == 1 & y == 1)
  c/(a + b - c)
}


## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
  data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
  return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}

Compute how well each embedding maintains the pairwise distances between cell types relative to the original gene expression space. This is meant to capture how well each embedding maintains the global structure of the data.

## Compile embeddings
embeddings <- list(tsne = t(tsne), umap = t(umap))
swne.emb <- t(as.matrix(swne.embedding$sample.coords))

## Compute cluster distance correlations
label.cells <- names(labels[!is.na(labels)])
ref.dist <- CalcPairwiseDist(xp[,label.cells], labels[label.cells])

embeddings.cor <- sapply(embeddings, function(emb) {
  emb.dist <- CalcPairwiseDist(emb[,label.cells], labels[label.cells])
  cor(ref.dist, emb.dist)
})

## Compare the SWNE embedding to the variance stabilized expression
## space to ensure we're comparing apples to apples
norm.ref.dist <- CalcPairwiseDist(norm.xp[,label.cells], labels[label.cells])
swne.emb.dist <- CalcPairwiseDist(swne.emb[,label.cells], labels[label.cells])
embeddings.cor <- c(embeddings.cor, cor(norm.ref.dist, swne.emb.dist))
names(embeddings.cor) <- c("tsne", "umap", "swne")

Compute how well each embedding maintains the nearest neighbors of each cell relative to the original gene expression space. This is meant to capture how well each embedding maintains the local structure of the data.

# Calculate neighborhood fidelity
n.neighbors <- 30
# ref.knn <- ComputeKNN(xp[,label.cells], k = n.neighbors)
# norm.ref.knn <- ComputeKNN(norm.xp[,label.cells], k = n.neighbors)
load("~/swne/Data/Han/BM_BMcKit_PB_RData/Han_hemato_ref_knn.RData") ## Load pre-computed kNN networks to save time. Computing the kNN networks in the original expression space can take quite a bit of time.

## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, function(x) ComputeKNN(x, k = n.neighbors))
knn.simil <- sapply(embeddings.knn, function(knn.emb) {
  mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})

swne.knn <- ComputeKNN(swne.emb, k = n.neighbors)
knn.simil <- c(knn.simil, mean(sapply(1:ncol(swne.knn), function(i) CalcJaccard(swne.knn[,i], ref.knn[,i])))) 
names(knn.simil) <- c("tsne", "umap", "swne")

Finally we'll plot the combined global/local structure results

library(ggplot2)
library(ggrepel)

scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings.cor))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
  theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
  xlab("Neighborhood Similarity") + ylab("Cluster Distance Correlation") +
  geom_text_repel(aes(x, y, label = name), size = 6.5) +
  xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))


yanwu2014/swne documentation built on Aug. 5, 2023, 4:42 a.m.