In stevexniu/spots: Spatial Component Analysis

options(width = 150)
knitr::opts_chunk$set(
  collapse = TRUE,
  tidy = FALSE,
  message = FALSE,
  warning = FALSE
)

Load Visium Brain Data

For this tutorial, we will be analyzing the Visium Sagittal-Posterior Mouse Brain data produced by 10x Genomics.

A fraction of this data containing 3353 spatial barcodes (spots) and 2000 highly variable genes can be loaded using the LoadData function. We will be using Seurat for this analysis.

# install Seurat v4.0.0
if (!requireNamespace("Seurat", quietly = TRUE) | utils::packageVersion("Seurat") < "4.0.0")
  remotes::install_version("Seurat", version = "4.0.0")

# load data
library(spots)
library(Seurat)
library(ggplot2)
posterior1 <- LoadData("~/Downloads/","Visium.Brain")
SpatialDimPlot(posterior1, pt.size.factor = 1.2, group.by = "region")

Spatial Component Analysis (SCA)

To run Spatial Component Analysis (SCA), we will need to define a weight matrix between each pair of datapoints (spots).

To achieve this, we first calculate the distance between each spots using hexagonal nearest neighbor distance (VisiumHnn and HnnNeighbor functions) as it resembles the way Visium slides were generated.

Typically, datapoints not located on the edges should have 6 1st-degree, 12 2nd-degree, 18 3rd-degree neighbors and so on. These specific parameters can be controlled using HnnNeighbor function.

{width=400px}

To turn this distance matrix to a weight matrix, we call HnnWeight function to apply a Gaussian filter. I'm using parameter dist.k = 2 to consider only the 2nd-degree neighbors and sigma = 2 for filter width.

# HNN distance 
posterior1.hnn.dist <- VisiumHnn("~/Downloads/", Cells(posterior1))

# HNN weights
posterior1.data.hnn <- HnnNeighbor(posterior1.hnn.dist, k = 19, include.self = FALSE)
posterior1.data.weight <- HnnWeight(posterior1.data.hnn$dist.mat, dist.k = 2, sigma = 2)

After this, we are ready to run SCA using the 2000 spatial transcriptomic measurements and the weight matrix we just calculated. Here I set the n.eigen = 30 to return only the first 30 Spatial Components (SCs), and plot four different visualizations:

Dimensional reduction plot of first two SCs. (top left)
Elbow plot showing the eigenvalues or variance explained. (top right)
Visualization of the first SC levels in space. (bottom left)
Heatmap showing the features with the largest SC loadings. (bottom right)

# Run SCA
posterior1 <- ScaleData(posterior1)
posterior1.sca <- SCA(X = Matrix::t(posterior1@assays$Spatial@data), 
                      W = posterior1.data.weight, 
                      scaled.data = t(posterior1@assays$Spatial@scale.data),
                      n.eigen = 30)

# Store in the Seurat object
posterior1@reductions[["sca"]] <- CreateDimReducObject(embeddings = posterior1.sca$X, 
                                                       loadings = posterior1.sca$rotation, 
                                                       stdev = posterior1.sca$eigenvalues, 
                                                       key = "SC_", assay = "Spatial")

# Visualization
p1 <- DimPlot(posterior1, reduction = "sca", group.by = "region")
p2 <- ElbowPlot(posterior1, reduction = "sca", ndims = 30)
p3 <- SpatialFeaturePlot(posterior1, features = 'SC_1', pt.size.factor = 1.2)
p4 <- DimHeatmap(posterior1, reduction = "sca", fast = FALSE)
(p1+p2)/(p3+p4)

Downstream Analysis

Once finished, we can use the Spatial Components (SCs) for downstream analysis such as clustering. Here we utilized Seurat's FindClusters function to identify clusters of spots using the first 20 SCs we just calculated.

posterior1 <- FindNeighbors(posterior1, reduction = "sca", dims = 1:20,graph.name = c("sca.nn", "sca.snn"))
posterior1 <- FindClusters(posterior1, resolution = 1, verbose = FALSE, graph.name = "sca.snn")
SpatialDimPlot(posterior1, label = TRUE, label.size = 7, repel = TRUE, pt.size.factor = 1.2)

Comparison to PCA

We can compare the clustering result based on SCA to PCA. Here I used the same clustering parameters but with the first 20 PCs instead of SCs.

posterior1 <- RunPCA(posterior1, npcs = 30, verbose = FALSE)
posterior1 <- FindNeighbors(posterior1, reduction = "pca", dims = 1:20, graph.name = c("pca.nn", "pca.snn"))
posterior1 <- FindClusters(posterior1, resolution = 1, verbose = FALSE, graph.name = "pca.snn")
p1 <- SpatialDimPlot(posterior1, label = TRUE, label.size = 5, group.by = "sca.snn_res.1", repel = TRUE, pt.size.factor = 1.2) 
p2 <- SpatialDimPlot(posterior1, label = TRUE, label.size = 5, group.by = "pca.snn_res.1", repel = TRUE, pt.size.factor = 1.2) 
p1+p2

There are 19 clusters (0-18) and 17 clusters (0-16) that were identified by SCA and PCA respectively.

We can visualize the differences between the two clustering results using Chi-square test and interpret the result with Chi-square residuals.

# Chi-square test
tab <- xtabs(data = posterior1@meta.data[,c("sca.snn_res.1","pca.snn_res.1")])
chi.result <- chisq.test(tab)
chi.resid <- unclass(chi.result$residuals)

# Data for visualization
data.plot <- Seurat:::Melt(chi.resid)
colnames(data.plot)[1:3] <- c("SCA clustering", "PCA clustering", "Chi-square Residual")

# Reorder the columns
data.plot[,1] <- factor(data.plot[,1], 
                        levels = order(apply(chi.resid, 1, which.max))[c(1:10,12:11,13:19)]-1, 
                        ordered = TRUE)
data.plot[,2] <- factor(data.plot[,2], levels = unique(as.numeric(data.plot[,2])), ordered = TRUE)

# Heatmap
ggplot(data = data.plot, aes(x=`PCA clustering`, y=`SCA clustering`, fill=`Chi-square Residual`)) + 
  geom_tile() + cowplot::theme_cowplot(font_size = 20) + labs(fill="Chi-square\nResidual")

Conclusion

For this data, the clustering results based on SCA and PCA in general agree with each other. However, as we noted PCA cluster 4, 9 and 10 were split by the SCA each into two distinct groups.

Here, we will go through one of them in detail to check whether SCA made a false discovery or PCA underestimated certain distinct clusters.

PCA Cluster 4 vs SCA Cluster 10 and 14

cluster4 <- subset(posterior1, pca.snn_res.1 == 4 & sca.snn_res.1 %in% c(10, 14))
cluster4$sca.snn_res.1 <- factor(cluster4$sca.snn_res.1)
cluster4$pca.snn_res.1 <- factor(cluster4$pca.snn_res.1)
p1 <- SpatialDimPlot(cluster4, group.by = "pca.snn_res.1", pt.size.factor = 3, 
                     label = TRUE, label.size = 5) + 
  ggtitle("PCA Clustering") + NoLegend() + 
  theme(plot.title = element_text(size = 15, face = 2, hjust = 0.5))
p2 <- SpatialDimPlot(cluster4, group.by = "sca.snn_res.1", pt.size.factor = 3, 
                     label = TRUE, label.size = 5) + 
  ggtitle("SCA Clustering") + NoLegend() + 
  theme(plot.title = element_text(size = 15, face = 2, hjust = 0.5))
p1 + p2

We can examine the differentially expressed genes between cluster 10 and 14 from SCA clustering. We see that these are heterogeneous populations marked by different gene expressions.

de.genes <- FindMarkers(cluster4, 14, 10, group.by = "sca.snn_res.1", verbose = FALSE)
de.genes <- de.genes[de.genes$p_val_adj < 0.05, ]
de.genes <- de.genes[order(de.genes$avg_log2FC),]
cluster4 <- ScaleData(cluster4, features = rownames(de.genes))
p1 <- DoHeatmap(cluster4, rownames(de.genes), group.by = "sca.snn_res.1")
p2 <- SpatialFeaturePlot(cluster4, c(head(rownames(de.genes),2), tail(rownames(de.genes),2)),
                         pt.size.factor = 3)
p1 + p2