In ccb-hms/SpatialData: Collection of spatially resolved omics datasets

knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
)

Setup

library(ggplot2)
library(vroom)
library(jsonlite)
library(BumpyMatrix)
library(SpatialExperiment)

Start with downloading from and unzipping the dataset from: https://datadryad.org/stash/dataset/doi:10.5061%2Fdryad.jm63xsjb2

data.dir <- "data_release_baysor_merfish_gut"
raw.dir <- file.path(data.dir, "raw_data")
proc.dir <- file.path(data.dir, "data_analysis")
baysor.dir <- file.path(proc.dir, "baysor") 
cellpose.dir <- file.path(proc.dir, "cellpose") 
baysor.cellpose.dir <- file.path(proc.dir, "baysor_membrane_prior")

Data

Def. ileum: the final and longest segment of the small intestine.

Raw data

What's there:

list.files(raw.dir)

mRNA molecule data: 820k observations for 241 genes

mol.file <- file.path(raw.dir, "molecules.csv")
mol.dat <- vroom::vroom(mol.file)
mol.dat <- data.frame(mol.dat)
dim(mol.dat)
head(mol.dat)
length(unique(mol.dat$gene))

Image data:

DAPI stain signal:

dapi.file <- file.path(raw.dir, "dapi_stack.tif")
dapi.img <- SpatialExperiment::SpatialImage(dapi.file)
dapi.img <- SpatialExperiment::mirrorImg(dapi.img, "h")
class(dapi.img)
SpatialExperiment::imgSource(dapi.img)
plot(SpatialExperiment::imgRaster(dapi.img))

Membrane Na+/K+ - ATPase immunofluorescence signal:

mem.file <- file.path(raw.dir, "membrane_stack.tif")
mem.img <- SpatialExperiment::SpatialImage(mem.file)
mem.img <- SpatialExperiment::mirrorImg(mem.img, "h")
class(mem.img)
SpatialExperiment::imgSource(mem.img)
plot(SpatialExperiment::imgRaster(mem.img))

Processed data

Baysor (mRNA-only) segmentation

Cell-by-gene matrix. Rows: genes; Cols: cells

counts.file <- file.path(baysor.dir, "segmentation", "segmentation_counts.tsv")
counts.baysor <- vroom::vroom(counts.file, col_names = FALSE)
counts.baysor <- data.frame(counts.baysor)
genes <- counts.baysor[,1]
counts.baysor <- as.matrix(counts.baysor[,-1])
rownames(counts.baysor) <- genes
dim(counts.baysor)
counts.baysor[1:5,1:5]

Cell metadata. Each cell corresponds to each column in segmentation_counts.csv.

cdat.file <- file.path(baysor.dir, "segmentation", "segmentation_cell_stats.csv")
cdat.baysor <- vroom::vroom(cdat.file)
cdat.baysor <- data.frame(cdat.baysor)
colnames(counts.baysor) <- cdat.baysor$cell
dim(cdat.baysor)
head(cdat.baysor)

Segmentation and mRNA metadata. Each row corresponds to one mRNA in raw_data/molecules.csv

rdat.file <- file.path(baysor.dir, "segmentation", "segmentation.csv")
rdat.baysor <- vroom::vroom(rdat.file)
rdat.baysor <- data.frame(rdat.baysor)
rdat.baysor <- subset(rdat.baysor, cell != 0)
dim(rdat.baysor)
head(rdat.baysor)

Polygon segmentation borders:

Segmentation borders are provided in JSON format, and we have polygon segmentation borders for each of the nine z-layers:

poly.file <- file.path(baysor.dir, "segmentation", "poly_per_z.json")
pdat <- jsonlite::fromJSON(poly.file)
dim(pdat)
colnames(pdat)

Now let's look at the segmentation borders of the first cell in the first z-layer:

head(pdat[1,2][[1]]$coordinates[[1]])
pdf <- as.data.frame(pdat[1,2][[1]]$coordinates[[1]][1,,])
colnames(pdf) <- c("x", "y")
ggplot(pdf, aes(x = x, y = y)) + geom_polygon()

That is a terribly nested data structure, let's reshape that to a data.frame so we can more conveniently work with that.

We therefore first write a function to pull out the coordinates for one z-layer at a time:

pcoords <- pdat[1,2][[1]]$coordinates
getSegmentationDF <- function(coords)
{
    y <- lapply(coords, function(x) as.data.frame(x[1,,])) 
    y <- lapply(y, as.matrix)
    z <- do.call(rbind, y)
    n <- vapply(y, nrow, numeric(1)) 
    colnames(z) <- c("x", "y")
    z <- cbind(z, cell = rep(seq_along(y), n))    
    return(z)
}
head(getSegmentationDF(pcoords))

We then apply the function to each z-layer to arrive at an overall data.frame:

dfl <- lapply(pdat[,2], function(x) getSegmentationDF(x$coordinates))
n <- vapply(dfl, nrow, numeric(1))
df <- do.call(rbind, dfl)
df <- cbind(df, z = rep(seq_along(dfl), n))
df <- data.frame(df)
head(df)
dim(df)

Ok, now we can look into how many cells / polygons do we have for each z-layer:

spl <- split(df$cell, df$z)
spl <- lapply(spl, unique)
lengths(spl)

Question: how to relate that to the 5,800 cells that we have in the segmentation counts matrix?

Somehow the numbers don't add up with what we have in the segmentation table:

splr <- split(rdat.baysor$cell, rdat.baysor$z_raw)
length(splr)
splr <- lapply(splr, unique)
lengths(splr)

Next question will be how to add the coordinates of the segmentation borders to the SpatialExperiment. Shila Ghazanfour did some work with having instances of IntegerLists in the spatialCoords slot here.

Clustering / cell type assignment: assignment of each cell to cell type clusters.

ctype.file <- file.path(baysor.dir, "clustering", "cell_assignment.csv")
ctype.baysor <- vroom::vroom(ctype.file)
ctype.baysor <- data.frame(ctype.baysor)
dim(ctype.baysor)
head(ctype.baysor)
table(ctype.baysor$leiden_final)

combine with cell metadata

stopifnot(all(ctype.baysor$cell == cdat.baysor$cell))
cdat.baysor <- merge(cdat.baysor, ctype.baysor, by = "cell")
head(cdat.baysor)

Marker genes: Marker gene statistics for each cluster

marker.file <- file.path(baysor.dir, "clustering", "marker_genes.csv")
marker.baysor <- vroom::vroom(marker.file)
marker.baysor <- data.frame(marker.baysor)
dim(marker.baysor)
head(marker.baysor)

Quick sanity check whether all markers are present in the count matrix:

all(marker.baysor$gene_name %in% rownames(counts.baysor))
hist(table(marker.baysor$gene_name))

Construct SpatialExperiment:

Construct data.frame of molecule coordinates. Here, we only include the x-, y-, z-coordinates, but we could keep all columns from the molecule metadata file such as qc_score and assignment confidence.

genes <- rdat.baysor$gene
cells <- rdat.baysor$cell
mol <- BumpyMatrix::splitAsBumpyMatrix(rdat.baysor[, c("x", "y", "z")], 
                                       row = genes, col = cells)

Quick sanity check for molecules of a specific gene in a specific cell:

subset(rdat.baysor, gene == "Neat1" & cell == 1)
mol["Neat1",1]
counts.baysor["Neat1",1]

stopifnot(all(rownames(mol) == rownames(counts.baysor)))
spe <- SpatialExperiment(assays = list(counts = counts.baysor, molecules = mol),
                         colData = cdat.baysor,
                         spatialCoordsNames = c("x", "y"),
                         spatialDataNames = c("density", "elongation", "area", "avg_confidence"))
spe

Add the images:

spe <- SpatialExperiment::addImg(spe, 
                                 sample_id = "sample01", 
                                 image_id = "dapi",
                                 imageSource = dapi.file, 
                                 scaleFactor = NA_real_, 
                                 load = FALSE)
spe <- SpatialExperiment::addImg(spe, 
                                 sample_id = "sample01", 
                                 image_id = "membrane",
                                 imageSource = mem.file, 
                                 scaleFactor = NA_real_, 
                                 load = FALSE)
spe

Inspect SpatialExperiment:

assay(spe, "counts")[1:5,1:5]
assay(spe, "molecules")["Neat1",1]
colData(spe)
head(spatialCoords(spe))
spatialData(spe)
imgData(spe)
imgData(spe)$data

Cellpose (membrane or DAPI-based) segmentation

Ok, here is a question: where do we store alternative segmentations? The altExps slot of a SummarizedExperiment-derivative is thought for alternative experiments over a distinct set of features for the same samples. But here we are having a distinct set of samples/cells for the sample features. Is this just a separate SpatialExperiment with potentially duplicating information (incl. the images!) or a use case for MultiAssayExperiment? Or maybe alternative BumpyMatrices in a separate slot such as the reducedDim slot.

Cell-by-gene matrix. Rows: genes; Cols: cells

counts.file <- file.path(cellpose.dir, "segmentation", "segmentation_counts.tsv")
counts.cellpose <- vroom::vroom(counts.file) 
counts.cellpose <- data.frame(counts.cellpose) 
dim(counts.cellpose)
counts.cellpose[1:5,1:5]

Cell coordinates:

cdat.file <- file.path(cellpose.dir, "segmentation", "cell_coords.csv")
cdat.cellpose <- vroom::vroom(cdat.file)
cdat.cellpose <- data.frame(cdat.cellpose)
dim(cdat.cellpose)
head(cdat.cellpose)

Baysor (with Cellpose membrane segmentation prior) segmentation

Cell-by-gene matrix. Rows: genes; Cols: cells

counts.file <- file.path(baysor.cellpose.dir, "segmentation", "segmentation_counts.tsv")
counts.baysor.cellpose <- read.delim(counts.file, header = FALSE) 
dim(counts.baysor.cellpose)
counts.baysor.cellpose[1:5,1:5]

Cell metadata. Each cell corresponds to each column in segmentation_counts.csv.

cdat.file <- file.path(baysor.dir, "segmentation", "segmentation_cell_stats.csv")
cdat.baysor.cellpose <- vroom::vroom(cdat.file)
cdat.baysor.cellpose <- data.frame(cdat.baysor.cellpose)
colnames(counts.baysor.cellpose) <- cdat.baysor.cellpose$cell
dim(cdat.baysor.cellpose)
head(cdat.baysor.cellpose)

Segmentation and mRNA metadata. Each row corresponds to one mRNA in raw_data/molecules.csv

rdat.file <- file.path(baysor.dir, "segmentation", "segmentation.csv")
rdat.baysor.cellpose <- vroom::vroom(rdat.file)
rdat.baysor.cellpose <- data.frame(rdat.baysor.cellpose)
rdat.baysor.cellpose <- subset(rdat.baysor.cellpose, cell != 0)
dim(rdat.baysor.cellpose)
head(rdat.baysor.cellpose)

Visualization

Cell metadata

Overlay cell type annotation as in Figure 6 of the publication.

plot(SpatialExperiment::imgRaster(mem.img))
endo.ind <- spe$leiden_final == "Endothelial"
points(x = spatialCoords(spe)[endo.ind, "x"], 
       y = spatialCoords(spe)[endo.ind, "y"],
       col = "lightgreen", pch = 20)
bplasm.ind <- spe$leiden_final == "B (Plasma)"
points(x = spatialCoords(spe)[bplasm.ind, "x"],
       y = spatialCoords(spe)[bplasm.ind, "y"],
       col = "firebrick", pch = 20)

Segmentation

Let's plot segmentation borders for the first z-layer:

df1 <- subset(df, z == 1)
ggplot(df1, aes(x = x, y = y)) + 
    geom_polygon(aes(group = cell)) +
    theme_void()

Add holes:

.f <- function(df) 
{
    df$x <- df$x + 0.5 * (mean(df$x) - df$x)
    df$y <- df$y + 0.5 * (mean(df$y) - df$y)
    return(df)
}
spl <- split(df1, df1$cell)
dl <- lapply(spl, .f)
holes <- do.call(rbind, dl)  
df1$subid <- 1L
holes$subid <- 2L
df1 <- rbind(df1, holes)

Plot with holes:

ggplot(df1, aes(x = x, y = y)) + 
    geom_polygon(aes(group = cell, subgroup = subid), fill = "firebrick") +
    theme_void()

Plot over image:

grb <- grid::rasterGrob(mem.img,
    interpolate = FALSE,
    width = unit(1, "npc"),
    height = unit(1, "npc"))
p <- ggplot() + 
    annotation_custom(
        grob = grb,
        xmin = 0,
        xmax = ncol(grb$raster),
        ymin = 0,
        ymax = nrow(grb$raster)) + 
    coord_fixed(
        xlim = c(0, ncol(grb$raster)),
        ylim = c(0, nrow(grb$raster))) 
p <- p + geom_polygon(
            data = df1,
            aes(x = x, y = y, group = cell, subgroup = subid), 
            fill = "lightblue")
p + theme_void()

Interactive exploration with iSEE and Vitessce

Create and process SingleCellExperiment for iSEE:

sce <- SingleCellExperiment(assays = list(counts = counts.baysor), 
                            colData = cdat.baysor)
sce <- scater::logNormCounts(sce)
sce <- scater::runPCA(sce)
sce <- scater::runTSNE(sce, dimred = "PCA", perplexity = 30)
sce <- scater::runUMAP(sce, dimred = "PCA")

Some cosmetics for visualization purposes:

sce.sub <- subset(sce, , leiden_final != "Removed")
lev <- levels(factor(sce.sub$leiden_final))
levi <- setdiff(lev, lev[c(1,4,6)])
sce.sub <- subset(sce, , leiden_final %in% levi)
for(col in c("x", "y", "sizeFactor"))
    sce.sub[[col]] <- round(sce.sub[[col]], digits = 3)
sce.sub