Started on r format(Sys.time(), "%Y-%m-%d %H:%M:%S")

library(Seurat)
library(ggplot2)
library(patchwork)
library(dplyr)
library(RColorBrewer)
library(kableExtra)
library(clustree)
library(ezRun)
library(scater)
library(SingleCellExperiment)
library(enrichR)
library(SCpubr)

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, knitr.table.format = "html")
posMarkers <- readxl::read_xlsx("posMarkers.xlsx")
spatialMarkers <- readxl::read_xlsx("spatialMarkers.xlsx")
param <- readRDS("param.rds")
input <- readRDS("input.rds")
output <- readRDS("output.rds")
scData <- readRDS("scData.rds")
scData.unfiltered_spatial <- readRDS("scData.unfiltered.rds")
imageName <- names(scData.unfiltered_spatial@images)
scData.unfiltered_spatial@images[[imageName]]@scale.factors$lowres=scData@images[[imageName]]@scale.factors$lowres
scDataRaw <- readRDS("scData.raw.rds")
allCellsMeta <- readRDS("allCellsMeta.rds")
sampleName <- input$getNames()

coord <- GetTissueCoordinates(object = scData@images[[imageName]])
myRatio <- (max(coord$imagerow) - min(coord$imagerow)) / (max(coord$imagecol) - min(coord$imagecol))

if (file.exists("cellsPerGeneFraction.rds")){
    cellsPerGeneFraction <-  readRDS("cellsPerGeneFraction.rds")
} else {
  cellsPerGeneFraction <- NULL
}
if (file.exists("enrichRout.rds")){
  enrichRout <- readRDS("enrichRout.rds")
  if(length(enrichRout) == 1){ ##enrichR call creates empty file
     enrichRout <- NULL 
  }
} else {
  enrichRout <- NULL
}
if (file.exists("aziResults.rds")){
  aziResults <- readRDS("aziResults.rds")
} else {
  aziResults <- NULL
}

# Process the posMarkers
posMarkers$gene = as.factor(posMarkers$gene)
# make sure the loaded cluster uses proper integer ordering if all levels are integer!
clusterSet <- posMarkers$cluster %>% as.integer() %>% unique() %>% sort(na.last=TRUE)
if (any(is.na(clusterSet))){
  posMarkers$cluster = as.factor(posMarkers$cluster)
} else {
  posMarkers$cluster = factor(posMarkers$cluster, levels=clusterSet)
}
posMarkers$p_val_adj[posMarkers$p_val_adj==0] <- min(posMarkers$p_val_adj[posMarkers$p_val_adj>0])

Analysis results {.tabset}

Quality control

Selected QC metrics

We use several common QC metrics to identify low-quality spots based on their expression profiles. The metrics that were chosen are described below.

  1. The library size is defined as the total sum of counts across all relevant features for each spot. Spots with small library sizes are of low quality as the RNA has been lost at some point during library preparation.
  2. The number of expressed features in each spot is defined as the number of genes with non-zero counts for that spot. Any spot with very few expressed genes is likely to be of poor quality as the diverse transcript population has not been successfully captured.
  3. The proportions of mitochondrial genes per spot. High proportions are indicative of poor-quality spots (Islam et al. 2014; Ilicic et al. 2016), presumably because of the loss of cytoplasmic RNA from perforated spots.


xxAll <- SingleCellExperiment(colData = allCellsMeta)
xxAll$discard <- !xxAll$useCell
scData.unfiltered <- CreateSeuratObject(ezMatrix(0, rows=1:10, cols=rownames(allCellsMeta)), meta.data = allCellsMeta)
scData.unfiltered$discard <- !colnames(scData.unfiltered) %in% colnames(scData)

Diagnostic plots

A key assumption here is that the QC metrics are independent of the biological state of each cell. Poor values (e.g., low library sizes, high mitochondrial proportions) are presumed to be driven by technical factors rather than biological processes, meaning that the subsequent removal of spots will not misrepresent the biology in downstream analyses. Major violations of this assumption would potentially result in the loss of cell types that have, say, systematically low RNA content or high numbers of mitochondria. We can check for such violations using some diagnostics plots. In the most ideal case, we would see normal distributions that would justify the thresholds used in outlier detection. A large proportion of spots in another mode suggests that the QC metrics might be correlated with some biological state, potentially leading to the loss of distinct cell types during filtering. The violin plots represent the spots that were kept (FALSE) and the ones that were discarded (TRUE) after QC filtering. The QC metrics were also plot onto the tissue section.

plotColData(xxAll, x="Sample", y="nCount_Spatial", colour_by="discard") + scale_y_log10() + ggtitle("Number of UMIs")
plotColData(xxAll, x="Sample", y="nFeature_Spatial", colour_by="discard") + ggtitle("Detected genes")
plotColData(xxAll, x="nCount_Spatial", y="nFeature_Spatial", colour_by="discard")+ scale_x_log10() + scale_y_log10()

xxAll$genePerCount <- xxAll$nFeature_Spatial / xxAll$nCount_Spatial
plotColData(xxAll, x="Sample", y="genePerCount", colour_by="discard") + ggtitle("Detected genes")
plotColData(xxAll, x="nCount_Spatial", y="genePerCount", colour_by="discard")+ scale_x_log10() + xlim(1, 10000) + ylim(0.2, 0.8)

plotColData(xxAll, x="Sample", y="percent_mito", colour_by="discard") + ggtitle("Mito percent")
plotColData(xxAll, x="nCount_Spatial", y="percent_mito", colour_by="discard")+ scale_x_log10()

if (!is.null(xxAll$percent_riboprot)){
  print(plotColData(xxAll, x="Sample", y="percent_riboprot", colour_by="discard") + ggtitle("Ribosomal percent"))
  print(plotColData(xxAll, x="nCount_Spatial", y="percent_riboprot", colour_by="discard")+ scale_x_log10())
}

Spatial QC Plots

a.) cropped version

if(any(is.nan(scData.unfiltered_spatial$percent_mito))){
  toKeep <- rownames(scData.unfiltered_spatial@meta.data)[-which(is.nan(scData.unfiltered_spatial$percent_mito))]
  scData.unfiltered_spatial <- subset(scData.unfiltered_spatial, cells = toKeep)
}
plot5 <- SpatialFeaturePlot(scData.unfiltered_spatial, features = "nCount_Spatial", pt.size.factor =  param$pt.size.factor) + theme(legend.position = "right") + theme(aspect.ratio = myRatio)
plot6 <- SpatialFeaturePlot(scData.unfiltered_spatial, features = "nFeature_Spatial",  pt.size.factor =  param$pt.size.factor) + theme(legend.position = "right") + theme(aspect.ratio = myRatio)
plot7 <- SpatialFeaturePlot(scData.unfiltered_spatial, features = "percent_mito",  pt.size.factor =  param$pt.size.factor) + theme(legend.position = "right") + theme(aspect.ratio = myRatio)
plot8 <- SpatialFeaturePlot(scData.unfiltered_spatial, features = "percent_riboprot",  pt.size.factor =  param$pt.size.factor) + theme(legend.position = "right") + theme(aspect.ratio = myRatio)

plot6 + plot8 + plot7 + plot5 + plot_layout(nrow = 2, ncol = 2)


b.) based on all spots without image based filtering by SpaceRanger

imageDims <- dim(GetImage(scDataRaw)[[1]])
my_pt.size.factor <- min(max(imageDims[1]/600,1), param$pt.size.factor)
plot0 <- SpatialFeaturePlot(scDataRaw, features = "nCount_Spatial", pt.size.factor = 0) + NoLegend()
plot1 <- SpatialFeaturePlot(scDataRaw, features = "nCount_Spatial", pt.size.factor = my_pt.size.factor, min.cutoff = 'q1', max.cutoff = 'q50', alpha = 0.5) + theme(legend.position = "right") 
plot2 <- SpatialFeaturePlot(scDataRaw, features = "nFeature_Spatial", pt.size.factor = my_pt.size.factor, min.cutoff = 'q1', max.cutoff = 'q50',  alpha = 0.5) + theme(legend.position = "right")

plot0 + plot1 + plot2 + plot_layout(nrow = 1, ncol = 3)


Spots filtering

A standard approach is to filter spots with a low amount of reads as well as genes that are present in at least a certain amount of spots. While simple, using fixed thresholds requires knowledge of the experiment and of the experimental protocol. An alternative approach is to use adaptive, data-driven thresholds to identify outlying spots, based on the set of QC metrics just calculated. To obtain an adaptive threshold, we assume that most of the dataset consists of high-quality spots. When the parameter values of nreads, ngenes, perc_mito and perc_ribo are specified, fixed thresholds are used for filtering. Otherwise, filtering is performed excluding spots that are outliers by more than r param$nmad MADs below the median for the library size and the number of genes detected. Spots with a percentage counts of mitochondrial genes above the median by r param$nmad MADs are also excluded.

wzxhzdk:7
wzxhzdk:8


Gene filtering

We also excluded genes that are lowly or not expressed in our system, as they do not contribute any information to our experiment and may add noise. In this case, we removed genes that were not expressed in at least r 100*param$cellsFraction % of the spots. In case one or more rare cell populations are expected we might need to decrease the percentage of spots.

cat("total genes:", nrow(cellsPerGeneFraction), "\n")
cat("genes kept:", nrow(scData), ", fraction:", round(nrow(scData)/nrow(cellsPerGeneFraction), digits = 2), " \n")
if(!is.null(cellsPerGeneFraction)) {
    p = ggplot(cellsPerGeneFraction, aes(frac)) + geom_histogram(
        aes(y = after_stat(density)),
        colour = "black",
        fill = "white",
        binwidth = 0.005
    )
    p = p + geom_vline(
        aes(xintercept = param$cellsFraction),
        color = "red",
        linetype = "dashed",
        size = 1
    )
    p = p + geom_density(alpha = .2, fill = "#FF6666") + labs(x = "Fraction") + ggtitle('Fraction of spots per gene')
    p
}

Dimensionality reduction

Dimensionality reduction aims to reduce the number of separate dimensions in the data. This is possible because different genes are correlated if they are affected by the same biological process. Thus, we do not need to store separate information for individual genes, but can instead compress multiple features into a single dimension. This reduces computational work in downstream analyses, as calculations only need to be performed for a few dimensions rather than thousands of genes; reduces noise by averaging across multiple genes to obtain a more precise representation of the patterns in the data, and enables effective plotting of the data.

The numbers of PCs that should be retained for downstream analyses typically range from 10 to 50. However, identifying the true dimensionality of a dataset can be challenging, that's why we recommend considering the ‘Elbow plot’ approach. a ranking of principal components based on the percentage of variance explained by each one. The assumption is that each of the top PCs capturing biological signal should explain much more variance than the remaining PCs. Thus, there should be a sharp drop in the percentage of variance explained when we move past the last “biological” PC. This manifests as an elbow in the scree plot, the location of which serves as a natural choice for a number of PCs.

pct <- scData[["pca"]]@stdev / sum(scData[["pca"]]@stdev) * 100
cumu <- cumsum(pct)
plot_df <- data.frame(pct = pct, cumu = cumu, rank = 1:length(pct))

# Elbow plot
p <- ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > param$npcs))
p <- p + geom_vline(xintercept = cumu[param$npcs], color = "grey") + geom_text() 
p <- p + xlab(paste("Variance based on 50 PCs of", param$nfeatures, "features")) +  ylab("Cumulative variance in % explained per PC") 
p <- p + theme_bw() + labs(color = paste("PC >", param$npcs)) + theme(legend.position = "bottom")
p <- p + annotate(geom="text", y=max(plot_df$pct), x=max(plot_df$cumu[param$npcs]), label=paste('Variance used:', round(plot_df$cumu[param$npcs], digits = 2), '%'), color="red")
p


Transforming the data and feature selection

We used the SCtransform v2 method from the Seurat package for normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. By default, SCtransform accounts for cellular sequencing depth, or nUMIs.
r if(ezIsSpecified(param$SCT.regress.CellCycle) && param$SCT.regress.CellCycle) { "We already checked cell cycle and decided that it does represent a major source of variation in our data, and this may influence clustering. Therefore, we regressed out variation due to cell cycle." } As a result, SCTransform ranked the genes by residual variance and returned the r param$nfeatures most variant genes.

Clustering

In order to find clusters of spots we first built a graph called K-nearest neighbor (KNN), where each node is a cell that is connected to its nearest neighbors in the high-dimensional space. Edges are weighted based on the similarity between the spots involved, with higher weight given to spots that are more closely related. This step takes as input the previously defined dimensionality of the dataset (first r param$npcs PCs). We then applied algorithms to identify “communities” of spots that are more connected to spots in the same community than they are to spots of different communities. Each community represents a cluster that we can use for downstream interpretation.

We can visualize the distribution of clusters in UMAP and image space. However, we should not perform downstream analyses directly on their coordinates. These plots are most useful for checking whether two clusters are actually neighboring subclusters or whether a cluster can be split into further subclusters.

scData$seurat_clusters <- Idents(scData)

p1 <- DimPlot(scData, label = FALSE, pt.size = 1.5) + labs(color = "seurat_clusters")
p1 <- LabelClusters(p1, id = "ident",  fontface = "bold", color = "black", size = 5)
p2 <- SpatialDimPlot(scData, label = TRUE, label.size = 4, pt.size.factor =  param$pt.size.factor) +
  labs(fill = "seurat_clusters") + theme(aspect.ratio = myRatio)

p1 + p2 + plot_annotation(
) + plot_layout(nrow = 1)

We can also plot each cluster separately

SpatialDimPlot(scData, cells.highlight = CellsByIdentities(scData), facet.highlight = TRUE, pt.size.factor =  param$pt.size.factor, 
               ncol = 3, cols.highlight = c('red', 'white')) & theme(aspect.ratio = myRatio)



The number of spots in each cluster and sample is represented in this barplot.


wzxhzdk:13
wzxhzdk:14


Cluster assessment

Segregation of clusters by various sources of uninteresting variation.

Once we have created the clusters we need to asses if the clustering was driven by technical artifacts or uninteresting biological variability, such as cell cycle, mitochondrial or ribosomal gene expression. We can explore whether the spots cluster by the different cell cycle phases. In such a case, we would have clusters where most of the spots would be in one specific phase. This bias could be taken into account when normalizing and transforming the data prior to clustering. We can also look at the total number of reads, genes detected and mitochondrial gene expression. The clusters should be more or less even but if we observe big differences among some of them for these metrics, we will keep an eye on them and see if the cell types we identify later can explain the differences.

plot1 <- VlnPlot(scData, features="nCount_Spatial", group.by="seurat_clusters") + ggtitle("Number of UMIs vs cluster") + 
  NoLegend()
plot2 <- VlnPlot(scData, features = "nFeature_Spatial", group.by="seurat_clusters", pt.size = 0.1) + 
  ggtitle("Number of genes vs cluster") +
  NoLegend()
plot3 <- VlnPlot(scData, features = "percent_mito",  group.by ="seurat_clusters", pt.size = 0.1) + 
  ggtitle("Mitochondrial percentage vs cluster") +
  NoLegend()
plot4 <- VlnPlot(scData, features = "percent_riboprot",  group.by ="seurat_clusters", pt.size = 0.1) + 
  ggtitle("Ribosomal percentage vs cluster") +
  theme(legend.position = "right")

plot1 + plot2 + plot3  + plot4 + plot_layout(nrow = 1, ncol = 4)

Cluster resolution

One of the most important parameters when clustering is k, the number of nearest neighbors used to construct the graph. This controls the resolution of the clustering where higher k yields a more inter-connected graph and broader clusters. Users can experiment with different values of k to obtain a satisfactory resolution. We recommend increasing the resolution when a rare population is expected. Below, it is shown a clustering tree that helps us to visualize the relationships between clusters at a range of resolutions. Each cluster forms a node in the tree and edges are constructed by considering the spots in a cluster at a lower resolution that end up in a cluster at the next highest resolution. By connecting clusters in this way, we can see how clusters are related to each other, which are clearly distinct and which are unstable. The size of each node is related to the number of spots in each cluster and the color indicates the clustering resolution. Edges are colored according to the number of spots they represent and the transparency shows the incoming node proportion, the number of spots in the edge divided by the number of samples in the node it points to.

clustree::clustree(scData, prefix = "SCT_snn_res.")


Cluster markers

cat("We found positive markers that defined clusters compared to all other spots via differential expression. The test we used was the Wilcoxon Rank Sum test. Genes with an average, at least 0.25-fold difference (log-scale) between the spots in the tested cluster and the rest of the spots and an adjusted p-value < 0.05 were declared as significant.")
cat("We found positive markers that defined clusters compared to all other spots via differential expression using a logistic regression test and including in the model the cell cycle as the batch effect. Genes with an average, at least 0.25-fold difference (log-scale) between the spots in the tested cluster and the rest of the spots and an adjusted p-value < 0.05 were declared as significant.")
ezInteractiveTableRmd(data.frame(posMarkers), digits=10)

Marker plots

Here, we use a heatmap and a dotplot to visualize simultaneously the top 5 markers in each cluster. Be aware that some genes may be in the top markers for different clusters.


top5 <- posMarkers %>% 
  group_by(cluster) %>%
  slice_max(n = 5, order_by = tibble(diff_pct, avg_log2FC))
genesToPlot <- c(gsub("_", "-", param$controlSeqs), unique(as.character(top5$gene)))
genesToPlot <- intersect(genesToPlot, rownames(scData))

Heatmap & DotPlot

DoHeatmap(scData, features=unique(genesToPlot))
DotPlot(scData, features=genesToPlot) + coord_flip() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

Group-wise DE Plot

do_GroupwiseDEPlot(sample = scData,
                   de_genes = posMarkers %>% mutate(gene=as.character(gene)),
                   min.cutoff = 1)

Top Markers

We plot on the tissue the top 5 markers in each cluster. Be aware that some genes may be in the top markers for different clusters.


eachCluster <- 0
for (eachCluster in levels(posMarkers$cluster)) {
  cat("\n")
  cat("#### Cluster ", eachCluster, "\n")
  cat("\n")
  markersPerCluster <- dplyr::filter(posMarkers, cluster == eachCluster) %>%
    dplyr::arrange(desc(avg_log2FC)) %>%
    select(gene) %>%
    pull()
  markersPerCluster <- head(markersPerCluster, 5)
  eachMarker <- markersPerCluster[1]
  for (eachMarker in markersPerCluster) {
    sp = SpatialFeaturePlot(object = scData, features = eachMarker, alpha = c(0.1, 1), pt.size.factor =  param$pt.size.factor) + theme(aspect.ratio = myRatio)
    print(sp)
  }
  cat("\n")
}

Spatially Variable Genes plots

We also identify molecular features that correlate with spatial location within a tissue. Here we show the expression of the top 10 features identified by this measure. The full list can be found in the Data availability section.

top.features <- head(spatialMarkers$GeneSymbol, 10)
for(gene in top.features) {
   sp = SpatialFeaturePlot(object = scData, features = gene, alpha = c(0.1, 1), pt.size.factor = param$pt.size.factor) + theme(aspect.ratio = myRatio)
   print(sp)
}

EnrichR

Cell type scoring using the EnrichR tool. This approach consists in performing a gene set enrichment analysis on the marker genes defining each cluster. This identifies the pathways and processes that are (relatively) active in each cluster based on the upregulation of the associated genes compared to other clusters.

markersPerClusterTable <- c()
eachCluster <- 0
for (eachCluster in levels(posMarkers$cluster)) {
  markersPerCluster <- dplyr::filter(posMarkers, cluster == eachCluster) %>%
    dplyr::arrange(desc(avg_log2FC))
  markersPerCluster <- head(markersPerCluster, min(nrow(markersPerCluster), 500))
  markersPerClusterTable <- rbind(markersPerClusterTable,markersPerCluster)
}

genesPerCluster <- split(markersPerClusterTable$gene, markersPerClusterTable$cluster)
jsCall = paste0('enrich({list: "', sapply(genesPerCluster, paste, collapse="\\n"), '", popup: true});')
enrichrCalls <- paste0("<a href='javascript:void(0)' onClick='", jsCall, 
                         "'>Analyse at Enrichr website</a>")
enrichrTable <- tibble(Cluster=names(genesPerCluster),
                         "# of posMarkers"=lengths(genesPerCluster),
                         "Enrichr link"=enrichrCalls)
if (!is.null(enrichRout)){
  enrichRTerm <- as.data.frame(do.call(rbind, lapply(enrichRout, as.vector)))
  enrichRTerm <- map_df(enrichRTerm, ~ map_df(.x, ~ replace(.x, is.null(.x), NA)), .id = "database")
  enrichRTerm <- enrichRTerm %>%
    group_by(., Cluster, database) %>%
    summarise(topTerms = paste(Term, collapse = "; ")) %>%
    as.data.frame()
  enrichRTerm <- dcast(enrichRTerm, ... ~ database)
  enrichrTable <- merge(enrichrTable, enrichRTerm, by = "Cluster")
}
kable(enrichrTable, format="html", escape=FALSE,
        caption=paste0("GeneSet enrichment")) %>%
kable_styling("striped", full_width = F, position = "left")

Azimuth

scData$ident = scData$Azimuth.celltype.l1
p1 <- DimPlot(scData, label = FALSE, pt.size = 1.5) + labs(color = "ident")
p1 <- LabelClusters(p1, id = "ident",  fontface = "bold", color = "black", size = 2.8)
p1

p2 <- SpatialDimPlot(scData, label = TRUE, label.size = 2, pt.size.factor = param$pt.size.factor, group.by = "Azimuth.celltype.l1") + labs(fill = "Azimuth.celltype.l1") + theme(aspect.ratio = myRatio)
p2
scData$ident = scData$Azimuth.celltype.l2
p1 <- DimPlot(scData, label = FALSE, pt.size = 1.5) + labs(color = "ident")
p1 <- LabelClusters(p1, id = "ident",  fontface = "bold", color = "black", size = 2.8)
p1
p2 <- SpatialDimPlot(scData, label = TRUE, label.size = 2, pt.size.factor = param$pt.size.factor, group.by = "Azimuth.celltype.l2") + labs(fill = "Azimuth.celltype.l2") + theme(aspect.ratio = myRatio)
p2
scData$ident = scData$Azimuth.celltype.l3
p1 <- DimPlot(scData, label = FALSE, pt.size = 1.5) + labs(color = "ident")
p1 <- LabelClusters(p1, id = "ident",  fontface = "bold", color = "black", size = 2.8)
p1
p2 <- SpatialDimPlot(scData, label = TRUE, label.size = 2, pt.size.factor = param$pt.size.factor, group.by = "Azimuth.celltype.l3") + labs(fill = "Azimuth.celltype.l3") + theme(aspect.ratio = myRatio)
p2
scData$ident = scData$Azimuth.celltype.l4
p1 <- DimPlot(scData, label = FALSE, pt.size = 1.5) + labs(color = "ident")
p1 <- LabelClusters(p1, id = "ident",  fontface = "bold", color = "black", size = 2.8)
p1
p2 <- SpatialDimPlot(scData, label = TRUE, label.size = 2, pt.size.factor = param$pt.size.factor, group.by = "Azimuth.celltype.l4") + labs(fill = "Azimuth.celltype.l4") + theme(aspect.ratio = myRatio)
p2

Interactive

simple explorer{target="_blank"}

Data availability

Aggregated expression of every gene across the spots in each cluster

geneExprPerCluster

Aggregated expression of every gene across all the spots

geneExprPerSample

Positive markers of each cluster

posMarkers

Spatially variable genes

spatialMarkers

The final Seurat Object is here
LoupeBrowserObject here

Parameters

param[c("npcs","pcGenes","resolution", "SCT.regress.CellCycle", "DE.method", "cellsFraction", "nUMIs", "nmad")]

SessionInfo

ezSessionInfo()


uzh/ezRun documentation built on May 4, 2024, 3:23 p.m.