knitr::opts_chunk$set(echo = TRUE)

In this tutorial, we'll demonstrate how to infer gene regulatory network using single-cell multimodal data which measures gene expression and chromatin accessibility profiles from the same single cells. We will use a publicly available 10x Genomic Multiome data set for human PBMCs.

We first download the required data.

Run the following commands to download the data:

if(!dir.exists('./10x_pbmc')){
    dir.create('./10x_pbmc')
}

download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz', 
              destfile = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz', 
              method = 'wget', extra = '--no-check-certificate')

download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi', 
               destfile = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi', 
               method = 'wget', extra = '--no-check-certificate')

download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', 
              destfile = './10x_pbmc/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', 
              method = 'wget', extra = '--no-check-certificate')

Next, we load all necessary packages:

suppressMessages(library(ArchR))
suppressMessages(library(Seurat))
suppressMessages(library(Signac))
suppressMessages(library(scMEGA))
suppressMessages(library(Nebulosa))
suppressMessages(library(BSgenome.Hsapiens.UCSC.hg38))
suppressMessages(library(GenomeInfoDb))
suppressMessages(library(EnsDb.Hsapiens.v86))
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(JASPAR2020))
suppressMessages(library(TFBSTools))
suppressMessages(library(igraph))
suppressMessages(library(ggraph))
suppressMessages(library(MOJITOO))

Data pre-processing

Let's load the data into memory and extract scRNA and scATAC-seq data:

inputdata.10x <- Read10X_h5("./10x_pbmc/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5")

# extract RNA and ATAC data
rna_counts <- inputdata.10x$`Gene Expression`
atac_counts <- inputdata.10x$Peaks

# filter peaks by chromosome
atac_counts <- atac_counts[grep("chr", rownames(atac_counts)), ]

Next create a Seurat object and filter low-quality cells in scRNA-seq data

obj.rna <- CreateSeuratObject(counts = rna_counts)
obj.rna[["percent.mt"]] <- PercentageFeatureSet(obj.rna, pattern = "^MT-")

We can visualize the data quality

# Visualize QC metrics as a violin plot
VlnPlot(obj.rna, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0)

obj.rna <- subset(obj.rna, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 20)

obj.rna

Create Seurat object for scATAC-seq data:

# create seurat object
chrom_assay <- CreateChromatinAssay(
    counts = atac_counts,
    sep = c(":", "-"),
    min.cells = 1,
    genome = 'hg38',
    fragments = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz'
)

obj.atac <- CreateSeuratObject(
    counts = chrom_assay,
    assay = "ATAC")

# extract gene annotations from EnsDb
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86, 
                                   verbose = FALSE)

# change to UCSC style since the data was mapped to hg38
seqlevelsStyle(annotations) <- 'UCSC'

# add the gene information to the object
Annotation(obj.atac) <- annotations

Make sure that the two modalities have the same cells:

cell.sel <- intersect(colnames(obj.rna), colnames(obj.atac))

obj.rna <- subset(obj.rna, cells = cell.sel)
obj.atac <- subset(obj.atac, cells = cell.sel)

We next process the scRNA-seq and scATAC-seq data using standard Seurat and Signac analysis pipeline:

# normalization followed by dimensionality reduction
obj.rna <- obj.rna %>%
    NormalizeData(verbose=FALSE) %>%
    FindVariableFeatures(nfeatures=3000, verbose=F) %>%
    ScaleData() %>%
    RunPCA(verbose = FALSE) %>%
    RunUMAP(dims = 1:30, verbose = FALSE)

obj.atac <- obj.atac %>%
    RunTFIDF() %>%
    FindTopFeatures() %>%
    RunSVD() %>%
    RunUMAP(reduction = 'lsi', dims = 2:30, verbose = FALSE)

Visualize the scRNA-seq and scATAC-seq separately.

p1 <- DimPlot(obj.rna) + NoLegend()
p2 <- DimPlot(obj.atac) + NoLegend()

p1 + p2

Cell type annotation

We next will annotate the cell types in our multimodal data via label transfer approach implemented by Seurat. The reference dataset is found here

Download data

download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/seurat.rds', 
              destfile = './10x_pbmc/seurat.rds', 
              method = 'wget', extra = '--no-check-certificate')

Load data

reference <- readRDS("./10x_pbmc/seurat.rds")

reference <- UpdateSeuratObject(object = reference)

reference[["RNA"]] <- as(reference[["RNA"]], "Assay5")

# we only keep cells with annotated cell type
reference <- subset(reference, subset = celltype != "NA")

# run sctransform
reference <- reference %>%
    NormalizeData(verbose=FALSE) %>%
    FindVariableFeatures(nfeatures=3000, verbose=F) %>%
    ScaleData() %>%
    RunPCA(verbose = FALSE) %>%
    RunUMAP(dims = 1:30, verbose = FALSE)
p1 <- DimPlot(reference, label = TRUE, repel = TRUE, 
        reduction = "umap", group.by = "broad_celltype") + NoLegend()

p2 <- DimPlot(reference, label = TRUE, repel = TRUE, 
        reduction = "umap", group.by = "celltype") + NoLegend()

p1 + p2

Transfer cell type labels from reference to query:

transfer_anchors <- FindTransferAnchors(
  reference = reference,
  query = obj.rna,
  reference.reduction = "pca",
  dims = 1:30
)

predictions <- TransferData(
  anchorset = transfer_anchors, 
  refdata = reference$celltype,
  weight.reduction = obj.rna[['pca']],
  dims = 1:30
)

obj.rna <- AddMetaData(
  object = obj.rna,
  metadata = predictions
)

obj.atac <- AddMetaData(
  object = obj.atac,
  metadata = predictions
)

Visualize the predicted cell types:

p1 <- DimPlot(obj.rna, label = TRUE, repel = TRUE, 
        reduction = "umap", group.by = "predicted.id") + NoLegend()

p2 <- DimPlot(obj.atac, label = TRUE, repel = TRUE, 
        reduction = "umap", group.by = "predicted.id") + NoLegend()

p1 + p2

Integration of multimodal single-cell data using MOJITOO

We next need to project the cells into a low-dimensional space. Here we can use MOJITOO to do this job.

Let's first create a single Seurat object including both modalities and predicted cell types that we generated in the above step:

meta.data <- obj.rna@meta.data %>%
    as.data.frame()

# create a Seurat object containing the RNA adata
pbmc <- CreateSeuratObject(
  counts = GetAssayData(obj.rna, layer = "counts"),
  assay = "RNA",
  meta.data = meta.data
)

# create ATAC assay and add it to the object
pbmc[["ATAC"]] <- CreateChromatinAssay(
  counts = GetAssayData(obj.atac, layer = "counts",),
  sep = c(":", "-"),
  min.cells = 1,
  genome = 'hg38',
  fragments = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz'
)

MOJITOO takes as input the individual low-dimensional space for each modality, so here we do this again:

## RNA pre-processing and PCA dimension reduction
DefaultAssay(pbmc) <- "RNA"

pbmc <- pbmc %>%
    NormalizeData(verbose=F) %>%
    FindVariableFeatures(nfeatures=3000, verbose=F) %>%
    ScaleData(verbose=F) %>%
    RunPCA(npcs=50, reduction.name="RNA_PCA", verbose=F)

## ATAC pre-processing and LSI dimension reduction
DefaultAssay(pbmc) <- "ATAC"

pbmc <- pbmc %>%
    RunTFIDF(verbose=F) %>%
    FindTopFeatures(min.cutoff = 'q0', verbose=F) %>%
    RunSVD(verbose=F)

Run MOJITOO:

pbmc <- mojitoo(
     object = pbmc,
     reduction.list = list("RNA_PCA", "lsi"),
     dims.list = list(1:50, 2:50), ## exclude 1st dimension of LSI
     reduction.name = 'MOJITOO',
     assay = "RNA"
)

We can generate another UMAP representation based on the MOJITOO results:

DefaultAssay(pbmc) <- "RNA"
embedd <- Embeddings(pbmc[["MOJITOO"]])
pbmc <- RunUMAP(pbmc, 
                reduction="MOJITOO", 
                reduction.name="MOJITOO_UMAP", 
                dims=1:ncol(embedd), verbose=F)

DimPlot(pbmc, group.by = "predicted.id", 
        shuffle = TRUE, label = TRUE, reduction = "MOJITOO_UMAP") + NoLegend()

Trajectory analysis

We next infer a trajectory from naive CD4 T cells to memory CD4 T cells to characterize CD4+ T cell activation.

pbmc <- AddTrajectory(object = pbmc, 
                      trajectory = c("naive CD4 T cells", 
                                     "memory CD4 T cells"),
                      group.by = "predicted.id", 
                      reduction = "MOJITOO_UMAP",
                      dims = 1:2, 
                      use.all = FALSE)

# we only plot the cells that are in this trajectory
pbmc.t.cells <- pbmc[, !is.na(pbmc$Trajectory)]

The results can be visualized as:

p1 <- DimPlot(object = pbmc.t.cells, 
              group.by = "predicted.id", 
              reduction = "MOJITOO_UMAP",
             label = TRUE) + NoLegend()

p2 <- TrajectoryPlot(object = pbmc.t.cells, 
                    reduction = "MOJITOO_UMAP",
                    continuousSet = "blueYellow",
                    size = 1,
                   addArrow = FALSE)
p1 + p2

TF and gene selection

We next select candidate TFs and genes for building a meaningful gene regulatory network.

Select TFs

To identify potential regulator (i.e., TFs), we first estimate an acitivty score for each TF in each cell. This is done by first performing motif matching and then computing deviation scores using chromVAR.

# Get a list of motif position frequency matrices from the JASPAR database
pfm <- getMatrixSet(
  x = JASPAR2020,
  opts = list(collection = "CORE", tax_group = 'vertebrates', all_versions = FALSE)
)

# add motif information
pbmc.t.cells <- AddMotifs(
  object = pbmc.t.cells,
  genome = BSgenome.Hsapiens.UCSC.hg38,
  pfm = pfm,
    assay = "ATAC"
)

# run chromVAR
pbmc.t.cells <- RunChromVAR(
  object = pbmc.t.cells,
  genome = BSgenome.Hsapiens.UCSC.hg38,
    assay = "ATAC"
)

sel.tfs <- SelectTFs(object = pbmc.t.cells, 
                 return.heatmap = TRUE,
                cor.cutoff = 0.4)

df.cor <- sel.tfs$tfs
ht <- sel.tfs$heatmap

draw(ht)

Select genes

sel.genes <- SelectGenes(object = pbmc.t.cells,
                  labelTop1 = 0,
                  labelTop2 = 0)

df.p2g <- sel.genes$p2g
ht <- sel.genes$heatmap

draw(ht)

Gene regulatory network inference and visualization

We here will try to predict a gene regulatory network based on the correlation of the TF binding activity as estimated from snATAC-seq and gene expression as measured by snRNA-seq along the trajectory.

tf.gene.cor <- GetTFGeneCorrelation(object = pbmc.t.cells, 
                                    tf.use = df.cor$tfs, 
                                    gene.use = unique(df.p2g$gene),
                                    tf.assay = "chromvar", 
                                    gene.assay = "RNA",
                                    trajectory.name = "Trajectory")

We can then visualize this correlation matrix by heatmap. Also, we can cluster the genes and TFs to identify different regulatory modules for the predefined sub-populations.

ht <- GRNHeatmap(tf.gene.cor, 
                 tf.timepoint = df.cor$time_point)

ht

To associate genes to TFs, we will use the peak-to-gene links and TF binding sites information. Specifically, if a gene is regulated by a peak and this peak is bound by a TF, then we consider this gene to be a target of this TF.

motif.matching <- pbmc.t.cells@assays$ATAC@motifs@data
colnames(motif.matching) <- pbmc.t.cells@assays$ATAC@motifs@motif.names
motif.matching <-
    motif.matching[unique(df.p2g$peak), unique(tf.gene.cor$tf)]


df.grn <- GetGRN(motif.matching = motif.matching, 
                 df.cor = tf.gene.cor, 
                 df.p2g = df.p2g)

Finally, we can visualize our network as the last step of this analysis

# define colors for nodes representing TFs (i.e., regulators)
df.cor <- df.cor[order(df.cor$time_point), ]
tfs.timepoint <- df.cor$time_point
names(tfs.timepoint) <- df.cor$tfs

# plot the graph, here we can highlight some genes
df.grn2 <- df.grn %>%
    subset(correlation > 0.5) %>%
    select(c(tf, gene, correlation)) %>%
    rename(weights = correlation)

p <- GRNPlot(df.grn2, 
             tfs.timepoint = tfs.timepoint,
             show.tf.labels = TRUE,
             seed = 42, 
             plot.importance = FALSE,
            min.importance = 2,
            remove.isolated = FALSE)

print(p)

GRN visualization

Once we generated the gene regulatory network, we can visualize individual TFs in terms of binding activity, expression, and target expression along the pseudotime trajectory.

Here we select two TFs for visualization.

pbmc.t.cells <- AddTargetAssay(object = pbmc.t.cells, df.grn = df.grn2)

p1 <- PseudotimePlot(object = pbmc.t.cells, tf.use = "SOX4")
p2 <- PseudotimePlot(object = pbmc.t.cells, tf.use = "TBX21")

p1 + p2

The x-axis in above plots present pseudotime point along the trajectory, and the y-axis represent TF binding acitivty, TF expression, and TF target expression after z-score transformation.

# Check session information
sessionInfo()


CostaLab/scMEGA documentation built on Sept. 25, 2024, 6:11 a.m.