knitr::opts_chunk$set(echo = TRUE)
In this tutorial, we'll demonstrate how to infer gene regulatory network using single-cell multimodal data which measures gene expression and chromatin accessibility profiles from the same single cells. We will use a publicly available 10x Genomic Multiome data set for human PBMCs.
We first download the required data.
Run the following commands to download the data:
if(!dir.exists('./10x_pbmc')){ dir.create('./10x_pbmc') } download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz', destfile = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz', method = 'wget', extra = '--no-check-certificate') download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi', destfile = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi', method = 'wget', extra = '--no-check-certificate') download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', destfile = './10x_pbmc/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', method = 'wget', extra = '--no-check-certificate')
Next, we load all necessary packages:
suppressMessages(library(ArchR)) suppressMessages(library(Seurat)) suppressMessages(library(Signac)) suppressMessages(library(scMEGA)) suppressMessages(library(Nebulosa)) suppressMessages(library(BSgenome.Hsapiens.UCSC.hg38)) suppressMessages(library(GenomeInfoDb)) suppressMessages(library(EnsDb.Hsapiens.v86)) suppressMessages(library(ggplot2)) suppressMessages(library(dplyr)) suppressMessages(library(JASPAR2020)) suppressMessages(library(TFBSTools)) suppressMessages(library(igraph)) suppressMessages(library(ggraph)) suppressMessages(library(MOJITOO))
Let's load the data into memory and extract scRNA and scATAC-seq data:
inputdata.10x <- Read10X_h5("./10x_pbmc/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5") # extract RNA and ATAC data rna_counts <- inputdata.10x$`Gene Expression` atac_counts <- inputdata.10x$Peaks # filter peaks by chromosome atac_counts <- atac_counts[grep("chr", rownames(atac_counts)), ]
Next create a Seurat object and filter low-quality cells in scRNA-seq data
obj.rna <- CreateSeuratObject(counts = rna_counts) obj.rna[["percent.mt"]] <- PercentageFeatureSet(obj.rna, pattern = "^MT-")
We can visualize the data quality
# Visualize QC metrics as a violin plot VlnPlot(obj.rna, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0) obj.rna <- subset(obj.rna, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 20) obj.rna
Create Seurat object for scATAC-seq data:
# create seurat object chrom_assay <- CreateChromatinAssay( counts = atac_counts, sep = c(":", "-"), min.cells = 1, genome = 'hg38', fragments = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz' ) obj.atac <- CreateSeuratObject( counts = chrom_assay, assay = "ATAC") # extract gene annotations from EnsDb annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86, verbose = FALSE) # change to UCSC style since the data was mapped to hg38 seqlevelsStyle(annotations) <- 'UCSC' # add the gene information to the object Annotation(obj.atac) <- annotations
Make sure that the two modalities have the same cells:
cell.sel <- intersect(colnames(obj.rna), colnames(obj.atac)) obj.rna <- subset(obj.rna, cells = cell.sel) obj.atac <- subset(obj.atac, cells = cell.sel)
We next process the scRNA-seq and scATAC-seq data using standard Seurat and Signac analysis pipeline:
# normalization followed by dimensionality reduction obj.rna <- obj.rna %>% NormalizeData(verbose=FALSE) %>% FindVariableFeatures(nfeatures=3000, verbose=F) %>% ScaleData() %>% RunPCA(verbose = FALSE) %>% RunUMAP(dims = 1:30, verbose = FALSE) obj.atac <- obj.atac %>% RunTFIDF() %>% FindTopFeatures() %>% RunSVD() %>% RunUMAP(reduction = 'lsi', dims = 2:30, verbose = FALSE)
Visualize the scRNA-seq and scATAC-seq separately.
p1 <- DimPlot(obj.rna) + NoLegend() p2 <- DimPlot(obj.atac) + NoLegend() p1 + p2
We next will annotate the cell types in our multimodal data via label transfer approach implemented by Seurat. The reference dataset is found here
Download data
download.file(url = 'https://costalab.ukaachen.de/open_data/scMEGA/10x_multiome/seurat.rds', destfile = './10x_pbmc/seurat.rds', method = 'wget', extra = '--no-check-certificate')
Load data
reference <- readRDS("./10x_pbmc/seurat.rds") reference <- UpdateSeuratObject(object = reference) reference[["RNA"]] <- as(reference[["RNA"]], "Assay5") # we only keep cells with annotated cell type reference <- subset(reference, subset = celltype != "NA") # run sctransform reference <- reference %>% NormalizeData(verbose=FALSE) %>% FindVariableFeatures(nfeatures=3000, verbose=F) %>% ScaleData() %>% RunPCA(verbose = FALSE) %>% RunUMAP(dims = 1:30, verbose = FALSE)
p1 <- DimPlot(reference, label = TRUE, repel = TRUE, reduction = "umap", group.by = "broad_celltype") + NoLegend() p2 <- DimPlot(reference, label = TRUE, repel = TRUE, reduction = "umap", group.by = "celltype") + NoLegend() p1 + p2
Transfer cell type labels from reference to query:
transfer_anchors <- FindTransferAnchors( reference = reference, query = obj.rna, reference.reduction = "pca", dims = 1:30 ) predictions <- TransferData( anchorset = transfer_anchors, refdata = reference$celltype, weight.reduction = obj.rna[['pca']], dims = 1:30 ) obj.rna <- AddMetaData( object = obj.rna, metadata = predictions ) obj.atac <- AddMetaData( object = obj.atac, metadata = predictions )
Visualize the predicted cell types:
p1 <- DimPlot(obj.rna, label = TRUE, repel = TRUE, reduction = "umap", group.by = "predicted.id") + NoLegend() p2 <- DimPlot(obj.atac, label = TRUE, repel = TRUE, reduction = "umap", group.by = "predicted.id") + NoLegend() p1 + p2
We next need to project the cells into a low-dimensional space. Here we can use MOJITOO to do this job.
Let's first create a single Seurat object including both modalities and predicted cell types that we generated in the above step:
meta.data <- obj.rna@meta.data %>% as.data.frame() # create a Seurat object containing the RNA adata pbmc <- CreateSeuratObject( counts = GetAssayData(obj.rna, layer = "counts"), assay = "RNA", meta.data = meta.data ) # create ATAC assay and add it to the object pbmc[["ATAC"]] <- CreateChromatinAssay( counts = GetAssayData(obj.atac, layer = "counts",), sep = c(":", "-"), min.cells = 1, genome = 'hg38', fragments = './10x_pbmc/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz' )
MOJITOO takes as input the individual low-dimensional space for each modality, so here we do this again:
## RNA pre-processing and PCA dimension reduction DefaultAssay(pbmc) <- "RNA" pbmc <- pbmc %>% NormalizeData(verbose=F) %>% FindVariableFeatures(nfeatures=3000, verbose=F) %>% ScaleData(verbose=F) %>% RunPCA(npcs=50, reduction.name="RNA_PCA", verbose=F) ## ATAC pre-processing and LSI dimension reduction DefaultAssay(pbmc) <- "ATAC" pbmc <- pbmc %>% RunTFIDF(verbose=F) %>% FindTopFeatures(min.cutoff = 'q0', verbose=F) %>% RunSVD(verbose=F)
Run MOJITOO:
pbmc <- mojitoo( object = pbmc, reduction.list = list("RNA_PCA", "lsi"), dims.list = list(1:50, 2:50), ## exclude 1st dimension of LSI reduction.name = 'MOJITOO', assay = "RNA" )
We can generate another UMAP representation based on the MOJITOO results:
DefaultAssay(pbmc) <- "RNA" embedd <- Embeddings(pbmc[["MOJITOO"]]) pbmc <- RunUMAP(pbmc, reduction="MOJITOO", reduction.name="MOJITOO_UMAP", dims=1:ncol(embedd), verbose=F) DimPlot(pbmc, group.by = "predicted.id", shuffle = TRUE, label = TRUE, reduction = "MOJITOO_UMAP") + NoLegend()
We next infer a trajectory from naive CD4 T cells to memory CD4 T cells to characterize CD4+ T cell activation.
pbmc <- AddTrajectory(object = pbmc, trajectory = c("naive CD4 T cells", "memory CD4 T cells"), group.by = "predicted.id", reduction = "MOJITOO_UMAP", dims = 1:2, use.all = FALSE) # we only plot the cells that are in this trajectory pbmc.t.cells <- pbmc[, !is.na(pbmc$Trajectory)]
The results can be visualized as:
p1 <- DimPlot(object = pbmc.t.cells, group.by = "predicted.id", reduction = "MOJITOO_UMAP", label = TRUE) + NoLegend() p2 <- TrajectoryPlot(object = pbmc.t.cells, reduction = "MOJITOO_UMAP", continuousSet = "blueYellow", size = 1, addArrow = FALSE) p1 + p2
We next select candidate TFs and genes for building a meaningful gene regulatory network.
To identify potential regulator (i.e., TFs), we first estimate an acitivty score for each TF in each cell. This is done by first performing motif matching and then computing deviation scores using chromVAR.
# Get a list of motif position frequency matrices from the JASPAR database pfm <- getMatrixSet( x = JASPAR2020, opts = list(collection = "CORE", tax_group = 'vertebrates', all_versions = FALSE) ) # add motif information pbmc.t.cells <- AddMotifs( object = pbmc.t.cells, genome = BSgenome.Hsapiens.UCSC.hg38, pfm = pfm, assay = "ATAC" ) # run chromVAR pbmc.t.cells <- RunChromVAR( object = pbmc.t.cells, genome = BSgenome.Hsapiens.UCSC.hg38, assay = "ATAC" ) sel.tfs <- SelectTFs(object = pbmc.t.cells, return.heatmap = TRUE, cor.cutoff = 0.4) df.cor <- sel.tfs$tfs ht <- sel.tfs$heatmap draw(ht)
sel.genes <- SelectGenes(object = pbmc.t.cells, labelTop1 = 0, labelTop2 = 0) df.p2g <- sel.genes$p2g ht <- sel.genes$heatmap draw(ht)
We here will try to predict a gene regulatory network based on the correlation of the TF binding activity as estimated from snATAC-seq and gene expression as measured by snRNA-seq along the trajectory.
tf.gene.cor <- GetTFGeneCorrelation(object = pbmc.t.cells, tf.use = df.cor$tfs, gene.use = unique(df.p2g$gene), tf.assay = "chromvar", gene.assay = "RNA", trajectory.name = "Trajectory")
We can then visualize this correlation matrix by heatmap. Also, we can cluster the genes and TFs to identify different regulatory modules for the predefined sub-populations.
ht <- GRNHeatmap(tf.gene.cor, tf.timepoint = df.cor$time_point) ht
To associate genes to TFs, we will use the peak-to-gene links and TF binding sites information. Specifically, if a gene is regulated by a peak and this peak is bound by a TF, then we consider this gene to be a target of this TF.
motif.matching <- pbmc.t.cells@assays$ATAC@motifs@data colnames(motif.matching) <- pbmc.t.cells@assays$ATAC@motifs@motif.names motif.matching <- motif.matching[unique(df.p2g$peak), unique(tf.gene.cor$tf)] df.grn <- GetGRN(motif.matching = motif.matching, df.cor = tf.gene.cor, df.p2g = df.p2g)
Finally, we can visualize our network as the last step of this analysis
# define colors for nodes representing TFs (i.e., regulators) df.cor <- df.cor[order(df.cor$time_point), ] tfs.timepoint <- df.cor$time_point names(tfs.timepoint) <- df.cor$tfs # plot the graph, here we can highlight some genes df.grn2 <- df.grn %>% subset(correlation > 0.5) %>% select(c(tf, gene, correlation)) %>% rename(weights = correlation) p <- GRNPlot(df.grn2, tfs.timepoint = tfs.timepoint, show.tf.labels = TRUE, seed = 42, plot.importance = FALSE, min.importance = 2, remove.isolated = FALSE) print(p)
Once we generated the gene regulatory network, we can visualize individual TFs in terms of binding activity, expression, and target expression along the pseudotime trajectory.
Here we select two TFs for visualization.
pbmc.t.cells <- AddTargetAssay(object = pbmc.t.cells, df.grn = df.grn2) p1 <- PseudotimePlot(object = pbmc.t.cells, tf.use = "SOX4") p2 <- PseudotimePlot(object = pbmc.t.cells, tf.use = "TBX21") p1 + p2
The x-axis in above plots present pseudotime point along the trajectory, and the y-axis represent TF binding acitivty, TF expression, and TF target expression after z-score transformation.
# Check session information sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.