In KChen-lab/bindSC: Bi-order INtegration of Data from Single Cell multi-omics technologies \

knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

This guide will demonstrate the usage of bindSC to jointly define cell types by leveraging mulitple single-cell modalities. Here we define cell types that incorporate both gene expression and chromatin accessibility data. The dataset is generated from the 10x multiOme of mouse retina cell altals. The dataset will be provided soon.

Stage 1: Load the scRNA-seq and scATAC-seq data with seurat object

library(bindSC)
rna <- readRDS(file="./../../data/retina_10x.rna.RDS")
atac <- readRDS(file="./../../data/retina_10x.atac.RDS")

We gate BC cell type only for demonstration

cell_sel <- rownames(rna@meta.data)[grepl("BC",rna@meta.data$celltype)]
rna <- subset(rna, cells = cell_sel)
atac <- subset(atac, cells = cell_sel)
rna <- FindVariableFeatures(rna, nfeatures = 5000)
DefaultAssay(atac) <- "ACTIVITY"
atac <- FindVariableFeatures(atac, nfeatures = 5000)
gene.use <- intersect(VariableFeatures(rna), 
                      VariableFeatures(atac))
X <- rna[["RNA"]][gene.use,]
Y <- atac[["ATAC"]][]
Z0 <- atac[["ACTIVITY"]][gene.use]
type <- c(rep("RNA", ncol(X)), rep("ATAC", ncol(X)))

a <- rowSums(as.matrix(Y))
Y <- Y[a>50,]

We then visualize each cell, colored by cell type, from two technologies. Left is UMAP from snRNA-seq, middle is UMAP from snATAC-seq (gene-activity based),and right is from snATAC-seq (peak based).

library(Seurat)
library(ggplot2)
library(ggrepel)

DefaultAssay(atac) <- "ATAC"
atac <- RunLSI(atac, n =50,features = rownames(atac)[a>50])
atac <- FindNeighbors(atac, dims = 1:20, reduction = "lsi")
atac <- FindClusters(atac, resolution = 0.5)
atac <- RunUMAP(atac, reduction = "lsi", dims = 1:15)


DefaultAssay(atac) <- "ACTIVITY"

atac <- RunPCA(atac,npcs = 50)
atac <- FindNeighbors(atac, dims = 1:20, reduction = "lsi")
atac <- FindClusters(atac, resolution = 0.5)
atac <- RunUMAP(atac, reduction = "pca", dims = 1:15, reduction.name="gene.umap")


b <- X
rownames(b)<-rownames(X)
colnames(b) <-colnames(X)
rna1 <- CreateSeuratObject(counts = b)
rna1 <- ScaleData(rna1)
rna1 <- RunPCA(rna1,npcs = 50, features = rownames(rna1))
rna1 <- FindNeighbors(rna1, dims = 1:20, reduction = "pca")
rna1 <- FindClusters(rna1, resolution = 0.5)
rna1 <- RunUMAP(rna1, reduction = "pca", dims = 1:30, reduction.name="gene.umap")
rna1@meta.data$celltype  <- rna@meta.data$celltype
Idents(rna1) <- rna1@meta.data$celltype


n <- nrow(rna1@meta.data)

plt_dt <- rbind(rna1@reductions$gene.umap@cell.embeddings ,
                atac@reductions$gene.umap@cell.embeddings,
                atac@reductions$umap@cell.embeddings
                )
plt_dt <- as.data.frame(plt_dt)
plt_dt$data <- c(rep("snRNA",n ), rep("snATAC (gene-based)",n ), rep("snATAC (peak-based)", n))
plt_dt$cluster<-c(rna1@meta.data$celltype, rna1@meta.data$celltype,rna1@meta.data$celltype)


colnames(plt_dt)<-c("UMAP1","UMAP2", "data", "cluster")
plt_dt$data<-factor(plt_dt$data,levels=c("snRNA","snATAC (gene-based)",
                                         "snATAC (peak-based)"))

label_pos<-aggregate(. ~ cluster + data, plt_dt[,c("cluster","data",  "UMAP1","UMAP2")], median) 
library(ggplot2)     
p <- ggplot(plt_dt, aes(x = UMAP1, y = UMAP2,  color = cluster)) + 
geom_point(alpha = 0.5, size =0.25)   + 
theme(plot.title = element_text(size = 40, face = "bold")) + facet_wrap(~data, scales = "free") + 
scale_colour_manual(values = paletteDiscrete(plt_dt$cluster)) + 
geom_text_repel(data = label_pos, repel = TRUE,
                    aes(label = cluster), color="black", fontface="bold",
                    alpha = 0.75,box.padding = 0.5, point.padding = 0.1) + 
    NoLegend() + theme(axis.text=element_blank(), axis.title=element_blank(),
                       axis.ticks=element_blank()) +theme_classic()

p

Generally, you can run bindSC using full datasets. However, the feature dimension is high for single cell epigenetic profiles (for example, >100k peaks for scATAC-seq data). Here we use low-dimension representations rather than the orignal matrixs for bindSC alignment. You should perform dimension reductions (SVD) on X and Z0 jointly. This will take ~2 mins

out <- dimReduce( dt1 =  X, dt2 = Z0,  K = 30)
x <- out$dt1
z0 <- out$dt2
y  <- atac@reductions$lsi@cell.embeddings

Stage 3: Parameter optimization [optional]

There are two key parameters that may influence the integraton results: 1) lambda modality weighting factor; 2) alpha weighting factor of initilized gene score matrix. The parameter optimization step will run bindS alignment with 0<lambda<1 and 0<alpha<1. THis may take a long time if the sampe size is large.We found the defualt settings with lambda = 0.5 and alpha = 0.5 work well on most integration tasks. You can skip the parameter optimization step if you want to use the default settings.

To run the parameter optimization step, you need to prepare for the cluster annotaion information for each dataset. I will skip this step and use default parameters in bindSC.

x.clst <- rna@meta.data$celltype
y.clst <- atac@meta.data$ATAC_snn_res.0.5
run = FALSE
if(run){
  paraSel <- BiCCA_para_opt( X = x1 ,
               Y = y, 
               Z0 =z0, 
               tp="out",
               X.clst = x.clst,
               Y.clst = y.clst,
               alpha.lst = seq(0,1,0.1), 
               K.lst = c(15),
               lambda.lst = seq(0,1,0.1),
               num.iteration = 50,
               tolerance = 0.01,
               save = TRUE,
               block.size = 1000
  )
  p1 <- paraSel_plot(paraSel)
  p1
}

Stage 4: Run bindSC

We use defualt settings with lambda = 0.5 and alpha = 0.5. The iteration process will be done in ~20s.

res <- BiCCA( X = t(x) ,
             Y = t(y), 
             Z0 =t(z0), 
             X.clst = x.clst,
             Y.clst = y.clst,
             alpha = 0.5, 
             lambda = 0.5,
             K = 15,
             temp.path  = "out",
             num.iteration = 50,
             tolerance = 0.01,
             save = TRUE,
             parameter.optimize = FALSE,
             block.size = 0)

Check the iteration process using plotIteration function (optional). The first row shows the objective function costs for 3 terms separately. The fourth figure shows the total objective function over iteration time. The fifth figure figure shows the relative change of gene score matrix Z over iteration time.

p2 <- plotIteration(res)
p2

Stage 4: Project cells from two modalites in co-embedding spaces.

res$uandres$r` denote the coordinates of two modalites in co-embeddings.

dim(res$u)
dim(res$r)
umap_plt <- umap(rbind(res$u, res$r))
umap_plt  <- data.frame("UMAP1"=umap_plt$layout[,1],
                        "UMAP2"=umap_plt$layout[,2],
                        "celltype" = c(x.clst, x.clst),
                        "data" = c(rep("scRNA-seq",length(x.clst)),
                                   rep("scATAC-seq",length(y.clst))))

xlim <- c(min(umap_plt$UMAP1), max(umap_plt$UMAP1))
ylim <- c(min(umap_plt$UMAP2), max(umap_plt$UMAP2))

p11 <- UMAP_plot(meta = umap_plt[umap_plt$data=="scRNA-seq",],mylabel=paletteDiscrete(umap_plt$celltype),
                color = "celltype", xlim = xlim, ylim = ylim,alpha = 1) + ggtitle("snRNA-seq")
p12 <- UMAP_plot(meta = umap_plt[umap_plt$data=="scATAC-seq",], mylabel=paletteDiscrete(umap_plt$celltype),
              color = "celltype", xlim = xlim, ylim = ylim,alpha = 1)  + ggtitle("snATAC-seq")

p3 <- ggarrange(p11,p12, ncol=2)
print(p3)

Stage 4: Impute gene expression profiles for cells from scATAC.

We can also show how bindSC improves the gene score matrix after iteration process using co-assayed cells as validation.

Z_impu <- impuZ(X=rna[["RNA"]][gene.use,], bicca = res)
p5 <- plot_geneScoreChange(X=rna[["RNA"]][gene.use,], Z0 = atac[["ACTIVITY"]][gene.use,], Z_impu = Z_impu)


p5

sessionInfo()

KChen-lab/bindSC documentation built on Sept. 29, 2022, 4:24 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

KChen-lab/bindSC
Bi-order INtegration of Data from Single Cell multi-omics technologies \

In KChen-lab/bindSC: Bi-order INtegration of Data from Single Cell multi-omics technologies \

Stage 1: Load the scRNA-seq and scATAC-seq data with seurat object

Stage 3: Parameter optimization [optional]

Stage 4: Run bindSC

Stage 4: Project cells from two modalites in co-embedding spaces.

Stage 4: Impute gene expression profiles for cells from scATAC.

R Package Documentation

Browse R Packages

We want your feedback!

KChen-lab/bindSC Bi-order INtegration of Data from Single Cell multi-omics technologies \

In KChen-lab/bindSC: Bi-order INtegration of Data from Single Cell multi-omics technologies \

Stage 1: Load the scRNA-seq and scATAC-seq data with seurat object

Stage 3: Parameter optimization [optional]

Stage 4: Run bindSC

Stage 4: Project cells from two modalites in co-embedding spaces.

Stage 4: Impute gene expression profiles for cells from scATAC.

R Package Documentation

Browse R Packages

We want your feedback!

KChen-lab/bindSC
Bi-order INtegration of Data from Single Cell multi-omics technologies \