#' Integrate data
#'
#' This function wraps a number of functions from the Seurat package.
#' It integrates and projects the single-cell multi-omics (e.g., scRNA-seq
#' and scATAC-seq) data into a common low-dimensional space via CCA approach.
#' The details can be found:
#' \url{https://satijalab.org/seurat/articles/seurat5_atacseq_integration_vignette}
#'
#' @param obj.rna A Seurat object including gene expression data
#' @param obj.atac A Seurat object including chromatin accessibility data
#' @param gene.activity A sparse matrix containing gene activity score per cell
#' @param reference.assay The reference assay from gene expression data
#' @param reduction The reduction name used for function FindTransferAnchors.
#' @param weight.reduction Dimensional reduction to use for the weighting anchors
#' @param dims Set of dimensions to use in the anchor weighting procedure.
#' @param verbose Print progress bars and output
#'
#' @return An integrated Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' obj.coembed <- CoembedData(
#' obj.rna = obj.rna,
#' obj.atac = obj.atac,
#' gene.activity = gene.activity,
#' weight.reduction = "harmony"
#' )
#'
#' }
CoembedData <-
function(obj.rna,
obj.atac,
gene.activity,
reference.assay = "RNA",
reduction = "cca",
weight.reduction = NULL,
verbose = TRUE,
dims = 1:30) {
## make sure there are the same number of cells in atac and gene activity data
if (ncol(obj.atac) != ncol(gene.activity)) {
stop("The number of cells in ATAC-seq and Gene activity are not the same!")
}
if (is.null(weight.reduction)) {
stop("Please specify dimensional reduction to use for the weighting anchors!")
}
gene.use <- intersect(rownames(gene.activity),
rownames(obj.rna))
n_genes <- length(gene.use)
message(glue::glue("Find {n_genes} common genes between ATAC and RNA"))
message("Subset ATAC and RNA data")
obj.atac[['GeneActivity']] <-
CreateAssayObject(counts = gene.activity[gene.use,])
DefaultAssay(obj.atac) <- "GeneActivity"
obj.rna <- subset(obj.rna, features = gene.use)
message("Normalize gene activity score for ATAC data")
obj.atac <- obj.atac %>%
NormalizeData(verbose = verbose) %>%
ScaleData(verbose = verbose)
message("Performing data integration using Seurat...")
transfer.anchors <- FindTransferAnchors(
reference = obj.rna,
query = obj.atac,
features = gene.use,
reference.assay = reference.assay,
query.assay = "GeneActivity",
reduction = reduction,
verbose = verbose
)
# we here restrict the imputation to the selected genes
refdata <- LayerData(obj.rna, assay = reference.assay, layer = "data")
# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.
# imputation (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
obj.atac[["RNA"]] <- TransferData(
anchorset = transfer.anchors,
refdata = refdata,
weight.reduction = obj.atac[[weight.reduction]],
dims = dims
)
DefaultAssay(obj.atac) <- "RNA"
obj.rna$tech <- "RNA"
obj.atac$tech <- "ATAC"
# merge the objects
coembed <- merge(x = obj.atac, y = obj.rna)
coembed <- JoinLayers(coembed)
# Finally, we run PCA and UMAP on this combined object,
# to visualize the co-embedding of both datasets
message("Coemebdding the data...")
coembed <- coembed %>%
ScaleData(do.scale = FALSE) %>%
RunPCA(verbose = verbose) %>%
RunUMAP(dims = 1:30, verbose = verbose)
return(coembed)
}
#' Pair the cells
#'
#' This function will create the pairs of cells between different modalities based on
#' the integrated data generated by \code{\link{CoembedData}}.
#' It was modified from \url{https://github.com/buenrostrolab/stimATAC_analyses_code/blob/master/R/optMatching_functions.R}
#' to use Seurat object as input. \cr
#' Author: Vinay Kartha, Yan Hu \cr
#' Contact: <vinay_kartha@g.harvard.edu> \cr
#' Affiliation: Buenrostro Lab, Department of Stem Cell and Regenerative Biology, Harvard University \cr
#'
#' @param object The integrated Seurat object generated by the function CoembedData
#' @param reduction Dimensional reduction to use for the pairing cells
#' @param pair.by Name of one metadata column to split the object for pairing;
#' @param ident1 Specify how to split the object, must be an value of pair.by
#' @param ident2 Specify how to split the object, must be an value of pair.by
#' @param assay Assay name based on which a KNN graph is constructed
#' @param pair.mode Pair mode. Available options are: "greedy" and "geodesic".
#' @param tol Tol times the number of subjects to be matched specifies the extent to which fullmatch's
#' @param search.range This determines the size of the search knn. search_range * total number of cells = size of knn.
#' @param max.multimatch Maximum number of cells allowed to be matched to each cell
#' @param min_subgraph_size Minimum number of cells (ATAC/RNA each) needed for
#' pairing in a given subgraph. Will skip subgraphs with fewer than these cells
#' @param seed Random seed
#' @param k k-NN parameter used for applying constraints on ATAC-RNA pairs
#' @return A data frame containing the cell pairs
#' @export
#'
#' @examples
#' \dontrun{
#' df.pair <- PairCells(
#' object = obj.coembed,
#' reduction = "harmony",
#' pair.by = "tech",
#' ident1 = "ATAC",
#' ident2 = "RNA"
#' )
#' }
PairCells <- function(object,
reduction = NULL,
pair.by = NULL,
ident1 = "ATAC",
ident2 = "RNA",
assay = "RNA",
pair.mode = "greedy",
tol = 0.0001,
search.range = 0.2,
max.multimatch = 5,
min_subgraph_size = 50,
seed = 42,
k = 300) {
if (is.null(reduction)) {
stop("Please specify dimensional reduction to use for the pairing cells!")
}
if (is.null(pair.by)) {
stop("Please specify how to split the data for pairing!")
}
message("Getting dimensional reduction data for pairing cells...")
obj.1 <- object[, object@meta.data[[pair.by]] == ident1]
obj.2 <- object[, object@meta.data[[pair.by]] == ident2]
embedding.atac <-
Seurat::Embeddings(object = obj.1, reduction = reduction)
embedding.rna <-
Seurat::Embeddings(object = obj.2, reduction = reduction)
embedding <- rbind(embedding.atac, embedding.rna)
n.cells <- dim(embedding)[1]
n.cells.atac <- dim(embedding.atac)[1]
n.cells.rna <- dim(embedding.rna)[1]
message(glue::glue("Number of ATAC cells: {n.cells.atac}, number of RNA cells: {n.cells.rna}"))
# release memory
rm(obj.1)
rm(obj.2)
gc()
message(glue::glue("Pairing cells using {pair.mode} mode..."))
if (pair.mode == "geodesic") {
message("Constructing KNN graph for computing geodesic distance ..")
optmatch::setMaxProblemSize(size = Inf)
object <-
Seurat::FindNeighbors(object,
reduction = reduction,
assay = assay,
verbose = FALSE)
adjmatrix <- object@graphs$RNA_nn
diag(adjmatrix) <- 0
knn.graph <-
igraph::graph_from_adjacency_matrix(adjmatrix = adjmatrix,
mode = "max",
weighted = NULL)
message("Computing graph-based geodesic distance ..")
# Compute shortest paths between all cell pairs.
# We checked that this returns a symmetric matrix
shortest.paths <- igraph::shortest.paths(knn.graph)
# Find connected subgraphs
sub.graphs <- igraph::clusters(knn.graph)
message(
glue::glue(
"# KNN subgraphs detected: {length(unique(sub.graphs$membership))}"
)
)
object$subgraph <- sub.graphs$membership
all.pairs <- NULL
message("Skipping subgraphs with either ATAC/RNA cells fewer than: ",
min_subgraph_size)
# Go through each subgraph
for (sub.graph.idx in unique(sub.graphs$membership)) {
message("Pairing cells for subgraph No.", sub.graph.idx)
# Retrieve the subgraph
sub.graph.nodes <- sub.graphs$membership == sub.graph.idx
knn.sub.graph <-
igraph::induced_subgraph(knn.graph, which(sub.graph.nodes))
# Use down-sampling to make sure in this subgraph the number of ATAC and RNA cells are balanced
subgraph_cells <- colnames(object)[sub.graph.nodes]
n_ATAC <-
sum(subgraph_cells %in% rownames(embedding.atac))
n_RNA <- sum(subgraph_cells %in% rownames(embedding.rna))
message("Total ATAC cells in subgraph: ", n_ATAC)
message("Total RNA cells in subgraph: ", n_RNA)
embedding.atac.sub <-
embedding.atac[sub.graph.nodes[1:dim(embedding.atac)[1]], ]
embedding.rna.sub <-
embedding.rna[sub.graph.nodes[(dim(embedding.atac)[1] + 1):n.cells],]
if (n_ATAC > n_RNA) {
set.seed(seed)
embedding.atac.sub <-
embedding.atac.sub[sample(1:n_ATAC, n_RNA, replace = FALSE), ]
} else if (n_ATAC < n_RNA) {
set.seed(seed)
embedding.rna.sub <-
embedding.rna.sub[sample(1:n_RNA, n_ATAC, replace = FALSE), ]
}
if (is.null(nrow(embedding.atac.sub)) |
is.null(nrow(embedding.rna.sub))) {
message("Down-sampling within subgraph between assays led to 0 cells in one assay..")
message("Skipping current subgraph")
next
}
# Subset the original geodesic distance matrix to get the geodesic distance matrix for the subgraph
subgraph_geodist <-
shortest.paths[match(rownames(embedding.atac.sub), rownames(embedding)),
match(rownames(embedding.rna.sub), rownames(embedding))]
subgraph_size <- dim(subgraph_geodist)[1]
message("Subgraph size: ", subgraph_size)
# TO AVOID MAJOR SUBGRAPH(S) WERE BEING SKIPPED SOMETIMES
size_threshold <-
ceiling((nrow(embedding.atac.sub) + nrow(embedding.rna.sub)) * search.range)
k_pairing <- size_threshold
message("Search threshold being used: ", k_pairing)
if (subgraph_size < size_threshold |
subgraph_size < min_subgraph_size) {
message("Insufficient number of cells in subgraph. Skipping current subgraph")
next
}
# We also calculate euclidean distance matrix
subgraph_eucdist <-
pracma::distmat(embedding.atac.sub, embedding.rna.sub)
# Find KNN based on geodesic distances.
message("Constructing KNN based on geodesic distance to reduce search pairing search space")
geodist_knn <- array(-1, dim = dim(subgraph_geodist))
for (i in 1:subgraph_size) {
# Find RNA cells in the KNN of each ATAC cell
geodist_threshold <- sort(subgraph_geodist[i,])[k_pairing]
knn_ind <- subgraph_geodist[i, ] < geodist_threshold
geodist_knn[i, knn_ind] <- subgraph_eucdist[i, knn_ind]
# Find ATAC cells in the KNN of each RNA cell
geodist_threshold <- sort(subgraph_geodist[, i])[k_pairing]
knn_ind <- subgraph_geodist[, i] < geodist_threshold
geodist_knn[knn_ind, i] <- subgraph_eucdist[knn_ind, i]
}
# For an ATAC-RNA cell pair, if neither of them are in the KNN of the other, we set their distance to be inf
geodist_knn[geodist_knn < 0] <- Inf
# Add rownames to the matrix
rownames(geodist_knn) <- paste0("ATAC_", 1:subgraph_size)
colnames(geodist_knn) <- paste0("RNA_", 1:subgraph_size)
message(paste(
"Number of cells being paired:",
dim(geodist_knn)[1],
"ATAC and",
dim(geodist_knn)[1],
" RNA cells"
))
message("Determing pairs through optimized bipartite matching ..")
cell_matches <-
suppressWarnings(
optmatch::fullmatch(
optmatch::as.InfinitySparseMatrix(as.matrix(geodist_knn)),
tol = tol,
min.controls = 1 / max.multimatch,
max.controls = max.multimatch
)
)
pair.list <- GetPairList(cell_matches,
rownames(embedding.atac.sub),
rownames(embedding.rna.sub))
message("Finished!\n")
# Append the results for this subgraph to the list of all results
all.pairs <- rbind(all.pairs, pair.list)
}
} else if(pair.mode == "greedy"){
n_ATAC <- dim(embedding.atac)[1]
n_RNA <- dim(embedding.rna)[1]
if (n_ATAC >= n_RNA){
print("Pairing all RNA cells to nearest ATAC cells")
ATAC_paired <- vector("character", length = n_RNA)
for(i in 1:n_RNA){
query <- t(as.matrix(embedding.rna[i, ]))
pair_knn <- FNN::get.knnx(data = embedding.atac,
query = query,
k = 1)
ATAC_paired[i] <- rownames(embedding.atac)[pair_knn$nn.index]
embedding.atac <- embedding.atac[-pair_knn$nn.index, ]
}
RNA_paired <- rownames(embedding.rna)
} else{
print("Pairing all ATAC cells to nearest RNA cells")
RNA_paired <- vector("character", length = n_ATAC)
for(i in 1:n_ATAC){
query <- t(as.matrix(embedding.atac[i, ]))
pair_knn <- FNN::get.knnx(data = embedding.rna,
query = query,
k = 1)
RNA_paired[i] <- rownames(embedding.rna)[pair_knn$nn.index]
embedding.rna <- embedding.rna[-pair_knn$nn.index, ]
}
ATAC_paired <- rownames(embedding.atac)
}
message("Finished!\n")
all.pairs <- data.frame(ATAC=ATAC_paired, RNA=RNA_paired, stringsAsFactors=FALSE)
}
all.pairs$cell_name <- paste0("cell_", 1:nrow(all.pairs))
return(all.pairs)
}
#' Create paired object
#'
#' This function will create a Seurat object including single-cell multimodal
#' data based on the paired cells.
#'
#' @param df.pair A data frame containing the cell pairing results
#' generated by the function \code{\link{PairCells}}.
#' @param object Seurat object generated by the function \code{\link{CoembedData}}
#' @param use.assay1 A string indicating the first assay
#' @param use.assay2 A string indicating the first assay
#' @param sep Separators to use for strings encoding genomic coordinates.
#' First element is used to separate the chromosome from the coordinates,
#' second element is used to separate the start from end coordinate.
#'
#' @return A Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' obj <- CreatePairedObject(
#' df.pair = df.pair,
#' object = obj.coembed,
#' use.assay1 = "RNA",
#' use.assay2 = "ATAC",
#' )
#' }
CreatePairedObject <- function(df.pair,
obj.coembed = NULL,
obj.rna = NULL,
obj.atac = NULL,
rna.assay = NULL,
atac.assay = NULL,
sep = c("-", "-")) {
message("Merging objects...")
rna.counts <-
LayerData(obj.rna, assay = rna.assay, layer = "counts", cells = df.pair$RNA)
atac.counts <-
LayerData(obj.atac, assay = atac.assay, layer = "counts", cells = df.pair$ATAC)
colnames(rna.counts) <- df.pair$cell_name
colnames(atac.counts) <- df.pair$cell_name
# create a Seurat object containing the RNA adata
obj.pair <- CreateSeuratObject(counts = rna.counts,
assay = rna.assay)
# create ATAC assay and add it to the object
obj.pair[[atac.assay]] <-
CreateChromatinAssay(counts = atac.counts,
sep = sep,
min.cells = 10)
for (reduction in names(obj.coembed@reductions)) {
embedding <- Embeddings(
obj.coembed,
reduction = reduction)[df.pair$RNA, ]
rownames(embedding) <- df.pair$cell_name
obj.pair[[reduction]] <- CreateDimReducObject(
embeddings = embedding,
assay = DefaultAssay(obj.pair))
}
# add metadata, here we use the metadata from RNA assay
meta.data <- obj.coembed@meta.data[df.pair$RNA,]
rownames(meta.data) <- df.pair$cell_name
obj.pair <- AddMetaData(obj.pair, metadata = meta.data)
return(obj.pair)
}
#' Get list of paired cells
#'
#' This function takes in the output of the \code{\link{fullmatch}} function and sort the
#' results into a list of ATAC-RNA pairs.
#' It was modified from \url{https://github.com/buenrostrolab/stimATAC_analyses_code/blob/master/R/optMatching_functions.R} \cr
#' Author: Vinay Kartha, Yan Hu \cr
#' Contact: <vinay_kartha@g.harvard.edu> \cr
#' Affiliation: Buenrostro Lab, Department of Stem Cell and Regenerative Biology, Harvard University \cr
#'
#' @param cell_matches The output object of \code{\link{fullmatch}} in the package optmatch
#' @param ATAC_barcodes Barcode of ATAC cells, must match the order of IDs in cell_matches.
#' @param RNA_barcodes Barcode of RNA cells, must match the order of IDs in cell_matches.
#'
#' @export
#'
#' @return A data frame containing the cell pairs
GetPairList <- function(cell_matches,
ATAC_barcodes,
RNA_barcodes)
{
cell_matches <- sort(cell_matches) # Sort to get ATAC, RNA tuples
if (length(cell_matches) == 0) {
stop(
"Matches could not be found .. Perhaps try adjusting the constraints to allow optimal matching to be solved?\n"
)
}
if (any(is.na(cell_matches))) {
warning("NA pairs exist ..\n")
}
# Currently the result list contain cells paired to multiple other cells
# If a cell is paired to k cells, we duplicate this cell k times to make k pairs
# Thus we generate a new list consisting pairs of 1 ATAC - 1 RNA cells
# We also make sure that in a pair, the first ID is ATAC and the second ID is RNA
matches <- character()
pair_ids <- unique(unname(cell_matches))
for (pair_id in 1:length(pair_ids)) {
new_match <-
names(cell_matches[unname(cell_matches) == pair_ids[pair_id]])
new_match_ATAC <-
new_match[splitAndFetch(new_match, "_", 1) == "ATAC"]
new_match_RNA <-
new_match[splitAndFetch(new_match, "_", 1) == "RNA"]
new_match <- vector()
if (length(new_match_ATAC) > length(new_match_RNA)) {
new_match[seq(1, 2 * length(new_match_ATAC), 2)] <- new_match_ATAC
new_match[seq(2, 2 * length(new_match_ATAC), 2)] <-
rep(new_match_RNA, length(new_match_ATAC))
} else {
new_match[seq(1, 2 * length(new_match_RNA), 2)] <-
rep(new_match_ATAC, length(new_match_RNA))
new_match[seq(2, 2 * length(new_match_RNA), 2)] <-
new_match_RNA
}
matches <- c(matches, new_match)
}
# Make sure pair groupings (factors) are adjacent
#stopifnot(all.equal(cell_matches[seq(1,length(cell_matches),2)],cell_matches[seq(2,length(cell_matches),2)],check.attributes=FALSE))
ATAC_IDs <-
matches[seq(1, length(matches), 2)] # Names of ATAC cells
RNA_IDs <-
matches[seq(2, length(matches), 2)] # Names of RNA cells
# Checking if ATAC and RNA tuples are concordant
stopifnot(all(splitAndFetch(ATAC_IDs, "_", 1) %in% "ATAC"))
stopifnot(all(splitAndFetch(RNA_IDs, "_", 1) %in% "RNA"))
# This is just to make sure 1-1, with the names we gave
# (can still comprise actual doublets from upsampling if any)
# stopifnot(all(unique(unique(ATACp))) & all(unique(RNAp)))
# Get corresponding index relative to input matrix order
ATAC_inds <- as.numeric(splitAndFetch(ATAC_IDs, "_", 2))
RNA_inds <- as.numeric(splitAndFetch(RNA_IDs, "_", 2))
matches_mat <-
matrix(c(ATAC_inds, RNA_inds), ncol = 2, byrow = FALSE) # ATAC col1, RNA col2
message("Assembling pair list ..")
# Make data frame of matches
pair_df <- data.frame("ATAC" = ATAC_inds,
"RNA" = RNA_inds)
pair_df <- pair_df %>% arrange(ATAC)
# Convert to labels if they exist
pair_df$ATAC <-
ATAC_barcodes[pair_df$ATAC] # ATAC cell labels
pair_df$RNA <-
RNA_barcodes[pair_df$RNA] # RNA cell labels
pair_df
}
splitAndFetch <- function(vec,
delim,
part) {
if (length(part) == 1) {
sapply(strsplit(as.character(vec), delim, fixed = TRUE), "[[", part)
} else {
sapply(strsplit(as.character(vec), delim, fixed = TRUE), function(x)
paste(x[part], collapse = delim))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.