#' @include utils.R
#'
NULL
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Main Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' scPSM integration
#'
#' Correct for batch effects, impute dropouts, and denoise data
#' using the propensity score matching method for single-cell RNA-sequencing data.
#'
#' @param batches One or more log-expression matrices where genes correspond to rows and cells correspond to columns.
#' Each matrix should contain the same number of rows, corresponding to the same genes in the same order.
#' Each matrix represents a batch.
#' @param markers A vector specifying which features used as marker genes to compute propensity scores.
#' @param hvg A vector specifying which features used as HVGs for identifying MNN group.
#' @param k.self An integer scalar specifying the number of nearest neighbors in searching KNNs.
#' @param k.mnn An integer scalar specifying the number of nearest neighbors in matching MNN pairs.
#' @param correct.all A logical scalar specifying whether correction should be applied to all genes, even if only a subset is used for the MNN group identification.
#' @param merge.order An integer vector containing the linear merge order of batches.
#'
#' @importFrom Matrix t
#' @importFrom stats predict glm formula binomial
#' @importFrom BiocNeighbors queryKNN KmknnParam
#' @importFrom BiocParallel SerialParam
#' @importFrom S4Vectors DataFrame
#' @export
#'
#' @examples
#' #Please see the tutorial for the preprocessing and then run the following code
#' #psm.data <- psm_integrate(batches = batches, markers = markers, hvg = hvg, merge.order = 1:4)
psm_integrate <- function(batches, markers, hvg, k.self=10, k.mnn=10,
correct.all=TRUE, merge.order=1:4)
{
prep.out <- .prepare_input_data(batches=batches, cos.norm.in=T, cos.norm.out=T,
subset.row=hvg, correct.all=T)
in.batches <- prep.out$In # matrices of hvgs
out.batches <- prep.out$Out # matrices of all genes
subset.row <- prep.out$Subset # position of hvgs
same.set <- prep.out$Same
in.batches <- lapply(in.batches, t)
if (!same.set) {
out.batches <- lapply(out.batches, t)
}
restrict <- NULL
merge.tree <- .create_tree_predefined(in.batches, restrict, merge.order)
NEXT <- .get_next_merge
UPDATE <- .update_tree
# Putting the out.batches information in the 'extras'.
merge.tree <- .add_out_batches_to_tree(merge.tree, if (same.set) NULL else out.batches)
nbatches <- length(unlist(merge.tree))
nmerges <- nbatches - 1L
mnn.pairings <- left.set <- right.set <- vector("list", nmerges)
for (mdx in seq_len(nmerges)) {
# Traversing the merge tree to find the next two batches to merge.
next.step <- NEXT(merge.tree) # list of nmerges
left <- next.step$left # Large MNN_treenode
right <- next.step$right # Large MNN_treenode
left.data <- .get_node_data(left) # dgCMatrix of cell * hvgs
left.restrict <- .get_node_restrict(left) # NULL
left.index <- .get_node_index(left) # 1L
left.origin <- .get_node_origin(left) # seq of 1
left.extras <- .get_node_extras(left)[[1]] # dgCMatrix of cell * genes
right.data <- .get_node_data(right)
right.restrict <- .get_node_restrict(right)
right.index <- .get_node_index(right)
right.origin <- .get_node_origin(right)
right.extras <- .get_node_extras(right)[[1]]
# Really no point being too cute here with other matrix representations,
# there are coercions for the NN search _and_ for the dense per-gene output.
left.data <- as.matrix(left.data)
right.data <- as.matrix(right.data)
# Find the k-nearest neighbors in data for each point in query
W_right <- queryKNN(right.data, query=right.data, k=k.self,
BNPARAM=KmknnParam(), BPPARAM=SerialParam(), get.distance=FALSE)
# Find mnn pairs using hvgs
mnn.sets <- .restricted_mnn(left.data, left.restrict, right.data, right.restrict,
k=k.mnn, prop.k=NULL, BNPARAM=KmknnParam(), BPPARAM=SerialParam())
s1 <- mnn.sets$first
s2 <- mnn.sets$second
mnn.pairings[[mdx]] <- DataFrame(left=s1, right=s2) # DF of mnn pairs
left.set[[mdx]] <- left.index
right.set[[mdx]] <- right.index
# calculate corrected right. data and extras
# to project right to left, for each cell in right, find mnn pairs in left to cells of knn pairs in right
n_left <- nrow(left.data)
n_right <- nrow(right.data)
parings <- lapply(1:n_right, function(x){
subset(as.data.frame(mnn.pairings[[mdx]]), right %in% W_right[["index"]][x,])})
# calculate propensity scores and weights
# left - comparison, right - treatment, project treat to comparison
batch.id <- rep(c(0,1), c(n_left, n_right))
if (correct.all) {
raw <- rbind(left.extras, right.extras)
} else {
raw <- rbind(left.data, right.data)
}
psm.data <- as.matrix(raw)[, markers]
psm.data <- cbind(as.vector(batch.id), psm.data)
colnames(psm.data)[1] <- "treat"
# for all cells in right batch
psm.datas <- lapply(1:n_right, function(x){
tmp0=as.data.frame(psm.data[c(parings[[x]]$left, parings[[x]]$right+n_left),])
#tmp0$treat=factor(tmp0$treat)
return(tmp0)
})
psm.logit <- lapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
glm(formula(psm.datas[[x]]),
data = psm.datas[[x]],
family = binomial)
} else NULL
})
w.logit <- lapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
#tmp0=psm.datas[[x]]
exp(predict(psm.logit[[x]],subset(psm.datas[[x]],treat==0)))
} else NULL
})
w.logit <- lapply(w.logit, function(x) replace(x, !is.finite(x), 1))
#save(psm.logit,psm.datas,n_right,w.logit,parings,left.extras,left.data,right.data,file="~/Desktop/test_00.RData")
# Applying the correction and expanding the reference batch.
right.extras.cor <- sapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
Matrix::colSums(w.logit[[x]]*left.extras[parings[[x]]$left, ])/sum(w.logit[[x]])
} else {
right.extras[x,]
}
})
right.data.cor <- sapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
Matrix::colSums(w.logit[[x]]*left.data[parings[[x]]$left, ])/sum(w.logit[[x]])
} else {
right.data[x,]
}
})
right.extras.cor[is.na(right.extras.cor)] <- 0 # matrix of hvgs*cells
right.data.cor[is.na(right.data.cor)] <- 0 # matrix of genes*cells
colnames(right.extras.cor) <- row.names(right.extras)
colnames(right.data.cor) <- row.names(right.data)
merge.tree <- UPDATE(merge.tree, next.step$chosen,
data=rbind(left.data, t(right.data.cor)),
index=c(left.index, right.index),
restrict=.combine_restrict(left.data, left.restrict, t(right.data.cor), right.restrict),
origin=c(left.origin, right.origin),
extras=list(rbind(left.extras, t(right.extras.cor))))
}
if (!correct.all) {
full.data <- .get_node_data(merge.tree)
} else {
full.data <- .get_node_extras(merge.tree)[[1]]
}
full.data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.