#' Load the Allen Brain Atlas reference set
#'
#' This function loads the Allen Brain Atlas reference data matrix. Different
#' pre-defined gene lists or sample lists allow prefiltering of the loaded data.
#'
#' @param sampleSet The pre-defined sample list for output.
#' @param geneSet The pre-defined gene list for output.
#' @param geneList The customized gene list for output. When it is NULL, the pre-defined gene list will be used.
#' @param excludeG2M When it is TRUE, G2 and M phrase marker genes will be excluded from the gene list.
#' @param logTransform When it is TRUE, log-transformation with pseudo-count will be applied to the expression matrix (in RPKM).
#' @return A list of the loaded data, in which there are three components: 'expr' is the data matrix, 'rowMeta' is the row (gene) information, and 'colMeta' is the column (reference sample) information.
#' @export
retrieveABARef <- function(sampleSet = c("full","fetal","fetal_before10pcw","fetal_after10pcw"),
geneSet = c("full", "highvar", "fetal_highvar", "region_fetalB10", "region_fetalA10", "region_fetalB10-2", "regionGroup_fetalB10", "regionGroup_fetalA10"),
geneList = NULL,
excludeG2M = TRUE, logTransform = TRUE){
sampleSet <- match.arg(sampleSet)
geneSet <- match.arg(geneSet)
if (sampleSet == "full"){
idxCol <- 1:ncol(RefSimSpec::expr_ABA)
idxRow <- which(RefSimSpec::rows_ABA$expressed_full & RefSimSpec::rows_ABA$gene_type == "protein_coding")
} else if (sampleSet == "fetal"){
idxCol <- which(RefSimSpec::columns_ABA$fetal_before10pcw | RefSimSpec::columns_ABA$fetal_after10pcw)
idxRow <- which(RefSimSpec::rows_ABA$expressed_fetal & RefSimSpec::rows_ABA$gene_type == "protein_coding")
} else{
idxCol <- which(RefSimSpec::columns_ABA[,sampleSet])
idxRow <- which(RefSimSpec::rows_ABA$expressed_fetal & RefSimSpec::rows_ABA$gene_type == "protein_coding")
}
colDat <- RefSimSpec::columns_ABA[idxCol,]
rownames(colDat) <- sprintf("Ref%05d", 1:nrow(colDat))
if (is.null(geneList)) {
if (geneSet == "highvar"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_highvar, method="BH")<0.1)
} else if (geneSet == "fetal_highvar"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_highvar_fetal, method="BH")<0.1)
} else if (geneSet == "region_fetalB10"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_region_b10pcw, method="BH")<0.01 & RefSimSpec::rows_ABA$log2fc_region_b10pcw>1)
} else if (geneSet == "region_fetalA10"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_region_a10pcw)<0.01 & RefSimSpec::rows_ABA$log2fc_region_a10pcw>1)
} else if (geneSet == "region_fetalB10-2"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_region_b10pcw)<0.05 & RefSimSpec::rows_ABA$log2fc_region_b10pcw>1)
} else if (geneSet == "regionGroup_fetalB10"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_regiongroup_b10pcw)<0.01 & RefSimSpec::rows_ABA$log2fc_regiongroup_b10pcw>1)
} else if (geneSet == "regionGroup_fetalA10"){
idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_regiongroup_a10pcw)<0.01 & RefSimSpec::rows_ABA$log2fc_regiongroup_a10pcw>1)
}
} else{
numMatch <- apply(RefSimSpec::rows_ABA[,3:5], 2, function(x){ sum(x %in% geneList) })
if (max(numMatch) < length(geneList) / 2)
stop("Genes matching the ABA reference data set are too few. Make sure that they are in ENSEMBL IDs, gene symbols or Entrez IDs.")
idxRow <- which(RefSimSpec::rows_ABA[,which.max(numMatch) + 2] %in% geneList)
}
if (excludeG2M) idxRow <- setdiff(idxRow, which(RefSimSpec::rows_ABA$G2M))
rowDat <- RefSimSpec::rows_ABA[idxRow,]
dat <- RefSimSpec::expr_ABA[idxRow, idxCol]
colnames(dat) <- rownames(colDat)
if (logTransform)
dat <- log(dat + 1)
res <- list(expr = dat, rowMeta = rowDat, colMeta = colDat)
return(res)
}
#' Calculate RSS matrix
#'
#' This function calculates the Reference Similarity Spectrum (RSS) matrix,
#' based on the given input matrix and the reference data set matrix.
#'
#' @param input The input expression matrix, assuming its rows as genes and columns as samples. Its rows are expected to be named by the same types of gene representative (e.g. ENSEMBL IDs) as ref.
#' @param ref The reference expression matrix, assuming its rows as genes and columns as samples.
#' @param method The type of correlation to calculate. It allows any method used in the cor function.
#' @param scale When it is TRUE, similarities of one sample (cell) in the input matrix across the reference samples are central-scaled transformed.
#' @param threads The number of threads to calculate correlations.
#' @return The RSS matrix, with rows for the input samples and columns for the reference samples.
#' @export
calculateRSS <- function(input, ref, method = "pearson", scale = TRUE, threads = 1){
candidates <- intersect(rownames(input), rownames(ref))
corr <- NA
if(threads == 1){
corr <- cor(input[candidates,], ref[candidates,], method = method)
} else{
library(doParallel)
library(foreach)
registerDoParallel(threads)
corr <- foreach(i = 1:ncol(ref), .combine = cbind, .multicombine = TRUE) %dopar%{
x <- ref[candidates,i]
res <- apply(input[candidates,], 2, function(y){ cor(x,y, method = method) })
return(res)
}
rownames(corr) <- colnames(input)
colnames(corr) <- colnames(ref)
stopImplicitCluster()
}
if (scale){
corr <- t(scale(t(corr)))
}
return(corr)
}
#' Calculate RSS matrix: updated correlation calculation
#'
#' This function calculates the Reference Similarity Spectrum (RSS) matrix,
#' based on the given input matrix and the reference data set matrix.
#'
#' @param input The input expression matrix, assuming its rows as genes and columns as samples. Its rows are expected to be named by the same types of gene representative (e.g. ENSEMBL IDs) as ref.
#' @param ref The reference expression matrix, assuming its rows as genes and columns as samples.
#' @param method The type of correlation to calculate. It allows any method used in the cor function.
#' @param scale When it is TRUE, similarities of one sample (cell) in the input matrix across the reference samples are central-scaled transformed.
#' @return The RSS matrix, with rows for the input samples and columns for the reference samples.
#' @export
calculateRSS_2 <- function(input, ref, method = "pearson", scale = TRUE){
candidates <- intersect(rownames(input), rownames(ref))
corr <- cor(ref[candidates,], input[candidates,], method = method)
if (scale){
corr <- t(scale(t(corr)))
}
return(corr)
}
#' Plot feature's gradients/classes across all samples (cells) given the plotting coordinates
#'
#' This function plot the samples based on the given coordinates, coloring
#' each dot based on the value of its corresponding feature value/class.
#'
#' @param coord The plotting coordinates, expected to be of two columns. Each row represents one dot for one sample (cell).
#' @param value The values to be shown, with each value representing one sample (cell) with the same order as coord.
#' @param emphasize The Indices of samples (cells) to be emphasized. When it is set, the colors of all the other cells are set to be #bdbdbd30.
#' @param col The customized colors of dots. Either col or value must be given.
#' @param ... Other arguments passing to the plot function.
#' @export
plotFeature <- function(coord, values = NULL, emphasize = NULL, col = NULL, colorPal = NULL, edges = NULL, main = NA, xlab = "Dim-1", ylab = "Dim-2", cex.main = 1, cex.lab = 1, cex.axis = 1, lwd.edges = 0.2, col.edges = "#bdbdbd50", cex = 1, ...){
if (is.null(col)){
if (is.numeric(values)){
if(is.null(colorPal)){
colorPal <- grDevices::colorRampPalette(c("darkgreen", "yellow","red"))
}
cellColor <- adjustcolor(colorPal(30), alpha=.8)[as.numeric(cut(values, breaks=30, right=F,include.lowest=T))]
if(min(values, na.rm=T) == 0) cellColor[values == 0] <- "#bdbdbd30"
} else{
cols <- setNames(scales::hue_pal()(length(unique(values))), unique(values))
cellColor <- cols[values]
}
} else{
if (length(col) == 1) col <- rep(col, nrow(coord))
cellColor <- col
}
plot(coord, type="n", main = main, xlab = xlab, ylab = ylab, cex.main = cex.main, cex.lab = cex.lab, cex.axis = cex.axis, ...)
if (! is.null(edges) & (is.matrix(edges) | is.data.frame(edges))){
for(i in 1:nrow(edges)) lines(coord[as.numeric(edges[i,]),1], coord[as.numeric(edges[i,]),2], lwd = lwd.edges, col = col.edges)
}
if (is.null(emphasize)){
points(coord, col = cellColor, pch=16, cex = cex)
} else{
points(coord, col = "#efefef30", pch=16, cex = cex)
points(coord[emphasize, ], col = cellColor[emphasize], pch=16, cex = cex)
}
}
#' Train the model to classify progenitors and neurons
#'
#' This function uses three single cell RNA-seq data sets (Camp, et al. 2015;
#' Darmanis, et al. 2015; Close, et al. 2016) to construct a rank-based LASSO-logistic
#' regression model to classify progenitor cells from neurons given their transcriptome
#' profiles.
#'
#' @param input The input matrix, with its rows named by gene symbol.
#' @param threads The number of threads used to run training.
#' @param numLasso The number of training used to determine genes used in the final model.
#' @param verbose Whether to output the progress information.
#' @return A list with two components: 'gene' for the list of genes to be ranked; 'coefficients' for the model coefficients.
#' @export
retrainProgenitorModel <- function(input, threads = 1, numLasso = 100, verbose = TRUE){
expr <- do.call(cbind, lapply(RefSimSpec::expr_progenitors_neurons, function(x){ x$expr[,x$idx] }))
expr <- expr[intersect(rownames(expr), rownames(input)),]
types <- do.call(c, lapply(RefSimSpec::expr_progenitors_neurons, function(x){ x$types[x$idx] }))
rank <- apply(expr, 2, rank) / nrow(expr)
propTest <- 1/3
trainIdx <- which(runif(ncol(rank)) >= propTest)
expr_train <- rank[,trainIdx]
lab_train <- as.factor(types[trainIdx])
expr_test <- rank[,-trainIdx]
lab_test <- as.factor(types[-trainIdx])
library(glmnet)
library(doParallel)
library(foreach)
registerDoParallel(threads)
models <- foreach(i = 1:numLasso, .combine = list, .multicombine = TRUE) %dopar%{
model <- cv.glmnet(x = t(expr_train), y = lab_train, nfolds = 10, family = "binomial", type.measure = "auc")
return(model)
}
markers <- table(unlist(lapply(models, function(x){ names(which(coefficients(x)[-1,1]!=0)) })))
stopImplicitCluster()
mergedModel <- glm(lab_train ~ t(expr_train[names(markers)[markers>1],]), family = "binomial")
predictModel <- function(model, x){
invLogit <- function(x){ exp(x)/(1+exp(x)) }
markers <- gsub("^t\\(.+\\)", "", names(coefficients(model)[-1]))
invLogit(t(matrix(coefficients(model))) %*% rbind(rep(1,ncol(x)),x[markers,]))
}
resTab_test <- table(predictModel(mergedModel, expr_test) > 0.5, lab_test)
cat("PROGRESS: tested in the test test:\n")
print(resTab_test)
mergedModel <- glm(as.factor(c(lab_train, lab_test)) ~ t(cbind(expr_train[names(markers),], expr_test[names(markers),])), family = "binomial")
coefs <- coefficients(mergedModel)
names(coefs)[-1] <- gsub("^t\\(.+\\)", "", names(coefs)[-1])
res <- list(genes = rownames(expr), coefficients = coefs)
return(res)
}
#' Calculate progenitor scores for each sample (cell)
#'
#' This function uses the predictive model to estimate a progenitor scores
#' for the input samples, which can be used to determine whether the samples
#' (cells) are progenitors or neurons.
#'
#' @param input The input expression matrix, with rows representing genes and columns representing samples (cells)
#' @param nameMap A matrix/data.frame showing the gene name mappings. It is expected to be of multiple columns, one of which is for gene symbols.
#' @param model The model used for progenitor scores estimation. It is expected to be a list with at least two components: 'gene' for the list of genes to be ranked; 'coefficients' for the model coefficients. When it is NULL, the default model is used.
#' @param forceRetrain When it is TRUE, the model will be rebuilt by only considering genes involved in the input matrix.
#' @param returnModel Whether to return the retrained model for progenitor score estimation.
#' @param verbose Whether to output the progress information.
#' @param ... Other arguments passed to the 'retrainProgenitorModel' function.
#' @return A list with two components: 'scores' for the estimated progenitor scores; 'model' for the model predicting the scores (NULL when returnModel is FALSE).
#' @export
calculateProgenitorScore <- function(input, nameMap = NULL, model = NULL, forceRetrain = FALSE, returnModel = TRUE, verbose = TRUE, ...){
if (is.null(model))
model <- RefSimSpec::progenitorModel
rankGenes <- unique(model$genes)
if (length(intersect(rownames(input), rankGenes)) < length(rankGenes) / 2){ # rename the rownames according to nameMap
if (is.null(nameMap))
nameMap <- RefSimSpec::rows_ABA[, 3:5]
numMatches_input <- apply(nameMap, 2, function(x){ sum(rownames(input) %in% x) })
if (max(numMatches_input) < nrow(input) / 2)
stop("Please check the rownames of the input matrix. They do not match with the nameMap table.")
numMatches_symbol <- apply(nameMap, 2, function(x){ sum(rankGenes %in% x) })
if (max(numMatches_symbol) < length(rankGenes) / 2)
stop("Please check the nameMap table. There should be one column of gene symbol.")
nameMap <- data.frame(nameMap[,which.max(numMatches_symbol)], row.names = nameMap[,which.max(numMatches_input)])
input <- input[intersect(rownames(input), rownames(nameMap)),]
rownames(input) <- nameMap[rownames(input),1]
if (verbose) cat("PROGRESS: renamed rows of the input matrix as gene symbol\n")
}
if (length(intersect(rownames(input), rankGenes)) == length(rankGenes) & ! forceRetrain){
rankedInput <- input[rankGenes,]
coefs <- model$coefficients
} else{
if (verbose) cat("PROGRESS: need to re-train the progenitor/neuron classification model...\n")
model <- retrainProgenitorModel(input, verbose = verbose, ...)
rankedInput <- input[model$genes,]
coefs <- model$coefficients
if (verbose) cat("PROGRESS: done model retraining\n")
}
rankedInput <- apply(rankedInput, 2, rank)/nrow(rankedInput)
cat("PROGRESS: gene ranked for all cells\n")
scores <- t(matrix(coefs)) %*% rbind(rep(1,ncol(rankedInput)),rankedInput[names(coefs)[-1],])
scores <- (exp(scores) / (1 + exp(scores)))[1,]
res <- list(scores = scores, model = NULL)
if (returnModel) res$model <- model
return(res)
}
#' Do clustering of cells
#'
#' This function does clustering of cells based on their input feature matrix
#' (e.g. RSS matrix). Possible clustering methods include k-means and k-medoids
#' (PAM, or Partitioning Around Medoid). It is also allowed to only select a
#' subset of samples (cells) for clustering, and then train an SVM-based classifier
#' to assign clusters for the remaining cells.
#'
#' @param input The input expression matrix, with rows representing SAMPLES (cells) and columns representing FEATURES.
#' @param clustMethod The method used to do cell clustering.
#' @param numClust The expected number of clusters.
#' @param pcNum When PCA is used for dimension reduction, it defines the number of dimensions.
#' @param maxCells The maximum number of cells selected for clustering. NULL or NA for unlimitation.
#' @param seedIdx The indices of samples (cells) used for clustering. The remaining samples will be assigned to clusters based on the SVM-classification model.
#' @return A list with two elements: 'clust' is the vector of class labels; 'seedset' is the vector of indices of samples (cells) used in the initial clustering
#' @export
cellClustering <- function(input, clustMethod = c("pam", "kmeans", "pca-kmeans"), numClust = 20, pcNum = 50, maxCells = 40000, seedIdx = NULL, threads = 1, verbose = TRUE){
clustMethod <- match.arg(clustMethod)
numCells <- nrow(input)
if (is.na(maxCells) | is.null(maxCells))
maxCells <- numCells
if (numCells > maxCells){
if (is.null(seedIdx) | length(seedIdx) > maxCells)
seedIdx <- sample(1:numCells, maxCells)
} else if (is.null(seedIdx)){
seedIdx <- 1:numCells
}
if (clustMethod == "pca-kmeans"){
PCA <- prcomp(RSS)
input <- PCA$x[,1:pcNum]
}
inputSeed <- input[seedIdx,]
inputRemain <- input[-seedIdx,]
if (clustMethod == "kmeans" | clustMethod == "pca-kmeans"){
clustSeed <- kmeans(inputSeed, numClust)$cluster
names(clustSeed) <- rownames(inputSeed)
} else{
library(ClusterR)
pamClust <- Cluster_Medoids(inputSeed, numClust, distance_metric = "pearson_correlation", threads = threads, verbose = verbose)
clustSeed <- pamClust$silhouette_matrix$clusters
names(clustSeed) <- rownames(inputSeed)
}
if (nrow(inputRemain) > 0){
if (verbose) cat("PROGRESS: start to predict cluster labels for the remaining cells\n")
library(e1071)
model <- svm(x = inputSeed, y = as.factor(clustSeed))
pred <- predict(model, inputRemain)
clust <- rep(NA, nrow(input))
clust[seedIdx] <- clustSeed
clust[-seedIdx] <- pred
clust <- as.numeric(as.character(clust))
names(clust) <- rownames(input)
if (verbose) cat("PROGRESS: finish cell cluster label prediction\n")
} else{
clust <- clustSeed
}
clust <- list(clust = clust, seedset = seedIdx)
return(clust)
}
#' The wrapper function to calculate ABA-RSS with the following analysis
#'
#' This function integrates the basic ABA-RSS analysis, including RSS
#' calculation, dimension reduction for visualization using tSNE, sample
#' (cell) clustering based on the RSS matrix, progenitor/neuron
#' classification with potential intermediate progenitor clusters
#' identified.
#'
#' @param input The input matrix of sample (cell) transcriptome profiles. Should be properly normalized and log-transformed. Its rows representing genes should be named by ENSEMBL IDs, gene symbols or Entrez IDs.
#' @param onlyFetal Whether only considering the fetal ABA reference samples.
#' @param highVar Whether only considering the highly variable genes across the ABA reference samples.
#' @param tsneMethod The method used for tSNE (Rtsne).
#' @param tsne_pcaDim When pca method is used for tSNE, it defines the initial dimension used by PCA.
#' @param tsne_perplexity The perplexity parameter of tSNE.
#' @param clustMethod The method used to do cell clustering.
#' @param numClust The expected number of clusters
#' @param maxCells The maximum number of cells selected for clustering. NULL or NA for unlimitation.
#' @param seedIdx The indices of samples (cells) used for clustering. The remaining samples will be assigned to clusters based on the SVM-classification model.
#' @param progenitorScores The pre-calculated progenitor scores. When it is NULL, the score calculation is involved in the pipeline.
#' @param nameMap A matrix/data.frame showing the gene name mappings. It is expected to be of multiple columns, one of which is for gene symbols.
#' @param progenitorModel The model used for progenitor scores estimation. It is expected to be a list with at least two components: 'gene' for the list of genes to be ranked; 'coefficients' for the model coefficients. When it is NULL, either the default or a re-trained model is used.
#' @param forceRetrain When it is TRUE, the progenitor model is always retrained using only genes appears in the input matrix.
#' @param returnModel Whether to return the retrained model for progenitor score estimation.
#' @param progenitorThres The progenitor score threshold to distinct progenitors and neurons
#' @param threads The number of threads to run.
#' @param verbose Whether to output the progress information.
#' @return A list with all the results involved
#' @export
wrapper_ABA_RSS <- function(input, onlyFetal = TRUE, highVar = TRUE,
tsneMethod = c("pca", "cor_dist"), tsne_pcaDim = 100, tsne_perplexity = 50,
clustMethod = c("pam", "kmeans", "pca-kmeans"), numClust = 20, maxCells = 50000, seedset = NULL,
progenitorScores = NULL, nameMap = NULL, progenitorModel = NULL, forceRetrain = TRUE, returnModel = TRUE, progenitorThres = 0.5,
threads = 1, verbose = TRUE){
tsneMethod <- match.arg(tsneMethod)
clustMethod <- match.arg(clustMethod)
## get the ABA reference set
ref <- retreiveABARef(sampleSet = ifelse(onlyFetal, "fetal", "full"), geneSet = ifelse(highVar, ifelse(onlyFetal, "fetal_highvar", "highvar"), "full"), excludeG2M = TRUE, logTransform = TRUE)
numMatches <- c(sum(rownames(input) %in% ref$rowMeta$ensembl_gene_id), sum(rownames(input) %in% ref$rowMeta$gene_symbol), sum(rownames(input) %in% ref$rowMeta$entrez_id))
if (max(numMatches) < nrow(input) / 2 & max(numMatches) < nrow(ref$expr) / 2)
stop("Please check the rownames of the input matrix. They should be in ENSEMBL IDs, gene official symbols or Entrez IDs.")
rownames(ref$expr) <- ref$rowMeta[,which.max(numMatches) + 2]
if (verbose) cat("PROGRESS: retreived the ABA reference set for RSS calculation\n\n")
## calculate RSS matrix
if (verbose) cat("PROGRESS: start calculating RSS matrix\n")
RSS <- calculateRSS(input, ref$expr, threads = threads)
if (verbose) cat("PROGRESS: calculated RSS matrix\n\n")
## tSNE
if (verbose) cat("PROGRESS: start calculating tSNE\n")
if (tsneMethod == "pca" | ncol(input) > 45000){
if (tsneMethod == "cor_dist" & ncol(input) > 45000) warning("Too many cells, use PCA for tSNE instead.")
tsne <- Rtsne::Rtsne(RSS, initial_dims = tsne_pcaDim, perplexity = tsne_perplexity)
} else{
corrDist <- 1 - cor(t(RSS))
tsne <- Rtsne::Rtsne(corrDist, is_distance=TRUE, perplexity = tsne_perplexity)
}
if (verbose) cat("PROGRESS: calculated tSNE\n\n")
## clustering
if (verbose) cat("PROGRESS: start clustering of cells\n")
clust <- cellClustering(RSS, clustMethod = clustMethod, numClust = numClust, pcNum = tsne_pcaDim, maxCells = maxCells, seedIdx = seedset, threads = threads, verbose = verbose)
selected <- 1:nrow(RSS) %in% clust$seedIdx
names(selected) <- rownames(RSS)
clust <- clust$clust
if (verbose) cat("PROGRESS: done clustering of cells\n\n")
## progenitor-neuron classification
if (verbose) cat("PROGRESS: start classifying progenitors and neurons\n")
if (is.null(progenitorScores) | sum(colnames(input) %in% names(progenitorScores)) != ncol(input)){
scores <- calculateProgenitorScore(input, nameMap, verbose = verbose, threads = threads, model = progenitorModel, returnModel = returnModel)
progenitorModel <- scores$model
scores <- scores$scores
} else{
progenitorModel <- NULL
scores <- progenitorScores[colnames(input)]
}
names(scores) <- colnames(input)
if (verbose) cat("PROGRESS: finished scoring cells\n")
progenitorClustIdx <- which(p.adjust(sapply(sort(unique(clust)), function(cl){
tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
freq <- table(clust==cl, scores>progenitorThres)
tab[rownames(freq), colnames(freq)] <- freq
fisher.test(tab, alternative="g")$p.value
}))<0.01)
progenitorPropInProgenitorClust <- apply(table(clust, scores>progenitorThres)[progenitorClustIdx,], 2, sum); progenitorPropInProgenitorClust <- progenitorPropInProgenitorClust["TRUE"] / sum(progenitorPropInProgenitorClust)
progenitorPropInNeuronClust <- apply(table(clust, scores>progenitorThres)[-progenitorClustIdx,], 2, sum); progenitorPropInNeuronClust <- progenitorPropInNeuronClust["TRUE"] / sum(progenitorPropInNeuronClust)
p_ip <- sapply(sort(unique(clust)), function(cl) ifelse(sum(progenitorClustIdx == cl)==1, pbinom(sum(clust==cl & scores>progenitorThres), sum(clust==cl), progenitorPropInProgenitorClust), 1-pbinom(sum(clust==cl & scores>progenitorThres), sum(clust==cl), progenitorPropInNeuronClust)) )
if (verbose) cat("PROGRESS: done progenitors identification\n")
## output
res <- list(RSS = RSS, tsne = tsne, clust = clust, progenitorModel = progenitorModel, progenitorScores = scores, progenitorClustIdx = progenitorClustIdx, p_IP = p_ip,
parameters = list(onlyFetalRef = onlyFetal, highVar = highVar,
tsnePar = list(method = tsneMethod, initial_dims = tsne_pcaDim, perplexity = tsne_perplexity),
clustPar = list(method = clustMethod, k = numClust, pcaDim = tsne_pcaDim, selected = selected),
progenitorThres = progenitorThres))
return(res)
}
#' The wrapper function to assign region information to sample (cell) clusters
#'
#' This function assign the brain region information to the given sample
#' (cell) clusters, based on their transcriptome similarities to the ABA
#' reference data sets. Progenitors and neurons are processed separately.
#'
#' @param input The input matrix of sample (cell) transcriptome profiles. Should be properly normalized and log-transformed. Its rows representing genes should be named by ENSEMBL IDs, gene symbols or Entrez IDs.
#' @param clust Cluster indices of samples (cells)
#' @param progenitorClustIdx Names of the progenitor clusters
#' @param neuronClustIdx Names of the neuron clusters (by default, it is the complementary cluster set of progenitorClustIdx)
#' @param geneSetNeurons The predefined gene set for calculation of ABA-RSS of neurons
#' @param geneSetProgenitors The predefined gene set for calculation of ABA-RSS of progenitors
#' @param strucGroup The structure labels to use: 'structure_acronym' for the directly measured structures; 'structure_group' for the grouped structures.
#' @param thresBinA10 The correlation thresholds for different brain regions to determine highly correlated neuron samples (cells). Five predefined threshold lists are available (X0.75, X0.8, X0.85, X0.9 and X0.95). When it is NULL, X0.85 is used. The users can also customized the threshold by assigning a numeric vector with the same length of the number of brain regions (19).
#' @param thresBinB10 Similar to thresBinA10 but for the progenitor samples (cells).
#' @param threads The number of threads to run.
#' @param verbose Whether to output the progress information.
#' @param previous Previous output. When it is given, the RSS will not be re-calculated.
#' @return A list with all the results involved.
#' @export
wrapper_regionAssignment <- function(input, clust, progenitorClustIdx, neuronClustIdx = setdiff(sort(unique(clust)), progenitorClustIdx),
geneSetNeurons = "regionGroup_fetalA10", geneSetProgenitors = "regionGroup_fetalB10", strucGroups = c("structure_group", "structure_acronym"),
thresBinA10 = NULL, thresBinB10 = NULL, threads = 1, verbose = TRUE,
previous = NULL){
strucGroups <- match.arg(strucGroups)
skipRSS <- ! is.null(previous) & ! is.null(previous$progenitors) & ! is.null(previous$neurons)
if (skipRSS) skipRSS <- skipRSS & nrow(previous$neurons$RSS) == length(clust)
if (skipRSS){ # skip RSS calculation if a previous result list is given
strucGroups <- names(RefSimSpec::thresBinA10)[sapply(RefSimSpec::thresBinA10, nrow) == ncol(previous$neurons$RSS)]
RSS_a10pcw_grouped <- previous$neurons$RSS
RSS_b10pcw_grouped <- previous$progenitors$RSS
cat("PROGRESS: use the previously calculated RSS matrices\n\n")
} else{
ref_a10pcw <- retreiveABARef(sampleSet = "fetal_after10pcw", geneSet = geneSetNeurons, excludeG2M = FALSE, logTransform = TRUE)
numMatches <- c(sum(rownames(input) %in% ref_a10pcw$rowMeta$ensembl_gene_id), sum(rownames(input) %in% ref_a10pcw$rowMeta$gene_symbol), sum(rownames(input) %in% ref_a10pcw$rowMeta$entrez_id))
if (max(numMatches) < nrow(input) / 2 & max(numMatches) < nrow(ref_a10pcw$expr) / 2)
stop("Please check the rownames of the input matrix. They should be in ENSEMBL IDs, gene official symbols or Entrez IDs.")
rownames(ref_a10pcw$expr) <- ref_a10pcw$rowMeta[,which.max(numMatches) + 2]
if (verbose) cat("PROGRESS: loaded the ABA reference set (fetal > 10pcw)\n")
if (verbose) cat("PROGRESS: start calculating RSS matrix\n")
RSS_a10pcw <- calculateRSS(input, ref_a10pcw$expr, scale = TRUE, threads = threads)
RSS_a10pcw_grouped <- sapply(unique(ref_a10pcw$colMeta[,strucGroups]), function(x){ apply(RSS_a10pcw[, ref_a10pcw$colMeta[,strucGroups] == x], 1, mean) })
colnames(RSS_a10pcw_grouped) <- unique(ref_a10pcw$colMeta[,strucGroups])
if (verbose) cat("PROGRESS: finished RSS calculation\n")
ref_b10pcw <- retreiveABARef(sampleSet = "fetal_before10pcw", geneSet = geneSetProgenitors, excludeG2M = FALSE, logTransform = TRUE)
rownames(ref_b10pcw$expr) <- ref_b10pcw$rowMeta[,which.max(numMatches) + 2]
if (verbose) cat("PROGRESS: loaded the ABA reference set (fetal < 10pcw)\n")
if (verbose) cat("PROGRESS: start calculating RSS matrix\n")
RSS_b10pcw <- calculateRSS(input, ref_b10pcw$expr, scale = TRUE, threads = threads)
RSS_b10pcw_grouped <- sapply(unique(ref_b10pcw$colMeta[,strucGroups]), function(x){
if (sum(ref_b10pcw$colMeta[,strucGroups] == x) == 1)
return(RSS_b10pcw[, ref_b10pcw$colMeta[,strucGroups] == x])
return(apply(RSS_b10pcw[, ref_b10pcw$colMeta[,strucGroups] == x], 1, mean))
})
colnames(RSS_b10pcw_grouped) <- unique(ref_b10pcw$colMeta[,strucGroups])
if (verbose) cat("PROGRESS: finished RSS calculation\n\n")
}
## RSS to ABA fetal after 10pcw, for neurons
if (verbose) cat("PROGRESS: binarize RSS (neurons) and test for cluster enrichment\n")
p1_neurons_a10pcw <- sapply(1:ncol(RSS_a10pcw_grouped), function(i){ sapply(neuronClustIdx, function(cl){
tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
freq <- table(clust[clust %in% neuronClustIdx]==cl, apply(RSS_a10pcw_grouped[clust %in% neuronClustIdx,], 1, which.max) == i)
tab[rownames(freq), colnames(freq)] <- freq
fisher.test(tab, alternative="g")$p.value
}) })
colnames(p1_neurons_a10pcw) <- colnames(RSS_a10pcw_grouped)
if (is.null(thresBinA10) | sum(length(thresBinA10) %in% c(1, ncol(RSS_a10pcw_grouped))) == 0){
thresBinA10 <- RefSimSpec::thresBinA10[[strucGroups]]$X0.85
} else if (sum(thresBinA10 %in% colnames(RefSimSpec::thresBinA10[[strucGroups]])) == 1){
thresBinA10 <- RefSimSpec::thresBinA10[[strucGroups]][,thresBinA10]
}
binarized_a10pcw <- t(apply(RSS_a10pcw_grouped, 1, function(x) x > thresBinA10 ))
p2_neurons_a10pcw <- apply(binarized_a10pcw[which(clust %in% neuronClustIdx & apply(binarized_a10pcw, 1, sum) > 0),], 2, function(x){ sapply(neuronClustIdx, function(cl){
tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
freq <- table(clust[which(clust %in% neuronClustIdx & apply(binarized_a10pcw, 1, sum) > 0)]==cl, x)
tab[rownames(freq), colnames(freq)] <- freq
fisher.test(tab, alternative="g")$p.value
}) })
rownames(p1_neurons_a10pcw) <- neuronClustIdx
rownames(p2_neurons_a10pcw) <- neuronClustIdx
if (verbose) cat("PROGRESS: finished - neurons\n\n")
## RSS to ABA fetal before 10pcw, for progenitors
if (verbose) cat("PROGRESS: binarize RSS (progenitors) and test for cluster enrichment\n")
p1_progenitors_b10pcw <- sapply(1:ncol(RSS_b10pcw_grouped), function(i){ sapply(progenitorClustIdx, function(cl){
tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
freq <- table(clust[clust %in% progenitorClustIdx]==cl, apply(RSS_b10pcw_grouped[clust %in% progenitorClustIdx,], 1, which.max) == i)
tab[rownames(freq), colnames(freq)] <- freq
fisher.test(tab, alternative="g")$p.value
}) })
colnames(p1_progenitors_b10pcw) <- colnames(RSS_b10pcw_grouped)
if (is.null(thresBinB10) | sum(length(thresBinB10) %in% c(1, ncol(RSS_b10pcw_grouped))) == 0){
thresBinB10 <- RefSimSpec::thresBinB10[[strucGroups]]$X0.85
} else if (sum(thresBinB10 %in% colnames(RefSimSpec::thresBinB10[[strucGroups]])) == 1){
thresBinB10 <- RefSimSpec::thresBinB10[[strucGroups]][,thresBinB10]
}
binarized_b10pcw <- t(apply(RSS_b10pcw_grouped, 1, function(x) x > thresBinB10 ))
p2_progenitors_b10pcw <- apply(binarized_b10pcw[which(clust %in% progenitorClustIdx & apply(binarized_b10pcw, 1, sum) > 0),], 2, function(x){ sapply(progenitorClustIdx, function(cl){
tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
freq <- table(clust[which(clust %in% progenitorClustIdx & apply(binarized_b10pcw, 1, sum) > 0)]==cl, x)
tab[rownames(freq), colnames(freq)] <- freq
fisher.test(tab, alternative="g")$p.value
}) })
rownames(p1_progenitors_b10pcw) <- progenitorClustIdx
rownames(p2_progenitors_b10pcw) <- progenitorClustIdx
if (verbose) cat("PROGRESS: finished - progenitors\n\n")
## output
res <- list(progenitors = list(RSS = RSS_b10pcw_grouped, binRSS = binarized_b10pcw, p1 = p1_progenitors_b10pcw, p2 = p2_progenitors_b10pcw), neurons = list(RSS = RSS_a10pcw_grouped, binRSS = binarized_a10pcw, p1 = p1_neurons_a10pcw, p2 = p2_neurons_a10pcw))
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.