R/RSS.r

Defines functions retrieveABARef calculateRSS calculateRSS_2 plotFeature retrainProgenitorModel calculateProgenitorScore cellClustering wrapper_ABA_RSS wrapper_regionAssignment

Documented in calculateProgenitorScore calculateRSS cellClustering plotFeature retrainProgenitorModel retrieveABARef wrapper_ABA_RSS wrapper_regionAssignment

#' Load the Allen Brain Atlas reference set
#'
#' This function loads the Allen Brain Atlas reference data matrix. Different
#' pre-defined gene lists or sample lists allow prefiltering of the loaded data.
#'
#' @param sampleSet 	The pre-defined sample list for output.
#' @param geneSet 	The pre-defined gene list for output.
#' @param geneList 	The customized gene list for output. When it is NULL, the pre-defined gene list will be used.
#' @param excludeG2M 	When it is TRUE, G2 and M phrase marker genes will be excluded from the gene list.
#' @param logTransform 	When it is TRUE, log-transformation with pseudo-count will be applied to the expression matrix (in RPKM).
#' @return A list of the loaded data, in which there are three components: 'expr' is the data matrix, 'rowMeta' is the row (gene) information, and 'colMeta' is the column (reference sample) information.
#' @export
retrieveABARef <- function(sampleSet = c("full","fetal","fetal_before10pcw","fetal_after10pcw"),
	geneSet = c("full", "highvar", "fetal_highvar", "region_fetalB10", "region_fetalA10", "region_fetalB10-2", "regionGroup_fetalB10", "regionGroup_fetalA10"),
	geneList = NULL,
	excludeG2M = TRUE, logTransform = TRUE){
	
	sampleSet <- match.arg(sampleSet)
	geneSet <- match.arg(geneSet)
	
	if (sampleSet == "full"){
		idxCol <- 1:ncol(RefSimSpec::expr_ABA)
		idxRow <- which(RefSimSpec::rows_ABA$expressed_full & RefSimSpec::rows_ABA$gene_type == "protein_coding")
	} else if (sampleSet == "fetal"){
		idxCol <- which(RefSimSpec::columns_ABA$fetal_before10pcw | RefSimSpec::columns_ABA$fetal_after10pcw)
		idxRow <- which(RefSimSpec::rows_ABA$expressed_fetal & RefSimSpec::rows_ABA$gene_type == "protein_coding")
	} else{
		idxCol <- which(RefSimSpec::columns_ABA[,sampleSet])
		idxRow <- which(RefSimSpec::rows_ABA$expressed_fetal & RefSimSpec::rows_ABA$gene_type == "protein_coding")
	}
	colDat <- RefSimSpec::columns_ABA[idxCol,]
	rownames(colDat) <- sprintf("Ref%05d", 1:nrow(colDat))
	
	if (is.null(geneList)) {
		if (geneSet == "highvar"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_highvar, method="BH")<0.1)
		} else if (geneSet == "fetal_highvar"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_highvar_fetal, method="BH")<0.1)
		} else if (geneSet == "region_fetalB10"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_region_b10pcw, method="BH")<0.01 & RefSimSpec::rows_ABA$log2fc_region_b10pcw>1)
		} else if (geneSet == "region_fetalA10"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_region_a10pcw)<0.01 & RefSimSpec::rows_ABA$log2fc_region_a10pcw>1)
		} else if (geneSet == "region_fetalB10-2"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_region_b10pcw)<0.05 & RefSimSpec::rows_ABA$log2fc_region_b10pcw>1)
		} else if (geneSet == "regionGroup_fetalB10"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_regiongroup_b10pcw)<0.01 & RefSimSpec::rows_ABA$log2fc_regiongroup_b10pcw>1)
		} else if (geneSet == "regionGroup_fetalA10"){
			idxRow <- which(p.adjust(RefSimSpec::rows_ABA$p_regiongroup_a10pcw)<0.01 & RefSimSpec::rows_ABA$log2fc_regiongroup_a10pcw>1)
		}
	} else{
		numMatch <- apply(RefSimSpec::rows_ABA[,3:5], 2, function(x){ sum(x %in% geneList) })
		if (max(numMatch) < length(geneList) / 2)
			stop("Genes matching the ABA reference data set are too few. Make sure that they are in ENSEMBL IDs, gene symbols or Entrez IDs.")
		idxRow <- which(RefSimSpec::rows_ABA[,which.max(numMatch) + 2] %in% geneList)
	}
	if (excludeG2M) idxRow <- setdiff(idxRow, which(RefSimSpec::rows_ABA$G2M))
	rowDat <- RefSimSpec::rows_ABA[idxRow,]
	
	dat <- RefSimSpec::expr_ABA[idxRow, idxCol]
	colnames(dat) <- rownames(colDat)
	if (logTransform)
		dat <- log(dat + 1)
	
	res <- list(expr = dat, rowMeta = rowDat, colMeta = colDat)
	return(res)
}

#' Calculate RSS matrix
#'
#' This function calculates the Reference Similarity Spectrum (RSS) matrix,
#' based on the given input matrix and the reference data set matrix.
#'
#' @param input 	The input expression matrix, assuming its rows as genes and columns as samples. Its rows are expected to be named by the same types of gene representative (e.g. ENSEMBL IDs) as ref.
#' @param ref 	The reference expression matrix, assuming its rows as genes and columns as samples.
#' @param method 	The type of correlation to calculate. It allows any method used in the cor function.
#' @param scale 	When it is TRUE, similarities of one sample (cell) in the input matrix across the reference samples are central-scaled transformed.
#' @param threads 	The number of threads to calculate correlations.
#' @return The RSS matrix, with rows for the input samples and columns for the reference samples.
#' @export
calculateRSS <- function(input, ref, method = "pearson", scale = TRUE, threads = 1){
	candidates <- intersect(rownames(input), rownames(ref))

	corr <- NA
	if(threads == 1){
		corr <- cor(input[candidates,], ref[candidates,], method = method)
	} else{
		library(doParallel)
		library(foreach)
		registerDoParallel(threads)
		corr <- foreach(i = 1:ncol(ref), .combine = cbind, .multicombine = TRUE) %dopar%{
			x <- ref[candidates,i]
			res <- apply(input[candidates,], 2, function(y){ cor(x,y, method = method) })
			return(res)
		}
		rownames(corr) <- colnames(input)
		colnames(corr) <- colnames(ref)
		stopImplicitCluster()
	}
	
	if (scale){
		corr <- t(scale(t(corr)))
	}
	return(corr)
}

#' Calculate RSS matrix: updated correlation calculation
#'
#' This function calculates the Reference Similarity Spectrum (RSS) matrix,
#' based on the given input matrix and the reference data set matrix.
#'
#' @param input 	The input expression matrix, assuming its rows as genes and columns as samples. Its rows are expected to be named by the same types of gene representative (e.g. ENSEMBL IDs) as ref.
#' @param ref 	The reference expression matrix, assuming its rows as genes and columns as samples.
#' @param method 	The type of correlation to calculate. It allows any method used in the cor function.
#' @param scale 	When it is TRUE, similarities of one sample (cell) in the input matrix across the reference samples are central-scaled transformed.
#' @return The RSS matrix, with rows for the input samples and columns for the reference samples.
#' @export
calculateRSS_2 <- function(input, ref, method = "pearson", scale = TRUE){
	candidates <- intersect(rownames(input), rownames(ref))
	corr <- cor(ref[candidates,], input[candidates,], method = method)
	
	if (scale){
		corr <- t(scale(t(corr)))
	}
	return(corr)
}

#' Plot feature's gradients/classes across all samples (cells) given the plotting coordinates
#'
#' This function plot the samples based on the given coordinates, coloring
#' each dot based on the value of its corresponding feature value/class.
#'
#' @param coord 	The plotting coordinates, expected to be of two columns. Each row represents one dot for one sample (cell).
#' @param value 	The values to be shown, with each value representing one sample (cell) with the same order as coord.
#' @param emphasize 	The Indices of samples (cells) to be emphasized. When it is set, the colors of all the other cells are set to be #bdbdbd30.
#' @param col 	The customized colors of dots. Either col or value must be given.
#' @param ... 	Other arguments passing to the plot function.
#' @export
plotFeature <- function(coord, values = NULL, emphasize = NULL, col = NULL, colorPal = NULL, edges = NULL, main = NA, xlab = "Dim-1", ylab = "Dim-2", cex.main = 1, cex.lab = 1, cex.axis = 1, lwd.edges = 0.2, col.edges = "#bdbdbd50", cex = 1, ...){
	if (is.null(col)){
		if (is.numeric(values)){
			if(is.null(colorPal)){
				colorPal <- grDevices::colorRampPalette(c("darkgreen", "yellow","red"))
			}
			cellColor <- adjustcolor(colorPal(30), alpha=.8)[as.numeric(cut(values, breaks=30, right=F,include.lowest=T))]
			if(min(values, na.rm=T) == 0) cellColor[values == 0] <- "#bdbdbd30"
		} else{
			cols <- setNames(scales::hue_pal()(length(unique(values))), unique(values))
			cellColor <- cols[values]
		}
	} else{
		if (length(col) == 1) col <- rep(col, nrow(coord))
		cellColor <- col
	}
	
	plot(coord, type="n", main = main, xlab = xlab, ylab = ylab, cex.main = cex.main, cex.lab = cex.lab, cex.axis = cex.axis, ...)
	if (! is.null(edges) & (is.matrix(edges) | is.data.frame(edges))){
		for(i in 1:nrow(edges)) lines(coord[as.numeric(edges[i,]),1], coord[as.numeric(edges[i,]),2], lwd = lwd.edges, col = col.edges)
	}
	if (is.null(emphasize)){
		points(coord, col = cellColor, pch=16, cex = cex)
	} else{
		points(coord, col = "#efefef30", pch=16, cex = cex)
		points(coord[emphasize, ], col = cellColor[emphasize], pch=16, cex = cex)
	}
}

#' Train the model to classify progenitors and neurons
#'
#' This function uses three single cell RNA-seq data sets (Camp, et al. 2015;
#' Darmanis, et al. 2015; Close, et al. 2016) to construct a rank-based LASSO-logistic
#' regression model to classify progenitor cells from neurons given their transcriptome
#' profiles.
#'
#' @param input 	The input matrix, with its rows named by gene symbol.
#' @param threads 	The number of threads used to run training.
#' @param numLasso 	The number of training used to determine genes used in the final model.
#' @param verbose 	Whether to output the progress information.
#' @return A list with two components: 'gene' for the list of genes to be ranked; 'coefficients' for the model coefficients.
#' @export
retrainProgenitorModel <- function(input, threads = 1, numLasso = 100, verbose = TRUE){
	expr <- do.call(cbind, lapply(RefSimSpec::expr_progenitors_neurons, function(x){ x$expr[,x$idx] }))
	expr <- expr[intersect(rownames(expr), rownames(input)),]
	types <- do.call(c, lapply(RefSimSpec::expr_progenitors_neurons, function(x){ x$types[x$idx] }))
	rank <- apply(expr, 2, rank) / nrow(expr)
	
	propTest <- 1/3
	trainIdx <- which(runif(ncol(rank)) >= propTest)
	expr_train <- rank[,trainIdx]
	lab_train <- as.factor(types[trainIdx])
	expr_test <- rank[,-trainIdx]
	lab_test <- as.factor(types[-trainIdx])
	
	library(glmnet)
	library(doParallel)
	library(foreach)
	registerDoParallel(threads)
	models <- foreach(i = 1:numLasso, .combine = list, .multicombine = TRUE) %dopar%{
		model <- cv.glmnet(x = t(expr_train), y = lab_train, nfolds = 10, family = "binomial", type.measure = "auc")
		return(model)
	}
	markers <- table(unlist(lapply(models, function(x){ names(which(coefficients(x)[-1,1]!=0)) })))
	stopImplicitCluster()
	
	mergedModel <- glm(lab_train ~ t(expr_train[names(markers)[markers>1],]), family = "binomial")
	predictModel <- function(model, x){
		invLogit <- function(x){ exp(x)/(1+exp(x)) }
		markers <- gsub("^t\\(.+\\)", "", names(coefficients(model)[-1]))
		invLogit(t(matrix(coefficients(model))) %*% rbind(rep(1,ncol(x)),x[markers,]))
	}
	resTab_test <- table(predictModel(mergedModel, expr_test) > 0.5, lab_test)
	cat("PROGRESS:     tested in the test test:\n")
	print(resTab_test)
	
	mergedModel <- glm(as.factor(c(lab_train, lab_test)) ~ t(cbind(expr_train[names(markers),], expr_test[names(markers),])), family = "binomial")
	coefs <- coefficients(mergedModel)
	names(coefs)[-1] <- gsub("^t\\(.+\\)", "", names(coefs)[-1])
	res <- list(genes = rownames(expr), coefficients = coefs)
	return(res)
}

#' Calculate progenitor scores for each sample (cell)
#'
#' This function uses the predictive model to estimate a progenitor scores
#' for the input samples, which can be used to determine whether the samples
#' (cells) are progenitors or neurons.
#' 
#' @param input 	The input expression matrix, with rows representing genes and columns representing samples (cells)
#' @param nameMap 	A matrix/data.frame showing the gene name mappings. It is expected to be of multiple columns, one of which is for gene symbols.
#' @param model 	The model used for progenitor scores estimation. It is expected to be a list with at least two components: 'gene' for the list of genes to be ranked; 'coefficients' for the model coefficients. When it is NULL, the default model is used.
#' @param forceRetrain 	When it is TRUE, the model will be rebuilt by only considering genes involved in the input matrix.
#' @param returnModel 	Whether to return the retrained model for progenitor score estimation.
#' @param verbose 	Whether to output the progress information.
#' @param ... 	Other arguments passed to the 'retrainProgenitorModel' function.
#' @return A list with two components: 'scores' for the estimated progenitor scores; 'model' for the model predicting the scores (NULL when returnModel is FALSE).
#' @export
calculateProgenitorScore <- function(input, nameMap = NULL, model = NULL, forceRetrain = FALSE, returnModel = TRUE, verbose = TRUE, ...){
	if (is.null(model))
		model <- RefSimSpec::progenitorModel
	
	rankGenes <- unique(model$genes)
	if (length(intersect(rownames(input), rankGenes)) < length(rankGenes) / 2){ # rename the rownames according to nameMap
		if (is.null(nameMap))
			nameMap <- RefSimSpec::rows_ABA[, 3:5]
		
		numMatches_input <- apply(nameMap, 2, function(x){ sum(rownames(input) %in% x) })
		if (max(numMatches_input) < nrow(input) / 2)
			stop("Please check the rownames of the input matrix. They do not match with the nameMap table.")
		numMatches_symbol <- apply(nameMap, 2, function(x){ sum(rankGenes %in% x) })
		if (max(numMatches_symbol) < length(rankGenes) / 2)
			stop("Please check the nameMap table. There should be one column of gene symbol.")
		
		nameMap <- data.frame(nameMap[,which.max(numMatches_symbol)], row.names = nameMap[,which.max(numMatches_input)])
		input <- input[intersect(rownames(input), rownames(nameMap)),]
		rownames(input) <- nameMap[rownames(input),1]
		if (verbose) cat("PROGRESS:   renamed rows of the input matrix as gene symbol\n")
	}
	
	if (length(intersect(rownames(input), rankGenes)) == length(rankGenes) & ! forceRetrain){
		rankedInput <- input[rankGenes,]
		coefs <- model$coefficients
	} else{
		if (verbose) cat("PROGRESS:   need to re-train the progenitor/neuron classification model...\n")
		model <- retrainProgenitorModel(input, verbose = verbose, ...)
		rankedInput <- input[model$genes,]
		coefs <- model$coefficients
		if (verbose) cat("PROGRESS:   done model retraining\n")
	}
	rankedInput <- apply(rankedInput, 2, rank)/nrow(rankedInput)
	cat("PROGRESS:   gene ranked for all cells\n")
	
	scores <- t(matrix(coefs)) %*% rbind(rep(1,ncol(rankedInput)),rankedInput[names(coefs)[-1],])
	scores <- (exp(scores) / (1 + exp(scores)))[1,]
	res <- list(scores = scores, model = NULL)
	if (returnModel) res$model <- model
	return(res)
}

#' Do clustering of cells
#'
#' This function does clustering of cells based on their input feature matrix
#' (e.g. RSS matrix). Possible clustering methods include k-means and k-medoids
#' (PAM, or Partitioning Around Medoid). It is also allowed to only select a
#' subset of samples (cells) for clustering, and then train an SVM-based classifier
#' to assign clusters for the remaining cells.
#'
#' @param input 	The input expression matrix, with rows representing SAMPLES (cells) and columns representing FEATURES.
#' @param clustMethod 	The method used to do cell clustering.
#' @param numClust 	The expected number of clusters.
#' @param pcNum 	When PCA is used for dimension reduction, it defines the number of dimensions.
#' @param maxCells 	The maximum number of cells selected for clustering. NULL or NA for unlimitation.
#' @param seedIdx 	The indices of samples (cells) used for clustering. The remaining samples will be assigned to clusters based on the SVM-classification model.
#' @return A list with two elements: 'clust' is the vector of class labels; 'seedset' is the vector of indices of samples (cells) used in the initial clustering
#' @export
cellClustering <- function(input, clustMethod = c("pam", "kmeans", "pca-kmeans"), numClust = 20, pcNum = 50, maxCells = 40000, seedIdx = NULL, threads = 1, verbose = TRUE){
	clustMethod <- match.arg(clustMethod)
	
	numCells <- nrow(input)
	if (is.na(maxCells) | is.null(maxCells))
		maxCells <- numCells
	if (numCells > maxCells){
		if (is.null(seedIdx) | length(seedIdx) > maxCells)
			seedIdx <- sample(1:numCells, maxCells)
	} else if (is.null(seedIdx)){
		seedIdx <- 1:numCells
	}
	
	if (clustMethod == "pca-kmeans"){
		PCA <- prcomp(RSS)
		input <- PCA$x[,1:pcNum]
	}
	inputSeed <- input[seedIdx,]
	inputRemain <- input[-seedIdx,]
	
	if (clustMethod == "kmeans" | clustMethod == "pca-kmeans"){
		clustSeed <- kmeans(inputSeed, numClust)$cluster
		names(clustSeed) <- rownames(inputSeed)
	} else{
		library(ClusterR)
		pamClust <- Cluster_Medoids(inputSeed, numClust, distance_metric = "pearson_correlation", threads = threads, verbose = verbose)
		clustSeed <- pamClust$silhouette_matrix$clusters
		names(clustSeed) <- rownames(inputSeed)
	}
	
	if (nrow(inputRemain) > 0){
		if (verbose) cat("PROGRESS:   start to predict cluster labels for the remaining cells\n")
		library(e1071)
		model <- svm(x = inputSeed, y = as.factor(clustSeed))
		pred <- predict(model, inputRemain)
		
		clust <- rep(NA, nrow(input))
		clust[seedIdx] <- clustSeed
		clust[-seedIdx] <- pred
		clust <- as.numeric(as.character(clust))
		names(clust) <- rownames(input)
		if (verbose) cat("PROGRESS:    finish cell cluster label prediction\n")
	} else{
		clust <- clustSeed
	}
	clust <- list(clust = clust, seedset = seedIdx)
	return(clust)
}

#' The wrapper function to calculate ABA-RSS with the following analysis
#'
#' This function integrates the basic ABA-RSS analysis, including RSS
#' calculation, dimension reduction for visualization using tSNE, sample
#' (cell) clustering based on the RSS matrix, progenitor/neuron
#' classification with potential intermediate progenitor clusters
#' identified.
#'
#' @param input 	The input matrix of sample (cell) transcriptome profiles. Should be properly normalized and log-transformed. Its rows representing genes should be named by ENSEMBL IDs, gene symbols or Entrez IDs.
#' @param onlyFetal 	Whether only considering the fetal ABA reference samples.
#' @param highVar 	Whether only considering the highly variable genes across the ABA reference samples.
#' @param tsneMethod 	The method used for tSNE (Rtsne).
#' @param tsne_pcaDim 	When pca method is used for tSNE, it defines the initial dimension used by PCA.
#' @param tsne_perplexity 	The perplexity parameter of tSNE.
#' @param clustMethod 	The method used to do cell clustering.
#' @param numClust 	The expected number of clusters
#' @param maxCells 	The maximum number of cells selected for clustering. NULL or NA for unlimitation.
#' @param seedIdx 	The indices of samples (cells) used for clustering. The remaining samples will be assigned to clusters based on the SVM-classification model.
#' @param progenitorScores 	The pre-calculated progenitor scores. When it is NULL, the score calculation is involved in the pipeline.
#' @param nameMap 	A matrix/data.frame showing the gene name mappings. It is expected to be of multiple columns, one of which is for gene symbols.
#' @param progenitorModel 	The model used for progenitor scores estimation. It is expected to be a list with at least two components: 'gene' for the list of genes to be ranked; 'coefficients' for the model coefficients. When it is NULL, either the default or a re-trained model is used.
#' @param forceRetrain 	When it is TRUE, the progenitor model is always retrained using only genes appears in the input matrix.
#' @param returnModel 	Whether to return the retrained model for progenitor score estimation.
#' @param progenitorThres 	The progenitor score threshold to distinct progenitors and neurons
#' @param threads 	The number of threads to run.
#' @param verbose 	Whether to output the progress information.
#' @return A list with all the results involved
#' @export
wrapper_ABA_RSS <- function(input, onlyFetal = TRUE, highVar = TRUE,
	tsneMethod = c("pca", "cor_dist"), tsne_pcaDim = 100, tsne_perplexity = 50,
	clustMethod = c("pam", "kmeans", "pca-kmeans"), numClust = 20, maxCells = 50000, seedset = NULL, 
	progenitorScores = NULL, nameMap = NULL, progenitorModel = NULL, forceRetrain = TRUE, returnModel = TRUE, progenitorThres = 0.5,
	threads = 1, verbose = TRUE){
	
	tsneMethod <- match.arg(tsneMethod)
	clustMethod <- match.arg(clustMethod)
	
	## get the ABA reference set
	ref <- retreiveABARef(sampleSet = ifelse(onlyFetal, "fetal", "full"), geneSet = ifelse(highVar, ifelse(onlyFetal, "fetal_highvar", "highvar"), "full"), excludeG2M = TRUE, logTransform = TRUE)
	numMatches <- c(sum(rownames(input) %in% ref$rowMeta$ensembl_gene_id), sum(rownames(input) %in% ref$rowMeta$gene_symbol), sum(rownames(input) %in% ref$rowMeta$entrez_id))
	if (max(numMatches) < nrow(input) / 2 & max(numMatches) < nrow(ref$expr) / 2)
		stop("Please check the rownames of the input matrix. They should be in ENSEMBL IDs, gene official symbols or Entrez IDs.")
	rownames(ref$expr) <- ref$rowMeta[,which.max(numMatches) + 2]
	if (verbose) cat("PROGRESS: retreived the ABA reference set for RSS calculation\n\n")
	
	## calculate RSS matrix
	if (verbose) cat("PROGRESS: start calculating RSS matrix\n")
	RSS <- calculateRSS(input, ref$expr, threads = threads)
	if (verbose) cat("PROGRESS: calculated RSS matrix\n\n")
	
	## tSNE
	if (verbose) cat("PROGRESS: start calculating tSNE\n")
	if (tsneMethod == "pca" | ncol(input) > 45000){
		if (tsneMethod == "cor_dist" & ncol(input) > 45000) warning("Too many cells, use PCA for tSNE instead.")
		tsne <- Rtsne::Rtsne(RSS, initial_dims = tsne_pcaDim, perplexity = tsne_perplexity)
	} else{
		corrDist <- 1 - cor(t(RSS))
		tsne <- Rtsne::Rtsne(corrDist, is_distance=TRUE, perplexity = tsne_perplexity)
	}
	if (verbose) cat("PROGRESS: calculated tSNE\n\n")
	
	## clustering
	if (verbose) cat("PROGRESS: start clustering of cells\n")
	clust <- cellClustering(RSS, clustMethod = clustMethod, numClust = numClust, pcNum = tsne_pcaDim, maxCells = maxCells, seedIdx = seedset, threads = threads, verbose = verbose)
	selected <- 1:nrow(RSS) %in% clust$seedIdx
	names(selected) <- rownames(RSS)
	clust <- clust$clust
	if (verbose) cat("PROGRESS: done clustering of cells\n\n")
	
	## progenitor-neuron classification
	if (verbose) cat("PROGRESS: start classifying progenitors and neurons\n")
	if (is.null(progenitorScores) | sum(colnames(input) %in% names(progenitorScores)) != ncol(input)){
		scores <- calculateProgenitorScore(input, nameMap, verbose = verbose, threads = threads, model = progenitorModel, returnModel = returnModel)
		progenitorModel <- scores$model
		scores <- scores$scores
	} else{
		progenitorModel <- NULL
		scores <- progenitorScores[colnames(input)]
	}
	names(scores) <- colnames(input)
	if (verbose) cat("PROGRESS: finished scoring cells\n")
	progenitorClustIdx <- which(p.adjust(sapply(sort(unique(clust)), function(cl){
		tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
		freq <- table(clust==cl, scores>progenitorThres)
		tab[rownames(freq), colnames(freq)] <- freq
		fisher.test(tab, alternative="g")$p.value
	}))<0.01)
	progenitorPropInProgenitorClust <- apply(table(clust, scores>progenitorThres)[progenitorClustIdx,], 2, sum); progenitorPropInProgenitorClust <- progenitorPropInProgenitorClust["TRUE"] / sum(progenitorPropInProgenitorClust)
	progenitorPropInNeuronClust <- apply(table(clust, scores>progenitorThres)[-progenitorClustIdx,], 2, sum); progenitorPropInNeuronClust <- progenitorPropInNeuronClust["TRUE"] / sum(progenitorPropInNeuronClust)
	p_ip <- sapply(sort(unique(clust)), function(cl) ifelse(sum(progenitorClustIdx == cl)==1, pbinom(sum(clust==cl & scores>progenitorThres), sum(clust==cl), progenitorPropInProgenitorClust), 1-pbinom(sum(clust==cl & scores>progenitorThres), sum(clust==cl), progenitorPropInNeuronClust)) )
	if (verbose) cat("PROGRESS: done progenitors identification\n")
	
	## output
	res <- list(RSS = RSS, tsne = tsne, clust = clust, progenitorModel = progenitorModel, progenitorScores = scores, progenitorClustIdx = progenitorClustIdx, p_IP = p_ip,
		parameters = list(onlyFetalRef = onlyFetal, highVar = highVar,
			tsnePar = list(method = tsneMethod, initial_dims = tsne_pcaDim, perplexity = tsne_perplexity),
			clustPar = list(method = clustMethod, k = numClust, pcaDim = tsne_pcaDim, selected = selected),
			progenitorThres = progenitorThres))
	return(res)
}

#' The wrapper function to assign region information to sample (cell) clusters
#'
#' This function assign the brain region information to the given sample
#' (cell) clusters, based on their transcriptome similarities to the ABA
#' reference data sets. Progenitors and neurons are processed separately.
#'
#' @param input 	The input matrix of sample (cell) transcriptome profiles. Should be properly normalized and log-transformed. Its rows representing genes should be named by ENSEMBL IDs, gene symbols or Entrez IDs.
#' @param clust 	Cluster indices of samples (cells)
#' @param progenitorClustIdx 	Names of the progenitor clusters
#' @param neuronClustIdx	Names of the neuron clusters (by default, it is the complementary cluster set of progenitorClustIdx)
#' @param geneSetNeurons 	The predefined gene set for calculation of ABA-RSS of neurons
#' @param geneSetProgenitors 	The predefined gene set for calculation of ABA-RSS of progenitors
#' @param strucGroup 	The structure labels to use: 'structure_acronym' for the directly measured structures; 'structure_group' for the grouped structures.
#' @param thresBinA10 	The correlation thresholds for different brain regions to determine highly correlated neuron samples (cells). Five predefined threshold lists are available (X0.75, X0.8, X0.85, X0.9 and X0.95). When it is NULL, X0.85 is used. The users can also customized the threshold by assigning a numeric vector with the same length of the number of brain regions (19).
#' @param thresBinB10 	Similar to thresBinA10 but for the progenitor samples (cells).
#' @param threads 	The number of threads to run.
#' @param verbose 	Whether to output the progress information.
#' @param previous 	Previous output. When it is given, the RSS will not be re-calculated.
#' @return A list with all the results involved.
#' @export
wrapper_regionAssignment <- function(input, clust, progenitorClustIdx, neuronClustIdx = setdiff(sort(unique(clust)), progenitorClustIdx),
	geneSetNeurons = "regionGroup_fetalA10", geneSetProgenitors = "regionGroup_fetalB10", strucGroups = c("structure_group", "structure_acronym"), 
	thresBinA10 = NULL, thresBinB10 = NULL, threads = 1, verbose = TRUE,
	previous = NULL){
	
	strucGroups <- match.arg(strucGroups)
	
	skipRSS <- ! is.null(previous) & ! is.null(previous$progenitors) & ! is.null(previous$neurons)
	if (skipRSS) skipRSS <- skipRSS & nrow(previous$neurons$RSS) == length(clust)
	if (skipRSS){ # skip RSS calculation if a previous result list is given
		strucGroups <- names(RefSimSpec::thresBinA10)[sapply(RefSimSpec::thresBinA10, nrow) == ncol(previous$neurons$RSS)]
		RSS_a10pcw_grouped <- previous$neurons$RSS
		RSS_b10pcw_grouped <- previous$progenitors$RSS
		cat("PROGRESS: use the previously calculated RSS matrices\n\n")
	} else{
		ref_a10pcw <- retreiveABARef(sampleSet = "fetal_after10pcw", geneSet = geneSetNeurons, excludeG2M = FALSE, logTransform = TRUE)
		numMatches <- c(sum(rownames(input) %in% ref_a10pcw$rowMeta$ensembl_gene_id), sum(rownames(input) %in% ref_a10pcw$rowMeta$gene_symbol), sum(rownames(input) %in% ref_a10pcw$rowMeta$entrez_id))
		if (max(numMatches) < nrow(input) / 2 & max(numMatches) < nrow(ref_a10pcw$expr) / 2)
			stop("Please check the rownames of the input matrix. They should be in ENSEMBL IDs, gene official symbols or Entrez IDs.")
		rownames(ref_a10pcw$expr) <- ref_a10pcw$rowMeta[,which.max(numMatches) + 2]
		if (verbose) cat("PROGRESS: loaded the ABA reference set (fetal > 10pcw)\n")
		
		if (verbose) cat("PROGRESS: start calculating RSS matrix\n")
		RSS_a10pcw <- calculateRSS(input, ref_a10pcw$expr, scale = TRUE, threads = threads)
		RSS_a10pcw_grouped <- sapply(unique(ref_a10pcw$colMeta[,strucGroups]), function(x){ apply(RSS_a10pcw[, ref_a10pcw$colMeta[,strucGroups] == x], 1, mean) })
		colnames(RSS_a10pcw_grouped) <- unique(ref_a10pcw$colMeta[,strucGroups])
		if (verbose) cat("PROGRESS: finished RSS calculation\n")
		
		ref_b10pcw <- retreiveABARef(sampleSet = "fetal_before10pcw", geneSet = geneSetProgenitors, excludeG2M = FALSE, logTransform = TRUE)
		rownames(ref_b10pcw$expr) <- ref_b10pcw$rowMeta[,which.max(numMatches) + 2]
		if (verbose) cat("PROGRESS: loaded the ABA reference set (fetal < 10pcw)\n")
		
		if (verbose) cat("PROGRESS: start calculating RSS matrix\n")
		RSS_b10pcw <- calculateRSS(input, ref_b10pcw$expr, scale = TRUE, threads = threads)
		RSS_b10pcw_grouped <- sapply(unique(ref_b10pcw$colMeta[,strucGroups]), function(x){
			if (sum(ref_b10pcw$colMeta[,strucGroups] == x) == 1)
				return(RSS_b10pcw[, ref_b10pcw$colMeta[,strucGroups] == x])
			return(apply(RSS_b10pcw[, ref_b10pcw$colMeta[,strucGroups] == x], 1, mean))
		})
		colnames(RSS_b10pcw_grouped) <- unique(ref_b10pcw$colMeta[,strucGroups])
		if (verbose) cat("PROGRESS: finished RSS calculation\n\n")
	}
	
	## RSS to ABA fetal after 10pcw, for neurons
	if (verbose) cat("PROGRESS: binarize RSS (neurons) and test for cluster enrichment\n")
	p1_neurons_a10pcw <- sapply(1:ncol(RSS_a10pcw_grouped), function(i){ sapply(neuronClustIdx, function(cl){
		tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
		freq <- table(clust[clust %in% neuronClustIdx]==cl, apply(RSS_a10pcw_grouped[clust %in% neuronClustIdx,], 1, which.max) == i)
		tab[rownames(freq), colnames(freq)] <- freq
		fisher.test(tab, alternative="g")$p.value
	}) })
	colnames(p1_neurons_a10pcw) <- colnames(RSS_a10pcw_grouped)
	
	if (is.null(thresBinA10) | sum(length(thresBinA10) %in% c(1, ncol(RSS_a10pcw_grouped))) == 0){
		thresBinA10 <- RefSimSpec::thresBinA10[[strucGroups]]$X0.85
	} else if (sum(thresBinA10 %in% colnames(RefSimSpec::thresBinA10[[strucGroups]])) == 1){
		thresBinA10 <- RefSimSpec::thresBinA10[[strucGroups]][,thresBinA10]
	}
	binarized_a10pcw <- t(apply(RSS_a10pcw_grouped, 1, function(x) x > thresBinA10 ))
	p2_neurons_a10pcw <- apply(binarized_a10pcw[which(clust %in% neuronClustIdx & apply(binarized_a10pcw, 1, sum) > 0),], 2, function(x){ sapply(neuronClustIdx, function(cl){
		tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
		freq <- table(clust[which(clust %in% neuronClustIdx & apply(binarized_a10pcw, 1, sum) > 0)]==cl, x)
		tab[rownames(freq), colnames(freq)] <- freq
		fisher.test(tab, alternative="g")$p.value
	}) })
	rownames(p1_neurons_a10pcw) <- neuronClustIdx
	rownames(p2_neurons_a10pcw) <- neuronClustIdx
	if (verbose) cat("PROGRESS: finished - neurons\n\n")
	
	## RSS to ABA fetal before 10pcw, for progenitors
	if (verbose) cat("PROGRESS: binarize RSS (progenitors) and test for cluster enrichment\n")
	p1_progenitors_b10pcw <- sapply(1:ncol(RSS_b10pcw_grouped), function(i){ sapply(progenitorClustIdx, function(cl){
		tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
		freq <- table(clust[clust %in% progenitorClustIdx]==cl, apply(RSS_b10pcw_grouped[clust %in% progenitorClustIdx,], 1, which.max) == i)
		tab[rownames(freq), colnames(freq)] <- freq
		fisher.test(tab, alternative="g")$p.value
	}) })
	colnames(p1_progenitors_b10pcw) <- colnames(RSS_b10pcw_grouped)
	
	if (is.null(thresBinB10) | sum(length(thresBinB10) %in% c(1, ncol(RSS_b10pcw_grouped))) == 0){
		thresBinB10 <- RefSimSpec::thresBinB10[[strucGroups]]$X0.85
	} else if (sum(thresBinB10 %in% colnames(RefSimSpec::thresBinB10[[strucGroups]])) == 1){
		thresBinB10 <- RefSimSpec::thresBinB10[[strucGroups]][,thresBinB10]
	}
	binarized_b10pcw <- t(apply(RSS_b10pcw_grouped, 1, function(x) x > thresBinB10 ))
	p2_progenitors_b10pcw <- apply(binarized_b10pcw[which(clust %in% progenitorClustIdx & apply(binarized_b10pcw, 1, sum) > 0),], 2, function(x){ sapply(progenitorClustIdx, function(cl){
		tab <- matrix(rep(0,4), nrow=2); rownames(tab) <- c("FALSE", "TRUE"); colnames(tab) <- c("FALSE", "TRUE")
		freq <- table(clust[which(clust %in% progenitorClustIdx & apply(binarized_b10pcw, 1, sum) > 0)]==cl, x)
		tab[rownames(freq), colnames(freq)] <- freq
		fisher.test(tab, alternative="g")$p.value
	}) })
	rownames(p1_progenitors_b10pcw) <- progenitorClustIdx
	rownames(p2_progenitors_b10pcw) <- progenitorClustIdx
	if (verbose) cat("PROGRESS: finished - progenitors\n\n")
	
	## output
	res <- list(progenitors = list(RSS = RSS_b10pcw_grouped, binRSS = binarized_b10pcw, p1 = p1_progenitors_b10pcw, p2 = p2_progenitors_b10pcw), neurons = list(RSS = RSS_a10pcw_grouped, binRSS = binarized_a10pcw, p1 = p1_neurons_a10pcw, p2 = p2_neurons_a10pcw))
	return(res)
}
maplesword/RefSimSpec documentation built on May 23, 2019, 1:47 p.m.