#' Find genes with high biological coefficent of variation
#'
#' @description find_highbcv_genes() return genes with a high biological coefficent of variation (bcv) from a
#' monocle CellDataSet object, subject to a number of thresholds including expression in number of cells,
#' expression levels and residuals.
#' @param cds a monocle CellDataSet object
#' @param annotated_fData annotated fData object produced by the cellwrangler annotate_bcv() function; if NULL,
#' parameters of bcv will be calculated from cds provided.
#' @param scaled_matrix a scaled expression matrix derived from cds provided. Exprs(cds) should be divided by
#' the colsums and multiplied by 10,000.
#' @param cell_threshold threshold for expression in number of cells
#' @param exprs_threshold mean expression threshold for expression levels
#' @param resid_threshold threshold for residuals
#' @keywords find_highbcv_genes
#' @export
#' @examples
#' find_highbcv_genes(dat)
find_highbcv_genes <- function(cds, annotated_fData = NULL, scaled_matrix = NULL, cell_threshold=10, exprs_threshold=0.001,
resid_threshold=0.2) {
if(is.null(annotated_fData) == T) {
cds <- cds
if(is.null(scaled_matrix) == F) {
scaled_mtx <- scaled_matrix
} else {
col_sums<-Matrix::colSums(exprs(cds))
scaled_mtx <- Matrix::t( Matrix::t( exprs(cds) )/col_sums)*10000
}
fData(cds)$mean_exprs <- Matrix::rowMeans(scaled_mtx)
cds_subsets<- lapply(seq(1,ceiling(nrow(cds)/10000),1), function(x){
tmp <- scaled_mtx[((x-1)*10000+1):min(c((x)*10000, max(nrow(cds)))),]
return(tmp) })
subset_sd <- lapply(cds_subsets, function(x){
tmp <- as.matrix(apply(x,1,sd))
return(tmp)
} )
cds_sd <- do.call(rbind, subset_sd)
print(table(rownames(cds_sd) == rownames(exprs(cds))))
fData(cds)$sd_expr <- cds_sd
fData(cds)$bcv<-(fData(cds)$sd_expr/fData(cds)$mean_exprs)**2
fData(cds)$num_cells_expressed<-Matrix::rowSums(scaled_mtx> 0 )
fData(cds)$percent_detection<-(fData(cds)$num_cells_expressed/dim(cds)[2])*100
annotated_fData <- fData(cds)
} else { annotated_fData <- annotated_fData }
#fit model
cds_fit<-mgcv::gam(log(bcv)~s(log(mean_exprs)),data=annotated_fData)
cds_expressed_genes<-rownames(subset(annotated_fData,num_cells_expressed >= cell_threshold & mean_exprs > exprs_threshold))
high_bcv_genes_idx<-resid(cds_fit) > resid_threshold
high_bcv_genes<-intersect(rownames(resid(cds_fit))[high_bcv_genes_idx],cds_expressed_genes)
return(high_bcv_genes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.