#' Main function for estimating ASE in single-cell data
#'
#' Fits a beta-binomial model for each gene and returns a results data frame.
#'
#' @description This function fits a beta-binomial model and returns estimates
#' of p and confidence intervals
#'
#' @param matrix1 a matrix of counts where the rows are genes and the columns
#' are cells for allele 1. This one is the one that gets its probability
#' modeled. Must have row names and column names to identify genes and cells.
#'
#' @param matrix2 a matrix of counts where the rows are genes and the columns
#' are cells for allele 2.
#'
#' @param min.cells numeric specifying the minimum number of cells a gene
#' should be present on to fit. Default is 10 cells.
#'
#' @param cores number of cores to use for parallelization. Default is 1.
#'
#' @param genes which genes to fit the model on, should be a subset of the
#' rownames of matrix1, default is all of the genes
#'
#' @param add.var amount of additional variance to add, default is 0. If
#' non-zero, addds additional columns to result data frame suffixed with .adj
#' to indicate calculated with the adjusted standard error
#'
#' @param verbose whether or not to print a lot of status messages. Default is
#' FALSE.
#'
#' @return A data frame of results containing the gene names, estimated p and
#' standard errors on the logit scale.
#'
#' @importFrom aod betabin
#' @importFrom dplyr left_join
#' @import foreach
#' @import parallel
#' @import doSNOW
#'
#'
#' @export
#'
#'
scase <- function(matrix1, matrix2, covariates=NULL, method='betabinomial',
min.cells = 10, cores = 1, genes = NULL, add.var=0,
verbose = F) {
if(!(is(matrix1, 'matrix') | is(matrix1, 'sparseMatrix')) &
!(is(matrix2, 'matrix') | is(matrix2, 'sparseMatrix'))) {
stop('input matrix1 and matrix2 must be a matrix or sparseMatrix')
}
if (ncol(matrix1)!=ncol(matrix2)) {
stop('matrices dont have same number of columns')
}
if (nrow(matrix1) != nrow(matrix2)) {
stop('matrices dont have same number of rows')
}
if (is.null(rownames(matrix1))) {
stop('input matrix1 doesnt have rownames')
}
if (is.null(colnames(matrix1))) {
stop('input matrix1 doesnt have colnames')
}
if (!is.null(covariates)) {
colnames(covariates)[1] <- 'cell'
message(paste('assuming covariates first column is cell, using',
colnames(covariates[,-1]), 'as baseline covariates'))
if (any(!sapply(covariates[,-1], is.factor))) {
stop('cannot handle non-factor covariates rn; either convert all covariates
to factor or remove non-factor columns')
}
}
cl <- makeCluster(cores)
registerDoSNOW(cl)
# Smart-seq uses NAs in the matrices which are different from 0's;
# make sure don't use cells for which gene counts are NA in one allele
if (any(is.na(matrix1))) {
warning('Warning: matrix1 contains NAs. Setting these positions to NA
in matrix2.')
matrix2[is.na(matrix1)] <- NA
}
if (any(is.na(matrix2))) {
warning('Warning: matrix2 contains NAs. Setting these positions to NA
in matrix1.')
matrix1[is.na(matrix2)] <- NA
}
numcells <- rowSums((matrix1>0)|(matrix2>0), na.rm=T)
remove.idx <- which(numcells < min.cells)
if (length(remove.idx)!=0) {
matrix1 <- matrix1[-remove.idx,]; matrix2 <- matrix2[-remove.idx,]
}
if (is.null(genes)) {
genes <- rownames(matrix1)
message(paste(nrow(matrix1), 'genes pass min threshold of', min.cells, 'cells'))
} else {
message('using user-supplied genes')
}
if (verbose) {
pb <- txtProgressBar(max = length(genes), style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
} else {
opts <- list()
}
result <- foreach(
i = 1:length(genes),
.combine = rbind,
.options.snow = opts) %dopar% {
y <- matrix1[genes[i],]
idx <- !is.na(y)
y <- y[idx]
total <- y + matrix2[genes[i],idx]
y <- y[total > 0]; total <- total[total > 0]
n <- sum(total)
if (is.null(covariates)) {
if (all(y==total)) {
return(data.frame(gene = genes[i], totalUMI = n, totalCells = length(y),
logit.p = NA, logit.p.sd = NA,
phi = NA, phi.sd = NA, flag = 'monoallelic1'))
} else if (sum(y)==0) {
return(data.frame(gene = genes[i], totalUMI = n, totalCells = length(y),
logit.p = NA, logit.p.sd = NA,
phi = NA, phi.sd = NA, flag = 'monoallelic2'))
}
if (method=='betabinomial') {
fit <- betabinase(y, total)
} else if (method=='quasibinomial') {
fit <- quasibinase(y, total)
}
res <- fit$result
if (!is.null(res)){
if (method=='betabinomial') {
logit.p <- unname(res@param['(Intercept)'])
logit.p.sd <- sqrt(res@varparam[1,1])
phi <- unname(res@param['phi.(Intercept)'])
phi.sd <- sqrt(res@varparam[2,2])
} else if (method=='quasibinomial') {
logit.p <- coef(res)['(Intercept)']
logit.p.sd <- sqrt(vcov(res)[1,1])
phi <- summary(res)$dispersion
phi.sd <- NA
}
return(data.frame(gene = genes[i], totalUMI = n, totalCells = length(y),
logit.p = logit.p,
logit.p.sd = logit.p.sd,
phi = phi,
phi.sd = phi.sd,
flag = ''))
} else {
return(data.frame(gene = genes[i], totalUMI = n, totalCells = length(y),
logit.p = NA, logit.p.sd = NA,
phi = NA, phi.sd = NA, flag = 'conv'))
}
} else {
if (all(y==total)) {
return(data.frame(gene = genes[i], factor = NA, lvl = NA, totalUMI = n,
totalCells = length(y),logit.p = NA, logit.p.sd = NA,
phi = NA, phi.sd = NA, flag = 'monoallelic1'))
} else if (sum(y)==0) {
return(data.frame(gene = genes[i], factor = NA, lvl = NA, totalUMI = n,
totalCells = length(y),logit.p = NA, logit.p.sd = NA,
phi = NA, phi.sd = NA, flag = 'monoallelic2'))
}
covari <- left_join(data.frame(cell=names(y)), covariates, by='cell')
baseline.covari <- covari[,-1,drop=F]
bcov.names <- colnames(baseline.covari)
# fit one model to each cell type and return one entry per gene per level
dfres <- NULL
for (b in bcov.names) {
lvls <- table(baseline.covari[,b])
lvls <- names(lvls[lvls>min.cells])
for (l in lvls) {
ii <- which(baseline.covari[,b]==l)
n <- sum(total[ii])
ncell <- length(ii)
if (method=='betabinomial') {
fit <- betabinase(y[ii], total[ii])
} else if (method=='quasibinomial') {
fit <- quasibinase(y[ii], total[ii])
}
res <- fit$result
if (!is.null(res)) {
if (method=='betabinomial') {
logit.p <- unname(res@param['(Intercept)'])
logit.p.sd <- sqrt(res@varparam[1,1])
phi <- unname(res@param['phi.(Intercept)'])
phi.sd <- sqrt(res@varparam[2,2])
} else if (method=='quasibinomial') {
logit.p <- unname(coef(res)['(Intercept)'])
logit.p.sd <- sqrt(vcov(res)[1,1])
phi <- summary(res)$dispersion
phi.sd <- NA
}
if (is.null(dfres)) {
dfres <- data.frame(gene = genes[i], factor = b, lvl = l, totalUMI = n,
totalCells = ncell,
logit.p = logit.p,
logit.p.sd = logit.p.sd,
phi = phi,
phi.sd = phi.sd, flag = '')
} else {
dfres <- rbind(dfres,
data.frame(gene = genes[i], factor = b, lvl = l, totalUMI = n,
totalCells = ncell,
logit.p = logit.p,
logit.p.sd = logit.p.sd,
phi = phi,
phi.sd = phi.sd, flag = ''))
}
}
}
}
return(dfres)
}
}
stopCluster(cl)
result$p <- expit(result$logit.p)
result$ci.low <- expit(result$logit.p - 2*result$logit.p.sd)
result$ci.high <- expit(result$logit.p + 2*result$logit.p.sd)
result$z <- result$logit.p/result$logit.p.sd
result$pval <- pnorm(abs(result$z), lower.tail=F)
result$qval <- p.adjust(result$pval, method='BH')
if (add.var > 0) {
result$logit.p.sd.adj <- sqrt(result$logit.p.sd^2 + add.var)
result$ci.low.adj <- expit(result$logit.p-2*result$logit.p.sd.adj)
result$ci.high.adj <- expit(result$logit.p+2*result$logit.p.sd.adj)
result$z.adj <- result$logit.p/result$logit.p.sd.adj
result$pval.adj <- pnorm(abs(result$z.adj), lower.tail=F)
result$qval.adj <- p.adjust(result$pval.adj, method='BH')
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.