Nothing
#' Adjust for batch effects using an empirical Bayes framework in RNA-seq raw counts
#'
#' ComBat_seq is an improved model from ComBat using negative binomial regression,
#' which specifically targets RNA-Seq count data.
#'
#' @param counts Raw count matrix from genomic studies (dimensions gene x sample)
#' @param batch Vector / factor for batch
#' @param group Vector / factor for biological condition of interest
#' @param covar_mod Model matrix for multiple covariates to include in linear model (signals from these variables are kept in data after adjustment)
#' @param full_mod Boolean, if TRUE include condition of interest in model
#' @param shrink Boolean, whether to apply shrinkage on parameter estimation
#' @param shrink.disp Boolean, whether to apply shrinkage on dispersion
#' @param gene.subset.n Number of genes to use in empirical Bayes estimation, only useful when shrink = TRUE
#'
#' @return data A gene x sample count matrix, adjusted for batch effects.
#'
#' @importFrom edgeR DGEList estimateGLMCommonDisp estimateGLMTagwiseDisp glmFit glmFit.default getOffset
#' @importFrom stats dnbinom lm pnbinom qnbinom
#' @importFrom utils capture.output
#'
#' @examples
#'
#' count_matrix <- matrix(rnbinom(400, size=10, prob=0.1), nrow=50, ncol=8)
#' batch <- c(rep(1, 4), rep(2, 4))
#' group <- rep(c(0,1), 4)
#'
#' # include condition (group variable)
#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=group, full_mod=TRUE)
#'
#' # do not include condition
#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=NULL, full_mod=FALSE)
#'
#' @export
#'
ComBat_seq <- function(counts, batch, group=NULL, covar_mod=NULL, full_mod=TRUE,
shrink=FALSE, shrink.disp=FALSE, gene.subset.n=NULL){
######## Preparation ########
## Does not support 1 sample per batch yet
batch <- as.factor(batch)
if(any(table(batch)<=1)){
stop("ComBat-seq doesn't support 1 sample per batch yet")
}
## Remove genes with only 0 counts in any batch
keep_lst <- lapply(levels(batch), function(b){
which(apply(counts[, batch==b], 1, function(x){!all(x==0)}))
})
keep <- Reduce(intersect, keep_lst)
rm <- setdiff(1:nrow(counts), keep)
countsOri <- counts
counts <- counts[keep, ]
# require bioconductor 3.7, edgeR 3.22.1
dge_obj <- DGEList(counts=counts)
## Prepare characteristics on batches
n_batch <- nlevels(batch) # number of batches
batches_ind <- lapply(1:n_batch, function(i){which(batch==levels(batch)[i])}) # list of samples in each batch
n_batches <- sapply(batches_ind, length)
#if(any(n_batches==1)){mean_only=TRUE; cat("Note: one batch has only one sample, setting mean.only=TRUE\n")}
n_sample <- sum(n_batches)
cat("Found",n_batch,'batches\n')
## Make design matrix
# batch
batchmod <- model.matrix(~-1+batch) # colnames: levels(batch)
# covariate
group <- as.factor(group)
if(full_mod & nlevels(group)>1){
cat("Using full model in ComBat-seq.\n")
mod <- model.matrix(~group)
}else{
cat("Using null model in ComBat-seq.\n")
mod <- model.matrix(~1, data=as.data.frame(t(counts)))
}
# drop intercept in covariate model
if(!is.null(covar_mod)){
if(is.data.frame(covar_mod)){
covar_mod <- do.call(cbind, lapply(1:ncol(covar_mod), function(i){model.matrix(~covar_mod[,i])}))
}
covar_mod <- covar_mod[, !apply(covar_mod, 2, function(x){all(x==1)})]
}
# bind with biological condition of interest
mod <- cbind(mod, covar_mod)
# combine
design <- cbind(batchmod, mod)
## Check for intercept in covariates, and drop if present
check <- apply(design, 2, function(x) all(x == 1))
#if(!is.null(ref)){check[ref]=FALSE} ## except don't throw away the reference batch indicator
design <- as.matrix(design[,!check])
cat("Adjusting for",ncol(design)-ncol(batchmod),'covariate(s) or covariate level(s)\n')
## Check if the design is confounded
if(qr(design)$rank<ncol(design)){
#if(ncol(design)<=(n_batch)){stop("Batch variables are redundant! Remove one or more of the batch variables so they are no longer confounded")}
if(ncol(design)==(n_batch+1)){stop("The covariate is confounded with batch! Remove the covariate and rerun ComBat-Seq")}
if(ncol(design)>(n_batch+1)){
if((qr(design[,-c(1:n_batch)])$rank<ncol(design[,-c(1:n_batch)]))){stop('The covariates are confounded! Please remove one or more of the covariates so the design is not confounded')
}else{stop("At least one covariate is confounded with batch! Please remove confounded covariates and rerun ComBat-Seq")}}
}
## Check for missing values in count matrix
NAs = any(is.na(counts))
if(NAs){cat(c('Found',sum(is.na(counts)),'Missing Data Values\n'),sep=' ')}
######## Estimate gene-wise dispersions within each batch ########
cat("Estimating dispersions\n")
## Estimate common dispersion within each batch as an initial value
disp_common <- sapply(1:n_batch, function(i){
if((n_batches[i] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[i]], ])$rank < ncol(mod)){
# not enough residual degree of freedom
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=NULL, subset=nrow(counts)))
}else{
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=mod[batches_ind[[i]], ], subset=nrow(counts)))
}
})
## Estimate gene-wise dispersion within each batch
genewise_disp_lst <- lapply(1:n_batch, function(j){
if((n_batches[j] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[j]], ])$rank < ncol(mod)){
# not enough residual degrees of freedom - use the common dispersion
return(rep(disp_common[j], nrow(counts)))
}else{
return(estimateGLMTagwiseDisp(counts[, batches_ind[[j]]], design=mod[batches_ind[[j]], ],
dispersion=disp_common[j], prior.df=0))
}
})
names(genewise_disp_lst) <- paste0('batch', levels(batch))
## construct dispersion matrix
phi_matrix <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(k in 1:n_batch){
phi_matrix[, batches_ind[[k]]] <- vec2mat(genewise_disp_lst[[k]], n_batches[k])
}
######## Estimate parameters from NB GLM ########
cat("Fitting the GLM model\n")
glm_f <- glmFit(dge_obj, design=design, dispersion=phi_matrix, prior.count=1e-4) #no intercept - nonEstimable; compute offset (library sizes) within function
alpha_g <- glm_f$coefficients[, 1:n_batch] %*% as.matrix(n_batches/n_sample) #compute intercept as batch-size-weighted average from batches
new_offset <- t(vec2mat(getOffset(dge_obj), nrow(counts))) + # original offset - sample (library) size
vec2mat(alpha_g, ncol(counts)) # new offset - gene background expression # getOffset(dge_obj) is the same as log(dge_obj$samples$lib.size)
glm_f2 <- glmFit.default(dge_obj$counts, design=design, dispersion=phi_matrix, offset=new_offset, prior.count=1e-4)
gamma_hat <- glm_f2$coefficients[, 1:n_batch]
mu_hat <- glm_f2$fitted.values
phi_hat <- do.call(cbind, genewise_disp_lst)
######## In each batch, compute posterior estimation through Monte-Carlo integration ########
if(shrink){
cat("Apply shrinkage - computing posterior estimates for parameters\n")
mcint_fun <- monte_carlo_int_NB
monte_carlo_res <- lapply(1:n_batch, function(ii){
if(ii==1){
mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]],
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)
}else{
invisible(capture.output(mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]],
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)))
}
return(mcres)
})
names(monte_carlo_res) <- paste0('batch', levels(batch))
gamma_star_mat <- lapply(monte_carlo_res, function(res){res$gamma_star})
gamma_star_mat <- do.call(cbind, gamma_star_mat)
phi_star_mat <- lapply(monte_carlo_res, function(res){res$phi_star})
phi_star_mat <- do.call(cbind, phi_star_mat)
if(!shrink.disp){
cat("Apply shrinkage to mean only\n")
phi_star_mat <- phi_hat
}
}else{
cat("Shrinkage off - using GLM estimates for parameters\n")
gamma_star_mat <- gamma_hat
phi_star_mat <- phi_hat
}
######## Obtain adjusted batch-free distribution ########
mu_star <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(jj in 1:n_batch){
mu_star[, batches_ind[[jj]]] <- exp(log(mu_hat[, batches_ind[[jj]]])-vec2mat(gamma_star_mat[, jj], n_batches[jj]))
}
phi_star <- rowMeans(phi_star_mat)
######## Adjust the data ########
cat("Adjusting the data\n")
adjust_counts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(kk in 1:n_batch){
counts_sub <- counts[, batches_ind[[kk]]]
old_mu <- mu_hat[, batches_ind[[kk]]]
old_phi <- phi_hat[, kk]
new_mu <- mu_star[, batches_ind[[kk]]]
new_phi <- phi_star
adjust_counts[, batches_ind[[kk]]] <- match_quantiles(counts_sub=counts_sub,
old_mu=old_mu, old_phi=old_phi,
new_mu=new_mu, new_phi=new_phi)
}
#dimnames(adjust_counts) <- dimnames(counts)
#return(adjust_counts)
## Add back genes with only 0 counts in any batch (so that dimensions won't change)
adjust_counts_whole <- matrix(NA, nrow=nrow(countsOri), ncol=ncol(countsOri))
dimnames(adjust_counts_whole) <- dimnames(countsOri)
adjust_counts_whole[keep, ] <- adjust_counts
adjust_counts_whole[rm, ] <- countsOri[rm, ]
return(adjust_counts_whole)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.