#' @title Batch Effects Evaluation and Correction
#'
#'
#' @description Evaluates if the batch effects with a gene expression dataset using linear
#' regression and if the batch is associated with the data, the data is batch-corrected
#' using the ComBat Algorithm.
#'
#' Please set your working directory before you call the function as all output files will be saved to this folder.
#'
#'
#'
#' @param expr gene expression data; rows should be genes and columns should be samples.
#' @param batch.info contains the batch information. The first column should be sample names.
#' @param batch The title of the batch for which you want to evaluate and do the
#' correction. Default = "Batch"
#' @param NameString String that will be added in all output filenames. Default=NA.
#' @param discrete.batch Logical value indicating whether the samples are already
#' grouped in discrete batches. If the value is FALSE, contiguous batch information is
#' clustered into discrete batches. Useful for clustering batch variables that are
#' contiguous, for example, used reads and useful reads in mClust. Default = TRUE
#' @param cond The column name in the *batch.info* data which denotes a biological
#' condition variable that may need to be considered for batch correction.
#' @param clus.method Method to be used for clustering. "km" denotes k-means clustering, "NMF"
#' denotes NMF with 30 runs. Default employs both the methods, using *clus.method= c("NMF", "km")*.
#' @param nrun.NMF number of runs for NMF. Default = 30.
#'
#' @return the batch-corrected dataset is returned.
#'
#' @import utils
#' @import mclust
#'
#' @examples
#' \dontrun{
#' beacon(expr1 = dataset,
#' batch.info = batch_info,
#' batch = "Batch",
#' discrete.batch = TRUE)
#' }
#'
#'
#' @export
BatchEC <- function(expr, batch.info, batch="Batch", NameString = "", discrete.batch = TRUE, cond="", clus.method = c("NMF", "km"), nrun.NMF = 30){
#matching batch and condition
if(cond==""){
mat <- match(batch, colnames(batch.info))
batch.info <- batch.info [,c(1, mat)]
} else if(cond!=""){
mat <- match(batch, colnames(batch.info))
mat2 <- match(cond, colnames(batch.info))
batch.info <- batch.info [,c(1, mat, mat2)]
}
if (discrete.batch== FALSE){
batch.info[,2] <- mclust_cluster(batch.info[,2])
#writing the batch cluster information to file
date <- as.character(format(Sys.Date(), "%Y%m%d"))
clusterInfoFile <- paste0(date, "_", NameString, "_", batch, "_batch_info_mClust.txt")
print(paste0("Writing the batch information from mClust for ", batch, " to: ", clusterInfoFile))
write.table(batch.info,
file = clusterInfoFile,
sep = "\t",
col.names = TRUE,
row.names = FALSE)
}
batch.info[,2]<- as.factor(batch.info[,2])
expr1<-expr
exprData1 <- t(expr1)
#linear regression before batch correction
lm_before <- lin_reg(exprData=exprData1,
batch.info = batch.info,
batch=batch,
NameString = NameString,
when = "before",
return.plot =TRUE)
#checking if any of the p values are less than 0.05
if(all(as.numeric(lm_before[[1]][,2])>0.05)){
stop("Halting execution since batch is not associated with data...")
}else{
print("Batch is associated with the data...")
#pca with batch before correction
if(cond==""){
pca_batch_before<- pca_batch (exprData = exprData1,
batch.info= batch.info,
batch= batch,
NameString = NameString,
when = "before_ComBat_correction",
return.plot=TRUE)
}else if (cond!=""){
pca_batch_before<- pca_batch (exprData = exprData1,
batch.info= batch.info,
batch= batch,
cond=cond,
NameString = NameString,
when = "before_ComBat_correction",
return.plot=TRUE)
}
if(any(clus.method=="km")){
#pca with batch and kmeans before batch correction
km_before <- kmeans_PCA(exprData = exprData1,
batch.info = batch.info,
batch = batch,
NameString = NameString,
when = "before_correction",
return.plot=TRUE)
}
if(any(clus.method=="NMF")){
NMF_before <- NMF_PCA(expr=expr1,
batch.info = batch.info,
nrun = nrun.NMF,
batch = batch,
NameString = NameString,
when = "before_correction",
return.plot = TRUE)
}
#Batch Correction using ComBat
if(cond==""){
expr2 <- ComBat_data (expr = expr1,
batch.info= batch.info,
batch = batch,
NameString = NameString)
} else if (cond!=""){
expr2 <- ComBat_data (expr = expr1,
batch.info= batch.info,
batch = batch,
NameString = NameString,
cond=cond)
}
exprData2 <- t(expr2)
#linear regression after batch correction
lm_after <- lin_reg(exprData=exprData2,
batch.info = batch.info,
batch=batch,
NameString = NameString,
when = "after",
return.plot=TRUE)
#pca with batch after correction
if(cond==""){
pca_batch_after<- pca_batch (exprData = exprData2,
batch.info= batch.info,
batch= batch,
NameString = NameString,
when = "after_ComBat_correction",
return.plot=TRUE)
} else if (cond!=""){
pca_batch_after<- pca_batch (exprData = exprData2,
batch.info= batch.info,
batch= batch,
cond=cond,
NameString = NameString,
when = "after_ComBat_correction",
return.plot=TRUE)
}
if(any(clus.method=="km")){
#pca with batch and k-means after correction
km_after <- kmeans_PCA(exprData = exprData2,
batch.info = batch.info,
batch = batch,
NameString = NameString,
when = "after_correction",
return.plot=TRUE)
}
if(any(clus.method=="NMF")){
#NMF with PCA after correction
NMF_after <- NMF_PCA(expr=expr2,
batch.info = batch.info,
nrun = nrun.NMF,
batch = batch,
NameString = NameString,
when = "after_correction",
return.plot = TRUE)
}
#PCA Proportion of Variation
prop_var_plot <- pca_prop_var(batch.title = batch,
exprData1 = exprData1,
exprData2 = exprData2,
NameString = NameString,
return.plot=TRUE)
#pearson correlation scatter plot
correlationPlot(exprData1 = exprData1,
exprData2 = exprData2,
batch = batch,
NameString = NameString)
#boxplots before and after batch correction
exp_boxplot_before <- boxplot_data (expr = expr1,
when = "before",
NameString = NameString,
batch = batch,
return.plot =TRUE)
exp_boxplot_after <-boxplot_data (expr = as.data.frame(expr2),
when = "after",
NameString = NameString,
batch = batch,
return.plot =TRUE)
}
print("========================================================")
date <- as.character(format(Sys.Date(), "%Y%m%d"))
PCA_plot_file <- ifelse(NameString =="",
paste0(date, "_plot_", batch, "_PCA_plots.pdf"),
paste0(date, "_plot_", NameString, "_", batch, "_PCA_plots.pdf"))
boxplot_file <- ifelse(NameString =="",
paste0(date, "_plot_", batch, "_expr_plots.pdf"),
paste0(date, "_plot_", NameString, "_", batch, "_expr_boxplots.pdf"))
print("Saving all plots!")
pdf(PCA_plot_file)
plot(lm_before[[2]])
plot(lm_before[[3]])
plot(pca_batch_before)
if(any(clus.method=="km")){
plot(km_before[[2]])
plot(km_before[[3]])}
if(any(clus.method=="NMF")){
plot(NMF_before[[2]])
plot(NMF_before[[3]])}
plot(lm_after[[2]])
plot(lm_after[[3]])
plot(pca_batch_after)
if(any(clus.method=="km")){
plot(km_after[[2]])
plot(km_after[[3]])}
if(any(clus.method=="NMF")){
plot(NMF_after[[2]])
plot(NMF_after[[3]])}
plot(prop_var_plot)
dev.off()
pdf(boxplot_file, height = 9, width =12)
plot(exp_boxplot_before)
plot(exp_boxplot_after)
dev.off()
print("========================================================")
return(expr2)
print(sessionInfo())
}
#mClust turns contiguous data into discrete batches for batch correction
mclust_cluster <- function(data){
yBIC2_Q <-mclust::mclustBIC(as.numeric(data),G=1:8)
rs2_Q <-summary.mclustBIC(object = yBIC2_Q,data = as.numeric(data))
tau_Q <-rs2_Q$classification
return(tau_Q)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.