BatchEC.R

#' @title Batch Effects Evaluation and Correction
#'
#'
#' @description Evaluates if the batch effects with a gene expression dataset using linear
#' regression and if the batch is associated with the data, the data is batch-corrected
#' using the ComBat Algorithm.
#'
#' Please set your working directory before you call the function as all output files will be saved to this folder.
#'
#'
#'
#' @param expr gene expression data; rows should be genes and columns should be samples.
#' @param batch.info contains the batch information. The first column should be sample names.
#' @param batch The title of the batch for which you want to evaluate and do the
#'  correction. Default = "Batch"
#' @param NameString  String that will be added in all output filenames. Default=NA.
#' @param discrete.batch Logical value indicating whether the samples are already
#' grouped in discrete batches. If the value is FALSE, contiguous batch information is
#' clustered into discrete batches. Useful for clustering batch variables that are
#' contiguous, for example, used reads and useful reads in mClust. Default = TRUE
#' @param cond.input Logical value indicating if there is any condition variables other
#' than batch which need to be included in the analysis. Default = FALSE
#' @param cond The column name in the *batch.info* data which denotes a biological
#' condition variable that may need to be considered for batch correction.
#' @param clus.method Method to be used for clustering. "km" denotes k-means clustering, "NMF"
#' denotes NMF with 30 runs. Default employs both the methods, using *clus.method= c("NMF", "km")*.
#' @param nrun.NMF number of runs for NMF. Default = 30.
#'
#' @return the batch-corrected dataset is returned.
#'
#' @import utils
#' @import mclust
#'
#' @examples
#' \dontrun{
#' beacon(expr1 = dataset,
#'        batch.info = batch_info,
#'        batch = "Batch",
#'        discrete.batch = TRUE)
#' }
#'
#'
#' @export
BatchEC <- function(expr, batch.info, batch="Batch", NameString = "", discrete.batch = TRUE, cond.input = FALSE, cond="", clus.method = c("NMF", "km"), nrun.NMF = 30){

  #matching batch and condition
  if(cond.input ==FALSE && cond==""){
  mat <- match(batch, colnames(batch.info))
  batch.info <- batch.info [,c(1, mat)]
  } else if(cond.input==TRUE && cond!=""){
    mat <- match(batch, colnames(batch.info))
    mat2 <- match(cond, colnames(batch.info))
    batch.info <- batch.info [,c(1, mat, mat2)]
  } else{
    stop("cond.input and cond variables are contrasting. Add a condition title
         if cond.input = TRUE or if cond.input=FALSE, remove the condition title!")
  }

  if (discrete.batch== FALSE){
    batch.info[,2] <- mclust_cluster(batch.info[,2])

    #writing the batch cluster information to file
    date <- as.character(format(Sys.Date(), "%Y%m%d"))
    clusterInfoFile <- paste0(date, "_", NameString, "_", batch, "_batch_info_mClust.txt")
    print(paste0("Writing the batch information from mClust for ", batch, " to: ", clusterInfoFile))
    write.table(batch.info,
                file = clusterInfoFile,
                sep = "\t",
                col.names = TRUE,
                row.names = FALSE)
  }
  batch.info[,2]<- as.factor(batch.info[,2])


  #removing genes with zero variance
  expr1<- expr
  std_genes <- apply(expr1, MARGIN =1, sd)
  genes_sd_0 <- length(which (std_genes ==0))
  remgenes <- length (std_genes) - genes_sd_0
  expr1 <- expr1[which (std_genes > 0),]
  print(paste0("Removed ", genes_sd_0, " genes with zero variance..."))
  print (paste0(remgenes, " genes remain..."))


  exprData1 <- t(expr1)
  #linear regression before batch correction
  lm_before <- lin_reg(exprData=exprData1,
                       batch.info = batch.info,
                       batch=batch,
                       NameString = NameString,
                       when = "before",
                       return.plot =TRUE)

  #checking if any of the p values are less than 0.05
  if(all(as.numeric(lm_before[[1]][,2])>0.05)){
    stop("Halting execution since batch is not associated with data...")

  }else{

    print("Batch is associated with the data...")

    #pca with batch before correction
    pca_batch_before<- pca_batch (exprData = exprData1,
                                  batch.info= batch.info,
                                  batch= batch,
                                  NameString = NameString,
                                  when = "before_ComBat_correction",
                                  return.plot=TRUE)

    if(any(clus.method=="km")){
      #pca with batch and kmeans before batch correction
      km_before <- kmeans_PCA(exprData = exprData1,
                              batch.info = batch.info,
                              batch = batch,
                              NameString = NameString,
                              when = "before_correction",
                              return.plot=TRUE)
    }
    if(any(clus.method=="NMF")){
      NMF_before <- NMF_PCA(expr=expr1,
                            batch.info = batch.info,
                            nrun = nrun.NMF,
                            batch = batch,
                            NameString = NameString,
                            when = "before_correction",
                            return.plot = TRUE)
    }

    #Batch Correction using ComBat
    if(cond.input==FALSE){
      expr2 <- ComBat_data (expr = expr1,
                            batch.info= batch.info,
                            batch = batch,
                            NameString = NameString,
                            cond.input=FALSE)

    } else if (cond.input==TRUE){
      expr2 <- ComBat_data (expr = expr1,
                            batch.info= batch.info,
                            batch = batch,
                            NameString = NameString,
                            cond.input=TRUE,
                            cond=cond)
    } else{
      stop("cond.input is a logical value, indicate TRUE or FALSE")
    }
    exprData2 <- t(expr2)

    #linear regression after batch correction
    lm_after <- lin_reg(exprData=exprData2,
                        batch.info = batch.info,
                        batch=batch,
                        NameString = NameString,
                        when = "after",
                        return.plot=TRUE)

    #pca with batch after correction
    pca_batch_after<- pca_batch (exprData = exprData2,
                                 batch.info= batch.info,
                                 batch= batch,
                                 NameString = NameString,
                                 when = "after_ComBat_correction",
                                 return.plot=TRUE)

    if(any(clus.method=="km")){
      #pca with batch and k-means after correction
      km_after <- kmeans_PCA(exprData = exprData2,
                             batch.info = batch.info,
                             batch = batch,
                             NameString = NameString,
                             when = "after_correction",
                             return.plot=TRUE)
    }

    if(any(clus.method=="NMF")){
      #NMF with PCA after correction
      NMF_after <- NMF_PCA(expr=expr2,
                           batch.info = batch.info,
                           nrun = nrun.NMF,
                           batch = batch,
                           NameString = NameString,
                           when = "after_correction",
                           return.plot = TRUE)
    }

    #PCA Proportion of Variation
    prop_var_plot <- pca_prop_var(batch.title = batch,
                                  exprData1 = exprData1,
                                  exprData2 = exprData2,
                                  NameString = NameString,
                                  return.plot=TRUE)

    #pearson correlation scatter plot
    correlationPlot(exprData1 = exprData1,
                    exprData2 = exprData2,
                    batch = batch,
                    NameString = NameString)

    #boxplots before and after batch correction
    exp_boxplot_before <- boxplot_data (expr = expr1,
                                        when = "before",
                                        NameString = NameString,
                                        batch = batch,
                                        return.plot =TRUE)

    exp_boxplot_after <-boxplot_data (expr = as.data.frame(expr2),
                                      when = "after",
                                      NameString = NameString,
                                      batch = batch,
                                      return.plot =TRUE)

  }
  print("========================================================")
  date <- as.character(format(Sys.Date(), "%Y%m%d"))
  final_plot_file <-  ifelse(NameString =="",
                             paste0(date, "_plot_", batch, "_plots.pdf"),
                             paste0(date, "_plot_", NameString, "_", batch, "_plots.pdf"))
  print(paste0("Saving all plots to ", final_plot_file))
  pdf(final_plot_file)
  plot(pca_batch_before)
  plot(lm_before[[2]])
  plot(lm_before[[3]])
  if(any(clus.method=="km")){
    plot(km_before[[2]])
    plot(km_before[[3]])}
  if(any(clus.method=="NMF")){
    plot(NMF_before[[2]])
    plot(NMF_before[[3]])}

  plot(pca_batch_after)
  plot(lm_after[[2]])
  plot(lm_after[[3]])
  if(any(clus.method=="km")){
    plot(km_after[[2]])
    plot(km_after[[3]])}
  if(any(clus.method=="NMF")){
    plot(NMF_after[[2]])
    plot(NMF_after[[3]])}
  plot(prop_var_plot)

  plot(exp_boxplot_before)
  plot(exp_boxplot_after)

  dev.off()

  print("========================================================")
  return(expr2)
  print(sessionInfo())


}


#mClust turns contiguous data into discrete batches for batch correction
mclust_cluster <- function(data){
  yBIC2_Q <-mclust::mclustBIC(as.numeric(data),G=1:8)
  rs2_Q <-summary.mclustBIC(object = yBIC2_Q,data = as.numeric(data))
  tau_Q <-rs2_Q$classification
  return(tau_Q)
}
jankinsan/BatchEC documentation built on Sept. 9, 2021, 8:12 p.m.