R/BatchEC.R

Defines functions mclust_cluster BatchEC

Documented in BatchEC

#' @title Batch Effects Evaluation and Correction
#'
#'
#' @description Evaluates if the batch effects with a gene expression dataset using linear
#' regression and if the batch is associated with the data, the data is batch-corrected
#' using the ComBat Algorithm.
#'
#' Please set your working directory before you call the function as all output files will be saved to this folder.
#'
#'
#'
#' @param expr gene expression data; rows should be genes and columns should be samples.
#' @param batch.info contains the batch information. The first column should be sample names.
#' @param batch The title of the batch for which you want to evaluate and do the
#'  correction. Default = "Batch"
#' @param NameString  String that will be added in all output filenames. Default=NA.
#' @param discrete.batch Logical value indicating whether the samples are already
#' grouped in discrete batches. If the value is FALSE, contiguous batch information is
#' clustered into discrete batches. Useful for clustering batch variables that are
#' contiguous, for example, used reads and useful reads in mClust. Default = TRUE
#' @param cond The column name in the *batch.info* data which denotes a biological
#' condition variable that may need to be considered for batch correction.
#' @param clus.method Method to be used for clustering. "km" denotes k-means clustering, "NMF"
#' denotes NMF with 30 runs. Default employs both the methods, using *clus.method= c("NMF", "km")*.
#' @param nrun.NMF number of runs for NMF. Default = 30.
#'
#' @return the batch-corrected dataset is returned.
#'
#' @import utils
#' @import mclust
#'
#' @examples
#' \dontrun{
#' beacon(expr1 = dataset,
#'        batch.info = batch_info,
#'        batch = "Batch",
#'        discrete.batch = TRUE)
#' }
#'
#'
#' @export
BatchEC <- function(expr, batch.info, batch="Batch", NameString = "", discrete.batch = TRUE, cond="", clus.method = c("NMF", "km"), nrun.NMF = 30){

  #matching batch and condition
  if(cond==""){
    mat <- match(batch, colnames(batch.info))
    batch.info <- batch.info [,c(1, mat)]
  } else if(cond!=""){
    mat <- match(batch, colnames(batch.info))
    mat2 <- match(cond, colnames(batch.info))
    batch.info <- batch.info [,c(1, mat, mat2)]
  }

  if (discrete.batch== FALSE){
    batch.info[,2] <- mclust_cluster(batch.info[,2])

    #writing the batch cluster information to file
    date <- as.character(format(Sys.Date(), "%Y%m%d"))
    clusterInfoFile <- paste0(date, "_", NameString, "_", batch, "_batch_info_mClust.txt")
    print(paste0("Writing the batch information from mClust for ", batch, " to: ", clusterInfoFile))
    write.table(batch.info,
                file = clusterInfoFile,
                sep = "\t",
                col.names = TRUE,
                row.names = FALSE)
  }
  batch.info[,2]<- as.factor(batch.info[,2])

  expr1<-expr
  exprData1 <- t(expr1)

  #linear regression before batch correction
  lm_before <- lin_reg(exprData=exprData1,
                       batch.info = batch.info,
                       batch=batch,
                       NameString = NameString,
                       when = "before",
                       return.plot =TRUE)

  #checking if any of the p values are less than 0.05
  if(all(as.numeric(lm_before[[1]][,2])>0.05)){
    stop("Halting execution since batch is not associated with data...")

  }else{

    print("Batch is associated with the data...")

    #pca with batch before correction
    if(cond==""){
      pca_batch_before<- pca_batch (exprData = exprData1,
                                    batch.info= batch.info,
                                    batch= batch,
                                    NameString = NameString,
                                    when = "before_ComBat_correction",
                                    return.plot=TRUE)
    }else if (cond!=""){
      pca_batch_before<- pca_batch (exprData = exprData1,
                                    batch.info= batch.info,
                                    batch= batch,
                                    cond=cond,
                                    NameString = NameString,
                                    when = "before_ComBat_correction",
                                    return.plot=TRUE)
    }

    if(any(clus.method=="km")){
      #pca with batch and kmeans before batch correction
      km_before <- kmeans_PCA(exprData = exprData1,
                              batch.info = batch.info,
                              batch = batch,
                              NameString = NameString,
                              when = "before_correction",
                              return.plot=TRUE)
    }
    if(any(clus.method=="NMF")){
      NMF_before <- NMF_PCA(expr=expr1,
                            batch.info = batch.info,
                            nrun = nrun.NMF,
                            batch = batch,
                            NameString = NameString,
                            when = "before_correction",
                            return.plot = TRUE)
    }

    #Batch Correction using ComBat
    if(cond==""){
      expr2 <- ComBat_data (expr = expr1,
                            batch.info= batch.info,
                            batch = batch,
                            NameString = NameString)

    } else if (cond!=""){
      expr2 <- ComBat_data (expr = expr1,
                            batch.info= batch.info,
                            batch = batch,
                            NameString = NameString,
                            cond=cond)
    }

    exprData2 <- t(expr2)

    #linear regression after batch correction
    lm_after <- lin_reg(exprData=exprData2,
                        batch.info = batch.info,
                        batch=batch,
                        NameString = NameString,
                        when = "after",
                        return.plot=TRUE)

    #pca with batch after correction
    if(cond==""){
      pca_batch_after<- pca_batch (exprData = exprData2,
                                   batch.info= batch.info,
                                   batch= batch,
                                   NameString = NameString,
                                   when = "after_ComBat_correction",
                                   return.plot=TRUE)
    } else if (cond!=""){
      pca_batch_after<- pca_batch (exprData = exprData2,
                                   batch.info= batch.info,
                                   batch= batch,
                                   cond=cond,
                                   NameString = NameString,
                                   when = "after_ComBat_correction",
                                   return.plot=TRUE)
    }

    if(any(clus.method=="km")){
      #pca with batch and k-means after correction
      km_after <- kmeans_PCA(exprData = exprData2,
                             batch.info = batch.info,
                             batch = batch,
                             NameString = NameString,
                             when = "after_correction",
                             return.plot=TRUE)
    }

    if(any(clus.method=="NMF")){
      #NMF with PCA after correction
      NMF_after <- NMF_PCA(expr=expr2,
                           batch.info = batch.info,
                           nrun = nrun.NMF,
                           batch = batch,
                           NameString = NameString,
                           when = "after_correction",
                           return.plot = TRUE)
    }

    #PCA Proportion of Variation
    prop_var_plot <- pca_prop_var(batch.title = batch,
                                  exprData1 = exprData1,
                                  exprData2 = exprData2,
                                  NameString = NameString,
                                  return.plot=TRUE)

    #pearson correlation scatter plot
    correlationPlot(exprData1 = exprData1,
                    exprData2 = exprData2,
                    batch = batch,
                    NameString = NameString)

    #boxplots before and after batch correction
    exp_boxplot_before <- boxplot_data (expr = expr1,
                                        when = "before",
                                        NameString = NameString,
                                        batch = batch,
                                        return.plot =TRUE)

    exp_boxplot_after <-boxplot_data (expr = as.data.frame(expr2),
                                      when = "after",
                                      NameString = NameString,
                                      batch = batch,
                                      return.plot =TRUE)

  }
  print("========================================================")
  date <- as.character(format(Sys.Date(), "%Y%m%d"))
  PCA_plot_file <-  ifelse(NameString =="",
                           paste0(date, "_plot_", batch, "_PCA_plots.pdf"),
                           paste0(date, "_plot_", NameString, "_", batch, "_PCA_plots.pdf"))
  boxplot_file <-  ifelse(NameString =="",
                          paste0(date, "_plot_", batch, "_expr_plots.pdf"),
                          paste0(date, "_plot_", NameString, "_", batch, "_expr_boxplots.pdf"))
  print("Saving all plots!")
  pdf(PCA_plot_file)
  plot(lm_before[[2]])
  plot(lm_before[[3]])
  plot(pca_batch_before)
  if(any(clus.method=="km")){
    plot(km_before[[2]])
    plot(km_before[[3]])}
  if(any(clus.method=="NMF")){
    plot(NMF_before[[2]])
    plot(NMF_before[[3]])}


  plot(lm_after[[2]])
  plot(lm_after[[3]])
  plot(pca_batch_after)
  if(any(clus.method=="km")){
    plot(km_after[[2]])
    plot(km_after[[3]])}
  if(any(clus.method=="NMF")){
    plot(NMF_after[[2]])
    plot(NMF_after[[3]])}
  plot(prop_var_plot)
  dev.off()

  pdf(boxplot_file, height = 9, width =12)
  plot(exp_boxplot_before)
  plot(exp_boxplot_after)
  dev.off()
  print("========================================================")
  return(expr2)
  print(sessionInfo())

}


#mClust turns contiguous data into discrete batches for batch correction
mclust_cluster <- function(data){
  yBIC2_Q <-mclust::mclustBIC(as.numeric(data),G=1:8)
  rs2_Q <-summary.mclustBIC(object = yBIC2_Q,data = as.numeric(data))
  tau_Q <-rs2_Q$classification
  return(tau_Q)
}
jankinsan/BatchEC documentation built on Sept. 9, 2021, 8:12 p.m.