R/BatchEC_file.R

Defines functions mclust_cluster BatchEC_file

Documented in BatchEC_file

#' @title Batch Effects Evaluation and Correction
#'
#'
#' @description Evaluates if the batch effects with a gene expression dataset using linear
#' regression and if the batch is associated with the data, the data is batch-corrected
#' using the ComBat Algorithm.
#'
#' Please set your working directory before you call the function as all output files will be saved to this folder.
#' Ensure that you add the correct input file paths.
#'
#'
#' @param exprFile Path to .txt (tab-delimited) or a .csv file that contains the expression data. In the file, rows
#' should be genes and columns should be sample IDs.
#' @param batchFile Path to the .txt (tab-delimited) or a .csv file that contains the batch information.
#' The first column should be sample names/IDs
#' @param batch The title of the batch for which you want to evaluate and do the
#'  correction. Default = "Batch"
#' @param NameString  String that will be added in to all output filenames. Default="".
#' @param discrete.batch Logical value, indicating whether the samples are already
#' grouped in discrete batches. If the value is FALSE, contiguous batch information is
#' clustered into discrete batches using mClust. Useful for clustering batch variables that are
#' contiguous. Default = TRUE
#' @param cond The column name in the *batch.info* data which denotes biological condition,
#' if there are any to be considered for batch correction.
#' @param clus.method Method to be used for clustering. "km" denotes k-means, "NMF"
#' denotes NMF with 30 runs. Default employs both the methods, using *clus.method= c("NMF", "km")*
#' @param nrun.NMF number of runs for NMF. Default = 30.
#'
#' @return the batch-corrected dataset is returned.
#'
#' @import utils
#' @import mclust
#'
#' @examples
#' \dontrun{batch_eval_cor(exprFile = "~/exprData.txt",
#'                batchFile = "~/batchData.txt",
#'                Batch = "Batch",
#'                discrete.batch = TRUE)
#' }
#'
#'
#' @export
BatchEC_file <- function(exprFile, batchFile, batch="Batch", NameString = "", discrete.batch = TRUE, cond="", clus.method = c("NMF", "km"), nrun.NMF = 30){

  #reading expression data from file
  print (paste0("Reading gene expression data from ", exprFile))
  if (length(grep(pattern = ".txt", exprFile)) ==1||length(grep(pattern = ".tsv", exprFile)) ==1){
    expr1 <- read.table(exprFile,
                        header = TRUE,
                        stringsAsFactors = FALSE,
                        row.names = 1,
                        sep= "\t",
                        check.names=FALSE)
  } else if (length(grep(pattern = ".csv", exprFile)) ==1){
    expr1 <- read.csv(exprFile,
                      header = TRUE,
                      stringsAsFactors = FALSE,
                      row.names = 1,
                      check.names=FALSE)
  } else{
    stop("exprFile is not a csv or tab-delimited file")
  }


  #reading batch information
  print (paste0("Reading batch data from ", batchFile))
  if (length(grep(pattern = ".txt", exprFile)) ==1||length(grep(pattern = ".tsv", exprFile)) ==1){
    batch.info <- read.table(batchFile,
                             header = TRUE,
                             stringsAsFactors = FALSE,
                             sep = "\t")
  } else if (length(grep(pattern = ".csv", batchFile)) ==1){
    batch.info <- read.csv(batchFile,
                           header = TRUE,
                           stringsAsFactors = FALSE)
  } else{
    stop("batchFile is not a csv or tab-delimited file")
  }

  #matching batch and condition
  if(cond==""){
    mat <- match(batch, colnames(batch.info))
    batch.info <- batch.info [,c(1, mat)]
  } else if(cond!=""){
    mat <- match(batch, colnames(batch.info))
    mat2 <- match(cond, colnames(batch.info))
    batch.info <- batch.info [,c(1, mat, mat2)]
  }

  if (discrete.batch== FALSE){
    batch.info[,2] <- mclust_cluster(batch.info[,2])

    #writing the batch cluster information to file
    date <- as.character(format(Sys.Date(), "%Y%m%d"))
    clusterInfoFile <- paste0(date, "_", NameString, "_", batch, "_batch_info_mClust.txt")
    print(paste0("Writing the batch information from mClust for ", batch, "to: ", clusterInfoFile))
    write.table(batch.info,
                file = clusterInfoFile,
                sep = "\t",
                col.names = TRUE,
                row.names = FALSE)
  }

  exprData1 <- t(expr1)

  #linear regression before batch correction
  lm_before <- lin_reg(exprData=exprData1,
                          batch.info = batch.info,
                          batch=batch,
                          NameString = NameString,
                          when = "before",
                          return.plot =TRUE)

  #checking if any of the p values are less than 0.05
  if(all(as.numeric(lm_before[[1]][,2])>0.05)){
    stop("Halting execution since batch is not associated with data...")

  }else{

    print("Batch is associated with the data...")

    #pca with batch before correction
    if(cond==""){
    pca_batch_before<- pca_batch (exprData = exprData1,
                                 batch.info= batch.info,
                                 batch= batch,
                                 NameString = NameString,
                                 when = "before_ComBat_correction",
                                 return.plot=TRUE)
    }else if (cond!=""){
      pca_batch_before<- pca_batch (exprData = exprData1,
                                    batch.info= batch.info,
                                    batch= batch,
                                    cond=cond,
                                    NameString = NameString,
                                    when = "before_ComBat_correction",
                                    return.plot=TRUE)
    }

    if(any(clus.method=="km")){
    #pca with batch and kmeans before batch correction
      km_before <- kmeans_PCA(exprData = exprData1,
                            batch.info = batch.info,
                            batch = batch,
                            NameString = NameString,
                            when = "before_correction",
                            return.plot=TRUE)
    }
    if(any(clus.method=="NMF")){
      NMF_before <- NMF_PCA(expr=expr1,
                                  batch.info = batch.info,
                                  nrun = nrun.NMF,
                                  batch = batch,
                                  NameString = NameString,
                                  when = "before_correction",
                                  return.plot = TRUE)
    }

    #Batch Correction using ComBat
    if(cond==""){
      expr2 <- ComBat_data (expr = expr1,
                            batch.info= batch.info,
                            batch = batch,
                            NameString = NameString)

    } else if (cond!=""){
      expr2 <- ComBat_data (expr = expr1,
                            batch.info= batch.info,
                            batch = batch,
                            NameString = NameString,
                            cond=cond)
    }

    exprData2 <- t(expr2)

    #linear regression after batch correction
    lm_after <- lin_reg(exprData=exprData2,
                           batch.info = batch.info,
                           batch=batch,
                           NameString = NameString,
                           when = "after",
                        return.plot=TRUE)

    #pca with batch after correction
    if(cond==""){
    pca_batch_after<- pca_batch (exprData = exprData2,
                                  batch.info= batch.info,
                                  batch= batch,
                                  NameString = NameString,
                                  when = "after_ComBat_correction",
                                  return.plot=TRUE)
    } else if (cond!=""){
      pca_batch_after<- pca_batch (exprData = exprData2,
                                   batch.info= batch.info,
                                   batch= batch,
                                   cond=cond,
                                   NameString = NameString,
                                   when = "after_ComBat_correction",
                                   return.plot=TRUE)
    }

    if(any(clus.method=="km")){
    #pca with batch and k-means after correction
    km_after <- kmeans_PCA(exprData = exprData2,
                          batch.info = batch.info,
                          batch = batch,
                          NameString = NameString,
                          when = "after_correction",
                          return.plot=TRUE)
    }

    if(any(clus.method=="NMF")){
    #NMF with PCA after correction
      NMF_after <- NMF_PCA(expr=expr2,
                           batch.info = batch.info,
                           nrun = nrun.NMF,
                           batch = batch,
                           NameString = NameString,
                           when = "after_correction",
                           return.plot = TRUE)
    }

    #PCA Proportion of Variation
    prop_var_plot <- pca_prop_var(batch.title = batch,
                                 exprData1 = exprData1,
                                 exprData2 = exprData2,
                                 NameString = NameString,
                                 return.plot=TRUE)

    #pearson correlation scatter plot
    correlationPlot(exprData1 = exprData1,
                    exprData2 = exprData2,
                    batch = batch,
                    NameString = NameString)

    #boxplots before and after batch correction
    exp_boxplot_before <- boxplot_data (expr = expr1,
                                        when = "before",
                                        NameString = NameString,
                                        batch = batch,
                                        return.plot =TRUE)

    exp_boxplot_after <-boxplot_data (expr = as.data.frame(expr2),
                                      when = "after",
                                      NameString = NameString,
                                      batch = batch,
                                      return.plot =TRUE)

  }
  print("========================================================")
  date <- as.character(format(Sys.Date(), "%Y%m%d"))
  PCA_plot_file <-  ifelse(NameString =="",
                           paste0(date, "_plot_", batch, "_PCA_plots.pdf"),
                           paste0(date, "_plot_", NameString, "_", batch, "_PCA_plots.pdf"))
  boxplot_file <-  ifelse(NameString =="",
                          paste0(date, "_plot_", batch, "_expr_plots.pdf"),
                          paste0(date, "_plot_", NameString, "_", batch, "_expr_boxplots.pdf"))
  print("Saving all plots!")
  pdf(PCA_plot_file)
  plot(lm_before[[2]])
  plot(lm_before[[3]])
  plot(pca_batch_before)
  if(any(clus.method=="km")){
    plot(km_before[[2]])
    plot(km_before[[3]])}
  if(any(clus.method=="NMF")){
    plot(NMF_before[[2]])
    plot(NMF_before[[3]])}


  plot(lm_after[[2]])
  plot(lm_after[[3]])
  plot(pca_batch_after)
  if(any(clus.method=="km")){
    plot(km_after[[2]])
    plot(km_after[[3]])}
  if(any(clus.method=="NMF")){
    plot(NMF_after[[2]])
    plot(NMF_after[[3]])}
  plot(prop_var_plot)
  dev.off()

  pdf(boxplot_file, height = 9, width =12)
  plot(exp_boxplot_before)
  plot(exp_boxplot_after)
  dev.off()
  print("========================================================")
  return(expr2)
  print(sessionInfo())

}


#mClust turns contiguous data into discrete batches for batch correction
mclust_cluster <- function(data){
  yBIC2_Q <-mclust::mclustBIC(as.numeric(data),G=1:8)
  rs2_Q <-summary.mclustBIC(object = yBIC2_Q,data = as.numeric(data))
  tau_Q <-rs2_Q$classification
  return(tau_Q)
}
jankinsan/BatchEC documentation built on Sept. 9, 2021, 8:12 p.m.