R/lin_reg.R

Defines functions lin_reg

Documented in lin_reg

#' @title Linear Regression of Principal Components
#'
#' @description Calculates Linear Regression of Principal Components 1 and 2 of the data
#' to see if the batch is correlated to the data.
#'
#' @param exprData A matrix containing gene expression data. Row names should be samples and column names should be genes.
#' @param batch.info A data frame containing the samples names and details of the batch they belong to.
#' @param batch Title of the batch the data is being corrected for.
#' @param when String indicating when the linear regression analysis is taking place
#' (before batch correction or after batch correction?). Values should be either "before" or "after".
#' @param return.plot Should the plot be returned as an object to the environment?
#' If FALSE, plot is saved to a pdf file, if TRUE, plot is returned to the environment.
#' Default = FALSE
#'
#' @import stats
#'
#'
#' @return Returns the p-value associated with first ten principal components
#' after linear regression analysis. If *return.plot=TRUE*, plots showing batch
#' effects are also returned.
#'
#' @export
lin_reg <- function(exprData, batch.info, batch = "Batch", NameString = "", when = "", return.plot = FALSE)
{
  print("===========================Linear Regression Analysis=====================")

  #calculating the principal components
  print("Calculating Principal Components...")
  std_genes <- apply(exprData, MARGIN =2, sd)
  genes_sd_0 <- length(which (std_genes ==0))
  remgenes <- length (std_genes) - genes_sd_0
  exprData_sd0 <- exprData[,which (std_genes > 0)]
  pca.dat <- prcomp(exprData_sd0, center = TRUE, scale. = TRUE)

  #matching sample IDs with batches
  if (is.character(batch.info[,1])){
    id <- as.character(rownames(exprData))
    s<- match (id, batch.info[,1])

  } else if (is.numeric(batch.info[,1])){
    id <-as.numeric(rownames(exprData))
    s<- match (id, batch.info[,1])
  }

  #Adding the principal components and batch information to pca_data
  pca_data <- cbind.data.frame(batch.info[s, 2], signif(pca.dat$x[, 1:10], 5))
  colnames (pca_data)[1] <- "Batch"

  #linear regression to check which PC is associated with batch
  PCs <- colnames(pca_data[, -1])
  lm.pc <- function(x){
    print ("======================================================")
    print(paste0("Performing Linear Regression Analysis for ", x))
    pc.lm <-  lm(formula = (pca_data[,as.character(x)]) ~ as.factor(pca_data$Batch))
    print (summary(pc.lm))
    f_stat<- summary(pc.lm)$fstatistic
    p_val <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
    return(p_val)
  }
  p.val <- sapply(PCs, lm.pc, simplify = TRUE)

  #plotting boxplots for first ten PCs
  plot.data <- reshape2::melt(pca_data,  id ="Batch")
  plot.data[,1]<- as.factor(plot.data[,1])

  #to add p-value labels to the plot
  names (p.val) <- strsplit(names(p.val), split = ".value")
  p.val.annon <- as.character(paste0("p-val=", signif(p.val, 4)))
  p.val.dat <- vector(mode = "character", length = dim(plot.data)[1])
  index <- c((dim(plot.data)[1]/10)*c(0:9) +1)
  p.val.dat[index] <- p.val.annon


  #boxplot to show association of batch effect with the 10 PCs
  boxplot.PC <- ggplot2::ggplot(data= plot.data, ggplot2::aes(x = Batch, y=value)) +

    ggplot2::geom_boxplot(ggplot2::aes( fill = Batch))+

    ggplot2::labs(x = "Batch", y ="Principal Components",
                  title = paste0("Boxplots showing variation in PCs due to batch ", when, " correction"))+

    ggplot2::scale_fill_manual(values = c("#fcba03",  "#19e01c",
                                                        "#ff470f","#0fdbca",
                                                        "#ff217e","#405ce6",
                                                        "#6b6769","#b264ed"))+
    ggplot2::theme(
      #adjusting axis lines and text
      axis.line.x = ggplot2::element_line(size =0.5),
      axis.line.y = ggplot2::element_line(size =0.5),
      axis.text.x= ggplot2::element_text(size=10, colour ="black"),
      axis.text.y= ggplot2::element_text(size=10, colour ="black"),

      #Center align the title
      plot.title = ggplot2::element_text(face = "bold", hjust =0.5),

      # Adjust panel border
      panel.border = ggplot2::element_rect(fill=NA, size= 0.75),

      # Remove panel grid lines
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),

      # Remove panel background
      panel.background = ggplot2::element_blank())+
    ggplot2::geom_text(mapping = ggplot2::aes(x =length(unique(batch.info[,2]))/2,
                                              y = (max(pca_data[,-1])+10)),
                       label =p.val.dat,
                       size = 3) +
    ggplot2::facet_wrap(.~variable, scales = "free")



  #plotting p-values for linear regression for Batch Variation

  p.val.data <- cbind.data.frame(colnames(pca_data)[-1], -log10(p.val))
  colnames(p.val.data)<- c("PC", "log.p.value")

  dot.plot <- ggplot2::ggplot(data = p.val.data) +
    ggplot2::geom_vline(ggplot2::aes(xintercept = -log10(0.05)), size =2,
                        linetype = 'dashed', colour = "dodgerblue3")+
    ggplot2::geom_point(size = 5, colour = "orange", ggplot2::aes(x =log.p.value, y =PC))+
    ggplot2::scale_y_discrete()+
    ggplot2::labs(x = "-log10(p-value)",
                  y = "Principal Components",
                  title = paste0("Plot showing PCs associated with batch ", when, " correction")) +
    ggplot2::scale_y_discrete(limits = colnames(pca_data[,-1]))+
    ggplot2::theme(
      #adjusting axis lines and text
      axis.line.x = ggplot2::element_line(size =0.5),
      axis.line.y = ggplot2::element_line(size =0.5),
      axis.text.x= ggplot2::element_text(size=10, colour ="black"),
      axis.text.y= ggplot2::element_text(size=10, colour ="black"),

      #Center align the title
      plot.title = ggplot2::element_text(face = "bold", hjust =0.5),

      # Adjust panel border
      panel.border = ggplot2::element_rect(fill=NA, size= 0.75),

      # Remove panel grid lines
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),

      # Remove panel background
      panel.background = ggplot2::element_blank())+
    ggplot2::geom_text(mapping = ggplot2::aes(x =3, y = 9),
                       label ="Significant PCs",
                       size = 3.5, color="dodgerblue3")

  if(return.plot==FALSE){
  #plotting boxplots
  date <- as.character(format(Sys.Date(), "%Y%m%d"))
  if (NameString==""){
    plotFile <- paste0(date, "_plot_", batch, "_batch_effect_boxplot_", when, "_correction.pdf")
  } else{
    plotFile <- paste0(date, "_plot_", NameString, "_", batch, "_batch_effect_boxplot_", when, "_correction.pdf")
  }
  print(paste0("Plotting boxplots showing how batches are associated with the first ten principal components to ", plotFile))
  pdf (plotFile, height = 10, width = 14)
  plot(boxplot.PC)
  plot(dot.plot)
  dev.off()

  } else if(return.plot==TRUE){

  return(list(values =cbind(names(p.val), p.val), boxplot.PC, dot.plot ))}
}
jankinsan/BatchEC documentation built on Sept. 9, 2021, 8:12 p.m.