R/preprocess_data.R

#' @title Generate MA plots of raw and normalized data
#' @description Normalize input raw data using quantile and mloess methods. 
#' Plots of the normalized data along with a dendrogram clustering all samples 
#' will be stored in newly created pipeline directory.
#' @usage preprocessPlots(input.file, file.sheet=1, ntext=2, data.col, 
#' symbol.index=1, id.index=2)
#' @param input.file Path to the microarray expression file, be it .xlsx or .csv
#' @param file.sheet Sheet number in the spreadsheet with data
#' @param ntext Number of leading text columns
#' @param data.col Range of columns which contain data (indexing begins with
#' first column of file)
#' @param symbol.index Column index which contains gene symbols
#' @param id.index Column index which contains gene ID's
#' @param batch.vector Character vector indicating to which batch each sample
#' belongs
#' @return A list with components
#' \item{ntext}{Number of leading text columns}
#' \item{data.col}{Vector of column indices containing array data}
#' \item{id}{Vector containing gene ID's}
#' \item{id.index}{Column index containing gene ID information}
#' \item{symbol}{Vector containing gene symbols}
#' \item{symbol.index}{Column index containing gene symbol information}
#' \item{desc.stats}{Vector of column indices containing descriptive statistics}
#' \item{pipeline.name}{Name of pipeline generated from input file name sans 
#' extension}
#' \item{mloess}{Data rame of quantile normalized data}
#' \item{quantile}{Data rame of quantile normalized data}
#' @export

PreprocessData <- function(input.file, file.sheet=1, ntext=2, data.col, 
                           symbol.index=1, id.index=2, batch.vector=NA) {

  if (grepl("/", input.file)) {
    stop("Input file must come from current working directory")
  }
  ext <- tools::file_ext(input.file)
  if (ext == "xlsx")
    X <- openxlsx::read.xlsx(input.file, sheet=file.sheet)
  else if (ext == "csv")
    X <- read.csv(input.file, header=TRUE, sep=",", stringsAsFactors=F)
  else if (ext == "txt")
    X <- read.table(input.file, header=TRUE, sep="\t", stringsAsFactors=F)
  else
    stop("Input file must be a .xlsx spreadsheet, comma-separated .csv, or tab-seperated .txt")
  
  pipeline.name <- tools::file_path_sans_ext(input.file)
  panels <- length(data.col)
  format <- c(2, 3)
  remove.batch <- F

  if (!any(is.na(batch.vector))) {
    if (!length(batch.vector == length(data.col))) {
      stop("Length of batch vector must equal number of data columns.")
    }
    remove.batch <- T
    X[, data.col] <- exp(limma::removeBatchEffect(log(X[, data.col]), batch=batch.vector))
  }
  
  labels <- colnames(X)[data.col]
  text.col <- 1:ntext  
  data <- X[, data.col]
  
  if(ntext == 1) {
    symbol.index = 1
    id.index = 2
    data.col <- data.col + 1
    desc.stats <- X[, rep(text.col, 2)]
  } else {
    desc.stats <- X[, text.col]
  }
  
  data.list = list()
  
  check.sequence <- rle(diff(data.col))
  if (any(check.sequence$lengths < length(data.col)-1 & check.sequence$values==1)) {
    columns.string <- as.character(split(data.col, cumsum(c(TRUE, diff(data.col) != 1))))
    columns.string <- paste0(gsub(":","_", columns.string), collapse = "_")
    pipeline.name <- paste0(pipeline.name, "_columns_", columns.string)
    
    data.list$original.columns <- data.col
    data.col <- seq(min(data.col),min(data.col)+length(data.col)-1)
  }
  
  data.list$ntext=ntext
  data.list$data.col=data.col
  data.list$id=X[, id.index]
  data.list$id.index=id.index
  data.list$symbol=X[, symbol.index]
  data.list$symbol.index=symbol.index
  data.list$desc.stats=desc.stats
  data.list$pipeline.name=pipeline.name
  
  if (!dir.exists(paste0(getwd(), "/", pipeline.name, "_pipeline"))) {
    cat("Creating output directory at ./", pipeline.name, "_pipeline\n", sep="")
    dir.create(file.path(getwd(), paste0(pipeline.name, "_pipeline")))
  }
  
  for (normType in c("raw", "mloess", "quantile")) {
    if (!remove.batch) {
      ps.plots <- paste0("./", pipeline.name, "_pipeline/", pipeline.name, "_", 
                         normType, "_plots.ps")
      pdf.plots <- paste0("./", pipeline.name, "_pipeline/", pipeline.name, "_", 
                          normType, "_plots.pdf")
    } else {
      ps.plots <- paste0("./", pipeline.name, "_pipeline/", pipeline.name, "_", 
                         normType, "_remove_batch_plots.ps")
      pdf.plots <- paste0("./", pipeline.name, "_pipeline/", pipeline.name, "_", 
                          normType, "_remove_batch_plots.pdf")
    }

    if (normType == "raw") {
      data.norm <- data
      dist <- dist(t(data.norm))
      
      ps.cluster <- paste0("./", pipeline.name, "_pipeline/", pipeline.name,
                           "_cluster.ps")
      pdf.cluster <- paste0("./", pipeline.name, "_pipeline/", pipeline.name, 
                            "_cluster.pdf")

      data.list$raw <- cbind(desc.stats, data.norm)

      postscript(file=ps.cluster, paper="letter")
      plot(hclust(dist, method="ward.D2"))
      garbage <- dev.off()      

      pdf(file=pdf.cluster, paper="letter")
      plot(hclust(dist, method="ward.D2"))
      garbage <- dev.off()
      
      cat("Dendrogram of raw data plotted at", ps.cluster, "\n")
      cat("Raw data plots created at", ps.plots, "\n")
      
    } else if (normType == "mloess") {
      mat <- as.matrix(data)
      mat[mat == 0] <- 1

      capture.output(data.norm <- affy::normalize.loess(mat))

      data.list$mloess <- cbind(desc.stats, data.norm)
      write.csv(data.list$mloess, paste0("./", pipeline.name, 
                                        "_pipeline/", pipeline.name, 
                                        "_mloess_data.csv"), row.names=F)
      cat("MLOESS normalized data plots created at", ps.plots, "\n")
      
    } else if (normType == "quantile") {
      data.norm <- limma::normalizeQuantiles(as.matrix(data))

      data.list$quantile <- cbind(desc.stats, data.norm)
      write.csv(data.list$quantile, paste0("./", pipeline.name, 
                                          "_pipeline/", pipeline.name, 
                                          "_quantile_data.csv"), row.names=F)
      cat("Quantile normalized data plots created at", ps.plots, "\n")
    }

    # Postscript output
    postscript(file=ps.plots, paper="letter")
    par(mfrow=format, pty="s")
        
    for (i in 1:panels) {     
      x <- as.numeric(data.norm[, ((i-1) %% panels) + 1])
      y <- as.numeric(data.norm[, (i %% panels) + 1])
      A <- (log2(x) + log2(y)) / 2
      M <- log2(y) - log2(x)
      plot(A, M, xlim=c(4, 14), ylim=c(-3, 3), cex=1, pch=".", 
           xlab=labels[((i - 1) %% panels) + 1], 
           main=labels[(i %% panels) + 1], font.main=1, cex.main=1)
      lines(c(0, 18), c(0, 0), col="cyan")
    }
    garbage <- dev.off() 
    
    # PDF output
    pdf(file=pdf.plots, paper="letter")
    par(mfrow=format, pty="s")
    
    for (i in 1:panels) {     
      x <- as.numeric(data.norm[, ((i - 1) %% panels) + 1])
      y <- as.numeric(data.norm[, (i %% panels) + 1])
      A <- (log2(x) + log2(y)) / 2
      M <- log2(y) - log2(x)
      plot(A, M, xlim=c(4, 14), ylim=c(-3, 3), cex=1, pch=".", 
           xlab=labels[((i - 1) %% panels) + 1], 
           main=labels[(i %% panels) + 1], font.main=1, cex.main=1)
      lines(c(0, 18), c(0, 0), col="cyan")
    }
    garbage <- dev.off() 
  }
  return(data.list)
}
TomNash/NASHMAP documentation built on May 9, 2019, 4:54 p.m.