molbio: Analysis of common experimental data

#' Function to automate the analysis of qPCR data
#'
#' Designed for tabular qPCR data of a specific format generated by the Ernst Lab at
#'  Michigan State University. To see an example of the data, load an example external dataset with
#'  data("adar_data").

#' @param data data.frame of tabular qPCR data
#' @param target_gene character with name of target gene used in analysis
#' @param ref_gene character with name of reference gene used in analysis
#' @param calibrator character with name of sample used for calibration
#' @param rm_files character vector containing names of samples to remove
#' @param text_size numeric specifying the size of the axis and axis lables
#' @param order character vector specifying the order that the tissues appear in the resulting plot
#' @param background logical indicating if traditional ggplot gray background is desired. If FALSE,
#' a white background with horizontal lines is used
#' @return a new data.frame with same fields but added "ratios" column to provide fold-expression changes
#'  relative to calibrator
#' @import ggplot2
#' @export
qPCR <- function(data,
                 target,
                 reference,
                 calibrator,
                 rm_files = "none",
                 text_size = 15,
                 orders = NULL,
                 background = TRUE) {

  # Enforce that data has correct fields and that Ct_mean is numeric for calculations
  if (any(colnames(data) != c("well", "sample_name", "id", "tissue", "target_name", "Ct_mean"))) {
    stop("Data supplied must contain the exact fields:
         well, sample_name, id, tissue, target_name, Ct_mean")
  }
  data$Ct_mean <- as.numeric(as.character(data$Ct_mean))

  # For convenience, split datasets in two. One for target gene and one for the reference gene.
  ref_data <- data[data$target_name == reference, ]
  target_data <- data[data$target_name == target, ]

  # Only keep samples that have a record for both target and reference gene and confirm order of
  #  samples between reference and target are the same.
  samples <- ref_data$sample_name[ref_data$sample_name %in% target_data$sample_name]
  ref_data <- ref_data[ref_data$sample_name %in% samples, ]

  # Enforce that the order of samples between datasets is the same.
  target_data <- target_data[match(ref_data$sample_name, target_data$sample_name), ]

  # Calculate delta_ct_cal, the difference in expression between reference and target genes for
  #  the calibrator sample.
  delta_ct_cal <-
    target_data[target_data$sample_name == calibrator, "Ct_mean"] -
    ref_data[ref_data$sample_name == calibrator, "Ct_mean"]

  # Do the same but for all test samples.
  delta_ct_test <-
    target_data[, "Ct_mean"] -
    ref_data[, "Ct_mean"]

  # Calculate delta_delta_ct.
  delta_delta_ct <- delta_ct_test - delta_ct_cal

  # Calculate normalized expression ratios, assuming efficiencies are 100%
  ratios <- 2^(-delta_delta_ct)

  # Append ratios to target_data.
  results <- cbind(target_data, ratios)

  # Remove desired samples from results
  results <- results[!results$sample_name %in% rm_files, ]

  # Get desired order of tissues
  if (is.null(orders))
    results$tissue <- factor(results$tissue)
  else
    results$tissue <- factor(results$tissue, levels = orders)

  # Plot!
  p <- ggplot(results, aes(x = tissue, y = ratios, fill = tissue))
  p <- p + geom_boxplot()
  p <- p + theme(axis.text = element_text(size = text_size),
                 axis.title = element_text(size = text_size),
                 title = element_text(size = text_size),
                 legend.position = "none")
  p <- p + labs(x = "Tissue",
                y = "Fold-change in expression")
  if (!background)
    p <- p + background_grid(major = "y",
                             minor = "none",
                             size.major = 0.8)
  plot(p)
  return (results)
}