hpgltools: A pile of (hopefully) useful R functions

Documented in basic_pairwise choose_basic_dataset write_basic

## de_basic.r: An implementation of a simplified, statistical model unaware
## differential expression method.  This is intended essentially as a negative
## control to get a sense of how 'intrusive' other methods need to be in order
## to get their various results when performing a differential expression
## analysis.

#' The simplest possible differential expression method.
#'
#' Perform a pairwise comparison among conditions which takes
#' nothing into account.  It _only_ takes the conditions, a mean value/variance
#' among them, divides by condition, and returns the result.  No fancy
#' nomalizations, no statistical models, no nothing.  It should be the very
#' worst method possible. But, it should also provide a baseline to compare the
#' other tools against, they should all do better than this, always.
#'
#' Tested in test_27de_basic.R
#' This function was written after the corresponding functions in de_deseq.R,
#' de_edger.R, and de_limma.R.  Like those, it performs the full set of pairwise
#' comparisons and returns a list of the results.  As mentioned above, unlike
#' those, it is purposefully stupid.
#'
#' @param input Count table by sample.
#' @param design Data frame of samples and conditions.
#' @param conditions Not currently used, but passed from all_pairwise()
#' @param batches Not currently used, but passed from all_pairwise()
#' @param model_cond Not currently used, but passed from all_pairwise()
#' @param model_intercept Not currently used, but passed from all_pairwise()
#' @param alt_model Not currently used, but passed from all_pairwise()
#' @param model_batch Not currently used, but passed from all_pairwise()
#' @param force Force as input non-normalized data?
#' @param keepers Set of specific contrasts to perform instead of all.
#' @param fx What function to use for mean/median?
#' @param ... Extra options passed to arglist.
#' @return Df of pseudo-logFC, p-values, numerators, and denominators.
#' @seealso [deseq_pairwise()] [limma_pairwise()] [edger_pairwise()] [ebseq_pairwise()]
#' @examples
#' \dontrun{
#'  expt <- create_expt(metadata = "sample_sheet.xlsx", gene_info = "annotations")
#'  basic_de <- basic_pairwise(expt)
#'  basic_tables <- combine_de_tables(basic_de)
#' }
#' @export
basic_pairwise <- function(input = NULL, design = NULL, conditions = NULL,
                           batches = NULL, model_cond = TRUE, model_intercept = FALSE,
                           alt_model = NULL, model_batch = FALSE, force = FALSE,
                           keepers = NULL, fx = "mean", ...) {
  arglist <- list(...)
  if (!is.null(arglist[["input"]])) {
    input <- arglist[["input"]]
  }
  if (!is.null(arglist[["design"]])) {
    conditions <- arglist[["design"]]
  }
  if (!is.null(arglist[["force"]])) {
    batches <- arglist[["force"]]
  }
  message("Starting basic pairwise comparison.")
  input <- sanitize_expt(input)
  input_data <- choose_basic_dataset(input, force = force)
  design <- pData(input)
  conditions <- input_data[["conditions"]]
  batches <- input_data[["batches"]]
  data <- input_data[["data"]]

  conditions <- gsub(pattern = "^(\\d+)$", replacement = "c\\1", x = conditions)
  batches <- gsub(pattern = "^(\\d+)$", replacement = "b\\1", x = batches)
  types <- levels(as.factor(conditions))
  num_conds <- length(types)
  ## These will be filled with num_conds columns and numRows(input) rows.
  median_table <- data.frame()
  variance_table <- data.frame()
  ## First use conditions to rbind a table of medians by condition.
  message("Basic step 1/3: Creating ", fx, " and variance tables.")
  median_colnames <- c()
  for (c in seq_len(num_conds)) {
    condition_name <- types[c]
    median_colnames <- append(median_colnames, condition_name)
    columns <- which(conditions == condition_name)
    if (length(columns) == 1) {
      med <- data.frame(data[, columns], stringsAsFactors = FALSE)
      var <- as.data.frame(matrix(NA, ncol = 1, nrow = nrow(med)), stringsAsFactors = FALSE)
    } else {
      med_input <- as.matrix(data[, columns])
      if (fx == "mean") {
        med <- data.frame(matrixStats::rowMeans2(x = med_input, na.rm = TRUE))
      } else {
        med <- data.frame(Biobase::rowMedians(as.matrix(med_input)))
      }
      colnames(med) <- c(condition_name)
      var <- as.data.frame(genefilter::rowVars(as.matrix(med_input)))
      colnames(var) <- c(condition_name)
    }
    if (c == 1) {
      median_table <- med
      variance_table <- var
    } else {
      median_table <- cbind(median_table, med)
      variance_table <- cbind(variance_table, var)
    }
  } ## end creation of median and variance tables.
  colnames(median_table) <- median_colnames
  colnames(variance_table) <- median_colnames
  rownames(median_table) <- rownames(data)
  rownames(variance_table) <- rownames(data)
  ## We have tables of the median values by condition
  ## Now perform the pairwise comparisons
  comparisons <- data.frame()
  tvalues <- data.frame()
  pvalues <- data.frame()
  num_done <- 0
  column_list <- c()
  total_contrasts <- length(levels(as.factor(conditions)))
  total_contrasts <- (total_contrasts * (total_contrasts + 1)) / 2
  message("Basic step 2/3: Performing ", total_contrasts, " comparisons.")

  model_choice <- sm(choose_model(
      input, conditions = conditions, batches = batches, model_batch = FALSE,
      model_cond = TRUE, model_intercept = FALSE, alt_model = NULL,
      ...))
  model_data <- model_choice[["chosen_model"]]
  ## basic_pairwise() does not support extra contrasts, but they may be passed through via ...
  apc <- make_pairwise_contrasts(model_data, conditions, do_identities = FALSE, do_extras = FALSE,
                                 keepers = keepers, ...)
  contrasts_performed <- c()
  show_progress <- interactive() && is.null(getOption("knitr.in.progress"))
  if (isTRUE(show_progress)) {
    bar <- utils::txtProgressBar(style = 3)
  }
  for (c in seq_along(apc[["names"]])) {
    if (isTRUE(show_progress)) {
      pct_done <- c / length(apc[["names"]])
      utils::setTxtProgressBar(bar, pct_done)
    }
    num_done <- num_done + 1
    name  <- apc[["names"]][[c]]
    c_name <- gsub(pattern = "^(.*)_vs_(.*)$", replacement = "\\1", x = name)
    d_name <- gsub(pattern = "^(.*)_vs_(.*)$", replacement = "\\2", x = name)
    contrasts_performed <- append(name, contrasts_performed)
    if (! c_name %in% colnames(median_table)) {
      message("The contrast ", name, " is not in the results.")
      message("If this is not an extra contrast, then this is an error.")
      next
    }
    division <- data.frame(
        median_table[, c_name] - median_table[, d_name])
    column_list <- append(column_list, name)
    colnames(division) <- name
    ## Lets see if I can make a dirty p-value
    xcols <- which(conditions == c_name)
    ycols <- which(conditions == d_name)
    xdata <- as.data.frame(data[, xcols])
    ydata <- as.data.frame(data[, ycols])

    t_data <- vector("list", nrow(xdata))
    p_data <- vector("list", nrow(xdata))
    for (j in seq_len(nrow(xdata))) {
      test_result <- try(t.test(xdata[j, ], ydata[j, ]), silent = TRUE)
      if (class(test_result) == "htest") {
        t_data[[j]] <- test_result[[1]]
        p_data[[j]] <- test_result[[3]]
      } else {
        t_data[[j]] <- 0
        p_data[[j]] <- 1
      }
    } ## Done calculating cheapo p-values

    if (c == 1) {
      comparisons <- division
      tvalues <- t_data
      pvalues <- p_data
    } else {
      comparisons <- cbind(comparisons, division)
      tvalues <- cbind(tvalues, t_data)
      pvalues <- cbind(pvalues, p_data)
    }
  } ## End for each contrast
  if (isTRUE(show_progress)) {
    close(bar)
  }

  ## Because of the way I made tvalues/pvalues into a list
  ## If only 1 comparison was performed, the resulting data structure never gets coerced into a
  ## data frame.  Therefore I am performing this check which, if a single comparison was done, adds
  ## a second column, performs the coercion, then strips it away.  This is a stupid way
  ## of doing what I want.
  if (num_done == 1) {
    tvalues <- cbind(tvalues, t_data)
    pvalues <- cbind(pvalues, p_data)
    tvalues <- as.data.frame(tvalues)
    pvalues <- as.data.frame(pvalues)
    tvalues <- tvalues[-1]
    pvalues <- pvalues[-1]
  }
  comparisons[is.na(comparisons)] <- 0
  tvalues[is.na(tvalues)] <- 0
  pvalues[is.na(pvalues)] <- 1
  rownames(comparisons) <- rownames(data)
  rownames(tvalues) <- rownames(data)
  rownames(pvalues) <- rownames(data)
  all_tables <- list()

  message("Basic step 3/3: Creating faux DE Tables.")
  for (e in seq_along(colnames(comparisons))) {
    colname <- colnames(comparisons)[[e]]
    fc_column <- comparisons[, e]
    t_column <- as.numeric(tvalues[, e])
    p_column <- as.numeric(pvalues[, e])
    fc_column[mapply(is.infinite, fc_column)] <- 0
    numer_denom <- strsplit(x = colname, split = "_vs_")[[1]]
    numerator <- numer_denom[1]
    denominator <- numer_denom[2]
    num_col <- paste0("numerator_", fx)
    den_col <- paste0("denominator_", fx)
    fc_table <- data.frame(
        "numerator_var" = variance_table[[numerator]],
        "denominator_var" = variance_table[[denominator]],
        "t" = t_column,
        "p" = p_column,
        "logFC" = fc_column)
    fc_table[[num_col]] <- median_table[[numerator]]
    fc_table[[den_col]] <- median_table[[denominator]]
    fc_table <- fc_table[, c(num_col, den_col, "numerator_var",
                             "denominator_var", "t", "p", "logFC")]
    fc_table[["adjp"]] <- stats::p.adjust(as.numeric(fc_table[["p"]]), method = "BH")

    fc_table[[num_col]] <- signif(
        x = fc_table[[num_col]], digits = 4)
    fc_table[[den_col]] <- signif(
        x = fc_table[[den_col]], digits = 4)
    ## I am thinking to change my mind about this formatting, since
    ## it recasts the numbers as characters, and that is dumb.
    fc_table[["t"]] <- signif(x = fc_table[["t"]], digits = 4)
    fc_table[["logFC"]] <- signif(x = fc_table[["logFC"]], digits = 4)
    rownames(fc_table) <- rownames(data)
    all_tables[[e]] <- fc_table
  }
  message("Basic: Returning tables.")
  names(all_tables) <- colnames(comparisons)

  retlist <- list(
      "all_pairwise" = comparisons,
      "all_tables" = all_tables,
      "conditions_table" = table(conditions),
      "conditions" = conditions,
      "contrasts_performed" = contrasts_performed,
      "input_data" = data,
      "medians" = median_table,
      "method" = "basic",
      "variances" = variance_table)
  class(retlist) <- c("basic_pairwise", "list")
  if (!is.null(arglist[["basic_excel"]])) {
    retlist[["basic_excel"]] <- write_basic(retlist, excel = arglist[["basic_excel"]])
  }
  return(retlist)
}

#' Attempt to ensure that input data to basic_pairwise() is suitable.
#'
#' basic_pairwise() assumes log2 data as input, use this to ensure that is true.
#'
#' @param input An expressionset containing expt to test and/or modify.
#' @param force If we want to try out other distributed data sets, force it in using me.
#' @param ... future options, I think currently unused.
#' @return data ready for basic_pairwise()
#' @seealso [Biobase] [choose_dataset()] [normalize_expt()]
#' @examples
#' \dontrun{
#'  ready <- choose_basic_dataset(expt)
#' }
choose_basic_dataset <- function(input, force = FALSE, ...) {
  ## arglist <- list(...)
  warn_user <- 0
  conditions <- input[["conditions"]]
  batches <- input[["batches"]]
  data <- as.data.frame(exprs(input))
  tran_state <- input[["state"]][["transform"]]
  libsize <- NULL
  if (is.null(tran_state)) {
    tran_state <- "raw"
  }
  conv_state <- input[["state"]][["conversion"]]
  ## Note that voom takes care of this for us.
  if (is.null(conv_state)) {
    conv_state <- "raw"
  }
  norm_state <- input[["state"]][["normalization"]]
  if (is.null(norm_state)) {
    norm_state <- "raw"
  }
  filt_state <- input[["state"]][["filter"]]
  if (is.null(filt_state)) {
    filt_state <- "raw"
  }

  ready <- input
  if (isTRUE(force)) {
    message("Leaving the data alone, regardless of normalization state.")
  } else {
    if (filt_state == "raw") {
      message("Basic step 0/3: Filtering data.")
      ready <- sm(normalize_expt(ready, filter = TRUE))
    }
    if (norm_state == "raw") {
      message("Basic step 0/3: Normalizing data.")
      ready <- sm(normalize_expt(ready, norm = "quant"))
    }
    if (conv_state == "raw") {
      message("Basic step 0/3: Converting data.")
      ready <- sm(normalize_expt(ready, convert = "cbcbcpm"))
    }

  }
  ## No matter what we do, it must be logged.
  if (tran_state == "raw") {
    message("Basic step 0/3: Transforming data.")
    ready <- sm(normalize_expt(ready, transform = "log2"))
  }
  data <- as.data.frame(exprs(ready))
  libsize <- colSums(data)
  rm(ready)
  retlist <- list(
      "libsize" = libsize,
      "conditions" = conditions,
      "batches" = batches,
      "data" = data)
  return(retlist)
}

#' Writes out the results of a basic search using write_de_table()
#'
#' Looking to provide a single interface for writing tables from basic and friends.
#'
#' Tested in test_26basic.R
#'
#' @param data Output from basic_pairwise()
#' @param ... Options for writing the xlsx file.
#' @seealso [basic_pairwise()] [write_de_table()]
#' @examples
#' \dontrun{
#'  finished_comparison <- basic_pairwise(expressionset)
#'  data_list <- write_basic(finished_comparison)
#' }
#' @export
write_basic <- function(data, ...) {
  result <- write_de_table(data, type = "basic", ...)
  return(result)
}

## EOF