R/de_shared.R

Defines functions semantic_copynumber_filter semantic_copynumber_extract sanitize_expt mymakeContrasts make_pairwise_contrasts get_sig_genes ihw_adjust get_pairwise_gene_abundances get_abundant_genes do_pairwise disjunct_pvalues compare_significant_contrasts compare_logfc_plots correlate_de_tables compare_de_tables compare_de_results choose_model choose_limma_dataset choose_dataset choose_binom_dataset sva_modify_pvalues calculate_aucc binary_pairwise all_pairwise

Documented in all_pairwise binary_pairwise calculate_aucc choose_binom_dataset choose_dataset choose_limma_dataset choose_model compare_de_results compare_de_tables compare_logfc_plots compare_significant_contrasts correlate_de_tables disjunct_pvalues do_pairwise get_abundant_genes get_pairwise_gene_abundances get_sig_genes ihw_adjust make_pairwise_contrasts mymakeContrasts sanitize_expt semantic_copynumber_extract semantic_copynumber_filter sva_modify_pvalues

## de_shared.r: Merge differential expression methods into single invocations.
## The functions in this file seek to make it possible to treat all the DE
## methods identically and/or simultaneously.

#' Perform limma, DESeq2, EdgeR pairwise analyses.
#'
#' This takes an expt object, collects the set of all possible pairwise
#' comparisons, sets up experimental models appropriate for the differential
#' expression analyses, and performs them.
#'
#' This runs limma_pairwise(), deseq_pairwise(), edger_pairwise(),
#' basic_pairwise() each in turn. It collects the results and does some simple
#' comparisons among them.
#'
#' @param input Dataframe/vector or expt class containing count tables,
#'  normalization state, etc.
#' @param conditions Factor of conditions in the experiment.
#' @param batches Factor of batches in the experiment.
#' @param model_cond Include condition in the model?  This is likely always true.
#' @param modify_p Depending on how it is used, sva may require a modification
#'  of the p-values.
#' @param model_batch Include batch in the model?  This may be true/false/"sva"
#'  or other methods supported by all_adjusters().
#' @param filter Added because I am tired of needing to filter the data before
#'  invoking all_pairwise().
#' @param model_intercept Use an intercept model instead of cell means?
#' @param extra_contrasts Optional extra contrasts beyond the pairwise
#'  comparisons.  This can be pretty neat, lets say one has conditions
#'  A,B,C,D,E and wants to do (C/B)/A and (E/D)/A or (E/D)/(C/B) then use this
#'  with a string like:
#'   "c_vs_b_ctrla = (C-B)-A, e_vs_d_ctrla = (E-D)-A, de_vs_cb = (E-D)-(C-B)".
#' @param alt_model Alternate model to use rather than just condition/batch.
#' @param libsize Library size of the original data to help voom().
#' @param test_pca Perform some tests of the data before/after applying a given
#'  batch effect.
#' @param annot_df Annotations to add to the result tables.
#' @param parallel Use dopar to run limma, deseq, edger, and basic simultaneously.
#' @param do_basic Perform a basic analysis?
#' @param do_deseq Perform DESeq2 pairwise?
#' @param do_ebseq Perform EBSeq (caveat, this is NULL as opposed to TRUE/FALSE
#'  so it can choose).
#' @param do_edger Perform EdgeR?
#' @param do_limma Perform limma?
#' @param do_noiseq Perform noiseq?
#' @param do_dream Perform dream?
#' @param keepers Limit the pairwise search to a set of specific contrasts.
#' @param convert Modify the data with a 'conversion' method for PCA?
#' @param norm Modify the data with a 'normalization' method for PCA?
#' @param verbose Print extra information while running?
#' @param surrogates Either a number of surrogates or method to estimate it.
#' @param ...  Picks up extra arguments into arglist.
#' @return A list of limma, deseq, edger results.
#' @seealso [limma_pairwise()] [edger_pairwise()] [deseq_pairwise()] [ebseq_pairwise()]
#'  [basic_pairwise()]
#' @examples
#' \dontrun{
#'  lotsodata <- all_pairwise(input = expt, model_batch = "svaseq")
#'  summary(lotsodata)
#'  ## limma, edger, deseq, basic results; plots; and summaries.
#' }
#' @export
all_pairwise <- function(input = NULL, conditions = NULL,
                         batches = NULL, model_cond = TRUE,
                         modify_p = FALSE, model_batch = TRUE, filter = NULL,
                         model_intercept = FALSE, extra_contrasts = NULL,
                         alt_model = NULL, libsize = NULL, test_pca = TRUE,
                         annot_df = NULL, parallel = TRUE,
                         do_basic = TRUE, do_deseq = TRUE, do_ebseq = NULL,
                         do_edger = TRUE, do_limma = TRUE, do_noiseq = TRUE,
                         do_dream = FALSE, keepers = NULL,
                         convert = "cpm", norm = "quant", verbose = TRUE,
                         surrogates = "be", ...) {
  arglist <- list(...)
  ## Coerce explicitly-passed NULLs for the model arguments back to usable
  ## defaults so the later isTRUE()/class() dispatch below behaves.
  if (is.null(model_cond)) {
    model_cond <- TRUE
  }
  if (is.null(model_batch)) {
    model_batch <- FALSE
  }
  if (is.null(model_intercept)) {
    model_intercept <- FALSE
  }

  ## EBSeq made an incompatible change in its most recent release.
  ## I unthinkingly changed my code to match it without considering the old
  ## bioconductor release.  Therefore, disable EBSeq on bioconductor < 3.18.
  if (as.numeric(as.character(BiocManager::version())) < 3.18) {
    warning("I changed ebseq_pairwise for the new bioc release, it needs >= 3.18.")
    do_ebseq <- FALSE
  }

  ## Describe to the user what this invocation is about to do: which contrasts,
  ## whether a batch factor/surrogates are in the model, and any pre-filtering.
  if (isTRUE(model_cond)) {
    if (is.null(keepers)) {
      mesg("This DE analysis will perform all pairwise comparisons among:")
    } else {
      mesg("This DE analysis will perform the subset of comparisons provided.")
    }
    print(table(pData(input)[["condition"]]))
    if (isTRUE(model_batch)) {
      mesg("This analysis will include a batch factor in the model comprised of:")
      print(table(pData(input)[["batch"]]))
    } else if ("character" %in% class(model_batch)) {
      mesg("This analysis will include surrogate estimates from: ", model_batch, ".")
    } else if ("matrix" %in% class(model_batch)) {
      mesg("This analysis will include a matrix of surrogate estimates.")
    }
    if (!is.null(filter)) {
      mesg("This will pre-filter the input data using normalize_expt's: ",
           filter, " argument.")
    }
  } else {
    mesg("This analysis is not using the condition factor from the data.")
  }

  ## Optionally low-count filter the data before handing it to the DE methods.
  if (!is.null(filter)) {
    input <- sm(normalize_expt(input, filter = filter))
  }
  ## When model_batch is a method name (e.g. "svaseq"), estimate the surrogate
  ## variables now and replace model_batch with the resulting adjustment matrix.
  null_model <- NULL
  sv_model <- NULL
  model_type <- model_batch
  if (class(model_batch)[1] == "character") {
    model_params <- all_adjusters(input, estimate_type = model_batch,
                                  surrogates = surrogates)
    model_batch <- model_params[["model_adjust"]]
    null_model <- model_params[["null_model"]]
    sv_model <- model_batch
  }

  ## Add a little logic to do a before/after batch PCA plot.
  pre_pca <- NULL
  post_pca <- NULL
  if (isTRUE(test_pca)) {
    ## 'pre' PCA: filtered/log2/converted/normalized data with NO batch handling.
    pre_batch <- sm(normalize_expt(input, filter = TRUE, batch = FALSE,
                                   transform = "log2", convert = convert, norm = norm))
    mesg("Plotting a PCA before surrogate/batch inclusion.")
    pre_pca <- plot_pca(pre_batch, plot_labels = FALSE,
                        ...)
    post_batch <- pre_batch
    if (isTRUE(model_type)) {
      ## TRUE means a plain batch factor: visualize via limma's removeBatchEffect.
      model_type <- "batch in model/limma"
      if (isTRUE(verbose)) {
        mesg("Using limma's removeBatchEffect to visualize with(out) batch inclusion.")
      }
      post_batch <- sm(normalize_expt(input, filter = TRUE, batch = TRUE, transform = "log2"))
    } else if (class(model_type)[1] == "character") {
      mesg("Using ", model_type, " to visualize before/after batch inclusion.")
      test_norm <- "quant"
      if (model_type != "TRUE" && model_type != FALSE) {
        ## Then it is probably some sort of sva which will have a hard time with quantile.
        test_norm <- "raw"
      }
      mesg("Performing a test normalization with: ", test_norm)
      if (!isFALSE(model_batch)) {
        ## normalize_expt() may fail for some surrogate methods; tolerate that.
        post_batch <- try(normalize_expt(input, filter = TRUE, batch = model_type,
                                         transform = "log2", convert = "cpm",
                                         norm = test_norm, surrogates = surrogates))
      } else {
        post_batch <- NULL
      }
    } else {
      model_type <- "none"
      mesg("Assuming no batch in model for testing pca.")
    }
    post_pca <- plot_pca(post_batch, plot_labels = FALSE,
                         ...)
  }

  ## do_ebseq defaults to NULL, this is so that we can query the number of
  ## conditions and choose accordingly. EBSeq is very slow, so if there are more
  ## than 3 or 4 conditions I do not think I want to wait for it.
  ## NOTE(review): this reads input[["conditions"]] while the messages above use
  ## pData(input)[["condition"]] -- confirm the expt extract method makes these
  ## equivalent for all input classes.
  num_conditions <- 0
  if (is.null(conditions)) {
    num_conditions <- length(levels(as.factor(input[["conditions"]])))
  } else {
    num_conditions <- length(levels(as.factor(conditions)))
  }
  if (is.null(do_ebseq)) {
    if (num_conditions > 4) {
      do_ebseq <- FALSE
    } else if (num_conditions < 2) {
      stop("Unable to find the number of conditions in the data.")
    } else {
      do_ebseq <- TRUE
    }
  }

  ## Put a series of empty lists in the final results data structure
  ## so that later I will know to perform each of these analyses without
  ## having to query do_method.  num_cpus_needed ends up as one cpu per method.
  num_cpus_needed <- 0
  results <- list()
  if (isTRUE(do_basic)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["basic"]] <- list()
  }
  if (isTRUE(do_deseq)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["deseq"]] <- list()
  }
  if (isTRUE(do_ebseq)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["ebseq"]] <- list()
  }
  if (isTRUE(do_edger)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["edger"]] <- list()
  }
  if (isTRUE(do_limma)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["limma"]] <- list()
  }
  if (isTRUE(do_noiseq)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["noiseq"]] <- list()
  }
  if (isTRUE(do_dream)) {
    num_cpus_needed <- num_cpus_needed + 1
    results[["dream"]] <- list()
  }

  res <- NULL
  if (isTRUE(parallel)) {
    ## Make a cluster with one cpu for each method used: basic, edger, ebseq, limma, deseq.
    cl <- parallel::makeCluster(num_cpus_needed)
    registered <- doParallel::registerDoParallel(cl)
    ## Quietly ensure the parallel infrastructure namespaces are loadable.
    tt <- sm(requireNamespace("parallel"))
    tt <- sm(requireNamespace("doParallel"))
    tt <- sm(requireNamespace("iterators"))
    tt <- sm(requireNamespace("foreach"))
    ## NOTE(review): the loop variable 'c' masks base::c() inside the %dopar%
    ## body; harmless here, but a rename would be safer.
    res <- foreach(c = seq_along(results),
                   .packages = c("hpgltools")) %dopar% {
      type <- names(results)[c]
      results[[type]] <- do_pairwise(
        type, input = input, conditions = conditions, batches = batches,
        model_cond = model_cond, model_batch = model_batch, model_intercept = model_intercept,
        extra_contrasts = extra_contrasts, alt_model = alt_model, libsize = libsize,
        annot_df = annot_df, surrogates = surrogates, keepers = keepers, ...)
    } ## End foreach() %dopar% { }
    parallel::stopCluster(cl)
    if (isTRUE(verbose)) {
      mesg("Finished running DE analyses, collecting outputs.")
    }
    ## foreach returns the results in no particular order
    ## Therefore, I will reorder the results now and ensure that they are happy.
    for (r in seq_along(res)) {
      a_result <- res[[r]]
      type <- a_result[["type"]]
      results[[type]] <- a_result
    }
    rm(res)
    ## End performing parallel comparisons
  } else {
    ## Serial fallback: run each requested method in turn.
    for (type in names(results)) {
      if (isTRUE(verbose)) {
        mesg("Starting ", type, "_pairwise().")
      }
      results[[type]] <- do_pairwise(
        type, input = input, conditions = conditions, batches = batches,
        model_cond = model_cond, model_batch = model_batch, model_intercept = model_intercept,
        extra_contrasts = extra_contrasts, alt_model = alt_model, libsize = libsize,
        annot_df = annot_df, surrogates = surrogates, keepers = keepers,
        ...)
    }
  } ## End performing a serial comparison

  original_pvalues <- NULL
  ## Add in a little work to re-adjust the p-values in the situation where sva
  ## was used.  Only perform this f.pvalue() adjustment if you modify the data
  ## without making limma/deseq/edger aware of the modified model.  Ergo, if we
  ## feed sv_model to this function, then by definition, we do not want to use
  ## this function.
  ## Thus we will use modified_data to note the data was modified by sva.
  ## NOTE(review): modified_data is hard-coded FALSE here, so this branch never
  ## runs as written -- confirm whether it should be driven by modify_p.
  modified_data <- FALSE
  if (is.null(sv_model) && isTRUE(modified_data)) {
    ret <- sva_modify_pvalues(results)
    results <- ret[["results"]]
    original_pvalues <- ret[["original_pvalues"]]
  }

  ## Cross-method agreement: correlate logFCs among the methods' tables.
  result_comparison <- correlate_de_tables(results, annot_df = annot_df,
                                           extra_contrasts = extra_contrasts)
  ## The first few elements of this list are being passed through into the return
  ## So that if I use combine_tables() I can report in the resulting tables
  ## some information about what was performed.
  ## NOTE(review): dream results (if computed) are not included in this list --
  ## confirm that is intentional.
  ret <- list(
    "basic" = results[["basic"]],
    "deseq" = results[["deseq"]],
    "ebseq" = results[["ebseq"]],
    "edger" = results[["edger"]],
    "limma" = results[["limma"]],
    "noiseq" = results[["noiseq"]],
    "batch_type" = model_type,
    "comparison" = result_comparison,
    "extra_contrasts" = extra_contrasts,
    "input" = input,
    "model_cond" = model_cond,
    "model_batch" = model_batch,
    "original_pvalues" = original_pvalues,
    "pre_batch" = pre_pca,
    "post_batch" = post_pca)
  class(ret) <- c("all_pairwise", "list")

  ## Convenience: immediately write a combined excel workbook when requested.
  if (!is.null(arglist[["combined_excel"]])) {
    if (isTRUE(verbose)) {
      mesg("Invoking combine_de_tables().")
    }
    combined <- combine_de_tables(ret, excel = arglist[["combined_excel"]], ...)
    ret[["combined"]] <- combined
  }
  return(ret)
}

#' Perform all_pairwise only using deseq/edger.
#'
#' The thing I want to do which I presume will be of use to Zhezhen is
#' to have a variant of this which takes the list of interesting
#' contrasts and only performs them rather than my default of doing
#' all possible pairwise contrasts.  I think that will only require a
#' little logic in make_contrasts to skip contrasts not in the list of
#' interest.
#'
#' @param ... Args usually passed to all_pairwise()
#' @export
binary_pairwise <- function(...) {
  ## Delegate to all_pairwise(), pinning the method flags so that only
  ## DESeq2 and EdgeR are run; everything else passes through untouched.
  ret <- all_pairwise(..., do_basic = FALSE, do_deseq = TRUE,
                      do_dream = FALSE, do_ebseq = FALSE, do_edger = TRUE,
                      do_limma = FALSE, do_noiseq = FALSE)
  return(ret)
}

#' Calculate the Area under the Concordance Curve.
#'
#' This is taken verbatim from a recent paper sent to me by Julie
#' Cridland.
#'
#' @param tbl DE table
#' @param tbl2 Second table
#' @param px first set of p-values column
#' @param py second set
#' @param lx first set of logFCs column
#' @param ly second set
#' @param cor_method Method to pass to cor().
#' @param topn Number of genes to consider (or percentage of the
#'  whole).
#' @export
calculate_aucc <- function(tbl, tbl2 = NULL, px = "deseq_adjp", py = "edger_adjp",
                           lx = "deseq_logfc", ly = "edger_logfc", cor_method = "pearson",
                           topn = 0.1) {
  ## topn is either an integer number of genes (> 100) or a proportion
  ## (0 < topn < 1) of the rows in tbl; anything else is an error.
  if (topn <= 0 || (topn > 1 && topn <= 100)) {
    stop("topn needs to be either a float from 0-1 or an integer bigger than 100.")
  } else if (topn < 1) {
    topn <- ceiling(nrow(tbl) * topn)
  }

  ## By default, assume all the relevant columns are in the tbl variable.
  ## However, if a second tbl is provided, merge the two by rownames first.
  if (is.null(tbl2)) {
    mesg("A second table was not provided, performing comparison among columns of the first.")
  } else {
    tmp_tbl <- merge(tbl, tbl2, by = "row.names")
    rownames(tmp_tbl) <- tmp_tbl[["Row.names"]]
    tmp_tbl[["Row.names"]] <- NULL
    tbl <- tmp_tbl
    rm("tmp_tbl")
    ## merge() disambiguates duplicated column names with .x/.y suffixes.
    if (px == py) {
      px <- paste0(px, ".x")
      py <- paste0(py, ".y")
      lx <- paste0(lx, ".x")
      ly <- paste0(ly, ".y")
    }
  }

  x_df <- tbl[, c(px, lx)]
  x_order <- rownames(x_df)
  y_df <- tbl[x_order, c(py, ly)]

  ## Do a simple correlation test of the logFCs just to have a comparison point.
  simple_cor <- cor.test(tbl[[lx]], tbl[[ly]], method = cor_method)

  ## For the concordance curve (AUCC), rank genes in both tables in
  ## ascending order of (adjusted) p-value, i.e. most significant first.
  x_idx <- order(x_df[[px]], decreasing = FALSE)
  x_df <- x_df[x_idx, ]
  y_idx <- order(y_df[[py]], decreasing = FALSE)
  y_df <- y_df[y_idx, ]

  ## Then, for lists of matching size k = 1..topn, compute the size of the
  ## intersection of the top-k genes from each ranking.  This yields a curve
  ## relating the number of shared genes to the number of top-ranked genes
  ## considered.  %in% handles the k == 1 case, so no special-casing is needed.
  intersections <- rep(0, topn)
  for (i in seq_len(topn)) {
    x_set <- rownames(x_df)[seq_len(i)]
    y_set <- rownames(y_df)[seq_len(i)]
    intersections[i] <- sum(x_set %in% y_set)
  }

  ## The area under this curve is the sum of all intersection sizes,
  ## normalized to [0, 1] by its maximum possible value, k * (k + 1) / 2.
  sumint <- sum(intersections)
  max_possible <- (topn * (topn + 1)) / 2
  aucc <- sumint / max_possible

  ## Fit a line through the curve so its slope can be reported on the plot.
  intersection_df <- data.frame(x = seq_len(topn), y = intersections)
  intersection_lm <- lm(formula = y ~ x, data = intersection_df)
  inter <- as.numeric(coef(intersection_lm))
  inter_inter <- inter[1]
  inter_slope <- inter[2]
  intersection_plot <- ggplot2::ggplot(data = intersection_df,
                                       aes(x = .data[["x"]], y = .data[["y"]])) +
    ggplot2::geom_point() +
    ggplot2::geom_abline(colour = "grey", slope = 1, intercept = 0) +
    ggplot2::scale_x_continuous(expand = c(0, 0), limits = c(0, topn)) +
    ggplot2::scale_y_continuous(expand = c(0, 0)) +
    ggplot2::geom_abline(colour = "blue", slope = inter_slope, intercept = inter_inter) +
    ggplot2::geom_smooth(method = "loess", formula = y ~ x) +
    ggplot2::annotate("text", x = topn / 1.5, y = topn / 10,
                      label = glue::glue("AUCC: {signif(aucc, 3)},
 AUCC slope: {signif(inter_slope, 3)}"))

  retlist <- list(
    "aucc" = aucc,
    "cor" = simple_cor,
    "plot" = intersection_plot)
  class(retlist) <- "aucc_info"
  return(retlist)
}

#' Use sva's f.pvalue to adjust p-values for data adjusted by combat.
#'
#' This is from section 5 of the sva manual:  "Adjusting for surrogate
#' values using the f.pvalue function." The following chunk of code is longer
#' and more complex than I would like. This is because f.pvalue() assumes a
#' pairwise comparison of a data set containing only two experimental
#' factors. As a way to provide an example of _how_ to calculate
#' appropriately corrected p-values for surrogate factor adjusted models,
#' this is great; but when dealing with actual data, it falls a bit short.
#'
#' @param results Table of differential expression results.
#' @param verbose Print progress messages while readjusting p-values?
#' @return List containing the original p-values and the modified results.
#' @seealso [sva]
sva_modify_pvalues <- function(results, verbose = TRUE) {
  input <- results[["input"]]
  original_pvalues <- data.table::data.table(
    rownames = rownames(results[["edger"]][["all_tables"]][[1]]))
  if (isTRUE(verbose)) {
    mesg("Using f.pvalue() to modify the returned p-values of deseq/limma/edger.")
  }
  for (it in seq_along(results[["edger"]][["all_tables"]])) {
    name <- names(results[["edger"]][["all_tables"]])[it]
    if (isTRUE(verbose)) {
      mesg("Readjusting the p-values for comparison: ", name)
    }
    namelst <- strsplit(x = name, split = "_vs_")
    ## something like 'mutant'
    first <- namelst[[1]][[1]]
    ## something like 'wildtype', ergo the contrast was "mutant_vs_wildtype"
    second <- namelst[[1]][[2]]
    ## The comments that follow will use mutant and wildtype as examples

    ## I am going to need to extract the set of data for the samples in
    ## 'first' and 'second'. I will need to also extract the surrogates for
    ## those samples from sv_model. Then I rewrite null_model as the
    ## subset(null_model, samples included) and rewrite sv_model as
    ## subset(sv_model, samples_included) in that rewrite, there will just be
    ## conditions a, b where a and b are the subsets for first and
    ## second. Then the sv_model will be a for the first samples and b for the
    ## second. With that information, I should be able to feed sva::f.pvalue
    ## the appropriate information for it to run properly. The resulting
    ## pvalues will then be appropriate for backfilling the various tables
    ## from edger/limma/deseq.

    ## Get the samples from the limma comparison which are condition 'mutant'
    samples_first_idx <- results[["limma"]][["conditions"]] == first
    num_first <- sum(samples_first_idx)
    ## Subset the expressionset and make a new matrix of only the 'mutant' samples
    samples_first <- exprs(input)[, samples_first_idx]
    ## Repeat for the 'wildtype' samples, when finished the m columns for
    ## samples_first 'mutant' and the n samples of samples_second will be
    ## 'wildtype'
    samples_second_idx <- results[["limma"]][["conditions"]] == second
    num_second <- sum(samples_second_idx)
    samples_second <- exprs(results[["input"]])[, samples_second_idx]
    ## Concatenate the 'mutant' and 'wildtype' samples by column
    included_samples <- cbind(samples_first, samples_second)
    ## Arbitrarily call them 'first' and 'second'
    colnames(included_samples) <- c(rep("first", times = num_first),
                                    rep("second", times = num_second))
    ## Do the same thing, but using the rows of the sva model adjustment
    first_sva <- results[["limma"]][["sv_model"]][samples_first_idx, ]
    second_sva <- results[["limma"]][["sv_model"]][samples_second_idx, ]
    ## But instead of just appending them, I need a matrix of 0s and 1s
    ## identifying which sv rows correspond to 'wildtype' and 'mutant'
    first_model <- append(rep(1, num_first), rep(0, num_second))
    second_model <- append(rep(0, num_first), rep(1, num_second))
    ## Once I have them, make the subset model matrix with append and cbind
    new_sv_model <- append(first_sva, second_sva)
    new_model <- cbind(first_model, second_model, new_sv_model)
    colnames(new_model) <- c("first", "second", "sv")
    ## The sva f.pvalue requires a null model of the appropriate size, create
    ## that here.
    new_null <- cbind(rep(1, (num_first + num_second)), new_sv_model)
    ## And give its columns suitable names
    colnames(new_null) <- c("null", "sv")
    ## Now all the pieces are in place, call f.pvalue().
    new_pvalues <- try(sva::f.pvalue(included_samples, new_model, new_null), silent = TRUE)
    ## For strange non-pairwise contrasts, f.pvalue() should fail.  Check for
    ## the failure _before_ touching the result; a try-error object cannot be
    ## meaningfully passed to is.na().
    if (inherits(new_pvalues, "try-error")) {
      new_pvalues <- NULL
      warning("Unable to adjust pvalues for: ", name)
      warning("If this was not for an extra contrast, then this is a serious problem.")
    } else {
      ## For some things, f.pvalue returns NA, this is unacceptable.
      na_pvalues_idx <- is.na(new_pvalues)
      ## Replace NA p-values with a placeholder under the assumption that
      ## something is fubar.
      ## NOTE(review): an earlier comment said "set it to 1" (not significant)
      ## but the code assigns 2.2e-16 (highly significant).  Keeping the
      ## existing behavior; confirm which is actually intended.
      new_pvalues[na_pvalues_idx] <- 2.2E-16
      ## Most of the time it should succeed, so do a BH adjustment of the new values.
      new_adjp <- p.adjust(new_pvalues, method = "BH")
      ## Now I need to fill in the tables with these new values.
      ## This section is a little complex.  In brief, it pulls the appropriate
      ## columns from each of the limma, edger, and deseq tables
      ## copies them to a new data.table named 'original_pvalues', and
      ## then replaces them with the just-calculated (adj)p-values.

      ## Start with limma, make no assumptions about table return-order
      limma_table_order <- rownames(results[["limma"]][["all_tables"]][[name]])
      reordered_pvalues <- new_pvalues[limma_table_order]
      reordered_adjp <- new_adjp[limma_table_order]
      ## Create a temporary dt with the old p-values and merge it into original_pvalues.
      tmpdt <- data.table::data.table(
        results[["limma"]][["all_tables"]][[name]][["P.Value"]])
      tmpdt[["rownames"]] <- rownames(results[["limma"]][["all_tables"]][[name]])
      ## Change the column name of the new data to reflect that it is from limma.
      colnames(tmpdt) <- c(glue("limma_{name}"), "rownames")
      original_pvalues <- merge(original_pvalues, tmpdt, by = "rownames")
      ## Swap out the p-values and adjusted p-values.
      results[["limma"]][["all_tables"]][[name]][["P.Value"]] <- reordered_pvalues
      results[["limma"]][["all_tables"]][[name]][["adj.P.Val"]] <- reordered_adjp

      ## Repeat the above verbatim, but for edger
      edger_table_order <- rownames(results[["edger"]][["all_tables"]][[name]])
      reordered_pvalues <- new_pvalues[edger_table_order]
      reordered_adjp <- new_adjp[edger_table_order]
      tmpdt <- data.table::data.table(
        results[["edger"]][["all_tables"]][[name]][["PValue"]])
      tmpdt[["rownames"]] <- rownames(results[["edger"]][["all_tables"]][[name]])
      colnames(tmpdt) <- c(glue("edger_{name}"), "rownames")
      original_pvalues <- merge(original_pvalues, tmpdt, by = "rownames")
      results[["edger"]][["all_tables"]][[name]][["PValue"]] <- reordered_pvalues
      results[["edger"]][["all_tables"]][[name]][["FDR"]] <- reordered_adjp

      ## Ibid, with the reordering performed against the deseq table's own row
      ## order (previously the edger-ordered vectors were reused here by
      ## mistake, writing the values into the wrong rows).
      deseq_table_order <- rownames(results[["deseq"]][["all_tables"]][[name]])
      reordered_pvalues <- new_pvalues[deseq_table_order]
      reordered_adjp <- new_adjp[deseq_table_order]
      tmpdt <- data.table::data.table(
        results[["deseq"]][["all_tables"]][[name]][["P.Value"]])
      tmpdt[["rownames"]] <- rownames(results[["deseq"]][["all_tables"]][[name]])
      colnames(tmpdt) <- c(glue("deseq_{name}"), "rownames")
      original_pvalues <- merge(original_pvalues, tmpdt, by = "rownames")
      results[["deseq"]][["all_tables"]][[name]][["P.Value"]] <- reordered_pvalues
      results[["deseq"]][["all_tables"]][[name]][["adj.P.Val"]] <- reordered_adjp
    } ## End checking that f.pvalue worked.
  }  ## End foreach table
  original_pvalues <- as.data.frame(original_pvalues)
  retlist <- list(
    "original" = original_pvalues,
    "results" = results)
  return(retlist)
}

#' A sanity check that a given set of data is suitable for methods which assume
#' a negative binomial distribution of input.
#'
#' Take an expt and poke at it to ensure that it will not result in troubled
#' results.
#'
#' Invoked by deseq_pairwise() and edger_pairwise().
#'
#' @param input Expressionset containing expt object.
#' @param verbose Print some information about what is happening?
#' @param force Ignore every warning and just use this data.
#' @param ... Extra arguments passed to arglist.
#' @return dataset suitable for limma analysis
#' @seealso [DESeq2] [edgeR] [choose_basic_dataset()] [choose_limma_dataset()]
choose_binom_dataset <- function(input, verbose = TRUE, force = FALSE, ...) {
  ## arglist <- list(...)
  input_class <- class(input)
  ## I think I would like to make this function smarter so that it will remove
  ## the log2 from transformed data.
  data <- NULL
  warn_user <- 0
  libsize <- NULL
  ## Initialize these up front: the non-expt branch below never sets them, and
  ## previously that left 'conditions' undefined when building the return list.
  conditions <- NULL
  batches <- NULL
  if ("expt" %in% input_class) {
    conditions <- input[["conditions"]]
    batches <- input[["batches"]]
    data <- as.data.frame(exprs(input))
    ## As I understand it, EdgeR fits a binomial distribution
    ## and expects data as integer counts, not floating point nor a log2
    ## transformation. Thus, having the 'normalization' state set to something
    ## other than 'raw' is a likely violation of its stated preferred/demanded
    ## input.  There are of course ways around this but one should not take them
    ## lightly, or ever.
    tran_state <- input[["state"]][["transform"]]
    if (is.null(tran_state)) {
      tran_state <- "raw"
    }
    conv_state <- input[["state"]][["conversion"]]
    if (is.null(conv_state)) {
      conv_state <- "raw"
    }
    norm_state <- input[["state"]][["normalization"]]
    if (is.null(norm_state)) {
      norm_state <- "raw"
    }
    filt_state <- input[["state"]][["filter"]]
    if (is.null(filt_state)) {
      filt_state <- "raw"
    }
    ## Rounding does not change the distributional assumptions, treat it as raw.
    if (norm_state == "round") {
      norm_state <- "raw"
    }
    libsize <- input[["libsize"]]

    if (isTRUE(force)) {
      ## Setting force to TRUE allows one to round the data to fool edger/deseq
      ## into accepting it. This is a pretty terrible thing to do
      if (isTRUE(verbose)) {
        message("About to round the data, this is a pretty terrible thing to do. ",
                "But if you, like me, want to see what happens when you put ",
                "non-standard data into deseq, then here you go.")
      }
      data <- round(data)
      ## Clamp negatives and NAs to zero so the result is a valid count table.
      less_than <- data < 0
      data[less_than] <- 0
      na_idx <- is.na(data)
      data[na_idx] <- 0
      warn_user <- 1
    } else if (norm_state != "raw" && tran_state != "raw" && conv_state != "raw") {
      ## These if statements may be insufficient to check for the appropriate
      ## input for deseq.  Fully processed data: fall back to the backup copy.
      backup <- get_backup_expression_data(input)
      data <- exprs(backup)
      libsize <- backup[["libsize"]]
    } else if (norm_state != "raw" || tran_state != "raw") {
      ## This makes use of the fact that the order of operations in the
      ## normalization function is
      ## static. filter->normalization->convert->batch->transform. Thus, if the
      ## normalized state is not raw, we can look back either to the filtered or
      ## original data. The same is true for the transformation state.
      if (isTRUE(verbose)) {
        message("EdgeR/DESeq expect raw data as input, reverting to count filtered data.")
      }
      data <- input[["normalized"]][["intermediate_counts"]][["filter"]][["count_table"]]
      if (is.null(data)) {
        data <- input[["normalized"]][["intermediate_counts"]][["original"]]
      }
    } else {
      if (isTRUE(verbose)) {
        message("The data should be suitable for EdgeR/DESeq/EBSeq.\n",
                "If they freak out, check the state of the count table\n",
                "and ensure that it is in integer counts.")
      }
    }
    ## End testing if normalization has been performed
  } else {
    ## This is another function which would profit from using a method test.
    ## Non-expt input: take it as-is and derive library sizes from column sums.
    data <- as.data.frame(input)
    libsize <- colSums(data)
  }

  retlist <- list(
    "libsize" = libsize,
    "conditions" = conditions,
    "batches" = batches,
    "data" = data)
  if (warn_user == 1) {
    warning("This data was inappropriately forced into integers.")
  }
  return(retlist)
}

#' Choose a suitable data set for Edger/DESeq
#'
#' The _pairwise family of functions all demand data in specific formats.
#' This tries to make that consistent.
#'
#' Invoked by _pairwise().
#'
#' @param input Expt input.
#' @param choose_for One of limma, deseq, edger, or basic.  Defines the
#'  requested data state.
#' @param force Force non-standard data?
#' @param verbose Print some information about what is happening?
#' @param ... More options for future expansion.
#' @return List the data, conditions, and batches in the data.
#' @seealso [choose_binom_dataset()] [choose_limma_dataset()] [choose_basic_dataset()]
#' @examples
#' \dontrun{
#'  starting_data <- create_expt(metadata)
#'  modified_data <- normalize_expt(starting_data, transform = "log2", norm = "quant")
#'  a_dataset <- choose_dataset(modified_data, choose_for = "deseq")
#'  ## choose_dataset should see that log2 data is inappropriate for DESeq2 and
#'  ## return it to a base10 state.
#' }
#' @export
choose_dataset <- function(input, choose_for = "limma", force = FALSE, verbose = TRUE, ...) {
  ## Dispatch on the requested method.  edger and deseq share the
  ## count-based (binomial-model) chooser; any unrecognized method
  ## yields NULL after an optional message.
  chosen <- switch(
    choose_for,
    "limma" = choose_limma_dataset(input, force = force, ...),
    "basic" = choose_basic_dataset(input, force = force, ...),
    "edger" = choose_binom_dataset(input, force = force, ...),
    "deseq" = choose_binom_dataset(input, force = force, ...),
    {
      if (isTRUE(verbose)) {
        message("Unknown tool for which to choose a data set.")
      }
      NULL
    })
  return(chosen)
}

#' A sanity check that a given set of data is suitable for analysis by limma.
#'
#' Take an expt and poke at it to ensure that it will not result in troubled
#' limma results.
#'
#' @param input Expressionset containing expt object.
#' @param force Ignore warnings and use the provided data as-is.
#' @param which_voom Choose between limma's voom, voomWithQualityWeights, or the
#'  hpgl equivalents.
#' @param verbose Print some information about what is happening?
#' @param ... Extra arguments passed to arglist.
#' @return List containing the library sizes, conditions, batches, and the
#'  dataset suitable for limma analysis.
#' @seealso [limma] [choose_dataset()]
choose_limma_dataset <- function(input, force = FALSE, which_voom = "limma", verbose = TRUE, ...) {
  ## arglist <- list(...)
  input_class <- class(input)
  data <- NULL
  libsize <- NULL
  ## Initialize these so that the non-expt branch below returns a complete
  ## retlist instead of erroring on undefined objects.
  conditions <- NULL
  batches <- NULL
  ## It turns out, a more careful examination of how normalization affects the
  ## results, the above seems only to be true if the following are true:
  ## 1.  There are >2-3k features(genes/transcripts) with a full range of count values.
  ## 2.  One does not attempt to use sva, or at least one uses sva before
  ##     messing with the normalization state.
  ## 2a. #2 primarily applies if one is using quantile normalization, it looks
  ##     like tmm/rle does not have so profound an effect, and this effect is
  ##     tightly bound with the state of #1 -- in other words, if one has nice
  ##     dense data with low->high counts in an even distribution, then
  ##     quantile+sva might be ok. But if that is not true, then one should
  ##     expect a poor result.
  ## For these reasons I am telling this function to revert to non-normalized
  ## data unless force is on, just like I do for edger/deseq.  I think to help
  ## this, I will add a parameter which allows one to to turn on/off
  ## normalization at the voom() step.

  if ("expt" %in% input_class) {
    conditions <- input[["conditions"]]
    batches <- input[["batches"]]
    libsize <- input[["libsize"]]

    ## Treat any missing state element as 'raw'.
    tran_state <- input[["state"]][["transform"]]
    ## Note that voom will take care of this for us.
    if (is.null(tran_state)) {
      tran_state <- "raw"
    }
    conv_state <- input[["state"]][["conversion"]]
    ## Note that voom takes care of this for us.
    if (is.null(conv_state)) {
      conv_state <- "raw"
    }
    norm_state <- input[["state"]][["normalization"]]
    if (is.null(norm_state)) {
      norm_state <- "raw"
    }
    filt_state <- input[["state"]][["filter"]]
    if (is.null(filt_state)) {
      filt_state <- "raw"
    }

    data <- exprs(input)
    if (isTRUE(force)) {
      if (isTRUE(verbose)) {
        mesg("Leaving the data alone, regardless of normalization state.")
      }
      retlist <- list(
        "libsize" = libsize,
        "conditions" = conditions,
        "batches" = batches,
        "data" = data)
      return(retlist)
    }

    ## If we are using limma::voom*, then make sure we do things the limma way.
    ## If we use the hpgltools::hpgl_voom*, let the freak flags fly.
    if (grepl(pattern = "limma", x = which_voom)) {
      ## Limma's voom requires we return log2(cpm()) to base 10.
      ## Otherwise it should accept pretty much anything.
      if (tran_state == "log2") {
        if (isTRUE(verbose)) {
          mesg("Using limma's voom, returning to base 10.")
        }
        data <- (2 ^ data) - 1
      }
    }
  } else {
    ## Non-expt input: accept a matrix/data.frame of counts directly; the
    ## library sizes are simply the column sums.
    data <- as.data.frame(input)
    libsize <- colSums(data)
  }
  retlist <- list(
    "libsize" = libsize,
    "conditions" = conditions,
    "batches" = batches,
    "data" = data)
  return(retlist)
}

#' Try out a few experimental models and return a likely working option.
#'
#' The _pairwise family of functions all demand an experimental model.  This
#' tries to choose a consistent and useful model for all for them.  This does
#' not try to do multi-factor, interacting, nor dependent variable models, if
#' you want those do them yourself and pass them off as alt_model.
#'
#' Invoked by the _pairwise() functions.
#'
#' @param input Input data used to make the model.
#' @param conditions Factor of conditions in the putative model.
#' @param batches Factor of batches in the putative model.
#' @param model_batch Try to include batch in the model?
#' @param model_cond Try to include condition in the model? (Yes!)
#' @param model_intercept Use an intercept model instead of cell-means?
#' @param alt_model Use your own model.
#' @param alt_string String describing an alternate model.
#' @param intercept Choose an intercept for the model as opposed to 0.
#' @param reverse Reverse condition/batch in the model?  This shouldn't/doesn't
#'  matter but I wanted to test.
#' @param contr List of contrasts.arg possibilities.
#' @param surrogates Number of or method used to choose the number of surrogate
#'  variables.
#' @param verbose Print some information about what is happening?
#' @param ... Further options are passed to arglist.
#' @return List including a model matrix and strings describing cell-means and
#'  intercept models.
#' @seealso [stats::model.matrix()]
#' @examples
#' \dontrun{
#'  a_model <- choose_model(expt, model_batch = TRUE, model_intercept = FALSE)
#'  a_model$chosen_model
#'  ## ~ 0 + condition + batch
#' }
#' @export
choose_model <- function(input, conditions = NULL, batches = NULL, model_batch = TRUE,
                         model_cond = TRUE, model_intercept = FALSE,
                         alt_model = NULL, alt_string = NULL,
                         intercept = 0, reverse = FALSE, contr = NULL,
                         surrogates = "be", verbose = TRUE, ...) {
  design <- NULL
  ## A bare matrix/data.frame carries no metadata, so only ask for pData()
  ## when the input looks like an expressionset/expt.
  if (class(input)[1] != "matrix" && class(input)[1] != "data.frame") {
    design <- pData(input)
  }
  if (is.null(design)) {
    conditions <- as.factor(conditions)
    batches <- as.factor(batches)
    design <- data.frame("condition" = conditions,
                         "batch" = batches,
                         stringsAsFactors = TRUE)
  }
  ## Make a model matrix which will have one entry for
  ## each of the condition/batches
  ## It would be much smarter to generate the models in the following if() {} blocks
  ## But I have it in my head to eventually compare results using different models.

  ## The previous iteration of this had an explicit contrasts.arg set, like this:
  ## contrasts.arg = list(condition = "contr.treatment"))
  ## Which looked like this for a full invocation:
  ## condbatch_int_model <- try(stats::model.matrix(~ 0 + conditions + batches,
  ##                                   contrasts.arg = list(condition = "contr.treatment",
  ##                                                      batch = "contr.treatment")),
  ## The contrasts.arg has been removed because it seems to result in the same model.

  ## Default treatment contrasts, optionally overridden per-factor via 'contr'.
  clist <- list("condition" = "contr.treatment")
  blist <- list("batch" = "contr.treatment")
  cblist <- list("condition" = "contr.treatment", "batch" = "contr.treatment")
  if (!is.null(contr)) {
    if (!is.null(contr[["condition"]]) && !is.null(contr[["batch"]])) {
      cblist <- list("condition" = contr[["condition"]], "batch" = contr[["batch"]])
    } else if (!is.null(contr[["condition"]])) {
      clist <- list("condition" = contr[["condition"]])
      cblist[["condition"]] <- contr[["condition"]]
    } else if (!is.null(contr[["batch"]])) {
      blist <- list("batch" = contr[["batch"]])
      cblist[["batch"]] <- contr[["batch"]]
    }
  }

  ## Build every candidate model up front; try() keeps rank-deficient designs
  ## (e.g. confounded condition/batch) from aborting the whole function.
  cond_noint_string <- "~ 0 + condition"
  cond_noint_model <- try(stats::model.matrix(
    object = as.formula(cond_noint_string),
    contrasts.arg = clist,
    data = design), silent = TRUE)

  batch_noint_string <- "~ 0 + batch"
  batch_noint_model <- try(stats::model.matrix(
    object = as.formula(batch_noint_string),
    contrasts.arg = blist,
    data = design), silent = TRUE)

  condbatch_noint_string <- "~ 0 + condition + batch"
  condbatch_noint_model <- try(stats::model.matrix(
    object = as.formula(condbatch_noint_string),
    contrasts.arg = cblist,
    data = design), silent = TRUE)

  batchcond_noint_string <- "~ 0 + batch + condition"
  batchcond_noint_model <- try(stats::model.matrix(
    object = as.formula(batchcond_noint_string),
    contrasts.arg = cblist,
    data = design), silent = TRUE)

  cond_int_string <- "~ condition"
  cond_int_model <- try(stats::model.matrix(
    object = as.formula(cond_int_string),
    contrasts.arg = clist,
    data = design), silent = TRUE)

  batch_int_string <- "~ batch"
  batch_int_model <- try(stats::model.matrix(
    object = as.formula(batch_int_string),
    contrasts.arg = blist,
    data = design), silent = TRUE)

  condbatch_int_string <- "~ condition + batch"
  condbatch_int_model <- try(stats::model.matrix(
    object = as.formula(condbatch_int_string),
    contrasts.arg = cblist,
    data = design), silent = TRUE)

  batchcond_int_string <- "~ batch + condition"
  batchcond_int_model <- try(stats::model.matrix(
    object = as.formula(batchcond_int_string),
    contrasts.arg = cblist,
    data = design), silent = TRUE)

  noint_model <- NULL
  int_model <- NULL
  noint_string <- NULL
  int_string <- NULL
  including <- NULL
  if (!is.null(alt_model)) {
    chosen_model <- stats::model.matrix(object = as.formula(alt_model),
                                        data = design)
    if (!is.null(contr)) {
      chosen_model <- stats::model.matrix(object = as.formula(alt_model),
                                          contrasts.arg = contr,
                                          data = design)
    }
    int_model <- chosen_model
    noint_model <- chosen_model
    int_string <- alt_model
    ## Bug fix: this was previously assigned to the typo 'notint_string',
    ## leaving noint_string NULL in the alt-model path.
    noint_string <- alt_model
    including <- "alt"
  } else if (is.null(model_batch)) {
    int_model <- cond_int_model
    noint_model <- cond_noint_model
    int_string <- cond_int_string
    noint_string <- cond_noint_string
    including <- "condition"
  } else if (isTRUE(model_cond) && isTRUE(model_batch)) {
    if (class(condbatch_int_model)[1] == "try-error") {
      if (isTRUE(verbose)) {
        message("The condition+batch model failed. ",
                "Does your experimental design support both condition and batch? ",
                "Using only a conditional model.")
      }
      int_model <- cond_int_model
      noint_model <- cond_noint_model
      int_string <- cond_int_string
      noint_string <- cond_noint_string
      including <- "condition"
    } else if (isTRUE(reverse)) {
      int_model <- batchcond_int_model
      noint_model <- batchcond_noint_model
      int_string <- batchcond_int_string
      noint_string <- batchcond_noint_string
      including <- "batch+condition"
    } else {
      int_model <- condbatch_int_model
      noint_model <- condbatch_noint_model
      int_string <- condbatch_int_string
      noint_string <- condbatch_noint_string
      including <- "condition+batch"
    }
  } else if (class(model_batch)[1] == "character") {
    ## Then calculate the estimates using all_adjusters
    if (isTRUE(verbose)) {
      mesg("Extracting surrogate estimates from ", model_batch,
              " and adding them to the model.")
    }
    model_batch_info <- all_adjusters(input, estimate_type = model_batch,
                                      surrogates = surrogates)
    ## Changing model_batch from 'sva' to the resulting matrix.
    ## Hopefully this will simplify things later for me.
    model_batch <- model_batch_info[["model_adjust"]]
    int_model <- stats::model.matrix(~ condition + model_batch,
                                     contrasts.arg = clist,
                                     data = design)
    noint_model <- stats::model.matrix(~ 0 + condition + model_batch,
                                       contrasts.arg = clist,
                                       data = design)
    sv_names <- glue("SV{1:ncol(model_batch)}")
    noint_string <- cond_noint_string
    int_string <- cond_int_string
    sv_string <- ""
    for (sv in sv_names) {
      sv_string <- glue("{sv_string} + {sv}")
    }
    noint_string <- glue("{noint_string}{sv_string}")
    int_string <- glue("{int_string}{sv_string}")
    rownames(model_batch) <- rownames(int_model)
    including <- glue("condition{sv_string}")
  } else if (class(model_batch)[1] == "numeric" || class(model_batch)[1] == "matrix") {
    if (isTRUE(verbose)) {
      mesg("Including batch estimates from sva/ruv/pca in the model.")
    }
    int_model <- stats::model.matrix(~ condition + model_batch,
                                     contrasts.arg = clist,
                                     data = design)
    noint_model <- stats::model.matrix(~ 0 + condition + model_batch,
                                       contrasts.arg = clist,
                                       data = design)
    sv_names <- glue("SV{1:ncol(model_batch)}")
    int_string <- cond_int_string
    noint_string <- cond_noint_string
    sv_string <- ""
    for (sv in sv_names) {
      sv_string <- glue("{sv_string} + {sv}")
    }
    int_string <- glue("{int_string}{sv_string}")
    noint_string <- glue("{noint_string}{sv_string}")
    rownames(model_batch) <- rownames(int_model)
    including <- glue("condition{sv_string}")
  } else if (isTRUE(model_cond)) {
    int_model <- cond_int_model
    noint_model <- cond_noint_model
    int_string <- cond_int_string
    noint_string <- cond_noint_string
    including <- "condition"
  } else if (isTRUE(model_batch)) {
    int_model <- batch_int_model
    noint_model <- batch_noint_model
    int_string <- batch_int_string
    noint_string <- batch_noint_string
    including <- "batch"
  } else {
    ## Default to the conditional model
    int_model <- cond_int_model
    noint_model <- cond_noint_model
    int_string <- cond_int_string
    noint_string <- cond_noint_string
    including <- "condition"
  }

  ## Clean up the column names so they can be used directly in contrasts.
  tmpnames <- colnames(int_model)
  tmpnames <- gsub(pattern = "model_batch", replacement = "SV1", x = tmpnames)
  tmpnames <- gsub(pattern = "data[[:punct:]]", replacement = "", x = tmpnames)
  ## '-' and '+' are regex metacharacters (a lone '+' is an invalid regular
  ## expression), so strip them as fixed strings.
  tmpnames <- gsub(pattern = "-", replacement = "", x = tmpnames, fixed = TRUE)
  tmpnames <- gsub(pattern = "+", replacement = "", x = tmpnames, fixed = TRUE)
  ## The next lines ensure that conditions/batches which are all numeric will
  ## not cause weird errors for contrasts. Ergo, if a condition is something
  ## like '111', now it will be 'c111' Similarly, a batch '01' will be 'b01'
  tmpnames <- gsub(pattern = "^condition(\\d+)$", replacement = "c\\1", x = tmpnames)
  tmpnames <- gsub(pattern = "^batch(\\d+)$", replacement = "b\\1", x = tmpnames)
  tmpnames <- gsub(pattern = "condition", replacement = "", x = tmpnames)
  tmpnames <- gsub(pattern = "batch", replacement = "", x = tmpnames)
  colnames(int_model) <- tmpnames

  tmpnames <- colnames(noint_model)
  tmpnames <- gsub(pattern = "model_batch", replacement = "SV1", x = tmpnames)
  tmpnames <- gsub(pattern = "data[[:punct:]]", replacement = "", x = tmpnames)
  tmpnames <- gsub(pattern = "-", replacement = "", x = tmpnames, fixed = TRUE)
  tmpnames <- gsub(pattern = "+", replacement = "", x = tmpnames, fixed = TRUE)
  ## The next lines ensure that conditions/batches which are all numeric will
  ## not cause weird errors for contrasts. Ergo, if a condition is something
  ## like '111', now it will be 'c111' Similarly, a batch '01' will be 'b01'
  ## Bug fix: the '^' anchors were previously misplaced mid-pattern
  ## ("condition^(\\d+)$"), so these substitutions could never match.
  tmpnames <- gsub(pattern = "^condition(\\d+)$", replacement = "c\\1", x = tmpnames)
  tmpnames <- gsub(pattern = "^batch(\\d+)$", replacement = "b\\1", x = tmpnames)
  tmpnames <- gsub(pattern = "condition", replacement = "", x = tmpnames)
  tmpnames <- gsub(pattern = "batch", replacement = "", x = tmpnames)
  colnames(noint_model) <- tmpnames

  chosen_model <- NULL
  chosen_string <- NULL
  if (isTRUE(model_intercept)) {
    if (isTRUE(verbose)) {
      mesg("Choosing the intercept containing model.")
    }
    chosen_model <- int_model
    chosen_string <- int_string
  } else {
    if (isTRUE(verbose)) {
      mesg("Choosing the non-intercept containing model.")
    }
    chosen_model <- noint_model
    chosen_string <- noint_string
  }

  retlist <- list(
    "int_model" = int_model,
    "noint_model" = noint_model,
    "int_string" = int_string,
    "noint_string" = noint_string,
    "chosen_model" = chosen_model,
    "chosen_string" = chosen_string,
    "model_batch" = model_batch,
    "including" = including)
  return(retlist)
}

#' Compare the results of separate all_pairwise() invocations.
#'
#' Where compare_led_tables looks for changes between limma and friends, this
#' function looks for differences/similarities across the models/surrogates/etc
#' across invocations of limma/deseq/edger.
#'
#' Tested in 29de_shared.R
#'
#' @param first One invocation of combine_de_tables to examine.
#' @param second A second invocation of combine_de_tables to examine.
#' @param cor_method Method to use for cor.test().
#' @param try_methods List of methods to attempt comparing.
#' @return A list of compared columns, tables, and methods, along with
#'  recorded heatmaps of the logFC/p/adjp correlations.
#' @seealso [all_pairwise()]
#' @examples
#' \dontrun{
#'  first <- all_pairwise(expt, model_batch = FALSE, excel = "first.xlsx")
#'  second <- all_pairwise(expt, model_batch = "svaseq", excel = "second.xlsx")
#'  comparison <- compare_de_results(first$combined, second$combined)
#' }
#' @export
compare_de_results <- function(first, second, cor_method = "pearson",
                               try_methods = c("limma", "deseq", "edger")) {

  result <- list()
  logfc_result <- list()
  p_result <- list()
  adjp_result <- list()
  comparisons <- c("logfc", "p", "adjp")
  ## First make sure we can collect the data for each differential expression method.
  ## A method is retained only when BOTH inputs provide a '<method>_logfc'
  ## column in their first table.
  methods <- c()
  for (m in seq_along(try_methods)) {
    method <- try_methods[m]
    mesg("Testing method: ", method, ".")
    test_column <- glue("{method}_logfc")
    if (is.null(first[["data"]][[1]][[test_column]])) {
      message("The first datum is missing method: ", method, ".")
    } else if (is.null(second[["data"]][[1]][[test_column]])) {
      message("The second datum is missing method: ", method, ".")
    } else {
      mesg("Adding method: ", method, " to the set.")
      methods <- c(methods, method)
    }
  }

  ## Now start building tables containing the correlations between the methods/contrasts.
  for (m in seq_along(methods)) {
    method <- methods[m]
    result[[method]] <- list()
    tables <- names(first[["data"]])
    for (t in seq_along(tables)) {
      table <- tables[t]
      mesg(" Starting method ", method, ", table ", table, ".")
      result[[method]][[table]] <- list()
      for (c in seq_along(comparisons)) {
        comparison <- comparisons[c]
        column_name <- glue("{method}_{comparison}")
        f_column <- as.vector(as.numeric(first[["data"]][[table]][[column_name]]))
        s_column <- as.vector(as.numeric(second[["data"]][[table]][[column_name]]))
        if (length(f_column) == 0 || length(s_column) == 0) {
          ## The column of data does not exist; skip the remaining
          ## comparisons for this table.
          break
        }
        ## Pair the two runs gene-by-gene; merge() by row.names yields
        ## columns named 'x' and 'y' holding the two values.
        names(f_column) <- rownames(first[["data"]][[table]])
        names(s_column) <- rownames(second[["data"]][[table]])
        fs <- merge(f_column, s_column, by = "row.names")
        comp <- cor(x = fs[["x"]], y = fs[["y"]], method = cor_method)
        result[[method]][[table]][[comparison]] <- comp
        ## NOTE(review): the *_result accumulators start life as lists and are
        ## re-coerced to named numeric vectors on every iteration; each later
        ## method overwrites the per-table entries from the previous method.
        if (comparison == "logfc") {
          logfc_result[table] <- comp
          tmp <- as.vector(as.numeric(logfc_result))
          names(tmp) <- names(logfc_result)
          logfc_result <- tmp
        } else if (comparison == "p") {
          p_result[table] <- comp
          tmp <- as.vector(as.numeric(p_result))
          names(tmp) <- names(p_result)
          p_result <- tmp
        } else if (comparison == "adjp") {
          adjp_result[table] <- comp
          tmp <- as.vector(as.numeric(adjp_result))
          names(tmp) <- names(adjp_result)
          adjp_result <- tmp
        }
      }
    }
  }
  ## Use the contrast names from the first retained method as the row
  ## template; one column per method (minus 'basic', skipped below).
  comp_df <- data.frame(row.names = names(result[[1]]))
  p_df <- data.frame(row.names = names(result[[1]]))
  adjp_df <- data.frame(row.names = names(result[[1]]))
  cols <- names(result)
  rows <- names(result[[1]])
  for (i in seq_along(rows)) {
    row <- rows[i]
    for (j in seq_along(cols)) {
      col <- cols[j]
      if (col == "basic") {
        next
      }
      element <- result[[col]][[row]][["logfc"]]

      ## Note that in the case of tables with 'extra' contrasts
      ## this may come out null and therefore need to be skipped.
      if (is.null(element)) {
        comp_df[row, col] <- NA
        p_df[row, col] <- NA
        adjp_df[row, col] <- NA
      } else {
        comp_df[row, col] <- element
        element <- result[[col]][[row]][["p"]]
        p_df[row, col] <- element
        element <- result[[col]][[row]][["adjp"]]
        adjp_df[row, col] <- element
      }
    }
  }
  ## Drop contrasts which were missing from any method.
  comp_df <- na.omit(comp_df)
  p_df <- na.omit(p_df)
  adjp_df <- na.omit(adjp_df)

  ## Draw each heatmap and capture it with recordPlot(); a failed draw
  ## (e.g. too few rows, headless device) is tolerated and yields NULL.
  original <- par(mar = c(7, 4, 4, 2) + 0.1)
  text_size <- 1.0
  heat_colors <- grDevices::colorRampPalette(c("white", "darkblue"))
  lfc_heatmap <- try(heatmap.3(as.matrix(comp_df), scale = "none",
                               trace = "none", keysize = 1.5, linewidth = 0.5,
                               margins = c(12, 8), cexRow = text_size, cexCol = text_size,
                               col = heat_colors, dendrogram = "none",
                               Rowv = FALSE, Colv = FALSE,
                               main = "Compare lFC results"), silent = TRUE)
  lfc_heat <- NULL
  if (! "try-error" %in% class(lfc_heatmap)) {
    lfc_heat <- recordPlot()
  }
  heat_colors <- grDevices::colorRampPalette(c("white", "darkred"))
  p_heatmap <- try(heatmap.3(as.matrix(p_df), scale = "none",
                             trace = "none", keysize = 1.5, linewidth = 0.5,
                             margins = c(12, 8), cexRow = text_size, cexCol = text_size,
                             col = heat_colors, dendrogram = "none",
                             Rowv = FALSE, Colv = FALSE,
                             main = "Compare p-values"), silent = TRUE)
  p_heat <- NULL
  if (! "try-error" %in% class(p_heatmap)) {
    p_heat <- recordPlot()
  }
  heat_colors <- grDevices::colorRampPalette(c("white", "darkgreen"))
  adjp_heatmap <- try(heatmap.3(as.matrix(adjp_df), scale = "none",
                                trace = "none", keysize = 1.5, linewidth = 0.5,
                                margins = c(12, 8), cexRow = text_size, cexCol = text_size,
                                col = heat_colors, dendrogram = "none",
                                Rowv = FALSE, Colv = FALSE,
                                main = "Compare adjp-values"), silent = TRUE)
  adjp_heat <- NULL
  if (! "try-error" %in% class(adjp_heatmap)) {
    adjp_heat <- recordPlot()
  }
  ## Restore the graphics parameters saved above.
  new <- par(original)

  retlist <- list(
    "result" = result,
    "logfc" = logfc_result,
    "p" = p_result,
    "adjp" = adjp_result,
    "lfc_heat" = lfc_heat,
    "p_heat" = p_heat,
    "adjp_heat" = adjp_heat)
  return(retlist)
}

#' Use plot_linear_scatter to compare to de tables.
#'
#' @param first First table to compare.
#' @param second Second table to compare.
#' @param fcx Column for the x-axis fold-change.
#' @param px Column for the x-axis p-value.
#' @param fcy Column containing the y-axis fold-change.
#' @param py Column containing the y-axis p-value.
#' @param first_table If the input data are actually of type de_table, then find the table(s) inside them.
#' @param second_table Ibid.
#' @return List result from plot_linear_scatter, classed as
#'  'cross_table_comparison' and annotated with the chosen tables/columns.
#' @export
compare_de_tables <- function(first, second, fcx = "deseq_logfc", px = "deseq_adjp",
                              fcy = "deseq_logfc", py = "deseq_adjp",
                              first_table = NULL, second_table = NULL) {
  if (!is.null(first_table)) {
    ## Then assume this is a combine_de_tables() result.
    first <- first[["data"]][[first_table]]
  }
  if (!is.null(second_table)) {
    second <- second[["data"]][[second_table]]
  }
  merged <- merge(first, second, by = "row.names")
  rownames(merged) <- merged[["Row.names"]]
  merged[["Row.names"]] <- NULL
  kept_columns <- c(fcx, px, fcy, py)
  if (fcx == fcy) {
    ## When the same column is requested from both tables, merge() suffixed
    ## the shared names with .x/.y; pick the suffixed versions.
    ## Bug fix: the fourth element previously repeated the fold-change
    ## column (fcy) instead of the p-value column (py).
    kept_columns <- c(paste0(fcx, ".x"), paste0(px, ".x"),
                      paste0(fcy, ".y"), paste0(py, ".y"))
  }
  merged <- merged[, kept_columns]
  colnames(merged) <- c("first_lfc", "first_p", "second_lfc", "second_p")
  scatter <- plot_linear_scatter(merged[, c("first_lfc", "second_lfc")])
  class(scatter) <- "cross_table_comparison"
  scatter[["first_table"]] <- first_table
  scatter[["fcx_column"]] <- fcx
  scatter[["fcy_column"]] <- fcy
  scatter[["second_table"]] <- second_table
  return(scatter)
}

#' See how similar are results from limma/deseq/edger/ebseq.
#'
#' limma, DEseq2, and EdgeR all make somewhat different assumptions.
#' and choices about what makes a meaningful set of differentially.
#' expressed genes.  This seeks to provide a quick and dirty metric
#' describing the degree to which they (dis)agree.
#'
#' Invoked by all_pairwise().
#'
#' @param results Data from do_pairwise()
#' @param annot_df Include annotation data?
#' @param extra_contrasts include some extra contrasts when comparing results.
#' @return Heatmap showing how similar they are along with some
#'  correlations between the three players.
#' @seealso [limma_pairwise()] [edger_pairwise()] [deseq_pairwise()]
#' @examples
#' \dontrun{
#'  l = limma_pairwise(expt)
#'  d = deseq_pairwise(expt)
#'  e = edger_pairwise(expt)
#'  fun = compare_led_tables(limma = l, deseq = d, edger = e)
#' }
#' @export
correlate_de_tables <- function(results, annot_df = NULL, extra_contrasts = NULL) {
  ## Fill each column/row of these with the correlation between tools for one
  ## contrast performed
  ## Collect the all_tables element from every method which completed (its
  ## result carries the matching '<method>_pairwise' class).
  retlst <- list()
  methods <- c()
  if (class(results[["limma"]])[1] == "limma_pairwise") {
    retlst[["limma"]] <- results[["limma"]][["all_tables"]]
    methods <- c(methods, "limma")
  }
  if (class(results[["deseq"]])[1] == "deseq_pairwise") {
    retlst[["deseq"]] <- results[["deseq"]][["all_tables"]]
    methods <- c(methods, "deseq")
  }
  if (class(results[["edger"]])[1] == "edger_pairwise") {
    retlst[["edger"]] <- results[["edger"]][["all_tables"]]
    methods <- c(methods, "edger")
  }
  if (class(results[["ebseq"]])[1] == "ebseq_pairwise") {
    retlst[["ebseq"]] <- results[["ebseq"]][["all_tables"]]
    methods <- c(methods, "ebseq")
  }
  if (class(results[["basic"]])[1] == "basic_pairwise") {
    retlst[["basic"]] <- results[["basic"]][["all_tables"]]
    methods <- c(methods, "basic")
  }
  if (class(results[["noiseq"]])[1] == "noiseq_pairwise") {
    retlst[["noiseq"]] <- results[["noiseq"]][["all_tables"]]
    methods <- c(methods, "noiseq")
  }
  if (class(results[["dream"]])[1] == "dream_pairwise") {
    retlst[["dream"]] <- results[["dream"]][["all_tables"]]
    methods <- c(methods, "dream")
  }

  ## Pull the contrast names out of the extra_contrasts string
  ## (e.g. "name = a - b, other = c - d") so they can be excluded below.
  extra_eval_names <- NULL
  if (!is.null(extra_contrasts)) {
    extra_eval_strings <- strsplit(extra_contrasts, ",")[[1]]
    extra_eval_names <- extra_eval_strings
    extra_eval_names <- stringi::stri_replace_all_regex(extra_eval_strings,
                                                        "^(\\s*)(\\w+)\\s*=\\s*.*$", "$2")
    extra_eval_names <- gsub(pattern = "^\\s+", replacement = "", x = extra_eval_names, perl = TRUE)
  }

  ## The recorded heatmap comparing the methods/contrasts if there are sufficient methods employed.
  heat <- NULL
  ## What it says on the tin, this will provide a shorter version of the contrast names
  ## so that the heatmap labels are not annoying.
  contrast_name_list <- c()
  ## These are the lists of the results of comparisons/plots comparing each set of FC values.
  complst <- list()
  plotlst <- list()
  comparison_df <- data.frame()
  lenminus <- length(methods) - 1
  mesg("Comparing analyses.")
  meth <- methods[1]
  len <- length(names(retlst[[meth]]))
  ## FIXME: This is wrong.  'len' counts the contrasts of the FIRST method
  ## only; if a later method produced a different set of contrasts, the
  ## positional indexing below will mismatch or index past the end.
  ## Iterate over every unordered pair of methods.
  for (c in seq_len(lenminus)) {
    c_name <- methods[c]
    nextc <- c + 1
    for (d in seq(from = nextc, to = length(methods))) {
      d_name <- methods[d]
      method_comp_name <- glue("{c_name}_vs_{d_name}")
      for (l in seq_len(len)) {
        contr <- names(retlst[[c_name]])[l]
        mesg(glue("Comparing {contr} of {c_name} vs. {d_name}."))
        if (contr %in% extra_eval_names) {
          next
        }

        ## assume all three have the same names() -- note that limma has more
        ## than the other two though
        ## When one method lacks the contrast, fall back to the reversed
        ## contrast (b_vs_a) and negate its logFC so directions agree.
        num_den_names <- strsplit(x = contr, split = "_vs_")[[1]]
        num_name <- num_den_names[1]
        den_name <- num_den_names[2]
        rev_contr <- glue("{den_name}_vs_{num_name}")
        num_reversed <- 0
        fst <- retlst[[c_name]][[contr]]
        scd <- retlst[[d_name]][[contr]]
        if (is.null(fst)) {
          fst <- retlst[[c_name]][[rev_contr]]
          fst[["logFC"]] <- fst[["logFC"]] * -1
          mesg("Used reverse contrast for ", c_name, ".")
          num_reversed <- num_reversed + 1
        }
        if (is.null(scd)) {
          scd <- retlst[[d_name]][[rev_contr]]
          scd[["logFC"]] <- scd[["logFC"]] * -1
          mesg("Used reverse contrast for ", d_name, ".")
          num_reversed <- num_reversed + 1
        }
        ## An extra test condition in case of extra contrasts not performed by all methods.
        ## NOTE(review): contr was already dereferenced above, so this guard
        ## probably never fires in practice -- confirm before relying on it.
        if (is.na(contr)) {
          next
        }
        contrast_name_list <- c(contr, contrast_name_list)
        fs <- merge(fst, scd, by = "row.names")
        if (nrow(fs) == 0) {
          warning("The merge of ", c_name, ", ", contr, " and ",
                  d_name, ", ", contr, " failed.")
          next
        }
        fs <- fs[, c("logFC.x", "logFC.y")]
        colnames(fs) <- c(glue("{c_name} logFC"), glue("{d_name} logFC"))
        fs_cor <- stats::cor.test(x = fs[, 1], y = fs[, 2])[["estimate"]]
        comparison_df[method_comp_name, contr] <- fs_cor
        fs_plt <- plot_scatter(fs) +
          ggplot2::labs(title = glue("{contr}: {c_name} vs. {d_name}.")) +
          ggplot2::geom_abline(intercept = 0.0, slope = 1.0, colour = "blue")
        ## NOTE(review): keyed by method pair only, so each contrast
        ## overwrites the previous one; only the last contrast survives here.
        complst[[method_comp_name]] <- fs_cor
        plotlst[[method_comp_name]] <- fs_plt
      } ## End iterating through the contrasts
    } ## End the second method loop
  } ## End the first method loop
  comparison_df <- as.matrix(comparison_df)
  ## I think this next line is a likely source of errors because
  ## of differences when using extra_contrasts.
  ## I am sure there is a reason, but I cannot see why I have either of these right now.
  ## colnames(comparison_df) <- names(retlst[["deseq"]])
  ## colnames(comparison_df) <- contrast_name_list
  new_colnames <- abbreviate(colnames(comparison_df), minlength = 10)
  colnames(comparison_df) <- new_colnames
  if (len > 2) {
    ## Draw the method-vs-method heatmap; a failed draw is tolerated.
    heat_colors <- grDevices::colorRampPalette(c("white", "black"))
    original <- par(mar = c(7, 4, 4, 2) + 0.1)
    comparison_heatmap <- try(heatmap.3(comparison_df, scale = "none",
                                        trace = "none", keysize = 1.5,
                                        cexCol = 1.0, cexRow = 1.0,
                                        linewidth = 0.5, margins = c(12, 8),
                                        col = heat_colors, dendrogram = "none",
                                        Rowv = FALSE, Colv = FALSE,
                                        main = "Compare DE tools"), silent = TRUE)
    new <- par(original)
    if (! "try-error" %in% class(comparison_heatmap)) {
      heat <- recordPlot()
    }
    ## End checking whether to draw the heatmap.  NOTE(review): 'len' counts
    ## contrasts, not methods, despite what an earlier comment implied.
  }

  ret <- append(complst, plotlst)
  ret[["comp"]] <- comparison_df
  ret[["heat"]] <- heat
  return(ret)
}

#' Compare logFC values from limma and friends
#'
#' There are some peculiar discrepancies among these tools, what is up with that?
#'
#' Invoked by combine_de_tables() in order to compare the results.
#'
#' @param combined_tables The combined tables from limma et al.
#' @return Some plots
#' @seealso [plot_linear_scatter()]
#' @examples
#' \dontrun{
#'  limma_vs_deseq_vs_edger <- compare_logfc_plots(combined)
#'  ## Get a list of plots of logFC by contrast of LvD, LvE, DvE
#'  ## It provides comparisons against the basic analysis, but who cares about that.
#' }
#' @export
compare_logfc_plots <- function(combined_tables) {
  ## Accept either the full combine_de_tables() result (which carries the
  ## tables in a 'data' slot) or a bare list of combined tables.
  data <- combined_tables
  if (!is.null(combined_tables[["data"]])) {
    data <- combined_tables[["data"]]
  }
  ## Scatter plot of two methods' logFC columns (their p-value columns come
  ## along for coloring), or NULL when either method is absent from the table.
  lfc_scatter <- function(tab, lfc_columns, p_columns) {
    if (is.null(tab[[lfc_columns[1]]]) || is.null(tab[[lfc_columns[2]]])) {
      return(NULL)
    }
    wanted <- tab[, c(lfc_columns, p_columns)]
    sm(plot_linear_scatter(wanted, pretty_colors = FALSE)[["scatter"]])
  }
  tnames <- names(data)
  retlist <- list()
  for (tname in tnames) {
    tab <- data[[tname]]
    retlist[[tname]] <- list(
      "name" = tname,
      "le" = lfc_scatter(tab, c("limma_logfc", "edger_logfc"),
                         c("limma_adjp", "edger_adjp")),
      "ld" = lfc_scatter(tab, c("limma_logfc", "deseq_logfc"),
                         c("limma_adjp", "deseq_adjp")),
      "de" = lfc_scatter(tab, c("deseq_logfc", "edger_logfc"),
                         c("deseq_adjp", "edger_adjp")),
      "lb" = lfc_scatter(tab, c("limma_logfc", "basic_logfc"),
                         c("limma_adjp", "basic_p")),
      "db" = lfc_scatter(tab, c("deseq_logfc", "basic_logfc"),
                         c("deseq_adjp", "basic_p")),
      ## Bugfix: the original's edger/basic else-branch assigned db <- NULL,
      ## clobbering the deseq/basic plot and leaving 'eb' undefined.
      "eb" = lfc_scatter(tab, c("edger_logfc", "basic_logfc"),
                         c("edger_adjp", "basic_p")))
  }
  return(retlist)
}

#' Implement a cleaner version of 'subset_significants' from analyses with Maria
#' Adelaida.
#'
#' This should provide nice venn diagrams and some statistics to compare 2 or 3
#' contrasts in a differential expression analysis.
#'
#' @param sig_tables Set of significance tables to poke at.
#' @param second_sig_tables Separate set of significant results, intra vs. inter comparisons.
#' @param compare_by Use which program for the comparisons?
#' @param weights When printing venn diagrams, weight them?
#' @param contrasts List of contrasts to compare.
#' @return List containing the intersections of the contrasts and plots describing them.
#' @seealso [Vennerable]
#' @export
compare_significant_contrasts <- function(sig_tables, second_sig_tables = NULL,
                                          compare_by = "deseq", weights = FALSE,
                                          contrasts = c(1, 2, 3)) {
  contrast_names <- names(sig_tables[[compare_by]][["ups"]])
  ## Translate numeric (or numeric-looking) contrast selectors into the
  ## corresponding contrast names.
  for (i in seq_along(contrasts)) {
    contr <- contrasts[i]
    if (is.numeric(contr)) {
      ## Bugfix: index by the requested number, not by loop position; the
      ## original used contrast_names[i], which silently picked the wrong
      ## contrast for selections like c(2, 5, 7).
      contrasts[i] <- contrast_names[contr]
    } else if (!is.na(sm(as.numeric(contr)))) {
      ## Mixing one character element recasts the whole vector to character,
      ## so numbers may arrive as strings; convert back before indexing.
      contrasts[i] <- contrast_names[as.numeric(contr)]
    }
  }
  up_lst <- list()
  down_lst <- list()
  for (c in seq_along(contrasts)) {
    contr <- contrasts[c]
    if (is.null(second_sig_tables)) {
      up_lst[[contr]] <- rownames(sig_tables[[compare_by]][["ups"]][[contr]])
      down_lst[[contr]] <- rownames(sig_tables[[compare_by]][["downs"]][[contr]])
    } else {
      ## NOTE(review): with a second table set, every iteration overwrites
      ## "first"/"second", so only the last requested contrast is compared;
      ## confirm whether that is intended.
      up_lst[["first"]] <- rownames(sig_tables[[compare_by]][["ups"]][[contr]])
      down_lst[["first"]] <- rownames(sig_tables[[compare_by]][["downs"]][[contr]])
      up_lst[["second"]] <- rownames(second_sig_tables[[compare_by]][["ups"]][[contr]])
      down_lst[["second"]] <- rownames(second_sig_tables[[compare_by]][["downs"]][[contr]])
    }
  }

  ## Build the venn structures and recover per-intersection gene tables.
  up_venn <- Vennerable::Venn(Sets = up_lst)
  up_intersect <- rename_vennerable_intersections(up_venn, up_lst)
  down_venn <- Vennerable::Venn(Sets = down_lst)
  down_intersect <- rename_vennerable_intersections(down_venn, down_lst)

  up_tables <- get_vennerable_rows(sig_tables[[compare_by]][["ups"]], up_intersect)
  down_tables <- get_vennerable_rows(sig_tables[[compare_by]][["downs"]], down_intersect)

  up_plot <- Vennerable::plot(up_venn, doWeights = weights)
  up_plot <- grDevices::recordPlot(up_plot)
  down_plot <- Vennerable::plot(down_venn, doWeights = weights)
  down_plot <- grDevices::recordPlot(down_plot)

  retlst <- list(
    "up_intersections" = up_intersect,
    "down_intersections" = down_intersect,
    "up_venn" = up_venn,
    "down_venn" = down_venn,
    "up_tables" = up_tables,
    "down_tables" = down_tables,
    "up_plot" = up_plot,
    "down_plot" = down_plot)
  return(retlst)
}

#' Test for infected/control/beads -- a placebo effect?
#'
#' This was a function I copied out of Keith/Hector/Laura/Cecilia's paper in
#' which they sought to discriminate the effect of inert beads on macrophages
#' vs. the effect of parasites.  The simpler way of expressing it is: take the
#' worst p-value observed for the pair of contrasts, infected/uninfected and
#' beads/uninfected.
#'
#' The goal is therefore to find responses different than beads
#' The null hypothesis is (H0): (infected == uninfected) | (infected == beads)
#' The alt hypothesis is (HA): (infected != uninfected) & (infected != beads)
#'
#' @param contrast_fit Result of lmFit.
#' @param cellmeans_fit Result of a cellmeans fit.
#' @param conj_contrasts Result from the makeContrasts of the first set.
#' @param disj_contrast Result of the makeContrasts of the second set.
disjunct_pvalues <- function(contrast_fit, cellmeans_fit, conj_contrasts, disj_contrast) {
  ## arglist <- list(...)
  ## Count, for each model coefficient, how many of the requested contrasts
  ## (the conjunctive set plus the single disjunctive contrast) reference it.
  contr_level_counts <- rowSums(
    contrast_fit[["contrasts"]][, c(conj_contrasts, disj_contrast)] != 0)
  ## Define the condition levels involved in the tests
  levels_to_use <- names(contr_level_counts)[contr_level_counts > 0]
  ## Extract the average counts for each, make into table
  ave_expression_mat <- cellmeans_fit[["coef"]][, levels_to_use]
  exp_table <- data.frame("ID" = rownames(ave_expression_mat), stringsAsFactors = FALSE)
  exp_table <- cbind(exp_table, as.data.frame(ave_expression_mat))
  ## Strip the model's "condition" prefix so columns read as "AveExpr:<level>".
  names(exp_table)[-1] <- paste("AveExpr", gsub("condition", "", levels_to_use), sep = ":")

  ## Conjunctive test: the intent (per the roxygen above) is the worst-case
  ## statistic/p-value across the conjunction of contrasts.
  ## NOTE(review): pmin/pmax are called here with a SINGLE matrix argument,
  ## which they return unchanged; a row-wise min/max across the
  ## conj_contrasts columns (e.g. via apply or do.call on the columns) may
  ## have been intended -- confirm before relying on these values.
  stat <- BiocGenerics::pmin(abs(contrast_fit[["t"]][, conj_contrasts]))
  pval <- BiocGenerics::pmax(contrast_fit[["p.value"]][, conj_contrasts])

  ## Benjamini-Hochberg adjust the combined conjunctive p-values.
  adj.pval <- p.adjust(pval, method = "BH")
  fcs <- as.data.frame(contrast_fit[["coef"]][, conj_contrasts])
  names(fcs) <- paste("logFC", names(fcs), sep = ":")
  ## Also keep per-contrast BH-adjusted p-values for each conjunction member.
  conj_pvals <- as.data.frame(
    apply(contrast_fit[["p.value"]][, conj_contrasts], 2, p.adjust, method = "BH"))
  names(conj_pvals) <- paste("adj.P.Val", names(conj_pvals), sep = ":")
  conj_table <- data.frame("ID" = rownames(contrast_fit), stringsAsFactors = FALSE)
  conj_table <- cbind(conj_table, fcs, conj_pvals, stat = stat, adj.P.Value = adj.pval)
  ## Rename the trailing stat/adj.P.Value columns so they carry the
  ## colon-joined conjunctive contrast names.
  names(conj_table)[seq(2 + 2 * length(conj_contrasts), ncol(conj_table))] <- paste(
    c("stat", "adj.P.Value"), paste(conj_contrasts, collapse = ":"), sep = ":")

  ## Make the table for the 'other' test
  disj_table <- data.frame(
    "ID" = rownames(contrast_fit),
    "logFC" = contrast_fit[["coef"]][, disj_contrast],
    "adj.P.Value" = p.adjust(contrast_fit[["p.value"]][, disj_contrast], method = "BH"),
    stringsAsFactors = FALSE)
  names(disj_table)[-1] <- paste(c("logFC", "adj.P.Value"), disj_contrast, sep = ":")
  ## Combine tables, making sure all tables are in the same order
  stopifnot(all(exp_table$ID == conj_table[["ID"]] & exp_table[["ID"]] == disj_table[["ID"]]))
  out_table <- cbind(exp_table, conj_table[, -1], disj_table[, -1])
  ## order output table by the statistic in the disjunctive test
  o <- order(-stat)
  out_table <- out_table[o, ]
  return(out_table)
}

#' Generalize pairwise comparisons
#'
#' I want to multithread my pairwise comparisons, this is the first step in
#' doing so.
#'
#' Used to make parallel operations easier.
#'
#' @param type Which type of pairwise comparison to perform
#' @param ... Set of arguments intended for limma_pairwise(),
#'  edger_pairwise(), and friends.
#' @return Result from limma/deseq/edger/basic
#' @seealso [all_pairwise()]
#' @export
do_pairwise <- function(type, ...) {
  ## Dispatch to the requested method's *_pairwise() worker.  Each call is
  ## wrapped in try() so a single method's failure returns a try-error to the
  ## caller (all_pairwise()) rather than aborting the whole run.
  ## (Cleanup: the original assigned res <- NULL and captured the switch in an
  ## unused 'switchret' variable; both were dead code.)
  res <- switch(
    type,
    "basic" = try(basic_pairwise(...)),
    "limma" = try(limma_pairwise(...)),
    "ebseq" = try(ebseq_pairwise(...)),
    "edger" = try(edger_pairwise(...)),
    "deseq" = try(deseq2_pairwise(...)),
    "noiseq" = try(noiseq_pairwise(...)),
    "dream" = try(dream_pairwise(...)),
    {
      ## Unrecognized type: fall back to DESeq2.
      message("I do not understand this argument, running DESeq2.")
      try(deseq2_pairwise(...))
    })
  ## Tag the result so downstream consumers know which method produced it.
  res[["type"]] <- type
  return(res)
}

#' Find the set of most/least abundant genes according to limma and friends
#' following a differential expression analysis.
#'
#' Given a data set provided by limma, deseq, edger, etc; one might want to know
#' what are the most and least abundant genes, much like get_sig_genes() does to
#' find the most significantly different genes for each contrast.
#'
#' @param datum Output from the _pairwise() functions.
#' @param type Extract abundant genes according to what?
#' @param n Perhaps take just the top/bottom n genes.
#' @param z Or take genes past a given z-score.
#' @param fx Choose a function when choosing the most abundant genes.
#' @param unique Unimplemented: take only the genes unique among the conditions surveyed.
#' @return List of data frames containing the genes of interest.
#' @seealso [get_sig_genes()]
#' @examples
#' \dontrun{
#'  abundant <- get_abundant_genes(all_pairwise_output, type = "deseq", n = 100)
#'  ## Top 100 most abundant genes from deseq
#'  least <- get_abundant_genes(all_pairwise_output, type = "deseq", n = 100, least = TRUE)
#'  ## Top 100 least abundant genes from deseq
#'  abundant <- get_abundant_genes(all_pairwise_output, type = "edger", z = 1.5)
#'  ## Get the genes more than 1.5 standard deviations from the mean.
#' }
#' @export
get_abundant_genes <- function(datum, type = "limma", n = NULL, z = NULL,
                               fx = "mean", unique = FALSE) {
  ## With neither selection criterion provided, default to top/bottom 100.
  if (is.null(z) && is.null(n)) {
    n <- 100
  }
  ## If handed the full all_pairwise() result, descend into the requested
  ## method's sub-result.
  if (!is.null(datum[["limma"]])) {
    if (type == "basic") {
      datum <- datum[["basic"]]
    } else if (type == "edger") {
      datum <- datum[["edger"]]
    } else if (type == "deseq") {
      datum <- datum[["deseq"]]
    } else {
      datum <- datum[["limma"]]
    }
  }

  ## Extract the per-condition coefficient (abundance) matrix; each method
  ## stores it in a different place.
  if (type == "edger") {
    ## In this case of edger, this can be a little tricky.
    ## I should probably therefore improve the returns from edger_pairwise()
    ## FIXME: I think this is the wrong way to handle this.
    coefficient_df <- datum[["lrt"]][[1]][["coefficients"]]
    if (max(coefficient_df) <= 0) {
      coefficient_df <- coefficient_df * -1.0
    }
    ## There are a couple of extraneous columns in this table.
    removers <- c("b", "z")
    keepers <- !(colnames(coefficient_df) %in% removers)
    coefficient_df <- coefficient_df[, keepers]
  } else if (type == "limma") {
    coefficient_df <- datum[["identity_comparisons"]][["coefficients"]]
    all_coefficients <- colnames(coefficient_df)
    ## Keep only per-condition identities, not the "_vs_" pairwise contrasts.
    keepers <- !grepl(pattern = "_vs_", x = all_coefficients)
    coefficient_df <- coefficient_df[, keepers]
  } else if (type == "deseq") {
    coefficient_df <- datum[["coefficients"]]
  } else if (type == "basic") {
    coefficient_df <- datum[["medians"]]
    if (is.null(coefficient_df)) {
      coefficient_df <- datum[["means"]]
    }
  }

  ## 'unique' (drop genes shared among conditions) remains unimplemented.
  abundant_list <- list(
    "high" = list(),
    "low" = list())
  coefficient_df <- as.data.frame(coefficient_df)
  for (coef in colnames(coefficient_df)) {
    ## Sort descending by this condition's coefficient, keeping gene names.
    new_order <- order(coefficient_df[[coef]], decreasing = TRUE)
    ## drop = FALSE fixes an error when the coefficient table has only one
    ## column (the subset used to collapse to a bare vector).
    new_ordered_df <- coefficient_df[new_order, , drop = FALSE]
    coef_ordered <- new_ordered_df[[coef]]
    names(coef_ordered) <- rownames(new_ordered_df)
    if (is.null(n)) {
      ## Select genes beyond z dispersion-units from a center statistic.
      ## NOTE(review): the "mean" branch pairs sd() with the Median center and
      ## the "median" branch pairs mad() with the Mean center; these look
      ## swapped -- confirm the intended pairing before changing them.
      tmp_summary <- summary(coef_ordered)
      if (fx == "mean") {
        ## Bugfix: na.rm belongs to sd(), not as.numeric(), where it was
        ## silently ignored.
        tmp_mad <- stats::sd(as.numeric(coef_ordered), na.rm = TRUE)
        tmp_up_median_dist <- tmp_summary["Median"] + (tmp_mad * z)
        tmp_down_median_dist <- tmp_summary["Median"] - (tmp_mad * z)
      } else if (fx == "median") {
        ## Bugfix: na.rm belongs to mad(), not as.numeric().
        tmp_mad <- stats::mad(as.numeric(coef_ordered), na.rm = TRUE)
        tmp_up_median_dist <- tmp_summary["Mean"] + (tmp_mad * z)
        tmp_down_median_dist <- tmp_summary["Mean"] - (tmp_mad * z)
      } else {
        stop("I do not understand this summary statistic.")
      }
      high_idx <- coef_ordered >= tmp_up_median_dist
      low_idx <- coef_ordered <= tmp_down_median_dist
      abundant_list[["high"]][[coef]] <- coef_ordered[high_idx]
      abundant_list[["low"]][[coef]] <- coef_ordered[low_idx]
    } else {
      ## Rank-based selection: top/bottom n of the descending-sorted vector.
      abundant_list[["high"]][[coef]] <- head(coef_ordered, n = n)
      abundant_list[["low"]][[coef]] <- tail(coef_ordered, n = n)
    }
  }
  class(abundant_list) <- c("abundant_genes", "list")
  return(abundant_list)
}

#' A companion function for get_abundant_genes()
#'
#' Instead of pulling to top/bottom abundant genes, get all abundances and
#' variances or stderr.
#'
#' @param datum Output from _pairwise() functions.
#' @param type According to deseq/limma/edger/basic?
#' @param excel Print this to an excel file?
#' @return List containing the expression values and some metrics of
#'  variance/error.
#' @seealso [get_abundant_genes()]
#' @examples
#' \dontrun{
#'  abundance_excel <- get_pairwise_gene_abundances(combined, excel = "abundances.xlsx")
#'  ## This should provide a set of abundances after voom by condition.
#' }
#' @export
get_pairwise_gene_abundances <- function(datum, type = "limma", excel = NULL) {
  ## Only the limma path is implemented; fail fast with a clear message
  ## instead of the original "object 'expression_mtrx' not found".
  if (type != "limma") {
    stop("Only type = 'limma' is currently implemented.")
  }
  ## Establish a consistent condition (column) and gene (row) order for all
  ## of the matrices below; limma's per-identity tables are not guaranteed to
  ## share a row order, so rows are always indexed by name.
  conds <- names(datum[["limma"]][["identity_tables"]])
  row_order <- rownames(datum[["limma"]][["identity_tables"]][[1]])
  nconds <- length(conds)
  num_rows <- length(row_order)
  ## Allocate one condition-by-gene matrix with the shared dimnames.
  make_mtrx <- function() {
    mtrx <- matrix(ncol = nconds, nrow = num_rows)
    colnames(mtrx) <- conds
    rownames(mtrx) <- row_order
    mtrx
  }
  expression_mtrx <- make_mtrx()
  stdev_mtrx <- make_mtrx()
  sigma_mtrx <- make_mtrx()
  s2post_mtrx <- make_mtrx()
  t_mtrx <- make_mtrx()
  for (cond in conds) {
    expression_mtrx[, cond] <- datum[["limma"]][["identity_tables"]][[cond]][row_order, "logFC"]
    t_mtrx[, cond] <- datum[["limma"]][["identity_tables"]][[cond]][row_order, "t"]
    ## When attempting to extract something suitable for error bars:
    ## https://support.bioconductor.org/p/79349/
    ## The discussion seems to me to have settled on:
    ## std.error = fit$stdev.unscaled * fit$sigma
    stdev_mtrx[, cond] <- as.numeric(
      datum[["limma"]][["identity_comparisons"]][["stdev.unscaled"]][row_order, cond])
    sigma_mtrx[, cond] <- as.numeric(
      datum[["limma"]][["identity_comparisons"]][["sigma"]])
    s2post_mtrx[, cond] <- as.numeric(
      datum[["limma"]][["identity_comparisons"]][["s2.post"]])
  }
  ## Hoisted out of the loop: these whole-matrix products are loop-invariant
  ## and were needlessly recomputed once per condition.
  std_error <- stdev_mtrx * sigma_mtrx
  another_error <- stdev_mtrx * s2post_mtrx

  retlist <- list(
    "expression_values" = expression_mtrx,
    "t_values" = t_mtrx,
    "error_values" = std_error,
    "another_error" = another_error,
    "stdev_values" = stdev_mtrx)
  ## Optionally write annotations + expressions + errors to an xlsx workbook.
  if (!is.null(excel)) {
    annotations <- fData(datum[["input"]])
    expressions <- retlist[["expression_values"]]
    colnames(expressions) <- glue("expr_{colnames(expressions)}")
    errors <- retlist[["error_values"]]
    colnames(errors) <- glue("err_{colnames(errors)}")
    expression_table <- merge(annotations, expressions, by = "row.names")
    rownames(expression_table) <- expression_table[["Row.names"]]
    expression_table <- expression_table[, -1]
    expression_table <- merge(expression_table, errors, by = "row.names")
    rownames(expression_table) <- expression_table[["Row.names"]]
    expression_table <- expression_table[, -1]
    expression_written <- write_xlsx(
      data = expression_table,
      sheet = "expression_values",
      title = "Values comprising the logFCs and errors (expression / t-statistic)")
    write_result <- openxlsx::saveWorkbook(wb = expression_written[["workbook"]],
                                           file = excel, overwrite = TRUE)
  }
  return(retlist)
}

#' Make sure the outputs from limma and friends are in a format suitable for IHW.
#'
#' IHW seems like an excellent way to improve the confidence in the p-values
#' provided by the various DE methods.  It expects inputs fairly specific to
#' DESeq2, however, it is trivial to convert other methods to this, ergo this
#' function.
#'
#' https://bioconductor.org/packages/release/bioc/vignettes/IHW/inst/doc/introduction_to_ihw.html
#'
#' @param de_result Table which should have the 2 types of requisite columns:
#'  mean value of counts and p-value.
#' @param pvalue_column Name of the column of p-values.
#' @param type If specified, this will explicitly perform the calculation for
#'  the given type of differential expression analysis: limma, edger, deseq,
#'  etc.
#' @param mean_column Name of the column of mean values.
#' @param significance IHW uses this parameter, I don't know why.
#' @return weight adjusted p-values.
#' @seealso [IHW]
ihw_adjust <- function(de_result, pvalue_column = "pvalue", type = NULL,
                       mean_column = "baseMean", significance = 0.05) {
  ## IHW expects DESeq2-style inputs, so first work out which DE method
  ## produced this table; when the caller does not say, the p-value column's
  ## prefix encodes it.  Candidates are checked in the original's order.
  if (is.null(type)) {
    known_types <- c("limma", "deseq", "edger", "basic", "ebseq", "noiseq")
    for (candidate in known_types) {
      if (grepl(pattern = paste0("^", candidate), x = pvalue_column)) {
        type <- candidate
        break
      }
    }
    if (is.null(type)) {
      stop("Unable to determine the type of pvalues in this data.")
    }
  }

  ## Rescale the mean-expression covariate where the method reports it on a
  ## log2 scale (hence the 2^ antilog; the "base10_*" column names are
  ## historical).  deseq/ebseq tables pass through untransformed.
  tmp_table <- de_result
  if (type == "limma" || type == "edger") {
    tmp_table[["base10_mean"]] <- 2 ^ tmp_table[[mean_column]]
    mean_column <- "base10_mean"
  } else if (type == "basic") {
    tmp_table[["base10_median"]] <- 2 ^ ((tmp_table[["basic_num"]] + tmp_table[["basic_den"]]) / 2)
    mean_column <- "base10_median"
  } else if (type == "noiseq") {
    tmp_table[["base10_mean"]] <- 2 ^ ((tmp_table[["noiseq_num"]] + tmp_table[["noiseq_den"]]) / 2)
    mean_column <- "base10_mean"
  }

  ## Defensive coercion: upstream formatting may have left these columns as
  ## character vectors.
  tmp_table[[pvalue_column]] <- as.numeric(tmp_table[[pvalue_column]])
  tmp_table[[mean_column]] <- as.numeric(tmp_table[[mean_column]])

  ## Hand IHW a 'pvalue ~ covariate' formula and return its adjusted values.
  formula <- as.formula(glue::glue("{pvalue_column} ~ {mean_column}"))
  ihw_result <- sm(IHW::ihw(formula, data = tmp_table, alpha = significance))
  adjusted_p_values <- IHW::adj_pvalues(ihw_result)
  return(adjusted_p_values)
}

#' Get a set of up/down differentially expressed genes.
#'
#' Take one or more criteria (fold change, rank order, (adj)p-value,
#' z-score from median FC) and use them to extract the set of genes
#' which are defined as 'differentially expressed.'  If no criteria
#' are provided, it arbitrarily chooses all genes outside of 1-z.
#'
#' @param table Table from limma/edger/deseq.
#' @param n Rank-order top/bottom number of genes to take.
#' @param z Number of z-scores >/< the median to take.
#' @param lfc Fold-change cutoff.
#' @param p P-value cutoff.
#' @param min_mean_exprs Exclude genes with less than this mean expression.
#' @param exprs_column Use this column for filtering by expression.
#' @param column Table's column used to distinguish top vs. bottom.
#' @param fold Identifier reminding how to get the bottom portion of a
#'  fold-change (plusminus says to get the negative of the
#'  positive, otherwise 1/positive is taken).  This effectively
#'  tells me if this is a log fold change or not.
#' @param p_column Table's column containing (adjusted or not)p-values.
#' @param comparison When set to orequal, use >=/<= instead of just >/<.
#' @return Subset of the up/down genes given the provided criteria.
#' @seealso [extract_significant_genes()] [get_abundant_genes()]
#' @examples
#' \dontrun{
#'  sig_table <- get_sig_genes(table, lfc = 1)
#' }
#' @export
get_sig_genes <- function(table, n = NULL, z = NULL, lfc = NULL, p = NULL,
                          min_mean_exprs = NULL, exprs_column = "deseq_basemean",
                          column = "logFC", fold = "plusminus", p_column = "adj.P.Val",
                          comparison = "orequal") {
  ## With no criteria at all, fall back to the conventional p/lfc cutoffs.
  if (is.null(z) && is.null(n) && is.null(lfc) && is.null(p)) {
    mesg("No n, z, p, nor lfc provided, setting p to 0.05 and lfc to 1.0.")
    p <- 0.05
    lfc <- 1.0
  }
  up_genes <- table
  down_genes <- table

  if (is.null(table[[column]])) {
    message("There is no ", column, " column in the table.")
    message("The columns are: ", toString(colnames(table)))
    stop("There is no ", column, " column in the table.")
  }

  ## Threshold helpers honoring the 'comparison' argument: "orequal" uses
  ## >=/<=, anything else strict >/<.  These replace six copies of the
  ## original's compute-then-maybe-recompute index pattern.  The as.numeric()
  ## guards against columns reformatted as scientific-notation strings.
  keep_high <- function(values, cutoff) {
    if (comparison == "orequal") {
      as.numeric(values) >= cutoff
    } else {
      as.numeric(values) > cutoff
    }
  }
  keep_low <- function(values, cutoff) {
    if (comparison == "orequal") {
      as.numeric(values) <= cutoff
    } else {
      as.numeric(values) < cutoff
    }
  }

  if (!is.null(p)) {
    up_genes <- up_genes[keep_low(up_genes[[p_column]], p), ]
    down_genes <- down_genes[keep_low(down_genes[[p_column]], p), ]
    ## Even with only a p-value filter, the fold-change column still decides
    ## direction: log scales split at 0, ratio scales at 1.
    if (fold == "plusminus" || fold == "log") {
      up_genes <- up_genes[as.numeric(up_genes[[column]]) > 0.0, ]
      down_genes <- down_genes[as.numeric(down_genes[[column]]) < 0.0, ]
    } else {
      ## Ratio scale: a value of 1 means unchanged.
      up_genes <- up_genes[keep_high(up_genes[[column]], 1.0), ]
      down_genes <- down_genes[keep_low(down_genes[[column]], -1.0), ]
    }
  }

  if (!is.null(lfc)) {
    up_genes <- up_genes[keep_high(up_genes[[column]], lfc), ]
    if (fold == "plusminus" || fold == "log") {
      ## plusminus: the down cutoff is the negative of the up cutoff
      ## (a positive/negative number of log-fold changes from logFC(1) = 0).
      down_genes <- down_genes[keep_low(down_genes[[column]], lfc * -1.0), ]
    } else {
      ## Not a log fold change: values run 0..x where 1 is unchanged.
      down_genes <- down_genes[keep_low(down_genes[[column]], 1.0 / lfc), ]
    }
  }

  if (!is.null(z)) {
    ## Take genes >=/<= z median-absolute-deviations from the median.
    ## The summary uses the whole table; the indexing uses the (potentially
    ## already trimmed) up/down tables.
    out_summary <- summary(as.numeric(table[[column]]))
    out_mad <- stats::mad(as.numeric(table[[column]]), na.rm = TRUE)
    up_median_dist <- out_summary["Median"] + (out_mad * z)
    down_median_dist <- out_summary["Median"] - (out_mad * z)
    up_genes <- up_genes[keep_high(up_genes[[column]], up_median_dist), ]
    down_genes <- down_genes[keep_low(down_genes[[column]], down_median_dist), ]
  }

  if (!is.null(n)) {
    ## Take a specific number of genes at the top/bottom of the rank ordered list.
    mesg("Getting the top and bottom ", n, " genes.")
    upranked <- up_genes[order(as.numeric(up_genes[[column]]), decreasing = TRUE), ]
    up_genes <- head(upranked, n = n)
    downranked <- down_genes[order(as.numeric(down_genes[[column]])), ]
    down_genes <- head(downranked, n = n)
  }
  ## Always return rank-ordered tables: up descending, down ascending.
  up_genes <- up_genes[order(as.numeric(up_genes[[column]]), decreasing = TRUE), ]
  down_genes <- down_genes[order(as.numeric(down_genes[[column]]), decreasing = FALSE), ]

  if (!is.null(min_mean_exprs)) {
    if (is.null(up_genes[[exprs_column]])) {
      warning("The column ", exprs_column,
              " does not appear to be in the table, cannot filter by expression.")
    } else {
      mesg("Subsetting the up/down genes to those with >= ",
           min_mean_exprs, " expression.")
      up_idx <- up_genes[[exprs_column]] >= min_mean_exprs
      up_genes <- up_genes[up_idx, ]
      down_idx <- down_genes[[exprs_column]] >= min_mean_exprs
      down_genes <- down_genes[down_idx, ]
    }
  }
  ret <- list(
    "up_genes" = up_genes,
    "down_genes" = down_genes)
  return(ret)
}

#' Run makeContrasts() with all pairwise comparisons.
#'
#' In order to have uniformly consistent pairwise contrasts, I decided
#' to avoid potential human erors(sic) by having a function generate
#' all contrasts.
#'
#' Invoked by the _pairwise() functions.
#'
#' @param model Describe the conditions/batches/etc in the experiment.
#' @param conditions Factor of conditions in the experiment.
#' @param do_identities Include all the identity strings? Limma can
#'  use this information while edgeR can not.
#' @param do_extras Include extra contrasts?  This seems redundant with extra_contrasts
#'  below, but there is a reason for it.
#' @param do_pairwise Include all pairwise strings? This shouldn't
#'  need to be set to FALSE, but just in case.
#' @param keepers Only extract this subset of all possible pairwise contrasts.
#' @param extra_contrasts Optional string of extra contrasts to include.
#' @param ... Extra arguments passed here are caught by arglist.
#' @return List including the following information:
#' \enumerate{
#'  \item all_pairwise_contrasts = the result from makeContrasts(...)
#'  \item identities = the string identifying each condition alone
#'  \item all_pairwise = the string identifying each pairwise comparison alone
#'  \item contrast_string = the string passed to R to call makeContrasts(...)
#'  \item names = the names given to the identities/contrasts
#' }
#' @seealso [limma::makeContrasts()]
#' @examples
#' \dontrun{
#'  pretend <- make_pairwise_contrasts(model, conditions)
#' }
#' @export
make_pairwise_contrasts <- function(model, conditions, do_identities = FALSE,
                                    do_extras = TRUE, do_pairwise = TRUE,
                                    keepers = NULL, extra_contrasts = NULL, ...) {
  arglist <- list(...)
  ## Sanitize the model's column names into symbols usable by makeContrasts():
  ## drop the 'data' prefix left by model.matrix(), strip literal '-'/'+'
  ## characters, and give purely numeric condition names a leading 'c' so
  ## that they do not begin with a digit.
  tmpnames <- colnames(model)
  tmpnames <- gsub(pattern = "data[[:punct:]]", replacement = "", x = tmpnames)
  tmpnames <- gsub(pattern = "-", replacement = "", x = tmpnames, fixed = TRUE)
  ## Bugfix: a bare '+' is a regex repetition operator and only matched
  ## literally thanks to TRE leniency; say what we mean with fixed = TRUE.
  tmpnames <- gsub(pattern = "+", replacement = "", x = tmpnames, fixed = TRUE)
  ## Bugfix: the anchor belongs at the start of the pattern.  The previous
  ## pattern 'conditions^(\\d+)$' looked for a literal caret and therefore
  ## never renamed numeric conditions, leaving e.g. 'conditions5' -> '5'
  ## while the conditions vector below becomes 'c5'.
  tmpnames <- gsub(pattern = "^conditions(\\d+)$", replacement = "c\\1", x = tmpnames)
  tmpnames <- gsub(pattern = "conditions", replacement = "", x = tmpnames)
  colnames(model) <- tmpnames
  ## Keep the conditions themselves consistent with the renamed columns.
  conditions <- gsub(pattern = "^(\\d+)$", replacement = "c\\1", x = conditions)
  condition_table <- table(conditions)
  identities <- list()
  contrast_string <- ""
  eval_strings <- list()
  ## One 'identity' string per condition, e.g. "bob = bob,".
  for (cond in seq_along(condition_table)) {
    identity_name <- names(condition_table[cond])
    identity_string <- paste(identity_name, " = ", identity_name, ",", sep = "")
    identities[identity_name] <- identity_string
  }
  ## If I also create a sample condition 'alice', and also perform a subtraction
  ## of 'alice' from 'bob', then the full makeContrasts() will be:
  ## makeContrasts(bob = bob, alice = alice, bob_vs_alice=(bob)-(alice), levels = design)
  ## The parentheses in this case are superfluous, but they remind me that I am
  ## finally doing some math, and also they remind me that we can do much more
  ## complex things like:
  ## ((bob-alice)-(jane-alice)) or whatever....
  all_pairwise <- list()
  identity_names <- names(identities)
  lenminus <- length(identities) - 1
  numerators <- c()
  denominators <- c()

  if (is.null(keepers)) {
    ## Enumerate every B-A pair where B comes after A in the model matrix.
    for (first in seq_len(lenminus)) {
      c_name <- names(identities[first])
      nextc <- first + 1
      for (second in seq(from = nextc, to = length(identities))) {
        d_name <- names(identities[second])
        minus_string <- paste0(d_name, "_vs_", c_name)
        exprs_string <- paste0(minus_string, "=", d_name, "-", c_name, ",")
        all_pairwise[minus_string] <- exprs_string
        numerators <- c(numerators, d_name)
        denominators <- c(denominators, c_name)
      }
    }
  } else {
    ## Only the explicitly requested (denominator, numerator) pairs.
    for (keeper in keepers) {
      c_name <- keeper[1]
      d_name <- keeper[2]
      minus_string <- paste0(d_name, "_vs_", c_name)
      exprs_string <- paste0(minus_string, "=", d_name, "-", c_name, ",")
      all_pairwise[minus_string] <- exprs_string
      numerators <- c(numerators, d_name)
      denominators <- c(denominators, c_name)
    }
  }

  ## At this point, I have strings which represent the definition of every
  ## sample condition as well as strings which represent every possible
  ## B-A where B comes somewhere after A in the model matrix.
  ## The goal now is to create the variables in the R environment
  ## and add them to makeContrasts()

  if (isTRUE(do_identities)) {
    eval_strings <- append(eval_strings, identities)
  }
  if (isTRUE(do_pairwise)) {
    eval_strings <- append(eval_strings, all_pairwise)
  }
  eval_names <- names(eval_strings)

  if (!is.null(extra_contrasts) && isTRUE(do_extras)) {
    ## extra_contrasts is a comma-separated string of 'name = expression'
    ## definitions; pull out the names and append each definition.
    extra_eval_strings <- strsplit(extra_contrasts, ",")[[1]]
    extra_eval_names <- stringi::stri_replace_all_regex(extra_eval_strings,
                                                        "^(\\s*)(\\w+)\\s*=\\s*.*$", "$2")
    extra_eval_names <- gsub(pattern = "^\\s+", replacement = "",
                             x = extra_eval_names, perl = TRUE)
    for (i in seq_along(extra_eval_strings)) {
      new_name <- extra_eval_names[[i]]
      extra_eval_string <- extra_eval_strings[[i]]
      extra_eval_string <- gsub(pattern = "^\\s+", replacement = "",
                                x = extra_eval_string, perl = TRUE)
      extra_contrast <- glue("{extra_eval_string}, ")
      eval_strings <- append(eval_strings, extra_contrast)
      eval_names <- append(eval_names, new_name)
      all_pairwise[new_name] <- extra_contrast
      ## Assume extra contrasts follow the numerator_vs_denominator convention.
      extra_numerator <- gsub(x = new_name, pattern = "(.*)_vs_.*", replacement = "\\1")
      extra_denominator <- gsub(x = new_name, pattern = ".*_vs_(.*)", replacement = "\\1")
      numerators <- c(numerators, extra_numerator)
      denominators <- c(denominators, extra_denominator)
    }
    names(eval_strings) <- eval_names
  }
  ## Add them to makeContrasts()
  contrast_string <- "all_pairwise_contrasts = mymakeContrasts("
  for (f in seq_along(eval_strings)) {
    eval_name <- names(eval_strings[f])
    ## Get a little defensive to make sure I do not have contrasts which start
    ## with silly things like numbers or punctuation.
    if (grepl(x = eval_name, pattern = "^([[:digit:]]|[[:punct:]])")) {
      stop("This function requires contrast names to start with a letter.")
    }
    eval_string <- eval_strings[f]
    contrast_string <- glue("{contrast_string} {eval_string}")
  }
  ## The final element of makeContrasts() is the design from voom()
  contrast_string <- glue("{contrast_string} levels = model)")
  ## This defines 'all_pairwise_contrasts' in the function environment.
  eval(parse(text = contrast_string))
  ## I like to change the column names of the contrasts because by default
  ## they are kind of obnoxious and too long to type

  pairwise_name_idx <- grepl(x = eval_names, pattern = "_vs_")
  pairwise_names <- eval_names[pairwise_name_idx]
  names(numerators) <- pairwise_names
  names(denominators) <- pairwise_names
  colnames(all_pairwise_contrasts) <- eval_names
  result <- list(
    "all_pairwise_contrasts" = all_pairwise_contrasts,
    "identities" = identities,
    "identity_names" = identity_names,
    "all_pairwise" = all_pairwise,
    "contrast_string" = contrast_string,
    "names" = eval_names,
    "numerators" = numerators,
    "denominators" = denominators)
  return(result)
}

#' A copy of limma::makeContrasts() with special sauce.
#'
#' This is a copy of limma::makeContrasts without the test of make.names()
#' Because I want to be able to use it with interaction models potentially
#' and if a model has first:second, make.names() turns the ':' to a '.'
#' and then the equivalence test fails, causing makeContrasts() to error
#' spuriously (I think).
#'
#' @param ... Conditions used to make the contrasts.
#' @param contrasts Actual contrast names.
#' @param levels contrast levels used.
#' @return Same contrasts as used in makeContrasts, but with unique names.
#' @seealso [limma::makeContrasts()]
mymakeContrasts <- function(..., contrasts = NULL, levels) {
  ## Capture the ... expressions unevaluated; e[[1]] is 'list', and each
  ## following element is one contrast expression (e.g. bob - alice).
  e <- substitute(list(...))
  ## 'levels' may arrive as a factor, a character vector, or a design
  ## matrix whose column names provide the level names.
  if (is.factor(levels)) {
    levels <- levels(levels)
  }
  if (!is.character(levels)) {
    levels <- colnames(levels)
  }
  ## '(Intercept)' is not a syntactically valid symbol, so rename it.
  if (levels[1] == "(Intercept)") {
    levels[1] <- "Intercept"
    warning("Renaming (Intercept) to Intercept")
  }
  n <- length(levels)
  if (n < 1)
    stop("No levels to construct contrasts from")
  ## Unit vector with a 1 at position i: the column of the identity matrix
  ## corresponding to one level.
  indicator <- function(i, n) {
    out <- rep(0, n)
    out[i] <- 1
    out
  }
  ## Bind each level name to its indicator vector in a scratch environment,
  ## so that evaluating 'bob - alice' yields the contrast's coefficient
  ## vector directly.
  levelsenv <- new.env()
  for (i in seq_len(n)) {
    assign(levels[i], indicator(i, n), pos = levelsenv)
  }
  ## Character-vector interface: contrasts given as strings rather than
  ## as expressions in '...'.  The two interfaces are mutually exclusive.
  if (!is.null(contrasts)) {
    if (length(e) > 1)
      stop("Can't specify both ... and contrasts")
    e <- as.character(contrasts)
    ne <- length(e)
    cm <- matrix(0, n, ne, dimnames = list(Levels = levels,
                                           Contrasts = e))
    if (ne == 0)
      return(cm)
    ## Parse and evaluate each contrast string against the level indicators.
    for (j in seq_len(ne)) {
      ej <- parse(text = e[j])
      cm[, j] <- eval(ej, envir = levelsenv)
    }
    return(cm)
  }
  ## Expression interface: derive column names from the argument names,
  ## falling back to the deparsed expressions for unnamed arguments.
  ne <- length(e)
  enames <- names(e)[2:ne]
  easchar <- as.character(e)[2:ne]
  if (is.null(enames))
    cn <- easchar
  else cn <- ifelse(enames == "", easchar, enames)
  cm <- matrix(0, n, ne - 1, dimnames = list(Levels = levels,
                                             Contrasts = cn))
  if (ne < 2)
    return(cm)
  ## Evaluate each captured expression into a numeric contrast column.
  for (j in seq(from = 1, to = ne - 1)) {
    ej <- e[[j + 1]]
    if (is.character(ej))
      ej <- parse(text = ej)
    ej <- eval(ej, envir = levelsenv)
    ## A character result means the expression evaluated to a string (e.g. a
    ## variable holding a contrast definition): use it as the column name,
    ## then parse and evaluate it a second time to get the numbers.
    if (!is.numeric(ej)) {
      colnames(cm)[j] <- as.character(ej)[1]
      if (is.character(ej))
        ej <- parse(text = ej)
      ej <- eval(ej, envir = levelsenv)
    }
    cm[, j] <- ej
  }
  cm
}

#' Get rid of characters which will mess up contrast making and such before
#' playing with an expt.
#'
#' Purely numeric condition/batch labels gain a 'c'/'b' prefix, then all
#' punctuation and whitespace are stripped before both columns are
#' re-factored and written back into the expt.
#'
#' @param expt An expt object to clean.
sanitize_expt <- function(expt) {
  design <- pData(expt)
  ## Normalize one metadata column: prefix bare numbers with 'prefix', then
  ## strip punctuation and blanks, returning a factor.
  scrub <- function(column, prefix) {
    cleaned <- gsub(pattern = "^(\\d+)$", replacement = paste0(prefix, "\\1"),
                    x = as.character(column))
    ## To be honest, there is absolutely no way I would have thought of this
    ## regular expression:
    ## https://stackoverflow.com/questions/30945993
    ## In theory I am pretty good with regexes, but this is devious to me!
    ## (It removes unicode punctuation, keeping only the underscore, which
    ## the [[:punct:]] pass below then drops as well.)
    cleaned <- gsub(pattern = "[^\\PP_]", replacement = "", x = cleaned, perl = TRUE)
    cleaned <- gsub(pattern = "[[:blank:]]", replacement = "", x = cleaned)
    cleaned <- gsub(pattern = "[[:punct:]]", replacement = "", x = cleaned)
    as.factor(cleaned)
  }
  conditions <- scrub(design[["condition"]], "c")
  batches <- scrub(design[["batch"]], "b")
  ## Push the cleaned factors back into both the expressionset and the
  ## expt's own copies.
  expressionset <- expt[["expressionset"]]
  Biobase::pData(expressionset)[["condition"]] <- conditions
  Biobase::pData(expressionset)[["batch"]] <- batches
  expt[["expressionset"]] <- expressionset
  expt[["conditions"]] <- conditions
  expt[["batches"]] <- batches
  return(expt)
}

#' Extract multicopy genes from up/down gene expression lists.
#'
#' This simply forwards to semantic_copynumber_filter() with invert = TRUE,
#' making it the inverse operation of that function.
#'
#' Currently untested, used for Trypanosome analyses primarily, thus the default
#' strings.
#'
#' @param ... Arguments for semantic_copynumber_filter()
#' @export
semantic_copynumber_extract <- function(...) {
  return(semantic_copynumber_filter(..., invert = TRUE))
}

#' Remove multicopy genes from up/down gene expression lists.
#'
#' In our parasite data, there are a few gene types which are
#' consistently obnoxious.  Multi-gene families primarily where the
#' coding sequences are divergent, but the UTRs nearly identical.  For
#' these genes, our sequence based removal methods fail and so this
#' just excludes them by name.
#'
#' Currently untested, used for Trypanosome analyses primarily, thus the default
#' strings.
#'
#' @param input List of sets of genes deemed significantly
#'  up/down with a column expressing approximate count numbers.
#' @param max_copies Keep only those genes with <= n putative
#'  copies.
#' @param use_files Use a set of sequence alignments to define the copy numbers?
#' @param invert Keep these genes rather than drop them?
#' @param semantic Set of strings with gene names to exclude.
#' @param semantic_column Column in the DE table used to find the
#'  semantic strings for removal.
#' @return Smaller list of up/down genes.
#' @seealso [semantic_copynumber_extract()]
#' @examples
#' \dontrun{
#'  pruned <- semantic_copynumber_filter(table, semantic = c("ribosomal"))
#'  ## Get rid of all genes with 'ribosomal' in the annotations.
#' }
#' @export
semantic_copynumber_filter <- function(input, max_copies = 2, use_files = FALSE, invert = TRUE,
                                       semantic = c("mucin", "sialidase", "RHS",
                                                    "MASP", "DGF", "GP63"),
                                       semantic_column = "product") {
  ## An expt object gets handed off to the expression-data filter instead.
  if ("expt" %in% class(input)) {
    result <- semantic_expt_filter(input, invert = invert, semantic = semantic,
                                   semantic_column = semantic_column)
    return(result)
  }

  ## Decide whether the input is a combined DE result (has a 'data' slot)
  ## or a set of significance tables (has 'ups'/'downs' slots).
  table_type <- "significance"
  if (!is.null(input[["data"]])) {
    table_type <- "combined"
  }
  type <- "Kept"

  table_list <- NULL
  if (table_type == "combined") {
    table_list <- input[["data"]]
  } else {
    ## Concatenate the up and down tables, prefixing the names so the
    ## filtered results can be routed back to the correct slot afterward.
    table_list <- c(input[["ups"]], input[["downs"]])
    upnames <- glue("up_{names(input[['ups']])}")
    downnames <- glue("down_{names(input[['downs']])}")
    names(table_list) <- c(upnames, downnames)
  }

  numbers_removed <- list()
  for (count in seq_along(table_list)) {
    tab <- table_list[[count]]
    table_name <- names(table_list)[[count]]
    numbers_removed[[table_name]] <- list()
    mesg("Working on ", table_name)
    if (isTRUE(use_files)) {
      ## Use empirically defined groups of multi-gene families: one
      ## alignment-derived count file per table.
      file <- ""
      if (table_type == "combined") {
        file <- file.path("singletons", "gene_counts",
                          glue("{table_name}.fasta.out.count"))
      } else {
        file <- file.path("singletons", "gene_counts",
                          glue("up_{table_name}.fasta.out.count"))
      }
      tmpdf <- try(read.table(file), silent = TRUE)
      if (inherits(tmpdf, "data.frame")) {
        colnames(tmpdf) <- c("ID", "members")
        tab <- merge(tab, tmpdf, by.x = "row.names", by.y = "ID")
        rownames(tab) <- tab[["Row.names"]]
        tab <- tab[, -1, drop = FALSE]
        ## Bugfix: filter on the per-gene copy number ('members'), not the
        ## loop index.  The old 'count <= max_copies' compared a scalar and
        ## so either kept or dropped every row of the table.
        tab <- tab[tab[["members"]] <= max_copies, ]
        table_list[[count]] <- tab
      }
    }  ## End using empirically defined groups of multi-gene families.

    ## Now remove genes by name.
    for (string in semantic) {
      pre_remove_size <- nrow(tab)
      idx <- NULL
      if (semantic_column == "rownames") {
        idx <- grepl(pattern = string, x = rownames(tab))
      } else {
        idx <- grepl(pattern = string, x = tab[, semantic_column])
      }
      type <- "Removed"
      if (!isTRUE(invert)) {
        idx <- !idx
        type <- "Kept"
      }
      num_removed <- sum(idx)
      numbers_removed[[table_name]][[string]] <- num_removed
      if (num_removed > 0) {
        tab <- tab[idx, ]
        table_list[[count]] <- tab
        mesg("Table started with: ", pre_remove_size, ". ", type,
                " entries with string ", string,
                ", found ", num_removed, "; table has ",
                nrow(tab),  " rows left.")
      } else {
        message("Found no entries of type ", string, ".")
      }
    } ## End of the foreach semantic thing to remove
  } ## End iterating over the tables.

  ## Route the filtered tables back to their original slots.  This was
  ## previously nested inside the loop above, which rewrote the combined
  ## tables once per iteration (reusing the loop variable 'count') and,
  ## because of a 'count <= up_to_down' guard, never wrote the filtered
  ## 'down' tables back into input[["downs"]] at all.
  if (table_type == "combined") {
    for (t in seq_along(table_list)) {
      input[["data"]][[t]] <- table_list[[t]]
    }
  } else {
    for (t in seq_along(table_list)) {
      table_name <- names(table_list)[t]
      ## Bugfix: grepl() rather than grep(); grep() returns integer(0) on a
      ## miss, which if() cannot evaluate.
      if (grepl(pattern = "^up_", x = table_name)) {
        newname <- gsub(pattern = "^up_", replacement = "", x = table_name)
        input[["ups"]][[newname]] <- table_list[[t]]
      } else {
        newname <- gsub(pattern = "^down_", replacement = "", x = table_name)
        input[["downs"]][[newname]] <- table_list[[t]]
      }
    }
  }
  ## Now the tables should be reconstructed.
  input[["numbers_removed"]] <- numbers_removed
  input[["type"]] <- type
  return(input)
}

## EOF
## elsayed-lab/hpgltools documentation built on May 9, 2024, 5:02 a.m.