R/fdge-compare.R

Defines functions format.FacileTtestComparisonAnalysisResult print.FacileTtestComparisonAnalysisResult .prefix_contrast .interaction_fdge viz.FacileTtestComparisonAnalysisResult samples.FacileTtestComparisonAnalysisResult model.FacileTtestComparisonAnalysisResult tidy.FacileTtestComparisonAnalysisResult result.FacileTtestComparisonAnalysisResult compare.FacileTtestAnalysisResult

Documented in compare.FacileTtestAnalysisResult tidy.FacileTtestComparisonAnalysisResult

#' @rdname fdge
#' @export
#'
#' @section Comparing DGE Results (interaction effect):
#' It is often useful to compare the results of two t-tests, and for many
#' experimental designs, this can be a an intuitive way to perform test an
#' interaction effect.
#'
#' The filtering strategy in the interaction model dictates that the union
#' of all features found in `x` are `y` are used in the test.
#'
#' @param rerun When comparing two results, the features analyzed in each may
#'   differ, making comparisons between the two objects sparse, at times.
#'   When `rerun = TRUE` (default), the original linear models are rerun with
#'   their features set to `union(features(x), features(y))`.
#'
#' @examples
#' # Comparing two T-test results ----------------------------------------------
#' # Let's compare the tumor vs normal DGE results in CRC vs BLCA
#'
#' efds <- FacileData::exampleFacileDataSet()
#' dge.crc <- FacileData::filter_samples(efds, indication == "CRC") |>
#'   flm_def("sample_type", "tumor", "normal", "sex") |>
#'   fdge()
#' dge.blca <- FacileData::filter_samples(efds, indication == "BLCA") |>
#'   flm_def("sample_type", "tumor", "normal", "sex") |>
#'   fdge()
#' dge.comp <- compare(dge.crc, dge.blca)
#' comp.hi <- tidy(dge.comp) |>
#'   dplyr::group_by(interaction_group) |>
#'   dplyr::slice(1:3) |>
#'   dplyr::ungroup()
#' # Static visualization generates the main "4-way" plot, as well as the
#' # facets for each category.
#' sviz <- viz(dge.comp, labels = c(x = "CRC", y = "BLCA"),
#'             subtitle = "Tumor vs normal comparisons across indications",
#'             highlight = comp.hi)
#' # highlight some of them
#' s.hi <- sviz$input_data |>
#'   dplyr::group_by(interaction_group) |>
#'   dplyr::slice(1:3) |>
#'   dplyr::ungroup()
#' if (requireNamespace("patchwork")) {
#'   patchwork::wrap_plots(
#'     sviz$plot + ggplot2::theme(legend.position = "bottom"),
#'     sviz$plot_facets + ggplot2::theme(legend.position = "none"),
#'     nrow = 1)
#'   viz(dge.comp, labels = c(x = "CRC", y = "BLCA"),
#'       colors = "darkgrey")$plot_facets
#' }
compare.FacileTtestAnalysisResult <- function(x, y,
                                              treat_lfc = param(x, "treat_lfc"),
                                              rerun = TRUE, ...) {
  messages <- character()
  warnings <- character()
  errors <- character()

  # TODO: assert_comparable(x, y, ...)
  # TODO: assert_comparable.FacileDgeAnalysisResult <- function(...)
  assert_class(x, "FacileTtestAnalysisResult")
  assert_class(y, "FacileTtestAnalysisResult")
  fds. <- assert_class(fds(x), "FacileDataStore")

  # are there any joint features found in here?
  if (length(intersect(features(x)$feature_id, features(y)$feature_id)) == 0) {
    stop("No joint features found, comparison is not possible")
  }

  assert_flag(rerun)
  if (is.null(treat_lfc)) treat_lfc <- 0
  assert_number(treat_lfc, lower = 0)

  # override downstream with "failed/incomplete/with_i_stats" version of class?
  clazz <- NULL
  classes <- c("FacileTtestComparisonAnalysisResult",
               "FacileTtestAnalysisResult",
               "FacileDgeAnalysisResult",
               "FacileComparisonAnalysis",
               "FacileAnalysisResult")

  out <- list(
    result = NULL,
    xystats = NULL,
    params = list(x = x, y = y, treat_lfc = treat_lfc),
    fds = fds.)

  on.exit({
    out[["messages"]] <- messages
    out[["warnings"]] <- warnings
    out[["errors"]] <- errors
    class(out) <- c(clazz, classes)
    return(out)
  })

  idge <- .interaction_fdge(x, y, treat_lfc = treat_lfc, rerun = rerun, ...)

  if (!isTRUE(idge[["with_stats"]])) {
    # this happens if idge is null, too
    samples. <- bind_rows(samples(x), samples(y))
    samples. <- set_fds(samples., fds.)
    samples. <- distinct(samples., dataset, sample_id)
  } else {
    classes <- c("FacileTtestComparisonInteractionAnalysisResult", classes)
    samples. <- samples(idge[["result"]])
    if (idge[["rerun"]]) {
      # This is non-orthodox-facile:
      # 1. the x,y params may not be the precise versions that were sent in here
      out[["params"]][["x"]] <- x <- idge[["x"]]
      out[["params"]][["y"]] <- y <- idge[["y"]]
    }
  }

  xres <- tidy(x)
  yres <- tidy(y)

  jcols <- intersect(colnames(xres), colnames(yres))

  meta.cols <- c("feature_type", "feature_id", "symbol", "name", "meta")
  drop.cols <- c("seqnames", "start", "end", "strand", "effective_length",
                 "source")
  stat.cols <- setdiff(colnames(xres),  c(meta.cols, drop.cols))
  stat.cols.y <- intersect(stat.cols, colnames(yres))

  meta.cols <- intersect(meta.cols, jcols)
  xystats <- full_join(
    select(xres, {{meta.cols}}, {{stat.cols}}),
    select(yres, feature_type, feature_id, {{stat.cols.y}}),
    by = c("feature_type", "feature_id"))

  if (isTRUE(idge[["with_stats"]])) {
    ires <- tidy(idge[["result"]]) |>
      select(feature_type, feature_id, {{stat.cols}})
  } else {
    ires <- idge[["result"]]
  }
  xystats <- left_join(xystats, ires, by = c("feature_type", "feature_id"))
  # put stats for interaction test up front, followed by *.x, *.y
  # xystats <- select(xystats, !!c(meta.cols, stat.cols), everything())
  take.cols <- intersect(c(meta.cols, stat.cols), colnames(xystats))
  xystats <- select(xystats, {{take.cols}}, everything())
  out[["result"]] <- idge[["result"]]

  out[["xystats"]] <- xystats
  out[["samples"]] <- samples.

  out
}

#' This `result()` of comparison will return the FacileTtestAnalysisResult
#' for the interaction, if it was run, otherwise NULL.
#'
#' This definition is redundant, since result.FacileAnalysisResult x returns
#' x[["result"]], but I just want to make this explicit for consumers of this
#' code (even myself).
#'
#' @noRd
#' @export
result.FacileTtestComparisonAnalysisResult <- function(x, ...) {
  x[["result"]]
}

#' @section Statistics Tables:
#' The stats table from the differential expression analysis can be retrieved
#' using the `tidy()` function. Depending upon the type of analysis run, the
#' exact columns of the returned table may differ.
#'
#' **Interaction Statistics**
#' Calling `tidy()` on an interaction test result
#' (`FacileTtestComparisonAnalysisResult`), returns the statistics for the
#' interaction test itself (if it was performed), as well as the statistics
#' for the individual DGE results that were run vai `compare(x, y)` to get the
#' interaction results itself. The columns of statistics related to the
#' individual tests will be suffixed with `*.x` and `*.y`, respectively.
#'
#' An `interaction_group` column will also be added to indicate what type of
#' statisticaly significance was found for each gene. The values in here can be:
#'
#' 1. `"both"`: this gene was statistically significant in both tests
#' 2. `"x"`: this gene was only significant in the `x` dge result
#' 3. `"y"`: this gene was only significant in the `y` dge result
#' 4. `"none"`: was not statistically significant in either test result
#'
#' The genes selected for significance from the input results `x` and `y` are
#' based on their `padj` and `logFC` values. The thresholds are tweaked by the
#' following parameters in the call to `tidy(compare(x,y))`:
#'
#' 1. `max_padj_(x|y)`: if not specified, defaults to `0.10`
#' 2. `min_logFC_(x|y)`: if not specified, we will take the value that was
#'    used in the `treat_lfc` parameters to `x` and `y` if those tests were
#'    run against a threshold, otherwise defaults to `1`.
#'
#' @rdname fdge
#' @export
tidy.FacileTtestComparisonAnalysisResult <- function(
    x, max_padj = 0.1, min_logFC = NULL, labels = NULL, ...,
    max_padj_x = max_padj, max_padj_y = max_padj,
    min_logFC_x = min_logFC, min_logFC_y = min_logFC) {

  out <- x[["xystats"]]
  xres <- param(x, "x")
  yres <- param(x, "y")

  assert_number(max_padj_x, lower = 1e-6, upper = 1)
  assert_number(max_padj_y, lower = 1e-6, upper = 1)

  if (is.null(min_logFC_x)) {
    min_logFC_x <- param(xres, "treat_lfc")
    if (is.null(min_logFC_x)) min_logFC_x <- 1
  }
  assert_number(min_logFC_x, lower = 0, upper = Inf)
  if (is.null(min_logFC_y)) {
    min_logFC_y <- param(yres, "treat_lfc")
    if (is.null(min_logFC_y)) min_logFC_y <- 1
  }
  assert_number(min_logFC_y, lower = 0, upper = Inf)

  default.labels <- c(x = "x", y = "y", both = "both", none = "none")
  assert_character(labels, null.ok = TRUE, names = "unique")
  labels <- c(labels, default.labels)
  labels <- labels[!duplicated(names(labels))]
  assert_subset(c("x", "y", "both", "none"), names(labels))

  .xsig <- out$padj.x <= max_padj_x & abs(out$logFC.x) >= min_logFC_x
  .ysig <- out$padj.y <= max_padj_y & abs(out$logFC.y) >= min_logFC_y

  out <- mutate(
    out,
    interaction_group = case_when(
       .xsig &  .ysig         ~ labels["both"],
       .xsig & !.ysig         ~ labels["x"],
      !.xsig &  .ysig         ~ labels["y"],
      TRUE                    ~ labels["none"])) |>
    select(interaction_group, everything())
  attr(out, "labels") <- labels
  out
}

#' @noRd
#' @export
model.FacileTtestComparisonAnalysisResult <- function(x, ...) {
  # model.NULL takes care of if/else testing NULL-ness of result()
  model(result(x))
}

#' @noRd
#' @export
samples.FacileTtestComparisonAnalysisResult <- function(x, ...) {
  x[["samples"]]
}

#' If `interacive` is `TRUE` (default), then this visualizaiton will drop the
#' interactive points by setting `insignificant = "drop"`. Static plots keep the
#' insignificant points. If you want to change this behavior, set `interactive`
#' and `insignificant` as you please.
#' 
#' Use the tidy.FacileTtestComparisonAnalysisResult(labels, ...) parameter
#' to rename the facet ribbons/tiles
#' 
#' @noRd
#' @export
viz.FacileTtestComparisonAnalysisResult <- function(
    x, features = NULL, highlight = NULL,
    colors = NULL, color_highlight = "red",
    cor.method = "spearman", title = "DGE Comparison",
    subtitle = NULL, with_cor = TRUE, interactive = TRUE,
    insignificant = if (interactive) "drop" else "keep",
    facets_nrow = if (insignificant == "drop") 3 else 2, 
    alphas = NULL, ...) {

  xdat.all <- tidy(x, ...)
  labels <- attr(xdat.all, "labels")[c("none", "both", "x", "y")]
  insignificant <- match.arg(insignificant, c("drop", "keep"))

  cols.default <- c("lightgrey", "darkgrey", "cornflowerblue", "orange")
  names(cols.default) <- labels
  
  if (length(colors) == 1L && is.null(names(colors))) {
    colors <- stats::setNames(rep(1, length(labels)), labels)
  }
  if (test_character(colors, names = "unique", min.len = 1)) {
    cq <- cols.default
    take <- intersect(names(cq), names(colors))
    cq[take] <- colors[take]
    colors <- cq
  } else {
    colors <- cols.default
  }

  xdat.all$insig <- xdat.all$interaction_group == labels["none"]
  if (insignificant == "drop") {
    xdat <- filter(xdat.all, !insig)
    labels <- labels[-1]
    colors <- colors[-1]
  } else {
    xdat <- xdat.all
  }
  
  alphas.in <- alphas
  alphas <- stats::setNames(rep(1, length(labels)), labels)
  alphas[labels["none"]] <- 0.1
  if (test_numeric(alphas.in, names = "unique", min.len = 1)) {
    alphas.in <- pmin(alphas.in, 1)
    alphas.in <- pmax(alphas.in, 0)
    take <- intersect(labels, names(alphas.in))
    if (length(take)) {
      alphas[take] <- alphas.in[take]
    }
  }

  xdat[["interaction_group"]] <- factor(xdat[["interaction_group"]],
                                        unname(labels))

  if (with_cor) {
    cor.quadrants <- c("all", levels(xdat[["interaction_group"]]))
    cors.all <- lapply(cor.quadrants, function(wut) {
      if (wut == "all") {
        xs <- xdat
      } else {
        xs <- filter(xdat, .data$interaction_group == .env$wut)
      }
      xs <- filter(xs, !is.na(logFC.x) & !is.na(logFC.y))
      if (nrow(xs) < 3) return(NULL)
      ct <- suppressWarnings(
        cor.test(xs$logFC.x, xs$logFC.y, method = cor.method)
      )
      mutate(tidy(ct), interaction_group = wut, n = nrow(xs))
    })
    cors <- mutate(bind_rows(cors.all),
                   label = sprintf("cor: %0.2f\nN: %d", estimate, n))
  } else {
    cors <- NULL
  }
  
  lims.square <- range(c(xdat$logFC.x, xdat$logFC.y))
  lims.square <- c(-1, 1) * (max(abs(lims.square)) + 0.1)

  if (is.null(subtitle) && !(labels["x"] == "xsig" || labels["y"] == "ysig")) {
    subtitle <- sprintf("%s vs %s", labels["x"], labels["y"])
  }

  lnone <- labels["none"]
  lboth <- labels["both"]

  xdat$padj.min <- pmin(xdat$padj, xdat$padj.x, xdat$padj.y)
  xdat <- arrange(xdat, interaction_group, desc(padj.min))

  gg.base <- xdat |>
    ggplot2::ggplot(ggplot2::aes(x = logFC.x, y = logFC.y)) +
    ggplot2::geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
    ggplot2::geom_vline(xintercept = 0, color = "red", linetype = "dashed")
  
  gg.base <- gg.base +
    suppressWarnings({
      ggplot2::geom_point(
        ggplot2::aes(color = interaction_group, 
                     alpha = interaction_group,
                     text = symbol))
    })
  
  gg.base <- gg.base +
    ggplot2::geom_abline(intercept = 0, slope = 1, color = "red",
                         linetype = "dotted") +
    ggplot2::scale_color_manual(values = colors) +
    ggplot2::scale_alpha_manual(values = alphas) +
    ggplot2::labs(
      x = sprintf("log2FC %s", labels["x"]),
      y = sprintf("log2FC %s", labels["y"]),
      title = title,
      subtitle = subtitle
    ) +
    ggplot2::xlim(lims.square) +
    ggplot2::ylim(lims.square)

  if (!is.null(highlight)) {
    fids <- extract_feature_id(highlight)
    highlight <- filter(xdat, feature_id %in% .env$fids)
    if (nrow(highlight)) {
      gg.base <- gg.base +
        suppressWarnings({
          ggplot2::geom_point(
            ggplot2::aes(text = symbol),
            data = highlight,
            color = color_highlight)
        })
    }
  }

  if (with_cor) {
    gg.main <- gg.base +
      ggplot2::geom_text(
        mapping = ggplot2::aes(x = -Inf, y = Inf, label = label),
        hjust = -0.1, vjust = 1.2,
        data = filter(cors, interaction_group == "all"))
  } else {
    gg.main <- gg.base
  }

  gg.facets <- gg.base +
    ggplot2::facet_wrap(~ interaction_group, nrow = facets_nrow) +
    ggplot2::labs(title = NULL, subtitle = NULL)

  if (with_cor) {
    fcors <- filter(cors, interaction_group != "all")
    fcors[["interaction_group"]] <- factor(fcors[["interaction_group"]],
                                           levels(xdat[["interaction_group"]]))
    gg.facets <- gg.facets +
      ggplot2::geom_text(
        mapping = ggplot2::aes(x = -Inf, y = Inf, label = label),
        hjust = -0.1, vjust = 1.2,
        data = fcors)
  }

  out <- list(
    plot = gg.main,
    plot_facets = gg.facets,
    input_data = xdat,
    correlation = cors,
    params = list())

  class(out) <- c("FacileTtestComparisonViz", "FacileStaticViz", "FacileViz")
  out
}


#' Helper function to run an interaction model to generate statistics when
#' comparing two Ttest models.
#'
#' This function is only to be called from within compare.FacileTTestDGEResult.
#'
#' If we can't generate an interaction result, this will return NULL.
#' @noRd
.interaction_fdge <- function(x, y, treat_lfc = NULL, rerun = FALSE, ...,
                              .run_interaction = TRUE) {
  # If these results aren't from the same FacileDataStore, get outta here
  # NOTE: This is not really a robust way to compare if two fds are the same
  xmod <- model(x)
  ymod <- model(y)
  xres <- tidy(x)
  yres <- tidy(y)

  covariate <- param(xmod, "covariate")

  concordant.analysis <- covariate == param(ymod, "covariate") &&
    param(x, "assay_name") == param(y, "assay_name") &&
    param(x, "method") == param(y, "method")

  # Calculate estimated logFC's between the *.x and *.y logFC in case we don't
  # make it all the way through the formal interaction analysis
  ires.tmp <- xres |>
    full_join(yres, by = c("feature_id", "feature_type", "name")) |>
    transmute(feature_id, feature_type, name, logFC = logFC.x - logFC.y,
              pval = NA_real_, padj = NA_real_)

  out <- list(result = ires.tmp, x = x, y = y, rerun = FALSE,
              with_stats = FALSE)
  if (!concordant.analysis) {
    warning("Analyses are not concordant. Interaction stats will not be ",
            "generated, but delta logFC's will be provided")
    return(out)
  }
  if (!.run_interaction) {
    return(out)
  }

  xsamples <- samples(xmod)
  ysamples <- samples(ymod)

  ibatch <- unique(c(param(xmod, "batch"), param(ymod, "batch")))

  # If the models were defined on separate samples, then we have to take
  # extra care, since if the samples were split differently, then the same
  # covariate values can identify the same samples across models.
  # For example, if we did a tumor vs normal comparison in x of CRC and the
  # same comparison in y from BLCA, then we need to be more careful with
  # contrast for (crc tumor / crc normal) / (blca tumor / blca normal)
  same.samples <- setequal(
    paste(xsamples$dataset, xsamples$sample_id),
    paste(ysamples$dataset, ysamples$sample_id))

  if (same.samples) {
    samples. <- xsamples
    icovariate <- covariate
  } else {
    icovariate <- ".grp."
    xsamples[[icovariate]] <- paste0("xgrp.", xsamples[[covariate]])
    ysamples[[icovariate]] <- paste0("ygrp.", ysamples[[covariate]])
    # bind_rows can not be overriden, so we have to ensure that fds is stored
    # back into the samples. facile_frame
    samples. <- set_fds(bind_rows(xsamples, ysamples), fds(x))
    samples. <- distinct(samples., dataset, sample_id, .keep_all = TRUE)
  }

  if (length(ibatch)) {
    # remove and add values to be thorough
    for (bcov in ibatch)  samples.[[bcov]] <- NULL
    samples. <- with_sample_covariates(samples., ibatch)
  }

  # The x and y samples that came in here should have no samples that have
  # NA values in the covariates used under their testing scenario, but it's
  # possible that now that they are rbind'd together, they do.
  #
  # I *believe* this should only happen he xysamples samples. data.frame can
  # have rows with NA values for covariates that come from the `batch`
  # covaraites used in either x or y. If this is the case, I guess we just have
  # to remove that covariate from the model(?)
  test.na <- unique(c(icovariate, ibatch))
  has.na <- sapply(samples.[, test.na], function(vals) any(is.na(vals)))
  has.na <- names(has.na)[has.na]
  if (length(has.na)) {
    msg <- paste("The following covariates have samples with NA values, and",
                 "therefore can't be used in the interaction model: ",
                 paste(has.na, collapse = ","))
    not.batch <- setdiff(has.na, ibatch)
    if (length(not.batch)) {
      msg <- glue(
        msg, "\n\n",
        "These covariates are not just the 'batch' covarites in the upstream ",
        "`fdge` results. Skipping the interaction model ...",
        paste(not.batch, collapse = ","))
      warning(msg)
      return(NULL)
    }
    msg <- glue(
      msg, "\n\n",
      "The covariates HAVE BEEN REMOVED in order to run the ",
      "interaction fdge")
    warning(msg)
    ibatch <- setdiff(ibatch, has.na)
  }

  xcontrast <- xmod[["contrast_string"]]
  ycontrast <- ymod[["contrast_string"]]
  if (!same.samples) {
    xcontrast <- .prefix_contrast(xcontrast, "xgrp.")
    ycontrast <- .prefix_contrast(ycontrast, "ygrp.")
  }
  contrast. <- glue("( {xcontrast} ) - ( {ycontrast} )")

  args <- list(...)
  imodel <- flm_def(samples., icovariate, batch = ibatch,
                    block = args[["block"]], contrast. = contrast.)
  genes. <- unique(c(xres[["feature_id"]], yres[["feature_id"]]))

  ires <- tryCatch(
    fdge(imodel, features = genes.,
         method = param(x, "method"),
         assay_name = param(x, "assay_name"),
         with_sample_weights = param(x, "with_sample_weights"),
         treat_lfc = treat_lfc),
    error = function(x) NULL)

  if (!is.null(ires)) {
    out[["result"]] <- ires
    out[["with_stats"]] <- TRUE
    rerun <- rerun && !setequal(xres[["feature_id"]], genes.)

    if (rerun) {
      x <- fdge(xmod, features = genes., method = param(x, "method"),
                assay_name = param(x, "assay_name"),
                with_sample_weights = param(x, "with_sample_weights"))
      y <- fdge(ymod, features = genes., method = param(y, "method"),
                assay_name = param(y, "assay_name"),
                with_sample_weights = param(y, "with_sample_weights"))
      out[["rerun"]] <- TRUE
      out[["x"]] <- x
      out[["y"]] <- y
    }
  }

  out
}

#' @noRd
#' @importFrom stringr str_split
.prefix_contrast <- function(eqn, prefix = "x.") {
  assert_string(eqn)
  assert_string(prefix)
  splitted <- str_split(eqn, " +")[[1L]]
  out <- lapply(splitted, function(x) {
    if (x == make.names(x)) paste0(prefix, x) else x
  })
  paste(out, collapse = " ")
}

#' @noRd
#' @export
print.FacileTtestComparisonAnalysisResult <- function(x, ...) {
  cat(format(x, ...), "\n")
}

#' @noRd
#' @export
format.FacileTtestComparisonAnalysisResult <- function(x, ...) {
  has.istats <- !is.null(result(x))
  status <- if (has.istats) "with interaction" else "no interaction"

  out <- paste(
    "===========================================================\n",
    sprintf("FacileTtestComparisonAnalysisResult (%s)\n", status),
    "-----------------------------------------------------------\n",
    sep = "")

  if (has.istats) {
    mdef <- model(x)
    des <- design(mdef)
    formula <- mdef[["design_formula"]]
    test <- mdef[["contrast_string"]]
    res <- tidy(x)
    ntested <- nrow(res)
    nsig <- sum(!is.na(res[["padj"]]) & res[["padj"]] < 0.10)
    out <- paste(
      out,
      glue("Significant Results (FDR < 0.1): ({nsig} / {ntested})"), "\n",
      "Formula: ", formula, "\n",
      "Tested: ", test, "\n",
      sep = "")
  } else {
    xform <- model(param(x, "x"))[["contrast_string"]]
    yform <- model(param(x, "y"))[["contrast_string"]]
    out <- paste(out, sprintf("(%s) - (%s)\n", xform, yform), sep = "")
  }

  paste(out,
        "===========================================================\n",
        sep = "")
}
facilebio/FacileAnalysis documentation built on March 15, 2024, 7:37 a.m.