R/test_de.R

Defines functions test_de_q test_de

Documented in test_de

#' Test for Differential Expression
#'
#' Conduct a quasi-likelihood ratio test for a Gamma-Poisson fit.
#'
#' @inheritParams glm_gp
#' @param fit object of class `glmGamPoi`. Usually the result of calling `glm_gp(data, ...)`
#' @param contrast The contrast to test. Can be a single column name (quoted or as a string)
#'   that is removed from the  full model matrix of `fit`. Or a complex contrast comparing two
#'   or more columns: e.g. `A - B`, `"A - 3 * B"`, `(A + B) / 2 - C` etc. \cr
#'   Only one of `contrast` or `reduced_design` must be specified.
#' @param reduced_design a specification of the reduced design used as a comparison to see what
#'   how much better `fit` describes the data.
#'   Analogous to the `design` parameter in `glm_gp()`, it can be either a `formula`,
#'   a `model.matrix()`, or a `vector`. \cr
#'    Only one of `contrast` or `reduced_design` must be specified.
#' @param full_design option to specify an alternative `full_design` that can differ from
#'   `fit$model_matrix`. Can be a `formula` or a `matrix`. Default: `fit$model_matrix`
#' @param subset_to a vector with the same length as `ncol(fit$data)` or  an expression
#'   that evaluates to such a vector. The expression can reference columns from `colData(fit$data)`.
#'   A typical use case in single cell analysis would be to subset to a specific cell type
#'   (e.g. `subset_to = cell_type == "T-cells"`). Note that if this argument is set a new
#'   the model for the `full_design` is re-fit.\cr
#'   Default: `NULL` which means that the data is not subset.
#' @param pseudobulk_by a vector with the same length as `ncol(fit$data)` that is used to
#'   split the columns into different groups (calls [split()]). `pseudobulk_by` can also be an
#'   expression that evaluates to a vector. The expression can reference columns from `colData(fit$data)`. \cr
#'   The counts are summed across the groups
#'   to create "pseudobulk" samples. This is typically used in single cell analysis if the cells come
#'   from different samples to get a proper estimate of the differences. This is particularly powerful
#'   in combination with the `subset_to` parameter to analyze differences between samples for
#'   subgroups of cells. Note that this does a fresh fit for both the full and the reduced design.
#'   Default: `NULL` which means that the data is not aggregated.
#' @param pval_adjust_method one of the p-value adjustment method from
#'   [p.adjust.methods]. Default: `"BH"`.
#' @param sort_by the name of the column or an expression used to sort the result. If `sort_by` is `NULL`
#'   the table is not sorted. Default: `NULL`
#' @param decreasing boolean to decide if the result is sorted increasing or decreasing
#'   order. Default: `FALSE`.
#' @param n_max the maximum number of rows to return. Default: `Inf` which means that all
#'   rows are returned
#'
#' @return a `data.frame` with the following columns
#' \describe{
#'   \item{name}{the rownames of the input data}
#'   \item{pval}{the p-values of the quasi-likelihood ratio test}
#'   \item{adj_pval}{the adjusted p-values returned from [p.adjust()]}
#'   \item{f_statistic}{the F-statistic: \eqn{F = (Dev_full - Dev_red) / (df_1 * disp_ql-shrunken)}}
#'   \item{df1}{the degrees of freedom of the test: `ncol(design) - ncol(reduced_design)`}
#'   \item{df2}{the degrees of freedom of the fit: `ncol(data) - ncol(design) + df_0`}
#'   \item{lfc}{the log2-fold change. If the alternative model is specified by `reduced_design` will
#'    be `NA`.}
#' }
#'
#' @examples
#'   Y <- matrix(rnbinom(n = 30 * 100, mu = 4, size = 0.3), nrow = 30, ncol = 100)
#'   annot <- data.frame(sample = sample(LETTERS[1:6], size = 100, replace = TRUE),
#'                       cont1 = rnorm(100), cont2 = rnorm(100, mean = 30))
#'   annot$condition <- ifelse(annot$sample %in% c("A", "B", "C"), "ctrl", "treated")
#'   head(annot)
#'   se <- SummarizedExperiment::SummarizedExperiment(Y, colData = annot)
#'   fit <- glm_gp(se, design = ~ condition + cont1 + cont2)
#'
#'   # Test with reduced design
#'   res <- test_de(fit, reduced_design = ~ condition + cont1)
#'   head(res)
#'
#'   # Test with contrast argument, the results are identical
#'   res2 <- test_de(fit, contrast = cont2)
#'   head(res2)
#'
#'   # The column names of fit$Beta are valid variables in the contrast argument
#'   colnames(fit$Beta)
#'
#'
#'   # You can also have more complex contrasts:
#'   # the following compares cont1 vs cont2:
#'   test_de(fit, cont1 - cont2, n_max = 4)
#'
#'
#'   # You can also sort the output
#'   test_de(fit, cont1 - cont2, n_max = 4,
#'           sort_by = "pval")
#'
#'   test_de(fit, cont1 - cont2, n_max = 4,
#'           sort_by = - abs(f_statistic))
#'
#'   # If the data has multiple samples, it is a good
#'   # idea to aggregate the cell counts by samples.
#'   # This is called "pseudobulk".
#'   test_de(fit, contrast = "conditiontreated", n_max = 4,
#'           pseudobulk_by = sample)
#'
#'
#'   # You can also do the pseudobulk only on a subset of cells:
#'   cell_types <- sample(c("Tcell", "Bcell", "Makrophages"), size = 100, replace = TRUE)
#'   test_de(fit, contrast = "conditiontreated", n_max = 4,
#'           pseudobulk_by = sample,
#'           subset_to = cell_types == "Bcell")
#'
#'
#'   # Be care full, if you included the cell type information in
#'   # the original fit, after subsetting the design matrix would
#'   # be degenerate. To fix this, specify the full_design in 'test_de()'
#'   SummarizedExperiment::colData(se)$ct <- cell_types
#'   fit_with_celltype <- glm_gp(se, design = ~ condition + cont1 + cont2 + ct)
#'   test_de(fit_with_celltype, contrast = cont1, n_max = 4,
#'           full_design =  ~ condition + cont1 + cont2,
#'           pseudobulk_by = sample,
#'           subset_to = ct == "Bcell")
#'
#'
#'
#' @references
#' * Lund, S. P., Nettleton, D., McCarthy, D. J., & Smyth, G. K. (2012). Detecting differential expression
#'   in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates. Statistical
#'   Applications in Genetics and Molecular Biology, 11(5).
#'   [https://doi.org/10.1515/1544-6115.1826](https://doi.org/10.1515/1544-6115.1826).
#'
#' @seealso [glm_gp()]
#'
#' @export
test_de <- function(fit,
                    contrast,
                    reduced_design = NULL,
                    full_design = fit$model_matrix,
                    subset_to = NULL, pseudobulk_by = NULL,
                    pval_adjust_method = "BH", sort_by = NULL,
                    decreasing = FALSE, n_max = Inf,
                    verbose = FALSE){
  # Capture all NSE variables
  contrast_capture <- substitute(contrast)
  subset_to_capture <- substitute(subset_to)
  pseudobulk_by_capture <- substitute(pseudobulk_by)
  sort_by_capture <- substitute(sort_by)

  test_de_q(fit, contrast = contrast_capture, reduced_design = reduced_design, full_design = full_design,
            subset_to = subset_to_capture, pseudobulk_by = pseudobulk_by_capture,
            pval_adjust_method = pval_adjust_method, sort_by = sort_by_capture,
            decreasing = decreasing, n_max = n_max,
            verbose = verbose,
            env = parent.frame())
}

# This function is necessary to handle the NSE stuff
# for an explanation see:
# http://adv-r.had.co.nz/Computing-on-the-language.html
test_de_q <- function(fit,
                      contrast,
                      reduced_design = NULL,
                      full_design = fit$model_matrix,
                      subset_to = NULL, pseudobulk_by = NULL,
                      pval_adjust_method = "BH", sort_by = NULL,
                      decreasing = FALSE, n_max = Inf,
                      verbose = FALSE,
                      env = parent.frame()){
  if(! is.matrix(full_design) || length(full_design) != length(fit$model_matrix) || ! all(full_design == fit$model_matrix) ){
    full_design <- handle_design_parameter(full_design, fit$data, SummarizedExperiment::colData(fit$data), NULL)$model_matrix
  }
  if(is.null(reduced_design) == missing(contrast)){
    stop("Please provide either an alternative design (formula or matrix) or a contrast ",
         "(name of a column in fit$model_matrix or a combination of them).")
  }

  pseudobulk_by_e <- eval_with_q(pseudobulk_by, SummarizedExperiment::colData(fit$data), env = env)
  subset_to_e <- eval_with_q(subset_to, SummarizedExperiment::colData(fit$data), env = env)
  if(! is.null(pseudobulk_by_e)){
    if(verbose) { message("Aggregate counts of columns that belong to the same sample") }
    # This method aggregates the data
    # then does a fresh fit for the full model
    # then calls test_de() with the reduced dataset
    return(test_pseudobulk_q(fit$data, design = full_design,
                             aggregate_cells_by = pseudobulk_by_e,
                             contrast = contrast,
                             reduced_design = reduced_design,
                             subset_to = subset_to_e,
                             pval_adjust_method = pval_adjust_method, sort_by = sort_by,
                             decreasing = decreasing, n_max = n_max,
                             verbose = verbose, env = env))
  }

  if(! is.null(subset_to_e)){
    if(verbose) { message("Subset the dataset, thus a new model is fitted!") }
    # Only run test on a subset of things:
    # Redo fit, but keep dispersion
    data_subset <- fit$data[,subset_to_e,drop=FALSE]

    model_matrix_subset <- full_design[subset_to_e, ,drop=FALSE]
    size_factor_subset <- fit$size_factors[subset_to_e]

    fit_subset <- glm_gp(data_subset, design = model_matrix_subset, size_factors = size_factor_subset,
                  overdispersion = fit$overdispersions, on_disk = is_on_disk.glmGamPoi(fit),
                  verbose = verbose)
    test_res <- test_de_q(fit_subset, contrast = contrast, reduced_design = reduced_design,
                          subset_to = NULL, pseudobulk_by = NULL,
                          pval_adjust_method = pval_adjust_method, sort_by = sort_by,
                          decreasing = decreasing, n_max = n_max,
                          verbose = verbose, env = env)
    return(test_res)
  }
  if(is.null(fit$overdispersion_shrinkage_list)){
    stop("fit$overdispersion_shrinkage_list is NULL. Run 'glm_gp' with ",
         "'overdispersion_shrinkage = TRUE'.")
  }
  disp_trend <- fit$overdispersion_shrinkage_list$dispersion_trend
  if(! missing(contrast)){
    # e.g. a vector with c(1, 0, -1, 0) for contrast = A - C
    cntrst <- parse_contrast_q(contrast, levels = colnames(fit$model_matrix),
                               env = env)
    # The modifying matrix of reduced_design has ncol(model_matrix) - 1 columns and rank.
    # The columns are all orthogonal to cntrst.
    # see: https://scicomp.stackexchange.com/a/27835/36204
    # Think about this as a rotation of of the design matrix. The QR decomposition approach
    # has the added benefit that the new columns are all orthogonal to each other, which
    # isn't necessary, but makes fitting more robust
    # The following is a simplified version of edgeR's glmLRT (line 159 in glmfit.R)
    qrc <- qr(cntrst)
    rot <- qr.Q(qrc, complete = TRUE)[,-1,drop=FALSE]
    reduced_design <- fit$model_matrix %*% rot
    lfc <- fit$Beta %*% cntrst / log(2)
  }else{
    lfc <- NA
  }
  if(verbose){message("Fit reduced model")}
  data <- fit$data
  do_on_disk <- is_on_disk.glmGamPoi(fit)
  fit_alt <- glm_gp(data, design = reduced_design,
                    size_factors = fit$size_factors,
                    overdispersion = disp_trend,
                    overdispersion_shrinkage = FALSE,
                    on_disk = do_on_disk)

  if(ncol(fit_alt$model_matrix) >= ncol(fit$model_matrix)){
    stop("The reduced model is as complex (or even more complex) than ",
         "the 'fit' model. The 'reduced_design' should contain fewer terms ",
         "the original 'design'.")
  }

  # Likelihood ratio
  if(verbose){message("Calculate quasi likelihood ratio")}
  lr <- fit_alt$deviances - fit$deviances
  df_test <- ncol(fit$model_matrix) - ncol(fit_alt$model_matrix)
  df_test <- ifelse(df_test == 0, NA, df_test)
  df_fit <- fit$overdispersion_shrinkage_list$ql_df0 + (ncol(data) - ncol(fit_alt$model_matrix))
  f_stat <- lr / df_test / fit$overdispersion_shrinkage_list$ql_disp_shrunken
  pval <- pf(f_stat, df_test, df_fit, lower.tail = FALSE, log.p = FALSE)
  adj_pval <- p.adjust(pval, method = pval_adjust_method)

  names <- rownames(data)
  if(is.null(names)){
    names <- sprintf(paste0("row_%0", floor(log10(nrow(data))), "i"), seq_len(nrow(data)))
  }
  if(verbose){message("Preprare results")}
  res <- data.frame(name = names, pval = pval, adj_pval = adj_pval,
                    f_statistic = f_stat, df1 = df_test, df2 = df_fit,
                    lfc = lfc,
                    stringsAsFactors = FALSE, row.names = NULL)
  sort_by_e <- eval_with_q(sort_by, res, env = env)
  res <- if(is.null(sort_by_e)) {
    res
  }else{
    res[order(sort_by_e, decreasing = decreasing), ]
  }
  res[seq_len(min(nrow(res), n_max)), ,drop=FALSE]
}

Try the glmGamPoi package in your browser

Any scripts or data that you put into this service are public.

glmGamPoi documentation built on Nov. 8, 2020, 7:14 p.m.