Objective

Update and replace the cal_diff_limma() function from rnaseqTools to use a HermesData object and a string with factor name for comparison as input.

Consideration: where to filter out low expression genes, as well as samples with low depth or technical failure from subsequent workflow

In the rnaseqTools vignettes pg101_example_voom.Rmd, the authors suggest to filter out low expression genes, as well as samples with low depth or technical failure from subsequent workflow. As we directly take a HermesData as object, users cannot manually take filtering.

Idea

Workflow

Later the user would proceed like this maybe: - first create, add flags, filter the HermesData object - then do diff_expression() on it, choose the method as argument - then apply autoplot() on the result to obtain e.g. volcano plot - implies that we need to have e.g. HermesDataDiffExpr object as result of function call above

Prototypes

Example:

object <- hermes_data %>%
  add_quality_flags() %>%
  filter() 

Class

We require that the HermesDataDiffExpr has columns: - log2_fc: estimate of the log2 fold change between the 2 levels of the provided factor - stat: test statistic (depends on the method used) - p_val: raw p-value - adj_p_val: adjusted p-value

.HermesDataDiffExpr <- setClass(
  Class = "HermesDataDiffExpr",
  contains = "data.frame"
)

.diff_expr_cols <- c(
  "log2_fc",
  "stat",
  "p_val",
  "adj_p_val"
)

S4Vectors::setValidity2(
  Class = "HermesDataDiffExpr",
  method = function(object) {
    msg <- validate_cols(
      required = .diff_expr_cols,
      actual = colnames(object)
    )
    if (is.null(msg)) TRUE else msg
  }
)

diff_expression()

We use here new getter functions for QC flags.

diff_expression <- function(object,
                            group,
                            method = c("limma_voom", "deseq2")) {
  assert_that(
    is_hermes_data(object),
    is.string(group)
  )
  expect_factor(colData(object)[[group]], n.levels = 2L)
  method <- match.arg(method, c("limma_voom", "deseq2"))

  if (anyNA(get_tech_failure(object))) {
    warning("NAs in technical failure flags, please make sure to use `add_quality_flags()` beforehand")
  }
  if (anyNA(get_low_depth(object))) {
    warning("NAs in low depth flags, please make sure to use `add_quality_flags()` beforehand")
  }
  if (anyNA(get_low_expression(object))) {
    warning("NAs in low expression flags, please make sure to use `add_quality_flags()` beforehand")
  }
  form <- as.formula(paste("~", group))
  design <- model.matrix(form, data = colData(object))  
  result <- switch(
    method,
    "limma_voom" = h_diff_expr_limma(object, design),
    "deseq2" = h_diff_expr_deseq2(object, design)
  )
  .HermesDataDiffExpr(result)
}

Note that we mainly do checks in this function and produce the model matrix.

Analysis using limma package

library(limma)

h_diff_expr_limma <- function(object, design) {
  assert_that(
    is_hermes_data(object),
    is.matrix(design)
  )
  obj_count_voom <- limma::voom(counts(object))
  fit <- limma::lmFit(obj_count_voom, design)
  eb <- limma::eBayes(fit)
  top_tab <- limma::topTable(
    eb, 
    coef = 2L, 
    n = nrow(obj_count_voom),  # Retain all genes.
    sort.by = "p"  # Use adjusted p-value to sort.
  ) 
  with(
    top_tab,
    data.frame(
      log2_fc = logFC,
      stat = t,
      p_val = P.Value,
      adj_p_val = adj.P.Val,
      row.names = rownames(top_tab)
    )
  )
}

Let's try this out:

design <- model.matrix(~ SEX, colData(object))
res_limma <- h_diff_expr_limma(object, design)
head(res_limma)

Analysis using DESeq2 package

We try here to obtain the same result structure - data frame with these columns above.

library(DESeq2)

h_diff_expr_deseq2 <- function(object, design) {
  assert_that(
    is_hermes_data(object),
    is.matrix(design)
  )
  deseq_data <- DESeqDataSet(se = object, design = design)
  deseq_data_processed <- DESeq(deseq_data, quiet = TRUE)
  deseq_data_res <- results(deseq_data_processed) # Note: this has multiple options we might want to use.
  deseq_data_res_df <- as.data.frame(deseq_data_res)
  deseq_data_res_df <- deseq_data_res_df[order(deseq_data_res_df$padj), ]  # Use adj p-value to sort.
  with(
    deseq_data_res_df,
    data.frame(
      log2_fc = log2FoldChange,
      stat = stat,
      p_val = pvalue,
      adj_p_val = padj,
      row.names = rownames(deseq_data_res_df)
    )
  )
}

Let's also try this out:

design <- model.matrix(~ SEX, colData(object))
res_deseq2 <- h_diff_expr_deseq2(object, design)
head(res_deseq2)

Now we can try out the wrapper function too:

colData(object)$SEX <- factor(colData(object)$SEX)
res1 <- diff_expression(object, "SEX", method = "limma_voom")
res2 <- diff_expression(object, "SEX", method = "deseq2")

head(res1)
head(res2)

autoplot for vacano plot

Now let's look at the volcano plot.

library(ggrepel)

setMethod(
  f = "autoplot",
  signature = signature(object = "HermesDataDiffExpr"),
  definition = function(object,
                        adj.P.Val.threshold = 0.05,
                        logFC.threshold = 2.5) {

      object$difexpr <- "NO"
      object$difexpr[object$log2_fc > abs(logFC.threshold) &
                       object$adj_p_val < adj.P.Val.threshold] <- "UP" 
      object$difexpr[object$log2_fc < -abs(logFC.threshold) &
                       object$adj_p_val < adj.P.Val.threshold] <- "DOWN"

      object$label <- NA
      object$label[object$difexpr != "NO"] <- rownames(object)[object$difexpr != "NO"]

      ggplot(data=object, 
             aes(x=log2_fc, 
             y=-log10(adj_p_val), 
             col=difexpr,
             label=label)
      ) +
      geom_point() + 
      geom_text_repel() +
      xlab("log2 fold change") + 
      ylab("-log10 adjusted p-value") +
      geom_vline(xintercept=c(-abs(logFC.threshold), abs(logFC.threshold)), col="black") +
      geom_hline(yintercept=-log10(adj.P.Val.threshold), col="black")
  }
)

Let's try it out:

autoplot(res1)


insightsengineering/hermes documentation built on Sept. 19, 2024, 9:06 p.m.