This vignette explores the design of the normalization workflow and functions in hermes.

Plan

Starting point

Control function

control_normalize <- function(log = TRUE,
                              lib_sizes = NULL,
                              prior_count = 1) {
  assert_that(
    is.flag(log),
    is.null(lib_sizes) || utils.nest::is_integer_vector(lib_sizes),
    is.number(prior_count) && prior_count >= 0
  )
  list(
    log = log,
    lib_sizes = lib_sizes,
    prior_count = prior_count
  )
}
cont <- control_normalize()

Additional Assertions

Just to simplify our life.

is_hermes_data <- function(object) {
  is_class(object, "AnyHermesData")
}

h <- hermes_data
is_hermes_data(h)

Helper functions

For detailed background on the normalization methods, see here.

CPM

h_cpm <- function(object, 
                  control) {
  assert_that(
    is_hermes_data(object),
    utils.nest::is_fully_named_list(control)
  )
  edgeR::cpm(
    y = counts(object),
    lib.size = control$lib_sizes,
    log = control$log,
    prior.count = control$prior_count
  )
}

counts_cpm <- h_cpm(h, cont)
str(counts_cpm)

RPKM

h_rpkm <- function(object, 
                   control) {
  assert_that(
    is_hermes_data(object),
    utils.nest::is_fully_named_list(control),
    noNA(rowData(object)$WidthBP)
  )
  edgeR::rpkm(
    y = counts(object),
    gene.length = rowData(object)$WidthBP,
    lib.size = control$lib_sizes,
    log = control$log,
    prior.count = control$prior_count
  )
}

counts_rpkm <- h_rpkm(h, cont)
str(counts_rpkm)

TPM

h_tpm <- function(object, 
                  control) {
  rpkm <- h_rpkm(object, control_normalize(log = FALSE))
  rpkm_sums <- colSums(rpkm, na.rm = TRUE)
  tpm <- sweep(rpkm, MARGIN = 2, STATS = rpkm_sums, FUN = "/") * 1e6
  if (control$log) {
    log2(tpm + control$prior_count)
  } else {
    tpm
  }
}

counts_tpm <- h_tpm(h, cont)
str(counts_tpm)

Double check that this gives the same as the more manual calculation:

widths <- rowData(h)$WidthBP
geneLengthPerKb <- 1000 / widths
rate <- sweep(counts(h), 1, geneLengthPerKb,"*")
mls  <- colSums(rate, na.rm = TRUE)
tpm  <- sweep(rate, 2, mls, "/") * 1E6
tpm2 <- h_tpm(h, control_normalize(log = FALSE))
tpm[1:3, 1:3]
tpm2[1:3, 1:3]

VOOM

h_voom <- function(object, 
                   control) {
  assert_that(
    is_hermes_data(object),
    utils.nest::is_fully_named_list(control)
  )
  norm_log2 <- limma::voom(
    counts = counts(object),
    lib.size = control$lib_sizes
  )$E
  if (control$log) {
    norm_log2
  } else {
    2^norm_log2
  }
}

counts_voom <- h_voom(h, cont)
str(counts_voom)

Compare with CPM:

cnts <- counts(h)
lib.size <- colSums(cnts)
y <- t(log2(t(cnts + 0.5)/(lib.size + 1) * 1e+06))
y[1:3, 1:3]
counts_voom[1:3, 1:3]
h_cpm(h, control_normalize(log = TRUE, prior_count = 0.5, lib_sizes = as.integer(lib.size + 1L)))[1:3, 1:3]

So that means this is just a very slight variation of CPM. We might therefore drop this later if not needed.

Method

Let's put this all together in the method.

setMethod(
  f = "normalize",
  signature = "AnyHermesData",
  definition = function(object, 
                        methods = c("tpm", "cpm", "rpkm", "voom"),
                        control = control_normalize(),
                        ...) {
    methods <- match.arg(methods, several.ok = TRUE)
    for (method in methods) {
      fun_name <- paste0("h_", method)
      assay(object, method) <- do.call(fun_name, list(object = object, control = control))
    }
    metadata(object) <- c(metadata(object), list(control_normalize = control))
    object
  }
)

Let's try it out.

h_norm <- normalize(h)
assayNames(h_norm)
assay(h_norm, "tpm")[1:3, 1:3]

So that looks good.



insightsengineering/hermes documentation built on March 11, 2024, 11:04 p.m.