knitr::opts_chunk$set(echo = TRUE)

How can this work ?

Idea 1 - "Manual implementation"

Take the summarized experiment and calculate the Variance across each gene using the rowVars function. Then subset the data to only use the top 500 genes with max variance. - Pros: Quick and simple to implement in tmh - Cons: Limited testing options in tmh and no availability to hermes user

library(hermes)
object <- hermes_data
object2 <- top_genes(
  object,
  assay_name = "counts",
  summary_fun = rowVars,
  n_top = 500
)
object3 <- object[rowData(object)$GeneID %in% as.character(object2$name)]

Note that we can also use the subset() function to make this better readable:

object4 <- subset(object, subset = GeneID %in% as.character(object2$name))
identical(object3, object4)

Idea 2 - Use Filter function

Implement filtering top variant genes and implement in filter function - Pros: Valuable if filtering by top variant genes would be of interest beyond PCA plotting functionality e.g. we can check this set of genes with other modules. - Cons: Not sure if it fits with gene filtering based on low expression, low depth, tech failure. As far as I understand from the notes, the use cases is really in relation to the PCA analysis and does not go beyond that application - Challenge: filter_genes does not return named logical vector, therefore we would need to create a function or transform into a logical vector first before it can be incorporated into filter function - see below.

library(assertthat)
setMethod(
  f = "filter",
  signature = signature(object = "AnyHermesData"),
  definition = function(object,
                        what = c("genes", "samples"),
                        annotation_required = "WidthBP",
                        assay_name = "counts",
                        top_genes = NULL) {
    low_exp <- get_low_expression(object)
    low_depth <- get_low_depth(object)
    tech_fail <- get_tech_failure(object)
    what <- match.arg(what, c("genes", "samples"), several.ok = TRUE)
    assert_that(
      noNA(low_exp),
      noNA(low_depth),
      noNA(tech_fail),
      msg = "still NA in quality flags, please first run add_quality_flags() to fill them"
    )
    rows <- if ("genes" %in% what) {
      initial <- !low_exp & h_has_req_annotations(object, annotation_required)

      is_top_gene <- if (is.null(top_genes)) {
        rep(TRUE, length(low_exp))
      } else {
        filter_genes <- top_genes(
          object[initial, ], # only give here the pre-filtered object
          assay_name = "counts",
          summary_fun = rowVars,
          n_top = top_genes
        )
        genes(object) %in% as.character(filter_genes$name)
      }

      initial & is_top_gene
    } else {
      rep_len(TRUE, length(low_exp))
    }
    cols <- if ("samples" %in% what) {
      !low_depth & !tech_fail
    } else {
      rep_len(TRUE, length(low_depth))
    }
    if (!any(rows)) {
      warning("filtering out all genes")
    }
    if (!any(cols)) {
      warning("filtering out all samples")
    }
    object[rows, cols]
  }
)

Try it out:

f1 <- filter(object)
f1
f2 <- filter(object, top_genes = 200)
f2

Does look quite nice I have to say. It is a "second step" row filtering here in terms of the logic.

Idea 3 - PCA filter function

Implement functionality from idea 1 into calc_pca by implementing a new argument.

In production will need to add if statement, for when or when not n_top is selected - Pros: full usage of testing capabilities in Hermes, can be used outside of tmh - Cons: * can only be utilized in calc_pca function * not sure if we get consistent results when we first calculate PCA with 500 genes and then correlate that with sample variables? should be fine because there is one value for each sample, and they are not changed

calc_pca_filtered <- function(object,
                              assay_name = "counts",
                              n_top = 500) {
  assert_that(
    is_hermes_data(object),
    is.string(assay_name)
  )

  x_samples_filter <- top_genes(
    object = object,
    assay_name = assay_name,
    summary_fun = rowVars,
    n_top = n_top
  )

  x_samples_filtered <- object[rowData(object)$GeneID %in% as.character(x_samples_filter$name)]
  x_samples <- assay(x_samples_filtered, assay_name) 

  x_genes <- t(x_samples)
  gene_is_constant <- apply(x_genes, MARGIN = 2L, FUN = S4Vectors::isConstant)
  x_genes_remaining <- x_genes[, !gene_is_constant]

  pca <- stats::prcomp(
    x = x_genes_remaining, 
    center = TRUE,
    scale = TRUE,
    tol = sqrt(.Machine$double.eps)
  )

  hermes:::.HermesDataPca(pca)
}

Test calc_pca_filtered out

object <- hermes_data
result <- calc_pca_filtered(object, assay_name = "counts", 700)
summary(result)
autoplot(result)

Agree that overall this looks like the best fit. We avoid a 2-stage approach as with filtering and we make it very easy for users.



insightsengineering/hermes documentation built on March 11, 2024, 11:04 p.m.