FacileData: A fluent API for accessing multi-assay high-throughput genomics data

context("batch effect removal")

# Currently the remove_batch_effect functionality is only tested through its
# delegation via the fetch_assay_data(..., normalized = TRUE, batch = "xxx")
# modality.

if (!exists("FDS")) FDS <- exampleFacileDataSet()

samples <- FDS |>
  filter_samples(stage == "III") |>
  select(dataset, sample_id)

genes <- c(
  PRF1='5551',
  GZMA='3001',
  CD274='29126',
  TIGIT='201633')

features <- tibble(assay='rnaseq', feature_id=genes)

test_that("batch effect correction mimics limma::removeBatchEffect", {
  smpls <- FDS |>
    filter_samples(indication == "BLCA") |>
    with_sample_covariates() |>
    mutate(sample_key = paste(dataset, sample_id, sep = "__"))

  dat <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                          as.matrix = TRUE)
  expect_equal(smpls$sample_key, colnames(dat))

  # Normalize by one batch covaraite ...........................................
  dat.norm1 <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                                as.matrix = TRUE, batch = "sex",
                                maintain.rowmeans = FALSE)
  e.norm1 <- limma::removeBatchEffect(dat, batch = smpls$sex)
  expect_equal(dat.norm1, e.norm1)

  # the fixed expression matrix has shifted rowMeans. ..........................
  expect_true(!isTRUE(all.equal(rowMeans(dat.norm1), rowMeans(dat))))

  # We can maintain the rowmeans by setting saintain.rowmanes = TRUE
  # which is the default.
  same.mean <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                                as.matrix = TRUE, batch = "sex",
                                maintain.rowmeans = TRUE)
  expect_equal(rowMeans(same.mean), rowMeans(dat))

  # Normalize by two batch covaraites ..........................................
  set.seed(123)
  smpls$dummy <- sample(c("a", "b"), nrow(smpls), replace = TRUE)
  dat.norm2 <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                                as.matrix = TRUE, batch = c("sex", "dummy"),
                                maintain.rowmeans = FALSE)
  e.norm2 <- limma::removeBatchEffect(dat, batch = smpls$sex,
                                      batch2 = smpls$dummy)
  expect_equal(dat.norm2, e.norm2)

  # Normalize with a real valued covariate .....................................
  smpls$real <- rnorm(nrow(smpls), mean = 0)
  smpls$real[1:7] <- rnorm(7, mean = 1)
  dat.normR <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                                as.matrix = TRUE, batch = c("real"),
                                maintain.rowmeans = FALSE)
  e.normR <- limma::removeBatchEffect(dat, covariates = smpls$real)
  expect_equal(dat.normR, e.normR)

  # full bore ..................................................................
  dat.norm.uber <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                                    as.matrix = TRUE, batch = c("sex", "real"),
                                    main = "sample_type",
                                    maintain.rowmeans = FALSE)

  des <- model.matrix(~ sample_type + sex + real, smpls)
  e.norm.uber <- limma::removeBatchEffect(dat, design = des[, 1:2],
                                          covariates = des[, -(1:2)])
  expect_equal(dat.norm.uber, e.norm.uber)

  # Final check for equal rowmeans functionality ...............................
  expect_true(!isTRUE(all.equal(rowMeans(dat.norm.uber), rowMeans(dat))))
  d2 <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                         as.matrix = TRUE, batch = c("sex", "real"),
                         main = "sample_type",
                         maintain.rowmeans = TRUE)
  expect_equal(rowMeans(d2), rowMeans(dat))
})

test_that("batch correction using 'facile' covariate works", {
  smpls <- FDS |>
    filter_samples(indication == "BLCA") |>
    with_sample_covariates() |>
    mutate(sample_key = paste(dataset, sample_id, sep = "__"))

  # Unnormalized
  unnorm <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                             as.matrix = TRUE)

  # Normalize by 'sex' when 'sex' is in the sample frame
  dat.norm1 <- fetch_assay_data(FDS, features, smpls, normalized = TRUE,
                                as.matrix = TRUE, batch = "sex")
  
  # Check that all attributes (dimensions, colnames, rownames) of corrected and
  # uncorrected data are the same ...
  expect_matrix(dat.norm1, nrows = nrow(unnorm), ncols = ncol(unnorm))
  expect_equal(rownames(dat.norm1), rownames(unnorm))
  expect_equal(colnames(dat.norm1), colnames(unnorm))
  # ... but the actual values should be different
  expect_string(all.equal(unnorm, dat.norm1), pattern = "relative difference")
  
  # Normalize by 'sex' when it's not in the sample frame
  dat.norm2 <- fetch_assay_data(FDS, features, select(smpls, -sex),
                                normalized = TRUE, as.matrix = TRUE,
                                batch = "sex")
  expect_true(all.equal(dat.norm1, dat.norm2))
})

test_that("batch effect correction will handle missing covariate levels", {
  set.seed(122)
  s <- samples |>
    mutate(bcov = sample(c("a", "b", NA), nrow(samples), replace = TRUE))
  raw <- fetch_assay_data(s, genes, normalize = TRUE, as.matrix = TRUE) 
  bc <- fetch_assay_data(s, genes, batch = "bcov", normalize = TRUE,
                         as.matrix = TRUE)
  
  # this should not be equal
  expect_string(all.equal(raw, bc), pattern = "relative difference")
  
  # manually setting NA batch variables to an outgroup should match `bc`
  s <- mutate(s, bcov2 = ifelse(is.na(bcov), "outgroup", bcov))
  bc.man <- remove_batch_effect(raw, s, batch = "bcov2")
  expect_equal(bc.man, bc)
})

test_that("single-gene batch correction is equivalent to all data correction", {
  set.seed(0xBEEF)
  smpls <- filter_samples(FDS, indication == "BLCA") |>
    collect(n = INF) |>
    mutate(real.batch = rnorm(n()))

  bc.all <- fetch_assay_data(smpls, normalized = TRUE, as.matrix = TRUE,
                             batch = c("sex", "real.batch"))
  fname <- rownames(bc.all)[3]
  bc.1 <- fetch_assay_data(smpls, features = fname, normalized = TRUE,
                           as.matrix = TRUE, batch = c("sex", "real.batch"))
  expect_equal(bc.1[1,], bc.all[fname,])

  # ensure we were testing a batch corrected thing
  orig.1 <- fetch_assay_data(smpls, features = fname, normalized = TRUE,
                             as.matrix = TRUE)
  expect_equal(colnames(bc.1), colnames(orig.1))
  expect_true(!isTRUE(all.equal(bc.1[1,], orig.1[1,])))
})

facilebio/FacileData documentation built on Feb. 23, 2024, 9:14 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

facilebio/FacileData
A fluent API for accessing multi-assay high-throughput genomics data

tests/testthat/test-remove-batch-effect.R
In facilebio/FacileData: A fluent API for accessing multi-assay high-throughput genomics data

R Package Documentation

Browse R Packages

We want your feedback!

facilebio/FacileData A fluent API for accessing multi-assay high-throughput genomics data

tests/testthat/test-remove-batch-effect.R In facilebio/FacileData: A fluent API for accessing multi-assay high-throughput genomics data

R Package Documentation

Browse R Packages

We want your feedback!

facilebio/FacileData
A fluent API for accessing multi-assay high-throughput genomics data

tests/testthat/test-remove-batch-effect.R
In facilebio/FacileData: A fluent API for accessing multi-assay high-throughput genomics data