tests/testthat/test-import-utils.R

context("test-import-utils")
data("trichoptera")
counts     <- data.matrix(trichoptera$Abundance)
covariates <- trichoptera$Covariate

## Test common_samples ------------------------------------------------------------------------------------

test_that("common_samples throws warnings on matrices with no dimension names",  {
  ## No names for abundance matrix
  expect_warning(common_samples(`dimnames<-`(counts, NULL),
                                covariates))
  ## No rownames for abundance matrix (and therefore no matching names)
  expect_warning(common_samples(`rownames<-`(counts, NULL),
                                covariates))
  ## No names for covariates matrix
  ## (must transform covariates to matrix as data.frames always have rownames)
  expect_warning(common_samples(counts,
                                data.matrix(covariates, rownames.force = FALSE)))
})

test_that("common_samples fails on matrices with no dimension names and incompatibles dimensions",  {
  ## No names for abundance matrix
  expect_error(suppressWarnings(common_samples(unname(counts) %>% t(), covariates)))
  ## No rownames for covariates matrix
  expect_error(suppressWarnings(common_samples(counts %>% t(), data.matrix(covariates, rownames.force = FALSE))))
})

test_that("common_samples succeeds on matrices with no or partial dimension names but compatible dimensions",  {
  ## No dimensions names
  expect_warning(result <- common_samples(unname(counts), data.matrix(covariates)))
  expect_equal(result,
               list(transpose_counts = FALSE,
                    common_samples   = paste0("Sample_", 1:49),
                    default_names    = TRUE))
  ## Partial dimensions names: sample names only in count table
  expect_warning(result <- common_samples(`colnames<-`(counts, NULL),
                                          data.matrix(covariates)))
  expect_equal(result,
               list(transpose_counts = FALSE,
                    common_samples   = paste0(1:49),
                    default_names    = TRUE))
  ## Partial dimensions names: samples names only in covariates data.frame
  expect_warning(result <- common_samples(`rownames<-`(counts, NULL),
                                          covariates))
  expect_equal(result,
               list(transpose_counts = FALSE,
                    common_samples   = paste0(1:49),
                    default_names    = TRUE))
})

test_that("common_samples fails on matrices with dimension names but no common samples",  {
  expect_error(
    suppressWarnings(
      common_samples(
        `rownames<-`(counts, paste0("Sample_", 1:49)),
        covariates)),
    "Conflicting samples names in count matrix and covariates data frames"
  )
})

test_that("common_samples succeeds on matrices with dimension names and different orientations",  {
  expect_equal(common_samples(t(counts), covariates),
               list(transpose_counts = TRUE,
                    common_samples   = as.character(1:49),
                    default_names    = FALSE))
})

test_that("common_samples find biggest subset of common samples and produces message.",  {
  expect_message(result <- common_samples(counts, covariates[1:35, ]),
                 "14 samples were dropped from the abundance matrix for lack of associated covariates.")
  expect_length(result$common_samples, 35)
  expect_equal(result$common_samples,
               as.character(1:35))
})

## Test offset_functions --------------------------------------------------------------------------------

## all samples are identical up to a proportionality coefficient (induce explicit simple computations)
test_that("compute_offset provides correct answers for proportional samples", {
  sizes <- c(1, 2, 3, 5, 6)
  counts <- sizes %o% 1:10
  median_scale_size <- median(sizes)
  geom_mean_size <- geom_mean(sizes)
  gmpr <- sapply(seq_along(sizes), function(i) { geom_mean(sizes[i]/sizes[-i]) } )
  expected_tss <- sizes * sum(1:10)

  expect_equal(compute_offset(counts, "TSS"),  expected_tss)
  expect_equal(compute_offset(counts, "TSS", scale = "count"),  expected_tss)
  expect_equal(compute_offset(counts, "CSS"),  sizes / median_scale_size)
  expect_equal(compute_offset(counts, "CSS", scale = "count"),  expected_tss)
  expect_equal(compute_offset(counts, "RLE"),  sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "RLE", scale = "count"),  expected_tss)
  expect_equal(compute_offset(counts, "TMM"), sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "TMM", scale = "count"), expected_tss)
  expect_equal(compute_offset(counts, "GMPR"), gmpr)
  expect_equal(compute_offset(counts, "Wrench", type = "simple"), sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "Wrench", type = "simple", scale = "count"), expected_tss)
  expect_equal(compute_offset(counts, "Wrench", type = "wrench"), sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "Wrench", type = "wrench", scale = "count"), expected_tss)
  expect_null(compute_offset(counts, "none"))
  expect_null(compute_offset(counts, "none", scale = "count"))
})

test_that("compute_offset( , 'GMPR') provides correct answers on the scale count", {
  sizes <- c(1, 1, 1)
  counts <- sizes %o% 1:10
  gmpr <- sapply(seq_along(sizes), function(i) { geom_mean(sizes[i]/sizes[-i]) } )
  expected_tss <- sizes * sum(1:10)

  expect_equal(compute_offset(counts, "GMPR"), gmpr)
  expect_equal(compute_offset(counts, "GMPR", scale = "count"), gmpr * expected_tss)
})


test_that("compute_offset provides correct answers for single row matrices", {
  sizes <- c(2)
  counts <- sizes %o% 1:10
  median_scale_size <- median(sizes)
  geom_mean_size <- exp(mean(log(sizes)))

  expect_equal(compute_offset(counts, "TSS"),  sizes * sum(1:10))
  expect_equal(compute_offset(counts, "CSS"),  sizes / median_scale_size)
  expect_equal(compute_offset(counts, "RLE"),  sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "TMM"),  sizes / geom_mean_size)
  expect_error(compute_offset(counts, "GMPR"), "GMPR is not defined when there is only one sample.")
  expect_error(compute_offset(counts, "Wrench"), "Wrench is not defined when there is only one sample.")
  expect_null(compute_offset(counts, "none"))
})

test_that("compute_offset provides correct answers for single column matrices", {
  sizes <- c(1, 2, 5, 6)
  counts <- sizes %o% 1
  median_scale_size <- median(sizes)
  geom_mean_size <- exp(mean(log(sizes)))
  gmpr <- sapply(seq_along(sizes), function(i) { exp(mean(log(sizes[i]/sizes[-i]))) } )

  expect_equal(compute_offset(counts, "TSS"),  sizes)
  expect_equal(compute_offset(counts, "CSS"),  sizes / median_scale_size)
  expect_equal(compute_offset(counts, "RLE"),  sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "TMM"),  sizes / geom_mean_size)
  expect_equal(compute_offset(counts, "GMPR"), gmpr)
  expect_equal(compute_offset(counts, "Wrench"), sizes / geom_mean_size)
  expect_null(compute_offset(counts, "none"))
})

test_that("compute_offset provides correct answers for identical samples", {
  sizes <- rep(1, 5)
  counts <- sizes %o% 1:10

  expect_equal(compute_offset(counts, "TSS"),  sizes * sum(1:10))
  expect_equal(compute_offset(counts, "CSS"),  sizes)
  expect_equal(compute_offset(counts, "RLE"),  sizes)
  expect_equal(compute_offset(counts, "GMPR"), sizes)
  expect_equal(compute_offset(counts, "TMM"), sizes)
  expect_equal(compute_offset(counts, "Wrench"), sizes)
  expect_null(compute_offset(counts, "none"))
})

test_that("compute_offset fails with an informative error when given a data.frame", {
  sizes <- rep(1, 5)
  counts <- sizes %o% 1:10

  expect_error(compute_offset(counts, data.frame(counts)),
               regexp = "You supplied a data.frame to compute_offset(). Did you mean to supply a numeric matrix?
  Try converting your data.frame to a matrix with as.matrix().",
               fixed = TRUE)

})

test_that("offset_rle provides correct answers when adding pseudocounts", {
  sizes <- c(1, 2)
  counts <- sizes %o% rep(1, 10)
  geom_mean_size <- exp(mean(log(sizes+1)))
  ## External consistency
  expect_equal(compute_offset(counts, "RLE", pseudocount = 1),  (sizes+1) / geom_mean_size)
  ## Internal consistency
  expect_equal(compute_offset(counts, "RLE", pseudocount = 1),
               compute_offset(counts + 1, "RLE"))
})

## low or no redundancy across samples
test_that("offset_css throws a warning for nearly samples with limited inner variations", {
  sizes <- seq(from = 0.8, to = 1.2, by = 0.1)
  counts <- sizes %o% rep(1, 10)
  expect_warning(compute_offset(counts, "CSS"),
                 "No instability detected in quantile distribution across samples, falling back to scaled TSS normalization.")
})

test_that("offset_css throws a warning when a sample has less than two positive counts", {
  #      [,1] [,2] [,3]
  # [1,]    1    1    0
  # [2,]    0    0    1
  counts <- matrix(c(1, 1, 0,
                     0, 0, 1),
                   nrow = 2, byrow = TRUE)
  expect_warning(compute_offset(counts, "CSS"),
                 "Some samples only have 1 positive values. Can't compute quantiles and fall back to TSS normalization")
})

test_that("offset_rle throws a warning when data is too sparse but samples share a common species", {
  ## One common species, 4 specific species
  # [,1] [,2] [,3] [,4] [,5]
  # [1,]    1    1    0    1    0
  # [2,]    1    0    1    0    1
  counts <- matrix(c(1, 1, 0, 1, 0,
                     1, 0, 1, 0, 1),
                   nrow = 2, byrow = TRUE)
  expect_warning(compute_offset(counts, "RLE"),
                 "Because of high sparsity, some samples have null or infinite offset.")
})

test_that("offset_rle fails when no species is shared across samples", {
  #      [,1] [,2] [,3]
  # [1,]    1    1    0
  # [2,]    0    0    1
  counts <- matrix(c(1, 1, 0,
                     0, 0, 1),
                   nrow = 2, byrow = TRUE)
  expect_error(compute_offset(counts, "RLE"), "Samples do not share any common species, RLE normalization failed.")
})

test_that("offset_rle succeeds when no species is shared across samples but poscounts is activated", {
  #      [,1] [,2] [,3]
  # [1,]    1    1    0
  # [2,]    0    0    1
  counts <- matrix(c(1, 1, 0,
                     0, 0, 1),
                   nrow = 2, byrow = TRUE)
  expect_equal(compute_offset(counts, "RLE", type = "poscounts"), rep(1, 2))
})

test_that("offset_gmpr fails when a sample shares no species with any other sample", {
  #      [,1] [,2] [,3]
  # [1,]    1    1    0
  # [2,]    0    0    1
  counts <- matrix(c(1, 1, 0,
                     0, 0, 1),
                   nrow = 2, byrow = TRUE)
  expect_error(compute_offset(counts, "GMPR"),
               "Some sample(s) do not share any species with other samples, GMPR normalization failed.",
               fixed = TRUE)
  ## repaired when some species are shared
  counts[2, 1] <- 1
  expect_equal(compute_offset(counts, "GMPR"), c(1, 1))
})

test_that("offset_wrench works correctly with different groups and identical samples in each group", {
  sizes <- c(1, 2, 5, 6, 7)
  counts_A <- c(9L, 4L, 7L, 1L, 2L, 5L, 3L, 11L, 10L, 8L, 6L)
  counts_B <- c(1L, 5L, 10L, 2L, 6L, 7L, 8L, 3L, 11L, 4L, 9L)
  conditions <- rep(c('A', 'B'), times = c(2, 3))
  counts <- rbind(
    sizes[1:2] %o% counts_A, ## group A
    sizes[3:5] %o% counts_B  ## group B
  )
  comp_factors <- colMeans(5 * cbind(A = counts_A, B = counts_B) / (2 * counts_A + 3 * counts_B))[conditions] %>% unname()
  expect_equal(offset_wrench(counts, groups = conditions, type = "simple"),
               comp_factors / geom_mean(comp_factors) * sizes / geom_mean(sizes))
  expect_equal(offset_wrench(counts, groups = conditions, type = "wrench"),
               comp_factors / geom_mean(comp_factors) * sizes / geom_mean(sizes))

  ## equal variances for all species
  sizes <- c(1, 2, 3, 5, 6, 7)
  counts_A <- rep(c(1L, 2L, each = 5))
  counts_B <- rep(c(2L, 1L, each = 5))
  counts <- rbind(
    sizes[1:3] %o% counts_A, ## group A
    sizes[4:6] %o% counts_B  ## group B
  )
  expect_equal(offset_wrench(counts, type = "simple"), sizes / geom_mean(sizes))
  expect_equal(offset_wrench(counts, type = "wrench"), sizes / geom_mean(sizes))
})



## Numeric offset
test_that("offset_numeric fails when the offsets are incompatible with the counts table", {
  #   a b
  # A 1 4
  # B 2 5
  # C 3 6
  counts <- structure(1:6, .Dim = 3:2, .Dimnames = list(c("A", "B", "C"),
                                                      c("a", "b")))
  offset <- counts
  ## No sample names
  expect_error(offset_numeric(counts, unname(offset[, 1])),
               "Rownames are used for sample matching.\nPlease specify them in the offset vector/matrix.",
               fixed = TRUE
  )
  ## No offset for some samples
  rownames(offset) <- c("A", "B", "c")
  expect_error(offset_numeric(counts, offset),
               "Sample(s) C from the count table lack an offset.\nConsider checking your offset (orientation, rownames).", fixed = TRUE
               )
  ##  Wrong number of features
  rownames(offset) <- c("A", "B", "C")
  expect_error(offset_numeric(counts, offset[, c(1, 1, 2)]),
               "There should be one offset per feature in the count table.\nYou have 2 features but 3 offsets.",
               fixed = TRUE
  )
})

test_that("offset_numeric works for vectors and column-matrices.", {
  #   a b
  # A 1 4
  # B 2 5
  # C 3 6
  counts <- structure(1:6, .Dim = 3:2, .Dimnames = list(c("A", "B", "C"),
                                                        c("a", "b")))
  offset <- c(A = 1, C = 3, B = 2)
  ## No difference between vectors and column matrices
  expect_equal(offset_numeric(counts, offset),
               offset_numeric(counts, as.matrix(offset)))
  ##  Correct dimensions
  expect_equal(dim(offset_numeric(counts, offset)),
               dim(counts))
  ## Values in the correct order
  expect_equal(rownames(offset_numeric(counts, offset)),
               rownames(counts))
  ## Correct values
  expect_equal(offset_numeric(counts, offset)[, 1],
               offset[rownames(counts)])
})

test_that("offset_numeric works for matrices.", {
  #   a b
  # A 1 4
  # B 2 5
  # C 3 6
  counts <- structure(1:6, .Dim = 3:2, .Dimnames = list(c("A", "B", "C"),
                                                        c("a", "b")))
  offset <- rbind(counts, counts)
  rownames(offset) <- LETTERS[6:1] ## too many offsets
  ## Correct values
  expect_equal(offset_numeric(counts, offset),
               offset[c("A", "B", "C"), ])
})

## Test helper functions --------------------------------------------------------------------------
test_that("geom_mean works as intented", {
  x <- c(1, 2, 4)
  expect_equal(geom_mean(rep(1, 4)), 1)
  expect_equal(geom_mean(x), 2)
  ## na.rm works as intended
  expect_equal(geom_mean(c(x, NA), na.rm = TRUE), 2) ## removes NA
  expect_equal(geom_mean(c(x, NA), na.rm = FALSE), NA_real_)
  ## poscounts works as intended
  expect_equal(geom_mean(c(x, 0), poscounts = FALSE), 0)
  expect_equal(geom_mean(c(x, 0), poscounts = TRUE), 2)
})

test_that("species_variance works as intented", {
  sizes <- c(1, 2, 6, 8)
  counts_A <- 1:10
  counts_B <- 10:1
  conditions <- rep(c('A', 'B'), times = c(2, 2))
  counts <- rbind(
    sizes[1:2] %o% counts_A, ## group A
    sizes[3:4] %o% counts_B  ## group B
  )
  ## Null variances when correcting for condition
  expect_equal(species_variance(counts, conditions, depths_as_offset = TRUE), rep(.Machine$double.eps, 10))
  ## variances equal to [(log(i) - log(i * (10-i) / 2)]^2 * 4 / (n - 1)
  expect_equal(species_variance(counts, depths_as_offset = TRUE), (log(1:10) - log(10:1))^2 / 3)
})


## Test prepare_data functions --------------------------------------------------------------------------

test_that("prepare_data drops samples with no positive counts", {
  counts[1:2, ] <- 0
  expect_warning(result <- prepare_data(counts, covariates, offset = "none"),
                 "Sample(s) 1 and 2 dropped for lack of positive counts.",
                 fixed = TRUE
                 )
  expect_equal(dim(result), c(nrow(counts)-2, ncol(covariates)+1))
  expect_equal(dim(result$Abundance), c(nrow(counts)-2, ncol(counts)))
})

test_that("prepare_data replace NA with 0s", {
  counts[1:2, 1] <- NA
  expect_warning(result <- prepare_data(counts, covariates, offset = "none"),
                 "NA values in count table replaced with 0.")
  expect_equal(dim(result), c(nrow(counts), ncol(covariates)+1))
  expect_equal(dim(result$Abundance), c(nrow(counts), ncol(counts)))
})

result <- data.frame(
  Abundance = NA,
  covariates
  )
result$Abundance <- counts

test_that("prepare_data succeeds on simple data", {
  expect_identical(prepare_data(counts, covariates, offset = "none"),
                   result)
})

test_that("prepare_data succeeds on simple data with missing names", {
  expect_warning(res <- prepare_data(`rownames<-`(counts, NULL),
                                     covariates, offset = "none"))
  expect_identical(res,
                   ## small hack to account for the fact that setting rownames via rownames<-
                   ## changes its mode to character
                   `rownames<-`(result, rownames(result))
  )
})

test_that("prepare_data succeeds on simple data with missing names and numeric offset", {
  expect_warning(res <- prepare_data(`rownames<-`(counts, NULL),
                                     covariates,
                                     offset = rowSums(counts)))
  expect_equal(dim(res$Offset), dim(counts))
  expect_equal(rownames(res$Offset), as.character(1:nrow(res)))
  expect_equal(colnames(res$Offset), colnames(counts))
})

test_that("prepare_data provides automatic variable names when missing from covariate", {
  res <- prepare_data(counts, unname(covariates))
  expect_identical(colnames(res),
                   c("Abundance", paste0("Variable", 1:ncol(covariates)), "Offset"))
})

test_that("prepare_data succeeds when specifying a numeric offset", {
  ## On simple data
  res <- prepare_data(counts, covariates, offset = "TSS")
  res$Offset <- matrix(rep(res$Offset, ncol(counts)),
                       ncol = ncol(counts),
                       dimnames = dimnames(counts))
  expect_identical(prepare_data(counts, covariates, offset = rowSums(counts)),
                   res)
  ## On strange data (transposed count matrix)
  expect_identical(prepare_data(t(counts), covariates, offset = rowSums(counts)),
                   res)
})

test_that("prepare_data works on 1 column abundance matrices", {
  expect_warning(toy_data <- prepare_data(
    counts     = matrix(c(1, 3, 1, 1), ncol = 1),
    covariates = data.frame(Cov = c("A", "B", "A", "A")),
    offset     = rep(0, 4)
  ))
  expect_equal(dim(toy_data), c(4, 3))
  expect_equal(dim(toy_data$Abundance), c(4, 1))
})

## Test prepare_data_* functions ------------------------------------------------------------------------

# test_that("prepare_data_from_biom fails when covariates data.frame is missing", {
#   biom <- biomformat::make_biom(data = t(counts), sample_metadata = NULL)
#   expect_error(prepare_data_from_biom(biom, offset = "none"))
# })
#
# test_that("prepare_data_from_biom succeeds on biom specified by file name.", {
#   biom_file <- system.file("extdata", "rich_sparse_otu_table.biom", package = "biomformat")
#   biom <- biomformat::read_biom(biom_file)
#   expect_equal(prepare_data_from_biom(biom_file),
#                prepare_data_from_biom(biom))
# })
#
# test_that("prepare_data_from_biom succeeds on proper biom objects (but converts all columns to character...)", {
#   biom <- biomformat::make_biom(data = t(counts),
#                                sample_metadata = lapply(covariates, as.character) %>% as.data.frame)
#   result_biom <- result
#   result_biom[-1] <- lapply(result_biom[-1], as.character)
#   rownames(result_biom) <- as.character(rownames(result_biom))
#   expect_equal(prepare_data_from_biom(biom, offset = "none"),
#                result_biom)
# })
#
# ## massage rownames to fit phyloseq sample naming conventions
# rownames(counts) <- rownames(covariates) <- paste0("Sample", 1:nrow(covariates))
#
# test_that("prepare_data_from_phyloseq fails when covariate data.frame is missing", {
#   physeq <- phyloseq::phyloseq(phyloseq::otu_table(counts, taxa_are_rows = FALSE))
#   expect_error(prepare_data_from_phyloseq(physeq, offset = "none"),
#                "physeq should be a phyloseq object.")
#   physeq <- phyloseq::phyloseq(
#     phyloseq::otu_table(counts, taxa_are_rows = FALSE),
#     phyloseq::tax_table(matrix(rep("", ncol(counts)),
#                                dimnames = list(colnames(counts), "Kingdom")))
#   )
#   expect_error(prepare_data_from_phyloseq(physeq, offset = "none"),
#                paste("No covariates detected in physeq Consider:",
#                      "- extracting count data from biom with as(otu_table(physeq), \"matrix\")",
#                      "- preparing a covariates data.frame",
#                      "- using prepare_data instead of prepare_data_from_phyloseq",
#                      sep = "\n"),
#                fixed = TRUE)
# })
#
# test_that("prepare_data_from_phyloseq succeeds on proper phyloseq class objects", {
#   rownames(counts) <- rownames(covariates) <- paste0("Sample", 1:nrow(covariates))
#   physeq <- phyloseq::phyloseq(
#     phyloseq::otu_table(counts, taxa_are_rows = FALSE),
#     phyloseq::sample_data(covariates)
#     )
#   result_physeq <- result; rownames(result_physeq$Abundance) <- rownames(result_physeq) <- paste0("Sample", 1:nrow(covariates))
#   expect_equal(prepare_data_from_phyloseq(physeq, offset = "none"), result_physeq)
# })
PLN-team/PLNmodels documentation built on April 15, 2024, 9:01 a.m.