testrds <- function(fn) readRDS(test_path("data", "summarize_dataset", fn))
# test_summarize_dataset --------------------------------------------------
test_that("summarize_dataset produces additional summaries", {
results <- testrds("results.rds")
results_expected <- testrds("results_mod.rds")
results_mod <- summarize_dataset(results)
expect_equal(results_mod, results_expected)
})
test_that("summarize_dataset works with known genotypes", {
# Several additional things should happen if genotypes_known is given to
# summarize_dataset. The extra data frame should be appended to the results
# list, and a new distance matrix should be present comparing samples to
# known genotypes.
# For cases where a Name column is present in the dataset data frame at the
# start (and so is now present in the results summary data frame) the
# summary data frame will be expanded with three additional columns.
# Two (CorrectAllele1Seq and CorrectAllele2Seq) will track the correct
# sequences, ordered to match the called alleles where possible.
# GenotypeResult will provide a category for each case (row): Correct,
# Incorrect, Dropped Allele, and Blank.
results <- testrds("results_known.rds")
results_expected <- testrds("results_known_mod.rds")
genotypes_known <- testrds("genotypes_known.rds")
results_mod <- summarize_dataset(results, genotypes_known = genotypes_known)
expect_equal(results_mod, results_expected)
})
# test_make_dist_mat ------------------------------------------------------
test_that("make_dist_mat produces a valid distance matrix", {
results <- testrds("results.rds")
dist_mat_expected <- testrds("dist_mat.rds")
dist_mat <- make_dist_mat(results$summary)
expect_equal(dist_mat, dist_mat_expected)
# row and column names should both be equal to the sample identifiers
samps <- unique(results$summary$Sample)
expect_equal(rownames(dist_mat), samps)
expect_equal(colnames(dist_mat), samps)
# For our test data samples are far apart from each other, but with no
# missing entries the diagonal is zero.
dists <- matrix(7, nrow = length(samps), ncol = length(samps))
diag(dists) <- 0
expect_true(all(dist_mat == dists))
})
test_that("make_dist_mat can handle just one sample", {
# TODO
# results <- testrds("results.rds")
# summary_stub <- subset(results$summary, Sample == 1)
# dist_mat <- make_dist_mat(summary_stub)
# dist_mat_expected <- NULL # TODO
# expect_identical(dist_mat, dist_mat_expected)
})
test_that(
"make_dist_mat_known produces sample-to-individual distance matrix", {
skip("test not yet implemented")
})
# test_calc_genotype_distance ---------------------------------------------
test_that("calc_genotype_distance scores genotypes correctly", {
g1 <- c(5, 10, 200, 204, 37, 37, 180, 184, 190, 190)
g2 <- c(5, 5, 204, 208, 37, 39, 176, 180, 190, 190)
d <- calc_genotype_distance(g1, g2)
expect_equal(d, 4)
})
test_that("calc_genotype_distance handles multiple data types", {
# As written it should work with character, numeric, whatever-- it just uses
# the not-equal operator.
g_num <- c(1, 2, 3, 4)
g_alpha <- c("a", "b", "c", "d")
d <- calc_genotype_distance(g_num, g_alpha)
d_num <- calc_genotype_distance(g_num, g_num)
d_alpha <- calc_genotype_distance(g_alpha, g_alpha)
expect_equal(d, 4)
expect_equal(d_num, 0)
expect_equal(d_alpha, 0)
})
test_that("calc_genotype_distance is commutative", {
g1 <- c(5, 10, 200, 204, 37, 37, 180, 184, 190, 190)
g2 <- c(5, 5, 204, 208, 37, 39, 176, 180, 190, 190)
d.fwd <- calc_genotype_distance(g1, g2)
d.rev <- calc_genotype_distance(g2, g1)
expect_equal(d.fwd, 4)
expect_equal(d.rev, 4)
})
test_that("calc_genotype_distance handles NA entries", {
g1 <- c(5, 5, 200, 204, 37, 39)
g2 <- c(NA, NA, 204, 208, 37, 37)
g3 <- c(NA, NA, 200, 204, 37, 39)
# The two NA entries will by default count towards the distance whether the
# other genotype has an NA there or not. If na_reject is FALSE, NAs are
# treated like any other entry.
d12 <- calc_genotype_distance(g1, g2)
d12_na <- calc_genotype_distance(g1, g2, na_reject = FALSE)
d23 <- calc_genotype_distance(g2, g3)
d23_na <- calc_genotype_distance(g2, g3, na_reject = FALSE)
expect_equal(d12, 4)
expect_equal(d12_na, 2)
expect_equal(d23, 4)
expect_equal(d23_na, 2)
})
test_that("calc_genotype_distance invalid genotype lengths", {
# These are valid but not the same length
g1 <- c(5, 5, 200, 204)
g2 <- c(NA, NA, 204, 208, 37, 39)
# This has an odd length
g3 <- c(NA, NA, 204, 208, 37)
expect_warning(calc_genotype_distance(g1, g2))
expect_warning(calc_genotype_distance(g3, g3))
})
# test_align_alleles ------------------------------------------------------
test_that("align_alleles produces sequence alignments", {
results <- testrds("results.rds")
alignments_expected <- testrds("alignments.rds")
alignments <- align_alleles(results$summary)
expect_equal(alignments, alignments_expected)
expect_equal(names(alignments), levels(results$summary$Locus))
expect_equal(names(as.character(alignments[["A"]])),
c("182_2", "194_1", "162_2", "178_1"))
expect_equal(as.character(alignments[["A"]])[[1]],
paste0("TATCACTGGTGTTAGTCCTCTGTAGATAGA",
"TAGATAGATAGATAGATAGATAGATAGATA",
"GATAGATAGATAGATAGATAGATAGATAGA",
"TAGATAGATAGATAGATAGATAGATAGATA",
"GATAGATAGATAGATAGATAGATAGATAGA",
"TAGATAG------------ATAGACACAGT",
"TGTGTGAGCCAGTC"))
})
test_that("align_alleles produces per-allele alignments", {
# By default the allele sequences are dereplicated and then aligned, but
# this tests the other option.
results <- testrds("results.rds")
alignments_expected <- testrds("alignments_no_derep.rds")
alignments <- align_alleles(results$summary, derep = FALSE)
expect_equal(alignments, alignments_expected)
expect_equal(names(alignments), levels(results$summary$Locus))
n1 <- paste(rownames(subset(results$summary, Locus == "A")), 1, sep = "_")
n2 <- paste(rownames(subset(results$summary, Locus == "A")), 2, sep = "_")
expect_equal(sort(names(as.character(alignments[["A"]]))),
sort(c(n1, n2)))
})
test_that("align_alleles works for empty sequences", {
# Empty sequences should be handled automatically before the msa function is
# called.
results <- testrds("results.rds")
# Empty out one entry for locus A
idx <- which(results$summary$Locus == "A")
results$summary[idx[1], "Allele1Seq"] <- NA
results$summary[idx[1], "Allele2Seq"] <- NA
# Locus A's alignment should be OK, just with one stub entry
alignments <- align_alleles(results$summary)
expect_true(any(grepl("^-+$", as.character(alignments$A))))
})
test_that("align_alleles works for empty sequence sets", {
# Completely empty sequence lists for a given locus are a special case; for
# those we should just get "NA" for the whole entry.
results <- testrds("results.rds")
# Empty out all sequences for locus A
idx <- results$summary$Locus == "A"
results$summary[idx, "Allele1Seq"] <- NA
results$summary[idx, "Allele2Seq"] <- NA
# This should still work, just with NA for A's alignment
alignments <- align_alleles(results$summary)
expect_equal(alignments$A, NULL)
})
test_that("align_alleles works for identical sequence sets", {
# Apparently msa() fails when given a single sequence (derep=TRUE for
# align_alleles), throwing a "There is an invalid aln file!" error.
# I'll do its validating for it and account for this inside align_alleles.
results <- testrds("results.rds")
# Overwrite Locus A seqs with same content
idx <- which(results$summary$Locus == "A")
results$summary[idx, "Allele1Seq"] <- results$summary[idx[1],
"Allele1Seq"]
results$summary[idx, "Allele2Seq"] <- NA
alignments <- align_alleles(results$summary)
# This should still work, just with the same sequence back for A's
# alignment
expect_equal(unname(alignments$A),
as.character(results$summary[idx[1], "Allele1Seq"]))
})
# test_tally_cts_per_locus ------------------------------------------------
test_that("tally_cts_per_locus counts matches per locus per sample", {
results <- testrds("results.rds")
cts_expected <- testrds("cts_per_locus.rds")
cts_observed <- tally_cts_per_locus(results)
# We should see 5000 reads total, and then 17 split away for each of the
# four other loci. Rownames should match sample row IDs.
expect_equal(cts_observed, cts_expected)
})
test_that("tally_cts_per_locus uses only loci present in samples", {
# What if we drop Locus 2 from the results, as though we'd only analyzed
# three loci even though we have four defined in the attributes table?
# Locus 2 should not show up in the cts_per_locus data frame.
# NOTE: Currently the Total column shows the total across the loci shown,
# not the grand total of reads in the raw file.
results <- testrds("results.rds")
cts_expected <- testrds("cts_per_locus.rds")
# We should see 5000 reads total, and then 17 split away for each of the
# four other loci. Rownames should match sample row IDs.
ids_rm <- c("1-2", "2-2", "3-2")
cts_expected <- subset(
cts_expected, ! rownames(cts_expected) %in% ids_rm, select = -`2`)
cts_expected$Total <- rowSums(cts_expected[, 3:5])
results$summary <- results$summary[-match(ids_rm,
rownames(results$summary)), ]
results$samples <- results$samples[-match(ids_rm,
names(results$samples))]
results$summary$Locus <- droplevels(results$summary$Locus)
# (This is a slightly goofy way of testing this outcome since each data
# frame in results$files still will have "2" in the MatchingLocus column,
# but it should work.)
cts_observed <- tally_cts_per_locus(results)
expect_equal(cts_observed, cts_expected)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.