tests/testthat/test-ts.R

skip_if(!is_slendr_env_present())

init_env(quiet = TRUE)

map <- world(xrange = c(0, 3500), yrange = c(0, 700), landscape = "blank")

N <- 100; y <- 350; r <- 240
p1 <- population("pop1", time = 1, N = N, map = map, center = c(750, y), radius = r)
p2 <- population("pop2", parent = p1, time = 2, N = N, map = map, center = c(1750, y), radius = r)

res <- 1
desc <- "Test model without CRS"

model_dir <- file.path(tempdir(), "ts")

model <- compile_model(
  populations = list(p1, p2),
  generation_time = 1, resolution = res, simulation_length = 300,
  competition = 10, mating = 10, dispersal = 5,
  path = model_dir, overwrite = TRUE, force = TRUE,
  description = desc
)

samples <- rbind(
  schedule_sampling(model, times = 2, list(p1, 2), list(p2, 2)),
  schedule_sampling(model, times = 300, list(p1, 10), list(p2, 10))
)

slim_ts <- normalizePath(tempfile(fileext = ".trees"), winslash = "/", mustWork = FALSE)
msprime_ts <- normalizePath(tempfile(fileext = ".trees"), winslash = "/", mustWork = FALSE)

locations_file <- normalizePath(tempfile(fileext = ".gz"), winslash = "/", mustWork = FALSE)

slim(model, sequence_length = 100000, recombination_rate = 0,
     locations = locations_file, burnin = 0,
     method = "batch", random_seed = 314159,
     samples = samples, verbose = FALSE) %>% ts_write(slim_ts)

msprime(model, sequence_length = 100000, recombination_rate = 0,
        random_seed = 314159, samples = samples, verbose = FALSE) %>% ts_write(msprime_ts)

test_that("ts_read fails gracefully on incorrect filename", {
  expect_error(ts <- ts_read(model, file = paste0(msprime_ts, "blah")), "File not found")
})

test_that("ts_read generates an object of the correct type (SLiM)", {
  ts <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0) %>% ts_simplify()
  expect_true(inherits(ts, "tskit.trees.TreeSequence"))
})

test_that("unnecessary recapitation is prevented (msprime)", {
  expect_warning(
    ts_read(model, file = msprime_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0),
    "There is no need to recapitate"
  )
})

test_that("ts_read generates an object of the correct type (msprime)", {
  ts <- ts_read(model, file = msprime_ts)
  expect_true(inherits(ts, "tskit.trees.TreeSequence"))
})

test_that("ts_write and ts_read result in the same tree sequence (SLiM)", {
  ts1 <- ts_read(model, file = slim_ts)
  file <- paste0(tempfile(), "_slim.trees")
  ts_write(ts1, file)
  ts2 <- ts_read(model, file = file)

  data1 <- ts_nodes(ts1); data2 <- ts_nodes(ts2)
  samples1 <- ts_samples(ts1); samples2 <- ts_samples(ts2)

  expect_equal(data1, data2)
  expect_equal(samples1, samples2)
})

test_that("ts_write and ts_read result in the same tree sequence (msprime)", {
  ts1 <- ts_read(model, file = msprime_ts)
  file <- paste0(tempfile(), "_msprime.trees")
  ts_write(ts1, file)
  ts2 <- ts_read(model, file = file)

  data1 <- ts_nodes(ts1); data2 <- ts_nodes(ts2)
  samples1 <- ts_samples(ts1); samples2 <- ts_samples(ts2)

  expect_equal(data1, data2)
  expect_equal(samples1, samples2)
})

test_that("tree sequence contains the right number of sampled individuals (SLiM)", {
  ts <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0) %>%
    ts_simplify()
  counts <- ts_nodes(ts) %>%
    dplyr::filter(sampled) %>%
    dplyr::as_tibble() %>%
    dplyr::distinct(ind_id, time, pop) %>%
    dplyr::count(time, pop)
  expect_true(all(counts == samples[, c("time", "pop", "n")]))
})

test_that("locations and times in the tree sequence match values saved by SLiM", {
  ts <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0) %>% ts_simplify()
  individuals <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::distinct(ind_id, .keep_all = TRUE)
  true_locations <- readr::read_tsv(locations_file, col_types = "iicidd") %>%
    dplyr::mutate(time = convert_slim_time(gen, model))
  joined <- dplyr::inner_join(individuals, true_locations,
                              by = c("pedigree_id" = "ind")) %>%
    dplyr::mutate(location_x = as.vector(sf::st_coordinates(location)[, 1]),
                  location_y = as.vector(sf::st_coordinates(location)[, 2])) %>%
    as.data.frame()
  # for some reason the values differ in terms of decimal digits saved?
  # but they *are* equal
  expect_true(all.equal(joined$x, joined$location_x, tolerance = 1e-5))
  expect_true(all.equal(joined$y, joined$location_y, tolerance = 1e-5))
  expect_true(all.equal(joined$time.x, joined$time.y))
})

test_that("extracted individual, node, edge, and mutation counts match the tree sequence (SLiM)", {
  ts1 <- ts_read(model, file = slim_ts)
  table1 <- ts_nodes(ts1)

  ts2 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0)
  table2 <- ts_nodes(ts2)

  ts3 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0) %>% ts_simplify()
  table3 <- ts_nodes(ts3)

  suppressWarnings(ts4 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0) %>% ts_mutate(mutation_rate = 1e-6))
  table4 <- ts_nodes(ts4)

  expect_true(ts1$num_individuals == sum(!is.na(unique(table1$ind_id))))
  expect_true(ts2$num_individuals == sum(!is.na(unique(table2$ind_id))))
  expect_true(ts3$num_individuals == sum(!is.na(unique(table3$ind_id))))
  expect_true(ts4$num_individuals == sum(!is.na(unique(table4$ind_id))))

  expect_true(ts1$num_nodes == nrow(table1))
  expect_true(ts2$num_nodes == nrow(table2))
  expect_true(ts3$num_nodes == nrow(table3))
  expect_true(ts4$num_nodes == nrow(table4))

  expect_true(all(sort(table1$node_id) == seq(0, ts1$num_nodes - 1)))
  expect_true(all(sort(table2$node_id) == seq(0, ts2$num_nodes - 1)))
  expect_true(all(sort(table3$node_id) == seq(0, ts3$num_nodes - 1)))
  expect_true(all(sort(table4$node_id) == seq(0, ts4$num_nodes - 1)))

  expect_true(all(sort(unique(table1$ind_id)) == seq(0, ts1$num_individuals - 1)))
  expect_true(all(sort(unique(table2$ind_id)) == seq(0, ts2$num_individuals - 1)))
  expect_true(all(sort(unique(table3$ind_id)) == seq(0, ts3$num_individuals - 1)))
  expect_true(all(sort(unique(table4$ind_id)) == seq(0, ts4$num_individuals - 1)))

  expect_true(ts1$num_edges == nrow(ts_table(ts1, "edges")))
  expect_true(ts2$num_edges == nrow(ts_table(ts2, "edges")))
  expect_true(ts3$num_edges == nrow(ts_table(ts3, "edges")))
  expect_true(ts4$num_edges == nrow(ts_table(ts4, "edges")))

  expect_true(ts1$num_mutations == nrow(ts_table(ts1, "mutations")))
  expect_true(ts2$num_mutations == nrow(ts_table(ts2, "mutations")))
  expect_true(ts3$num_mutations == nrow(ts_table(ts3, "mutations")))
  expect_true(ts4$num_mutations == nrow(ts_table(ts4, "mutations")))
})

test_that("extracted individual, node, edge, and mutation counts match the tree sequence (msprime)", {
  # this is not a super useful test as no recapitation or simplification would
  # be performed -- but a good sanity check to enforce that ts1 == ts2 == ts3
  ts1 <- ts_read(model, file = msprime_ts)
  table1 <- ts_nodes(ts1)

  suppressWarnings(ts2 <- ts_read(model, file = msprime_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0))
  table2 <- ts_nodes(ts2)

  suppressWarnings(ts3 <- ts_read(model, file = msprime_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0) %>% ts_simplify())
  table3 <- ts_nodes(ts3)

  suppressWarnings(ts4 <- ts_read(model, file = msprime_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0) %>% ts_simplify() %>% ts_mutate(mutation_rate = 1e-6))
  table4 <- ts_nodes(ts4)

  expect_true(ts1$num_individuals == sum(!is.na(unique(table1$ind_id))))
  expect_true(ts2$num_individuals == sum(!is.na(unique(table2$ind_id))))
  expect_true(ts3$num_individuals == sum(!is.na(unique(table3$ind_id))))
  expect_true(ts4$num_individuals == sum(!is.na(unique(table4$ind_id))))

  expect_true(ts1$num_nodes == nrow(table1))
  expect_true(ts2$num_nodes == nrow(table2))
  expect_true(ts3$num_nodes == nrow(table3))
  expect_true(ts4$num_nodes == nrow(table4))

  expect_true(all(sort(table1$node_id) == seq(0, ts1$num_nodes - 1)))
  expect_true(all(sort(table2$node_id) == seq(0, ts2$num_nodes - 1)))
  expect_true(all(sort(table3$node_id) == seq(0, ts3$num_nodes - 1)))
  expect_true(all(sort(table4$node_id) == seq(0, ts4$num_nodes - 1)))

  expect_true(all(sort(unique(table1$ind_id)) == seq(0, ts1$num_individuals - 1)))
  expect_true(all(sort(unique(table2$ind_id)) == seq(0, ts2$num_individuals - 1)))
  expect_true(all(sort(unique(table3$ind_id)) == seq(0, ts3$num_individuals - 1)))
  expect_true(all(sort(unique(table4$ind_id)) == seq(0, ts4$num_individuals - 1)))

  expect_true(ts1$num_edges == nrow(ts_table(ts1, "edges")))
  expect_true(ts2$num_edges == nrow(ts_table(ts2, "edges")))
  expect_true(ts3$num_edges == nrow(ts_table(ts3, "edges")))
  expect_true(ts4$num_edges == nrow(ts_table(ts4, "edges")))

  expect_true(ts1$num_mutations == nrow(ts_table(ts1, "mutations")))
  expect_true(ts2$num_mutations == nrow(ts_table(ts2, "mutations")))
  expect_true(ts3$num_mutations == nrow(ts_table(ts3, "mutations")))
  expect_true(ts4$num_mutations == nrow(ts_table(ts4, "mutations")))

  expect_true(ts1 == ts2)

  # the following is no longer true because simplification must remove potential unary nodes,
  # at the very least
  # expect_true(ts1 == ts3)
})

test_that("simplification works only for samples that are really present", {
  msg <- "The following individuals are not present"
  expect_error(ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0) %>% ts_simplify(simplify_to = "xyz"), msg)
  expect_error(ts_read(model, file = msprime_ts) %>% ts_simplify(simplify_to = "xyz"), msg)
})

test_that("simplification retains only specified samples (SLiM)", {
  simplify_to <- c("pop1_1", "pop1_2", "pop2_7")
  ts <- ts_read(model, file = slim_ts) %>%
    ts_recapitate(Ne = 1, recombination_rate = 0) %>%
    ts_simplify(simplify_to = simplify_to)
  df_samples <- ts_samples(ts) %>% dplyr::arrange(name, time)
  df_data <- ts_nodes(ts) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time)
  expect_true(all(df_data$name == df_samples$name))
  expect_true(all(df_data$time == df_samples$time))
  expect_true(all(df_data$pop == df_samples$pop))

  suppressWarnings(ts2 <- ts_read(model, file = slim_ts) %>%
    ts_recapitate(recombination_rate = 0, Ne = 1))
  simplify_to <- sample(ts_samples(ts2)$name, 10)

  ts2 <- ts_simplify(ts2, simplify_to = simplify_to)
  df_samples2 <- ts_samples(ts2) %>% dplyr::arrange(name, time)
  df_data2 <- ts_nodes(ts2) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time)
  expect_true(all(df_data2$name == df_samples2$name))
  expect_true(all(df_data2$time == df_samples2$time))
  expect_true(all(df_data2$pop == df_samples2$pop))
})

test_that("simplification retains only specified samples (msprime)", {
  simplify_to <- c("pop1_1", "pop1_2", "pop2_7")
  ts <- ts_read(model, file = msprime_ts) %>% ts_simplify(simplify_to = simplify_to)
  df_samples <- ts_samples(ts) %>% dplyr::arrange(name, time)
  df_data <- ts_nodes(ts) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time)
  expect_true(all(df_data$name == df_samples$name))
  expect_true(all(df_data$time == df_samples$time))
  expect_true(all(df_data$pop == df_samples$pop))

  suppressWarnings(ts2 <- ts_read(model, file = msprime_ts) %>% ts_recapitate(recombination_rate = 0, Ne = 1))
  simplify_to <- sample(ts_samples(ts2)$name, 10)

  ts2 <- ts_simplify(ts2, simplify_to = simplify_to)
  df_samples2 <- ts_samples(ts2) %>% dplyr::arrange(name, time)
  df_data2 <- ts_nodes(ts2) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time)
  expect_true(all(df_data2$name == df_samples2$name))
  expect_true(all(df_data2$time == df_samples2$time))
  expect_true(all(df_data2$pop == df_samples2$pop))
})

test_that("ts_samples() names match ts_nodes() information (SLiM)", {
  simplify_to <- c("pop1_1", "pop1_2", "pop2_7")

  ts1 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0)
  ts2 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0)
  ts3 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0) %>%
    ts_simplify(simplify_to = simplify_to)
  simplify_to <- sample(ts_samples(ts1)$name, 10)
  ts4 <- ts_simplify(ts1, simplify_to = simplify_to)

  df_samples1 <- ts_samples(ts1) %>% dplyr::arrange(name, time)
  df_data1 <- ts_nodes(ts1) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data1$name == df_samples1$name))
  expect_true(all(df_data1$time == df_samples1$time))
  expect_true(all(df_data1$pop == df_samples1$pop))

  df_samples2 <- ts_samples(ts2) %>% dplyr::arrange(name, time)
  df_data2 <- ts_nodes(ts2) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data2$name == df_samples2$name))
  expect_true(all(df_data2$time == df_samples2$time))
  expect_true(all(df_data2$pop == df_samples2$pop))

  df_samples3 <- ts_samples(ts3) %>% dplyr::arrange(name, time)
  df_data3 <- ts_nodes(ts3) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data3$name == df_samples3$name))
  expect_true(all(df_data3$time == df_samples3$time))
  expect_true(all(df_data3$pop == df_samples3$pop))

  df_samples4 <- ts_samples(ts4) %>% dplyr::arrange(name, time)
  df_data4 <- ts_nodes(ts4) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data4$name == df_samples4$name))
  expect_true(all(df_data4$time == df_samples4$time))
  expect_true(all(df_data4$pop == df_samples4$pop))
})

test_that("ts_samples() names match ts_nodes() information (msprime)", {
  simplify_to <- c("pop1_1", "pop1_2", "pop2_7")

  ts1 <- ts_read(model, file = msprime_ts)
  ts3 <- ts_read(model, file = msprime_ts) %>% ts_simplify(simplify_to = simplify_to)
  simplify_to <- sample(ts_samples(ts1)$name, 10)
  ts4 <- ts_simplify(ts1, simplify_to = simplify_to)

  df_samples1 <- ts_samples(ts1) %>% dplyr::arrange(name, time)
  df_data1 <- ts_nodes(ts1) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data1$name == df_samples1$name))
  expect_true(all(df_data1$time == df_samples1$time))
  expect_true(all(df_data1$pop == df_samples1$pop))

  df_samples3 <- ts_samples(ts3) %>% dplyr::arrange(name, time)
  df_data3 <- ts_nodes(ts3) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data3$name == df_samples3$name))
  expect_true(all(df_data3$time == df_samples3$time))
  expect_true(all(df_data3$pop == df_samples3$pop))

  df_samples4 <- ts_samples(ts4) %>% dplyr::arrange(name, time)
  df_data4 <- ts_nodes(ts4) %>% stats::na.omit() %>% dplyr::distinct(name, .keep_all = TRUE) %>% dplyr::arrange(name, time) %>% as.data.frame()
  expect_true(all(df_data4$name == df_samples4$name))
  expect_true(all(df_data4$time == df_samples4$time))
  expect_true(all(df_data4$pop == df_samples4$pop))
})

test_that("ts_eigenstrat requires recapitated and mutated data (SLiM)", {
  skip_if(Sys.which("qpDstat") == "")
  ts1 <- ts_read(model, file = slim_ts)
  ts2 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1000, recombination_rate = 0)
  ts3 <- ts_read(model, file = slim_ts) %>% ts_recapitate(Ne = 1, recombination_rate = 0) %>% ts_simplify(simplify_to = c("pop1_1", "pop1_2"))
  ts4 <- ts_read(model, file = slim_ts) %>% ts_recapitate(recombination_rate = 0, Ne = 10000) %>%
    ts_simplify()
  ts5 <- ts_mutate(ts4, mutation_rate = 1e-7)
  ts6 <- ts_read(model, file = slim_ts) %>% ts_recapitate(recombination_rate = 0, Ne = 10000) %>%
    ts_simplify() %>%
    ts_mutate(mutation_rate = 1e-7)

  prefix <- file.path(tempdir(), "eigen")
  expect_error(ts_eigenstrat(ts1, prefix), "Tree sequence was not recapitated")
  expect_error(ts_eigenstrat(ts2, prefix), "Attempting to extract genotypes")
  expect_error(ts_eigenstrat(ts3, prefix), "Attempting to extract genotypes")
  expect_error(ts_eigenstrat(ts4, prefix), "Attempting to extract genotypes")
  suppressMessages(expect_s3_class(ts_eigenstrat(ts5, prefix), "EIGENSTRAT"))
  suppressMessages(expect_s3_class(ts_eigenstrat(ts6, prefix), "EIGENSTRAT"))

  path <- file.path(tempdir(), "gt.vcf.gz")
  expect_error(ts_vcf(ts1, path), "Tree sequence was not recapitated")
  expect_error(ts_vcf(ts2, path), "Attempting to extract genotypes")
  expect_error(ts_vcf(ts3, path), "Attempting to extract genotypes")
  expect_error(ts_vcf(ts4, path), "Attempting to extract genotypes")
  expect_silent(suppressMessages(ts_vcf(ts5, path)))
  expect_silent(suppressMessages(ts_vcf(ts6, path)))
})

test_that("ts_eigenstrat requires recapitated and mutated data (msprime)", {
  skip_if(Sys.which("qpDstat") == "")
  ts1 <- ts_read(model, file = msprime_ts)
  ts3 <- ts_read(model, file = msprime_ts) %>% ts_simplify(simplify_to = c("pop1_1", "pop1_2", "pop2_7"))
  ts4 <- ts_read(model, file = msprime_ts) %>% ts_simplify()
  ts5 <- ts_mutate(ts4, mutation_rate = 1e-7)
  ts6 <- ts_read(model, file = msprime_ts) %>% ts_simplify(simplify_to = c("pop1_1", "pop1_2", "pop2_7")) %>% ts_mutate(mutation_rate = 1e-7)

  # this is not supposed to fail (unlike the previous SLiM case) because all
  # msprime sequences are "recapitated" by virtue of beling coalescent!
  prefix <- file.path(tempdir(), "eigen")
  expect_error(ts_eigenstrat(ts1, prefix), "Attempting to extract genotypes")
  expect_error(ts_eigenstrat(ts3, prefix), "Attempting to extract genotypes")
  expect_error(ts_eigenstrat(ts4, prefix), "Attempting to extract genotypes")
  expect_s3_class(ts_eigenstrat(ts5, prefix), "EIGENSTRAT")
  expect_s3_class(ts_eigenstrat(ts6, prefix), "EIGENSTRAT")

  path <- file.path(tempdir(), "gt.vcf.gz")
  expect_error(ts_vcf(ts1, path), "Attempting to extract genotypes")
  expect_error(ts_vcf(ts3, path), "Attempting to extract genotypes")
  expect_error(ts_vcf(ts4, path), "Attempting to extract genotypes")
  expect_silent(suppressMessages(ts_vcf(ts5, path)))
  expect_silent(suppressMessages(ts_vcf(ts6, path)))
})

test_that("ts_mutate cannot be called on an already mutated tree sequence (SLiM)", {
  ts <- ts_read(model, file = slim_ts)
  ts_mut <- ts_mutate(ts, mutation_rate = 1e-7)
  expect_error(ts_mutate(ts_mut, mutation_rate = 2e-7),
               "Tree sequence already mutated")
})

test_that("ts_mutate cannot be called on an already mutated tree sequence (msprime)", {
  ts <- ts_read(model, file = msprime_ts)
  ts_mut <- ts_mutate(ts, mutation_rate = 1e-7)
  expect_error(ts_mutate(ts_mut, mutation_rate = 2e-7),
               "Tree sequence already mutated")
})

test_that("ts_simplify-ing a non-recapitated tree sequence gives a warning", {
  ts <- ts_read(model, file = slim_ts)
  expect_warning(ts_simplify(ts), "Simplifying a non-recapitated tree sequence.")
  expect_silent(ts_simplify(ts, keep_input_roots = TRUE))
})

test_that("ts_eigenstrat and tsv_cf create correct data (SLiM)", {
  skip_if(Sys.which("qpDstat") == "")
  ts <- ts_read(model, file = slim_ts) %>% ts_recapitate(recombination_rate = 0, Ne = 10000) %>%
    ts_simplify() %>%
    ts_mutate(mutation_rate = 1e-7)

  ts_names <- sort(unique(ts_nodes(ts) %>% dplyr::filter(sampled) %>% .$name))

  # match EIGENSTRAT contents
  prefix <- file.path(tempdir(), "eigen")
  eigenstrat <- suppressMessages(ts_eigenstrat(ts, prefix))
  ind_names <- sort(admixr::read_ind(eigenstrat)$id)
  expect_true(all(ind_names == ts_names))

  # match VCF contents
  path <- file.path(tempdir(), "gt.vcf.gz")
  ts_vcf(ts, path)
  file <- gzfile(path)
  vcf_names <- readLines(file) %>%
    grep("^#CHROM", ., value = TRUE) %>%
    strsplit("\t") %>% .[[1]] %>% .[10 : length(.)] %>% sort
  on.exit(close(file))
  expect_true(all(vcf_names == ts_names))
})

test_that("ts_eigenstrat and tsv_cf create correct data (msprime)", {
  skip_if(Sys.which("qpDstat") == "")
  ts <- ts_read(model, file = msprime_ts) %>% ts_mutate(mutation_rate = 1e-7)

  ts_names <- sort(unique(ts_nodes(ts) %>% .$name))

  # match EIGENSTRAT contents
  prefix <- file.path(tempdir(), "eigen")
  eigenstrat <- suppressMessages(ts_eigenstrat(ts, prefix))
  ind_names <- sort(admixr::read_ind(eigenstrat)$id)
  expect_true(all(ind_names == ts_names))

  # match VCF contents
  path <- file.path(tempdir(), "gt.vcf.gz")
  ts_vcf(ts, path)
  file <- gzfile(path)
  vcf_names <- readLines(file) %>%
    grep("^#CHROM", ., value = TRUE) %>%
    strsplit("\t") %>% .[[1]] %>% .[10 : length(.)] %>% sort
  on.exit(close(file))
  expect_true(all(vcf_names == ts_names))
})

test_that("ts_eigenstrat correctly adds an outgroup when instructed (SLiM)", {
  skip_if(Sys.which("qpDstat") == "")
  ts <- ts_read(model, file = slim_ts) %>% ts_recapitate(recombination_rate = 0, Ne = 10000) %>%
    ts_simplify() %>%
    ts_mutate(mutation_rate = 1e-7)

  ts_names <- sort(unique(ts_nodes(ts) %>% dplyr::filter(sampled) %>% .$name))

  # match EIGENSTRAT contents
  prefix <- file.path(tempdir(), "eigen")
  eigenstrat <- suppressMessages(ts_eigenstrat(ts, prefix, outgroup = "outgroup_ind"))
  ind_names <- sort(admixr::read_ind(eigenstrat)$id)
  expect_true(all(ind_names == c("outgroup_ind", ts_names)))
})

test_that("ts_eigenstrat correctly adds an outgroup when instructed (msprime)", {
  skip_if(Sys.which("qpDstat") == "")
  ts <- ts_read(model, file = msprime_ts) %>% ts_mutate(mutation_rate = 1e-7)

  ts_names <- sort(unique(ts_nodes(ts) %>% .$name))

  # match EIGENSTRAT contents
  prefix <- file.path(tempdir(), "eigen")
  eigenstrat <- suppressMessages(ts_eigenstrat(ts, prefix, outgroup = "outgroup_ind"))
  ind_names <- sort(admixr::read_ind(eigenstrat)$id)
  expect_true(all(ind_names == c("outgroup_ind", ts_names)))
})

test_that("slendr metadata is correctly loaded (spatial model without CRS)", {
  output <- normalizePath(paste0(tempfile(), "spatial_test"), winslash = "/", mustWork = FALSE)

  burnin_length <- 123
  max_attempts <- 3
  RECOMBINATION_RATE <- 0.001
  locations_file <- normalizePath(tempfile(fileext = ".gz"), winslash = "/", mustWork = FALSE)
  seed <- 987
  sequence_length <- 999

  ts <- slim(model, sequence_length = sequence_length, recombination_rate = RECOMBINATION_RATE,
       locations = locations_file, burnin = burnin_length,
       method = "batch", random_seed = seed, max_attempts = max_attempts,
       samples = samples, verbose = FALSE)
  ts_write(ts, output)

  metadata <- ts_metadata(ts)

  expect_true(gsub("slendr_", "", metadata$version) == packageVersion("slendr"))
  expect_true(all(sf::st_bbox(map) == metadata$map$EXTENT))
  expect_true(metadata$map$resolution == res)
  expect_true(is.null(metadata$map$crs))
  expect_true(metadata$description == desc)

  args <- metadata$arguments
  expect_equal(args$BURNIN_LENGTH, burnin_length)
  expect_equal(args$MAX_ATTEMPTS, max_attempts)
  expect_equal(args$RECOMBINATION_RATE, RECOMBINATION_RATE)
  expect_equal(args$SEED, seed)
  expect_equal(args$SEQUENCE_LENGTH, sequence_length)
})

test_that("slendr metadata is correctly loaded (non-spatial SLiM model)", {
  output <- normalizePath(paste0(tempfile(), "non-spatial_SLiM_test"), winslash = "/", mustWork = FALSE)

  burnin_length <- 123
  RECOMBINATION_RATE <- 0.001
  locations_file <- normalizePath(tempfile(fileext = ".gz"), winslash = "/", mustWork = FALSE)
  seed <- 987
  sequence_length <- 999
  spatial <- FALSE

  ts <- slim(model, sequence_length = sequence_length, recombination_rate = RECOMBINATION_RATE,
       locations = locations_file, burnin = burnin_length,
       method = "batch", random_seed = seed,
       samples = samples, verbose = FALSE, spatial = spatial)
  ts_write(ts, output)

  metadata <- ts_metadata(ts)

  expect_true(gsub("slendr_", "", metadata$version) == packageVersion("slendr"))
  expect_true(is.null(metadata$map))
  expect_true(metadata$description == desc)

  args <- metadata$arguments
  expect_equal(args$BURNIN_LENGTH, burnin_length)
  expect_equal(args$RECOMBINATION_RATE, RECOMBINATION_RATE)
  expect_equal(args$SEED, seed)
  expect_equal(args$SEQUENCE_LENGTH, sequence_length)
})

test_that("slendr metadata is correctly loaded (non-spatial msprime model)", {
  output <- paste0(tempfile(), "non-spatial_msprime_test")

  burnin_length <- 123
  RECOMBINATION_RATE <- 0.001
  seed <- 987
  sequence_length <- 999
  spatial <- FALSE

  ts <- msprime(model, sequence_length = sequence_length, recombination_rate = RECOMBINATION_RATE,
       random_seed = seed, samples = samples, verbose = FALSE)
  ts_write(ts, output)
  metadata <- ts_metadata(ts)

  expect_true(gsub("slendr_", "", metadata$version) == packageVersion("slendr"))
  expect_true(is.null(metadata$map))
  expect_true(metadata$description == desc)

  args <- metadata$arguments
  expect_equal(args$RECOMBINATION_RATE, RECOMBINATION_RATE)
  expect_equal(args$SEED, seed)
  expect_equal(args$SEQUENCE_LENGTH, sequence_length)
})

test_that("ts_mutate and mutation through ts_read give the same result (SLiM)", {
  ts <- ts_read(model, file = slim_ts) %>%
    ts_recapitate(random_seed = 123, recombination_rate = 0, Ne = 100) %>% ts_simplify()
  ts_mut1 <- ts_mutate(ts, mutation_rate = 1e-7, random_seed = 123)
  ts_mut2 <- ts_read(model, file = slim_ts) %>% ts_recapitate(random_seed = 123, recombination_rate = 0, Ne = 100) %>% ts_simplify() %>% ts_mutate(mutation_rate = 1e-7, random_seed = 123)
  expect_equal(suppressMessages(ts_genotypes(ts_mut1)),
               suppressMessages(ts_genotypes(ts_mut2)))
})

test_that("ts_mutate and mutation through ts_read give the same result (msprime)", {
  ts <- ts_read(model, file = msprime_ts)
  ts_mut1 <- ts_mutate(ts, mutation_rate = 1e-7, random_seed = 123)
  ts_mut2 <- ts_read(msprime_ts, model) %>% ts_mutate(mutation_rate = 1e-7, random_seed = 123)
  expect_equal(suppressMessages(ts_genotypes(ts_mut1)),
               suppressMessages(ts_genotypes(ts_mut2)))
})

test_that("ts_mutate correctly specifies the SLiM mutation type", {
  ts <- ts_read(slim_ts, model) %>%
    ts_recapitate(random_seed = 123, recombination_rate = 0, Ne = 100) %>%
    ts_simplify()
  ts_mut1 <- ts_mutate(ts, mutation_rate = 1e-7, random_seed = 123)
  msprime_module <- reticulate::import("msprime")
  ts_mut2 <- ts_mutate(ts, mutation_rate = 1e-7, random_seed = 123,
                       mutation_model = msprime_module$SLiMMutationModel(123456789L))

  get_mut_type <- function(m) {
    mut_metadata <- m$metadata$mutation_list
    if (length(mut_metadata) > 0)
      return(mut_metadata[[1]]$mutation_type)
    else
      return(NULL)
  }

  mut_types1 <- unlist(reticulate::iterate(ts_mut1$mutations(), get_mut_type))
  mut_types2 <- unlist(reticulate::iterate(ts_mut2$mutations(), get_mut_type))

  expect_true(is.null(mut_types1))
  expect_true(all(mut_types2 == 123456789))
})

test_that("tree sequence contains the specified number of sampled individuals (default sampling)", {
  ts <- slim(model, sequence_length = 100000, recombination_rate = 0, locations = locations_file, burnin = 10,
       method = "batch", random_seed = 314159, verbose = FALSE) %>%
    ts_recapitate(Ne = 1, recombination_rate = 0) %>%
    ts_simplify()

  counts <- ts_nodes(ts) %>%
    dplyr::filter(sampled) %>%
    dplyr::as_tibble() %>%
    dplyr::distinct(ind_id, time, pop) %>%
    dplyr::count(time, pop)
  expect_true(all(counts$n == N))
})

test_that("locations and times in the tree sequence match values saved by SLiM (default sampling)", {
  ts <- slim(model, sequence_length = 100000, recombination_rate = 0, locations = locations_file, burnin = 10,
       method = "batch", random_seed = 314159, verbose = FALSE) %>%
    ts_recapitate(Ne = 1, recombination_rate = 0) %>%
    ts_simplify()

  individuals <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::distinct(ind_id, .keep_all = TRUE)
  true_locations <- readr::read_tsv(locations_file, col_types = "iicidd") %>%
    dplyr::mutate(time = convert_slim_time(gen, model))
  joined <- dplyr::inner_join(individuals, true_locations,
                              by = c("pedigree_id" = "ind")) %>%
    as.data.frame() %>%
    dplyr::mutate(location_x = as.vector(sf::st_coordinates(location)[, 1]),
                  location_y = as.vector(sf::st_coordinates(location)[, 2]))
  # for some reason the values differ in terms of decimal digits saved?
  # but they *are* equal
  expect_true(all.equal(joined$x, joined$location_x, tolerance = 1e-5))
  expect_true(all.equal(joined$y, joined$location_y, tolerance = 1e-5))
  expect_true(all.equal(joined$time.x, joined$time.y))
})

test_that("metadata is the same for SLiM and msprime conditional on a model", {
  samples <- rbind(
    schedule_sampling(model, times = 2, list(p1, 2), list(p2, 2)),
    schedule_sampling(model, times = 212, list(p1, 5), list(p2, 3)),
    schedule_sampling(model, times = 300, list(p1, 10), list(p2, 10))
  )

  slim_ts <- normalizePath(file.path(model_dir, "output_slim.trees"), winslash = "/", mustWork = FALSE)
  msprime_ts <- normalizePath(file.path(model_dir, "msprime_output.trees"), winslash = "/", mustWork = FALSE)

  slim(model, sequence_length = 100000, recombination_rate = 0,
       locations = locations_file, burnin = 10,
       method = "batch", random_seed = 314159,
       samples = samples, verbose = FALSE) %>% ts_write(slim_ts)

  msprime(model, sequence_length = 100000, recombination_rate = 0,
          random_seed = 314159, samples = samples, verbose = FALSE) %>% ts_write(msprime_ts)

  simplify_to <- c("pop1_1", "pop1_2", "pop1_17")

  sts1 <- ts_read(model = model, file = slim_ts)
  sts2 <- ts_read(model = model, file = slim_ts) %>%
    ts_recapitate(Ne = 1000, recombination_rate = 0)
  sts3 <- ts_read(model = model, file = slim_ts) %>%
    ts_recapitate(Ne = 1, recombination_rate = 0) %>%
    ts_simplify(simplify_to = simplify_to)
  sts4 <- ts_read(model = model, file = slim_ts) %>%
    ts_recapitate(recombination_rate = 0, Ne = 10000) %>%
    ts_simplify()
  sts5 <- ts_mutate(sts4, mutation_rate = 1e-7)
  sts6 <- ts_read(model = model, file = slim_ts) %>%
    ts_recapitate(recombination_rate = 0, Ne = 10000) %>%
    ts_simplify() %>%
    ts_mutate(mutation_rate = 1e-7)

  mts1 <- ts_read(model = model, file = msprime_ts)
  mts2 <- ts_read(model = model, file = msprime_ts)
  mts3 <- ts_read(model = model, file = msprime_ts) %>% ts_simplify(simplify_to = simplify_to)
  suppressWarnings(mts4 <- ts_read(model = model, file = msprime_ts) %>% ts_simplify())
  mts5 <- ts_mutate(mts4, mutation_rate = 1e-7)
  mts6 <- ts_read(model = model, file = msprime_ts) %>% ts_mutate(mutation_rate = 1e-7)

  expect_equal(ts_samples(sts1), ts_samples(mts1))
  expect_equal(ts_samples(sts2), ts_samples(mts2))
  expect_equal(ts_samples(sts3), ts_samples(mts3))
  expect_equal(ts_samples(sts4), ts_samples(mts4))
  expect_equal(ts_samples(sts5), ts_samples(mts5))
  expect_equal(ts_samples(sts6), ts_samples(mts6))

  sdata1 <- ts_nodes(sts1) %>% dplyr::filter(sampled) %>% dplyr::arrange(name) %>% as.data.frame()
  mdata1 <- ts_nodes(mts1) %>% stats::na.omit() %>% dplyr::arrange(name) %>% as.data.frame()
  sdata2 <- ts_nodes(sts2) %>% dplyr::filter(sampled) %>% dplyr::arrange(name) %>% as.data.frame()
  mdata2 <- ts_nodes(mts2) %>% stats::na.omit() %>% dplyr::arrange(name) %>% as.data.frame()
  sdata3 <- ts_nodes(sts3) %>% dplyr::filter(sampled) %>% dplyr::arrange(name) %>% as.data.frame()
  mdata3 <- ts_nodes(mts3) %>% stats::na.omit() %>% dplyr::arrange(name) %>% as.data.frame()
  sdata4 <- ts_nodes(sts4) %>% dplyr::filter(sampled) %>% dplyr::arrange(name) %>% as.data.frame()
  mdata4 <- ts_nodes(mts4) %>% stats::na.omit() %>% dplyr::arrange(name) %>% as.data.frame()
  sdata5 <- ts_nodes(sts5) %>% dplyr::filter(sampled) %>% dplyr::arrange(name) %>% as.data.frame()
  mdata5 <- ts_nodes(mts5) %>% stats::na.omit() %>% dplyr::arrange(name) %>% as.data.frame()
  sdata6 <- ts_nodes(sts6) %>% dplyr::filter(sampled) %>% dplyr::arrange(name) %>% as.data.frame()
  mdata6 <- ts_nodes(mts6) %>% stats::na.omit() %>% dplyr::arrange(name) %>% as.data.frame()

  expect_equal(sdata1$name, mdata1$name)
  expect_equal(sdata2$name, mdata2$name)
  expect_equal(sdata3$name, mdata3$name)
  expect_equal(sdata4$name, mdata4$name)
  expect_equal(sdata5$name, mdata5$name)
  expect_equal(sdata6$name, mdata6$name)

  expect_equal(sdata1$time, mdata1$time)
  expect_equal(sdata2$time, mdata2$time)
  expect_equal(sdata3$time, mdata3$time)
  expect_equal(sdata4$time, mdata4$time)
  expect_equal(sdata5$time, mdata5$time)
  expect_equal(sdata6$time, mdata6$time)
})

test_that("all names of individuals must be present in the tree sequence", {
  ts <- ts_read(file = msprime_ts, model = model)
  expect_error(ts_diversity(ts, c("p1_1", "p2_2")), "Not all individual names")
  expect_s3_class(ts_diversity(ts, c("pop1_1", "pop2_2")), "data.frame")
})

# --------------------------------------------------------------------------------
# IBD-related test for spatial tree sequences (more is in test-ibd.R,
# test-ts-pure-spatial.R and test-ts-pure-nonspatial.R)

test_that("ts_ibd() on spatial SLiM/slendr tree sequences works properly (no CRS)", {
  suppressWarnings(ts_slim <- ts_read(model, file = slim_ts) %>% ts_simplify())

  suppressWarnings(slim_ibd_sf <- ts_ibd(ts_slim, coordinates = TRUE))
  suppressWarnings(slim_ibd_nosf <- ts_ibd(ts_slim, coordinates = TRUE, sf = FALSE))

  # returned object is of a sf class (or not), as requested by the user
  expect_s3_class(slim_ibd_sf, "sf")
  expect_true(!inherits(slim_ibd_nosf, "sf"))

  # the model does not have a CRS, and neither should the IBD table
  expect_true(is.na(sf::st_crs(slim_ibd_sf)))

  # spatial IBD links within the same individual give an EMPTY linestring with
  # no coordinates
  within_ibd <- slim_ibd_sf %>% dplyr::filter(name1 == name2) %>% dplyr::select(connection)
  expect_true(nrow(sf::st_coordinates(within_ibd)) == 0)

  # except for the spatial columns, the IBD results are the same
  expect_true(all(dplyr::select(as.data.frame(slim_ibd_sf),
                            -node1_location, -node2_location, -connection) ==
               as.data.frame(slim_ibd_nosf)))
})

test_that("ts_ibd() on msprime/slendr tree sequences works properly", {
  ts_msprime <- ts_read(model, file = msprime_ts)

  suppressWarnings(msprime_ibd_sf <- ts_ibd(ts_msprime, coordinates = TRUE))
  suppressWarnings(msprime_ibd_nosf <- ts_ibd(ts_msprime, coordinates = TRUE, sf = FALSE))

  # returned object is of a sf class (or not), as requested by the user
  expect_true(!inherits(msprime_ibd_sf, "sf"))
  expect_true(!inherits(msprime_ibd_nosf, "sf"))

  # except for the spatial columns, the IBD results are the same
  expect_equal(as.data.frame(msprime_ibd_sf),
               as.data.frame(msprime_ibd_nosf))
})

test_that("ts_ibd() on spatial SLiM/slendr tree sequences works properly (with CRS)", {
  map <- readRDS("map.rds")
  p <- population(competition = 10e3, mating = 10e3, dispersal = 10e3, name = "pop1",
                  N = 1000, time = 3000, radius = 600000, center = c(10, 25), map = map)
  ts <- compile_model(populations = list(p), generation_time = 30,
                      resolution = 10e3, direction = "backward") %>%
    slim(sequence_length = 1e6, recombination_rate = 1e-8, random_seed = 42)

  set.seed(42)
  samples <- sample(ts_samples(ts)$name, 20)

  suppressWarnings(ibd_sf <- ts_ibd(ts, coordinates = TRUE, within = samples))
  suppressWarnings(ibd_nosf <- ts_ibd(ts, coordinates = TRUE, within = samples, sf = FALSE))

  # returned object is of a sf class (or not), as requested by the user
  expect_s3_class(ibd_sf, "sf")
  expect_true(!inherits(ibd_nosf, "sf"))

  # the IBD sf object has the CRS attribute of the map itself
  expect_true(sf::st_crs(map) == sf::st_crs(ibd_sf))

  # spatial IBD links within the same individual give an EMPTY linestring with
  # no coordinates
  within_ibd <- ibd_sf %>% dplyr::filter(name1 == name2) %>% dplyr::select(connection)
  expect_true(nrow(sf::st_coordinates(within_ibd)) == 0)

  # except for the spatial columns, the IBD results are the same
  expect_true(all(dplyr::select(as.data.frame(ibd_sf),
                            -node1_location, -node2_location, -connection) ==
               as.data.frame(ibd_nosf)))
})

test_that("ts_read tracks a path to the tree-sequence file if loaded from a file", {
  ts1 <- ts_read(slim_ts, model)
  ts2 <- ts_recapitate(ts1, recombination_rate = 0, Ne = 100)
  ts3 <- ts_simplify(ts2)
  ts4 <- ts_mutate(ts1, 1e-8)

  expect_true(attr(ts1, "path") == normalizePath(slim_ts))
  expect_true(attr(ts2, "path") == normalizePath(slim_ts))
  expect_true(attr(ts3, "path") == normalizePath(slim_ts))
  expect_true(attr(ts4, "path") == normalizePath(slim_ts))
})

test_that("ts_read does not track any file path if a tree sequence is directly loaded", {
  nonser_model <- model
  nonser_model$path <- NULL

  ts1 <- msprime(nonser_model, sequence_length = 100000, recombination_rate = 0,
                 random_seed = 314159, samples = samples)

  suppressWarnings(ts2 <- ts_recapitate(ts1, recombination_rate = 0, Ne = 100))
  ts3 <- ts_simplify(ts2)
  ts4 <- ts_mutate(ts1, 1e-8)

  expect_true(is.null(attr(ts1, "path")))
  expect_true(is.null(attr(ts2, "path")))
  expect_true(is.null(attr(ts3, "path")))
  expect_true(is.null(attr(ts4, "path")))
})

#------------------------------------------------------------------
# testing names extraction

ts_msprime <- ts_read(msprime_ts, model)
ts_slim <- ts_read(slim_ts, model)

test_that("ts_names(ts) returns a simple character vector", {
  expect_type(ts_names(ts_msprime), "character")
  expect_type(ts_names(ts_slim), "character")

  expect_true(length(ts_names(ts_msprime)) == sum(samples$n))
  expect_true(length(ts_names(ts_slim)) == sum(samples$n))
})

test_that("ts_names(ts, split = 'pop') returns a list of character vectors", {
  expect_type(ts_names(ts_msprime, split = "pop"), "list")
  expect_type(ts_names(ts_slim, split = "pop"), "list")

  expect_true(length(ts_names(ts_msprime, split = "pop")) == 2)
  expect_true(length(ts_names(ts_slim, split = "pop")) == 2)
})

test_that("ts_names(ts, split = 'time') returns a list of character vectors", {
  expect_type(ts_names(ts_msprime, split = "time"), "list")
  expect_type(ts_names(ts_slim, split = "time"), "list")

  expect_true(length(ts_names(ts_msprime, split = "time")) == length(unique(samples$time)))
  expect_true(length(ts_names(ts_slim, split = "time")) == length(unique(samples$time)))
})
bodkan/spannr documentation built on Dec. 19, 2024, 11:43 p.m.