tests/testthat/test-ts-pure-nonspatial.R

skip_if(!is_slendr_env_present())

init_env(quiet = TRUE)

set.seed(42)

simulate_slim_ts <- function(N) {
  script_file <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
  ts_file <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)

  writeLines(sprintf('initialize() {
    setSeed(42);
  	initializeTreeSeq();
  	initializeMutationRate(0);
  	initializeMutationType("m1", 0.5, "f", 0.0);
  	initializeGenomicElementType("g1", m1, 1.0);
  	initializeGenomicElement(g1, 0, 1e5);
  	initializeRecombinationRate(1e-8);
  }
  1 early() {
  	sim.addSubpop("p0", %d);
  }
  100 late() {
    sim.treeSeqRememberIndividuals(sim.subpopulations.individuals);
  	sim.treeSeqOutput("%s");
  }
  ', N, ts_file), script_file)
  binary <- get_binary("batch")
  system2(binary, script_file, stdout = FALSE)

  ts_file
}

msprime_ts_sim_ancestry <- function(N) {
  msprime_file <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
  slendr:::msp$sim_ancestry(N)$dump(msprime_file)
  msprime_file
}

msprime_ts_simulate <- function(N) {
  msprime_file <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
  slendr:::msp$simulate(as.integer(N))$dump(msprime_file)
  msprime_file
}

# Make sure the unsimplified tree sequence has the same set of nodes as
# what we extracted with ts_nodes (ssampled and all)
compare_ts_nodes <- function(ts, N) {
  data <- ts_nodes(ts)

  expect_true(nrow(data) == ts$num_nodes)
  expect_true(nrow(data[data$sampled, ]) == ts$num_samples)
  if (any(!is.na(data$ind_id))) expect_true(nrow(data[data$sampled, ]) == 2 * N)

  ts2 <- ts_simplify(ts)
  data2 <- ts_nodes(ts2)

  # make sure the same holds also for a simplified tree sequence
  expect_true(nrow(data2) == ts2$num_nodes)
  expect_true(sum(data2$sampled) == ts2$num_samples)
  if (any(!is.na(data$ind_id))) expect_true(sum(data2$sampled) == 2 * N)
}

# Make sure the extracted phylo tree structure is the same as what is encoded
# in the edge table
compare_ts_phylo <- function(ts, N) {
  ts2 <- ts_simplify(ts)

  tskit_tree <- ts2$at(0)
  tree <- ts_phylo(ts2, i = 0, quiet = TRUE)
  tree_data <- ts_nodes(tree)
  # make sure that the converted phylo tree has the same number of edges and
  # nodes as the original tskit Tree object
  expect_true(nrow(tree$edge) == tskit_tree$num_edges)
  expect_true((tree$Nnode + length(tree$tip.label)) == length(reticulate::iterate(tskit_tree$nodes(tskit_tree$root))))
  # the root must be the same node as well
  phylo_root <- length(tree$tip.label) + 1
  tskit_root <- tskit_tree$root
  expect_true(which(tree_data$node_id == tskit_root) == which(tree_data$phylo_id == phylo_root))

  expect_true(length(tree$tip.label) == nrow(tree_data[tree_data$sampled, ]))
}

# SLiM --------------------------------------------------------------------

test_that("non-slendr SLiM ts_nodes corresponds to the expected outcome", {
  N <- 5
  ts_file <- simulate_slim_ts(N)
  suppressMessages(ts <- ts_read(ts_file))
  compare_ts_nodes(ts, N)
})

test_that("non-slendr SLiM simplified ts_nodes corresponds to the expected outcome", {
  N <- 500
  ts_file <- simulate_slim_ts(N)
  suppressMessages(ts <- ts_read(ts_file))
  simplify_to <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::pull(node_id) %>% sample(3)
  expect_warning(ts2 <- ts_simplify(ts, simplify_to = simplify_to),
                 "Simplifying a non-recapitated tree sequence")
  expect_silent(ts2 <- ts_recapitate(ts, Ne = 100, recombination_rate = 1e-8) %>%
                  ts_simplify(simplify_to = simplify_to))

  # make sure that the simplified nodes match the pedigree_id values in the
  # original tree-sequence data
  orig_ids <- ts_nodes(ts) %>% dplyr::filter(node_id %in% simplify_to) %>% dplyr::pull(pedigree_id)
  simplified_ids <- ts_nodes(ts2) %>% dplyr::filter(sampled) %>% dplyr::pull(pedigree_id)
  expect_true(all(orig_ids == simplified_ids))
})

test_that("non-slendr SLiM ts_phylo corresponds to the expected outcome", {
  N <- 500
  ts_file <- simulate_slim_ts(N)
  suppressMessages(ts <- ts_read(ts_file) %>% ts_recapitate(recombination_rate = 1e-8, Ne = 100))
  compare_ts_phylo(ts, N)
})

test_that("non-slendr SLiM ts_nodes can be recapitated", {
  N <- 10000
  ts_file <- simulate_slim_ts(N)
  suppressMessages(ts <- ts_read(ts_file) %>% ts_recapitate(Ne = 100, recombination_rate = 1e-8))
  expect_silent(compare_ts_nodes(ts, N))
})

test_that("non-slendr SLiM ts_nodes carries correct population names", {
  ts_file <- simulate_slim_ts(50)
  suppressMessages(ts <- ts_read(ts_file))
  expect_true(unique(ts_nodes(ts)$pop) == "p0")
})

# msprime tree sequences (sim_ancestry) -------------------------------------

test_that("non-slendr msprime simplification on its own gives warning", {
  N <- 5
  ts_file <- msprime_ts_sim_ancestry(N)
  suppressMessages(ts <- ts_read(ts_file))
  compare_ts_nodes(ts, N)
})

test_that("non-slendr SLiM simplified ts_nodes corresponds to the expected outcome", {
  N <- 5
  ts_file <- msprime_ts_sim_ancestry(N)
  suppressMessages(ts <- ts_read(ts_file))
  simplify_to <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::pull(node_id) %>% sample(3)
  ts2 <- ts_simplify(ts, simplify_to = simplify_to)

  # there are no fixed pedigree_id values, so let's check that the number of
  # "sampled" nodes corresponds to the number of nodes used for simplification
  expect_true(sum(ts_nodes(ts2)$sampled) == 3)
})

test_that("non-slendr msprime ts_phylo corresponds to the expected outcome", {
  N <- 5
  ts_file <- msprime_ts_sim_ancestry(N)
  suppressMessages(ts <- ts_read(ts_file))
  compare_ts_phylo(ts, N)
})

test_that("non-slendr msprime ts_phylo (simplified) corresponds to the expected outcome", {
  N <- 5
  ts_file <- msprime_ts_sim_ancestry(N)
  suppressMessages(ts <- ts_read(ts_file))
  simplify_to <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::pull(node_id) %>% sample(3)
  ts2 <- ts_simplify(ts, simplify_to = simplify_to)
  compare_ts_phylo(ts2, N)
})

test_that("non-slendr msprime ts_nodes carries correct population names", {
  ts_file <- msprime_ts_sim_ancestry(50)
  suppressMessages(ts <- ts_read(ts_file))
  expect_true(unique(ts_nodes(ts)$pop) == "pop_0")
})

# msprime tree sequences (simulate) -------------------------------------

test_that("non-slendr msprime simplification on its own gives warning (simulate)", {
  N <- 5
  ts_file <- msprime_ts_simulate(N)
  suppressMessages(ts <- ts_read(ts_file))
  compare_ts_nodes(ts, N)
})

test_that("non-slendr SLiM simplified ts_nodes corresponds to the expected outcome (simulate)", {
  N <- 5
  ts_file <- msprime_ts_simulate(N)
  suppressMessages(ts <- ts_read(ts_file))
  simplify_to <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::pull(node_id) %>% sample(3)
  ts2 <- ts_simplify(ts, simplify_to = simplify_to)

  # there are no fixed pedigree_id values, so let's check that the number of
  # "sampled" nodes corresponds to the number of nodes used for simplification
  expect_true(sum(ts_nodes(ts2)$sampled) == 3)
})

test_that("non-slendr msprime ts_phylo corresponds to the expected outcome (simulate)", {
  N <- 5
  ts_file <- msprime_ts_simulate(N)
  suppressMessages(ts <- ts_read(ts_file))
  compare_ts_phylo(ts, N)
})

test_that("non-slendr msprime ts_phylo (simplified) corresponds to the expected outcome (simulate)", {
  N <- 5
  ts_file <- msprime_ts_simulate(N)
  suppressMessages(ts <- ts_read(ts_file))
  simplify_to <- ts_nodes(ts) %>% dplyr::filter(sampled) %>% dplyr::pull(node_id) %>% sample(3)
  ts2 <- ts_simplify(ts, simplify_to = simplify_to)
  compare_ts_phylo(ts2, N)
})

test_that("non-slendr msprime ts_nodes carries correct population names (simulate)", {
  ts_file <- msprime_ts_simulate(50)
  suppressMessages(ts <- ts_read(ts_file))
  expect_true(unique(ts_nodes(ts)$pop) == "0")
})

# SLiM tskit statistics interface -----------------------------------------

test_that("tskit statistics interface works on non-slendr SLiM outputs", {
  script_file <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
  ts_file <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)

  writeLines(sprintf('initialize() {
    setSeed(123);
  	initializeTreeSeq();
  	initializeMutationRate(0);
  	initializeMutationType("m1", 0.5, "f", 0.0);
  	initializeGenomicElementType("g1", m1, 1.0);
  	initializeGenomicElement(g1, 0, 1e6);
  	initializeRecombinationRate(1e-8);
  }
  1 early() {
  	sim.addSubpop("p1", 10);
  }
  1000 early() {
  	sim.addSubpopSplit("p2", 500, p1);
  }
  3000 early() {
  	sim.addSubpopSplit("p3", 2500, p1);
  }
  5000 early() {
  	sim.addSubpopSplit("p4", 10000, p1);
  }
  20000 late() {
  	sim.treeSeqOutput("%s");
  }
  ', ts_file), script_file)

  binary <- get_binary("batch")
  system2(binary, script_file, stdout = FALSE)

  suppressMessages(
    ts <- ts_read(ts_file) %>%
      ts_recapitate(Ne = 10, recombination_rate = 1e-8) %>%
      ts_simplify() %>%
      ts_mutate(mutation_rate = 1e-7)
  )

  data <- ts_nodes(ts) %>% dplyr::filter(sampled)

  groups <- split(data$node_id, data$pop)

  # diversity must increase p1 -> p2 -> p3 -> p4
  diversity <- ts_diversity(ts, sample_sets = groups)
  expect_true(all(diff(diversity$diversity) > 0))

  # divergence from p1 must decrease p2 -> p3 -> p4
  divergence <- ts_divergence(ts, sample_sets = groups)
  expect_true(all((divergence %>% dplyr::filter(x == 1) %>% .$divergence %>% diff) < 0))

  expect_true(all(sort(unique(ts_nodes(ts)$pop) == c("p1", "p2", "p3", "p4"))))
})

test_that("ts_ibd() on nonspatial SLiM tree sequences works with coordinates = (T|F)", {
  ts_file <- simulate_slim_ts(1000)
  suppressMessages(ts <- ts_read(ts_file))

  suppressWarnings(ibd_totals <- ts_ibd(ts, coordinates = FALSE))
  suppressWarnings(ibd_fragments <- ts_ibd(ts, coordinates = TRUE))

  # compute IBD totals from individual fragments manually
  ibd_totals2 <-
    dplyr::group_by(ibd_fragments, node1, node2, node1_time, node2_time) %>%
    dplyr::summarise(count = dplyr::n(), total = sum(length), .groups = "keep") %>%
    dplyr::select(node1, node2, count, total, node1_time, node2_time) %>%
    dplyr::ungroup()

  expect_equal(ibd_totals, ibd_totals2)
})

test_that("ts_ibd() on nonspatial msprime tree sequences works with coordinates = (T|F)", {
  ts_file <- msprime_ts_sim_ancestry(100)
  suppressMessages(ts <- ts_read(ts_file))

  suppressWarnings(ibd_totals <- ts_ibd(ts, coordinates = FALSE))
  suppressWarnings(ibd_fragments <- ts_ibd(ts, coordinates = TRUE))

  # compute IBD totals from individual fragments manually
  ibd_totals2 <-
    dplyr::group_by(ibd_fragments, node1, node2, node1_time, node2_time) %>%
    dplyr::summarise(count = dplyr::n(), total = sum(length), .groups = "keep") %>%
    dplyr::select(node1, node2, count, total, node1_time, node2_time) %>%
    dplyr::ungroup()

  expect_equal(ibd_totals, ibd_totals2)
})

test_that("ts_ibd() on non-spatial msprime tree sequences gives a correct object", {
  ts_file <- msprime_ts_sim_ancestry(100)
  suppressMessages(ts <- ts_read(ts_file))

  ibd_sf <- ts_ibd(ts, coordinates = FALSE, minimum_length = 0)
  ibd_nosf <- ts_ibd(ts, coordinates = FALSE, minimum_length = 0, sf = FALSE)

  # returned object is never of a sf class as it's not from a spatial tree sequence
  expect_true(!inherits(ibd_sf, "sf"))
  expect_true(!inherits(ibd_nosf, "sf"))

  # except for the spatial columns, the IBD results are the same
  expect_equal(ibd_sf, ibd_nosf)
})
bodkan/spannr documentation built on Dec. 19, 2024, 11:43 p.m.