Nothing
# Note that this script pre-dates automatic loading of tree-sequence outputs by slim() and msprime()
# functions. This is why it loads simulated outputs via the original ts_load() method (which is still
# valid, so it actually makes sense to test things this way).
skip_if(!is_slendr_env_present())
init_env(quiet = TRUE)
# Let's start by defining a couple of parameters for our simulations
seed <- 42 # random seed
seq_len <- 100e6 # amount of sequence to simulate
rec_rate <- 1e-8 # uniform recombination rate
mut_rate <- 1e-8 # mutation rate
# Now we define a very simple population model. Note that the Ne of the
# populations `x1` and `x2` is set to be much higher than the rest. The Ne of
# the other populations is low to speed up the simulation (they won't affect the
# results anyway), but increasing the Ne of the `x1` and `x2` will ensure that
# the effect of the drift acting on those two populations will be minimal.
# Because we will simulate (and, later, measure) the proportion of ancestry from
# the population `c`` to `x1`, this will ensure that the ancestry proportion
# will not drift too far away from the expectation.
#
# (Of course, this is all done for demonstration purposes only and to speed up
# the SLiM simulations by making the Ne of the other populations smaller.)
o <- population("o", time = 1, N = 1)
b <- population("b", parent = o, time = 500, N = 10)
c <- population("c", parent = b, time = 1000, N = 10)
x1 <- population("x1", parent = c, time = 2000, N = 10000)
x2 <- population("x2", parent = c, time = 2000, N = 10000)
a <- population("a", parent = b, time = 1500, N = 10)
# no gene flow model
model_nogf <- compile_model(populations = list(a, b, x1, x2, c, o), generation_time = 1, overwrite = TRUE, force = TRUE, simulation_length = 2200)
samples <- schedule_sampling(model_nogf, times = 2200, list(a, 1), list(b, 1), list(x1, 50), list(x2, 50), list(c, 1), list(o, 1))
ts_slim_nogf <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
ts_msprime_nogf <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
slim(model_nogf, output = ts_slim_nogf, sequence_length = seq_len, recombination_rate = rec_rate, samples = samples, random_seed = seed)
msprime(model_nogf, output = ts_msprime_nogf, sequence_length = seq_len, recombination_rate = rec_rate, samples = samples, random_seed = seed)
# model with gene flow
gf <- gene_flow(from = b, to = x1, start = 2010, end = 2200, rate = 0.1)
model_gf <- compile_model(populations = list(a, b, x1, x2, c, o), gene_flow = gf, generation_time = 1, overwrite = TRUE, force = TRUE, simulation_length = 2200)
samples <- schedule_sampling(model_gf, times = 2200, list(a, 1), list(b, 1), list(x1, 50), list(x2, 50), list(c, 1), list(o, 1))
ts_slim_gf <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
ts_msprime_gf <- normalizePath(tempfile(), winslash = "/", mustWork = FALSE)
slim(model_gf, output = ts_slim_gf, sequence_length = seq_len, recombination_rate = rec_rate, samples = samples, random_seed = seed)
msprime(model_gf, output = ts_msprime_gf, sequence_length = seq_len, recombination_rate = rec_rate, samples = samples, random_seed = seed)
# Load tree sequence files saved by the SLiM backend script from the two models:
slim_nogf <- ts_load(model = model_nogf, file = ts_slim_nogf) %>%
ts_recapitate(Ne = 10, recombination_rate = rec_rate, random_seed = seed) %>%
ts_simplify() %>%
ts_mutate(mutation_rate = mut_rate, random_seed = seed)
slim_gf <- ts_load(model = model_gf, file = ts_slim_gf) %>%
ts_recapitate(Ne = 10, recombination_rate = rec_rate, random_seed = seed) %>%
ts_simplify() %>%
ts_mutate(mutation_rate = mut_rate, random_seed = seed)
# Extract vector of names of the "test individuals" in populations `x1` and `x2`:
slim_individuals <- ts_samples(slim_gf) %>%
dplyr::filter(pop %in% c("x1", "x2")) %>%
dplyr::pull(name)
# Calculate f4-statistics on individuals of `x1` and `x2` populations using data
# from the two models (a model with no gene flow and a gene flow model):
df_slim_f4 <- rbind(
purrr::map_dfr(slim_individuals, ~ ts_f4(slim_nogf, "c_1", .x, "b_1", "o_1")) %>% dplyr::mutate(model = "no gene flow"),
purrr::map_dfr(slim_individuals, ~ ts_f4(slim_gf, "c_1", .x, "b_1", "o_1")) %>% dplyr::mutate(model = "gene flow")
) %>%
dplyr::select(X, f4, model) %>%
dplyr::mutate(simulator = "SLiM backend")
# Compute the proportions of `b` ancestry in `x1` (expected 10%) and `x2`
# (expected 0% because this population did not receive any gene flow from `b`):
df_slim_f4ratio <- rbind(
ts_f4ratio(slim_nogf, slim_individuals, "a_1", "b_1", "c_1", "o_1") %>% dplyr::mutate(model = "no gene flow"),
ts_f4ratio(slim_gf, slim_individuals, "a_1", "b_1", "c_1", "o_1") %>% dplyr::mutate(model = "gene flow")
) %>%
dplyr::select(X, alpha, model) %>%
dplyr::mutate(simulator = "SLiM backend")
# Load tree sequence files saved by the SLiM backend script from the two models:
msprime_nogf <- ts_load(model = model_nogf, file = ts_msprime_nogf) %>%
ts_mutate(mutation_rate = mut_rate, random_seed = seed)
msprime_gf <- ts_load(model = model_gf, file = ts_msprime_gf) %>%
ts_mutate(mutation_rate = mut_rate, random_seed = seed)
# Extract vector of names of the "test individuals" in populations `x1` and `x2`:
msprime_individuals <- ts_samples(msprime_gf) %>%
dplyr::filter(pop %in% c("x1", "x2")) %>%
dplyr::pull(name)
test_that("msprime and SLiM produce the same set of sampled individuals", {
expect_equal(slim_individuals, msprime_individuals)
})
# Calculate f4-statistics on individuals of `x1` and `x2` populations using data
# from the two models (a model with no gene flow and a gene flow model):
df_msprime_f4 <- rbind(
purrr::map_dfr(msprime_individuals, ~ ts_f4(msprime_nogf, "c_1", .x, "b_1", "o_1")) %>% dplyr::mutate(model = "no gene flow"),
purrr::map_dfr(msprime_individuals, ~ ts_f4(msprime_gf, "c_1", .x, "b_1", "o_1")) %>% dplyr::mutate(model = "gene flow")
) %>%
dplyr::select(X, f4, model) %>%
dplyr::mutate(simulator = "msprime backend")
# Compute the proportions of `b` ancestry in `x1` (expected 10%) and `x2`
# (expected 0% because this population did not receive any gene flow from `b`):
df_msprime_f4ratio <- rbind(
ts_f4ratio(msprime_nogf, msprime_individuals, "a_1", "b_1", "c_1", "o_1") %>% dplyr::mutate(model = "no gene flow"),
ts_f4ratio(msprime_gf, msprime_individuals, "a_1", "b_1", "c_1", "o_1") %>% dplyr::mutate(model = "gene flow")
) %>%
dplyr::select(X, alpha, model) %>%
dplyr::mutate(simulator = "msprime backend")
# Now for the real test: we defined several population genetic models. If
# everything works as expected, this model should produce the same result (or
# nearly the same result, taking into account uncertainty and randomness)
# regardless of whether we run it through the SLiM forward simulation backend
# engine or msprime coalescent backend engine:
df_f4 <- rbind(df_slim_f4, df_msprime_f4) %>%
dplyr::mutate(population = ifelse(grepl("x1_", X),
"x1 (received gene flow)",
"x2 (no gene flow)")) %>%
as.data.frame()
# current_f4_tsv <- paste0(tempfile(), ".tsv.gz")
# readr::write_tsv(df_f4, current_f4_tsv, progress = FALSE)
original_f4_tsv <- sprintf("f4_%s.tsv.gz", Sys.info()["sysname"])
# readr::write_tsv(df_f4, original_f4_tsv, progress = FALSE)
orig_df_f4 <- readr::read_tsv(original_f4_tsv, show_col_types = FALSE, progress = FALSE) %>%
as.data.frame()
# library(ggplot2)
# p_f4 <- ggplot(df_f4, aes(f4, fill = population)) +
# geom_histogram(bins = 50) +
# facet_grid(simulator ~ model) +
# geom_vline(xintercept = 0, linetype = 2) +
# labs(y = "number of individuals", x = "f4 statistic",
# title = "f4 statistics calculated on simulated data",
# subtitle = "Note that for f4 values ~0, the hypothesis of no gene flow can't be rejected") +
# theme(legend.position = "bottom")
# png_file <- sprintf("f4_%s.png", Sys.info()["sysname"])
# ggsave(png_file, p_f4, width = 8, height = 5)
test_that("f4 distributions from SLiM and msprime simulations match", {
expect_equal(df_f4, orig_df_f4, tolerance = 1e-8)
})
df_f4ratio <- rbind(df_slim_f4ratio, df_msprime_f4ratio) %>%
dplyr::mutate(population = ifelse(grepl("x1_", X),
"x1 (received gene flow)",
"x2 (no gene flow)")) %>%
as.data.frame()
# current_f4r_tsv <- paste0(tempfile(), ".tsv.gz")
# readr::write_tsv(df_f4ratio, current_f4r_tsv, progress = FALSE)
original_f4r_tsv <- sprintf("f4ratio_%s.tsv.gz", Sys.info()["sysname"])
# readr::write_tsv(df_f4ratio, original_f4r_tsv, progress = FALSE)
orig_df_f4ratio <- readr::read_tsv(original_f4r_tsv, show_col_types = FALSE, progress = FALSE) %>%
as.data.frame()
# library(ggplot2)
# p_f4ratio <- ggplot(df_f4ratio, aes(alpha, fill = population)) +
# geom_histogram(bins = 30) +
# facet_grid(simulator ~ model) +
# geom_vline(xintercept = 0.1, linetype = 2) +
# labs(y = "number of individuals", x = "ancestry proportion (f4-ratio statistic)",
# title = "f4-ratio estimate of 'b' ancestry calculated from simulated data",
# subtitle = "Population 'x1' receives 10% gene flow (vertical dotted line)
# from 'b' in gene flow models, 'x2' never does") +
# theme(legend.position = "bottom")
# png_file <- sprintf("f4ratio_%s.png", Sys.info()["sysname"])
# ggsave(png_file, p_f4ratio, width = 8, height = 5)
test_that("f4-ratio distributions from SLiM and msprime simulations match", {
expect_equal(df_f4ratio, orig_df_f4ratio, tolerance = 1e-8)
})
# Great! We got almost the same results, as expected! We can also inspect the
# variance of the statistics between different simulation back ends and see that
# they are, indeed, extremely similar between both simulators:
#
# df_f4 %>% dplyr::group_by(model, simulator) %>% dplyr::summarise(var(f4))
# df_f4ratio %>% dplyr::group_by(model, simulator) %>% dplyr::summarise(var(alpha))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.