tests/testthat/test-msprime.R

# This long script generates from simple slendr models:
#  - constant N
#  - step reduction of N
#  - step increase of N
#  - exponential increase of N
#  - exponential decrease of N
#
# Those models are configured in both forward and backward time specifications,
# so each model is specified twice.
#
# It then runs those slendr model configurations through both SLiM and msprime
# backends bundled with the slendr R package. Finally, an allele frequency
# spectrum is estimated from tree sequence files saved by both backends for
# each model variant (forward and backward). The AFS are then compared for each
# of the two backends and we make sure that forward and backward configurations
# of the same model give *exactly* the same AFS (it has to be exactly the same
# because it is, in fact, generated from the same theoretical model). Then, we
# also compare the AFS between SLiM and msprime runs for the same model
# configuration. These won't be exactly the same (the tree sequences are
# generated by two completely different pieces of software after all - SLiM
# and msprime Python library), but they should be *nearly* the same.

skip_if(!is_slendr_env_present())
init_env(quiet = TRUE)

RERUN <- FALSE

seed <- 42
N <- 1000
N_factor <- 5
n_samples <- 50
seq_len <- 100e6
rec_rate <- 1e-8
mut_rate <- 1e-8

# constant population size models - forward and backward direction, SLiM and msprime

forward_const_dir <- file.path(tempdir(), "forward_const")
forward_const_pop <- population("forward_const_pop", time = 1, N = N, map = FALSE)
forward_const_model <- compile_model(forward_const_pop, path = forward_const_dir, generation_time = 1,
                               overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
forward_const_samples <- schedule_sampling(forward_const_model, times = 5001, list(forward_const_pop, n_samples))

backward_const_dir <- file.path(tempdir(), "backward_const")
backward_const_pop <- population("backward_const_pop", time = 5000, N = N, map = FALSE)
backward_const_model <- compile_model(backward_const_pop, path = backward_const_dir , generation_time = 1,
                                overwrite = TRUE, force = TRUE, direction = "backward")
backward_const_samples <- schedule_sampling(backward_const_model, times = 0, list(backward_const_pop, n_samples))

const_ts <- run_slim_msprime(
  forward_const_model, backward_const_model,
  forward_const_samples, backward_const_samples,
  seq_len, rec_rate, seed, verbose = FALSE
)

# population size contraction models - forward and backward direction, SLiM and msprime

forward_contr_dir <- file.path(tempdir(), "forward_contr")
forward_contr_pop <- population("forward_contr_pop", time = 1, N = N, map = FALSE) %>%
  resize(time = 2001, N = N / N_factor, how = "step")
forward_contr_model <- compile_model(forward_contr_pop, path = forward_contr_dir, generation_time = 1,
                               overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
forward_contr_samples <- schedule_sampling(forward_contr_model, times = 5001, list(forward_contr_pop, n_samples))

backward_contr_dir <- file.path(tempdir(), "backward_contr")
backward_contr_pop <- population("backward_contr_pop", time = 5000, N = N, map = FALSE) %>%
  resize(time = 3000, N = N / N_factor, how = "step")
backward_contr_model <- compile_model(backward_contr_pop, path = backward_contr_dir, generation_time = 1,
                                overwrite = TRUE, force = TRUE, direction = "backward")
backward_contr_samples <- schedule_sampling(backward_contr_model, times = 0, list(backward_contr_pop, n_samples))

contr_ts <- run_slim_msprime(
  forward_contr_model, backward_contr_model,
  forward_contr_samples, backward_contr_samples,
  seq_len, rec_rate, seed, verbose = FALSE
)

# population size increase models - forward and backward direction, SLiM and msprime

forward_expansion_dir <- file.path(tempdir(), "forward_expansion")
forward_expansion_pop <- population("forward_expansion_pop", time = 1, N = N, map = FALSE) %>%
  resize(time = 2001, N = N * N_factor, how = "step")
forward_expansion_model <- compile_model(forward_expansion_pop, path = forward_expansion_dir, generation_time = 1,
                                   overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
forward_expansion_samples <- schedule_sampling(forward_expansion_model, times = 5001, list(forward_expansion_pop, n_samples))

backward_expansion_dir <- file.path(tempdir(), "backward_expansion")
backward_expansion_pop <- population("backward_expansion_pop", time = 5000, N = N, map = FALSE) %>%
  resize(time = 3000, N = N * N_factor, how = "step")
backward_expansion_model <- compile_model(backward_expansion_pop, path = backward_expansion_dir, generation_time = 1,
                                    overwrite = TRUE, force = TRUE, direction = "backward")
backward_expansion_samples <- schedule_sampling(backward_expansion_model, times = 0, list(backward_expansion_pop, n_samples))

expansion_ts <- run_slim_msprime(
  forward_expansion_model, backward_expansion_model,
  forward_expansion_samples, backward_expansion_samples,
  seq_len, rec_rate, seed, verbose = FALSE
)

# exponential increase  models - forward and backward direction, SLiM and msprime

forward_exp_inc_dir <- file.path(tempdir(), "forward_exp_inc")
forward_exp_inc_pop <- population("forward_exp_inc_pop", time = 1, N = N / N_factor, map = FALSE) %>%
  resize(time = 2001, end = 3001, N = N, how = "exponential")
forward_exp_inc_model <- compile_model(forward_exp_inc_pop, path = forward_exp_inc_dir, generation_time = 1,
                                   overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
forward_exp_inc_samples <- schedule_sampling(forward_exp_inc_model, times = 5001, list(forward_exp_inc_pop, n_samples))

backward_exp_inc_dir <- file.path(tempdir(), "backward_exp_inc")
backward_exp_inc_pop <- population("backward_exp_inc_pop", time = 1, N = N / N_factor, map = FALSE) %>%
  resize(time = 2001, end = 3001, N = N, how = "exponential")
backward_exp_inc_model <- compile_model(backward_exp_inc_pop, path = backward_exp_inc_dir, generation_time = 1,
                                   overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
backward_exp_inc_samples <- schedule_sampling(backward_exp_inc_model, times = 5001, list(backward_exp_inc_pop, n_samples))

exp_inc_ts <- run_slim_msprime(
  forward_exp_inc_model, backward_exp_inc_model,
  forward_exp_inc_samples, backward_exp_inc_samples,
  seq_len, rec_rate, seed, verbose = FALSE
)

# exponential decrease  models - forward and backward direction, SLiM and msprime

forward_exp_decr_dir <- file.path(tempdir(), "forward_exp_decr")
forward_exp_decr_pop <- population("forward_exp_decr_pop", time = 1, N = N, map = FALSE) %>%
  resize(time = 2001, end = 3001, N = N / N_factor, how = "exponential")
forward_exp_decr_model <- compile_model(forward_exp_decr_pop, path = forward_exp_decr_dir, generation_time = 1,
                                 overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
forward_exp_decr_samples <- schedule_sampling(forward_exp_decr_model, times = 5001, list(forward_exp_decr_pop, n_samples))

backward_exp_decr_dir <- file.path(tempdir(), "backward_exp_decr")
backward_exp_decr_pop <- population("backward_exp_decr_pop", time = 1, N = N, map = FALSE) %>%
  resize(time = 2001, end = 3001, N = N / N_factor, how = "exponential")
backward_exp_decr_model <- compile_model(backward_exp_decr_pop, path = backward_exp_decr_dir, generation_time = 1,
                                  overwrite = TRUE, force = TRUE, direction = "forward", simulation_length = 5000)
backward_exp_decr_samples <- schedule_sampling(backward_exp_decr_model, times = 5001, list(backward_exp_decr_pop, n_samples))

exp_decr_ts <- run_slim_msprime(
  forward_exp_decr_model, backward_exp_decr_model,
  forward_exp_decr_samples, backward_exp_decr_samples,
  seq_len, rec_rate, seed, verbose = FALSE
)

# load tree sequence files from msprime
msprime_forward_const_ts <- load_tree_sequence("msprime", "forward", const_ts, forward_const_model, N, rec_rate, mut_rate, seed)
msprime_backward_const_ts <- load_tree_sequence("msprime", "backward", const_ts, backward_const_model, N, rec_rate, mut_rate, seed)

msprime_forward_contr_ts <- load_tree_sequence("msprime", "forward", contr_ts, forward_contr_model, N, rec_rate, mut_rate, seed)
msprime_backward_contr_ts <- load_tree_sequence("msprime", "backward", contr_ts, backward_contr_model, N, rec_rate, mut_rate, seed)

msprime_forward_expansion_ts <- load_tree_sequence("msprime", "forward", expansion_ts, forward_expansion_model, N, rec_rate, mut_rate, seed)
msprime_backward_expansion_ts <- load_tree_sequence("msprime", "backward", expansion_ts, backward_expansion_model, N, rec_rate, mut_rate, seed)

msprime_forward_exp_inc_ts <- load_tree_sequence("msprime", "forward", exp_inc_ts, forward_exp_inc_model, N, rec_rate, mut_rate, seed)
msprime_backward_exp_inc_ts <- load_tree_sequence("msprime", "backward", exp_inc_ts, backward_exp_inc_model, N, rec_rate, mut_rate, seed)

msprime_forward_exp_decr_ts <- load_tree_sequence("msprime", "forward", exp_decr_ts, forward_exp_decr_model, N, rec_rate, mut_rate, seed)
msprime_backward_exp_decr_ts <- load_tree_sequence("msprime", "backward", exp_decr_ts, backward_exp_decr_model, N, rec_rate, mut_rate, seed)

# load tree sequence files from SLiM
slim_forward_const_ts <- load_tree_sequence("SLiM", "forward", const_ts, forward_const_model, N, rec_rate, mut_rate, seed)
slim_backward_const_ts <- load_tree_sequence("SLiM", "backward", const_ts, backward_const_model, N, rec_rate, mut_rate, seed)

slim_forward_contr_ts <- load_tree_sequence("SLiM", "forward", contr_ts, forward_contr_model, N, rec_rate, mut_rate, seed)
slim_backward_contr_ts <- load_tree_sequence("SLiM", "backward", contr_ts, backward_contr_model, N, rec_rate, mut_rate, seed)

slim_forward_expansion_ts <- load_tree_sequence("SLiM", "forward", expansion_ts, forward_expansion_model, N, rec_rate, mut_rate, seed)
slim_backward_expansion_ts <- load_tree_sequence("SLiM", "backward", expansion_ts, backward_expansion_model, N, rec_rate, mut_rate, seed)

slim_forward_exp_inc_ts <- load_tree_sequence("SLiM", "forward", exp_inc_ts, forward_exp_inc_model, N, rec_rate, mut_rate, seed)
slim_backward_exp_inc_ts <- load_tree_sequence("SLiM", "backward", exp_inc_ts, forward_exp_inc_model, N, rec_rate, mut_rate, seed)

slim_forward_exp_decr_ts <- load_tree_sequence("SLiM", "forward", exp_decr_ts, forward_exp_decr_model, N, rec_rate, mut_rate, seed)
slim_backward_exp_decr_ts <- load_tree_sequence("SLiM", "backward", exp_decr_ts, forward_exp_decr_model, N, rec_rate, mut_rate, seed)

# compute AFS from all tree sequence files - msprime
msprime_forward_const_afs <- ts_afs(msprime_forward_const_ts, polarised = TRUE)[-1]
msprime_backward_const_afs <- ts_afs(msprime_backward_const_ts, polarised = TRUE)[-1]

msprime_forward_contr_afs <- ts_afs(msprime_forward_contr_ts, polarised = TRUE)[-1]
msprime_backward_contr_afs <- ts_afs(msprime_backward_contr_ts, polarised = TRUE)[-1]

msprime_forward_expansion_afs <- ts_afs(msprime_forward_expansion_ts, polarised = TRUE)[-1]
msprime_backward_expansion_afs <- ts_afs(msprime_backward_expansion_ts, polarised = TRUE)[-1]

msprime_forward_exp_inc_afs <- ts_afs(msprime_forward_exp_inc_ts, polarised = TRUE)[-1]
msprime_backward_exp_inc_afs <- ts_afs(msprime_backward_exp_inc_ts, polarised = TRUE)[-1]

msprime_forward_exp_decr_afs <- ts_afs(msprime_forward_exp_decr_ts, polarised = TRUE)[-1]
msprime_backward_exp_decr_afs <- ts_afs(msprime_backward_exp_decr_ts, polarised = TRUE)[-1]

# compute AFS from all tree sequence files - SLiM
slim_forward_const_afs <- ts_afs(slim_forward_const_ts, polarised = TRUE)[-1]
slim_backward_const_afs <- ts_afs(slim_backward_const_ts, polarised = TRUE)[-1]

slim_forward_contr_afs <- ts_afs(slim_forward_contr_ts, polarised = TRUE)[-1]
slim_backward_contr_afs <- ts_afs(slim_backward_contr_ts, polarised = TRUE)[-1]

slim_forward_expansion_afs <- ts_afs(slim_forward_expansion_ts, polarised = TRUE)[-1]
slim_backward_expansion_afs <- ts_afs(slim_backward_expansion_ts, polarised = TRUE)[-1]

slim_forward_exp_inc_afs <- ts_afs(slim_forward_exp_inc_ts, polarised = TRUE)[-1]
slim_backward_exp_inc_afs <- ts_afs(slim_backward_exp_inc_ts, polarised = TRUE)[-1]

slim_forward_exp_decr_afs <- ts_afs(slim_forward_exp_decr_ts, polarised = TRUE)[-1]
slim_backward_exp_decr_afs <- ts_afs(slim_backward_exp_decr_ts, polarised = TRUE)[-1]

# bind together all allele frequency spectra results
afs <- dplyr::bind_rows(
  dplyr::tibble(f = msprime_forward_const_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "forward", model = sprintf("constant %d", N)),
  dplyr::tibble(f = msprime_backward_const_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "backward", model = sprintf("constant %d", N)),

  dplyr::tibble(f = msprime_forward_contr_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "forward", model = sprintf("step contraction %d to %d", N, N / N_factor)),
  dplyr::tibble(f = msprime_backward_contr_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "backward", model = sprintf("step contraction %d to %d", N, N / N_factor)),

  dplyr::tibble(f = msprime_forward_expansion_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "forward", model = sprintf("step expansion %d to %d", N, N * N_factor)),
  dplyr::tibble(f = msprime_backward_expansion_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "backward", model = sprintf("step expansion %d to %d", N, N * N_factor)),

  dplyr::tibble(f = msprime_forward_exp_inc_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "forward", model = sprintf("exponential increase %d to %d", N / N_factor, N)),
  dplyr::tibble(f = msprime_backward_exp_inc_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "backward", model = sprintf("exponential increase %d to %d", N / N_factor, N)),

  dplyr::tibble(f = msprime_forward_exp_decr_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "forward", model = sprintf("exponential decrease %d to %d", N, N / N_factor)),
  dplyr::tibble(f = msprime_backward_exp_decr_afs, n = 1:(2 * n_samples), sim = "msprime", direction = "backward", model = sprintf("exponential decrease %d to %d", N, N / N_factor)),

  dplyr::tibble(f = slim_forward_const_afs, n = 1:(2 * n_samples), sim = "slim", direction = "forward", model = sprintf("constant %d", N)),
  dplyr::tibble(f = slim_backward_const_afs, n = 1:(2 * n_samples), sim = "slim", direction = "backward", model = sprintf("constant %d", N)),

  dplyr::tibble(f = slim_forward_contr_afs, n = 1:(2 * n_samples), sim = "slim", direction = "forward", model = sprintf("step contraction %d to %d", N, N / N_factor)),
  dplyr::tibble(f = slim_backward_contr_afs, n = 1:(2 * n_samples), sim = "slim", direction = "backward", model = sprintf("step contraction %d to %d", N, N / N_factor)),

  dplyr::tibble(f = slim_forward_expansion_afs, n = 1:(2 * n_samples), sim = "slim", direction = "forward", model = sprintf("step expansion %d to %d", N, N * N_factor)),
  dplyr::tibble(f = slim_backward_expansion_afs, n = 1:(2 * n_samples), sim = "slim", direction = "backward", model = sprintf("step expansion %d to %d", N, N * N_factor)),

  dplyr::tibble(f = slim_forward_exp_inc_afs, n = 1:(2 * n_samples), sim = "slim", direction = "forward", model = sprintf("exponential increase %d to %d", N / N_factor, N)),
  dplyr::tibble(f = slim_backward_exp_inc_afs, n = 1:(2 * n_samples), sim = "slim", direction = "backward", model = sprintf("exponential increase %d to %d", N / N_factor, N)),

  dplyr::tibble(f = slim_forward_exp_decr_afs, n = 1:(2 * n_samples), sim = "slim", direction = "forward", model = sprintf("exponential decrease %d to %d", N, N / N_factor)),
  dplyr::tibble(f = slim_backward_exp_decr_afs, n = 1:(2 * n_samples), sim = "slim", direction = "backward", model = sprintf("exponential decrease %d to %d", N, N / N_factor))
) %>%
  dplyr::mutate(f = as.vector(f),
                sim = factor(sim, levels = c("msprime", "slim")),
                model = factor(
                  model,
                  levels = c(sprintf("constant %d", N),
                             sprintf("step contraction %d to %d", N, N / N_factor),
                             sprintf("step expansion %d to %d", N, N * N_factor),
                             sprintf("exponential increase %d to %d", N / N_factor, N),
                             sprintf("exponential decrease %d to %d", N, N / N_factor))))

test_that("msprime forward/backward sims are exactly the same", {
  expect_true({
    df <- afs[afs$sim == "msprime" & grepl("constant", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "msprime" & grepl("step contraction", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "msprime" & grepl("step expansion", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "msprime" & grepl("exponential increase", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "msprime" & grepl("exponential decrease", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
})

test_that("SLiM forward/backward sims are exactly the same", {
  expect_true({
    df <- afs[afs$sim == "slim" & grepl("constant", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "slim" & grepl("step contraction", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "slim" & grepl("step expansion", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "slim" & grepl("exponential increase", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
  expect_true({
    df <- afs[afs$sim == "slim" & grepl("exponential decrease", afs$model), ]
    all(df[df$direction == "forward", "f"] == df[df$direction == "backward", "f"])
  })
})

if (RERUN) {
# SLiM and msprime simulations from the same model give the same result
# (tested by comparing the distribution plots)
library(ggplot2)
p <- ggplot(afs, aes(n, f, color = direction, linetype = sim)) +
  geom_line(stat = "identity", alpha = 0.5) +
  facet_wrap(~ model) +
  labs(x = "number of derived alleles", y = "count",
       title = "Site frequency spectra obtained from five demographic models",
       subtitle = "Each model was specified in forward or backward direction of time and executed by
two different backend scripts in slendr (implemented in SLiM and msprime)") +
  guides(color = guide_legend("direction of\ntime in slendr"),
         linetype = guide_legend("slendr backend\nengine used")) +
  scale_x_continuous(breaks = c(1, seq(20, 2 * n_samples, 20)),
                     limits = c(1, 2 * n_samples))
png_file <- sprintf("afs_%s.png", Sys.info()["sysname"])
ggsave(png_file, p, width = 8, height = 5)
}

# make sure that the distributions as they were originally inspected and
# verified visually match the new distributions plot -- this is obviously not
# a rigorous test but the allele frequency spectra distributions from msprime
# and SLiM simulations match almost perfectly assuming we simulate large enough
# data to eliminate most of the nose
test_that("AFS distributions from SLiM and msprime simulations match", {
  afs <- afs %>% dplyr::mutate(sim = as.character(sim), model = as.character(model))
  if (RERUN) {
  current_tsv <- paste0(tempfile(), ".tsv.gz")
  readr::write_tsv(afs, current_tsv, progress = FALSE)
  }

  original_tsv <- sprintf("afs_%s.tsv.gz", Sys.info()["sysname"])
  if (RERUN) {
  readr::write_tsv(afs, original_tsv, progress = FALSE)
  }
  orig_afs <- readr::read_tsv(original_tsv, show_col_types = FALSE, progress = FALSE)

  expect_equal(afs, orig_afs)
})
bodkan/spannr documentation built on Dec. 19, 2024, 11:43 p.m.