inst/doc/vignette-11-extensions.R

## ----include = FALSE----------------------------------------------------------
run_vignette <- Sys.getenv("RUNNER_OS") == "" && slendr::check_dependencies(python = TRUE, slim = TRUE, quit = FALSE) 

knitr::opts_chunk$set(
  collapse = FALSE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  dpi = 60,
  eval = run_vignette
)

RERUN <- FALSE

## ----collapse = TRUE, message = FALSE-----------------------------------------
library(slendr)
init_env()

library(dplyr)
library(ggplot2)
library(readr)

seed <- 42
set.seed(seed)

## ----eval=FALSE---------------------------------------------------------------
#  afr <- population("AFR", time = 100000, N = 20000)
#  eur <- population("EUR", time = 60000, N = 2000, parent = afr)
#  
#  # <... compile model and simulate a tree sequence `ts` ...>

## ----eval=FALSE---------------------------------------------------------------
#  # compute heterozygosity in the individual "EUR"
#  ts_diversity(ts, "EUR_1")
#  
#  # compute genetic divergence between selected Africans and Europeans
#  afr_samples <- c("AFR_1", "AFR_2", "AFR_3")
#  eur_samples <- c("EUR_1", "EUR_2", "EUR_3")
#  ts_divergence(ts, list(afr = afr_samples, eur = eur_samples))

## ----eval=FALSE---------------------------------------------------------------
#  afr <- population("AFR", time = 90000, N = 20000)
#  eur <- population("EUR", time = 60000, N = 2000, parent = afr)

## ----eval=FALSE---------------------------------------------------------------
#  model <- compile_model(populations = list(afr, eur), generation_time = 30)
#  
#  slim(model, sequence_length = 100000, recombination_rate = 1e-8, method = "gui")

## ----eval=FALSE---------------------------------------------------------------
#  pop <- population("pop", time = 1, N = 1000)
#  simple_model <- compile_model(pop, generation_time = 1, simulation_length = 1000)
#  
#  slim(simple_model, sequence_length = 1000, recombination_rate = 0, method = "gui")

## ----eval=FALSE---------------------------------------------------------------
#  pop <- population("pop", time = 1, N = 1000)
#  simple_model <- compile_model(pop, generation_time = 1, simulation_length = 1000)
#  
#  slim(simple_model, sequence_length = 1000, recombination_rate = 0, burnin = 100, method = "gui")

## -----------------------------------------------------------------------------
library(slendr)
init_env()

# African ancestral population
afr <- population("AFR", time = 90000, N = 3000)

# first migrants out of Africa
ooa <- population("OOA", parent = afr, time = 60000, N = 500, remove = 23000) %>%
  resize(N = 2000, time = 40000, how = "step")

# Eastern hunter-gatherers
ehg <- population("EHG", parent = ooa, time = 28000, N = 1000, remove = 6000)

# European population
eur <- population("EUR", parent = ehg, time = 25000, N = 5000)

# Anatolian farmers
ana <- population("ANA", time = 28000, N = 3000, parent = ooa, remove = 4000)

# Yamnaya steppe population
yam <- population("YAM", time = 8000, N = 500, parent = ehg, remove = 2500)

# define gene-flow events
gf <- list(
  gene_flow(from = ana, to = yam, rate = 0.4, start = 7900, end = 7800),
  gene_flow(from = ana, to = eur, rate = 0.5, start = 6000, end = 5000),
  gene_flow(from = yam, to = eur, rate = 0.65, start = 4000, end = 3500)
)

## ----model_plot, echo=FALSE---------------------------------------------------
model_for_plot <- compile_model(
  populations = list(afr, ooa, ehg, eur, ana, yam),
  gene_flow = gf, generation_time = 30
)

plot_model(model_for_plot, order = c("EUR", "EHG", "YAM", "ANA", "OOA", "AFR"), proportions = TRUE)

## -----------------------------------------------------------------------------
extension_path <- system.file("extdata", "extension_trajectory.txt", package = "slendr")

## ----echo=FALSE---------------------------------------------------------------
Sys.setenv(EXTENSION1 = extension_path)

## -----------------------------------------------------------------------------
model <- compile_model(
  populations = list(afr, ooa, ehg, eur, ana, yam),
  gene_flow = gf, generation_time = 30,
  extension = extension_path  # <--- include the SLiM extension snippet
)

## ----eval=FALSE---------------------------------------------------------------
#  slim(model, sequence_length = 1e6, recombination_rate = 0, ts = FALSE, method = "gui")

## ----echo=FALSE---------------------------------------------------------------
slim(model, sequence_length = 1e6, recombination_rate = 0, ts = FALSE, path = tempdir())

## -----------------------------------------------------------------------------
extension_path <- system.file("extdata", "extension_trajectory_params.txt", package = "slendr")

## ----echo=FALSE---------------------------------------------------------------
Sys.setenv(EXTENSION = extension_path)

## -----------------------------------------------------------------------------
extension <- substitute_values(
  extension_path,
  s = 0.1, onset_time = 15000,
  origin_pop = "EUR", target_pop = "EUR"
)

## ----eval = FALSE-------------------------------------------------------------
#  extension <- substitute_values(
#    extension_path,
#    onset_time = 15000,
#    origin_pop = "EUR", target_pop = "EUR"
#  )
#  
#  # Error: The extension script contains the following unsubstituted patterns: {{s}}

## -----------------------------------------------------------------------------
model <- compile_model(
  populations = list(afr, ooa, ehg, eur, ana, yam),
  gene_flow = gf, generation_time = 30,
  extension = extension
)

## ----echo = FALSE-------------------------------------------------------------
slim(model, sequence_length = 1e6, recombination_rate = 1e-8, ts = FALSE, path = tempdir())

## ----eval = FALSE-------------------------------------------------------------
#  slim(model, sequence_length = 1e6, recombination_rate = 0, path = tempdir(), ts = FALSE, random_seed = 42)

## -----------------------------------------------------------------------------
run_model <- function(origin_pop, onset_time) {
  extension <- substitute_values(
    extension_path,
    s = 0.1, onset_time = onset_time,
    origin_pop = origin_pop, target_pop = "EUR"
  )
  
  model <- compile_model(
    populations = list(afr, ooa, ehg, eur, ana, yam),
    gene_flow = gf, generation_time = 30,
    extension = extension
  )
  
  slim(model, sequence_length = 1e6, recombination_rate = 0,
       path = tempdir(), ts = FALSE, random_seed = 42)
}

run_model(origin_pop = "EUR", onset_time = 15000)
run_model(origin_pop = "ANA", onset_time = 15000)
run_model(origin_pop = "EHG", onset_time = 15000)
run_model(origin_pop = "YAM", onset_time = 8000)

## ----trajectories-------------------------------------------------------------
load_traj <- function(origin_pop) {
  df <- read.table(paste0(tempdir(), "/traj_EUR_", origin_pop, ".tsv"), header = TRUE)
  df$origin <- origin_pop
  df$target <- "EUR"
  df
}

traj <- rbind(load_traj("EUR"), load_traj("ANA"), load_traj("EHG"), load_traj("YAM"))

library(ggplot2)

ggplot(traj) +
  geom_line(aes(time, freq_target, linetype = "EUR"), color = "black") +
  geom_line(aes(time, freq_origin, color = origin), linetype = "dashed") +
  xlim(15000, 0) +
  labs(title = "Allele frequency in EUR given the origin in another population",
       x = "years before present", y = "allele frequency",
       color = "frequency\nin original\npopulation",
       linetype = "frequency\nin target\npopulation") +
  scale_linetype_manual(values = c("solid", "dashed")) +
  facet_wrap(~ origin)

## -----------------------------------------------------------------------------
# extension "template" provided as a single string (this contains the same code
# as the script used just above, except specified directly in R)
extension_template <- r"(
// Because we want to simulate non-neutral evolution, we have to provide a
// custom initialization callback -- slendr will use it to replace its default
// neutral genomic architecture (i.e. the initialize() {...} callback it uses
// by default for neutral simulations). Note that we can refer to slendr's
// constants SEQUENCE_LENGTH and RECOMBINATION_RATE, which will carry values
// passed through from R via slendr's slim() R function.
initialize() {
    initializeMutationType("m1", 0.5, "f", 0.0);

    initializeGenomicElementType("g1", m1, 1.0);
    initializeGenomicElement(g1, 0, SEQUENCE_LENGTH - 1);

    initializeMutationRate(0);
    initializeRecombinationRate(RECOMBINATION_RATE);
}

// Define model constants (to be substituted) all in one place
// (each {{placeholder}} will be replaced by a value passed from R).
// Note that string constant template patterns are surrounded by "quotes"!
initialize() {
    defineConstant("s", {{s}});
    defineConstant("onset_time", {{onset_time}});
    defineConstant("target_pop", "{{target_pop}}");
    defineConstant("origin_pop", "{{origin_pop}}");

    // compose the path to a trajectory file based on given parameters
    defineConstant("traj_file",
                   PATH + "/" + "traj_" + target_pop + "_" + origin_pop + ".tsv");
}

function (void) add_mutation(void) {
    // sample one target carrier of the new mutation...
    target = sample(population(origin_pop).genomes, 1);
    // ... and add the mutation in the middle of it
    mut = target.addNewMutation(m1, s, position = asInteger(SEQUENCE_LENGTH / 2));

    // save the mutation for later reference
    defineGlobal("MUTATION", mut);

    write_log("adding beneficial mutation to population " + origin_pop);

    writeFile(traj_file, "tick\ttime\tfreq_origin\tfreq_target");
}

tick(onset_time) late() {
    // save simulation state in case we need to restart if the mutation is lost
    save_state();

    add_mutation();
}

tick(onset_time):SIMULATION_END late() {
    // the mutation is not segregating and is not fixed either -- we must restart
    if (!MUTATION.isSegregating & !MUTATION.isFixed) {
        write_log("mutation lost -- restarting");

        reset_state();

        add_mutation();
    }

    // compute the frequency of the mutation of interest and save it (if the
    // mutation is missing at this time, save its frequency as NA)
    freq_origin = "NA";
    freq_target = "NA";
    if (population(origin_pop, check = T))
      freq_origin = population(origin_pop).genomes.mutationFrequenciesInGenomes();
    if (population(target_pop, check = T))
      freq_target = population(target_pop).genomes.mutationFrequenciesInGenomes();

    writeFile(traj_file,
              community.tick + "\t" +
              model_time(community.tick) + "\t" +
              freq_origin + "\t" +
              freq_target, append = T);
}
)"

## -----------------------------------------------------------------------------
run_model <- function(origin_pop, onset_time) {
  extension <- substitute_values(
    extension_template, # <--- template SLiM code string directly (not as a file!)
    s = 0.1, onset_time = onset_time,
    origin_pop = origin_pop, target_pop = "EUR"
  )
  
  model <- compile_model(
    populations = list(afr, ooa, ehg, eur, ana, yam),
    gene_flow = gf, generation_time = 30,
    extension = extension
  )
  
  slim(model, sequence_length = 1e6, recombination_rate = 0,
       path = tempdir(), ts = FALSE, random_seed = 42)
}

run_model("EUR", onset_time = 15000)

head(load_traj("EUR"))

## -----------------------------------------------------------------------------
# create the ancestor of everyone and a chimpanzee outgroup
# (we set both N = 1 to reduce the computational time for this model)
chimp <- population("CH", time = 6.5e6, N = 1000)

# two populations of anatomically modern humans: Africans and Europeans
afr <- population("AFR", parent = chimp, time = 6e6, N = 10000)
eur <- population("EUR", parent = afr, time = 70e3, N = 5000)

# Neanderthal population splitting at 600 ky ago from modern humans
# (becomes extinct by 40 ky ago)
nea <- population("NEA", parent = afr, time = 600e3, N = 1000, remove = 40e3)

# 5% Neanderthal introgression into Europeans between 55-50 ky ago
gf <- gene_flow(from = nea, to = eur, rate = 0.05, start = 55000, end = 45000)

model <- compile_model(
  populations = list(chimp, nea, afr, eur), gene_flow = gf,
  generation_time = 30
)

## ----introgression_model, echo=FALSE, fig.width=6, fig.height=4---------------
cowplot::plot_grid(
  plot_model(model, sizes = FALSE, proportions = TRUE),
  plot_model(model, sizes = FALSE, log = TRUE, proportions = TRUE),
  nrow = 1
)

## ----echo=FALSE---------------------------------------------------------------
eur <- population("EUR", time = 100e3, N = 10000)
nea <- population("NEA", time = 100e3, N = 1000, remove = 40000)

gf <- gene_flow(from = nea, to = eur, rate = 0.05, start = 55000, end = 54500)

## -----------------------------------------------------------------------------
extension <- r"(
initialize() {
  // model parameters to be substitute_values()'d from R below
  defineConstant("gene_length", {{gene_length}});
  defineConstant("n_genes", {{n_genes}});
  defineConstant("n_markers", {{n_markers}});
  defineConstant("introgression_time", {{introgression_time}});
  defineConstant("freq_file", PATH + "/{{freq_file}}");

  // total length of the genome to be simulated
  defineConstant("total_length", n_genes * gene_length);

  // positions of neutral Neanderthal markers along the genome
  defineConstant("neutral_pos", seq(0, total_length - 1, by = gene_length / n_markers));
  // positions of deleterious mutations in the center of each gene
  defineConstant("selected_pos", seq(gene_length / 2, total_length - 1, by = gene_length));
}

// Because we want to simulate non-neutral evolution, we have to provide a
// custom initialization callback -- slendr will use it to replace its default
// neutral genomic architecture (i.e. the initialize() {...} callback it uses
// by default for neutral simulations).
initialize() {
  initializeMutationType("m0", 0.5, "f", 0.0); // neutral Neanderthal marker mutation type
  initializeMutationType("m1", 0.5, "f", {{s}}); // deleterious Neanderthal mutation type
  initializeGenomicElementType("g1", m0, 1.0); // genomic type of 'genes'

  genes = c(); // gene start-end coordinates
  breakpoints = c(); // recombination breakpoints
  rates = c(); // recombination rates

  // compute coordinates of genes, as well as the recombination map breakpoints
  // between each gene
  start = 0;
  for (i in seqLen(n_genes)) {
    // end of the next gene
    end = start + gene_length - 1;
    genes = c(genes, start, end);

    // uniform recombination within a gene, followed by a 0.5 recombination breakpoint
    rates = c(rates, RECOMBINATION_RATE, 0.5);
    breakpoints = c(breakpoints, end, end + 1);

    // start of the following genes
    start = end + 1;
  }

  // odd elements --> starts of genes
  gene_starts = integerMod(seqAlong(genes), 2) == 0;
  // even elements --> ends of genes
  gene_ends = integerMod(seqAlong(genes), 2) == 1;

  // set up all the genes at once
  initializeGenomicElement(g1, genes[gene_starts], genes[gene_ends]);

  // set up the recombination map
  initializeRecombinationRate(rates, breakpoints);

  // no mutation rate (we will add neutral variation after the SLiM run)
  initializeMutationRate(0);
}

// Add Neanderthal-specific mutations one tick prior to the introgression
tick(introgression_time) - 1 late() {
  // get all Neanderthal chromosomes just prior to the introgression
  target = population("NEA").genomes;

  write_log("adding neutral Neanderthal markers");

  mutations = target.addNewDrawnMutation(m0, position = asInteger(neutral_pos));
  defineConstant("MARKERS", target.mutations);

  write_log("adding deleterious Neanderthal mutation");

  target.addNewDrawnMutation(m1, position = asInteger(selected_pos));
}

// At the end, write Neanderthal ancestry proportions along the genome to a file
// (in our R analysis code we will compare the results of this with the
// equivalent computation using a tree sequence)
SIMULATION_END late() {
  df = DataFrame(
    "gene", asInteger(MARKERS.position / gene_length),
    "pos", MARKERS.position,
    "freq", sim.mutationFrequencies(population("EUR"), MARKERS)
  );
  writeFile(freq_file, df.serialize(format = "tsv"));
}
)" %>%
  substitute_values(
    introgression_time = 55000, s = -0.003,
    gene_length = 5e6, n_genes = 200, n_markers = 100,
    freq_file = "frequencies.tsv"
  )

## -----------------------------------------------------------------------------
model <- compile_model(
  populations = list(nea, eur), gene_flow = gf,
  generation_time = 30,
  extension = extension
)

## -----------------------------------------------------------------------------
nea_samples <- schedule_sampling(model, times = 50000, list(nea, 1))
modern_samples <- schedule_sampling(model, times = 0, list(eur, 100))

samples <- rbind(nea_samples, modern_samples)

## ----echo=FALSE, eval = RERUN & run_vignette----------------------------------
#  x = Sys.time()
#  data_dir <- "~/Projects/introgression_data/"
#  slim(model, recombination_rate = 1e-8, samples = samples, path = data_dir)
#  y = Sys.time()
#  y - x # Time difference of 45.62365 mins

## ----eval=FALSE---------------------------------------------------------------
#  slim(model, recombination_rate = 1e-8, samples = samples, path = "~/Projects/introgression_data/")

## -----------------------------------------------------------------------------
n_genes <- 200
gene_length <- 5e6
window_length <- 100e3

## ----eval = run_vignette & file.exists("~/Projects/introgression_data/frequencies.tsv")----
freqs <- read_tsv("~/Projects/introgression_data/frequencies.tsv")

## ----introgression_freqs, eval = run_vignette & file.exists("~/Projects/introgression_data/frequencies.tsv")----
freqs %>%
mutate(pos = pos %% 5e6) %>%
group_by(pos) %>%
summarise(freq = 100 * mean(freq)) %>%
ggplot() +
  geom_line(aes(pos, freq)) +
  geom_vline(xintercept = gene_length / 2, linetype = 2) +
  labs(x = "coordinate along a gene", y = "Neanderthal ancestry proportion [%]",
       title = "Proportion of Neanderthal ancestry in Europeans along 5Mb independent genes",
       subtitle = "(dashed line indicates introgressed deleterious Neanderthal allele)") +
  coord_cartesian(ylim = c(0, 3))

## ----eval = run_vignette & file.exists("~/Projects/introgression_data/slim.trees")----
# load a tree sequence and extract the names of recorded individuals
ts <- ts_read(file = "~/Projects/introgression_data/slim.trees", model)
samples <- ts_names(ts, split = "pop")

samples

# compute coordinates of sliding windows along the genome
windows <- seq(from = 0, to = n_genes * gene_length - 1, by = window_length)

head(windows)
tail(windows)

# compute divergence from the tree sequence in each window separately
divergence <- ts_divergence(ts, samples, windows = windows, mode = "branch")$divergence[[1]]

## ----eval = run_vignette & file.exists("~/Projects/introgression_data/slim.trees")----
# compute average divergence at each position in a gene, and a 95% C.I.
div_df <- tibble(
  pop = "eur",
  pos = windows %% gene_length,
  div = divergence
) %>%
  group_by(pop, pos) %>%
  summarise(
    mean = mean(div), n = n(), std = sd(div),
    ci_low = mean - 2 * std / sqrt(n),
    ci_up = mean + 2 * std / sqrt(n)
  ) 

## ----introgression_divergence, eval = run_vignette & file.exists("~/Projects/introgression_data/slim.trees")----
ggplot(div_df) +
  geom_ribbon(aes(pos, ymin = ci_low, ymax = ci_up), fill = "grey70") +
  geom_line(aes(pos, mean)) +
  geom_vline(aes(xintercept = gene_length / 2), linetype = 2) +
  labs(x = "coordinate along a gene", y = "divergence to Neanderthal",
       title = "Divergence of Europeans to a Neanderthal genome along 5Mb independent genes",
       subtitle = "(dashed line indicates introgressed deleterious Neanderthal allele)")

Try the slendr package in your browser

Any scripts or data that you put into this service are public.

slendr documentation built on April 12, 2025, 1:34 a.m.