sample.clade | R Documentation |
Generates occurrence times or time ranges (as most empirical fossil occurrences) for each of the desired species using a Poisson process. Allows for the Poisson rate to be (1) a constant, (2) a function of time, (3) a function of time and a time-series (usually environmental) variable, or (4) a vector of numbers (rates in a step function). Allows for age-dependent sampling with a parameter for a distribution representing the expected occurrence number over a species duration. Allows for further flexibility in rates by a shift times vector and environmental matrix parameters. Finally, allows for the simulation of trait-dependent fossil sampling when trait value information is supplied.
sample.clade(
sim,
rho,
tMax,
S = NULL,
envR = NULL,
rShifts = NULL,
returnTrue = TRUE,
returnAll = FALSE,
bins = NULL,
adFun = NULL,
...
)
sim |
A |
rho |
Sampling rate (per species per million years) over time. It can
be a |
tMax |
The maximum simulation time, used by |
S |
A vector of species numbers to be sampled. The default is all
species in |
envR |
A data frame containing time points and values of an
environmental variable, like temperature, for each time point. This will be
used to create a sampling rate, so |
rShifts |
Vector of rate shifts. First element must be the starting
time for the simulation ( |
returnTrue |
If set to |
returnAll |
If set to |
bins |
A vector of time intervals corresponding to geological time
ranges. It must be supplied if |
adFun |
A density function representing the age-dependent preservation model. It must be a density function, and consequently integrate to 1 (though this condition is not verified by the function). If not provided, a uniform distribution will be used by default. The function must also have the following properties:
|
... |
Additional parameters used by |
Optionally takes a vector of time bins representing geologic periods, if the user wishes occurrence times to be represented as a range instead of true points.
The age-dependent preservation function assumes that all extant
species at the end of the simulations have TE = 0
(i.e., the function
assumes all extant species got extinct exaclty when the simulation ended.
This might create distortion for some adFun
- especially in the case
of bell-shaped functions. As interpretations of what age-dependent
preservation mean to species alive at the end of the simulation, we
recommend users to implement their own preservation functions for the
species that are extant at the end of the simulation.
A data.frame
containing species names/numbers, whether each
species is extant or extinct, and the true occurrence times of each fossil,
a range of occurrence times based on bins
, or both.
Matheus Januario and Bruno do Rosario Petrucci.
# vector of times
time <- seq(10, 0, -0.1)
###
# we can start with a constant case
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# sampling rate
rho <- 2
# bins for fossil ranges
bins <- seq(from = 10, to = 0, by = -1)
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho, tMax = 10,
bins = bins, returnTrue = FALSE)
# draw simulation with fossil occurrences as ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
###
# sampling can be any function of time
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# sampling rate
rho <- function(t) {
return(2 - 0.15*t)
}
# plot sampling function
plot(x = time, y = rho(time), type = "l",
ylab = "Preservation rate",
xlab = "Time since the start of the simulation (My)")
# note for these examples we do not reverse time in the plot
# see other functions in the package for examples where we do
# bins for fossil ranges
bins <- seq(from = 10, to = 0, by = -1)
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho, tMax = 10,
bins = bins, returnTrue = FALSE)
# draw simulation with fossil occurrences as ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
###
# now we can try a step function rate
# not running because it takes a long time
## Not run:
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# we will use the less efficient method of creating a step function
# one could instead use ifelse()
# rates vector
rList <- c(2, 5, 0.5)
# rate shifts vector
rShifts <- c(0, 4, 8)
# make it a function so we can plot it
rho <- make.rate(rList, 10, rateShifts = rShifts)
# plot sampling function
plot(x = time, y = rho(time), type = "l",
ylab = "Preservation rate",
xlab = "Time since the start of the simulation (My)")
# bins for fossil ranges
bins <- seq(from = 10, to = 0, by = -1)
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho = rList, rShifts = rShifts, tMax = 10,
bins = bins, returnTrue = FALSE)
# draw simulation with fossil occurrences as ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
## End(Not run)
###
# finally, sample.clade also accepts an environmental variable
# get temperature data
data(temp)
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# rho will be temperature dependent
envR <- temp
# function describing environmental dependence
r_t <- function(t, env) {
return(((env) / 12) ^ 6)
}
# make it a function so we can plot it
rho <- make.rate(r_t, tMax = tMax, envRate = envR)
# plot sampling function
plot(x = time, y = rho(time), type = "l",
ylab = "Preservation rate",
xlab = "Time since the start of the simulation (My)")
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho = r_t, envR = envR, tMax = 10, bins = bins)
# now we record the true time of fossil occurrences
# draw simulation with fossil occurrences as time points
draw.sim(sim, fossils = fossils)
# note that any techniques used in examples for ?bd.sim to create more
# complex mixed scenarios can be used here as well
###
# sampling can also be age-dependent
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# sampling rate
rho <- 3
# here we will use the PERT function. It is described in:
# Silvestro et al 2014
# age-dependence distribution
# note that a and b define the beta distribution used, and can be modified
dPERT <- function(t, s, e, sp, a = 3, b = 3, log = FALSE) {
# check if it is a valid PERT
if (e >= s) {
message("There is no PERT with e >= s")
return(rep(NaN, times = length(t)))
}
# find the valid and invalid times
id1 <- which(t <= e | t >= s)
id2 <- which(!(t <= e | t >= s))
t <- t[id2]
# initialize result vector
res <- vector()
# if user wants a log function
if (log) {
# invalid times get -Inf
res[id1] <- -Inf
# valid times calculated with log
res[id2] <- log(((s - t) ^ 2) * ((-e + t) ^ 2) /
((s - e) ^ 5 * beta(a, b)))
}
# otherwise
else{
res[id1] <- 0
res[id2] <- ((s - t) ^ 2) * ((-e + t) ^ 2) / ((s - e) ^ 5 * beta(a, b))
}
return(res)
}
# plot it for an example species who lived from 10 to 5 million years ago
plot(time, rev(dPERT(t = time, s = 10, e = 5, a = 1)),
main = "Age-dependence distribution",
xlab = "Species age (My)", ylab = "Density",
xlim = c(0, 5), type = "l")
# bins for fossil ranges
bins <- seq(from = 10, to = 0, by = -1)
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho, tMax = 10, adFun = dPERT, bins = bins,
returnAll = TRUE)
# can use returnAll to get occurrences as both time points and ranges
# draw simulation with fossil occurrences as time points
draw.sim(sim, fossils = fossils)
# the warning is to let you know the ranges won't be used
# and also as ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
###
# we can have more parameters on adFun
# sampling rate
rho <- function(t) {
return(1 + 0.5*t)
}
# since here rho is time-dependent, the function finds the
# number of occurrences using rho, and their distribution
# using a normalized rho * adFun
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# here we will use the triangular distribution
# age-dependence distribution
dTRI <- function(t, s, e, sp, md) {
# make sure it is a valid TRI
if (e >= s) {
message("There is no TRI with e >= s")
return(rep(NaN, times = length(t)))
}
# another condition we must check
if (md < e | md > s) {
message("There is no TRI with md outside [s, e] interval")
return(rep(NaN, times = length(t)))
}
# needed to vectorize the function:
id1 <- which(t >= e & t < md)
id2 <- which(t == md)
id3 <- which(t > md & t <= s)
id4 <- which( !(1:length(t) %in% c(id1,id2,id3)))
# actually vetorizing function
res <- vector()
# (t >= e & t < md)
res[id1] <- (2*(t[id1] - e)) / ((s - e) * (md - e))
# (t == md)
res[id2] <- 2 / (s - e)
# (md < t & t <= s)
res[id3] <- (2*(s - t[id3])) / ((s - e) * (s - md))
# outside function's limits
res[id4] <- 0
return(res)
}
# set mode at 8
md <- 8
# plot it for an example species who lived from 10mya to the present
plot(time, rev(dTRI(time, 10, 5, 1, md)),
main = "Age-dependence distribution",
xlab = "Species age (My)", ylab = "Density",
xlim = c(0, 5), type = "l")
# bins for fossil ranges
bins <- seq(from = 10, to = 0, by = -1)
# simulate fossil occurrences for the first species
fossils <- sample.clade(sim, rho, tMax = 10, S = 1, adFun = dTRI,
bins = bins, returnTrue = FALSE, md = md)
# note we provide the peak for the triangular sampling as an argument
# here that peak is assigned in absolute geological, but
# it usually makes more sense to express this in terms
# of age (a given percentile of the age, for instance) - see below
# draw simulation with fossil occurrences as ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
###
# we can also have a hat-shaped increase through the duration of a species
# with more parameters than TS and TE, but with the parameters relating to
# the relative age of each lineage
# sampling rate
rho <- function(t) {
return(1 + 0.1*t)
}
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# age-dependence distribution, with the "mde" of the triangle
# being exactly at the last quarter of the duration of EACH lineage
dTRImod1 <- function(t, s, e, sp) {
# note that now we don't have the "md" parameter here,
# but it is calculated inside the function
# check if it is a valid TRI
if (e >= s) {
message("There is no TRI with e >= s")
return(rep(NaN, times = length(t)))
}
# calculate md
md <- ((s - e) / 4) + e
# md is at the last quarter of the duration of the lineage
# please note that the same logic can be used to sample parameters
# internally in the function, running for instance:
# md <- runif (n = 1, min = e, max = s)
# check it is a valid md
if (md < e | md > s) {
message("There is no TRI with md outside [s, e] interval")
return(rep(NaN, times = length(t)))
}
# needed to vectorize function
id1 <- which(t >= e & t < md)
id2 <- which(t == md)
id3 <- which(t>md & t <= s)
id4 <- which( !(1:length(t) %in% c(id1,id2,id3)))
# vectorize the function
res<-vector()
res[id1] <- (2 * (t[id1] - e)) / ((s - e) * (md - e))
res[id2] <- 2 / (s - e)
res[id3] <- (2 * (s - t[id3])) / ((s - e) * (s - md))
res[id4] <- 0
return(res)
}
# plot for a species living between 10 and 0 mya
plot(time, rev(dTRImod1(time, 10, 0, 1)),
main = "Age-dependence distribution",
xlab = "Species age (My)", ylab = "Density",
xlim = c(0, 10), type = "l")
# sample first two species
fossils <- sample.clade(sim = sim, rho = rho, tMax = 10, adFun = dTRImod1)
# draw simulation with fossil occurrences as time points
draw.sim(sim, fossils = fossils)
# here, we fix md at the last quarter
# of the duration of the lineage
###
# the parameters of adFun can also relate to each specific lineage,
# which is useful when using variable parameters for each species
# to keep track of those parameters after the sampling is over
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# sampling rate
rho <- 3
# get the par and par1 vectors
# get mins vector
minsPar <- ifelse(is.na(sim$TE), 0, sim$TE)
# a random time inside each species' duration
par <- runif(n = length(sim$TE), min = minsPar, max = sim$TS)
# its complement to the middle of the lineage's age.
par1 <- (((sim$TS - minsPar) / 2) + minsPar) - par
# note that the interaction between these two parameters creates a
# deterministic parameter, but inside the function one of them ("par")
# is a random parameter
dTRImod2 <- function(t, s, e, sp) {
# make sure it is a valid TRI
if (e >= s) {
message("There is no TRI with e >= s")
return(rep(NaN, times = length(t)))
}
# md depends on parameters
md <- par[sp] + par1[sp]
# check that md is valid
if (md < e | md > s) {
message("There is no TRI with md outside [s, e] interval")
return(rep(NaN, times = length(t)))
}
id1 <- which(t >= e & t < md)
id2 <- which(t == md)
id3 <- which(t > md & t <= s)
id4 <- which(!(1:length(t) %in% c(id1,id2,id3)))
res <- vector()
res[id1] <- (2*(t[id1] - e)) / ((s - e) * (md - e))
res[id2] <- 2 / (s - e)
res[id3] <- (2*(s - t[id3])) / ((s - e) * (s - md))
res[id4] <- 0
return(res)
}
# plot for a species living between 10 and 0 mya
plot(time, rev(dTRImod2(time, 10, 0, 1)),
main = "Age-dependence distribution",
xlab = "Species age (My)", ylab = "Density",
xlim = c(0, 10), type = "l")
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho, tMax = 10, adFun = dTRImod2, bins = bins,
returnTrue = FALSE)
# draw simulation with fossil occurrences as time ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
###
# we can also have a mix of age-independent and age-dependent
# sampling in the same simulation
# set seed
set.seed(1)
# simulate a group
sim <- bd.sim(n0 = 1, lambda = 0.1, mu = 0.1, tMax = 10)
# sampling rate
rho <- 7
# define a uniform to represente age-independence
# age-dependence distribution (a uniform density distribution in age)
# in the format that the function needs
custom.uniform <- function(t, s, e, sp) {
# make sure it is a valid uniform
if (e >= s) {
message("There is no uniform function with e >= s")
return(rep(NaN, times = length(t)))
}
res <- dunif(x = t, min = e, max = s)
return(res)
}
# PERT as above
dPERT <- function(t, s, e, sp, a = 3, b = 3, log = FALSE) {
# check if it is a valid PERT
if (e >= s) {
message("There is no PERT with e >= s")
return(rep(NaN, times = length(t)))
}
# find the valid and invalid times
id1 <- which(t <= e | t >= s)
id2 <- which(!(t <= e | t >= s))
t <- t[id2]
# initialize result vector
res <- vector()
# if user wants a log function
if (log) {
# invalid times get -Inf
res[id1] <- -Inf
# valid times calculated with log
res[id2] <- log(((s - t) ^ 2) * ((-e + t) ^ 2) /
((s - e) ^ 5 * beta(a, b)))
}
# otherwise
else{
res[id1] <- 0
res[id2] <- ((s - t) ^ 2) * ((-e + t) ^ 2) / ((s - e) ^ 5 * beta(a, b))
}
return(res)
}
# actual age-dependency defined by a mix
dPERTAndUniform <- function(t, s, e, sp) {
return(
ifelse(t > 5, custom.uniform(t, s, e, sp),
dPERT(t, s, e, sp))
)
}
# starts out uniform, then becomes PERT
# after 5my (in absolute geological time)
# plot it for an example species who lived from 10 to 0 million years ago
plot(time, rev(dPERTAndUniform(time, 10, 0, 1)),
main = "Age-dependence distribution",
xlab = "Species age (My)", ylab = "Density",
xlim = c(0, 10), type = "l")
# bins for fossil ranges
bins <- seq(from = 10, to = 0, by = -1)
# simulate fossil occurrences data frame
fossils <- sample.clade(sim, rho, tMax = 10, adFun = dPERTAndUniform,
bins = bins, returnTrue = FALSE)
# draw simulation with fossil occurrences as ranges
draw.sim(sim, fossils = fossils, fossilsFormat = "ranges")
# note how occurrences cluster close to the speciation time of
# species 1, but not its extinction time, since around 5mya
# the PERT becomes the effective age-dependence distribution
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.