#' @title The Structured Wright Fisher Model for IBD
#' @param pos vector; the genomic coordinates for chromosome and position of the sites
#' @param N integer vector; The number of individuals to consider in each deme
#' @param m numeric numeric; Probability of internal migration where m represents
#' the probability of moving from host_{origin} to host_{new} by m*(1-1/N) of each deme
#' @param mean_coi numeric vector; The lambda of a right-shifted Poisson process,
#' 1 + Pos(lambda) representing the average COI of each deme
#' @param migr_mat numeric matrix; Migrations rates or probabilities between
#' destination and origin. Note, because this is a Wright-Fisher model, we are
#' drawing parents and therefore migration matrix is parameterized towards
#' "where one came from" versus "where one is headed": origin specified as
#' columns and destination in rows. Default value of 1 indicates non-spatial
#' model. Note, if using a probability matrix, rows must sum to 1
#' (valid marginal probability); otherwise, values will be assumed to be rates
#' and converted to probabilities.
#' @param rho numeric; expected recombination rate
#' @param tlim numeric; the maximum number of generations to consider before
#' exiting gracefully if all samples have not coalesced
#' @param verbose boolean
#' @return Returns a list of length six that contains \enumerate{
#' \item pos: The simulated genetic coordinates
#' \item coi: The COI of each individual
#' \item recomb: A recombination list of length of tlim where each element contains
#' the recombination block -- as a boolean -- of the two parental haplotypes.
#' \item parent_host1: the parental host assignments for the "paternal" haplotype
#' \item parent_host1: the parental host assignments for the "maternal" haplotype
#' \item parent_haplo1 "paternal" haplotype assigment (as above)
#' \item parent_haplo2 "maternal" haplotype assigment (as above)
#' }
#'
#' @description Simulate a population forwards with recombination that approximates
#' the Structured Wright Fisher Process and tracks haplotype identity by descent
#' where individuals represent demes, such that within a deme individual-level COI is
#' considered. The model is also extended to consider spatial demes that individual hosts
#' can move between.
#' @details Demes are assumed to be ordered throughout (i.e. the order needs to be
#' consistent between N, m, mean_coi, and the rows and columns of the migration matrix).
#' @details The migration matrix is assumed to be a distance matrix that is either
#' a rate or a probability. The program will coerce the matrix into a probability
#' distribution between origin and destination based on the row-sums.
#' @details This function is intended to be fed into the [polySimIBD::get_arg]
#' function to summarize the simulation results,
#'
#' @export
sim_swf <- function(pos, N, m, rho, mean_coi, tlim,
migr_mat = 1,
verbose = FALSE){
# assertions
goodegg::assert_vector(pos)
goodegg::assert_numeric(pos)
goodegg::assert_increasing(pos)
if (length(migr_mat) == 1) {
goodegg::assert_eq(migr_mat, 1,
message = "Distance matrix can be set to 1 to indicate a non-spatial model. Otherwise, needs to be a matrix")
goodegg::assert_length(N, 1,
message = "If non-spatial model considered, deme size is of length 1")
goodegg::assert_length(m, 1,
message = "If non-spatial model considered, internal migration rates is of length 1")
goodegg::assert_length(mean_coi, 1,
message = "If non-spatial model considered, mean coi is of length 1")
} else {
goodegg::assert_matrix(migr_mat)
goodegg::assert_eq(ncol(migr_mat), length(N),
message = "Migration matrix and deme sizes must be of same relative dimensions. In other words, you need to specify a deme size for every deme")
goodegg::assert_eq(ncol(migr_mat), length(m),
message = "Migration matrix and internal migration rates must be of same relative dimensions. In other words, you need to specify a deme size for internal migration")
goodegg::assert_eq(ncol(migr_mat), length(mean_coi),
message = "Migration matrix and mean COI must be of same relative dimensions. In other words, you need to specify a mean COI for every deme")
}
goodegg::assert_pos_int(N, zero_allowed = FALSE)
goodegg::assert_bounded(m, left = 0, right = 1)
goodegg::assert_bounded(rho, left = 0, right = 1, inclusive_left = FALSE, inclusive_right = FALSE)
goodegg::assert_pos(mean_coi, zero_allowed = FALSE)
goodegg::assert_single_pos_int(tlim, zero_allowed = FALSE)
# precalculate probability of an odd number of recombination events between
# any interval
odd_prob <- exp(-rho*diff(pos))*sinh(rho*diff(pos))
if (any(!is.finite(odd_prob))) { # catch underflow issue
odd_prob[is.nan(odd_prob)] <- 0.5
}
# calculate migration rates and probability from distance matrix
if (is.matrix(migr_mat)) {
if(any(rowSums(migr_mat) != 1)) { # if not probability, assume that it is rates
migr_mat <- 1 - exp(-migr_mat)
migr_mat <- migr_mat/rowSums(migr_mat)
}
# ensure for Cpp that psum is 1 (this is needed for `sample1` fxn) and that above worked correctly
if (is.matrix(migr_mat)) {
if ( any(round(rowSums(migr_mat), 4) != 1) ) { # add small margine of tolerance
stop("Migration Matrix must sum to 1 to be properly passed to internal Cpp `sample` fxn for psum argument")
}
}
# split out for cpp import
migr_mat <- split(migr_mat, 1:nrow(migr_mat))
migr_mat <- unname(migr_mat)
} else {
migr_mat <- 1
}
# define argument list
args <- list(pos = pos,
maxN = max(N),
demecnt = length(N),
mig_mat_prob = migr_mat,
N = N,
m = m,
mean_coi = mean_coi,
tlim = tlim,
odd_prob = odd_prob)
# run efficient C++ function
ret <- sim_swf_cpp(args)
# keep pos around for ease
pos <- list(pos = pos)
ret <- append(pos, ret)
# return as custom class
class(ret) <-"swfsim"
return(ret)
}
#' @title Get ancestral recombination graph from forward simulations
#' @description Given an object \code{swf}, which is the result of forward
#' simulation using the function \code{sim_swf()}, walk backwards through the
#' ancestry and calculate the coalescent tree at every locus for the
#' specified hosts and/or haplotypes.
#' @param swf result of forwards simulation using the function \code{sim_swf()}
#' @param host_index a vector of target hosts. Defaults to all hosts
#' @param haplo_index a list of target haplotypes within the hosts specified by
#' \code{host_index}. Defaults to all haplotypes within the specified hosts.
#' @importFrom methods new
#' @export
get_arg <- function(swf, host_index = NULL, haplo_index = NULL) {
# check inputs and define defaults
goodegg::assert_class(swf, "swfsim")
if (is.null(host_index)) {
host_index <- 1:length(swf$coi)
}
if (is.null(haplo_index)) {
haplo_index <- mapply(function(x) 1:x, swf$coi[host_index], SIMPLIFY = FALSE)
}
goodegg::assert_vector(host_index)
goodegg::assert_pos_int(host_index, zero_allowed = FALSE)
goodegg::assert_list(haplo_index)
goodegg::assert_same_length(host_index, haplo_index)
goodegg::assert_pos_int(unlist(haplo_index), zero_allowed = FALSE)
# define arguments
args <- c(swf, list(host_index = rep(host_index, mapply(length, haplo_index)) - 1,
haplo_index = unlist(haplo_index) - 1))
# pass to efficient C++ function
output_raw <- get_arg_cpp(args)
# create a list of bvtrees
L <- length(output_raw$coalesce_target)
ARG <- lapply(1:L, function(x) return(new("bvtree")))
for (l in 1:L) {
ARG[[l]]@c = output_raw$coalesce_target[[l]]
ARG[[l]]@t = output_raw$coalesce_time[[l]]
t_temp <- ARG[[l]]@t
t_temp[which(t_temp == -1)] <- NA
ARG[[l]]@z = order(t_temp) - 1
}
# custom class
class(ARG) <- "argraph"
return(ARG)
}
#' @title Subset an object of class bvtree
#' @description Given a bvtree and a vector of indices \code{s}, creates a new
#' tree which is a subset of the original tree focusing only on the elements
#' \code{s}.
#' @param bvtree an object of class "bvtree"
#' @param s a vector specifying which elements in the bvtree to focus on
#' @export
subset_bvtree <- function(bvtree, s) {
# check inputs
goodegg::assert_class(bvtree, "bvtree")
goodegg::assert_vector(s)
goodegg::assert_gr(length(s), 1)
goodegg::assert_noduplicates(s)
goodegg::assert_pos_int(s, zero_allowed = FALSE)
goodegg::assert_leq(s, length(bvtree@c))
# create mask vector
m <- rep(0, length(bvtree@c))
m[s] <- 1
# run efficient C++ function
output_raw <- subset_bvtree_cpp(bvtree@c, bvtree@t, m)
# update tree
bvtree@c <- output_raw$c[s]
bvtree@t <- output_raw$t[s]
# update indices
bvtree@c <- c(-1, (1:length(s))-1)[match(bvtree@c, c(-1,s-1))]
# update z
tmp <- bvtree@t
tmp[tmp == -1] <- NA
bvtree@z <- order(tmp) - 1
return(bvtree)
}
#' @title Get Between-Host Identity by Descent from forward simulations
#' @description Given an object \code{swf}, which is the result of forward
#' simulation using the function \code{sim_swf()}, walks backwards through the
#' ancestry and calculate the between host identity by descent. The IBD
#' calculation is based on \cite{Verity et. al 2020, Nat Comms, PMC7192906} and
#' assumes relatedness if there are any IBD among the haplotypes between hosts
#' at a given loci (i.e. a loci is considered to be in IBD if there is any between
#' host IBD among the strains, regardless of COI. This means that as COI increases,
#' IBD may be overestimated, which has been shown to be a conservative estimand).
#' @inheritParams get_arg
#' @importFrom methods new
#' @export
get_bvibd <- function(swf, host_index = NULL, haplo_index = NULL) {
# check inputs and define defaults
goodegg::assert_class(swf, "swfsim")
if (is.null(host_index)) {
host_index <- 1:length(swf$coi)
}
if (is.null(haplo_index)) {
haplo_index <- mapply(function(x) 1:x, swf$coi[host_index], SIMPLIFY = FALSE)
}
goodegg::assert_vector(host_index)
goodegg::assert_pos_int(host_index, zero_allowed = FALSE)
goodegg::assert_list(haplo_index)
goodegg::assert_same_length(host_index, haplo_index)
goodegg::assert_pos_int(unlist(haplo_index), zero_allowed = FALSE)
# define arguments
args <- c(swf, list(host_index = rep(host_index, mapply(length, haplo_index)) - 1,
haplo_index = unlist(haplo_index) - 1))
# pass to efficient C++ function
output_raw <- get_bvibd_cpp(args)
numerator <- output_raw$ibd_target[-1]
# under SNP vs PSMC (Li/Durbin model) don't know begin and end, so treat as missing info - ie burn first loci
wi <- diff(swf$pos)/sum(diff(swf$pos))
# weighted average (each loci, denom is 1)
bv_ibd <- sum( numerator*wi )
return(bv_ibd)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.