Nothing
# ============================================================
# Surveillance data simulator
# ============================================================
#' Simulate genomic surveillance data
#'
#' Generates synthetic surveillance datasets with realistic features:
#' multiple regions with unequal sequencing rates, multiple lineages
#' with time-varying prevalence, configurable reporting delays, and
#' multiple sample sources.
#'
#' @param n_regions Integer. Number of geographic regions. Default 5.
#' @param n_weeks Integer. Number of epiweeks. Default 26.
#' @param total_positive_per_week Integer. Mean total positive cases
#' per week across all regions. Default 1000.
#' @param sequencing_rates Numeric vector of length `n_regions`.
#' Per-region sequencing probability. If `NULL`, generated from
#' a Beta distribution with realistic inequality. Default `NULL`.
#' @param lineage_dynamics Named list of functions, each taking a
#' week number and returning a positive weight. If `NULL`, uses
#' a default four-lineage scenario. Default `NULL`.
#' @param delay_params List with `mu` and `size` for negative binomial
#' reporting delay. Default `list(mu = 10, size = 3)`.
#' @param sources Character vector of sample source types.
#' Default `c("clinical", "wastewater", "sentinel")`.
#' @param source_weights Numeric vector (same length as `sources`).
#' Default `c(0.7, 0.2, 0.1)`.
#' @param seed Integer or `NULL`. Random seed. Default `NULL`.
#'
#' @return A named list with elements:
#' \describe{
#' \item{sequences}{Tibble of individual sequence records.}
#' \item{population}{Tibble with one row per region.}
#' \item{truth}{Tibble of true lineage prevalence by region and week.}
#' \item{parameters}{List of all input parameters.}
#' }
#'
#' @examples
#' sim <- surv_simulate(n_regions = 3, n_weeks = 8, seed = 42)
#' head(sim$sequences)
#' sim$population
#'
#' @export
surv_simulate <- function(n_regions = 5L,
n_weeks = 26L,
total_positive_per_week = 1000L,
sequencing_rates = NULL,
lineage_dynamics = NULL,
delay_params = list(mu = 10, size = 3),
sources = c("clinical", "wastewater", "sentinel"),
source_weights = c(0.7, 0.2, 0.1),
seed = NULL) {
checkmate::assert_count(n_regions, positive = TRUE)
checkmate::assert_count(n_weeks, positive = TRUE)
checkmate::assert_count(total_positive_per_week, positive = TRUE)
checkmate::assert_list(delay_params)
checkmate::assert_character(sources, min.len = 1L)
checkmate::assert_numeric(source_weights, len = length(sources), lower = 0)
if (!is.null(seed)) {
old_seed <- if (exists(".Random.seed", envir = globalenv()))
get(".Random.seed", envir = globalenv()) else NULL
on.exit({
if (is.null(old_seed)) {
rm(".Random.seed", envir = globalenv())
} else {
assign(".Random.seed", old_seed, envir = globalenv())
}
}, add = TRUE)
set.seed(seed)
}
region_names <- paste0("Region_", LETTERS[seq_len(n_regions)])
pop_shares <- stats::rgamma(n_regions, shape = 2, rate = 1)
pop_shares <- pop_shares / sum(pop_shares)
if (is.null(sequencing_rates)) {
sequencing_rates <- stats::rbeta(n_regions, shape1 = 2, shape2 = 10)
sequencing_rates <- pmax(sequencing_rates, 0.005)
sequencing_rates <- pmin(sequencing_rates, 0.5)
} else {
checkmate::assert_numeric(sequencing_rates, len = n_regions,
lower = 0, upper = 1)
}
names(sequencing_rates) <- region_names
if (is.null(lineage_dynamics)) {
lineage_dynamics <- list(
"BA.5" = function(w) pmax(0.01, 0.6 * exp(-0.05 * w)),
"XBB.1.5" = function(w) pmax(0.01, 0.3 * exp(-0.03 * (w - 5))),
"BA.2.86" = function(w) {
if (w < 8) 0.01 else 0.01 + 0.5 * stats::plogis((w - 15) / 3)
},
"Other" = function(w) 0.10
)
}
lineage_names <- names(lineage_dynamics)
n_lineages <- length(lineage_names)
all_records <- vector("list", n_regions * n_weeks)
truth_records <- vector("list", n_regions * n_weeks)
idx <- 0L
for (w in seq_len(n_weeks)) {
week_start <- as.Date("2024-01-01") + (w - 1L) * 7L
ew_label <- paste0(format(week_start, "%G"), "-W", format(week_start, "%V"))
global_prev <- vapply(lineage_dynamics, function(f) f(w), numeric(1))
global_prev <- global_prev / sum(global_prev)
for (r in seq_len(n_regions)) {
idx <- idx + 1L
region <- region_names[r]
n_positive <- max(1L, stats::rpois(1L, total_positive_per_week * pop_shares[r]))
n_sequenced <- stats::rbinom(1L, n_positive, sequencing_rates[r])
region_prev <- global_prev + stats::rnorm(n_lineages, 0, 0.02)
region_prev <- pmax(region_prev, 0.001)
region_prev <- region_prev / sum(region_prev)
truth_records[[idx]] <- tibble::tibble(
epiweek = ew_label, region = region,
lineage = lineage_names, true_prevalence = region_prev,
n_positive = n_positive, n_sequenced = n_sequenced
)
if (n_sequenced == 0L) next
coll_dates <- week_start + sample(0:6, n_sequenced, replace = TRUE)
delays <- stats::rnbinom(n_sequenced, mu = delay_params$mu, size = delay_params$size)
all_records[[idx]] <- tibble::tibble(
sequence_id = paste0("seq_", w, "_", r, "_", seq_len(n_sequenced)),
region = region,
source_type = sample(sources, n_sequenced, replace = TRUE,
prob = source_weights / sum(source_weights)),
lineage = sample(lineage_names, n_sequenced, replace = TRUE,
prob = region_prev),
collection_date = coll_dates,
report_date = coll_dates + delays,
epiweek = ew_label
)
}
}
sequences <- dplyr::bind_rows(all_records)
truth <- dplyr::bind_rows(truth_records)
pop_summary <- truth |>
dplyr::group_by(.data$region) |>
dplyr::summarise(
n_positive_total = sum(.data$n_positive),
n_sequenced_total = sum(.data$n_sequenced),
.groups = "drop"
) |>
dplyr::mutate(
n_positive = as.integer(.data$n_positive_total / n_lineages),
n_sequenced = as.integer(.data$n_sequenced_total / n_lineages),
seq_rate = .data$n_sequenced / pmax(.data$n_positive, 1L),
pop_total = as.integer(.data$n_positive / mean(sequencing_rates) * 10)
) |>
dplyr::select("region", "n_positive", "n_sequenced", "seq_rate", "pop_total")
list(
sequences = sequences,
population = pop_summary,
truth = truth,
parameters = list(
n_regions = n_regions, n_weeks = n_weeks,
total_positive_per_week = total_positive_per_week,
sequencing_rates = sequencing_rates, delay_params = delay_params,
sources = sources, source_weights = source_weights,
seed = seed, lineage_names = lineage_names
)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.