#------------------------------------------------
#' @title Simulate data
#'
#' @description Simulate data under a specified model
#'
#' @param sentinel_lon vector giving longitudes of sentinel sites.
#' @param sentinel_lat vector giving latitudes of sentinel sites.
#' @param sentinel_radius observation radius of the sentinel site (km).
#' @param K the number of sources.
#' @param source_weights the proportion of events coming from each source
#' @param source_lon_min minimum limit on source longitudes.
#' @param source_lon_max maximum limit on source longitudes.
#' @param source_lat_min minimum limit on source latitudes.
#' @param source_lat_max maximum limit on source latitudes.
#' @param source_lon manually define source longitude positions. If \code{NULL}
#' then drawn uniformly from limits specified in \code{source_lon_min} and
#' \code{source_lon_max}.
#' @param source_lat manually define source latitude positions. If \code{NULL}
#' then drawn uniformly from limits specified in \code{source_lat_min} and
#' \code{source_lat_max}.
#' @param sigma_model set as "single" to use the same dispersal distance for all
#' sources, or "independent" to use an independently drawn dispersal distance for
#' each source.
#' @param sigma_mean the prior mean of the parameter sigma (km).
#' @param sigma_var the prior variance of the parameter sigma (km). Set to zero
#' to use a fixed distance.
#' @param expected_popsize the expected total number of observations (observed
#' and unobserved) in the study area.
#' @param data_type what model we wish to simulate under - a poisson, binomial
#' or vanilla finite mixture corresponding to "counts", "prevalence" or
#' "point-pattern" respectively
#' @param test_rate The rate of the Poisson distribution with which we draw the
#' number of individuals tested at each sentinel site
#' @param N the number of events to distributed under a point-pattern model
#' @param dispersal_model distribute points via a "normal", "cauchy" or
#' "laplace" model
#'
#' @import stats
#' @export
#'
#' @examples
#' # State the number of sources to be generated
#' K_sim <- 3
#' # Create some sentinel site locations
#' sentinal_lon <- seq(-0.2, 0.0, l=11)
#' sentinal_lat <- seq(51.45, 51.55, l=11)
#' sentinal_grid <- expand.grid(sentinal_lon, sentinal_lat)
#' names(sentinal_grid) <- c("longitude", "latitude")
#' # Set their sentinel radius (this constant times true sigma)
#' sentinel_radius <- 0.25
#' # sim count data under a Poisson model
#' sim1 <- sim_data(sentinal_grid$longitude,
#' sentinal_grid$latitude,
#' sigma_model = "single",
#' sigma_mean = 1,
#' sigma_var = 0.5,
#' sentinel_radius = sentinel_radius,
#' K = K_sim,
#' expected_popsize = 300)
sim_data <- function(sentinel_lon,
sentinel_lat,
sentinel_radius = 0.1,
K = 3,
source_weights = NULL,
source_lon_min = -0.2,
source_lon_max = 0.0,
source_lat_min = 51.45,
source_lat_max = 51.55,
source_lon = NULL,
source_lat = NULL,
sigma_model = "single",
sigma_mean = 1.0,
sigma_var = 0.1,
expected_popsize = 100,
data_type = "counts",
test_rate = 5,
N = 150,
dispersal_model = "normal")
{
# check inputs
assert_in(data_type, c("counts", "prevalence", "point-pattern"))
assert_in(dispersal_model, c("normal", "cauchy", "laplace"))
if(data_type == "counts" | data_type == "prevalence"){
assert_numeric(sentinel_lon)
assert_numeric(sentinel_lat)
assert_single_pos(sentinel_radius, zero_allowed = FALSE)
assert_same_length(sentinel_lon, sentinel_lat)
assert_single_pos(expected_popsize, zero_allowed = FALSE)
assert_single_pos_int(test_rate, zero_allowed = FALSE)
}
assert_single_pos_int(K, zero_allowed = FALSE)
if(is.null(source_weights)){
source_weights <- rep(1/K, K)
}
assert_length(source_weights, K)
assert_bounded(source_weights)
assert_single_numeric(source_lon_min)
assert_single_numeric(source_lon_max)
assert_single_numeric(source_lat_min)
assert_single_numeric(source_lat_max)
if (is.null(source_lon)) {
source_lon <- runif(K, source_lon_min, source_lon_max)
}
if (is.null(source_lat)) {
source_lat <- runif(K, source_lat_min, source_lat_max)
}
assert_vector(source_lon)
assert_numeric(source_lon)
assert_vector(source_lat)
assert_numeric(source_lat)
assert_same_length(source_lon, source_lat)
assert_length(source_lon, K)
assert_single_string(sigma_model)
assert_in(sigma_model, c("single", "independent"))
switch(sigma_model,
"single" = {
assert_single_pos(sigma_var, zero_allowed = TRUE)
assert_single_pos(sigma_mean, zero_allowed = FALSE)
},
"independent" = {
assert_length(sigma_mean, K)
assert_length(sigma_var, K)
assert_pos(sigma_mean, zero_allowed = FALSE)
assert_pos(sigma_var, zero_allowed = TRUE)
})
assert_single_pos_int(N, zero_allowed = FALSE)
# draw total number of points
if(data_type == "counts" | data_type == "prevalence"){
N <- rpois(1, expected_popsize)
if (N == 0) {
stop("N = 0 events generated")
}
}
# draw true allocation of all points to sources
group <- sort(sample(K, N, replace = TRUE, prob = source_weights))
source_N <- tabulate(group)
# draw sigma
varlog <- log(sigma_var/sigma_mean^2 + 1)
meanlog <- log(sigma_mean) - varlog/2
switch(sigma_model,
"single" = {
sigma <- rep(rlnorm(1, meanlog, sqrt(varlog)), K)
},
"independent" = {
sigma <- rlnorm(K, meanlog, sqrt(varlog))
})
#-----------------------------------------------------------------------------
if(data_type == "counts"){
df_all <- NULL
for (k in 1:K) {
if (source_N[k]>0) {
rand_k <- dispersal_sphere(n = source_N[k],
source_lon[k],
source_lat[k],
scale = sigma[k],
dispersal_model = "normal")
df_all <- rbind(df_all, as.data.frame(rand_k))
}
}
# draw points around sources
# get distance between all points and sentinel sites
gc_dist <- mapply(function(x, y) {
lonlat_to_bearing(x, y, df_all$longitude, df_all$latitude)$gc_dist
}, x = sentinel_lon, y = sentinel_lat)
counts <- colSums(gc_dist < sentinel_radius)
df_observed <- data.frame(longitude = sentinel_lon,
latitude = sentinel_lat,
counts = counts)
# add record of whether data point is observed or unobserved to df_all
df_all$observed <- rowSums(gc_dist < sentinel_radius)
observed_by <- as.list(apply(gc_dist, 1, function(x) which(x < sentinel_radius)))
if (length(observed_by) == 0) {
observed_by <- replicate(nrow(df_all), integer())
}
df_all$observed_by <- observed_by
# create true q-matrix as proportion of points belonging to each group per sentinel site
true_qmatrix <- t(apply(gc_dist, 2, function(x) {
ret <- tabulate(group[x < sentinel_radius], nbins = K)
ret <- ret/sum(ret)
ret[is.na(ret)] <- NA
ret
}))
class(true_qmatrix) <- "rgeoprofile_qmatrix"
# return simulated data and true parameter values
ret_data <- df_observed
ret_record <- list()
ret_record$sentinel_radius <- sentinel_radius
ret_record$true_group <- group
ret_record$true_qmatrix <- true_qmatrix
ret_record$data_all <- df_all
} else if (data_type == "prevalence"){
# get distances from source locations to sentinel sites
gc_dist <- mapply(function(x, y) {lonlat_to_bearing(x, y, source_lon, source_lat)$gc_dist},
x = sentinel_lon, y = sentinel_lat)
# calculate height of each sentinel site on the mixture of bivariate normals
heights <- dnorm(gc_dist, 0, sigma)*dnorm(0, 0, sigma)
if(is.vector(heights)){
heights <- heights
} else{
heights <- apply(heights*source_weights, 2, sum)
}
rate <- expected_popsize*heights
# transform the rate to a trial success probability
binom_prob <- rate/(1 + rate)
# pick how many individuals are tested at each site and use the binom_prob to draw
# the number of positive individuals
tested <- rpois(length(sentinel_lon), lambda = test_rate)
tested[tested == 0] <- 1
positive <- rbinom(length(sentinel_lon), tested, binom_prob)
df_observed <- data.frame(longitude = sentinel_lon,
latitude = sentinel_lat,
positive = positive,
tested = tested)
# return simulated data and true parameter values
ret_data <- df_observed
ret_record <- list()
ret_record$binomial_probability <- binom_prob
} else if(data_type == "point-pattern"){
# draw points around sources
df_all <- NULL
for (k in 1:K) {
if (source_N[k] > 0) {
rand_k <- dispersal_sphere(n = source_N[k],
source_lon[k],
source_lat[k],
scale = sigma[k],
dispersal_model = dispersal_model)
df_all <- rbind(df_all, as.data.frame(rand_k))
}
}
# TODO create true q-matrix as proportion of points belonging to each group per sentinel site
# true_qmatrix <- t(apply(gc_dist, 2, function(x) {
# ret <- tabulate(group[x < sentinel_radius], nbins = K)
# ret <- ret/sum(ret)
# ret[is.na(ret)] <- NA
# ret
# }))
# class(true_qmatrix) <- "rgeoprofile_qmatrix"
# return simulated data and true parameter values
ret_record <- list()
ret_record$true_group <- group
# ret_record$true_qmatrix <- true_qmatrix
ret_record$data_all <- ret_data <- df_all
}
ret_record$true_source <- data.frame(longitude = source_lon, latitude = source_lat)
ret_record$true_source_N <- source_N
ret_record$true_sigma <- sigma
ret <- list(data = ret_data,
record = ret_record)
# make custom class
class(ret) <- "rgeoprofile_simdata"
return(ret)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.