# Simulation test for bias in AR1 sdmTMB models
######################
######################
# Steps first executed independently and then in functions that repeat simulation and compile results
library(sdmTMB)
library(ggplot2)
library(dplyr)
library(purrr)
library(future)
library(cowplot)
# Set spatial grid (also run this section before running functions)
######################
## create fine-scale square 50 x 50 grid to predict on
grid <- expand.grid(X = seq(1:50), Y = seq(1:50))
## or use the boundaries of the Queen Charlotte Sound
# grid <- qcs_grid
# 1. Simulate 'true' data
######################
#' Simulate data in list format with a vector of input values
#'
#' @param x A vector of x coordinates.
#' @param y A vector of y coordinates.
#' @param time_steps The number of time steps.
#' @param ar1_fields Logical for whether or not to include AR1 structure.
#' @param ar1_phi Correlation between years; should be between -1 and 1.
#' @param sigma_O SD of spatial process (Omega).
#' @param sigma_E SD of spatiotemporal process (Epsilon).
#' @param kappa Parameter that controls the decay of spatial correlation (3/kappa is roughly the distance at which points are %10 correlated)
#' @param phi Observation error scale parameter.
#' @param plot Logical for whether or not to produce a plot.
#'
#' @return
#' @export
#'
#' @examples
sim_args_vec <- function(x = stats::runif(400, 0, 10), y = stats::runif(400, 0, 10),
time_steps = 1L, ar1_fields = FALSE, ar1_phi = 0.5,
sigma_O = 0.4, sigma_E = 0.3, kappa = 1.3, phi = 0.2,
plot = FALSE) {
d <- sim(
x = x, y = y, time_steps = time_steps, ar1_fields = ar1_fields, plot = plot,
ar1_phi = ar1_phi, sigma_O = sigma_O, sigma_E = sigma_E, kappa = kappa, phi = phi
)
list(d, inputs = c(
ar1_phi = ar1_phi,
sigma_O = sigma_O,
sigma_E = sigma_E,
kappa = kappa,
phi = phi
))
}
simdat <- sim_args_vec(
x = grid$X, y = grid$Y,
time_steps = 9,
ar1_fields = TRUE,
ar1_phi = 0.5,
sigma_O = 0.3,
sigma_E = 0.3,
kappa = 0.05,
phi = 0.05
)
# 2. Sub-sample from 'true' data
######################
dat <- simdat[[1]] %>% group_by(time) %>% sample_n(1500) %>% ungroup()
# dat <- simdat %>% filter((x <25|x>50) & (y<25|y>50)) # try removing whole chunks of data
# 3. Model from sub-sample
######################
spde <- make_spde(x = dat$x, y = dat$y, n_knots = 150)
plot_spde(spde)
m <- sdmTMB(
silent = FALSE,
spatiotemporal = "AR1",
spatial = TRUE, # no fixed spatial random field
data = dat, formula = z ~ 1, time = "time",
family = gaussian(link = "identity"), spde = spde
)
r <- m$tmb_obj$report()
# r$range # distance at which ~%10 correlation
# 4. Compare parameter estimates with input values
######################
# Input values (from vectorized simdat list element)
inputs <- as.data.frame(reshape::melt(simdat[[2]]))
# back transform parameter estimates from model report() for comparison with input values
estimates <- c(
ar1_phi = (2 * plogis(m$model$par[["ar1_phi"]]) - 1),
sigma_O = r$sigma_O,
sigma_E = r$sigma_E,
kappa = exp(r$ln_kappa),
phi = exp(r$ln_phi)
)
estimates <- as.data.frame(reshape::melt(estimates))
inputs
estimates
# 5. Use model to predict for all grid points in simulated data
######################
# replicate for each time period
original_time <- sort(unique(m$data[[m$time]]))
nd <- do.call("rbind", replicate(length(original_time), grid, simplify = FALSE))
nd[[m$time]] <- rep(original_time, each = nrow(grid))
# run TMB with prediction turned on but replace sample 'dat' with new grid 'nd'
predictions <- predict(m, newdata = nd, xy_cols = c("X", "Y"))$data
head(predictions)
# 6. Contrast true and predicted values for each point in space and time
######################
spatial_bias_dat <- full_join(simdat[[1]], predictions, by = c("x" = "X", "y" = "Y", "time" = "time"), suffix = c("", "_est")) %>% mutate(diff = est - real_z)
head(spatial_bias_dat)
hist(spatial_bias_dat$diff, breaks = 40)
#' Function to plot spatial distribution of model estimates beside simulated true values and the difference between them
#'
#' @param dataframe Dataframe containing all simulated and predicted values to be plotted spatially
#' @param id List of columns in dataframe that together identify unique observations (Default = c("x", "y", "time"))
#' @param values List of values to be shown in plotted on separate panels (Default = c("real_z", "est", "diff"))
#' @param time_periods List of time periods to be shown in plot (Default = c("1", "5", "9"))
#'
#' @return
#' @export
#'
#' @examples
#' simdat <- sim_args_vec()
#' dat <- simdat[[1]] %>% group_by(time) %>% sample_n(1500) %>% ungroup()
#' spde <- make_spde(x = dat$x, y = dat$y, n_knots = 150)
#' m <- sdmTMB(data = dat,
#' formula = z ~ 1,
#' time = "time",
#' family = gaussian(link = "identity"),
#' spde = spde)
#' original_time <- sort(unique(m$data[[m$time]]))
#' nd <- do.call("rbind", replicate(length(original_time), grid, simplify = FALSE))
#' nd[[m$time]] <- rep(original_time, each = nrow(grid))
#' predictions <- predict(m, newdata = nd, xy_cols = c("X", "Y"))$data
#' spatial_bias_dat <- full_join(simdat[[1]], predictions,
#' by = c("x" = "X", "y" = "Y", "time" = "time"),
#' suffix = c("", "_est")) %>%
#' mutate(diff = est - real_z)
#' plot_map_diff(spatial_bias_dat)
plot_map_diff <- function(dataframe,
id = c("x", "y", "time"),
values = c("real_z", "est", "diff"),
time_periods = c("1", "5", "9")) {
melted <- reshape2::melt(dataframe, id) %>% # could be replace with tidyr::gather(spatial_bias_dat, "variable", "value",... )?
filter(variable %in% values) %>%
filter(time %in% time_periods)
ggplot(melted, aes_string("x", "y", fill = "value")) +
geom_raster() +
facet_grid(time ~ variable) +
scale_fill_viridis_c() +
coord_fixed()
}
plot_map_diff(spatial_bias_dat)
######################
######################
# Save parameter estimates from single simulation
######################
######################
#' Function that saves a tibble of parameter inputs and estimates for a single iteration
#'
#' @param iter Iteration id number; default is a random number; used to set.seed
#' @param grid Dataframe of spatial coordinates eg. c(X, Y)
#' @param x A vector of x coordinates.
#' @param y A vector of y coordinates.
#' @param time_steps The number of time steps.
#' @param plot Logical for whether or not to produce a plot.
#' @param ar1_fields Logical for whether or not to include AR1 structure.
#' @param ar1_phi Correlation between years; should be between -1 and 1.
#' @param sigma_O SD of spatial process (Omega).
#' @param sigma_E SD of spatiotemporal process (Epsilon).
#' @param kappa Parameter that controls the decay of spatial correlation (3/kappa is roughly the distance at which points are %10 correlated)
#' @param phi Observation error scale parameter.
#' @param N Sub-sample size = number of observations included in the sdmTMB model
#' @param n_knots Number of knots for spatial process#' @param formula
#' @param formula Define model to be assessed with the simulated data
#' @param family Set family of model to be assessed
#'
#' @return
#' @export
#'
#' @examples
sim_parameters <- function(iter = sample.int(1e3, 1), plot = TRUE,
grid = grid, x = grid$X, y = grid$Y,
time_steps = 9,
ar1_fields = TRUE,
ar1_phi = 0.5,
sigma_O = 0.3,
sigma_E = 0.3,
kappa = 0.05,
phi = 0.05,
N = 300, n_knots = 150,
formula = z ~ 1, family = gaussian(link = "identity")) {
set.seed(iter * 581267)
simdat <- sdmTMB::sim(
x = x, y = y, time_steps = time_steps, plot = plot,
ar1_fields = ar1_fields, ar1_phi = ar1_phi, sigma_O = sigma_O, sigma_E = sigma_E, kappa = kappa, phi = phi
)
dat <- simdat %>% group_by(time) %>% sample_n(N) %>% ungroup() # sub-sample from 'true' data
spde <- make_spde(dat$x, dat$y, n_knots)
plot_spde(spde)
# browser()
m <- sdmTMB(
silent = FALSE,
spatiotemporal = "AR1",
spatial = TRUE,
data = dat, formula = formula, time = "time",
family = family, spde = spde
)
r <- m$tmb_obj$report()
estimates <- c(
ar1_phi = (2 * plogis(m$model$par[["ar1_phi"]]) - 1),
sigma_O = r$sigma_O,
sigma_E = r$sigma_E,
kappa = exp(r$ln_kappa),
phi = exp(r$ln_phi)
)
inputs <- c(
ar1_phi = ar1_phi,
sigma_O = sigma_O,
sigma_E = sigma_E,
kappa = kappa,
phi = phi
)
parameter <- names(inputs)
converg <- m$model$convergence
# diff <- estimates - inputs
run <- tibble(parameter = parameter, inputs = inputs, estimates = estimates, iter = iter, converg = converg)
run
}
single_run <- sim_parameters()
single_run
######################
######################
# Run simulations j times for each set of parameter values
# save lists of tibbles
######################
######################
#' Function that saves a list of tibbles of:
#' [[1]] parameter inputs and estimates &
#' [[2]] real values and predicted values
#'
#' @param iter Iteration id number; default is a random number; used to set.seed
#' @param grid Dataframe of spatial coordinates eg. c(X, Y)
#' @param x A vector of x coordinates.
#' @param y A vector of y coordinates.
#' @param time_steps The number of time steps.
#' @param plot Logical for whether or not to produce a plot.
#' @param ar1_phi Correlation between years; should be between -1 and 1.
#' @param sigma_O SD of spatial process (Omega).
#' @param sigma_E SD of spatiotemporal process (Epsilon).
#' @param kappa Parameter that controls the decay of spatial correlation (3/kappa is roughly the distance at which points are %10 correlated)
#' @param phi Observation error scale parameter.
#' @param N Sub-sample size = number of observations included in the sdmTMB model
#' @param n_knots Number of knots for spatial process#' @param formula
#' @param formula Define model to be assessed with the simulated data
#' @param family Set family of model to be assessed
#'
#' @return
#' @export
#'
#' @examples
#'
sim_predictions <- function(iter = sample.int(1e3, 1), plot = TRUE,
grid = grid, x = grid$X, y = grid$Y,
time_steps = 4,
ar1_fields = TRUE,
ar1_phi = 0.5,
sigma_O = 0.3,
sigma_E = 0.3,
kappa = 0.05,
phi = 0.1,
N = 1000, n_knots = 150, # iter.max=1e4, eval.max=1e4,
formula = z ~ 1, family = gaussian(link = "identity")) {
set.seed(iter * 581267)
simdat <- sim(
plot = plot, x = x, y = y,
time_steps = time_steps, ar1_fields = ar1_fields,
ar1_phi = ar1_phi, sigma_O = sigma_O, sigma_E = sigma_E, kappa = kappa, phi = phi
)
dat <- simdat %>% group_by(time) %>% sample_n(N) %>% ungroup() # sub-sample from 'true' data
spde <- make_spde(dat$x, dat$y, n_knots)
plot_spde(spde)
# browser()
m <- sdmTMB(
silent = FALSE,
spatiotemporal = "AR1",
spatial = TRUE,
data = dat, formula = formula, time = "time",
family = family, spde = spde
)
r <- m$tmb_obj$report()
estimates <- c(
ar1_phi = (2 * plogis(m$model$par[["ar1_phi"]]) - 1),
sigma_O = r$sigma_O,
sigma_E = r$sigma_E,
kappa = exp(r$ln_kappa),
phi = exp(r$ln_phi)
)
inputs <- c(
ar1_phi = ar1_phi,
sigma_O = sigma_O,
sigma_E = sigma_E,
kappa = kappa,
phi = phi
)
parameter <- names(inputs)
converg <- m$model$convergence
# replicate grid for each time period
original_time <- sort(unique(m$data[[m$time]]))
nd <- do.call("rbind", replicate(length(original_time), grid, simplify = FALSE))
nd[[m$time]] <- rep(original_time, each = nrow(grid))
# run TMB with prediction turned on but replace sample 'dat' with new grid 'nd'
predictions <- predict(m, newdata = nd, xy_cols = c("X", "Y"))$data
head(predictions)
# combine true and predicted values for each point in space and time
spatial_bias <- full_join(simdat, predictions, by = c("x" = "X", "y" = "Y", "time" = "time"), suffix = c("", "_est")) %>% mutate(diff = est - real_z)
spatial_bias$converg <- converg
unsampled <- anti_join(spatial_bias, dat)
run <- list(par = tibble(parameter = parameter, inputs = inputs, estimates = estimates, iter = iter, converg = converg),
allpredicted = as_tibble(spatial_bias),
unsampled = as_tibble(unsampled))
run
}
#' Generate dataframe with parameter arguments for multiple simulation runs
#'
#' Note: This function creates a dataframe of all possible combinations,
#' therefore these parameters can be fixed to a single value, or vary using c(value1, value2, value3, ...)
#' @param ar1_phi Correlation between years; should be between -1 and 1.
#' @param sigma_O SD of spatial process (Omega).
#' @param sigma_E SD of spatiotemporal process (Epsilon).
#' @param kappa Parameter that controls the decay of spatial correlation (3/kappa is roughly the distance at which points are %10 correlated)
#' @param phi Observation error scale parameter.
#' @param time_steps The number of time steps.
#' @param N Sub-sample size = number of observations included in the sdmTMB model
#' @param n_knots Number of knots for spatial process
#' @param j Number of runs to be conducted for each unique combination of parameter values
#'
#' @return
#' @export
#'
#' @examples
#' grid <- expand.grid(X = seq(1:40), Y = seq(1:40))
#' args <- generate_arg(kappa = c(0.005, 0.1, 1), time_steps = 3, repeats = 3L)
#' all_iter <- purrr::pmap(args, sim_predictions,
#' grid = grid, x = grid$X, y = grid$Y,
#' formula = z ~ 1, family = gaussian(link = "identity"))
#' predictions <- all_iter %>% map(~ .x[["predicted"]])
#' spatial_bias_plots <- purrr::map(predictions, plot_map_diff, time_periods = c(1,2,3))
#' params <- all_iter %>% map(~ .x[["par"]]) %>%
#' bind_rows()
#' par_diff <- params %>% group_by(iter, parameter) %>%
#' mutate(diff = inputs - estimates) %>%
#' ungroup()
#'
generate_arg <- function(time_steps = 4,
ar1_phi = 0.5, # c(-0.85, 0.1, 0.85),
sigma_O = 0.3,
sigma_E = 0.3,
kappa = 0.2, # c(0.005, 0.1, 1),
phi = 0.1, # c(0.01, 0.1),
N = 300,
n_knots = 150,
j = 2L) {
arguments <- expand.grid(
ar1_phi = ar1_phi,
sigma_O = sigma_O,
sigma_E = sigma_E,
kappa = kappa,
phi = phi,
N = N,
n_knots = n_knots,
time_steps = time_steps
)
# browser()
nrow(arguments)
arguments$count <- j
arguments <- arguments[rep(seq_len(nrow(arguments)), arguments$count), ]
arguments_apply <- dplyr::select(arguments, -count)
nrow(arguments_apply)
arguments_apply$iter <- 1:nrow(arguments_apply)
arguments_apply
}
args <- generate_arg(j = 2L, ar1_phi = c(0.01, 0.7))
glimpse(args)
all_iter <- purrr::pmap(args, sim_predictions,
grid = grid, x = grid$X, y = grid$Y,
formula = z ~ 1, family = gaussian(link = "identity")
)
# Paralell process (but not currently working)
# library(future)
# plan(multisession)
# all_iter <- purrr::pmap(args, ~ future(sim_predictions(.x, grid = grid, x = grid$X, y = grid$Y,
# plot = TRUE, formula = z ~ 1, family = gaussian(link = "identity")
# )))
######################
######################
# Explore and plot simulation results
######################
######################
# if input parameter values varied, merge parameter and predicted dataframes
inputs_spread <- all_iter %>% map(~ .x[["par"]]) %>% bind_rows() %>% select( iter, parameter, inputs) %>% tidyr::spread(., key = parameter, value = c(inputs))
inputs_spread$iter <- as.factor(inputs_spread$iter)
allpredictions <- all_iter %>% map(~ .x[["allpredicted"]]) %>% bind_rows(.id = "iter") %>% left_join(., inputs_spread, by = "iter")
allunsampled <- all_iter %>% map(~ .x[["unsampled"]]) %>% bind_rows(.id = "iter") %>% left_join(., inputs_spread, by = "iter")
pred.hist <- ggplot(data = allunsampled, aes(x = diff)) +
geom_histogram(bins=40) +
geom_histogram(data = allpredictions, bins=40, alpha = 0.4) +
scale_fill_viridis_d() +
geom_vline(xintercept = 0, linetype="dashed") +
labs(title = "Prediction error for unsampled locations in darker colour", x = "(real - predicted)")
pred.hist
# pdf("prediction_hist.pdf")
# pred.hist
# dev.off()
# each run can be plotted spatio-temporally
predictions_by_iter <- all_iter %>% map(~ .x[["allpredicted"]]) #%>% bind_rows(.id = "iter") %>% left_join(., inputs_spread, by = "iter") %>% split(., list("iter"))
spatial.bias.plots <- purrr::map(predictions_by_iter, plot_map_diff, time_periods = c(1,2,3))
spatial.bias.plots
# pdf("spatial_bias_plots.pdf")
# spatial.bias.plots
# dev.off()
# result tibble of parameter estimates
params <- all_iter %>% map(~ .x[["par"]]) %>%
bind_rows()
# Calculate difference between parameter value input into simulation and estimate based on sdmTMB model
par_diff <- params %>% group_by(parameter) %>%
mutate(sd_est = sd(estimates), n = n()) %>%
group_by(iter, parameter) %>%
mutate(diff = (inputs - estimates), std_diff = (inputs - estimates)/sd_est) %>%
ungroup()
#' Plot histograms of parameter estimates from n simulations
#' Note: that n must
#'
#' @param dataframe Dataframe containing all simulated parameter estimates
#' @param x Varible to be plotted (Default = data$std_diff)
#' @param xlabel Description of variable to be plotted for use on x axis label
#' @param fill Varible used to colour bars to indicate if some estimates should be trusted more than others (Default = data$converg)
#' @param notes Description of fill choice or other caveats
#' @param bins Number of bins in histogram (Default = n/4)
#'
#' @return
#' @export
#'
#' @examples
#' params <- all_iter %>% map(~ .x[["par"]]) %>%
#' bind_rows()
#' par_error_hist(params)
#'
par_error_hist <- function(dataframe = params,
x = dataframe$std_diff,
xlabel = "Relative difference from input value",
fill = dataframe$converg,
notes = "if 2 colours, than some models did not converg",
bin_number = n/2){
dataframe <- dataframe %>% group_by(parameter) %>%
mutate(sd_est = sd(estimates), n = n()) %>%
group_by(iter, parameter) %>%
mutate(diff = (inputs - estimates), std_diff = (inputs - estimates)/sd_est) %>%
ungroup()
n <- dataframe$n[1]
binn <- bin_number
initial <- dataframe %>% group_by(parameter) %>% summarize(initial = mean(inputs), sd = round(mean(sd_est), 3))
table <- gridExtra::tableGrob(initial, rows = NULL)
fill <- as.factor(fill)
plot <- ggplot(data = dataframe, aes(x = x)) +
geom_histogram(aes(fill = fill), bins = binn) +
scale_fill_viridis_d() +
geom_vline(xintercept = 0, linetype="dashed") +
labs(title = "Simulated parameter estimates", x = xlabel) + #, caption = notes) +
facet_wrap(~parameter, scales = "free_x") +
theme(legend.position="none", plot.caption=element_text(size=12))
text <- RGraphics::splitTextGrob(notes)
ggdraw() + draw_plot(plot, x = 0, y = 0, width = 1, height = 1) +
draw_plot(table, x = .7, y = 0.1, width = .3, height = .5) +
draw_plot(text, x = .7, y = -.3, width = .3, height = .5)
}
par.hist <- par_error_hist(params)
par.hist
# pdf("par_error_hist.pdf")
# par.hist
# dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.