##' Prepare data for use with the [`particle_filter`]. This function
##' is required to use the particle filter as helps arrange data and
##' be explicit about the off-by-one errors that can occur. It takes
##' as input your data to compare against a model, including some
##' measure of "time". We need to convert this time into model time
##' steps (see Details).
##'
##' We require that the time variable increments in unit steps; this
##' may be relaxed in future to even steps, or possibly irregular
##' steps, but for now this assumption is required. We assume that
##' the data in the first column is recorded at the end of a period of
##' 1 time unit. So if you have in the first column `t = 10, data =
##' 100` we assume that the model steps from `t = 9` to to `t = 10`
##' and at that period the data has value 100.
##'
##' For continuous time models, time is simple to think about; time is
##' continuous (and real-valued) and really any time is
##' acceptable. For discrete time models there are two correlated
##' measures of time we need to consider - (1) the `dust` "time step",
##' a non-negative integer value that increases in unit steps, and (2)
##' the "model time" which is related to the dust time step based on
##' the `rate` parameter here as `<model time> = <dust time> *
##' <rate>`. For a concrete example, consider a model where we want
##' to think in terms of days, but which we take 10 steps per
##' day. Time step 0 and model time 0 are the same, but day 1 occurs
##' at step 10, day 15 at step 150 and so on.
##'
##' @title Prepare data for use with particle filter
##'
##' @param data A [data.frame()] of data
##'
##' @param time The name of a column within `data` that represents
##' your measure of time. This column must be integer-like. To
##' avoid confusion, this cannot be called `step`, `time`, or
##' `model_time`.
##'
##' @param rate The number of model "time steps" that occur between
##' each time point (in model time `time`). This must also be
##' integer-like for discrete time models and must be `NULL` for
##' continuous time models.
##'
##' @param initial_time An initial time to start the model from. This
##' should always be provided, and must be provided for continuous
##' time models. For discrete time models, this is expressed in
##' model time. It must be a non-negative integer and must be at
##' most equal to the first value of the `time` column, minus 1
##' (i.e., `data[[time]] - 1`). For historical reasons if not given
##' we take the first value of the `time` column minus one, but with
##' a warning - this behaviour will be removed in a future version
##' of mcstate.
##'
##' @param population Optionally, the name of a column within `data` that
##' represents different populations. Must be a factor.
##'
##' @return If `population` is NULL, a data.frame with new columns
##' `time_start` and `time_end` (required by [`particle_filter`]),
##' along side all previous data except for the time variable, which
##' is replaced by new `<time>_start` and `<time>_end` columns. If
##' `population` is not NULL then a named list of data.frames as
##' described above where each element represents populations in the
##' order specified in the data.
##'
##' @export
##' @examples
##' d <- data.frame(day = 5:20, y = runif(16))
##' mcstate::particle_filter_data(d, "day", rate = 4, initial_time = 4)
##'
##' # If providing an initial day, then the first epoch of simulation
##' # will be longer (see the first row)
##' mcstate::particle_filter_data(d, "day", rate = 4, initial_time = 0)
##'
##' # If including populations:
##' d <- data.frame(day = 5:20, y = runif(16),
##' population = factor(rep(letters[1:2], each = 16)))
##' mcstate::particle_filter_data(d, "day", 4, 0, "population")
particle_filter_data <- function(data, time, rate, initial_time = NULL,
population = NULL) {
assert_is(data, "data.frame")
assert_scalar_character(time)
if (!(time %in% names(data))) {
stop(sprintf("Did not find column '%s', representing time, in data",
time))
}
if (time %in% c("step", "time", "model_time")) {
stop(sprintf("The time column cannot be called '%s'", time))
}
is_continuous <- is.null(rate)
if (is_continuous) {
time_type <- "particle_filter_data_continuous"
} else {
rate <- assert_scalar_positive_integer(rate)
time_type <- "particle_filter_data_discrete"
}
if (is.null(population)) {
nesting_type <- "particle_filter_data_single"
populations <- NULL
model_time_end <- data[[time]]
} else {
nesting_type <- "particle_filter_data_nested"
assert_scalar_character(population)
if (!(population %in% names(data))) {
stop(sprintf(
"Did not find column '%s', representing population, in data",
population))
}
if (!is.factor(data[[population]])) {
stop(sprintf("Column '%s' must be a factor", population))
}
populations <- levels(data[[population]])
data <- data[order(data[[population]], data[[time]]), ]
time_split <- split(data[[time]], data[[population]])
if (length(unique(time_split)) != 1) {
stop("Unequal time between populations")
}
model_time_end <- time_split[[1]]
}
## This is only required for discrete time models really
assert_integer(model_time_end, name = sprintf("data$%s", time))
if (is.null(initial_time)) {
if (is_continuous) {
stop("'initial_time' must be given for continuous models")
} else {
initial_time <- model_time_end[[1L]] - 1
fmt <- paste("'initial_time' should be provided. I'm assuming '%d'",
"which is one time unit before the first time in your",
"data (%d), but this might not be appropriate. This",
"will become an error in a future version of mcstate")
warning(sprintf(fmt, initial_time, model_time_end[[1L]]),
immediate. = TRUE)
## This is only an issue while we allow not providing an initial
## time.
if (model_time_end[[1L]] < 1) {
stop(sprintf("The first time must be at least 1 (but was given %d)",
model_time_end[[1L]]))
}
}
}
if (!is_continuous && any(model_time_end < 0)) {
stop("All times must be non-negative")
}
initial_time <- assert_integer(initial_time)
if (initial_time < 0) {
## This condition is actually only required for discrete time
## models; for continuous time models this would be fine.
stop("'initial_time' must be non-negative")
}
if (initial_time > model_time_end[[1L]]) {
stop(sprintf("'initial_time' must be <= %d", model_time_end[[1L]]))
}
model_time_start <- c(initial_time, model_time_end[-length(model_time_end)])
model_times <- cbind(model_time_start, model_time_end, deparse.level = 0)
times <- model_times * (if (is_continuous) 1 else rate)
ret <- data.frame(model_time_start = model_times[, 1],
model_time_end = model_times[, 2],
time_start = times[, 1],
time_end = times[, 2],
data[names(data) != time],
stringsAsFactors = FALSE, check.names = FALSE)
names(ret)[1:2] <- paste0(time, c("_start", "_end"))
attr(ret, "rate") <- rate
attr(ret, "time") <- time
attr(ret, "times") <- times
attr(ret, "model_times") <- model_times
attr(ret, "population") <- population
attr(ret, "populations") <- populations
class(ret) <- c(nesting_type, time_type, "particle_filter_data", "data.frame")
ret
}
particle_filter_data_split <- function(data, compiled_compare) {
population <- attr(data, "population")
## Drop off lots of attributes that are just annoying after the data
## has been split:
attr(data, "rate") <- NULL
attr(data, "time") <- NULL
attr(data, "times") <- NULL
attr(data, "model_time") <- NULL
attr(data, "population") <- NULL
attr(data, "populations") <- NULL
class(data) <- "data.frame"
if (compiled_compare) {
dust::dust_data(data, "time_end", population)
} else if (is.null(population)) {
lapply(unname(split(data, seq_len(nrow(data)))), as.list)
} else {
## largely copied from dust::dust_data
rows <- lapply(seq_len(nrow(data)), function(i) as.list(data[i, ]))
rows_grouped <- unname(split(rows, data[[population]]))
lapply(seq_len(nrow(data) / nlevels(data[[population]])),
function(i) lapply(rows_grouped, "[[", i))
}
}
##' @export
`[.particle_filter_data` <- function(x, i, j, ...) { # nolint
ret <- NextMethod("[")
## It's hard to detect this based on 'i' and 'j' but we can detect
## the effect of subsetting fairly efficiently:
if (!identical(x$time_start, ret$time_start)) {
## TODO (#180): Stricter checks to come on the subset.
## TODO: add a test of this, it was broken and not sure if tests caught it
k <- seq_len(nrow(x))[i]
if (!is.null(attr(x, "population"))) {
k <- k[k <= nrow(attr(x, "times"))]
}
attr(ret, "model_times") <- attr(x, "model_times")[k, , drop = FALSE]
attr(ret, "times") <- attr(x, "times")[k, , drop = FALSE]
}
ret
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.