#' Fit exponential models to incidence data
#'
#' The function `fit` fits two exponential models to incidence data, of the
#' form: \eqn{log(y) = r * t + b} \cr where 'y' is the incidence, 't' is time
#' (in days), 'r' is the growth rate, and 'b' is the origin. The function `fit`
#' will fit one model by default, but will fit two models on either side of a
#' splitting date (typically the peak of the epidemic) if the argument `split`
#' is provided. When groups are present, these are included in the model as main
#' effects and interactions with dates. The function `fit_optim_split()` can be
#' used to find the optimal 'splitting' date, defined as the one for which the
#' best average R2 of the two models is obtained. Plotting can be done using
#' `plot`, or added to an existing incidence plot by the piping-friendly
#' function `add_incidence_fit()`.
#'
#' @export
#'
#' @rdname fit
#'
#' @return For `fit()`, a list with the class `incidence_fit` (for a
#' single model), or a list containing two `incidence_fit` objects (when
#' fitting two models). `incidence_fit` objects contain:
#'
#' - `$model`: the fitted linear model
#' - `$info`: a list containing various information extracted from the model
#' (detailed further)
#' - `$origin`: the date corresponding to day '0'
#'
#' The `$info` item is a list containing:
#'
#' - `r`: the growth rate
#' - `r.conf`: the confidence interval of 'r'
#' - `pred`: a `data.frame` containing predictions of the model,
#' including the true dates (`dates`), their numeric version used in the
#' model (`dates.x`), the predicted value (`fit`), and the lower
#' (`lwr`) and upper (`upr`) bounds of the associated confidence
#' interval.
#'
#' - `doubling`: the predicted doubling time in days; exists only if 'r' is
#' positive
#' - `doubling.conf`: the confidence interval of the doubling time
#' - `halving`: the predicted halving time in days; exists only if 'r' is
#' negative
#' - `halving.conf`: the confidence interval of the halving time
#'
#' For `fit_optim_split`, a list containing:
#'
#' - `df`: a `data.frame` of dates that were used in the optimization
#' procedure, and the corresponding average R2 of the resulting models.
#' - `split`: the optimal splitting date
#' - `fit`: an `incidence_fit_list` object containing the fit for each split.
#' If the `separate_split = TRUE`, then the `incidence_fit_list` object will
#' contain these splits nested within each group. All of the `incidence_fit`
#' objects can be retrieved with [get_fit()].
#' - `plot`: a plot showing the content of `df` (ggplot2 object)
#'
#' @author Thibaut Jombart \email{thibautjombart@@gmail.com}, Zhian N. Kamvar
#' \email{zkamvar@@gmail.com}.
#'
#' @seealso the [incidence()] function to generate the 'incidence'
#' objects. The [get_fit()] function to flatten `incidence_fit_list` objects to
#' a list of `incidence_fit` objects.
#'
#' @param x An incidence object, generated by the function
#' [incidence()]. For the plotting function, an `incidence_fit`
#' object.
#'
#' @param split An optional time point identifying the separation between the
#' two models. If NULL, a single model is fitted. If provided, two models would
#' be fitted on the time periods on either side of the split.
#'
#' @param level The confidence interval to be used for predictions; defaults to
#' 95%.
#'
#' @param quiet A logical indicating if warnings from `fit` should be
#' hidden; FALSE by default. Warnings typically indicate some zero incidence,
#' which are removed before performing the log-linear regression.
#'
#' @examples
#'
#' if (require(outbreaks)) { withAutoprint({
#' dat <- ebola_sim$linelist$date_of_onset
#'
#' ## EXAMPLE WITH A SINGLE MODEL
#'
#' ## compute weekly incidence
#' i.7 <- incidence(dat, interval=7)
#' plot(i.7)
#' plot(i.7[1:20])
#'
#' ## fit a model on the first 20 weeks
#' f <- fit(i.7[1:20])
#' f
#' names(f)
#' head(get_info(f, "pred"))
#'
#' ## plot model alone (not recommended)
#' plot(f)
#'
#' ## plot data and model (recommended)
#' plot(i.7, fit = f)
#' plot(i.7[1:25], fit = f)
#'
#' ## piping versions
#' if (require(magrittr)) { withAutoprint({
#' plot(i.7) %>% add_incidence_fit(f)
#'
#'
#' ## EXAMPLE WITH 2 PHASES
#' ## specifying the peak manually
#' f2 <- fit(i.7, split = as.Date("2014-10-15"))
#' f2
#' plot(i.7) %>% add_incidence_fit(f2)
#'
#' ## finding the best 'peak' date
#' f3 <- fit_optim_split(i.7)
#' f3
#' plot(i.7) %>% add_incidence_fit(f3$fit)
#' })}
#'
#' })}
#'
## The model fitted is a simple linear regression on the log-incidence.
## Non-trivial bits involve:
## 1) Fitting several models
## I.e. in case there is a increasing and a decreasing phase, we fit one
## model for each phase separately.
## 2) log(0)
## No satisfying solutions so far; for now removing the NAs
## 3) Several groups
## In this case, the number of models does not change, but models automatically
## include groups with interaction, whether or not it is significant.
## 4) Values of dates used as 'x'
## To retain generality, we need to use numbers (not Date or POSIXct) as 'x'
## axis for the model. Therefore, all dates are expressed as numbers of days
## since the first case (aka 'day 0' or 'origin'), picking the middle of each
## time interval. We also keep track of the origin, so that actual dates can be
## reconstructed during the plotting. Each 'fit' object has its own origin.
fit <- function(x, split = NULL, level = 0.95, quiet = FALSE){
n.groups <- ncol(x$counts)
## remove dates with one incidence of zero
to.keep <- apply(x$counts, 1, min) > 0
if (!quiet && !all(to.keep)) {
warning(sprintf("%d dates with incidence of 0 ignored for fitting",
sum(!to.keep)))
}
x <- x[to.keep]
## If there is only one date with non-zero incidence
## then no model cannot be fit. If there are no days with
## non-zero incidence, creation of incidence object throws
## error anyway.
if (x$timespan == 1) {
stop("Only 1 date with non-zero incidence. Cannot fit model to 1 data point.")
}
## Ensure that model coefficients are based on a daily timestep (not seconds)
if (inherits(x$dates, 'POSIXt')) {
x$dates <- as.Date(x$dates)
}
## Constructing the model based on number of groups present
if (n.groups == 1) {
the_model <- "log(counts) ~ dates.x"
} else {
the_model <- "log(counts) ~ dates.x * groups"
}
the_model <- stats::formula(the_model)
## model without split (1 model)
if (is.null(split)) {
df <- as.data.frame(x, long = TRUE)
## exact dates
df$dates.x <- get_dates(x, position = "center", count_days = TRUE)
lm1 <- stats::lm(the_model, data = df)
# updating the call for easier inspection by the user
lm1$call[[2]] <- the_model
out <- extract_info(lm1, min(get_dates(x)), level)
} else {
x1 <- x[x$dates <= split]
x2 <- x[x$dates >= split]
df1 <- as.data.frame(x1, long = TRUE)
df2 <- as.data.frame(x2, long = TRUE)
## exact dates
df1$dates.x <- get_dates(x1, position = "center", count_days = TRUE)
df2$dates.x <- get_dates(x2, position = "center", count_days = TRUE)
lm1 <- stats::lm(the_model, data = df1)
lm2 <- stats::lm(the_model, data = df2)
# updating the call for easier inspection by the user
lm1$call[[2]] <- the_model -> lm2$call[[2]]
before <- extract_info(lm1, min(get_dates(x1)), level)
after <- extract_info(lm2, min(get_dates(x2)), level)
out <- list(before = before,
after = after
)
attr(out, "locations") <- list("before", "after")
class(out) <- "incidence_fit_list"
}
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.