R/fit.R
In incidence: Compute, Handle, Plot and Model Incidence of Dated Events

Documented in fit

#' Fit exponential models to incidence data
#'
#' The function `fit` fits two exponential models to incidence data, of the
#' form: \eqn{log(y) = r * t + b} \cr where 'y' is the incidence, 't' is time
#' (in days), 'r' is the growth rate, and 'b' is the origin. The function `fit`
#' will fit one model by default, but will fit two models on either side of a
#' splitting date (typically the peak of the epidemic) if the argument `split`
#' is provided. When groups are present, these are included in the model as main
#' effects and interactions with dates. The function `fit_optim_split()` can be
#' used to find the optimal 'splitting' date, defined as the one for which the
#' best average R2 of the two models is obtained. Plotting can be done using
#' `plot`, or added to an existing incidence plot by the piping-friendly
#' function `add_incidence_fit()`.
#'
#' @export
#'
#' @rdname fit
#'
#' @return For `fit()`, a list with the class `incidence_fit` (for a
#' single model), or a list containing two `incidence_fit` objects (when
#' fitting two models). `incidence_fit` objects contain:
#'
#' - `$model`: the fitted linear model
#' - `$info`: a list containing various information extracted from the model
#'   (detailed further)
#' - `$origin`: the date corresponding to day '0'
#'
#' The `$info` item is a list containing:
#'
#' - `r`: the growth rate
#' - `r.conf`: the confidence interval of 'r'
#' - `pred`: a `data.frame` containing predictions of the model,
#'   including the true dates (`dates`), their numeric version used in the
#'   model (`dates.x`), the predicted value (`fit`), and the lower
#'   (`lwr`) and upper (`upr`) bounds of the associated confidence
#'   interval.
#'
#' - `doubling`: the predicted doubling time in days; exists only if 'r' is
#'   positive
#' - `doubling.conf`: the confidence interval of the doubling time
#' - `halving`: the predicted halving time in days; exists only if 'r' is
#'   negative
#' - `halving.conf`: the confidence interval of the halving time
#'
#' For `fit_optim_split`, a list containing:
#'
#' - `df`: a `data.frame` of dates that were used in the optimization
#'   procedure, and the corresponding average R2 of the resulting models.
#' - `split`: the optimal splitting date
#' - `fit`: an `incidence_fit_list` object containing the fit for each split.
#'   If the `separate_split = TRUE`, then the `incidence_fit_list` object will
#'   contain these splits nested within each group. All of the `incidence_fit`
#'   objects can be retrieved with [get_fit()].
#' - `plot`: a plot showing the content of `df` (ggplot2 object)
#'
#' @author Thibaut Jombart \email{thibautjombart@@gmail.com}, Zhian N. Kamvar
#'   \email{zkamvar@@gmail.com}.
#'
#' @seealso the [incidence()] function to generate the 'incidence'
#' objects. The [get_fit()] function to flatten `incidence_fit_list` objects to
#' a list of `incidence_fit` objects.
#'
#' @param x An incidence object, generated by the function
#' [incidence()]. For the plotting function, an `incidence_fit`
#' object.
#'
#' @param split An optional time point identifying the separation between the
#' two models. If NULL, a single model is fitted. If provided, two models would
#' be fitted on the time periods on either side of the split.
#'
#' @param level The confidence interval to be used for predictions; defaults to
#' 95%.
#'
#' @param quiet A logical indicating if warnings from `fit` should be
#' hidden; FALSE by default. Warnings typically indicate some zero incidence,
#' which are removed before performing the log-linear regression.
#'
#' @examples
#'
#' if (require(outbreaks)) { withAutoprint({
#'  dat <- ebola_sim$linelist$date_of_onset
#'
#'  ## EXAMPLE WITH A SINGLE MODEL
#'
#'  ## compute weekly incidence
#'  i.7 <- incidence(dat, interval=7)
#'  plot(i.7)
#'  plot(i.7[1:20])
#'
#'  ## fit a model on the first 20 weeks
#'  f <- fit(i.7[1:20])
#'  f
#'  names(f)
#'  head(get_info(f, "pred"))
#'
#'  ## plot model alone (not recommended)
#'  plot(f)
#'
#'  ## plot data and model (recommended)
#'  plot(i.7, fit = f)
#'  plot(i.7[1:25], fit = f)
#'
#' ## piping versions
#' if (require(magrittr)) { withAutoprint({
#'   plot(i.7) %>% add_incidence_fit(f)
#'
#'
#'   ## EXAMPLE WITH 2 PHASES
#'   ## specifying the peak manually
#'   f2 <- fit(i.7, split = as.Date("2014-10-15"))
#'   f2
#'   plot(i.7) %>% add_incidence_fit(f2)
#'
#'   ## finding the best 'peak' date
#'   f3 <- fit_optim_split(i.7)
#'   f3
#'   plot(i.7) %>% add_incidence_fit(f3$fit)
#' })}
#'
#' })}
#'


## The model fitted is a simple linear regression on the log-incidence.

## Non-trivial bits involve:

## 1) Fitting several models
## I.e. in case there is a increasing and a decreasing phase, we fit one
##  model for each phase separately.

## 2) log(0)
## No satisfying solutions so far; for now removing the NAs

## 3) Several groups
## In this case, the number of models does not change, but models automatically
## include groups with interaction, whether or not it is significant.

## 4) Values of dates used as 'x'

## To retain generality, we need to use numbers (not Date or POSIXct) as 'x'
## axis for the model.  Therefore, all dates are expressed as numbers of days
## since the first case (aka 'day 0' or 'origin'), picking the middle of each
## time interval. We also keep track of the origin, so that actual dates can be
## reconstructed during the plotting. Each 'fit' object has its own origin.

fit <- function(x, split = NULL, level = 0.95, quiet = FALSE){
  n.groups <- ncol(x$counts)

  ## remove dates with one incidence of zero
  to.keep <- apply(x$counts, 1, min) > 0
  if (!quiet && !all(to.keep)) {
    warning(sprintf("%d dates with incidence of 0 ignored for fitting",
                    sum(!to.keep)))
  }
  x <- x[to.keep]
  ## If there is only one date with non-zero incidence
  ## then no model cannot be fit. If there are no days with
  ## non-zero incidence, creation of incidence object throws
  ## error anyway.
  if (x$timespan == 1) {
    stop("Only 1 date with non-zero incidence. Cannot fit model to 1 data point.")
  }
  ## Ensure that model coefficients are based on a daily timestep (not seconds)
  if (inherits(x$dates, 'POSIXt')) {
    x$dates <- as.Date(x$dates)
  }

  ## Constructing the model based on number of groups present
  if (n.groups == 1) {
    the_model <- "log(counts) ~ dates.x"
  } else {
    the_model <- "log(counts) ~ dates.x * groups"
  }
  the_model <- stats::formula(the_model)

  ## model without split (1 model)
  if (is.null(split)) {
    df <- as.data.frame(x, long = TRUE)
    ## exact dates
    df$dates.x <- get_dates(x, position = "center", count_days = TRUE)
    lm1 <- stats::lm(the_model, data = df)
    # updating the call for easier inspection by the user
    lm1$call[[2]] <- the_model
    out <- extract_info(lm1, min(get_dates(x)), level)
  } else {
    x1 <- x[x$dates <= split]
    x2 <- x[x$dates >= split]
    df1 <- as.data.frame(x1, long = TRUE)
    df2 <- as.data.frame(x2, long = TRUE)
    ## exact dates
    df1$dates.x <- get_dates(x1, position = "center", count_days = TRUE)
    df2$dates.x <- get_dates(x2, position = "center", count_days = TRUE)
    lm1 <- stats::lm(the_model, data = df1)
    lm2 <- stats::lm(the_model, data = df2)
    # updating the call for easier inspection by the user
    lm1$call[[2]] <- the_model -> lm2$call[[2]]
    before <- extract_info(lm1, min(get_dates(x1)), level)
    after <- extract_info(lm2, min(get_dates(x2)), level)
    out <- list(before = before,
                after = after
                )
    attr(out, "locations") <- list("before", "after")
    class(out) <- "incidence_fit_list"
  }
  out
}