Nothing
#' Fit excess count model
#'
#' This function estimates the increase in the rate for a count time series relative to
#' the rate for a typical year. Two options are available: 1 - model the rate increase as a
#' smooth function and estimate this function or 2 - estimate the total excess in intervals.
#' For 1 an `event` date can be provided and a discontinuity included in the model.
#' You can do either 1 or 2 or both.
#'
#' Three versions of the model are available: 1 - Assume counts are Poisson distributed,
#' 2 - assume counts are overdispersed Poisson, or 3 - assume a mixed model with
#' correlated errors. The second is the default and recommended for weekly count data. For daily counts we often find
#' evidence of correlation and recommend the third along with setting `weekday.effect = TRUE`.
#'
#' If the `counts` object includes a `expected` column produced by `compute_expected` these are used
#' as the expected counts. If not, then these are computed.
#'
#' @param counts A data frame with date, count and population columns.
#' @param start First day of interval to which model will be fit
#' @param end Last day of interval to which model will be fit
#' @param knots.per.year Number of knots per year used for the fitted smooth function
#' @param event If modeling a discontinuity is desired, this is the day in which it happens
#' @param intervals Instead of `start` and `end` a list of time intervals can be provided and excess is computed in each one
#' @param discontinuity Logical that determines if discontinuity is allowed at `event`
#' @param model Which version of the model to fit
#' @param exclude Dates to exclude when computing expected counts
#' @param include.trend Logical that determines if a slow trend is included in the model.
#' @param trend.knots.per.year Number of knots per year used by `compute_expected` to estimate the trend for the expected counts
#' @param harmonics Number of harmonics used by `compute_expected` to estimate seasonal trend
#' @param frequency Number of observations per year. If not provided an attempt is made to calculate it
#' @param weekday.effect Logical that determins if a day of the week effects is included in the model. Should be `FALSE` for weekly or monthly data.
#' @param control.dates When `model` is set to `correlated`, these dates are used to estimate the covariance matrix. The larger this is the slower the function runs.
#' @param max.control If the length of `control.dates` is larger than `max.control` the function stops.
#' @param order.max Larges order for the Autoregressive process used to model the covariance structure
#' @param aic A logical that determines if the AIC criterion is used to selected the order of the AR process
#' @param maxit Maxium number of iterations for the IRLS algorithm used when `model` is `correlated`
#' @param epsilon Difference in deviance requried to declare covergenace of IRLS
#' @param alpha Percentile used to define what is outside the normal range
#' @param min.rate The estimated expected rate is not permited to go below this value
#' @param keep.counts A logical that if `TRUE` forces the function to include the data used to fit the expected count model.
#' @param keep.components A logical that if `TRUE` forces the function to return the estimated trend, seasonal model, and weekday effect, if included in the model. Ignored if expected counts already provided or `keep.counts` is `FALSE`.
#' @param verbose Logical that determines if messages are displayed
#'
#' @return If only `intervals` are provided a data frame with excess estimates described below for `excess`.
#' if `start` and `end` are provided the a list with the following components are included:
#' \describe{
#' \item{date}{The dates for which the estimate was computed}
#' \item{observed}{The observed counts}
#' \item{expected}{The expected counts}
#' \item{fitted}{The fitted curve for excess counts}
#' \item{se}{The point-wise standard error for the fitted curve}
#' \item{population}{The population size}
#' \item{sd}{The standard deviation for observed counts on a typical year}
#' \item{cov}{The estimated covariance matrix for the observed counts}
#' \item{x}{The design matrix used for the fit}
#' \item{betacov}{The covariance matrix for the estimated coefficients}
#' \item{dispersion}{The estimated overdispersion parameter}
#' \item{detected_intervals}{Time intervals for which the 1 - `alpha` confidence interval does not include 0}
#' \item{ar}{The estimated coefficients for the autoregressive process}
#' \item{excess}{A data frame with information for the time intervals provided in `itervals`. This includes start, end, observed death rate (per 1,000 per year), expected death rate, standard deviation for the death rate, observed counts, expected counts, excess counts, standard deviation}
#' }
#'
#' @examples
#' data(cdc_state_counts)
#' counts <- cdc_state_counts[cdc_state_counts$state == "Massachusetts", ]
#' exclude_dates <- c(seq(as.Date("2017-12-16"), as.Date("2018-01-16"), by = "day"),
#' seq(as.Date("2020-01-01"), max(cdc_state_counts$date), by = "day"))
#' f <- excess_model(counts,
#' exclude = exclude_dates,
#' start = min(counts$date),
#' end = max(counts$date),
#' knots.per.year = 12)
#' data(new_jersey_counts)
#' exclude_dates <- as.Date("2012-10-29") + 0:180
#' control_dates <- seq(min(new_jersey_counts$date), min(exclude_dates) - 1, by = "day")
#' f <- excess_model(new_jersey_counts,
#' start = as.Date("2012-09-01"),
#' end = as.Date("2013-09-01"),
#' exclude = exclude_dates,
#' model = "correlated",
#' weekday.effect = TRUE,
#' control.dates = control_dates)
#'
#' @export
#' @importFrom stats ARMAacf glm poly qnorm fitted.values
excess_model <- function(counts,
start = NULL,
end = NULL,
knots.per.year = 12,
event = NULL,
intervals = NULL,
discontinuity = TRUE,
model = c("quasipoisson", "poisson", "correlated"),
exclude = NULL,
include.trend = TRUE,
trend.knots.per.year = 1/7,
harmonics = 2,
frequency = NULL,
weekday.effect = FALSE,
control.dates = NULL,
max.control = 5000,
order.max = 14,
aic = TRUE,
maxit = 25,
epsilon = 1e-8,
alpha = 0.05,
min.rate = 0.0001,
keep.counts = FALSE,
keep.components = TRUE,
verbose = TRUE){
if (!"compute_expected" %in% class(counts)) {
if (verbose) message("Computing expected counts.")
counts <- compute_expected(counts,
exclude = exclude,
include.trend = include.trend,
trend.knots.per.year = trend.knots.per.year,
harmonics = harmonics,
frequency = frequency,
weekday.effect = weekday.effect,
keep.components = keep.components,
verbose = verbose)
}
correlated.errors <- match.arg(model) == "correlated"
## number of observations per year
frequency <- attr(counts, "frequency")
dispersion <- attr(counts, "dispersion")
## checks
if (frequency == 12 & correlated.errors) {
stop("Correlated error model, model = \"correlated\", can't be fitted with monthly data.")
}
if (any(counts$excluded)) {
exclude <- counts$date[counts$excluded]
} else{
exclude <- NULL
}
if (correlated.errors & is.null(control.dates)) {
if (!is.null(exclude)) {
warning("No control region suplied, which is not recommended when model = \"correlated\". Using data up to first excluded point.")
control.dates <- seq(min(counts$date), min(exclude, na.rm = TRUE) - 1, by = "day")
} else{
warning("No control region suplied, which is not recommended when correlated.errors = TRUE. Using all the data")
control.dates <- counts$date
}
}
if (length(control.dates) > max.control & correlated.errors)
stop("Length of control longer than", max.control)
if ((is.null(start) & !is.null(end)) | (!is.null(start) & is.null(end)))
stop("You must provide both start and end, not just one.")
if (is.null(start) & is.null(end) & is.null(intervals))
stop("You must provide start and end or intervals.")
## check to see if counts per unit of time are high enough for model to work
if (mean(counts$outcome, na.rm = TRUE) < 1 & correlated.errors)
warning("Low counts per unit of time, consider fitting model with no correlation.")
## compute_expected always uses quasipoisson
if (match.arg(model) == "poisson") dispersion <- 1
## Use control days to compute the autocorrelation function
if (correlated.errors) {
arfit <- fit_ar(counts, control.dates, order.max = order.max, aic = aic)
s <- arfit$sigma
if (verbose) message("Order selected for AR model is ",length(arfit$ar),
". Estimated residual standard error is ", signif(arfit$sigma, 2), ".")
}
## Fit start and end provided, fit the curve
if (!is.null(start) & !is.null(end)) {
if (!is.null(event)) {
if (event >= end | event <= start)
stop("event must be between start and end.")
}
## now fit the GLS to the relevant subset of data
include_dates = seq(start, end, by = "day")
if (frequency == 365)
include_dates <- include_dates[!(lubridate::month(include_dates) == 2 & lubridate::day(include_dates) == 29)]
ind <- which(counts$date %in% include_dates)
date <- counts$date[ind]
n <- length(ind)
x <- 0:(n - 1) / frequency * 365
mu <- pmax(min.rate, counts$expected[ind])
if (any(mu == min.rate)) warning("Minimum expected rate reached and was set at ", min.rate, ".")
min.fhat <- min.rate / mu - 1
obs <- counts$outcome[ind]
pop <- counts$population[ind]
## compute residuals to fit ar model
if (correlated.errors) y <- (obs - mu) / mu
## create the design matrix
nknots <- round(knots.per.year * as.numeric(max(date) - min(date)) / 365)
knots <- x[round(seq(1, n, length = nknots + 2))]
knots <- knots[-c(1, length(knots))]
if (!is.null(event)) {
event_index <- x[which.min(abs(as.numeric(date - event)))]
i <- which.min(abs(knots - event_index))
##shift knots so that one of the internal knots falls on the event day
knots <- knots + (event_index - knots[i])
X <- cbind(1, splines::ns(x, knots = knots))
## add parameters to account for discontinuity
if (discontinuity) {
after_ind <- as.numeric(I(x >= event_index))
X <- cbind(X, after_ind, poly((x - event_index)*after_ind, degree = 2))
}
} else{
X <- cbind(1, splines::ns(x, knots = knots))
}
bad_cond_1 <- n <= ncol(X)
bad_cond_2 <- qr(X)$rank < ncol(X)
if (bad_cond_1 | bad_cond_2) {
## fit a saturated model: every month gets a mean value
if (bad_cond_1) warning("Model degrees of freedom exceeds number of observations: fitting a saturated model. Consider reducing knots.per.year.")
if (!bad_cond_1 & bad_cond_2) warning("Model design resulted in singular matrix: fitting a saturated model. Consider reducing knots.per.year.")
dates <- as.factor(date)
X <- model.matrix(~dates)
}
if (correlated.errors) {
fhat <- 0
beta <- 0;
dev <- 2*sum(ifelse(obs == 0, 0, obs * log(obs / mu)) - (obs - mu))
## convinience function
mysolve <- function(x) chol2inv(chol(x))
## parameters for covariance matrix
if (length(arfit$ar) > 0 & arfit$sigma > 0) {
rhos <- ARMAacf(ar = arfit$ar, ma = 0, lag.max = n)
}
## start iterations
count <- 0
flag <- TRUE
log_mu_vari <- counts$log_expected_se[ind]^2
while (count < maxit & flag) {
if (length(arfit$ar) > 0 & s > 0) {
Sigma <- apply(abs(outer(1:n, 1:n, "-")) + 1, 1, function(i) rhos[i]) *
outer(sqrt((1 + fhat)^2 * s^2 + (1 + fhat)/mu + (1 + fhat)^2 * log_mu_vari), sqrt((1 + fhat)^2 * s^2 + (1 + fhat)/mu + (1 + fhat)^2 * log_mu_vari))
Sigma_inv <- mysolve(Sigma)
} else{
Sigma <- diag((1 + fhat)^2 * s^2 + (1 + fhat)/mu + (1 + fhat)^2 * log_mu_vari)
Sigma_inv <- diag(1/((1 + fhat)^2 * s^2 + (1 + fhat)/mu + (1 + fhat)^2 * log_mu_vari))
}
## fit spline using weighted least squares
xwxi <- mysolve(t(X) %*% Sigma_inv %*% X)
beta <- xwxi %*% t(X) %*% Sigma_inv %*% y
count <- count + 1
fhat <- pmax(as.vector(X %*% beta), min.fhat)
devold <- dev
dev <- 2*sum(ifelse(obs == 0, 0, obs * log(obs / (mu*(1 + fhat)))) - (obs - mu*(1 + fhat)))
flag <- abs(dev - devold)/(0.1 + abs(dev)) >= epsilon
}
if (count > maxit) warning("No convergence after ", maxit, " imterations.")
se <- sqrt(apply(X, 1, function(x) matrix(x, nrow = 1) %*% xwxi %*% matrix(x, ncol = 1)))
betacov <- xwxi
} else{
fit <- glm(obs ~ X-1, offset = log(mu), family = "poisson")
tmp <- predict(fit, se = TRUE, type = "response")
fhat <- pmax(tmp$fit/mu - 1, min.fhat)
lambda <- fitted.values(fit)
cova <- summary(fit)$cov.unscaled*dispersion
lambda_vari <- lambda^2 * diag(X %*% cova %*% t(X))
mu_vari <- counts$expected[ind]^2*counts$log_expected_se[ind]^2
se <- sqrt((lambda_vari / mu^2) + (lambda^2 * mu_vari / mu^4))
Sigma <- diag(n)*summary(fit)$dispersion/mu
betacov <- summary(fit)$cov.unscaled*dispersion
}
## Warning if minimum reached
if (any(fhat == min.fhat)) warning("Minimum rate reached and was set so that estimated rate is ", min.rate, ".")
## Compute regions for which estimate was outside usual range
excess_ind <- which(fhat - qnorm(1 - alpha/2) * se >= 0)
if (length(excess_ind) > 0) {
cluster <- cumsum(c(2, diff(excess_ind)) > 1)
indexes <- split(excess_ind, cluster)
excess <- lapply(indexes, function(i){
n <- length(i)
excess_stats(min(counts$date[ind[i]]),
max(counts$date[ind[i]]),
counts$outcome[ind[i]],
counts$expected[ind[i]],
Sigma[i, i, drop = FALSE],
counts$population[ind[i]],
frequency,
fhat[i],
X[i,,drop = FALSE],
betacov)
})
detected_intervals <- do.call(rbind, excess)
} else{
detected_intervals <- data.frame(start = NA, end = NA, total = 0, se = NA, natural = NA)
}
ret <- list(date = date,
observed = obs,
expected = mu,
log_expected_se = counts$log_expected_se[ind],
fitted = fhat,
se = se,
population = pop,
sd = sqrt(diag(Sigma)),
cov = Sigma,
x = X,
betacov = betacov,
dispersion = dispersion,
detected_intervals = detected_intervals)
attr(ret, "frequency") <- frequency
attr(ret, "model") <- match.arg(model)
attr(ret, "type") <- "curve_fit"
if (correlated.errors) {
ret$ar <- arfit
}
}
## If intervals provided compute excess deaths
## The uncertainty is calculated under the null
if (!is.null(intervals)) {
res <- lapply(intervals, function(dates){
ind <- which(counts$date %in% dates)
date <- counts$date[ind]
n <- length(ind)
mu <- counts$expected[ind]
log_mu_vari <- counts$log_expected_se[ind]^2
if (correlated.errors) {
if (length(arfit$ar) > 0 & arfit$sigma > 0) {
rhos <- ARMAacf(ar = arfit$ar, ma = 0, lag.max = n)
}
if (length(arfit$ar) > 0 & arfit$sigma > 0) {
Sigma <- apply(abs(outer(1:n, 1:n, "-")) + 1, 1, function(i) rhos[i]) *
outer(sqrt(s^2 + 1/mu + log_mu_vari), sqrt(s^2 + 1/mu + log_mu_vari))
} else{
Sigma <- diag(s^2 + 1/mu + log_mu_vari)
}
} else{
Sigma <- diag(n) * dispersion / mu
}
excess_stats(min(dates),
max(dates),
counts$outcome[ind],
mu,
Sigma,
counts$population[ind],
frequency)
})
res <- do.call(rbind, res)
if (!exists("ret")) {
ret <- res
attr(ret, "type") <- "excess"
} else{
ret$excess <- res
attr(ret, "type") <- append(attr(ret, "type"), "excess")
}
}
attr(ret, "class") <- append("excess_model", class(ret))
attr(ret, "keep.counts") <- keep.counts
if (keep.counts) ret$counts <- counts
return(ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.