# R/estimate_beta_FC.R In davidearn/fastbeta: Fast Estimation of Time-Varying Transmission Rates

#### Documented in estimate_beta_FC

#' Estimate time-varying transmission rates (FC method)
#'
#' estimate_beta_FC() applies the FC method (see References)
#' to estimate the time-varying transmission rate
#' \ifelse{latex}{\out{$\beta(t)$}}{\ifelse{html}{\out{<i>&beta;</i>(<i>t</i>)}}{beta(t)}}
#' from time series of reported incidence and births, observed at
#' equally spaced time points
#' \ifelse{latex}{\out{$t_k = t_0 + k \Delta t$}}{\ifelse{html}{\out{<i>t<sub>k</sub></i> = <i>t</i><sub>0</sub>+<i>k&Delta;t</i>}}{t_k = t_0 + k*Dt}}
#' (for \ifelse{latex}{\out{$k = 0,\ldots,n$}}{\ifelse{html}{\out{<i>k</i> = 0,...,<i>n</i>}}{k = 0,...,n}}),
#' where
#' \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#' denotes the observation interval.
#'
#' @section Mock birth data:
#' If df$B is undefined in the function call, then df$B[i]
#' gets the value with(par_list, nu * hatN0 * 1) for all i.
#'
#' @section Missing data:
#' Missing values in df[, c("C", "B")] are not tolerated by
#' the FC method. They are imputed via linear interpolation
#' between observed values. If there are no observations
#' before the first missing value, then complete imputation
#' is impossible. In this case, the FC method may fail:
#' columns S and beta in the output may be filled with NA.
#'
#' Zeros in df$C cause divide-by-zero errors. To prevent these #' errors, zeros are imputed like missing values. If there are #' no nonzero observations before the first zero, then complete #' imputation is impossible. In this case, the FC method may fail, #' but only locally: column beta in the output may contain some #' NaN and Inf where an estimate would otherwise be expected. #' (Estimates are not expected everywhere. See "Effective #' observation interval".) #' #' @section Effective observation interval: #' In the output, every with(par_list, round(tgen)) rows are #' complete. The remaining rows contain NA in columns S, I, #' beta, Z_agg, and B_agg. This is a limitation of the FC #' method, which aggregates incidence and births over the mean #' generation interval, making with(par_list, round(tgen)) the #' *effective* observation interval. If this number is zero, then #' the method fails. #' #' @param df A data frame with numeric columns: #' #' \describe{ #' \item{t}{Time. t[i] is equal to #' \ifelse{latex}{\out{$t_{i-1} = t_0 + (i-1) \Delta t$}}{\ifelse{html}{\out{<i>t</i><sub><i>i</i>&minus;1</sub> = <i>t</i><sub>0</sub>+(<i>i</i>&minus;1)<i>&Delta;t</i>}}{t_i-1 = t_0 + (i-1)*Dt}} #' in units #' \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}, #' so that t[i] - t[i-1] is equal to 1. #' } #' \item{C}{Reported incidence. C[i] is the number of cases #' reported between times t[i-1] and t[i]. #' } #' \item{B}{Births. B[i] is the number of births between times #' t[i-1] and t[i]. #' } #' } #' #' B is optional if hatN0 and nu are defined in par_list #' (see Details). #' @param par_list A list containing: #' #' \describe{ #' \item{prep}{$\ifelse{latex}{\out{p_\text{rep}}}{\ifelse{html}{\out{<i>p</i><sub>rep</sub>}}{p_rep}}$ #' Case reporting probability. #' } #' \item{trep}{$\ifelse{latex}{\out{t_\text{rep}}}{\ifelse{html}{\out{<i>t</i><sub>rep</sub>}}{t_rep}}$ #' Case reporting delay in units #' \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}. #' } #' \item{S0}{$\ifelse{latex}{\out{S_0}}{\ifelse{html}{\out{<i>S</i><sub>0</sub>}}{S_0}}$ #' Number of susceptibles at time #' \ifelse{latex}{\out{$t = t_0$}}{\ifelse{html}{\out{<i>t</i> = <i>t</i><sub>0</sub>}}{t = t_0}}. #' } #' \item{hatN0}{$\ifelse{latex}{\out{\widehat{N}_0}}{\ifelse{html}{\out{<i>&Ntilde;</i><sub>0</sub>}}{hatN_0}}$ #' Population size at time #' \ifelse{latex}{\out{$t = 0$}}{\ifelse{html}{\out{<i>t</i> = 0}}{t = 0}} #' years. #' } #' \item{nu}{$\ifelse{latex}{\out{\nu_\text{c}}}{\ifelse{html}{\out{<i>&nu;<sub>c</sub></i>}}{nu_c}}$ #' Birth rate expressed per unit #' \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}} #' and relative to #' \ifelse{latex}{\out{$\hat{N}_0$}}{\ifelse{html}{\out{<i>&Ntilde;</i><sub>0</sub>}}{hatN_0}} #' (if modeled as constant). #' } #' \item{tgen}{$\ifelse{latex}{\out{t_\text{gen}}}{\ifelse{html}{\out{<i>t</i><sub>gen</sub>}}{t_gen}}$ #' Mean generation interval of the disease of interest in units #' \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}. #' } #' } #' #' hatN0 and nu are optional if B is defined in df #' (see Details). #' #' @return #' A data frame with numeric columns: #' #' \describe{ #' \item{t}{Time. Identical to df$t.}
#'   \item{C}{Reported incidence, imputed. Identical to df$C, #' except with missing values and zeros imputed (see Details). #' } #' \item{Z}{Incidence. Z[i] is the estimated #' number of infections between times t[i-1] and t[i]. #' } #' \item{Z_agg}{Aggregated incidence. Z_agg[i] is the estimated #' number of infections between times t[i-round(tgen)+1] and #' t[i], for i in seq(1 + round(tgen), nrow(df), by = round(tgen)). #' Z_agg[i] is NA otherwise. #' } #' \item{B}{Births, imputed. Identical to df$B (if supplied),
#'     except with missing values imputed (see Details).
#'   }
#'   \item{B_agg}{Aggregated births. B_agg[i] is the
#'     number of births between times t[i-round(tgen)+1] and t[i],
#'     for i in seq(1 + round(tgen), nrow(df), by = round(tgen)).
#'     B_agg[i] is NA otherwise.
#'   }
#'   \item{S}{Number of susceptibles. S[i] is the estimated
#'     number of susceptibles at time t[i].
#'   }
#'   \item{I}{Number of infecteds. I[i] is the estimated
#'     number of infecteds at time t[i].
#'   }
#'   \item{beta}{Transmission rate. beta[i] is the estimated
#'     transmission rate at time t[i] expressed per unit
#'     \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#'     per susceptible per infected.
#'   }
#' }
#'
#' It possesses par_list as an attribute.
#'
#' @examples
#' # Simulate a reported incidence time series using
#' # a seasonally forced transmission rate
#' par_list <- make_par_list(dt_weeks = 1)
#' df <- make_data(
#'   par_list = par_list,
#'   n = 20 * 365 / 7, # 20 years is ~1042 weeks
#'   with_dem_stoch = FALSE
#' )
#'
#' # Estimate incidence, susceptibles, infecteds,
#' # and the seasonally forced transmission rate
#' df_FC <- estimate_beta_FC(df, par_list)
#'
#' # Estimation of susceptibles and transmission rate fails
#' # because the FC method ignores susceptible mortality
#' df_FC$t_years <- df$t_years
#' plot(S ~ t_years, df, type = "l", ylim = c(43, 83) * 1e03)
#' lines(S ~ t_years, df_FC[!is.na(df_FC$S), ], col = "red") #' plot(beta ~ t_years, df, type = "l", ylim = c(0.5, 1.2) * 1e-05) #' lines(beta ~ t_years, df_FC[!is.na(df_FC$beta), ], col = "red")
#'
#' @references
#' deJonge MS, Jagan M, Krylova O, Earn DJD. Fast estimation of
#' time-varying transmission rates for infectious diseases.
#'
#' @md
#' @export
estimate_beta_FC <- function(df       = data.frame(),
par_list = list()) {

## 1. Set-up -----------------------------------------------------------

# Load necessary elements of par_list into the execution environment
list2env(
par_list[c("prep", "trep", "S0", "hatN0", "nu", "tgen")],
envir = environment()
)

# Logical: Is df missing a column B?
B_was_not_def <- is.null(df$B) # Preallocate memory for output. Assume a constant birth rate, # if necessary. df <- data.frame( t = df$t,
C     = df$C, Z = NA, Z_agg = NA, B = if (B_was_not_def) nu * hatN0 * 1 else df$B,
B_agg = NA,
S     = NA,
I     = NA,
beta  = NA
)

## 2. Impute missing values in reported incidence, births --------------

df[c("C", "B")] <- lapply(df[c("C", "B")],
function(x) {
if (!any(is.na(x))) {
return(x)
}

# Indices of missing values
ind_na <- which(is.na(x))

# Function that performs linear interpolation
# between time points where x is observed
impute_na <- stats::approxfun(
x      = df$t[-ind_na], y = x[-ind_na], method = "linear", rule = 1 # return NA outside range of (argument) x ) # Impute missing values. Missing values outside of # the range of observed data are retained as NA. replace(x, ind_na, impute_na(df$t[ind_na]))
}
)

## 3. Impute zeros in reported incidence -------------------------------

df$C <- local( { if (!any(df$C == 0, na.rm = TRUE)) {
return(df$C) } # Indices of positive values ind_pos <- which(df$C > 0)

# Indices of zeros between two positive values
ind_zero <- which(df$C == 0) ind_zero <- ind_zero[ ind_zero > min(ind_pos) & ind_zero < max(ind_pos) ] # Function that performs linear interpolation # between time points where df$C is positive
impute_zero <- stats::approxfun(
x      = df$t[ind_pos], y = df$C[ind_pos],
method = "linear",
rule   = 1 # return NA outside range of x
)

# Impute zeros. Zeros outside of the range of
# positive data are retained as zeros due to
# the definition of ind_zero.
replace(df$C, ind_zero, impute_zero(df$t[ind_zero]))
}
)

## NOTE: In the best case, there are no longer missing values or zeros
##       in df$C. In the worst case, df$C is now looks like
##       c(NA,...,NA,0,...,0,+,...,+,0,...,0,NA,...,NA),
##       where + denotes a positive number.

## 4. Estimate incidence -----------------------------------------------

trepr <- round(trep)
df$Z <- c( (1 / prep) * df$C[(trepr+1):nrow(df)],
rep(NA, trepr)
)

## 5. Aggregate incidence, births --------------------------------------

# Aggregates are reported after each mean generation interval,
# starting at the first time point. Aggregates at the first
# time point cannot be computed, because there are no prior
# observations to aggregate.
tgenr <- round(tgen)
ind_with_agg <- seq(1, nrow(df), by = tgenr)
for (i in ind_with_agg[-1]) {
ind_into_agg <- seq(i - tgenr + 1, i, by = 1)
df$B_agg[i] <- sum(df$B[ind_into_agg])
df$Z_agg[i] <- sum(df$Z[ind_into_agg])
}

## 6. Estimate susceptibles, infecteds, transmission rate --------------

df[c("S", "I", "beta")] <- with(df,
{
S <- S0
for (i in ind_with_agg[-1]) {
S[i] <- S[i-tgenr] + B_agg[i] - Z_agg[i]
}
I <- Z_agg
beta[ind_with_agg] <- c(Z_agg[ind_with_agg[-1]], NA) /
(S[ind_with_agg] * Z_agg[ind_with_agg] * tgenr)
list(S, I, beta)
}
)

## NOTE: If too many NA were retained at the start of df$C or ## df$B, then df$S and df$beta will be filled with NA.
##       If zeros were retained in df$C, then NaN or Inf will ## appear in df$beta whenever a divide-by-zero error occurs.

## 7. Warn if S is ever negative -------------------------------------

if (any(df$S < 0, na.rm = TRUE)) { # May have underestimated prep or nu new_par_vals <- paste0( "\n* prep > ", sprintf("%.3f", prep), if (B_was_not_def) paste0("\n* nu > ", sprintf("%.3e", nu)) ) warning( "FC method: S[i] < 0 for some i. Retry with:", new_par_vals, call. = FALSE ) } ## 8. Warn if beta is ever NaN or Inf ---------------------------- if (any(is.nan(df$beta) | is.infinite(df$beta))) { warning( "FC method: beta[i] is NaN or Inf for some i. \n", "Is the first or last observation in df$C a zero?",
call. = FALSE
)
}

attr(df, "par_list") <- par_list
df
}

davidearn/fastbeta documentation built on June 14, 2020, 3:11 p.m.