R/estimate_beta_S.R

Defines functions estimate_beta_S

Documented in estimate_beta_S

#' Estimate time-varying transmission rates (S method)
#'
#' `estimate_beta_S()` applies the S method (see References)
#' to estimate the time-varying transmission rate
#' \ifelse{latex}{\out{$\beta(t)$}}{\ifelse{html}{\out{<i>&beta;</i>(<i>t</i>)}}{beta(t)}}
#' from time series of reported incidence, births, and
#' natural mortality, observed at equally spaced time points
#' \ifelse{latex}{\out{$t_k = t_0 + k \Delta t$}}{\ifelse{html}{\out{<i>t<sub>k</sub></i> = <i>t</i><sub>0</sub>+<i>k&Delta;t</i>}}{t_k = t_0 + k*Dt}}
#' (for \ifelse{latex}{\out{$k = 0,\ldots,n$}}{\ifelse{html}{\out{<i>k</i> = 0,...,<i>n</i>}}{k = 0,...,n}}),
#' where
#' \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#' denotes the observation interval.
#'
#' @section Mock vital data:
#' If `df$B` is undefined in the function call, then `df$B[i]`
#' gets the value `with(par_list, nu * hatN0 * 1)` for all `i`.
#' If `df$mu` is undefined the function call, then `df$mu[i]`
#' gets the value `with(par_list, mu)` for all `i`.
#'
#' @section Missing data:
#' Missing values in `df[, c("C", "B", "mu")]` are not tolerated
#' by the S method. They are imputed via linear interpolation
#' between observed values. If there are no observations before
#' the first missing value, then complete imputation is impossible.
#' In this case, the S method may fail: columns `S` and `beta` in
#' the output may be filled with `NA`.
#'
#' Zeros in `df$C` cause divide-by-zero errors. To prevent these
#' errors, zeros are imputed like missing values. If there are
#' no nonzero observations before the first zero, then complete
#' imputation is impossible. In this case, the S method may fail,
#' but only locally: column `beta` in the output may contain some
#' `NaN` and `Inf`, but give estimates everywhere else.
#'
#' @param df A data frame with numeric columns:
#'
#'   \describe{
#'     \item{`t`}{Time. `t[i]` is equal to
#'       \ifelse{latex}{\out{$t_{i-1} = t_0 + (i-1) \Delta t$}}{\ifelse{html}{\out{<i>t</i><sub><i>i</i>&minus;1</sub> = <i>t</i><sub>0</sub>+(<i>i</i>&minus;1)<i>&Delta;t</i>}}{t_i-1 = t_0 + (i-1)*Dt}}
#'       in units
#'       \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}},
#'       so that `t[i] - t[i-1]` is equal to 1.
#'     }
#'     \item{`C`}{Reported incidence. `C[i]` is the number of cases
#'       reported between times `t[i-1]` and `t[i]`.
#'     }
#'     \item{`B`}{Births. `B[i]` is the number of births between times
#'       `t[i-1]` and `t[i]`.
#'     }
#'     \item{`mu`}{Natural mortality rate. `mu[i]` is the rate at time
#'       `t[i]` expressed per unit
#'       \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#'       and per capita.
#'     }
#'   }
#'
#'   `B` is optional if `hatN0` and `nu` are defined in `par_list`, and
#'   `mu` is optional if `mu` is defined in `par_list` (see Details).
#' @param par_list A list containing:
#'
#'   \describe{
#'     \item{`prep`}{\[ \ifelse{latex}{\out{$p_\text{rep}$}}{\ifelse{html}{\out{<i>p</i><sub>rep</sub>}}{p_rep}} \]
#'       Case reporting probability.
#'     }
#'     \item{`trep`}{\[ \ifelse{latex}{\out{$t_\text{rep}$}}{\ifelse{html}{\out{<i>t</i><sub>rep</sub>}}{t_rep}} \]
#'       Case reporting delay in units
#'       \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}.
#'     }
#'     \item{`S0`}{\[ \ifelse{latex}{\out{$S_0$}}{\ifelse{html}{\out{<i>S</i><sub>0</sub>}}{S_0}} \]
#'       Number of susceptibles at time
#'       \ifelse{latex}{\out{$t = t_0$}}{\ifelse{html}{\out{<i>t</i> = <i>t</i><sub>0</sub>}}{t = t_0}}.
#'     }
#'     \item{`hatN0`}{\[ \ifelse{latex}{\out{$\widehat{N}_0$}}{\ifelse{html}{\out{<i>&Ntilde;</i><sub>0</sub>}}{hatN_0}} \]
#'       Population size at time
#'       \ifelse{latex}{\out{$t = 0$}}{\ifelse{html}{\out{<i>t</i> = 0}}{t = 0}}
#'       years.
#'     }
#'     \item{`nu`}{\[ \ifelse{latex}{\out{$\nu_\text{c}$}}{\ifelse{html}{\out{<i>&nu;<sub>c</sub></i>}}{nu_c}} \]
#'       Birth rate expressed per unit
#'       \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#'       and relative to
#'       \ifelse{latex}{\out{$\hat{N}_0$}}{\ifelse{html}{\out{<i>&Ntilde;</i><sub>0</sub>}}{hatN_0}}
#'       (if modeled as constant).
#'     }
#'     \item{`mu`}{\[ \ifelse{latex}{\out{$\mu_\text{c}$}}{\ifelse{html}{\out{<i>&mu;</i><sub>c</sub>}}{mu_c}} \]
#'       Natural mortality rate expressed per unit
#'       \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#'       and per capita (if modeled as constant).
#'     }
#'     \item{`tgen`}{\[ \ifelse{latex}{\out{$t_\text{gen}$}}{\ifelse{html}{\out{<i>t</i><sub>gen</sub>}}{t_gen}} \]
#'       Mean generation interval of the disease of interest in units
#'       \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}.
#'     }
#'   }
#'
#'   `hatN0` and `nu` are optional if `B` is defined in `df`, and
#'   `mu` is optional if `mu` is defined in `df` (see Details).
#'
#' @return
#' A data frame with numeric columns:
#'
#' \describe{
#'   \item{`t`}{Time. Identical to `df$t`.}
#'   \item{`C`}{Reported incidence, imputed. Identical to `df$C`,
#'     except with missing values and zeros imputed (see Details).
#'   }
#'   \item{`Z`}{Incidence. `Z[i]` is the estimated
#'     number of infections between times `t[i-1]` and `t[i]`.
#'   }
#'   \item{`B`}{Births, imputed. Identical to `df$B` (if supplied),
#'     except with missing values imputed (see Details).
#'   }
#'   \item{`mu`}{Natural mortality rate, imputed. Identical to `df$mu`
#'     (if supplied), except with missing values imputed (see Details).
#'   }
#'   \item{`S`}{Number of susceptibles. `S[i]` is the estimated
#'     number of susceptibles at time `t[i]`.
#'   }
#'   \item{`I`}{Number of infecteds. `I[i]` is the estimated
#'     number of infecteds at time `t[i]`.
#'   }
#'   \item{`beta`}{Transmission rate. `beta[i]` is the estimated
#'     transmission rate at time `t[i]` expressed per unit
#'     \ifelse{latex}{\out{$\Delta t$}}{\ifelse{html}{\out{<i>&Delta;t</i>}}{Dt}}
#'     per susceptible per infected.
#'   }
#' }
#'
#' It possesses `par_list` as an attribute.
#'
#' @examples
#' # Simulate a reported incidence time series using
#' # a seasonally forced transmission rate
#' par_list <- make_par_list(dt_weeks = 1, epsilon = 0.5, prep = 0.5)
#' df <- make_data(
#'   par_list = par_list,
#'   n = 20 * 365 / 7, # 20 years is ~1042 weeks
#'   with_dem_stoch = TRUE,
#'   seed = 5
#' )
#' head(df)
#'
#' # Estimate incidence, susceptibles, infecteds,
#' # and the seasonally forced transmission rate
#' df_S <- estimate_beta_S(df, par_list)
#' head(df_S)
#'
#' # Fit a smooth loess curve to the transmission rate
#' # time series
#' loess_fit <- loess(
#'   formula   = beta ~ t,
#'   data      = df_S,
#'   span      = 65 / nrow(df_S),
#'   degree    = 2,
#'   na.action = "na.exclude"
#' )
#' df_S$beta_loess <- predict(loess_fit)
#'
#' # Inspect
#' df_S$t_years <- df$t_years
#' plot(S ~ t_years, df, type = "l", ylim = c(43, 58) * 1e03)
#' lines(S ~ t_years, df_S, col = "red")
#' plot(beta ~ t_years, df, type = "l", ylim = c(0.95, 1.25) * 1e-05)
#' lines(beta_loess ~ t_years, df_S, col = "red")
#' 
#' @references
#' deJonge MS, Jagan M, Krylova O, Earn DJD. Fast estimation of
#' time-varying transmission rates for infectious diseases.
#'
#' @md
#' @export
estimate_beta_S <- function(df       = data.frame(),
                            par_list = list()) {

## 1. Set-up -----------------------------------------------------------

# Load necessary elements of `par_list` into the execution environment
list2env(
  par_list[c("prep", "trep", "S0", "hatN0", "nu", "mu", "tgen")],
  envir = environment()
)

# Logical: Is `df` missing a column `B`? What about `mu`?
B_was_not_def <- is.null(df$B)
mu_was_not_def <- is.null(df$mu)

# Preallocate memory for output. Assume constant vital rates,
# if necessary.
df <- data.frame(
  t     = df$t,
  C     = df$C,
  Z     = NA,
  B     = if (B_was_not_def) nu * hatN0 * 1 else df$B,
  mu    = if (mu_was_not_def) mu else df$mu,
  S     = NA,
  I     = NA,
  beta  = NA
)


## 2. Impute missing values in reported incidence, ... -----------------
##    births, and natural mortality rate

df[c("C", "B", "mu")] <- lapply(df[c("C", "B", "mu")],
  function(x) {
    if (!any(is.na(x))) {
      return(x)
    }

    # Indices of missing values
    ind_na <- which(is.na(x))

    # Function that performs linear interpolation
    # between time points where `x` is observed
    impute_na <- stats::approxfun(
      x      = df$t[-ind_na],
      y      = x[-ind_na],
      method = "linear",
      rule   = 1 # return `NA` outside range of (argument) `x`
    )

    # Impute missing values. Missing values outside of
    # the range of observed data are retained as `NA`.
    replace(x, ind_na, impute_na(df$t[ind_na]))
  }
)


## 3. Impute zeros in reported incidence -------------------------------

df$C <- local(
  {
    if (!any(df$C == 0, na.rm = TRUE)) {
      return(df$C)
    }

    # Indices of positive values
    ind_pos <- which(df$C > 0)

    # Indices of zeros between two positive values
    ind_zero <- which(df$C == 0)
    ind_zero <- ind_zero[
      ind_zero > min(ind_pos) &
      ind_zero < max(ind_pos)
    ]

    # Function that performs linear interpolation
    # between time points where `df$C` is positive
    impute_zero <- stats::approxfun(
      x      = df$t[ind_pos],
      y      = df$C[ind_pos],
      method = "linear",
      rule   = 1 # return `NA` outside range of `x`
    )

    # Impute zeros. Zeros outside of the range of
    # positive data are retained as zeros due to
    # the definition of `ind_zero`.
    replace(df$C, ind_zero, impute_zero(df$t[ind_zero]))
  }
)


## NOTE: In the best case, there are no longer missing values or zeros
##       in `df$C`. In the worst case, `df$C` now looks like
##       `c(NA,...,NA,0,...,0,+,...,+,0,...,0,NA,...,NA)`,
##       where `+` denotes a positive number.


## 4. Estimate incidence -----------------------------------------------

trepr <- round(trep)
df$Z <- c(
  (1 / prep) * df$C[(trepr+1):nrow(df)],
  rep(NA, trepr)
)


## 5. Estimate susceptibles, infecteds, transmission rate --------------

tgenr <- round(tgen)
gamma <- 1 / tgen

df[c("S", "I", "beta")] <- with(df,
  {
    S[1] <- S0
    for (i in 2:nrow(df)) {
      S[i] <- (1 - mu[i-1]) * S[i-1] + B[i] - Z[i]
    }
    if (tgenr > 0) {
      I <- c(rep(NA, tgenr - 1), Z[1:(nrow(df)-tgenr+1)]) /
        ((gamma + mu) * 1)
      beta <- (gamma + mu) * c(Z[-1], NA) /
        (S * c(rep(NA, tgenr - 1), Z[1:(nrow(df)-tgenr+1)]))
    } else {
      I <- c(Z[-1], NA) / (gamma + mu)
      beta <- (gamma + mu) * c(Z[-1], NA) /
        (S * c(Z[-1], NA))
    }
    list(S, I, beta)
  }
)


## NOTE: If too many `NA` were retained at the start of `df$C`,
##       `df$B`, or `df$mu`, then `df$S` and `df$beta` will be
##       filled with `NA`. If zeros were retained in `df$C`,
##       then `NaN` or `Inf` will appear in `df$beta` whenever
##       a divide-by-zero error occurs.


## 6. Warn if `S` is ever negative -------------------------------------

if (any(df$S < 0, na.rm = TRUE)) {
  # May have underestimated `prep` or `nu`
  new_par_vals <- paste0(
    "\n* `prep` > ", sprintf("%.3f", prep),
    if (B_was_not_def) paste0("\n* `nu` > ", sprintf("%.3e", nu))
  )
  warning(
    "S method: `S[i]` < 0 for some `i`. Retry with:",
    new_par_vals,
    call. = FALSE
  )
}


## 7. Warn if `beta` is ever `NaN` or `Inf` ----------------------------

if (any(is.nan(df$beta) | is.infinite(df$beta))) {
  warning(
    "S method: `beta[i]` is `NaN` or `Inf` for some `i`. \n",
    "Is the first or last observation in `df$C` a zero?",
    call. = FALSE
  )
}


attr(df, "par_list") <- par_list
df
}
davidearn/fastbeta documentation built on June 14, 2020, 3:11 p.m.