R/estimR.R

Defines functions estimR

Documented in estimR

#' Estimation of the reproduction number with Laplacian-P-splines
#'
#' @description
#' This routine estimates the instantaneous reproduction number \eqn{R_t}; the mean number
#' of secondary infections generated by an infected individual at time \eqn{t} (White et
#' al. 2020); by using Bayesian P-splines and Laplace approximations (Gressani et al. 2022).
#' Estimation of \eqn{R_t} is based on a time series of incidence counts and (a discretized) serial
#' interval distribution. The negative binomial distribution is used to model incidence
#' count data and P-splines (Eilers and Marx, 1996) are used to smooth the epidemic curve. The link
#' between the epidemic curve and the reproduction number is established via the renewal equation.
#'
#' @usage estimR(incidence, si, K = 30, dates = NULL, maxmethod = c("NelderMead","HillClimb"),
#' CoriR = FALSE, WTR = FALSE, optimstep = 0.3, priors = Rmodelpriors())
#'
#' @param incidence A vector containing the incidence time series. If \code{incidence}
#'   contains NA values at certain time points, these are replaced by the average of
#'  the left- and right neighbor counts. If the right neighbor is NA, the left neighbor
#'  is used as a replacement value.
#' @param si The (discrete) serial interval distribution.
#' @param K Number of B-splines in the basis.
#' @param dates A vector of dates in format "YYYY-MM-DD" (optional).
#' @param maxmethod The method to maximize the hyperparameter posterior distribution.
#' @param CoriR Should the \eqn{R_t} estimate of Cori (2013) be also computed?
#' @param WTR Should the \eqn{R_t} estimate of Wallinga-Teunis (2004) be also computed?
#' @param optimstep Learning rate for the "HillClimb" method to maximize the
#'  posterior distribution of the hyperparameters.
#' @param priors A list containing the prior specification of the model
#'  hyperparameters as set in Rmodelpriors. See ?Rmodelpriors.
#'
#' @details The \code{estimR} routine estimates the reproduction number in a
#'  totally "sampling-free" fashion. The hyperparameter vector (containing the
#'  penalty parameter of the P-spline model and the overdispersion parameter of
#'  the negative binomial model for the incidence time series) is fixed at its
#'  maximum a posteriori (MAP). By default, the algorithm for maximization is
#'  the one of Nelder and Mead (1965). If \code{maxmethod} is set to "HillClimb",
#'  then a gradient ascent algorithm is used to maximize the hyperparameter posterior.
#'
#' @return A list with the following components:
#' \itemize{
#'  \item incidence: The incidence time series.
#'  \item si: The serial interval distribution.
#'  \item RLPS: A data frame containing estimates of the reproduction number
#'    obtained with the Laplacian-P-splines methodology.
#'  \item thetahat: The estimated vector of B-spline coefficients.
#'  \item Sighat: The estimated variance-covariance matrix of the Laplace
#'    approximation to the conditional posterior distribution of
#'    the B-spline coefficients.
#'  \item RCori: A data frame containing the estimates of the reproduction
#'    obtained with the method of Cori et al. (2013).
#'  \item RWT: A data frame containing the estimates of the reproduction
#'    obtained with the method of Wallinga-Teunis (2004).
#'  \item LPS_elapsed: The routine real elapsed time (in seconds) when estimation
#'    of the reproduction number is carried out with Laplacian-P-splines.
#'  \item Cori_elapsed: The routine real elapsed time (in seconds) when estimation
#'    of the reproduction number is carried out with the method of Cori et al. (2013).
#'  \item penparam: The estimated penalty parameter related to the P-spline model.
#'  \item K: The number of B-splines used in the basis.
#'  \item NegBinoverdisp: The estimated overdispersion parameter of the negative
#'    binomial distribution for the incidence time series.
#'  \item optimconverged: Indicates whether the algorithm to maximize the
#'    posterior distribution of the hyperparameters has converged.
#'  \item method: The method to estimate the reproduction number with Laplacian-P-splines.
#'  \item optim_method: The chosen method to to maximize the posterior distribution
#'    of the hyperparameters.
#' }
#'
#' @author Oswaldo Gressani \email{oswaldo_gressani@hotmail.fr}
#'
#' @references Gressani, O., Wallinga, J., Althaus, C. L., Hens, N. and Faes, C.
#'  (2022). EpiLPS: A fast and flexible Bayesian tool for estimation of the
#'  time-varying reproduction number. \emph{Plos Computational Biology},
#'  \strong{18}(10): e1010618.
#' @references Cori, A., Ferguson, N.M., Fraser, C., Cauchemez, S. (2013). A new
#'  framework and software to estimate time-varying reproduction numbers during
#'  epidemics. \emph{American Journal of Epidemiology}, \strong{178}(9):1505–1512.
#' @references Wallinga, J., & Teunis, P. (2004). Different epidemic curves for
#' severe acute respiratory syndrome reveal similar impacts of control measures.
#' \emph{American Journal of Epidemiology}, \strong{160}(6), 509-516.
#' @references White, L.F., Moser, C.B., Thompson, R.N., Pagano, M. (2021).
#'  Statistical estimation of the reproductive number from case notification
#'  data. \emph{American Journal of Epidemiology}, \strong{190}(4):611-620.
#' @references Eilers, P.H.C. and Marx, B.D. (1996). Flexible smoothing
#'  with B-splines and penalties. \emph{Statistical Science},
#'  \strong{11}(2):89-121.
#'
#' @examples
#' # Illustration on simulated data
#' si <- Idist(mean = 5, sd = 3)$pvec
#' datasim <- episim(si = si, endepi = 60, Rpattern = 5, dist="negbin", overdisp = 50)
#' epifit_sim <- estimR(incidence = datasim$y, si = si, CoriR = TRUE)
#' plot(epifit_sim, addfit = "Cori")
#'
#' # Illustration on the 2003 SARS epidemic in Hong Kong.
#' data(sars2003)
#' epifit_sars <- estimR(incidence = sars2003$incidence, si = sars2003$si, K = 40)
#' tail(epifit_sars$RLPS)
#' summary(epifit_sars)
#' plot(epifit_sars)
#'
#' @export

estimR <- function(incidence, si, K = 30, dates = NULL,
                   maxmethod = c("NelderMead", "HillClimb"),
                   CoriR = FALSE, WTR = FALSE, optimstep = 0.3,
                   priors = Rmodelpriors()){

  tic <- proc.time()             # Clock starts ticking
  y <- KerIncidCheck(incidence)  # Run checks on case incidence vector
  n <- length(y)                 # Total number of days of the epidemic
  simax <- length(si)            # Length of serial interval distribution
  B <- Rcpp_KercubicBspline(seq_len(n), lower = 1, upper = n, K = K) # C++ call
  D <- diag(K)
  penorder <- 2
  for(k in 1:penorder){D <- diff(D)}
  P <- t(D) %*% D           # Penalty matrix of dimension c(K,K)
  P <- P + diag(1e-06, K)   # Perturbation to ensure P is full rank

  # Prior specification
  priorspec <- priors
  a_delta <- priorspec$a_delta
  b_delta <- priorspec$b_delta
  phi <- priorspec$phi
  a_rho <- priorspec$a_rho
  b_rho <- priorspec$b_rho

  # Minus log-posterior of hyperparameter vector
  logphyper <- function(x) {
    w <- x[1] # w = log(rho)
    v <- x[2] # v = log(lambda)

    # Laplace approximation
    LL <- Rcpp_KerLaplace(theta0 = rep(1.5,K), exp(w), exp(v), K,
                       KerPtheta(Dobs = y, BB = B, Pen = P)$Dlogptheta,
                       KerPtheta(Dobs = y, BB = B, Pen = P)$D2logptheta)
    thetastar <- as.numeric(LL$Lapmode)
    logdetSigstar <- Re(LL$logdetSigma)

    equal <- (-1) * (0.5 * logdetSigstar + 0.5 * (K + phi) * v + a_rho * w -
                       (0.5 * phi + a_delta) * log(0.5 * phi * exp(v) + b_delta) +
                       KerLikelihood(Dobs = y, BB = B)$loglik(thetastar, exp(w)) -
                       0.5 * exp(v) * sum((thetastar * P) %*% thetastar) -
                       b_rho * exp(w))

    return(equal)
  }

  # Gradient of logphyper
  Dlogphyper <- function(x, thetavec){
    w <- x[1] # w = log(rho)
    v <- x[2] # v = log(lambda)

    # Laplace approximation
    LL <- Rcpp_KerLaplace(theta0 = thetavec, exp(w), exp(v), K,
                      KerPtheta(Dobs = y, BB = B, Pen = P)$Dlogptheta,
                      KerPtheta(Dobs = y, BB = B, Pen = P)$D2logptheta)
    thetastar <- as.numeric(LL$Lapmode)
    Btheta <- as.numeric(B%*%thetastar)

    Dloglik_w <- sum(exp(w) * (digamma(y + exp(w)) - digamma(exp(w)) +
                  (1 + w) - (log(exp(Btheta) + exp(w)) +
                    (1 / (1 + exp(Btheta - w))))) -
                       y * (1 / (1 + exp(Btheta - w))))

    grad_w <- 0.5 * Re(LL$dSigw) + a_rho - b_rho * exp(w) + Dloglik_w

    grad_v <- 0.5 * Re(LL$dSigv) + 0.5 * (K + phi) - ((0.5 * phi + a_delta) /
                                              (0.5 * phi * exp(v) + b_delta)) *
      (0.5 * phi * exp(v)) - 0.5 * exp(v) * sum((thetastar * P) %*% thetastar)

    return(list(grad=c(grad_w, grad_v), thetastar = thetastar))
  }

  # Maximization of the posterior hyperparameters
  optim_method <- match.arg(maxmethod)

  if(optim_method == "NelderMead") {
    optimhyper <- stats::optim(c(1, 5), fn = logphyper)
    hypermap <- optimhyper$par
    optimconverged <- (optimhyper$convergence == 0)
  } else if (optim_method == "HillClimb") {
    hypermap <- as.numeric(Rcpp_Kerhyperoptim(c(1, 5), BB = B,
                            grad = Dlogphyper, step = optimstep)$res)
    optimconverged <- "Convergence for HillClimb not yet available"
  }

  disphat <- exp(hypermap[1])
  lambhat <- exp(hypermap[2])
  Lap_approx <- Rcpp_KerLaplace(theta0 = rep(1.5,K), disphat, lambhat, K,
                             KerPtheta(Dobs = y, BB = B, Pen = P)$Dlogptheta,
                             KerPtheta(Dobs = y, BB = B, Pen = P)$D2logptheta)
  thetahat <- as.numeric(Lap_approx$Lapmode)
  muhat <- as.numeric(exp(B %*% thetahat))
  Sighat <- Lap_approx$Lapvar

  # Check if a correction of CI on R should be done
  cratio <- (1 + (1 / disphat) * mean(muhat)) / disphat
  if (cratio <= 1) {
    EMV <- (1 / (1 + muhat / disphat))
  } else if (cratio > 1) {
    EMV <- rep(1, n)
  }

  if(is.null(dates)) {
    Time <- seq_len(n)
  } else{
    Time <- dates
  }

  # Computation of summary statistics for reproduction number with LPS
  Routcpp <- Rcpp_KerRpostmap(BB = B, theta = thetahat, Covar = Sighat,
                              sinter = si, MVvec = EMV)


  RLPS <- data.frame(cbind(Time, as.numeric(Routcpp$R), as.numeric(Routcpp$Rsd),
                  matrix(unlist(lapply(c(0.025,0.05,0.25,0.50,0.75,0.95,0.975),
                  FUN = stats::qlnorm, meanlog = Routcpp$meanlogNorm,
                  sdlog = Routcpp$sdlogNorm)),nrow = n, byrow = F)))
  colnames(RLPS) <- c("Time", "R", "Rsd", "Rq0.025", "Rq0.05","Rq0.25",
                    "Rq0.50", "Rq0.75", "Rq0.95", "Rq0.975")
  RLPS$Time <- Time

  toc <- proc.time() - tic
  toc <- round(toc[3], 3)

  if (CoriR == TRUE) {# Use Cori method with weekly sliding windows
    tic_Cori <- proc.time()
    RCori <- KerCori(Dobs = incidence, sinter = si)
    toc_Cori <- proc.time() - tic_Cori
  } else{
    RCori <- "Not called"
    toc_Cori <- "Not called"
  }

  if(WTR == TRUE){# Use Wallinga-Teunis method to estimate R
    RWT <- KerWT(Dobs = incidence, sinter = si)
  } else{
    RWT <- "Not called"
  }

  #-- Output results in a list
  outputlist <- list(incidence = y, si = si, RLPS = RLPS,
                     thetahat = thetahat,
                     Sighat = Sighat,
                     RCori = RCori, RWT = RWT,
                     LPS_elapsed = toc,
                     Cori_elapsed = toc_Cori,
                     penparam = lambhat, K = K,
                     NegBinoverdisp = disphat,
                     optimconverged = optimconverged,
                     method = "LPSMAP",
                     optim_method = optim_method)

  attr(outputlist, "class") <- "Rt"
  outputlist
}
oswaldogressani/EpiLPS documentation built on Oct. 25, 2024, 8:15 p.m.