R/dllm.R
In dlmwwbe: Dynamic Linear Model for Wastewater-Based Epidemiology

Documented in dllm

#' Fit a Dynamic Local Level Model (DLM)
#'
#' Fits a dynamic linear model (DLM) using maximum likelihood estimation.
#'
#' @param data A data frame containing observed time series data.
#' @param obs_cols Character vector of column names in \code{data} to be used as observations.
#' @param S Character; the structure of latent states.
#' @param log10 logical; if TRUE, a log10 transformation is applied to the whole data.
#' @param date Optional; the name of the column in \code{data} representing date or time.
#' @param prior A list of prior specifications. Default priors are used if not supplied.
#' @param equal.state.var logical; if TRUE the variance is the same across all wastewater state.
#' @param equal.obs.var logical; if TRUE the variance is the same across all wastewater observation.
#' @param init_params Optional numeric vector of initial parameters.
#' @param auto_init Logical; if \code{TRUE} (default) and \code{init_params} is \code{NULL}, initial parameters
#'   are estimated automatically.
#' @param control List of control parameters for the optimization routine (\code{dlmMLE}).
#'
#' @return An object of class \code{dllm} containing the fitted model, filtered and smoothed estimates,
#' along with fit statistics (log-likelihood, AIC, BIC) and other diagnostic information.
#' \describe{
#'   \item{data}{The input data.}
#'   \item{date}{The input vector of date.}
#'   \item{obs_cols}{Character vector of column names in \code{data} to be used as observations.}
#'   \item{S}{Character; the structure of latent states.}
#'   \item{parameters}{A list of estimated parameters by maximum likelihood estimation.}
#'   \item{logLik}{The loglikelihood of the fitted model.}
#'   \item{aic}{AIC of the fitted model.}
#'   \item{bic}{BIC of the fitted model.}
#'   \item{convergence}{An integer code returned by \code{\link{optim}}}
#'   \item{model}{An \code{dlm} object of the fitted dynamic linear model. }
#'   \item{filter}{The corresponding dynamic linear filter returned by \code{\link[dlm]{dlmFilter}}}
#'   \item{smoother}{The corresponding dynamic linear smoother returned by \code{\link[dlm]{dlmSmooth}}}
#'   \item{yf}{A matrix of the filtered observed response variables.}
#'   \item{ys}{A matrix of the smoothed observed response variables.}}
#'
#'
#' @details
#' The function prepares the data, validates inputs, and (if necessary) automatically initializes parameters.
#' It then defines a helper function to build the model via \code{build_dlm} and fits the model using
#' maximum likelihood estimation (\code{dlmMLE}). Filtering and smoothing are applied to obtain state estimates.
#'
#' @examples
#' data<- wastewater[wastewater$Code == "TC",]
#' data$SampleDate <- as.Date(data$SampleDate)
#' fit <- dllm(
#' equal.state.var=TRUE,
#' equal.obs.var=FALSE,
#' log10=TRUE,
#' data = data,
#' date = "SampleDate",
#' obs_cols = c("ORFlab", "Nlab"),
#' S = 'kvariate')
#' summary(fit)
#' plot(fit, type='smoother', plot_data = TRUE)
#'
#' @export
dllm <- function(data,
                  obs_cols,
                  S = c('univariate', 'kvariate'),
                  log10 = FALSE,
                  date = NULL,
                  prior = list(),
                  equal.state.var = FALSE,
                  equal.obs.var = FALSE,
                  init_params = NULL,
                  auto_init = TRUE,
                  control = list(maxit = 500)) {
  S <- match.arg(S)
  k <- length(obs_cols)
  nS <- ifelse(S=='univariate', 1, k)
  nW <- ifelse(equal.state.var, 1, k)
  nV <- ifelse(equal.obs.var, 1, k)
  # Ensure the data is a data frame and extract observations.
  if (!is.data.frame(data))
    stop("data must be a data frame.")
  y <- as.matrix(data[, obs_cols, drop = FALSE])

  if(log10){
    y <- log(y, base=10)
  }


  # Automatic parameter initialization if required.
  if (auto_init && is.null(init_params)) {
    init_params <- estimate_init_params(y, equal.obs.var, equal.state.var)
  }
  if (length(init_params) != (nW + nV))
    stop(sprintf("Required %d parameters for this configuration, but received %d.",
                 nW+nV, length(init_params)))

  if (any(init_params <= 0)) {
    stop("Parameters must be positive.")
  }

  par_transform <- log(init_params)
  # Define transformation function to ensure positive parameters.
  #par_transform <- function(par) exp(par)
  # Build a helper function that constructs a DLM model given parameter vector.
  build_model <- function(par) {
    build_dllm(
      params = par,
      equal.state.var=equal.state.var,
      equal.obs.var=equal.obs.var,
      x0 <- y[1, ],
      obs_cols = obs_cols,
      S = S,
      nW=nW,
      nV=nV,
      k=k,
      prior = prior)}

  # Fit the model using dlmMLE with log-transformed initial parameters.
  fit <- dlm::dlmMLE(
    y = y,
    parm = (par_transform),
    method = "L-BFGS-B",
    build = build_model,
    control = control,
    debug = FALSE
  )

  # Build the final model and compute filtering and smoothing estimates.
  final_model <- build_model(fit$par)
  filter_out <- dlm::dlmFilter(y, final_model)
  smooth_out <- dlm::dlmSmooth(filter_out)

  # Name parameters based on the covariance structure.
  if (nV > 1){
    name.V <-  paste0("sigmaV_", obs_cols)
  }else{
    name.V <- c('sigmaV')
  }

  if (nW > 1){
    name.W <- paste0('sigmaW_', 'state', c(1:nW))
  }else{
    name.W <- c('sigmaW')
  }

  param_names <- c(name.W, name.V)
  parameters <- stats::setNames(exp(fit$par), param_names)
  yf <- t(final_model$FF %*% t(as.matrix(filter_out$m)[-1, , drop=FALSE]))
  ys <- t(final_model$FF %*% t(as.matrix(smooth_out$s)[-1, , drop=FALSE]))


  var.filter <- matrix(ncol=nS, nrow=0)
  var.smoother <- matrix(ncol=nS, nrow=0)

  for(t in 2:(nrow(y)+1) ){
    var.filter <- rbind(var.filter,
                        diag(filter_out$U.C[[t]] %*%
                               diag(filter_out$D.C[t,]^2, nrow=nS) %*%
                               t(filter_out$U.C[[t]])))
    var.smoother<- rbind(var.smoother,
                          diag(smooth_out$U.S[[t]] %*%
                                 diag(smooth_out$D.S[t,]^2, nrow=nS) %*%
                                 t(smooth_out$U.S[[t]])))
  }

  if (log10){
    yf <- 10**yf
    ys <- 10**ys}

  #print('A')
  #print(ncol(yf))
  #print(ncol(var.filter))
  #print(paste0(obs_cols, '_filter'))

  if (S=='univariate'){
    colnames(var.filter) <- 'filter'
    colnames(var.smoother) <- 'smoother'
  }else{
    colnames(var.filter) <- paste0(obs_cols, '_filter')
    colnames(var.smoother) <- paste0(obs_cols, '_smoother')
  }

  colnames(yf) <- paste0(obs_cols, '_filter')
  colnames(ys) <- paste0(obs_cols, '_smoother')
  # Compute log-likelihood, AIC, and BIC.
  logLik_val <- -fit$value
  aic_val <- 2 * fit$value + 2 * length(fit$par)
  bic_val <- 2 * fit$value + log(nrow(y)) * length(fit$par)
  structure(
    list(
      data = data,
      log10 = log10,
      obs_cols = obs_cols,
      S = S,
      date = date,
      parameters = parameters,
      logLik = logLik_val,
      aic = aic_val,
      bic = bic_val,
      convergence = fit$convergence,
      model = final_model,
      filter = filter_out,
      smooth = smooth_out,
      yf = yf,
      ys = ys,
      var.filter=var.filter,
      var.smoother=var.smoother,
      call = match.call()
    ),
    class = "dllm")}

# Automatic parameter initialization
estimate_init_params <- function(y, equal.obs.var, equal.state.var, n_obs = 10) {
  # Limit the data to only the first n_obs rows (or all rows if fewer are available)
  n_use <- min(n_obs, nrow(y))
  y_subset <- y[seq_len(n_use), , drop = FALSE]

  k <- ncol(y_subset)

  if (equal.state.var){
    sigmaW <- mean(apply(y_subset, 2, function(x) stats::sd(diff(stats::na.omit(x)), na.rm = TRUE)))
  }else{
    sigmaW <- apply(y_subset, 2, function(x) stats::sd(diff(stats::na.omit(x)), na.rm = TRUE))
  }

  if (equal.obs.var){
    sigmaV <- mean(apply(y_subset, 2, stats::sd, na.rm = TRUE))
  }else{
    sigmaV <- apply(y_subset, 2, stats::sd, na.rm = TRUE)
  }

  c(sigmaW, sigmaV)
}