R/residual_functions.R
In LaMa: Fast Numerical Maximum Likelihood Estimation for Latent Markov Models

Documented in pseudo_res pseudo_res_discrete

#' Calculate pseudo-residuals
#' 
#' @description
#' For HMMs, pseudo-residuals are used to assess the goodness-of-fit of the model. 
#' These are based on the cumulative distribution function (CDF)
#' \deqn{F_{X_t}(x_t) = F(x_t \mid x_1, \dots, x_{t-1}, x_{t+1}, \dots, x_T)}
#' and can be used to quantify whether an observation is extreme relative to its model-implied distribution.
#' 
#' This function calculates such residuals via probability integral transform, based on the local state probabilities obtained by \code{\link{stateprobs}} or \code{\link{stateprobs_g}} and the respective parametric family.
#'
#' @details
#' When used for discrete pseudo-residuals, this function is just a wrapper for \code{\link{pseudo_res_discrete}}.
#'
#' @param obs vector of continuous-valued observations (of length n)
#' @param dist character string specifying which parametric CDF to use (e.g., \code{"norm"} for normal or \code{"pois"} for Poisson)
#' @param par named parameter list for the parametric CDF
#' 
#' Names need to correspond to the parameter names in the specified distribution (e.g. \code{list(mean = c(1,2), sd = c(1,1))} for a normal distribution and 2 states).
#' This argument is as flexible as the parametric distribution allows. For example you can have a matrix of parameters with one row for each observation and one column for each state.
#' @param stateprobs matrix of local state probabilities for each observation (of dimension c(n,N), where N is the number of states) as computed by \code{\link{stateprobs}}, \code{\link{stateprobs_g}} or \code{\link{stateprobs_p}}
#' @param mod optional model object containing initial distribution \code{delta}, transition probability matrix \code{Gamma}, matrix of state-dependent probabilities \code{allprobs}, and potentially a \code{trackID} variable
#' 
#' If you are using automatic differentiation either with \code{RTMB::MakeADFun} or \code{\link{qreml}} and include \code{\link{forward}}, \code{\link{forward_g}} or \code{\link{forward_p}} in your likelihood function, the objects needed for state decoding are automatically reported after model fitting.
#' Hence, you can pass the model object obtained from running \code{RTMB::report()} or from \code{\link{qreml}} directly to this function and avoid calculating local state proabilities manually.
#' In this case, a call should look like \code{pseudo_res(obs, "norm", par, mod = mod)}.
#' @param normal logical, if \code{TRUE}, returns Gaussian pseudo residuals
#'
#' These will be approximately standard normally distributed if the model is correct.
#' @param discrete logical, if \code{TRUE}, computes discrete pseudo residuals (which slightly differ in their definition)
#'
#' By default, will be determined using \code{dist} argument, but only works for standard discrete distributions.
#' When used with a special discrete distribution, set to \code{TRUE} manually. See \code{\link{pseudo_res_discrete}} for details.
#' @param randomise for discrete pseudo residuals only. Logical, if \code{TRUE}, return randomised pseudo residuals. Recommended for discrete observations.
#' @param seed for discrete pseudo residuals only. Integer, seed for random number generation
#'
#' @return vector of pseudo residuals
#' @export
#'
#' @examples
#' ## continuous-valued observations
#' obs = rnorm(100)
#' stateprobs = matrix(0.5, nrow = 100, ncol = 2)
#' par = list(mean = c(1,2), sd = c(1,1))
#' pres = pseudo_res(obs, "norm", par, stateprobs)
#'
#' ## discrete-valued observations
#' obs = rpois(100, lambda = 1)
#' stateprobs = matrix(0.5, nrow = 100, ncol = 2)
#' par = list(lambda = c(1,2))
#' pres = pseudo_res(obs, "pois", par, stateprobs)
pseudo_res = function(obs, 
                      dist, 
                      par,
                      stateprobs = NULL,
                      mod = NULL,
                      normal = TRUE,
                      discrete = NULL, 
                      randomise = TRUE,
                      seed = NULL) {
  
  
  # check if a model with delta, Gamma and allprobs is provided
  if(!is.null(mod)){
    if(is.null(mod$type)){
      stop("Model object contains no type.")
    }
    if(!(mod$type) %in% c("homogeneous", "inhomogeneous", "periodic")){
      stop("Model object contains invalid type.")
    }
    
    if(is.null(mod$delta)){
      stop("Model object contains no initial distribution.")
    }
    if(is.null(mod$Gamma)){
      stop("Model object contains no transition matrix.")
    }
    if(is.null(mod$allprobs)){
      stop("Model object contains no state-dependent probabilities.")
    }
    
    if(mod$type == "periodic"){
      if(is.null(mod$tod)){
        stop("Model object contains no cyclic indexing variable.")
      }
    }
    
    # calculate state probabilities based on model object
    if(mod$type == "homogeneous"){
      stateprobs = stateprobs(mod = mod)
    }
    if(mod$type == "inhomogeneous"){
      stateprobs = stateprobs_g(mod = mod)
    }
    if(mod$type == "periodic"){
      stateprobs = stateprobs_p(mod = mod)
    }
  }
  
  # if discrete is not specified, try to determine
  if(is.null(discrete)){
    discrete = dist %in% c("pois", "binom", "geom", "nbinom")
  }
  
  if(discrete){
    cat("Discrete pseudo-residuals are calculated\n")
    
    residuals <- pseudo_res_discrete(obs, 
                                     dist,
                                     par, 
                                     stateprobs,
                                     normal,
                                     randomise,
                                     seed)
  } else{
    # Number of observations and number of states
    nObs <- length(obs)              # Length of observations
    N <- ncol(stateprobs)            # Number of states (columns in stateprobs)
    ind <- which(!is.na(obs))
    
    # Check that the number of rows in `stateprobs` matches the length of `obs`
    if (nrow(stateprobs) != nObs) {
      stop("The number of rows in `stateprobs` must match the length of `obs`.")
    }
    
    # Construct the CDF function name dynamically, e.g., "pnorm" for "norm"
    cdf_name <- paste0("p", dist)
    cdf_func <- get(cdf_name, mode = "function")
    
    # Initialize a matrix to store CDF values for each observation and state
    cdf_values <- matrix(0, nrow = nObs, ncol = N)
    
    # Loop over each state to compute the CDF values for each observation
    for (state in 1:N) {
      
      # Extract the parameters for the current state
      # can be either vector
      current_par <- lapply(par, function(param){
        if(is.matrix(param)){
          if(ncol(param) != N | nrow(param) != nObs){
            stop("Parameter matrix dimensions must match number of observations and states")
          } else{
            return(param[, state])
          }
        } else if(is.vector(param)){
          if(length(param) == 1){
            return(param)
          } else{
            if(length(param) != N){
              stop("Parameter vector must have the same length as the number of states")
            } else{
              return(param[state])
            }
          }
        }
      })
      
      # evaluate the CDF function at each observation with these parameters
      cdf_values[ind, state] <- do.call(cdf_func, args = c(list(obs[ind]), current_par))
    }
    
    # Compute pseudo-residuals by weighting CDF values with state probabilities
    residuals <- rowSums(cdf_values * stateprobs)
    
    if(normal){
      residuals <- qnorm(residuals)
    }
  }
  
  # handle infinite value
  residuals[is.infinite(residuals)] <- NA
  
  class(residuals) <- "LaMaResiduals"
  
  return(residuals)
}


#' Calculate pseudo-residuals for discrete-valued observations
#' 
#' @description
#' For HMMs, pseudo-residuals are used to assess the goodness-of-fit of the model. 
#' These are based on the cumulative distribution function (CDF)
#' \deqn{F_{X_t}(x_t) = F(x_t \mid x_1, \dots, x_{t-1}, x_{t+1}, \dots, x_T)}
#' and can be used to quantify whether an observation is extreme relative to its model-implied distribution.
#' 
#' This function calculates such residuals for \strong{discrete-valued} observations, based on the local state probabilities obtained by \code{\link{stateprobs}} or \code{\link{stateprobs_g}} and the respective parametric family.
#'
#' @details
#' For discrete observations, calculating pseudo residuals is slightly more involved, as the CDF is a step function.
#' Therefore, one can calculate the lower and upper CDF values for each observation. 
#' By default, this function does exactly that and then randomly samples the interval in between to give approximately Gaussian psuedo-residuals.
#' If \code{randomise} is set to \code{FALSE}, the lower, upper and mean pseudo-residuasl are returned.
#'
#' @param obs vector of discrete-valued observations (of length n)
#' @param dist character string specifying which parametric CDF to use (e.g., \code{"norm"} for normal or \code{"pois"} for Poisson)
#' @param par named parameter list for the parametric CDF
#' 
#' Names need to correspond to the parameter names in the specified distribution (e.g. \code{list(mean = c(1,2), sd = c(1,1))} for a normal distribution and 2 states).
#' This argument is as flexible as the parametric distribution allows. For example you can have a matrix of parameters with one row for each observation and one column for each state.
#' @param stateprobs matrix of local state probabilities for each observation (of dimension c(n,N), where N is the number of states)
#' @param normal logical, if \code{TRUE}, returns Gaussian pseudo residuals
#'
#' These will be approximately standard normally distributed if the model is correct.
#' @param randomise logical, if \code{TRUE}, return randomised pseudo residuals. Recommended for discrete observations.
#' @param seed integer, seed for random number generation
#'
#' @return vector of pseudo residuals
#' @export
#'
#' @examples
#' obs = rpois(100, lambda = 1)
#' stateprobs = matrix(0.5, nrow = 100, ncol = 2)
#' par = list(lambda = c(1,2))
#' pres = pseudo_res_discrete(obs, "pois", par, stateprobs)
pseudo_res_discrete <- function(obs, 
                                dist,
                                par,
                                stateprobs,
                                normal = TRUE,
                                randomise = TRUE,
                                seed = NULL) {
  
  # Number of observations and number of states
  nObs <- length(obs)              # Length of observations
  N <- ncol(stateprobs)            # Number of states (columns in stateprobs)
  ind <- which(!is.na(obs))
  
  # Check that the number of rows in `stateprobs` matches the length of `obs`
  if (nrow(stateprobs) != nObs) {
    stop("The number of rows in `stateprobs` must match the length of `obs`.")
  }
  
  # Check that each parameter in `par` has the correct length
  if (!all(sapply(par, length) == N)) {
    stop("Each entry in `par` must have a length equal to the number of columns in `stateprobs`.")
  }
  
  # Construct the CDF function name dynamically, e.g., "pnorm" for "norm"
  cdf_name <- paste0("p", dist)
  cdf_func <- get(cdf_name, mode = "function")
  
  # Initialize a matrix to store CDF values for each observation and state
  cdf_values_lower <- cdf_values_upper <- matrix(0, nrow = nObs, ncol = N)
  cdf_values_random <- matrix(0, nrow = nObs, ncol = N)
  
  # Set the random seed if specified #
  if (!is.null(seed)) {
    set.seed(seed)
  }
  
  # Loop over each state to compute the CDF values for each observation
  for (state in 1:N) {
    
    # Extract the parameters for the current state
    # can be either vector
    current_par <- lapply(par, function(param){
      if(is.matrix(param)){
        if(ncol(param) != N | nrow(param) != nObs){
          stop("Parameter matrix dimensions must match number of observations and states")
        } else{
          return(param[, state])
        }
      } else if(is.vector(param)){
        if(length(param) == 1){
          return(param)
        } else{
          if(length(param) != N){
            stop("Parameter vector must have the same length as the number of states")
          } else{
            return(param[state])
          }
        }
      }
    })
    
    
    # Use `mapply` to evaluate the CDF function at each observation with these parameters
    # safe evaluation of CDF at `obs - 1` in case of errors
    cdf_values_lower[ind, state] <- tryCatch(
      do.call(cdf_func, args = c(list(obs[ind] - 1), current_par)),
      error = function(e) do.call(cdf_func, args = c(list(pmax(obs[ind] - 1, 0)), current_par))
    )
    
    cdf_values_upper[ind, state] <- do.call(cdf_func, args = c(list(obs[ind]), current_par))
    
    if (randomise) {
      cdf_values_random[ind, state] <- sapply(1:length(ind), function(i) {
        stats::runif(1, cdf_values_lower[ind[i], state], cdf_values_upper[ind[i], state])
      })
    }
  }
  
  # Either return randomised pseudo-residuals or lower, upper, and mean
  if (randomise) {
    cat("Randomised between lower and upper\n")
    
    # Compute pseudo-residuals by weighting random CDF values with state probabilities
    residuals <- rowSums(cdf_values_random * stateprobs)
    
    if (normal) {
      return(qnorm(residuals))
    } else {
      return(residuals)
    }
  } else {
    # Compute pseudo-residuals by weighting lower and upper CDF values with state probabilities
    residuals_lower <- rowSums(cdf_values_lower * stateprobs)
    residuals_upper <- rowSums(cdf_values_upper * stateprobs)
    
    if (normal) {
      return(list(
        lower = qnorm(residuals_lower),
        upper = qnorm(residuals_upper),
        mean = qnorm((residuals_lower + residuals_upper) / 2)
      ))
    } else {
      return(list(
        lower = residuals_lower,
        upper = residuals_upper,
        mean = (residuals_lower + residuals_upper) / 2
      ))
    }
  }
}