R/train.R

#' Train methods
#'
#' Trains and applies a model-based fault detection method for detecting
#' changes in a multivariate time series. Can train a GRU, MRF, or VAR
#' model to pair with either an MCUSUM or MEWMA control chart. Can also apply
#' a centered and scaled Hotelling's \eqn{T^2} test.
#'
#' @param data A multivariate time series in dataframe or matrix form.
#' @param method An indicator of which model and fault detection method to use.
#' Options include "gruMEWMA", "mrfMCUSUM", "varMEWMA", or "htsquare"
#' (matching is case-insensitive).
#' @param data_exog Any exogenous variables to be considered in model training.
#' Must be a dataframe or matrix with the same number of rows as `data`.
#' @param lags The number of lags of each variable to be included in the design matrix.
#' @param k A tuning parameter for the MCUSUM; a large k results in shorter memory.
#' @param r A tuning parameter for the MEWMA; a large r results in shorter memory.
#' @param center_scale A logical, whether or not to center and scale data before modeling.
#' @return A named list including the plotting statistic, trained model, residuals, and constants.
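#'
#' @examples
#' \dontrun{
#' # A minimal sketch on simulated in-control data; `sim_data` is made up
#' # here for illustration and is not a dataset shipped with the package.
#' sim_data <- as.data.frame(matrix(rnorm(500), ncol = 5))
#' fd <- train_fd(sim_data, method = "varMEWMA", lags = 1, r = .3)
#'
#' # One common way to set a control limit from an in-control training run
#' # is an upper quantile of the training plotting statistic.
#' h <- quantile(fd$pstat, .99)
#' }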
#' @name train_fd
#' @rdname train_fd
#' @export
train_fd <- function(data, method = "gruMEWMA", data_exog = NULL,
                     lags = 1, k = 1.1, r = .3, center_scale = TRUE) {
  # Constants
  method <- tolower(method)
  l <- ifelse(grepl("var", method), 1, lags) # the VAR model is always fit with 1 lag
  p <- ncol(data)
  mean_sd <- list(mean = apply(data, 2, mean),
                  sd   = apply(data, 2, sd))
  mean_sd_exog <- NULL

  # Center and Scale
  if(center_scale) {
    data <-
      purrr::pmap_dfc(list(data, mean_sd$mean, mean_sd$sd), \(x, y, z) (x - y)/z) |>
      as.matrix()

    if(!is.null(data_exog)) {
      mean_sd_exog <- list(mean = apply(data_exog, 2, mean),
                           sd   = apply(data_exog, 2, sd))

      data_exog <-
        purrr::pmap_dfc(list(data_exog, mean_sd_exog$mean, mean_sd_exog$sd),
                        \(x, y, z) (x - y)/z) |>
        as.matrix()
    }
  }

  # Constant for MCUSUM or MEWMA
  if(grepl("mcusum", method)) {
    constants <- c(k, l, p)
    names(constants) <- c("k", "lags", "p")
  } else if(grepl("mewma", method)) {
    constants <- c(r, l, p)
    names(constants) <- c("r", "lags", "p")
  } else if(grepl("htsquare", method)) {
    constants <- c(0, l, p)
    names(constants) <- c("NA", "lags", "p")
  }

  # Design and Prediction Matrices
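  # (X stacks `l` lagged values of each series, plus any exogenous columns;
  #  Y holds the aligned responses the model is trained to predict)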
  X <- create_X(data, lags = l, data_exog = data_exog)
  Y <- create_Y(data, lags = l)

  # Methods: GRU
  if(grepl("gru", method)) {
    fit <- train_gru(X, Y, l) # python

    preds <- pred_gru(fit, X) # python
    colnames(preds) <- colnames(data)

  # Methods: MRF
  } else if(grepl("mrf", method)) {
    mrf_formula <- as.formula(
      paste0("Multivar(`", paste0(colnames(Y), collapse = "` , `"), "`) ~ ."))
    fit <- randomForestSRC::rfsrc(formula = mrf_formula,
                                  data = as.data.frame(cbind(Y, X)),
                                  ntree = 500,
                                  splitrule = "mahalanobis",
                                  samptype = "swr")

    preds <- randomForestSRC::get.mv.predicted(fit)
    colnames(preds) <- colnames(data)

  # Methods: VAR
  } else if(grepl("var", method)) {
    fit <- MTS::VAR(data, p = 1, include.mean = TRUE)

    preds <- Y - fit$residuals
    colnames(preds) <- colnames(data)

  # Methods: Hotelling's T^2
  } else {
    fit <- NA

    preds <- matrix(0, nrow = nrow(Y), ncol = ncol(Y))
  }

  # Get Tau
  tau <- calc_tau(Y - preds)
  mu_tau <- colMeans(tau)
  sigma_tau_inv <- solve(cov(tau))
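  # mu_tau and sigma_tau_inv are in-control estimates from the training
  # residuals; they are returned so the same standardization can be applied
  # when monitoring new data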

  # Calc Pstat
  if(grepl("mcusum", method)) {
    # MCUSUM
    S <- calc_S(tau, constants[1], mu_tau, sigma_tau_inv)
    pstat <- calc_PStat(S, sigma_tau_inv)
  } else if(grepl("mewma", method)) {
    # MEWMA
    D <- calc_D(tau, mu_tau, sigma_tau_inv)
    pstat <- calc_PStat_MEWMA(r, D, p)
  } else if(grepl("htsquare", method)) {
    # Hotelling's T^2
    pstat <- calc_D(tau, mu_tau, sigma_tau_inv)
  }

  # Return
  list(pstat = pstat,
       model = fit,
       method = method,
       residuals = Y - preds,
       center_scale = center_scale,
       mean_sd = mean_sd,
       mean_sd_exog = mean_sd_exog,
       mu_tau = mu_tau,
       sigma_tau_inv = sigma_tau_inv,
       constants = constants)
}
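
# A hypothetical helper (a sketch, not part of the exported API): applies the
# training-set centering and scaling stored by train_fd() to new observations,
# so that monitoring data are transformed with the training constants rather
# than their own.
scale_new_data <- function(new_data, mean_sd) {
  sweep(sweep(as.matrix(new_data), 2, mean_sd$mean, "-"), 2, mean_sd$sd, "/")
}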

#' Train Multivariate Random Forest
#'
#' Similar to the build_forest_predict function from MultivariateRandomForest,
#' but this function also saves the fitted trees for future use.
#'
#' @param trainX The training design matrix, also used for the in-sample
#' predictions. Can be made with create_X.
#' @param trainY A matrix of the response variables.
#' @param n_tree Number of trees in the forest; must be a positive integer.
#' @param m_feature Number of randomly selected features considered for a split
#' at each regression tree node; must be a positive integer less than the
#' number of input features.
#' @param min_leaf Minimum number of samples in a leaf node. A node with at
#' most min_leaf samples is not split further. Must be a positive integer no
#' greater than the number of training samples.
#' @return A named list including the trained model and its predictions.
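#'
#' @examples
#' \dontrun{
#' # A minimal sketch with made-up matrices; the shapes are illustrative only.
#' trainX <- matrix(rnorm(200), ncol = 4)
#' trainY <- matrix(rnorm(100), ncol = 2)
#' fit <- build_mrf(trainX, trainY, n_tree = 10, m_feature = 2, min_leaf = 5)
#' str(fit$preds)
#' }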
#' @name build_mrf
#' @rdname build_mrf
#' @export
build_mrf <- function (trainX, trainY, n_tree, m_feature, min_leaf) {
  # Stopping conditions
  if (is.character(n_tree) || n_tree %% 1 != 0 || n_tree < 1)
    stop("Number of trees in the forest must be a positive integer")
  if (is.character(m_feature) || m_feature %% 1 != 0 || m_feature < 1)
    stop("Number of randomly selected features considered for a split must be a positive integer")
  if (is.character(min_leaf) || min_leaf %% 1 != 0 || min_leaf < 1 || min_leaf > nrow(trainX))
    stop("Minimum leaf size must be a positive integer no greater than the number of training samples")

  # Bootstrap setup: theta is the identity, so bootstrap() returns resampled
  # row indices rather than a statistic
  theta <- function(trainX) trainX
  results <- bootstrap::bootstrap(1:nrow(trainX), n_tree, theta)
  b <- results$thetastar # each column holds the bootstrap row indices for one tree
  Variable_number <- ncol(trainY)

  # MRF (multivariate response) or plain RF (univariate response)
  if (Variable_number > 1) {
    Command <- 2
  } else if (Variable_number == 1) {
    Command <- 1
  }

  # Accumulator for the forest's summed (later averaged) predictions on trainX
  Y_HAT <- matrix(0, nrow = nrow(trainX), ncol = Variable_number)

  # Build single trees; each iteration grows one tree on a bootstrap sample
  # and accumulates its predictions on trainX into Y_HAT
  single_trees <- 1:n_tree |>
    purrr::map(\(i) {
      X <- trainX[b[, i], ]
      Y <- matrix(trainY[b[, i], ], ncol = Variable_number)

      Inv_Cov_Y <- if (Command == 1) {
        # Placeholder; the inverse covariance is unused in the univariate case
        matrix(rep(0, 4), ncol = 2)
      } else {
        solve(cov(Y))
      }

      Single_Model <- MultivariateRandomForest::build_single_tree(X, Y, m_feature, min_leaf, Inv_Cov_Y, Command)
      Y_pred <- MultivariateRandomForest::single_tree_prediction(Single_Model, trainX, Variable_number)

      Y_HAT <<- Y_HAT + Y_pred

      message("Fit tree ", i)

      Single_Model
    })

  # Divide by the number of trees to get the average prediction
  Y_HAT <- Y_HAT/n_tree

  # Return value (for use in predict_mrf)
  list(preds = Y_HAT,
       trees = single_trees,
       constants = c(n_tree = n_tree,
                     m_feature = m_feature,
                     min_leaf = min_leaf,
                     p = Variable_number))

}
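
# Hypothetical usage sketch (not part of the package; predict_mrf is the
# intended interface): new-data predictions could be formed by averaging the
# saved trees with the same MultivariateRandomForest helper used above.
# `newX` stands in for a new design matrix with the columns of `trainX`.
predict_mrf_sketch <- function(fit, newX) {
  tree_preds <- fit$trees |>
    purrr::map(\(tree) MultivariateRandomForest::single_tree_prediction(
      tree, newX, fit$constants[["p"]]))
  purrr::reduce(tree_preds, `+`) / fit$constants[["n_tree"]]
}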