R/autoann.R
In autoann: Neural Network–Based Model Selection and Forecasting

Documented in nn_model_selector

#=====================================================
# Neural Network Model Selector
#=====================================================

#' Neural Network Model Selector
#'
#' Fits multiple single-hidden-layer neural network models by
#' evaluating all possible predictor combinations and hidden node sizes.
#' The best model is selected based on minimum RMSE on test data.
#'
#' @param data A data frame containing the response and predictor variables.
#' @param response_var Character string specifying the response variable name.
#' @param train_ratio Proportion of data used for training (default = 0.75).
#' @param max_nodes Maximum number of hidden layer nodes to evaluate (default = 10).
#' @param maxit Maximum number of iterations for neural network training (default = 500).
#' @param seed Random seed for reproducibility (default = 123).
#'
#' @return A list containing:
#' \itemize{
#'   \item best_predictors: Predictor variables of the best model
#'   \item best_hidden_nodes: Optimal number of hidden nodes
#'   \item best_performance: RMSE and MAPE of the best model
#'   \item performance_table: Performance metrics for all model combinations
#'   \item fitted: Actual vs fitted values for training data
#'   \item forecast: Actual vs forecasted values for test data
#' }
#'
#' @details
#' Predictors are standardized before model fitting.
#' Model performance is evaluated using RMSE and MAPE.
#'
#' @examples
#' data_nn <- data.frame(
#'   y = c(
#'     239.7255591, 239.6504622, 239.5848569, 239.5296290,
#'     239.4858835, 239.4547257, 239.4372607, 239.4345936,
#'     239.4478298, 239.4780743, 239.5264322, 239.5940089,
#'     239.6819094, 239.7912389, 239.9231027, 240.0786057,
#'     240.2588534, 240.4649507, 240.6980029, 240.9591152,
#'     241.2493927, 241.5699405, 241.9218640, 242.3062682
#'   ),
#'   x1 = c(
#'     9.968768102, 9.160298963, 7.294994564, 5.374395163,
#'     4.640671747, 5.495752064, 7.155488888, 8.532368787,
#'     8.032804811, 10.32506916, 12.17319856, 0.571302071,
#'     12.20714387, 27.13871523, 35.05310057, 42.40476672,
#'     46.28262184, 3.089076495, 40.31650327, 20.83471700,
#'     25.71428597, 21.06398002, 20.26911914, 22.17299909
#'   ),
#'   x2 = c(
#'     0.929946922, 4.246863796, 2.895052481, 6.827712819,
#'     11.53788333, 5.688668709, 26.08913871, 30.14926832,
#'     22.77412794, 4.519550904, 18.38195203, 40.50655053,
#'     58.61381025, 69.95404513, 76.08779720, 86.86779542,
#'     79.92326273, 32.26071629, 27.67652481, 66.80672448,
#'     86.54120883, 97.53881465, 95.49058569, 43.06666626
#'   ),
#'   x3 = c(
#'     143.7114315, 153.7664088, 158.5007862, 158.7973830,
#'     155.8340003, 150.2453258, 142.4471949, 132.8380705,
#'     121.6890278, 108.8662730, 94.52734991, 78.93448337,
#'     62.31616514, 44.76595425, 26.34367655, 7.109157889,
#'     12.72227903, 32.31332405, 50.67117014, 66.80301029,
#'     79.71603746, 88.41744464, 92.01533759, 90.21350491
#'   )
#' )
#'
#' result <- nn_model_selector(
#'   data = data_nn,
#'   response_var = "y",
#'   train_ratio = 0.75,
#'   max_nodes = 5,
#'   seed = 123
#' )
#'
#' result$best_performance
#'
#' @import nnet
#' @importFrom stats predict
#' @importFrom utils combn
#' @export
nn_model_selector <- function(data,
                              response_var,
                              train_ratio = 0.75,
                              max_nodes = 10,
                              maxit = 500,
                              seed = 123) {

  set.seed(seed)

  if (!is.character(response_var) || length(response_var) != 1) {
    stop("response_var must be a single column name as a character string")
  }

  predictor_vars <- setdiff(names(data), response_var)

  if (!is.numeric(data[[response_var]])) {
    stop("Response variable must be numeric")
  }

  if (any(!vapply(data[predictor_vars], is.numeric, logical(1)))) {
    stop("All predictors must be numeric")
  }

  # Scale predictors
  data[predictor_vars] <- scale(data[predictor_vars])

  # Train-test split
  n <- nrow(data)
  train_idx <- sample(seq_len(n), floor(train_ratio * n))

  train_data <- data[train_idx, , drop = FALSE]
  test_data  <- data[-train_idx, , drop = FALSE]

  # Predictor combinations
  predictor_sets <- unlist(
    lapply(
      seq_along(predictor_vars),
      function(k) combn(predictor_vars, k, simplify = FALSE)
    ),
    recursive = FALSE
  )

  results <- data.frame(
    predictors = character(),
    hidden_nodes = integer(),
    RMSE = numeric(),
    MAPE = numeric(),
    stringsAsFactors = FALSE
  )

  best_model <- NULL
  best_rmse  <- Inf
  best_spec  <- NULL

  for (preds in predictor_sets) {

    x_train <- as.matrix(train_data[, preds, drop = FALSE])
    y_train <- train_data[[response_var]]

    x_test <- as.matrix(test_data[, preds, drop = FALSE])
    y_test <- test_data[[response_var]]

    for (nodes in seq_len(max_nodes)) {

      model <- try(
        nnet(
          x = x_train,
          y = y_train,
          size = nodes,
          linout = TRUE,
          maxit = maxit,
          trace = FALSE
        ),
        silent = TRUE
      )

      if (inherits(model, "try-error")) next

      pred_test <- predict(model, x_test)

      rmse_val <- sqrt(mean((y_test - pred_test)^2))
      mape_val <- mean(abs((y_test - pred_test) / y_test)) * 100

      results <- rbind(
        results,
        data.frame(
          predictors = paste(preds, collapse = "+"),
          hidden_nodes = nodes,
          RMSE = rmse_val,
          MAPE = mape_val,
          stringsAsFactors = FALSE
        )
      )

      if (rmse_val < best_rmse) {
        best_rmse <- rmse_val
        best_model <- model
        best_spec <- list(
          predictors = preds,
          nodes = nodes,
          x_train = x_train,
          y_train = y_train,
          x_test = x_test,
          y_test = y_test
        )
      }
    }
  }

  list(
    best_predictors = best_spec$predictors,
    best_hidden_nodes = best_spec$nodes,
    best_performance = results[which.min(results$RMSE), ],
    performance_table = results,
    fitted = data.frame(
      actual = best_spec$y_train,
      fitted = predict(best_model, best_spec$x_train)
    ),
    forecast = data.frame(
      actual = best_spec$y_test,
      forecasted = predict(best_model, best_spec$x_test)
    )
  )
}