R/compiled_code_tslstmx.R

Defines functions forecast_best_model compare_predicted_vs_actual predict_y_values best_model_on_validation ts_lstm_x_tuning define_early_stopping convert_to_tensors reshape_for_lstm convert_to_numeric_matrices initialize_tensorflow split_data embed_columns check_and_format_data

Documented in best_model_on_validation check_and_format_data compare_predicted_vs_actual convert_to_numeric_matrices convert_to_tensors define_early_stopping embed_columns forecast_best_model initialize_tensorflow predict_y_values reshape_for_lstm split_data ts_lstm_x_tuning

#' Check and Format Data
#'
#' This function checks the compatibility of a given data frame and performs necessary formatting.
#'
#' @param data A data frame containing a 'Date' column and a numeric column 'A'.
#' @param n.head Number of rows to display from the formatted data frame (default is 6).
#'
#' @return A formatted data frame with the specified number of rows displayed.
#'
#' @details
#' This function checks the format of the 'Date' column and ensures it is in the format 'dd-mm-yy'.
#' It also checks the presence of the 'A' column and ensures it contains numeric values.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#' }
#' @importFrom stats embed predict
#' @importFrom utils head
#'
#' @export
check_and_format_data <- function(data, n.head = 6) {
  # Check if the 'Date' column is in the correct format
  if (!inherits(data$Date, "Date") && all(grepl("^\\d{2}-\\d{2}-\\d{2}$", data$Date))) {
    # Convert 'Date' column to Date format
    data$Date <- as.Date(data$Date, format = "%d-%m-%y")
    warning("Dates in the 'Date' column have been successfully reformatted.\n")
  } else if (!inherits(data$Date, "Date")) {
    warning("Warning: 'Date' column format is not recognized. Please make sure it is in the format 'dd-mm-yy'. Attempting to parse...\n")
    # Try to parse 'Date' column
    data$Date <- as.Date(data$Date, format = "%d-%m-%y")
    if (any(is.na(data$Date))) {
      stop("Error: Unable to parse 'Date' column. Please ensure it is in the format 'dd-mm-yy'.")
    } else {
      warning("Dates in the 'Date' column have been successfully reformatted.\n")
    }
  }

  # Check if the 'A' column exists
  if (!"A" %in% colnames(data)) {
    stop("Error: 'A' column not found in the data set. Please make sure it contains the required column 'A'.")
  }

  # Check if the 'A' column has numeric values
  if (!all(is.numeric(data$A))) {
    stop("Error: 'A' column should contain numeric values.")
  }

  warning("Data set compatibility check passed successfully.\n")
  return(head(data, n.head))
}
#' Embed columns and create a new data frame
#'
#' This function takes a data frame and embeds specified columns to create a new data frame.
#'
#' @param data A data frame containing the original columns.
#' @param n_lag Number of lags for embedding.
#'
#' @return A list containing the new data frame and column names of the embedded columns.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#' }
#'
#' @export
embed_columns <- function(data, n_lag = 2) {
  A_embed <- embed(data$A, n_lag)
  X_embed <- embed(data$X, n_lag)
  embedded_colnames <- c("y", paste0("y", 1:(n_lag - 1)), "x")
  new_data <- data.frame(A_embed, X_embed[, 1])
  colnames(new_data) <- embedded_colnames
  return(list(data_frame = new_data, column_names = embedded_colnames))
}

#' Split data into training and validation sets
#'
#' This function takes a data frame and splits it into training and validation sets.
#'
#' @param new_data The data frame to be split.
#' @param val_ratio The ratio of the data to be used for validation (default is 0.1).
#'
#' @return A list containing the training and validation data frames.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' }
#'
#' @export
split_data <- function(new_data, val_ratio = 0.1) {
  validation_size <- ceiling(val_ratio * nrow(new_data))
  train_data <- new_data[1:(nrow(new_data) - validation_size), ]
  validation_data <- new_data[(nrow(new_data) - validation_size + 1):nrow(new_data), ]
  return(list(train_data = train_data, validation_data = validation_data))
}

#' Function to initialize TensorFlow and enable eager execution
#'
#' This function initializes TensorFlow and enables eager execution.
#'
#' @return No return value, called for smooth running
#'
#' @import tensorflow
#' @importFrom tensorflow tf
#'
#' @examples
#' \donttest{
#' initialize_tensorflow()
#' }
#'
#' @export
initialize_tensorflow <- function() {
  tf$compat$v1$enable_eager_execution()
}

#' Function to convert columns to numeric matrices
#'
#' This function converts specific columns in the data frames to numeric matrices.
#'
#' @param train_data Training data frame.
#' @param validation_data Validation data frame.
#' @param embedded_colnames Names of the embedded columns.
#'
#' @return A list containing numeric matrices for training and validation sets.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#' }
#'
#' @export
convert_to_numeric_matrices <- function(train_data, validation_data, embedded_colnames) {
  X_train <- as.matrix(train_data[, embedded_colnames])
  y_train <- as.numeric(train_data$y)
  X_val <- as.matrix(validation_data[, embedded_colnames])
  y_val <- as.numeric(validation_data$y)
  return(list(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val))
}

#' Function to reshape input data for LSTM
#'
#' This function reshapes input data to be compatible with LSTM models.
#'
#' @param X_train Numeric matrix representing the training input data.
#' @param X_val Numeric matrix representing the validation input data.
#'
#' @return A list containing reshaped training and validation input data.
#'
#' @import tensorflow
#' @importFrom tensorflow array_reshape
#' @importFrom reticulate import
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' }
#'
#' @export
reshape_for_lstm <- function(X_train, X_val) {
  X_train <- array_reshape(X_train, c(dim(X_train)[1], 1, dim(X_train)[2]))
  X_val <- array_reshape(X_val, c(dim(X_val)[1], 1, dim(X_val)[2]))
  return(list(X_train = X_train, X_val = X_val))
}

#' Function to convert data to TensorFlow tensors
#'
#' This function converts input data to TensorFlow tensors for compatibility with TensorFlow and keras models.
#'
#' @param X_train Numeric matrix representing the training input data.
#' @param y_train Numeric vector representing the training output data.
#' @param X_val Numeric matrix representing the validation input data.
#' @param y_val Numeric vector representing the validation output data.
#'
#' @return A list containing TensorFlow tensors for training and validation data.
#'
#' @import tensorflow
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' }
#'
#' @export
convert_to_tensors <- function(X_train, y_train, X_val, y_val) {
  X_train <- tf$constant(X_train)
  y_train <- tf$constant(y_train)
  X_val <- tf$constant(X_val)
  y_val <- tf$constant(y_val)
  return(list(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val))
}
#' Function to define early stopping callback
#'
#' This function defines an early stopping callback for keras models.
#'
#' @param n_patience Integer specifying the number of epochs with no improvement after which training will be stopped.
#'
#' @return A keras early stopping callback.
#'
#' @import tensorflow
#' @import keras
#' @importFrom keras callback_early_stopping
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' n_patience <- 50
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#' }
#'
#' @export
define_early_stopping <- function(n_patience) {
  early_stopping <- callback_early_stopping(
    monitor = "val_loss",
    patience = n_patience,
    restore_best_weights = TRUE
  )
  return(early_stopping)
}

#' Time Series LSTM Hyperparameter Tuning
#'
#' This function performs hyperparameter tuning for a Time Series LSTM model using a grid search approach.
#'
#' @param X_train Numeric matrix, the training input data.
#' @param y_train Numeric vector, the training target data.
#' @param X_val Numeric matrix, the validation input data.
#' @param y_val Numeric vector, the validation target data.
#' @param embedded_colnames Character vector, column names of the embedded features.
#' @param custom_loss Function, custom loss function for the LSTM model.
#' @param early_stopping keras early stopping callback.
#' @param n_lag Integer, desired lag value.
#' @param lstm_units_list Numeric vector, list of LSTM units to search over.
#' @param learning_rate_list Numeric vector, list of learning rates to search over.
#' @param batch_size_list Numeric vector, list of batch sizes to search over.
#' @param dropout_list Numeric vector, list of dropout rates to search over.
#' @param l1_reg_list Numeric vector, list of L1 regularization values to search over.
#' @param l2_reg_list Numeric vector, list of L2 regularization values to search over.
#' @param n_iter Integer, number of epochs for each model training.
#' @param n_verbose Integer, level of verbosity during training (0 or 1).
#'
#' @return A list containing the results data frame, all histories, and LSTM models.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' n_patience <- 50
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' X_train <- tensors$X_train
#' X_val <- tensors$X_val
#'
#' y_train <- tensors$y_train
#' y_val <- tensors$y_val
#'
#' embedded_colnames <- result_embed$column_names
#'
#' # Define your custom loss function
#' custom_loss <- function(y_true, y_pred) {
#'   condition <- tf$math$equal(y_true, 0)
#'   loss <- tf$math$reduce_mean(tf$math$square(y_true - y_pred))  # Remove 'axis'
#'   loss <- tf$where(condition, tf$constant(0), loss)
#'   return(loss)
#' }
#'
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' grid_search_results <- ts_lstm_x_tuning(
#'   X_train, y_train, X_val, y_val,
#'   embedded_colnames, custom_loss, early_stopping,
#'   n_lag = 2, # desired lag value
#'   lstm_units_list = c(32),
#'   learning_rate_list = c(0.001, 0.01),
#'   batch_size_list = c(32),
#'   dropout_list = c(0.2),
#'   l1_reg_list = c(0.001),
#'   l2_reg_list = c(0.001),
#'   n_iter = 10,
#'   n_verbose = 0 # or 1
#' )
#'
#' results_df <- grid_search_results$results_df
#' all_histories <- grid_search_results$all_histories
#' lstm_models <- grid_search_results$lstm_models
#'
#' # Find the row with the minimum val_loss_mae in results_df
#' min_val_loss_row <- results_df[which.min(results_df$val_loss_mae), ]
#'
#' # Extract hyperparameters from the row
#' best_lstm_units <- min_val_loss_row$lstm_units
#' best_learning_rate <- min_val_loss_row$learning_rate
#' best_batch_size <- min_val_loss_row$batch_size
#' best_n_lag <- min_val_loss_row$n_lag
#' best_dropout <- min_val_loss_row$dropout
#' best_l1_reg <- min_val_loss_row$l1_reg
#' best_l2_reg <- min_val_loss_row$l2_reg
#'
#' # Generate the lstm_model_name for the best model
#' best_model_name <- paste0("lstm_model_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                           "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                           "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Generate the history_name for the best model
#' best_history_name <- paste0("history_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                             "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                             "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Access the best model from lstm_models
#' best_model <- lstm_models[[best_model_name]]
#'
#' best_model_details <- data.frame(min_val_loss_row)
#'
#' colnames(best_model_details) <- colnames(results_df)
#'
#' # Access the best model from lstm_models
#' best_history <- all_histories[[best_history_name]]
#'}
#'
#' @import tensorflow
#' @import keras
#' @importFrom keras layer_lstm layer_dense optimizer_adam regularizer_l1_l2
#' @references Garai, S., & Paul, R. K. (2023). Development of MCS based-ensemble models using CEEMDAN decomposition and machine intelligence. Intelligent Systems with Applications, 18, 200202.
#' @export
ts_lstm_x_tuning <- function(
    X_train, y_train, X_val, y_val,
    embedded_colnames, custom_loss, early_stopping,
    n_lag = 2,
    lstm_units_list = c(32),
    learning_rate_list = c(0.001, 0.01),
    batch_size_list = c(32),
    dropout_list = c(0.2),
    l1_reg_list = c(0.001),
    l2_reg_list = c(0.001),
    n_iter = 10,
    n_verbose = 0
) {
  results_df <- data.frame(
    lstm_units = numeric(),
    learning_rate = numeric(),
    batch_size = numeric(),
    n_lag = numeric(),
    dropout = numeric(),
    l1_reg = numeric(),
    l2_reg = numeric(),
    training_loss_mae = numeric(),
    val_loss_mae = numeric(),
    early_stopping = numeric()
  )

  all_histories <- list()
  lstm_models <- list()

  for (lstm_units in lstm_units_list) {
    for (learning_rate in learning_rate_list) {
      for (batch_size in batch_size_list) {
        for (dropout in dropout_list) {
          for (l1_reg in l1_reg_list) {
            for (l2_reg in l2_reg_list) {
              # Create and compile the model with current hyperparameters
              lstm_model_name <- paste0("lstm_model_lu_", lstm_units, "_lr_", learning_rate,
                                        "_bs_", batch_size, "_lag_", n_lag,
                                        "_do_", dropout, "_l1_", l1_reg, "_l2_", l2_reg)
              lstm_model <- keras_model_sequential(name = lstm_model_name)
              lstm_model %>%
                layer_lstm(units = lstm_units, return_sequences = TRUE, input_shape = c(1, length(embedded_colnames)),
                           dropout = dropout, recurrent_dropout = dropout,
                           kernel_regularizer = regularizer_l1_l2(l1 = l1_reg, l2 = l2_reg)) %>%
                layer_lstm(units = lstm_units, return_sequences = TRUE, dropout = dropout, recurrent_dropout = dropout,
                           kernel_regularizer = regularizer_l1_l2(l1 = l1_reg, l2 = l2_reg)) %>%
                layer_lstm(units = lstm_units, dropout = dropout, recurrent_dropout = dropout,
                           kernel_regularizer = regularizer_l1_l2(l1 = l1_reg, l2 = l2_reg)) %>%
                layer_dense(units = 1)
              lstm_model %>% compile(
                loss = custom_loss,
                optimizer = optimizer_adam(learning_rate = learning_rate),
                metrics = c("mean_absolute_error")
              )

              # Train the model with early stopping
              warning("Training", lstm_model_name, "\n")
              history <- lstm_model %>% fit(
                x = X_train,
                y = y_train,
                epochs = n_iter,
                batch_size = batch_size,
                validation_data = list(X_val, y_val),
                verbose = n_verbose,
                callbacks = list(early_stopping)
              )

              # Find the minimum val_mean_absolute_error for this combination
              min_val_loss_epoch <- which.min(history$metrics$val_loss)

              # Extract relevant information from 'history' and store in the results data frame
              results_df <- rbind(results_df, data.frame(
                lstm_units = lstm_units,
                learning_rate = learning_rate,
                batch_size = batch_size,
                n_lag = n_lag,
                dropout = dropout,
                l1_reg = l1_reg,
                l2_reg = l2_reg,
                training_loss_mae = history$metrics$mean_absolute_error[min_val_loss_epoch],
                val_loss_mae = history$metrics$val_mean_absolute_error[min_val_loss_epoch],
                early_stopping = min_val_loss_epoch
              ))

              # Store the history in the list with a unique name
              history_name <- paste0("history_lu_", lstm_units, "_lr_", learning_rate,
                                     "_bs_", batch_size, "_lag_", n_lag,
                                     "_do_", dropout, "_l1_", l1_reg, "_l2_", l2_reg)
              all_histories[[history_name]] <- history

              # Store the LSTM model in the list with a unique name
              lstm_models[[lstm_model_name]] <- lstm_model
            }
          }
        }
      }
    }
  }

  return(list(results_df = results_df, all_histories = all_histories, lstm_models = lstm_models))
}
#' Evaluate the best LSTM model on the validation set
#'
#' This function evaluates the performance of the best LSTM model on the provided validation set.
#'
#' @param best_model The best LSTM model obtained from hyperparameter tuning.
#' @param X_val The validation set input data.
#' @param y_val The validation set target data.
#'
#' @return The validation loss of the best model on the provided validation set.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' n_patience <- 50
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' X_train <- tensors$X_train
#' X_val <- tensors$X_val
#'
#' y_train <- tensors$y_train
#' y_val <- tensors$y_val
#'
#' embedded_colnames <- result_embed$column_names
#'
#' # Define your custom loss function
#' custom_loss <- function(y_true, y_pred) {
#'   condition <- tf$math$equal(y_true, 0)
#'   loss <- tf$math$reduce_mean(tf$math$square(y_true - y_pred))  # Remove 'axis'
#'   loss <- tf$where(condition, tf$constant(0), loss)
#'   return(loss)
#' }
#'
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' grid_search_results <- ts_lstm_x_tuning(
#'   X_train, y_train, X_val, y_val,
#'   embedded_colnames, custom_loss, early_stopping,
#'   n_lag = 2, # desired lag value
#'   lstm_units_list = c(32),
#'   learning_rate_list = c(0.001, 0.01),
#'   batch_size_list = c(32),
#'   dropout_list = c(0.2),
#'   l1_reg_list = c(0.001),
#'   l2_reg_list = c(0.001),
#'   n_iter = 10,
#'   n_verbose = 0 # or 1
#' )
#'
#' results_df <- grid_search_results$results_df
#' all_histories <- grid_search_results$all_histories
#' lstm_models <- grid_search_results$lstm_models
#'
#' # Find the row with the minimum val_loss_mae in results_df
#' min_val_loss_row <- results_df[which.min(results_df$val_loss_mae), ]
#'
#' # Extract hyperparameters from the row
#' best_lstm_units <- min_val_loss_row$lstm_units
#' best_learning_rate <- min_val_loss_row$learning_rate
#' best_batch_size <- min_val_loss_row$batch_size
#' best_n_lag <- min_val_loss_row$n_lag
#' best_dropout <- min_val_loss_row$dropout
#' best_l1_reg <- min_val_loss_row$l1_reg
#' best_l2_reg <- min_val_loss_row$l2_reg
#'
#' # Generate the lstm_model_name for the best model
#' best_model_name <- paste0("lstm_model_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                           "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                           "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Generate the history_name for the best model
#' best_history_name <- paste0("history_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                             "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                             "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Access the best model from lstm_models
#' best_model <- lstm_models[[best_model_name]]
#'
#' best_model_details <- data.frame(min_val_loss_row)
#'
#' colnames(best_model_details) <- colnames(results_df)
#'
#' # Access the best model from lstm_models
#' best_history <- all_histories[[best_history_name]]
#'
#' validation_loss_best <- best_model_on_validation(best_model, X_val, y_val)
#' }
#'
#' @import tensorflow
#'
#' @export
best_model_on_validation <- function(best_model, X_val, y_val) {
  validation_loss <- best_model %>% evaluate(X_val, y_val)
  warning("Validation Loss:", validation_loss, "\n")
  return(validation_loss)
}

#' Predict y values for the training and validation sets using the best LSTM model
#'
#' This function predicts y values for the training and validation sets using the provided LSTM model.
#'
#' @param best_model The best LSTM model obtained from hyperparameter tuning.
#' @param X_train The training set input data.
#' @param X_val The validation set input data.
#' @param train_data The training set data, including x values.
#' @param validation_data The validation set data, including x values.
#'
#' @return A list containing the predicted y values for the training and validation sets.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' n_patience <- 50
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' X_train <- tensors$X_train
#' X_val <- tensors$X_val
#'
#' y_train <- tensors$y_train
#' y_val <- tensors$y_val
#'
#' embedded_colnames <- result_embed$column_names
#'
#' # Define your custom loss function
#' custom_loss <- function(y_true, y_pred) {
#'   condition <- tf$math$equal(y_true, 0)
#'   loss <- tf$math$reduce_mean(tf$math$square(y_true - y_pred))  # Remove 'axis'
#'   loss <- tf$where(condition, tf$constant(0), loss)
#'   return(loss)
#' }
#'
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' grid_search_results <- ts_lstm_x_tuning(
#'   X_train, y_train, X_val, y_val,
#'   embedded_colnames, custom_loss, early_stopping,
#'   n_lag = 2, # desired lag value
#'   lstm_units_list = c(32),
#'   learning_rate_list = c(0.001, 0.01),
#'   batch_size_list = c(32),
#'   dropout_list = c(0.2),
#'   l1_reg_list = c(0.001),
#'   l2_reg_list = c(0.001),
#'   n_iter = 10,
#'   n_verbose = 0 # or 1
#' )
#'
#' results_df <- grid_search_results$results_df
#' all_histories <- grid_search_results$all_histories
#' lstm_models <- grid_search_results$lstm_models
#'
#' # Find the row with the minimum val_loss_mae in results_df
#' min_val_loss_row <- results_df[which.min(results_df$val_loss_mae), ]
#'
#' # Extract hyperparameters from the row
#' best_lstm_units <- min_val_loss_row$lstm_units
#' best_learning_rate <- min_val_loss_row$learning_rate
#' best_batch_size <- min_val_loss_row$batch_size
#' best_n_lag <- min_val_loss_row$n_lag
#' best_dropout <- min_val_loss_row$dropout
#' best_l1_reg <- min_val_loss_row$l1_reg
#' best_l2_reg <- min_val_loss_row$l2_reg
#'
#' # Generate the lstm_model_name for the best model
#' best_model_name <- paste0("lstm_model_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                           "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                           "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Generate the history_name for the best model
#' best_history_name <- paste0("history_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                             "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                             "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Access the best model from lstm_models
#' best_model <- lstm_models[[best_model_name]]
#'
#' best_model_details <- data.frame(min_val_loss_row)
#'
#' colnames(best_model_details) <- colnames(results_df)
#'
#' # Access the best model from lstm_models
#' best_history <- all_histories[[best_history_name]]
#'
#' validation_loss_best <- best_model_on_validation(best_model, X_val, y_val)
#' predicted_values <- predict_y_values(best_model, X_train, X_val, train_data, validation_data)
#' y_train_pred <- predicted_values$y_train_pred
#' y_val_pred <- predicted_values$y_val_pred
#' }
#'
#' @import tensorflow
#'
#' @export
predict_y_values <- function(best_model, X_train, X_val, train_data, validation_data) {
  y_train_pred <- best_model %>% predict(X_train)
  y_val_pred <- best_model %>% predict(X_val)
  y_train_pred <- as.numeric(y_train_pred)
  y_val_pred <- as.numeric(y_val_pred)
  y_train_pred[train_data$x == 0] <- 0
  y_val_pred[validation_data$x == 0] <- 0
  return(list(y_train_pred = y_train_pred, y_val_pred = y_val_pred))
}
#' Compare predicted and actual values for training and validation sets
#'
#' This function compares the predicted and actual values for the training and validation sets and computes metrics.
#'
#' @param train_data The training set data, including actual y values.
#' @param validation_data The validation set data, including actual y values.
#' @param y_train_pred Predicted y values for the training set.
#' @param y_val_pred Predicted y values for the validation set.
#'
#' @return A list containing data frames with the comparison of actual vs. predicted values for training and validation sets,
#' as well as metrics for the training and validation sets.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' n_patience <- 50
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' X_train <- tensors$X_train
#' X_val <- tensors$X_val
#'
#' y_train <- tensors$y_train
#' y_val <- tensors$y_val
#'
#' embedded_colnames <- result_embed$column_names
#'
#' # Define your custom loss function
#' custom_loss <- function(y_true, y_pred) {
#'   condition <- tf$math$equal(y_true, 0)
#'   loss <- tf$math$reduce_mean(tf$math$square(y_true - y_pred))  # Remove 'axis'
#'   loss <- tf$where(condition, tf$constant(0), loss)
#'   return(loss)
#' }
#'
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' grid_search_results <- ts_lstm_x_tuning(
#'   X_train, y_train, X_val, y_val,
#'   embedded_colnames, custom_loss, early_stopping,
#'   n_lag = 2, # desired lag value
#'   lstm_units_list = c(32),
#'   learning_rate_list = c(0.001, 0.01),
#'   batch_size_list = c(32),
#'   dropout_list = c(0.2),
#'   l1_reg_list = c(0.001),
#'   l2_reg_list = c(0.001),
#'   n_iter = 10,
#'   n_verbose = 0 # or 1
#' )
#'
#' results_df <- grid_search_results$results_df
#' all_histories <- grid_search_results$all_histories
#' lstm_models <- grid_search_results$lstm_models
#'
#' # Find the row with the minimum val_loss_mae in results_df
#' min_val_loss_row <- results_df[which.min(results_df$val_loss_mae), ]
#'
#' # Extract hyperparameters from the row
#' best_lstm_units <- min_val_loss_row$lstm_units
#' best_learning_rate <- min_val_loss_row$learning_rate
#' best_batch_size <- min_val_loss_row$batch_size
#' best_n_lag <- min_val_loss_row$n_lag
#' best_dropout <- min_val_loss_row$dropout
#' best_l1_reg <- min_val_loss_row$l1_reg
#' best_l2_reg <- min_val_loss_row$l2_reg
#'
#' # Generate the lstm_model_name for the best model
#' best_model_name <- paste0("lstm_model_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                           "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                           "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Generate the history_name for the best model
#' best_history_name <- paste0("history_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                             "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                             "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Access the best model from lstm_models
#' best_model <- lstm_models[[best_model_name]]
#'
#' best_model_details <- data.frame(min_val_loss_row)
#'
#' colnames(best_model_details) <- colnames(results_df)
#'
#' # Access the best model from lstm_models
#' best_history <- all_histories[[best_history_name]]
#'
#' validation_loss_best <- best_model_on_validation(best_model, X_val, y_val)
#' predicted_values <- predict_y_values(best_model, X_train, X_val, train_data, validation_data)
#' y_train_pred <- predicted_values$y_train_pred
#' y_val_pred <- predicted_values$y_val_pred
#' comparison <- compare_predicted_vs_actual(train_data, validation_data, y_train_pred, y_val_pred)
#' compare_train <- comparison$compare_train
#' compare_val <- comparison$compare_val
#' metrics_train <- comparison$metrics_train
#' metrics_val <- comparison$metrics_val
#' }
#'
#' @import AllMetrics
#' @importFrom AllMetrics all_metrics
#'
#' @export
compare_predicted_vs_actual <- function(train_data, validation_data, y_train_pred, y_val_pred) {
  compare_train <- data.frame(train_data$y, round(y_train_pred, 3))
  colnames(compare_train) <- c("actual", "predicted")
  compare_val <- data.frame(validation_data$y, round(y_val_pred, 3))
  colnames(compare_val) <- c("actual", "predicted")
  metrics_train <- all_metrics(train_data$y, y_train_pred)
  metrics_val <- all_metrics(validation_data$y, y_val_pred)
  return(list(compare_train = compare_train, compare_val = compare_val, metrics_train = metrics_train, metrics_val = metrics_val))
}
#' Perform forecasting using the best model
#'
#' This function performs forecasting using the best-trained model.
#'
#' @param best_model The best-trained LSTM model.
#' @param best_learning_rate The best learning rate used during training.
#' @param custom_loss The custom loss function used during training.
#' @param n_lag The lag value used during training.
#' @param new_data The input data for forecasting.
#' @param test The test data frame containing the input data for forecasting.
#' @param forecast_steps The number of steps to forecast.
#'
#' @return A list containing the forecasted values, actual vs. forecasted data frame, and metrics for forecasting.
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   Date = as.Date(c("01-04-18", "02-04-18", "03-04-18", "04-04-18", "05-04-18",
#'                    "06-04-18", "07-04-18", "08-04-18", "09-04-18", "10-04-18",
#'                    "11-04-18", "12-04-18", "13-04-18", "14-04-18", "15-04-18",
#'                    "16-04-18", "17-04-18", "18-04-18", "19-04-18", "20-04-18"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 4, 12, 20, 16, 16, 0, 12, 18, 12, 18, 18, 0, 0, 33, 31, 38, 76, 198)
#' )
#' check_and_format_data(data)
#' # Add a new column 'X' based on the values in the second column
#' data$X <- ifelse(data$A != 0, 1, 0)
#'
#' result_embed <- embed_columns(data = data, n_lag = 2)
#' new_data <- result_embed$data_frame
#' embedded_colnames <- result_embed$column_names
#'
#' result_split <- split_data(new_data = new_data, val_ratio = 0.1)
#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data

#' train_data <- result_split$train_data
#' validation_data <- result_split$validation_data
#' embedded_colnames <- result_embed$column_names
#' numeric_matrices <- convert_to_numeric_matrices(train_data = train_data,
#'                                                 validation_data = validation_data,
#'                                                 embedded_colnames = embedded_colnames)
#' X_train <- numeric_matrices$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- numeric_matrices$X_val
#' y_val <- numeric_matrices$y_val
#'
#' #' initialize_tensorflow()
#'
#' X_train <- numeric_matrices$X_train
#' X_val <- numeric_matrices$X_val
#' reshaped_data <- reshape_for_lstm(X_train = X_train, X_val = X_val)
#' X_train <- reshaped_data$X_train
#' X_val <- reshaped_data$X_val
#' X_train <- reshaped_data$X_train
#' y_train <- numeric_matrices$y_train
#' X_val <- reshaped_data$X_val
#' y_val <- numeric_matrices$y_val
#' tf <- reticulate::import("tensorflow")
#' tensors <- convert_to_tensors(X_train = X_train, y_train = y_train, X_val = X_val, y_val = y_val)
#' X_train <- tensors$X_train
#' y_train <- tensors$y_train
#' X_val <- tensors$X_val
#' y_val <- tensors$y_val
#' n_patience <- 50
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' X_train <- tensors$X_train
#' X_val <- tensors$X_val
#'
#' y_train <- tensors$y_train
#' y_val <- tensors$y_val
#'
#' embedded_colnames <- result_embed$column_names
#'
#' # Define your custom loss function
#' custom_loss <- function(y_true, y_pred) {
#'   condition <- tf$math$equal(y_true, 0)
#'   loss <- tf$math$reduce_mean(tf$math$square(y_true - y_pred))  # Remove 'axis'
#'   loss <- tf$where(condition, tf$constant(0), loss)
#'   return(loss)
#' }
#'
#' early_stopping <- define_early_stopping(n_patience = n_patience)
#'
#' grid_search_results <- ts_lstm_x_tuning(
#'   X_train, y_train, X_val, y_val,
#'   embedded_colnames, custom_loss, early_stopping,
#'   n_lag = 2, # desired lag value
#'   lstm_units_list = c(32),
#'   learning_rate_list = c(0.001, 0.01),
#'   batch_size_list = c(32),
#'   dropout_list = c(0.2),
#'   l1_reg_list = c(0.001),
#'   l2_reg_list = c(0.001),
#'   n_iter = 10,
#'   n_verbose = 0 # or 1
#' )
#'
#' results_df <- grid_search_results$results_df
#' all_histories <- grid_search_results$all_histories
#' lstm_models <- grid_search_results$lstm_models
#'
#' # Find the row with the minimum val_loss_mae in results_df
#' min_val_loss_row <- results_df[which.min(results_df$val_loss_mae), ]
#'
#' # Extract hyperparameters from the row
#' best_lstm_units <- min_val_loss_row$lstm_units
#' best_learning_rate <- min_val_loss_row$learning_rate
#' best_batch_size <- min_val_loss_row$batch_size
#' best_n_lag <- min_val_loss_row$n_lag
#' best_dropout <- min_val_loss_row$dropout
#' best_l1_reg <- min_val_loss_row$l1_reg
#' best_l2_reg <- min_val_loss_row$l2_reg
#'
#' # Generate the lstm_model_name for the best model
#' best_model_name <- paste0("lstm_model_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                           "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                           "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Generate the history_name for the best model
#' best_history_name <- paste0("history_lu_", best_lstm_units, "_lr_", best_learning_rate,
#'                             "_bs_", best_batch_size, "_lag_", best_n_lag,
#'                             "_do_", best_dropout, "_l1_", best_l1_reg, "_l2_", best_l2_reg)
#'
#' # Access the best model from lstm_models
#' best_model <- lstm_models[[best_model_name]]
#'
#' best_model_details <- data.frame(min_val_loss_row)
#'
#' colnames(best_model_details) <- colnames(results_df)
#'
#' # Access the best model from lstm_models
#' best_history <- all_histories[[best_history_name]]
#'
#' validation_loss_best <- best_model_on_validation(best_model, X_val, y_val)
#' predicted_values <- predict_y_values(best_model, X_train, X_val, train_data, validation_data)
#' y_train_pred <- predicted_values$y_train_pred
#' y_val_pred <- predicted_values$y_val_pred
#' comparison <- compare_predicted_vs_actual(train_data, validation_data, y_train_pred, y_val_pred)
#' compare_train <- comparison$compare_train
#' compare_val <- comparison$compare_val
#' metrics_train <- comparison$metrics_train
#' metrics_val <- comparison$metrics_val
#'
#' test <- data.frame(
#'   Date = as.Date(c("01-04-23", "02-04-23", "03-04-23", "04-04-23", "05-04-23",
#'                    "06-04-23", "07-04-23", "08-04-23", "09-04-23", "10-04-23",
#'                    "11-04-23", "12-04-23", "13-04-23", "14-04-23", "15-04-23",
#'                    "16-04-23", "17-04-23", "18-04-23", "19-04-23", "20-04-23"),
#'                  format = "%d-%m-%y"),
#'   A = c(0, 0, 15, 4, -31, 24, 14, 0, 0, 33, 38, 33, 29, 29, 25, 0, 44, 67, 162, 278)
#' )
#'
#' test$X <- ifelse(test$A != 0, 1, 0)
#'
#' n_forecast <- nrow(test)
#'
#' # Perform one-step-ahead forecasting
#' forecast_steps <- n_forecast
#' current_row <- nrow(new_data)
#' forecast_results <- forecast_best_model(best_model, best_learning_rate,
#'                                         custom_loss, n_lag = 2,
#'                                         new_data, test,
#'                                         forecast_steps)
#'
#' # Access the results
#' forecast_values <- forecast_results$forecast_values
#' actual_vs_forecast <- forecast_results$actual_vs_forecast
#' metrics_forecast <- forecast_results$metrics_forecast
#' }
#'
#' @import AllMetrics
#' @importFrom AllMetrics all_metrics
#' @importFrom keras clone_model
#'
#' @export
forecast_best_model <- function(
    best_model,
    best_learning_rate,
    custom_loss,
    n_lag = 2,
    new_data,
    test,
    forecast_steps
) {
  forecast_model <- clone_model(best_model)
  forecast_model$set_weights(best_model$get_weights())

  # Compile the forecasting model
  forecast_model %>% compile(
    loss = custom_loss,
    optimizer = optimizer_adam(learning_rate = best_learning_rate),
    metrics = c("mean_absolute_error")
  )

  current_row <- nrow(new_data)

  for (i in 1:forecast_steps) {
    forecast_input <- as.matrix(new_data[current_row, c("y", paste0("y", 1:(n_lag - 1)), "x")])
    forecast_input <- array_reshape(forecast_input, c(1, 1, length(forecast_input)))

    next_value <- forecast_model %>% predict(forecast_input)
    next_value[test$X[i] == 0] <- 0

    new_data[current_row + 1, "y"] <- next_value
    new_data[current_row + 1, "y1"] <- new_data[current_row, "y"]
    for (j in 1:(n_lag - 1)) {
      new_data[current_row + 1, paste0("y", j + 1)] <- new_data[current_row, paste0("y", j)]
    }
    new_data[current_row + 1, "x"] <- test$X[i]

    test$forecasted_A[i] <- round(next_value, 3)

    current_row <- current_row + 1
    forecast_model$set_weights(forecast_model %>% get_weights())
  }

  forecast_values <- test$forecasted_A[1:forecast_steps]
  actual_vs_forecast <- test[1:forecast_steps, ]

  # forecast_loss <- forecast_model %>% evaluate(tf$constant(X_val), tf$constant(y_val))
  # warning("Forecasting Loss on Validation Set:", forecast_loss, "\n")

  metrics_forecast <- all_metrics(test$A[1:forecast_steps], test$forecasted_A[1:forecast_steps])

  return(list(forecast_values = forecast_values, actual_vs_forecast = actual_vs_forecast, metrics_forecast = metrics_forecast))
}

Try the tsLSTMx package in your browser

Any scripts or data that you put into this service are public.

tsLSTMx documentation built on May 29, 2024, 7:20 a.m.