#' Calculate dataset statistics (per-column mean and sd) for scaling
#'
#' @importFrom magrittr %>%
#' @importFrom purrr %||%
#' @importFrom stats runif setNames
#' @keywords internal
calculate_dataset_stats <- function(numeric_dataset) {
list(
mean = numeric_dataset %>% dplyr::summarise(dplyr::across(dplyr::everything(), ~ round(mean(.), 4))),
sd = numeric_dataset %>% dplyr::summarise(dplyr::across(dplyr::everything(), ~ round(stats::sd(.), 4)))
)
}
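# Illustration (not run when sourced): a minimal sketch of what
# calculate_dataset_stats() returns, using the built-in mtcars data as a
# stand-in for a numeric training dataset. The result is a list of one-row
# data frames with per-column means and standard deviations, rounded to 4 dp,
# intended for scaling (e.g. (x - mean) / sd).
if (FALSE) {
  stats_demo <- calculate_dataset_stats(mtcars[, c("mpg", "hp", "wt")])
  stats_demo$mean  # one row: column means
  stats_demo$sd    # one row: column standard deviations
}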
# Pre-compute HVT models sequentially to ensure consistency
precompute_hvt_models <- function(entire_dataset, ncell_range, hvt_params,
verbose = FALSE, time_column) {
# Pre-prepare dataset once
numeric_dataset <- entire_dataset %>% dplyr::select(-dplyr::all_of(time_column))
# Prepare HVT training dataset
hvt_train_dataset <- numeric_dataset
# Train models using lapply
hvt_models <- lapply(ncell_range, function(nclust) {
tryCatch({
set.seed(279)
hvt_args <- c(list(dataset = hvt_train_dataset, n_cells = nclust), hvt_params)
hvt_results <- do.call(trainHVT, hvt_args)
if (is.null(hvt_results) || length(hvt_results) == 0) {
stop("trainHVT returned NULL or empty results")
}
return(hvt_results)
}, error = function(e) {
  if (verbose) cat("  trainHVT failed for n_cells =", nclust, ":", e$message, "\n")
  NULL
})
})
# Name the list elements
names(hvt_models) <- as.character(ncell_range)
return(hvt_models)
}
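# Illustration (not run when sourced): the sequential "train over a range,
# keep NULL on failure" pattern used above, with a hypothetical stand-in
# trainer (`toy_trainer`) instead of trainHVT so the sketch is self-contained.
if (FALSE) {
  toy_trainer <- function(n_cells) {
    if (n_cells > 5) stop("too many cells for this toy example")
    list(n_cells = n_cells)
  }
  ncell_range <- 3:7
  models <- lapply(ncell_range, function(nclust) {
    tryCatch(toy_trainer(nclust), error = function(e) NULL)
  })
  names(models) <- as.character(ncell_range)
  # models[["6"]] and models[["7"]] are NULL; the rest hold results.
}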
# Process a single nclust value using pre-computed HVT model
process_single_nclust_with_precomputed_model <- function(nclust, hvt_model, numeric_dataset, entire_dataset, expost_forecasting,
time_column, k_range, nn_range, num_simulations,
time_values, n_ahead, mae_metric,
raw_dataset_stats, parallel, verbose) {
tryCatch({
if (verbose) cat("Processing nclust =", nclust, "\n")
# Check if HVT model exists
if (is.null(hvt_model)) {
stop("Pre-computed HVT model for nclust = ", nclust, " is NULL")
}
# Score complete dataset using pre-computed model
complete_dataset <- dplyr::bind_rows(entire_dataset, expost_forecasting)
scoring <- score_hvt_model(complete_dataset, hvt_model)
# Prepare temporal data
temporal_data_all <- prepare_temporal_data(scoring, complete_dataset, time_column)
# Create training-only temporal data for transition probabilities
max_train_t <- max(entire_dataset[[time_column]], na.rm = TRUE)
temporal_data_train <- temporal_data_all %>%
dplyr::filter(.data[[time_column]] <= max_train_t)
# Get transition probabilities using ONLY training data
prob_trans_matx <- get_transition_probabilities(temporal_data_train, time_column)
# Get initial state
initial_state <- get_initial_state(temporal_data_train, time_column)
# Prepare centroid data
centroid_2d_points <- scoring$cellID_coordinates
centroid_data_for_mae <- prepare_centroid_data_for_mae(
hvt_model, numeric_dataset, scoring,
complete_dataset, raw_dataset_stats, mae_metric, centroid_2d_points
)
# Detect problematic states
scored_cells <- unique(scoring$scoredPredictedData$Cell.ID)
problematic_detection <- detect_problematic_states(prob_trans_matx, scored_cells)
problematic_states <- problematic_detection$all_problematic
problematic_states <- intersect(problematic_states, scored_cells)
# Filter to training cells only
training_cells <- unique(temporal_data_train$Cell.ID)
problematic_states <- problematic_states[problematic_states %in% training_cells]
if (verbose && length(problematic_states) > 0) {
cat(" Found", length(problematic_states), "problematic states\n")
}
# Initialize attempts storage
all_attempts_by_metric <- initialize_attempts_storage(mae_metric)
# Check if there are problematic states
if (length(problematic_states) == 0) {
# No problematic states - run baseline simulation only
baseline_mae <- run_baseline_simulation(
transition_matrix = prob_trans_matx,
temporal_data_train = temporal_data_train,
scoring = scoring,
hvt_results = hvt_model,
initial_state = initial_state,
centroid_data_for_mae = centroid_data_for_mae,
mae_metrics = mae_metric,
raw_dataset_stats = raw_dataset_stats,
time_column = time_column,
entire_dataset = entire_dataset,
expost_forecasting = expost_forecasting,
num_simulations = num_simulations,
time_values = time_values,
n_ahead = n_ahead
)
# Store baseline results
for (metric in mae_metric) {
status_msg <- if (is.null(baseline_mae[[metric]]) || is.na(baseline_mae[[metric]])) {
"ERROR: Data structure error in MSM baseline"
} else {
"Successful Iteration without Problematic state"
}
all_attempts_by_metric[[metric]][[1]] <- data.frame(
`Number of Cells` = nclust,
k = "-",
`Number of nearest neighbors` = "-",
mae = baseline_mae[[metric]],
status = status_msg,
stringsAsFactors = FALSE,
check.names = FALSE
)
}
comprehensive_results <- list(
simulation_results = list(),
all_attempts = all_attempts_by_metric
)
} else {
# Problematic states exist - run full k-nn combinations
clustering_data <- centroid_2d_points %>% dplyr::select(-.data$Cell.ID)
comprehensive_results <- process_all_k_nn_combinations(
k_local_range = k_range,
nn_range = nn_range,
problematic_states = problematic_states,
centroid_2d_points = centroid_2d_points,
clustering_data = clustering_data,
hvt_results = hvt_model,
scoring = scoring,
nclust = nclust,
mae_metric = mae_metric,
all_attempts_by_metric = all_attempts_by_metric,
prob_trans_matx = prob_trans_matx,
temporal_data_complete = temporal_data_all,
initial_state = initial_state,
centroid_data_for_mae = centroid_data_for_mae,
raw_dataset_stats = raw_dataset_stats,
time_column = time_column,
entire_dataset = entire_dataset,
expost_forecasting = expost_forecasting,
num_simulations = num_simulations,
time_values = time_values,
n_ahead = n_ahead,
parallel = FALSE, # k-nn combinations are run sequentially within each nclust
verbose = verbose
)
}
# Process and format results
processed_results <- process_simulation_results(
simulation_results = comprehensive_results$simulation_results,
mae_metric = mae_metric,
nclust = nclust,
all_attempts_by_metric = comprehensive_results$all_attempts
)
return(list(
successful_results = processed_results$successful_results,
best_result = processed_results$best_result,
all_attempts = processed_results$all_attempts,
success = TRUE,
nclust = nclust
))
}, error = function(e) {
if (verbose) cat(" ERROR with nclust", nclust, ":", e$message, "\n")
all_config_failure <- list()
for (metric in mae_metric) {
all_config_failure[[metric]] <- data.frame(
`Number of Cells` = nclust, k = "-", `Number of nearest neighbors` = "-",
mae = NA, status = paste0("ERROR: High-level error: ", e$message),
stringsAsFactors = FALSE, check.names = FALSE
)
}
return(list(
successful_results = NULL,
best_result = NULL,
all_attempts = all_config_failure,
success = FALSE,
error = e$message,
nclust = nclust
))
})
}
# Collect and format final optimization results
collect_final_results <- function(optimization_results, mae_metric, verbose) {
# Efficient result categorization
successful_results <- optimization_results[purrr::map_lgl(optimization_results, ~ .x$success %||% FALSE)]
failed_results <- optimization_results[purrr::map_lgl(optimization_results, ~ !(.x$success %||% FALSE))]
final_output <- list()
if (length(optimization_results) > 0) {
# Process each metric efficiently
for (metric in mae_metric) {
metric_key <- paste0(metric, "_mae")
# Collect successful results with consistent data types (5 columns only)
all_successful_metric <- purrr::map_dfr(optimization_results, function(x) {
# Get k,nn simulation results
sim_results <- if (!is.null(x$successful_results) && !is.null(x$successful_results[[metric]])) {
result <- x$successful_results[[metric]]
result$k <- as.character(result$k)
result$`Number of nearest neighbors` <- as.character(result$`Number of nearest neighbors`)
req <- c("Number of Cells","k","Number of nearest neighbors","mae","status")
result <- result[, intersect(names(result), req), drop = FALSE]
for (nm in setdiff(req, names(result))) {
if (nrow(result) > 0) {
result[[nm]] <- NA
} else {
result[[nm]] <- if (nm %in% c("Number of Cells")) integer(0) else if (nm %in% c("k", "Number of nearest neighbors", "status")) character(0) else numeric(0)
}
}
result[req]
} else {
data.frame(
`Number of Cells` = integer(0),
k = character(0),
`Number of nearest neighbors` = character(0),
mae = numeric(0),
status = character(0),
check.names = FALSE
)
}
# Get baseline results from all_attempts
baseline_results <- if (!is.null(x$all_attempts) && !is.null(x$all_attempts[[metric]])) {
tryCatch({
if (is.list(x$all_attempts[[metric]]) && !is.data.frame(x$all_attempts[[metric]])) {
attempts_list <- lapply(x$all_attempts[[metric]], function(df) {
if (is.data.frame(df) && nrow(df) > 0) {
df$k <- as.character(df$k)
df$`Number of nearest neighbors` <- as.character(df$`Number of nearest neighbors`)
}
df
})
attempts_df <- do.call(rbind, attempts_list)
} else {
attempts_df <- x$all_attempts[[metric]]
}
if (is.data.frame(attempts_df) && "status" %in% colnames(attempts_df)) {
success_rows <- attempts_df[grepl("^Successful Iteration", attempts_df$status), ]
if (nrow(success_rows) > 0) {
success_rows$k <- as.character(success_rows$k)
success_rows$`Number of nearest neighbors` <- as.character(success_rows$`Number of nearest neighbors`)
}
req <- c("Number of Cells","k","Number of nearest neighbors","mae","status")
success_rows <- success_rows[, intersect(names(success_rows), req), drop = FALSE]
for (nm in setdiff(req, names(success_rows))) success_rows[[nm]] <- NA
success_rows[req]
} else {
data.frame(
`Number of Cells` = integer(0),
k = character(0),
`Number of nearest neighbors` = character(0),
mae = numeric(0),
status = character(0),
check.names = FALSE
)
}
}, error = function(e) {
data.frame(
`Number of Cells` = integer(0),
k = character(0),
`Number of nearest neighbors` = character(0),
mae = numeric(0),
status = character(0),
check.names = FALSE
)
})
} else {
data.frame(
`Number of Cells` = integer(0),
k = character(0),
`Number of nearest neighbors` = character(0),
mae = numeric(0),
status = character(0),
check.names = FALSE
)
}
# Ensure both sim_results and baseline_results have consistent column types
if (nrow(sim_results) > 0) {
sim_results$k <- as.character(sim_results$k)
sim_results$`Number of nearest neighbors` <- as.character(sim_results$`Number of nearest neighbors`)
}
if (nrow(baseline_results) > 0) {
baseline_results$k <- as.character(baseline_results$k)
baseline_results$`Number of nearest neighbors` <- as.character(baseline_results$`Number of nearest neighbors`)
}
# Safe combine with consistent structures and drop duplicates
combined <- dplyr::bind_rows(sim_results, baseline_results)
combined <- dplyr::distinct(
combined,
.data$`Number of Cells`, .data$k, .data$`Number of nearest neighbors`, .data$mae, .data$status,
.keep_all = FALSE
)
combined
})
# Sort all_successful_metric by Number of Cells, k, and nn
if (nrow(all_successful_metric) > 0) {
# Convert to numeric for proper sorting, handling special cases
all_successful_metric$k_sort <- suppressWarnings({
ifelse(is.na(all_successful_metric$k) | all_successful_metric$k == "-", 999, as.numeric(all_successful_metric$k))
})
all_successful_metric$nn_sort <- suppressWarnings({
ifelse(is.na(all_successful_metric$`Number of nearest neighbors`) |
all_successful_metric$`Number of nearest neighbors` == "-", 999,
as.numeric(all_successful_metric$`Number of nearest neighbors`))
})
# Replace any NAs from conversion with 999
all_successful_metric$k_sort[is.na(all_successful_metric$k_sort)] <- 999
all_successful_metric$nn_sort[is.na(all_successful_metric$nn_sort)] <- 999
all_successful_metric <- all_successful_metric %>%
dplyr::arrange(.data$`Number of Cells`, .data$k_sort, .data$nn_sort) %>%
dplyr::select(-.data$k_sort, -.data$nn_sort)
}
# Collect best results per nclust
best_results_metric <- purrr::map_dfr(successful_results, function(x) {
if (!is.null(x$best_result) && !is.null(x$best_result[[metric]])) {
result <- x$best_result[[metric]]
result$k <- as.character(result$k)
result$`Number of nearest neighbors` <- as.character(result$`Number of nearest neighbors`)
result
} else {
data.frame()
}
})
# Enhanced all attempts collection with error handling
all_attempts_metric <- tryCatch({
purrr::map_dfr(optimization_results, function(x) {
if (!is.null(x$all_attempts) && !is.null(x$all_attempts[[metric]])) {
if (length(x$all_attempts[[metric]]) > 0) {
tryCatch({
if (is.list(x$all_attempts[[metric]]) && !is.data.frame(x$all_attempts[[metric]])) {
attempts_df <- do.call(rbind, x$all_attempts[[metric]])
} else {
attempts_df <- x$all_attempts[[metric]]
}
if (nrow(attempts_df) > 0) {
attempts_df$k <- as.character(attempts_df$k)
attempts_df$`Number of nearest neighbors` <- as.character(attempts_df$`Number of nearest neighbors`)
}
attempts_df
}, error = function(e) {
data.frame()
})
} else {
data.frame()
}
} else {
data.frame()
}
})
}, error = function(e) {
data.frame()
})
# Sort all results tables by Number of Cells, k, and nn
if (nrow(all_attempts_metric) > 0) {
# Convert to numeric for proper sorting, handling special cases
all_attempts_metric$k_sort <- suppressWarnings({
ifelse(is.na(all_attempts_metric$k) | all_attempts_metric$k == "-", 999, as.numeric(all_attempts_metric$k))
})
all_attempts_metric$nn_sort <- suppressWarnings({
ifelse(is.na(all_attempts_metric$`Number of nearest neighbors`) |
all_attempts_metric$`Number of nearest neighbors` == "-", 999,
as.numeric(all_attempts_metric$`Number of nearest neighbors`))
})
# Replace any NAs from conversion with 999
all_attempts_metric$k_sort[is.na(all_attempts_metric$k_sort)] <- 999
all_attempts_metric$nn_sort[is.na(all_attempts_metric$nn_sort)] <- 999
all_attempts_metric <- all_attempts_metric %>%
dplyr::arrange(.data$`Number of Cells`, .data$k_sort, .data$nn_sort) %>%
dplyr::select(-.data$k_sort, -.data$nn_sort)
}
if (nrow(best_results_metric) > 0) {
# Convert to numeric for proper sorting, handling special cases
best_results_metric$k_sort <- suppressWarnings({
ifelse(is.na(best_results_metric$k) | best_results_metric$k == "-", 999, as.numeric(best_results_metric$k))
})
best_results_metric$nn_sort <- suppressWarnings({
ifelse(is.na(best_results_metric$`Number of nearest neighbors`) |
best_results_metric$`Number of nearest neighbors` == "-", 999,
as.numeric(best_results_metric$`Number of nearest neighbors`))
})
# Replace any NAs from conversion with 999
best_results_metric$k_sort[is.na(best_results_metric$k_sort)] <- 999
best_results_metric$nn_sort[is.na(best_results_metric$nn_sort)] <- 999
best_results_metric <- best_results_metric %>%
dplyr::arrange(.data$`Number of Cells`, .data$k_sort, .data$nn_sort) %>%
dplyr::select(-.data$k_sort, -.data$nn_sort)
req <- c("Number of Cells","k","Number of nearest neighbors","mae","status")
best_results_metric <- best_results_metric[, intersect(names(best_results_metric), req), drop = FALSE]
for (nm in setdiff(req, names(best_results_metric))) best_results_metric[[nm]] <- NA
best_results_metric <- best_results_metric[req]
}
# Find overall best
overall_best_metric <- if (nrow(all_successful_metric) > 0) {
tryCatch({
best_row <- all_successful_metric %>%
dplyr::arrange(.data$mae) %>%
dplyr::slice(1)
best_row
}, error = function(e) {
data.frame()
})
} else {
data.frame()
}
# enforce 5-column shape everywhere
req <- c("Number of Cells","k","Number of nearest neighbors","mae","status")
shape <- function(df) {
  if (is.null(df) || !nrow(df)) {
    return(data.frame(
      `Number of Cells` = integer(0), k = character(0),
      `Number of nearest neighbors` = character(0),
      mae = numeric(0), status = character(0),
      check.names = FALSE
    ))
  }
  df <- df[, intersect(names(df), req), drop = FALSE]
  for (nm in setdiff(req, names(df))) df[[nm]] <- NA
  df[req]
}
final_output[[metric_key]] <- list(
successful_results = shape(all_successful_metric),
nclust_best_results = shape(best_results_metric),
overall_best = shape(overall_best_metric),
all_results = shape(all_attempts_metric)
)
}
}
# Print summary if verbose
if (verbose) {
cat("Optimization Summary:\n")
cat(" Total configurations tested:", length(optimization_results), "\n")
cat(" Successfully processed:", length(successful_results), "\n")
cat(" Failed:", length(failed_results), "\n")
for (metric in mae_metric) {
metric_key <- paste0(metric, "_mae")
all_results <- final_output[[metric_key]]$all_results
if (nrow(all_results) > 0) {
successful_count <- sum(grepl("^Successful Iteration", all_results$status), na.rm = TRUE)
total_count <- nrow(all_results)
success_rate <- round(100 * successful_count / total_count, 1)
cat(" ", metric, "success rate:", success_rate, "% (", successful_count, "/", total_count, ")\n")
if (nrow(final_output[[metric_key]]$overall_best) > 0) {
best <- final_output[[metric_key]]$overall_best
cat(" Best", metric, "MAE:", round(best$mae[1], 4),
"with", best$`Number of Cells`[1], "cells, k =", best$k[1], ", nn =", best$`Number of nearest neighbors`[1], "\n")
}
}
}
}
return(final_output)
}
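# Illustration (not run when sourced): the sentinel-sort used repeatedly above.
# Baseline rows store k and nn as "-", so both columns are coerced to numeric
# with 999 standing in for "-"/NA, rows are sorted, and the helper column dropped.
if (FALSE) {
  df <- data.frame(
    `Number of Cells` = c(10, 10, 10),
    k = c("-", "3", "2"),
    check.names = FALSE
  )
  df$k_sort <- suppressWarnings(ifelse(is.na(df$k) | df$k == "-", 999, as.numeric(df$k)))
  df[order(df$`Number of Cells`, df$k_sort), setdiff(names(df), "k_sort")]
  # Numeric k values come first; baseline "-" rows sort last.
}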
# ============================================================================
# HVT PROCESSING FUNCTIONS
# ============================================================================
# Score the complete dataset against a pre-computed HVT model
score_hvt_model <- function(complete_dataset, hvt_results) {
scoring <- scoreHVT(
complete_dataset, hvt_results,
analysis.plots = FALSE,
names.column = complete_dataset[, 1]
)
if (is.null(scoring) || is.null(scoring$scoredPredictedData)) {
stop("scoreHVT returned NULL or missing scoredPredictedData")
}
return(scoring)
}
# Prepare temporal data for MSM
prepare_temporal_data <- function(scoring, complete_dataset, time_column) {
scored_data <- scoring$scoredPredictedData
scored_data[[time_column]] <- as.POSIXct(complete_dataset[[time_column]])
# Reorder columns to put time first
time_col_pos <- which(names(scored_data) == time_column)
if (length(time_col_pos) == 1) {
other_cols <- names(scored_data)[-time_col_pos]
scored_data <- scored_data[, c(time_column, other_cols), drop = FALSE]
}
temporal_data_all <- scored_data %>% dplyr::select(dplyr::all_of(c(time_column, "Cell.ID")))
if (any(is.na(temporal_data_all[[time_column]]))) {
stop("Failed to parse time column")
}
return(temporal_data_all)
}
# Get transition probabilities from training data ONLY
get_transition_probabilities <- function(temporal_data_train, time_column) {
prob_trans_matx <- getTransitionProbability(
df = temporal_data_train, # ONLY training data
cellid_column = "Cell.ID",
time_column = time_column,
type = "with_self_state"
)
if (is.null(prob_trans_matx) || nrow(prob_trans_matx) == 0) {
stop("getTransitionProbability (train window) returned NULL or empty matrix")
}
return(prob_trans_matx)
}
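# Illustration (not run when sourced): the helpers in this file only reference
# the Current_State, Next_State and Transition_Probability columns of the
# transition table. A hand-rolled sketch of that shape from a toy Cell.ID
# series (the real computation is done by getTransitionProbability(), whose
# internals may differ):
if (FALSE) {
  states <- data.frame(t = 1:6, Cell.ID = c(1, 2, 2, 1, 3, 1))
  trans <- data.frame(
    Current_State = head(states$Cell.ID, -1),
    Next_State    = tail(states$Cell.ID, -1)
  )
  trans %>%
    dplyr::count(Current_State, Next_State, name = "n") %>%
    dplyr::group_by(Current_State) %>%
    dplyr::mutate(Transition_Probability = n / sum(n)) %>%
    dplyr::ungroup() %>%
    dplyr::select(Current_State, Next_State, Transition_Probability)
}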
# Get initial state for simulation
get_initial_state <- function(temporal_data_train, time_column) {
last_t_train <- max(temporal_data_train[[time_column]], na.rm = TRUE)
initial_state <- temporal_data_train$Cell.ID[temporal_data_train[[time_column]] == last_t_train][1]
return(initial_state)
}
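# Illustration (not run when sourced): the initial state is simply the Cell.ID
# observed at the last training timestamp (first match if there are ties).
if (FALSE) {
  toy <- data.frame(
    ts = as.POSIXct(c("2021-01-01", "2021-01-02", "2021-01-03")),
    Cell.ID = c(4, 7, 2)
  )
  get_initial_state(toy, time_column = "ts")  # returns 2
}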
# Commented-out legacy variant of prepare_centroid_data_for_mae() that also
# handled dependent variables (DV flow), kept for reference:
# prepare_centroid_data_for_mae <- function(dependent_variables, hvt_results, numeric_dataset,
# scoring, complete_dataset, raw_dataset_stats,
# mae_metric, centroid_2d_points) {
# if (is.null(dependent_variables)) {
# # Normal flow: align to training variables only
# centroid_data_for_mae <- hvt_results[[3]]$summary %>%
# dplyr::select(dplyr::any_of(colnames(numeric_dataset)))
# } else {
# # DV flow: build normalized per-state centroids for DVs
# assign_df <- scoring$scoredPredictedData %>% dplyr::select("Cell.ID")
# dv_df <- cbind(assign_df, complete_dataset[, dependent_variables, drop = FALSE])
#
# mu_vec <- as.numeric(raw_dataset_stats$mean[1, dependent_variables, drop = TRUE])
# sd_vec <- as.numeric(raw_dataset_stats$sd[1, dependent_variables, drop = TRUE])
# sd_vec[is.na(sd_vec) | sd_vec == 0] <- 1
#
# dv_norm <- as.data.frame(Map(function(col, m, s) (col - m)/s,
# dv_df[dependent_variables], mu_vec, sd_vec))
# dv_norm$Cell.ID <- dv_df[["Cell.ID"]]
#
# agg_fun <- switch(mae_metric[1],
# mean = function(x) mean(x, na.rm = TRUE),
# median = function(x) median(x, na.rm = TRUE),
# mode = function(x) {
# tb <- table(x);
# as.numeric(names(tb)[which.max(tb)])
# })
#
# dv_state_map_scaled <- dv_norm %>%
# dplyr::group_by(.data$Cell.ID) %>%
# dplyr::summarise(dplyr::across(dplyr::all_of(dependent_variables), ~ agg_fun(.x)), .groups = "drop")
#
# all_cells <- data.frame(Cell.ID = centroid_2d_points[["Cell.ID"]])
# dv_state_map_scaled <- dplyr::left_join(all_cells, dv_state_map_scaled, by = "Cell.ID")
#
# for (nm in dependent_variables) {
# dv_state_map_scaled[[nm]][is.na(dv_state_map_scaled[[nm]])] <- 0
# }
#
# centroid_data_for_mae <- dv_state_map_scaled %>% dplyr::select(-"Cell.ID")
# }
#
# return(centroid_data_for_mae)
# }
# Prepare centroid data for MAE calculation
prepare_centroid_data_for_mae <- function(hvt_results, numeric_dataset,
scoring, complete_dataset, raw_dataset_stats,
mae_metric, centroid_2d_points) {
# Normal flow: align to training variables only
centroid_data_for_mae <- hvt_results[[3]]$summary %>%
dplyr::select(dplyr::any_of(colnames(numeric_dataset)))
return(centroid_data_for_mae)
}
# Initialize attempts storage for each metric
initialize_attempts_storage <- function(mae_metric) {
all_attempts_by_metric <- list()
for (metric in mae_metric) {
all_attempts_by_metric[[metric]] <- list()
}
return(all_attempts_by_metric)
}
# ============================================================================
# SIMULATION FUNCTIONS
# ============================================================================
# Run MSM simulation directly for optimization - MSM handles all validation
run_single_simulation <- function(k_val, n_nn, cluster_data, transition_matrix,
temporal_data_train, scoring, hvt_results, initial_state,
problematic_states, centroid_data_for_mae, mae_metrics,
raw_dataset_stats, time_column, entire_dataset,
expost_forecasting, num_simulations, time_values, n_ahead) {
# Create temporal data properly for MSM
scored_data <- scoring$scoredPredictedData
complete_dataset <- rbind(entire_dataset, expost_forecasting)
scored_data[[time_column]] <- as.POSIXct(complete_dataset[[time_column]])
temporal_data_all <- scored_data %>% dplyr::select(dplyr::all_of(c(time_column, "Cell.ID")))
tryCatch({
# Set optimization flag for hvt_msm_opt
options(hvt.msm.optimization = TRUE)
# Let MSM handle ALL validation and clustering - no pre-validation
msm_result <- msm(
state_time_data = temporal_data_all,
forecast_type = "ex-post",
transition_probability_matrix = transition_matrix,
initial_state = initial_state,
num_simulations = num_simulations,
scoreHVT_results = scoring,
trainHVT_results = hvt_results,
actual_data = expost_forecasting,
raw_dataset = complete_dataset,
k = k_val,
handle_problematic_states = length(problematic_states) > 0,
n_nearest_neighbor = n_nn,
mae_metric = mae_metrics[1],
show_simulation = FALSE,
time_column = time_column,
precomputed_problematic_states = problematic_states,
plot_mode = "mae-only"
)
# Extract MAE from MSM results
mae_results <- extract_mae_from_msm_results(msm_result, mae_metrics)
# Cleanup heavy objects
rm(msm_result, temporal_data_all, scored_data)
gc(verbose = FALSE)
# Return success with models
if (!is.null(mae_results) && !is.na(mae_results[[mae_metrics[1]]])) {
return(list(
success = TRUE,
k = k_val,
nn = n_nn,
mae_results = mae_results
))
} else {
return(list(success = FALSE, k = k_val, nn = n_nn, error = "MAE extraction failed"))
}
}, error = function(e) {
# MSM failed (could be validation, clustering, or simulation failure)
gc(verbose = FALSE)
return(list(success = FALSE, k = k_val, nn = n_nn, error = as.character(e$message)))
})
}
# Extract MAE values from MSM results
extract_mae_from_msm_results <- function(msm_result, mae_metrics) {
mae_results <- list()
# New compact structure from mae-only path
if (!is.null(msm_result$mae_by_metric)) {
for (metric in mae_metrics) {
mm <- msm_result$mae_by_metric[[metric]]
if (!is.null(mm)) {
centroid_maes <- sapply(mm$centroid_mae_list, function(x) x[["mae"]])
mae_results[[metric]] <- mean(centroid_maes, na.rm = TRUE)
} else {
mae_results[[metric]] <- NA
}
}
return(mae_results)
}
# Backward-compatible path (plots-based)
for (metric in mae_metrics) {
if (!is.null(msm_result$plots) && length(msm_result$plots) >= 2) {
centroid_maes <- sapply(msm_result$plots[[1]], function(x) x[["mae"]])
mae_results[[metric]] <- mean(centroid_maes, na.rm = TRUE)
} else {
mae_results[[metric]] <- NA
}
}
mae_results
}
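# Illustration (not run when sourced): the compact result path assumes
# msm_result$mae_by_metric[[metric]]$centroid_mae_list is a list whose elements
# each carry an "mae" entry; the reported value is their mean. A mocked
# structure of that assumed shape:
if (FALSE) {
  msm_mock <- list(mae_by_metric = list(
    mean = list(centroid_mae_list = list(list(mae = 0.12), list(mae = 0.18)))
  ))
  extract_mae_from_msm_results(msm_mock, mae_metrics = "mean")
  # $mean is 0.15, the average of the per-centroid MAEs
}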
# Run baseline simulation using MSM without problematic state handling
run_baseline_simulation <- function(transition_matrix, temporal_data_train, scoring, hvt_results,
initial_state, centroid_data_for_mae, mae_metrics, raw_dataset_stats,
time_column, entire_dataset, expost_forecasting, num_simulations,
time_values, n_ahead) {
# Create temporal data for MSM
scored_data <- scoring$scoredPredictedData
complete_dataset <- rbind(entire_dataset, expost_forecasting)
scored_data[[time_column]] <- as.POSIXct(complete_dataset[[time_column]])
temporal_data_all <- scored_data %>%
dplyr::select(dplyr::all_of(c(time_column, "Cell.ID")))
tryCatch({
# Set optimization flag for hvt_msm_opt
options(hvt.msm.optimization = TRUE)
# Baseline MSM call: no problematic state handling
msm_result <- msm(
state_time_data = temporal_data_all,
forecast_type = "ex-post",
transition_probability_matrix = transition_matrix,
initial_state = initial_state,
num_simulations = num_simulations,
scoreHVT_results = scoring,
trainHVT_results = hvt_results,
actual_data = expost_forecasting,
raw_dataset = complete_dataset,
handle_problematic_states = FALSE, # Baseline = no problematic state handling
mae_metric = mae_metrics[1],
show_simulation = FALSE,
time_column = time_column,
plot_mode = "mae-only"
)
# Extract MAE from MSM results
mae_results <- extract_mae_from_msm_results(msm_result, mae_metrics)
# Cleanup
rm(msm_result, temporal_data_all, scored_data)
gc(verbose = FALSE)
return(mae_results)
}, error = function(e) {
# Return NA for all metrics if baseline fails
mae_results <- list()
for (metric in mae_metrics) {
mae_results[[metric]] <- NA
}
return(mae_results)
})
}
# Process all k-nn combinations with pre-filtering checkpoint for invalid combinations
process_all_k_nn_combinations <- function(k_local_range, nn_range, problematic_states,
centroid_2d_points, clustering_data, hvt_results,
scoring, nclust, mae_metric, all_attempts_by_metric,
prob_trans_matx, temporal_data_complete, initial_state,
centroid_data_for_mae, raw_dataset_stats, time_column,
entire_dataset, expost_forecasting, num_simulations,
time_values, n_ahead, parallel,
verbose = FALSE) {
# Generate ALL possible k-nn combinations
all_combinations <- expand.grid(k = k_local_range, nn = nn_range)
if (verbose) {
cat(" Total k-nn combinations:", nrow(all_combinations), "\n")
}
# CHECKPOINT: Pre-filter mathematically invalid combinations
valid_combinations <- list()
invalid_combinations <- list()
for (i in seq_len(nrow(all_combinations))) {
k_val <- all_combinations$k[i]
nn_val <- all_combinations$nn[i]
# Check 1: k must be less than or equal to number of cells
if (k_val > nclust) {
invalid_combinations[[length(invalid_combinations) + 1]] <- list(
k = k_val,
nn = nn_val,
status = paste0("Mathematical bound reached - Invalid k (", k_val, ") for given n_cells (", nclust, ")")
)
next
}
# Check 2: nn must be <= (n_cells - k), the mathematical upper bound;
# e.g. with n_cells = 10 and k = 4, nn can be at most 6 (see the sketch after this function)
max_possible_nn <- nclust - k_val
if (nn_val > max_possible_nn) {
invalid_combinations[[length(invalid_combinations) + 1]] <- list(
k = k_val,
nn = nn_val,
status = paste0("Mathematical bound reached - Invalid nn (", nn_val, ") for given n_cells (", nclust, ") and k (", k_val, ")")
)
next
}
# Passed both checks - valid combination
valid_combinations[[length(valid_combinations) + 1]] <- list(
k = k_val,
nn = nn_val,
status = "ready_for_msm",
valid = TRUE
)
}
# Store invalid combinations in all_attempts_by_metric
if (length(invalid_combinations) > 0) {
for (combo in invalid_combinations) {
for (metric in mae_metric) {
idx <- length(all_attempts_by_metric[[metric]]) + 1
all_attempts_by_metric[[metric]][[idx]] <- data.frame(
`Number of Cells` = nclust,
k = as.character(combo$k),
`Number of nearest neighbors` = as.character(combo$nn),
mae = NA,
status = combo$status, # "Mathematical bound reached ..." message from the pre-filter above
stringsAsFactors = FALSE,
check.names = FALSE
)
}
}
}
if (verbose) {
cat(" Valid combinations:", length(valid_combinations), "\n")
cat(" Pre-filtered (invalid):", length(invalid_combinations), "\n")
}
# Run MSM simulations for VALID combinations only (invalid ones already filtered)
simulation_results <- list()
if (length(valid_combinations) > 0) {
# Simulation function for valid combinations
simulation_function <- function(combo) {
# Call run_single_simulation which calls MSM
result <- run_single_simulation(
k_val = combo$k, n_nn = combo$nn, cluster_data = NULL,
transition_matrix = prob_trans_matx, temporal_data_train = NULL,
scoring = scoring, hvt_results = hvt_results, initial_state = initial_state,
problematic_states = problematic_states, centroid_data_for_mae = NULL,
mae_metrics = mae_metric, raw_dataset_stats = NULL,
time_column = time_column, entire_dataset = entire_dataset,
expost_forecasting = expost_forecasting, num_simulations = num_simulations,
time_values = NULL, n_ahead = NULL
)
return(result)
}
# Execute ALL combinations
if (parallel) {
if (!requireNamespace("furrr", quietly = TRUE) || !requireNamespace("future", quietly = TRUE)) {
stop("Packages 'furrr' and 'future' are required for parallel processing. ",
"Please install them with: install.packages(c('furrr', 'future'))")
}
simulation_results <- furrr::future_map(valid_combinations, simulation_function,
.options = furrr::furrr_options(seed = TRUE, packages = c("dplyr", "data.table")))
} else {
simulation_results <- purrr::map(valid_combinations, simulation_function)
}
if (verbose) {
successful_count <- sum(sapply(simulation_results, function(x) x$success %||% FALSE))
cat(" Completed:", length(simulation_results), "simulations,", successful_count, "successful\n")
}
}
return(list(simulation_results = simulation_results, all_attempts = all_attempts_by_metric))
}
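# Illustration (not run when sourced): the pre-filter above keeps only
# combinations with k <= n_cells and nn <= n_cells - k. For example, with
# n_cells = 10 and k = 4, nn can be at most 6.
if (FALSE) {
  nclust <- 10
  combos <- expand.grid(k = 2:6, nn = 4:8)
  keep <- combos$k <= nclust & combos$nn <= (nclust - combos$k)
  valid <- combos[keep, ]
  invalid <- combos[!keep, ]
  nrow(valid); nrow(invalid)  # e.g. (k = 6, nn = 8) is invalid because 8 > 10 - 6
}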
# Process simulation results - lightweight version
process_simulation_results <- function(simulation_results, mae_metric, nclust, all_attempts_by_metric) {
if (length(simulation_results) == 0) {
formatted_results <- list()
best_results <- list()
for (metric in mae_metric) {
formatted_results[[metric]] <- data.frame()
best_results[[metric]] <- data.frame()
}
return(list(
successful_results = formatted_results,
best_result = best_results,
all_attempts = all_attempts_by_metric
))
}
formatted_results <- list()
best_results <- list()
successful_results_list <- list()
# Initialize lists for each metric
for (metric in mae_metric) {
successful_results_list[[metric]] <- list()
}
# Process each simulation result
for (result in simulation_results) {
if (result$success && !is.null(result$mae_results)) {
# Process successful results
for (metric in mae_metric) {
# Add to all attempts
idx <- length(all_attempts_by_metric[[metric]]) + 1
all_attempts_by_metric[[metric]][[idx]] <- data.frame(
`Number of Cells` = nclust,
k = as.character(result$k),
`Number of nearest neighbors` = as.character(result$nn),
mae = round(result$mae_results[[metric]], 4),
status = "Successful Iteration by Handling Problematic state",
stringsAsFactors = FALSE,
check.names = FALSE
)
# Add to successful results WITHOUT models
success_idx <- length(successful_results_list[[metric]]) + 1
successful_results_list[[metric]][[success_idx]] <- data.frame(
`Number of Cells` = nclust,
k = as.character(result$k),
`Number of nearest neighbors` = as.character(result$nn),
mae = round(result$mae_results[[metric]], 4),
status = "Successful Iteration by Handling Problematic state",
stringsAsFactors = FALSE,
check.names = FALSE
)
}
} else {
# Process failed results
for (metric in mae_metric) {
idx <- length(all_attempts_by_metric[[metric]]) + 1
status_msg <- if (!is.null(result$error)) {
result$error # Message comes as-is from MSM (already formatted)
} else {
"ERROR: MSM failed"
}
all_attempts_by_metric[[metric]][[idx]] <- data.frame(
`Number of Cells` = nclust,
k = as.character(result$k),
`Number of nearest neighbors` = as.character(result$nn),
mae = NA,
status = status_msg,
stringsAsFactors = FALSE,
check.names = FALSE
)
}
}
}
# Format successful results and find best combinations WITHOUT models
for (metric in mae_metric) {
if (length(successful_results_list[[metric]]) > 0) {
# Extract configs and de-duplicate strictly on the 5 columns
configs <- purrr::map_dfr(successful_results_list[[metric]], ~ .x)
configs <- dplyr::distinct(configs, `Number of Cells`, k, `Number of nearest neighbors`, mae, status, .keep_all = FALSE)
# Ensure numeric conversion for sorting
configs$k_numeric <- as.numeric(configs$k)
configs$nn_numeric <- as.numeric(configs$`Number of nearest neighbors`)
configs$mae_numeric <- as.numeric(configs$mae)
# Deduplicate and sort, keep only required 5 cols
configs <- dplyr::distinct(configs, .data$`Number of Cells`, .data$k, .data$`Number of nearest neighbors`, .data$mae, .keep_all = TRUE)
sort_order <- order(configs$`Number of Cells`, configs$k_numeric, configs$nn_numeric)
formatted <- configs[sort_order, c("Number of Cells","k","Number of nearest neighbors","mae","status")]
formatted_results[[metric]] <- formatted
# Find best combination WITHOUT storing models
best_idx <- which.min(configs$mae_numeric)
best_combination <- configs[best_idx, ] %>%
dplyr::select(-.data$k_numeric, -.data$nn_numeric, -.data$mae_numeric)
best_results[[metric]] <- best_combination[, c("Number of Cells","k","Number of nearest neighbors","mae","status")]
} else {
formatted_results[[metric]] <- data.frame()
best_results[[metric]] <- data.frame()
}
}
# Sort all attempts
for (metric in mae_metric) {
if (length(all_attempts_by_metric[[metric]]) > 0) {
combined_attempts <- do.call(rbind, all_attempts_by_metric[[metric]])
# Sort by cells, k, nn
combined_attempts$k_sort <- suppressWarnings(
  ifelse(combined_attempts$k == "-", 999, as.numeric(combined_attempts$k))
)
combined_attempts$nn_sort <- suppressWarnings(
  ifelse(combined_attempts$`Number of nearest neighbors` == "-", 999,
         as.numeric(combined_attempts$`Number of nearest neighbors`))
)
combined_attempts <- combined_attempts %>%
dplyr::arrange(.data$`Number of Cells`, .data$k_sort, .data$nn_sort) %>%
dplyr::select(-.data$k_sort, -.data$nn_sort)
# Convert back to list format
all_attempts_by_metric[[metric]] <- split(combined_attempts, seq_len(nrow(combined_attempts)))
all_attempts_by_metric[[metric]] <- lapply(all_attempts_by_metric[[metric]], as.data.frame)
}
}
gc(verbose = FALSE)
return(list(
successful_results = formatted_results,
best_result = best_results,
all_attempts = all_attempts_by_metric
))
}
# ============================================================================
# PROBLEMATIC STATE DETECTION FUNCTIONS
# ============================================================================
# Detect all types of problematic states in a transition matrix
detect_problematic_states <- function(prob_trans_matx, scored_cells) {
# Missing states: states in scored_cells but not in transition matrix
transition_states <- unique(prob_trans_matx$Current_State)
missing_states <- setdiff(scored_cells, transition_states)
# Self-transition states (100% self-transition)
self_transition_states <- prob_trans_matx %>%
dplyr::group_by(.data$Current_State) %>%
dplyr::summarize(
is_self_only = dplyr::n() == 1 && all(.data$Current_State == .data$Next_State),
.groups = 'drop'
) %>%
dplyr::filter(.data$is_self_only == TRUE) %>%
dplyr::pull(.data$Current_State)
# Cyclic states detection
cyclic_states <- find_cyclic_states(prob_trans_matx)
# Combine all problematic states
all_problematic <- unique(c(missing_states, self_transition_states, cyclic_states))
return(list(
missing_states = missing_states,
self_transition_states = self_transition_states,
cyclic_states = cyclic_states,
all_problematic = all_problematic
))
}
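# Illustration (not run when sourced): a toy transition table where state 3
# only ever transitions to itself (self-transition state) and state 5 was
# scored but never appears as a Current_State (missing state).
if (FALSE) {
  toy_trans <- data.frame(
    Current_State = c(1, 1, 2, 3),
    Next_State    = c(2, 3, 1, 3),
    Transition_Probability = c(0.5, 0.5, 1, 1)
  )
  detect_problematic_states(toy_trans, scored_cells = c(1, 2, 3, 5))
  # missing_states = 5, self_transition_states = 3, no cyclic states
}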
# Detect cyclic states in a transition matrix
find_cyclic_states <- function(trans_table) {
all_states <- unique(c(trans_table$Current_State, trans_table$Next_State))
state_info <- trans_table %>%
dplyr::group_by(.data$Current_State) %>%
dplyr::summarize(next_states = list(unique(.data$Next_State)), .groups = 'drop')
adjacency <- setNames(lapply(state_info$next_states, unlist), as.character(state_info$Current_State))
two_way_cycles <- c()
one_way_traps <- c()
processed_pairs <- character(0)
for (state_a in all_states) {
if (!as.character(state_a) %in% names(adjacency)) next
next_states_a <- adjacency[[as.character(state_a)]]
other_states_a <- next_states_a[next_states_a != state_a]
for (state_b in other_states_a) {
pair_key <- paste(sort(c(state_a, state_b)), collapse = "-")
if (pair_key %in% processed_pairs) next
processed_pairs <- c(processed_pairs, pair_key)
if (!as.character(state_b) %in% names(adjacency)) next
next_states_b <- adjacency[[as.character(state_b)]]
states_in_system <- c(state_a, state_b)
reaches_outside_a <- setdiff(next_states_a, states_in_system)
reaches_outside_b <- setdiff(next_states_b, states_in_system)
if (length(reaches_outside_a) == 0 && length(reaches_outside_b) == 0) {
a_can_reach_b <- state_b %in% next_states_a
b_can_reach_a <- state_a %in% next_states_b
if (a_can_reach_b && b_can_reach_a) two_way_cycles <- c(two_way_cycles, state_a, state_b)
else if (a_can_reach_b && !b_can_reach_a) {
if (length(next_states_b) == 1 && next_states_b[1] == state_b) one_way_traps <- c(one_way_traps, state_a, state_b)
} else if (!a_can_reach_b && b_can_reach_a) {
if (length(next_states_a) == 1 && next_states_a[1] == state_a) one_way_traps <- c(one_way_traps, state_a, state_b)
}
}
}
}
unique(c(two_way_cycles, one_way_traps))
}
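# Illustration (not run when sourced): states 4 and 5 form a closed two-way
# cycle (each reaches only the other), so both are flagged; state 1 also
# reaches state 2 outside the pair, so it is not.
if (FALSE) {
  cyc <- data.frame(
    Current_State = c(1, 1, 4, 5),
    Next_State    = c(2, 4, 5, 4)
  )
  find_cyclic_states(cyc)  # returns 4 and 5
}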
# ============================================================================
# MISSING MSM HELPER FUNCTIONS
# ============================================================================
# Validate MSM clustering for problematic states
validate_msm_clustering <- function(cluster_data, problematic_states, n_nearest_neighbor,
centroid_2d_points, verbose = TRUE) {
if (length(problematic_states) == 0) {
return(list(is_valid = TRUE, message = "No problematic states found"))
}
validation_issues <- c()
for (prob_state in problematic_states) {
prob_cluster <- cluster_data$cluster[cluster_data$Cell.ID == prob_state]
cluster_members <- cluster_data$Cell.ID[cluster_data$cluster == prob_cluster]
valid_neighbors <- setdiff(cluster_members, c(prob_state, problematic_states))
if (verbose) {
cat("Problematic state", prob_state, "in cluster", prob_cluster,
"- Valid neighbors:", length(valid_neighbors), "\n")
}
if (length(valid_neighbors) == 0) {
validation_issues <- c(validation_issues,
paste0("State ", prob_state, " has no valid neighbors in cluster ", prob_cluster))
} else if (length(valid_neighbors) < n_nearest_neighbor) {
validation_issues <- c(validation_issues,
paste0("State ", prob_state, " has only ", length(valid_neighbors),
" valid neighbors, but ", n_nearest_neighbor, " requested"))
}
}
if (length(validation_issues) > 0) {
return(list(
is_valid = FALSE,
message = paste(validation_issues, collapse = "; ")
))
}
list(is_valid = TRUE, message = "MSM clustering validation passed")
}
# Choose a nearest non-problematic neighbor for a problematic state
find_nearest_neighbor_enhanced <- function(problematic_state,
centroid_2d_points,
cluster_data,
problematic_states,
n_nearest_neighbor,
scored_cells,
initial_state) {
if (is.null(cluster_data)) {
valid_states <- setdiff(scored_cells, problematic_states)
# Index into valid_states so sample() does not fall back to sampling 1:n when only one state remains
if (length(valid_states) > 0) return(valid_states[sample(length(valid_states), 1)])
warning("No valid fallback states available, returning initial state")
return(initial_state)
}
current_cluster <- cluster_data$cluster[cluster_data$Cell.ID == problematic_state]
coords <- centroid_2d_points[centroid_2d_points$Cell.ID %in%
cluster_data$Cell.ID[cluster_data$cluster == current_cluster], ]
current_coords <- coords[coords$Cell.ID == problematic_state, c("x", "y")]
distances <- sqrt((coords$x - current_coords$x)^2 + (coords$y - current_coords$y)^2)
valid_neighbors <- coords$Cell.ID[!coords$Cell.ID %in% c(problematic_state, problematic_states)]
if (length(valid_neighbors) == 0) {
warning(paste0("No valid neighbors for problematic state ", problematic_state,
" in cluster ", current_cluster, ". Using global fallback."))
all_coords <- centroid_2d_points
all_distances <- sqrt((all_coords$x - current_coords$x)^2 +
(all_coords$y - current_coords$y)^2)
global_valid <- setdiff(all_coords$Cell.ID, c(problematic_state, problematic_states))
if (length(global_valid) > 0) {
global_neighbor_distances <- all_distances[all_coords$Cell.ID %in% global_valid]
closest_global <- global_valid[which.min(global_neighbor_distances)]
return(closest_global)
} else {
warning("No valid neighbors globally available, returning initial state")
return(initial_state)
}
}
neighbor_distances <- distances[coords$Cell.ID %in% valid_neighbors]
num_neighbors <- min(length(valid_neighbors), n_nearest_neighbor)
nearest_n <- valid_neighbors[order(neighbor_distances)][1:num_neighbors]
neighbor_probs <- 1/neighbor_distances[order(neighbor_distances)][1:num_neighbors]
neighbor_probs <- neighbor_probs/sum(neighbor_probs)
random_shock <- round(runif(1, min = 0, max = 1), 4)
cumulative_probs <- cumsum(neighbor_probs)
nearest_n[which.max(random_shock <= cumulative_probs)]
}
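# Illustration (not run when sourced): the neighbor replacement above draws one
# of the n nearest valid neighbours with probability proportional to inverse
# distance. A standalone sketch of that weighting and draw:
if (FALSE) {
  set.seed(1)
  dists <- c(a = 1, b = 2, c = 4)          # distances to candidate neighbours
  probs <- (1 / dists) / sum(1 / dists)    # inverse-distance weights
  cum_p <- cumsum(probs)
  shock <- round(runif(1), 4)              # uniform random shock
  names(dists)[which.max(shock <= cum_p)]  # closer neighbours are more likely
}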
# Single simulation step for MSM
simulate_step_enhanced <- function(i, current_state, random_val,
transition_probability_matrix,
centroid_2d_points,
cluster_data,
problematic_states,
n_nearest_neighbor,
scored_cells,
initial_state) {
random_shock <- if (is.null(random_val)) round(runif(1, min = 0, max = 1), 4) else random_val
if (i == 1) return(initial_state)
# Handle problematic states only if handle_problematic_states = TRUE (indicated by cluster_data being non-NULL)
# If cluster_data is NULL, just follow the transition matrix normally, even for problematic states
if (!is.null(cluster_data) && current_state %in% problematic_states) {
return(find_nearest_neighbor_enhanced(current_state, centroid_2d_points, cluster_data,
problematic_states, n_nearest_neighbor, scored_cells, initial_state))
}
transition <- subset(transition_probability_matrix, Current_State == current_state)
if (nrow(transition) == 0) {
if (current_state %in% scored_cells) return(current_state)
else return(find_nearest_neighbor_enhanced(current_state, centroid_2d_points, cluster_data,
problematic_states, n_nearest_neighbor, scored_cells, initial_state))
}
transition <- transition[order(-transition$Transition_Probability), ]
transition$cumulative_prob <- cumsum(transition$Transition_Probability)
next_state <- transition$Next_State[which.max(random_shock <= transition$cumulative_prob)]
next_state <- as.numeric(next_state)
# Replace problematic states only if handle_problematic_states = TRUE (cluster_data is non-NULL)
# If handle_problematic_states = FALSE, follow transition matrix even if it leads to problematic states
if (!is.null(cluster_data) && next_state %in% problematic_states) {
return(find_nearest_neighbor_enhanced(next_state, centroid_2d_points, cluster_data,
problematic_states, n_nearest_neighbor, scored_cells, initial_state))
}
# Always replace invalid states (not in scored_cells)
if (!next_state %in% scored_cells) {
return(find_nearest_neighbor_enhanced(next_state, centroid_2d_points, cluster_data,
problematic_states, n_nearest_neighbor, scored_cells, initial_state))
}
next_state
}
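# Illustration (not run when sourced): the ordinary (non-problematic) step is a
# cumulative-probability draw over the current state's transition rows, sorted
# by decreasing probability.
if (FALSE) {
  row_set <- data.frame(
    Next_State = c(7, 3, 9),
    Transition_Probability = c(0.2, 0.5, 0.3)
  )
  row_set <- row_set[order(-row_set$Transition_Probability), ]
  row_set$cumulative_prob <- cumsum(row_set$Transition_Probability)
  shock <- 0.65
  row_set$Next_State[which.max(shock <= row_set$cumulative_prob)]
  # 0.65 falls in (0.5, 0.8], so the draw lands on the second-most-likely state (9)
}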
# Simulate a full MSM sequence
simulate_sequence_enhanced <- function(sim_index, n_ahead,
initial_state, transition_probability_matrix,
centroid_2d_points, cluster_data, problematic_states,
n_nearest_neighbor, scored_cells) {
current_state <- initial_state
simulated_values <- sapply(1:n_ahead, function(i) {
random_val <- NULL
current_state <<- simulate_step_enhanced(i, current_state, random_val,
transition_probability_matrix,
centroid_2d_points, cluster_data,
problematic_states, n_nearest_neighbor,
scored_cells, initial_state)
current_state
})
as.numeric(simulated_values)
}
# Adjust invalid initial state using neighbors
adjust_initial_state <- function(initial_state, transition_cells, problematic_states,
scored_cells, centroid_2d_points, cluster_data,
n_nearest_neighbor) {
if (!(initial_state %in% transition_cells) || initial_state %in% problematic_states) {
return(find_nearest_neighbor_enhanced(initial_state, centroid_2d_points, cluster_data,
problematic_states, n_nearest_neighbor,
scored_cells, initial_state))
}
initial_state
}