inst/doc/Comprehensive_Evaluation.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  echo = TRUE, 
  warning = FALSE, 
  message = FALSE,
  fig.width = 8, 
  fig.height = 6,
  collapse = TRUE,
  comment = "#>",
  cache = FALSE
)
library(topolow)

# Function to check and load packages with informative messages
check_and_load_package <- function(package_name) {
  if (!requireNamespace(package_name, quietly = TRUE)) {
    message("Package ", package_name, " not found. Please install it with:")
    message("install.packages('", package_name, "')")
    return(FALSE)
  } else {
    library(package_name, character.only = TRUE)
    return(TRUE)
  }
}

# Load required libraries
required_packages <- c("topolow", "ggplot2", "dplyr", "reshape2", 
                      "scales", "MASS")

# Check for optional packages
optional_packages <- c("smacof", "RANN")

missing_packages <- character(0)
for(pkg in required_packages) {
  if(!check_and_load_package(pkg)) {
    missing_packages <- c(missing_packages, pkg)
  }
}

# Load optional packages quietly
for(pkg in optional_packages) {
  check_and_load_package(pkg)
}

if(length(missing_packages) > 0) {
  stop("Please install missing packages: ", paste(missing_packages, collapse = ", "))
}

# Set seed for reproducibility
set.seed(42)

## ----utility_functions, eval=FALSE--------------------------------------------
# # Enhanced statistical testing with effect sizes
# enhanced_statistical_comparison <- function(topolow_errors, other_errors, method_name) {
#   valid_topolow <- topolow_errors[!is.na(topolow_errors)]
#   valid_other <- other_errors[!is.na(other_errors)]
# 
#   if(length(valid_other) < 2 || length(valid_topolow) < 2) {
#     cat("Insufficient data for statistical comparison with", method_name, "\n")
#     return(NULL)
#   }
# 
#   # T-test
#   t_result <- t.test(valid_topolow, valid_other)
# 
#   # Effect size (Cohen's d)
#   pooled_sd <- sqrt(((length(valid_topolow) - 1) * var(valid_topolow) +
#                      (length(valid_other) - 1) * var(valid_other)) /
#                     (length(valid_topolow) + length(valid_other) - 2))
# 
#   cohens_d <- (mean(valid_topolow) - mean(valid_other)) / pooled_sd
# 
#   cat("\n--- Statistical Comparison (Topolow vs", method_name, ") ---\n")
#   cat("- Welch's t-statistic:", round(t_result$statistic, 3), "\n")
#   cat("- p-value:", format(t_result$p.value, scientific = TRUE, digits = 3), "\n")
#   cat("- Cohen's d (effect size):", round(cohens_d, 3), "\n")
#   cat("- Effect size interpretation:",
#       if(abs(cohens_d) < 0.2) "Negligible"
#       else if(abs(cohens_d) < 0.5) "Small"
#       else if(abs(cohens_d) < 0.8) "Medium"
#       else "Large", "\n")
# 
#   return(list(t_test = t_result, cohens_d = cohens_d))
# }
# 
# # Calculate data quality metrics
# calculate_data_quality_metrics <- function(distance_matrix) {
#   total_possible <- nrow(distance_matrix) * (nrow(distance_matrix) - 1) / 2
#   total_available <- sum(!is.na(distance_matrix[upper.tri(distance_matrix)]))
#   completeness <- total_available / total_possible
# 
#   available_distances <- distance_matrix[!is.na(distance_matrix)]
# 
#   metrics <- list(
#     completeness = completeness,
#     n_objects = nrow(distance_matrix),
#     n_available_distances = total_available,
#     distance_range = range(available_distances),
#     distance_mean = mean(available_distances),
#     distance_sd = sd(available_distances),
#     distance_cv = sd(available_distances) / mean(available_distances)
#   )
#   return(metrics)
# }
# 
# # Coordinate validation
# validate_coordinates <- function(coords, method_name, n_objects, target_dims) {
#   if(is.null(coords)) {
#     cat("WARNING:", method_name, "returned NULL coordinates\n")
#     return(FALSE)
#   }
#   if(any(!is.finite(coords))) {
#     cat("WARNING:", method_name, "has non-finite coordinates\n")
#     return(FALSE)
#   }
#   if(nrow(coords) != n_objects || ncol(coords) != target_dims) {
#     cat("WARNING:", method_name, "has incorrect dimensions\n")
#     return(FALSE)
#   }
#   return(TRUE)
# }
# 
# # Missing data imputation
# improved_missing_data_imputation <- function(dist_matrix) {
#   available_distances <- dist_matrix[!is.na(dist_matrix)]
#   if(length(available_distances) == 0) {
#     median_distance <- 1.0
#   } else {
#     median_distance <- median(available_distances, na.rm = TRUE)
#   }
#   dist_matrix[is.na(dist_matrix)] <- median_distance
#   return(list(matrix = dist_matrix, imputation_value = median_distance))
# }

## ----distorted_euclidean_generator, eval=FALSE--------------------------------
# #' Generate Non-Euclidean Data by Distorting Clustered Points
# #'
# #' Creates non-Euclidean data through systematic distortion of clustered high-dimensional points.
# #' This method is particularly effective for testing robustness to violations of the triangle inequality.
# #'
# #' @param n_objects Number of objects to generate
# #' @param initial_dims Dimensionality of the initial latent space
# #' @param n_clusters Number of clusters to form
# #' @param noise_factor Magnitude of asymmetric noise to add
# #' @param missing_fraction Proportion of distances to set to NA
# #' @return List containing complete and incomplete distance matrices
# generate_distorted_euclidean_data <- function(n_objects = 50,
#                                               initial_dims = 10,
#                                               n_clusters = 8,
#                                               noise_factor = 0.3,
#                                               missing_fraction = 0.30) {
# 
#   object_names <- paste0("Object_", sprintf("%02d", 1:n_objects))
# 
#   # Generate structured high-dimensional coordinates
#   cluster_size <- n_objects %/% n_clusters
#   cluster_centers <- matrix(rnorm(n_clusters * initial_dims, mean = 0, sd = 4),
#                            nrow = n_clusters, ncol = initial_dims)
#   initial_coords <- matrix(0, nrow = n_objects, ncol = initial_dims)
#   rownames(initial_coords) <- object_names
# 
#   for(i in 1:n_objects) {
#     cluster_id <- ((i - 1) %/% cluster_size) + 1
#     if(cluster_id > n_clusters) cluster_id <- n_clusters
#     initial_coords[i, ] <- cluster_centers[cluster_id, ] +
#                           rnorm(initial_dims, mean = 0, sd = 1.5)
#   }
# 
#   # Calculate foundational Euclidean distances
#   euclidean_distances <- as.matrix(dist(initial_coords, method = "euclidean"))
# 
#   # Apply non-linear transformations
#   dist_quantiles <- quantile(euclidean_distances[upper.tri(euclidean_distances)],
#                             c(0.33, 0.66))
#   transform_1 <- euclidean_distances
#   transform_1[euclidean_distances <= dist_quantiles[1]] <-
#     euclidean_distances[euclidean_distances <= dist_quantiles[1]]^1.3
#   transform_1[euclidean_distances > dist_quantiles[1] &
#              euclidean_distances <= dist_quantiles[2]] <-
#     euclidean_distances[euclidean_distances > dist_quantiles[1] &
#                        euclidean_distances <= dist_quantiles[2]]^1.6
#   transform_1[euclidean_distances > dist_quantiles[2]] <-
#     euclidean_distances[euclidean_distances > dist_quantiles[2]]^1.8
# 
#   # Add asymmetric noise
#   asymmetric_noise <- transform_1 * noise_factor *
#                      matrix(runif(n_objects^2, -1, 1), nrow = n_objects)
#   asymmetric_noise <- (asymmetric_noise + t(asymmetric_noise)) / 2
#   transform_2 <- transform_1 + asymmetric_noise
#   transform_2[transform_2 < 0] <- 0.01
# 
#   # Create final non-Euclidean distance matrix
#   complete_non_euclidean_distances <- transform_2
#   diag(complete_non_euclidean_distances) <- 0
# 
#   # Introduce missing values
#   total_unique_pairs <- n_objects * (n_objects - 1) / 2
#   target_missing_pairs <- round(total_unique_pairs * missing_fraction)
# 
#   upper_tri_indices <- which(upper.tri(complete_non_euclidean_distances), arr.ind = TRUE)
#   missing_pair_indices <- sample(nrow(upper_tri_indices), target_missing_pairs)
# 
#   incomplete_distances <- complete_non_euclidean_distances
#   incomplete_distances[upper_tri_indices[missing_pair_indices,]] <- NA
#   incomplete_distances[upper_tri_indices[missing_pair_indices, c(2,1)]] <- NA
# 
#   return(list(
#     complete_distances = complete_non_euclidean_distances,
#     incomplete_distances = incomplete_distances,
#     object_names = object_names,
#     method = "Synthesized non-Euclidean"
#   ))
# }

## ----swiss_roll_generator, eval=FALSE-----------------------------------------
# #' Generate Non-Euclidean Data from a Swiss Roll Manifold
# #'
# #' Creates data points on a Swiss Roll manifold where geodesic distances
# #' are inherently non-Euclidean. Includes "tunneling" effects common in real-world manifold data.
# #'
# #' @param n_objects Number of points on the manifold
# #' @param noise Standard deviation of Gaussian noise added to points
# #' @param tunnel_fraction Fraction of distances to replace with Euclidean shortcuts
# #' @param missing_fraction Proportion of final distances to set to NA
# #' @return List containing complete and incomplete distance matrices
# generate_swiss_roll_data <- function(n_objects = 50,
#                                      noise = 0.05,
#                                      tunnel_fraction = 0.05,
#                                      missing_fraction = 0.30) {
#   # Check if required packages are available
#   if(!requireNamespace("RANN", quietly = TRUE)) {
#     stop("RANN package is required for this function. Please install it.")
#   }
#   if(!requireNamespace("igraph", quietly = TRUE)) {
#     stop("igraph package is required for this function. Please install it.")
#   }
# 
#   # Generate points on the Swiss Roll
#   t <- 1.5 * pi * (1 + 2 * runif(n_objects))
#   height <- 21 * runif(n_objects)
# 
#   x <- t * cos(t)
#   y <- height
#   z <- t * sin(t)
# 
#   points_3d <- data.frame(x = x, y = y, z = z) + rnorm(n_objects * 3, sd = noise)
#   object_names <- paste0("Object_", sprintf("%02d", 1:n_objects))
#   rownames(points_3d) <- object_names
# 
#   # Calculate geodesic distances via k-NN graph
#   if(requireNamespace("RANN", quietly = TRUE)) {
#     knn_graph <- RANN::nn2(points_3d, k = min(10, n_objects-1))$nn.idx
#     adj_matrix <- matrix(0, n_objects, n_objects)
#     for (i in 1:n_objects) {
#       for (j_idx in 2:min(10, n_objects)) {
#         if(j_idx <= ncol(knn_graph)) {
#           j <- knn_graph[i, j_idx]
#           dist_ij <- dist(rbind(points_3d[i,], points_3d[j,]))
#           adj_matrix[i, j] <- dist_ij
#           adj_matrix[j, i] <- dist_ij
#         }
#       }
#     }
# 
#     g <- igraph::graph_from_adjacency_matrix(adj_matrix, mode = "undirected", weighted = TRUE)
#     geodesic_distances <- igraph::distances(g)
#     geodesic_distances[is.infinite(geodesic_distances)] <-
#       max(geodesic_distances[is.finite(geodesic_distances)]) * 1.5
#   } else {
#     # Fallback to Euclidean if RANN not available
#     geodesic_distances <- as.matrix(dist(points_3d))
#     warning("RANN package not available. Using Euclidean distances as approximation.")
#   }
# 
#   # Introduce "tunnels" or "short-circuits"
#   euclidean_distances <- as.matrix(dist(points_3d))
#   n_tunnels <- round(tunnel_fraction * n_objects * (n_objects - 1) / 2)
# 
#   upper_tri_indices <- which(upper.tri(geodesic_distances), arr.ind = TRUE)
#   tunnel_indices <- sample(nrow(upper_tri_indices), min(n_tunnels, nrow(upper_tri_indices)))
# 
#   complete_distances <- geodesic_distances
#   if(length(tunnel_indices) > 0) {
#     for (k in tunnel_indices) {
#       i <- upper_tri_indices[k, 1]
#       j <- upper_tri_indices[k, 2]
#       complete_distances[i, j] <- complete_distances[j, i] <- euclidean_distances[i, j]
#     }
#   }
#   diag(complete_distances) <- 0
# 
#   # Introduce missing values
#   total_unique_pairs <- n_objects * (n_objects - 1) / 2
#   target_missing_pairs <- round(total_unique_pairs * missing_fraction)
# 
#   missing_pair_indices <- sample(nrow(upper_tri_indices),
#                                 min(target_missing_pairs, nrow(upper_tri_indices)))
# 
#   incomplete_distances <- complete_distances
#   if(length(missing_pair_indices) > 0) {
#     incomplete_distances[upper_tri_indices[missing_pair_indices,]] <- NA
#     incomplete_distances[upper_tri_indices[missing_pair_indices, c(2,1)]] <- NA
#   }
# 
#   rownames(complete_distances) <- object_names
#   colnames(complete_distances) <- object_names
#   rownames(incomplete_distances) <- object_names
#   colnames(incomplete_distances) <- object_names
# 
#   return(list(
#     complete_distances = complete_distances,
#     incomplete_distances = incomplete_distances,
#     object_names = object_names,
#     method = "Swiss Roll Manifold"
#   ))
# }

## ----analysis_pipeline, eval=FALSE--------------------------------------------
# #' Comprehensive analysis pipeline for embedding method comparison
# #'
# #' @param data_gen_func Function to generate the dataset
# #' @param dataset_name Name for the dataset (for labeling)
# #' @param n_objects Number of objects in the dataset
# #' @param n_runs Number of runs for stochastic methods
# run_comprehensive_analysis <- function(data_gen_func, dataset_name, n_objects = 50, n_runs = 50) {
# 
#   cat("=== ANALYSIS FOR", toupper(dataset_name), "===\n")
# 
#   # Step 1: Data Generation
#   cat("\n1. Generating", dataset_name, "dataset...\n")
#   data_gen_output <- data_gen_func(n_objects = n_objects, missing_fraction = 0.3)
# 
#   complete_distances_for_evaluation <- data_gen_output$complete_distances
#   incomplete_distances_for_embedding <- data_gen_output$incomplete_distances
#   object_names <- data_gen_output$object_names
# 
#   actual_missing_percentage <- sum(is.na(incomplete_distances_for_embedding)) /
#                               (n_objects * (n_objects-1)) * 100
# 
#   data_quality <- calculate_data_quality_metrics(incomplete_distances_for_embedding)
# 
#   cat("Generated dataset with", n_objects, "objects and",
#       round(actual_missing_percentage, 1), "% missing values\n")
# 
#   # Step 2: Assess Non-Euclidean Character
#   cat("\n2. Assessing non-Euclidean character...\n")
# 
#   D_squared <- complete_distances_for_evaluation^2
#   n <- nrow(D_squared)
#   J <- diag(n) - (1/n) * matrix(1, n, n)
#   B <- -0.5 * J %*% D_squared %*% J
#   eigenvals <- eigen(B, symmetric = TRUE)$values
# 
#   numerical_tolerance <- 1e-12
#   positive_eigenvals <- eigenvals[eigenvals > numerical_tolerance]
#   negative_eigenvals <- eigenvals[eigenvals < -numerical_tolerance]
# 
#   total_variance <- sum(abs(eigenvals))
#   negative_variance <- sum(abs(negative_eigenvals))
#   positive_variance <- sum(positive_eigenvals)
# 
#   if(positive_variance > 0) {
#     deviation_score <- negative_variance / positive_variance
#   } else {
#     deviation_score <- 1.0
#   }
# 
#   negative_variance_fraction <- negative_variance / total_variance
# 
#   cumulative_positive_variance <- cumsum(positive_eigenvals) / positive_variance
#   dims_for_90_percent <- which(cumulative_positive_variance >= 0.90)[1]
#   if(is.na(dims_for_90_percent)) dims_for_90_percent <- length(positive_eigenvals)
# 
#   cat("Non-Euclidean deviation score:", round(deviation_score, 4), "\n")
#   cat("Dimensions for 90% variance:", dims_for_90_percent, "\n")
# 
#   # Step 3: Parameter Optimization using Euclidify
#   cat("\n3. Running automated parameter optimization...\n")
# 
#   # Create temporary directory for optimization
#   temp_dir <- tempfile()
#   dir.create(temp_dir, recursive = TRUE)
# 
#   euclidify_results <- tryCatch({
#     Euclidify(
#       dissimilarity_matrix = incomplete_distances_for_embedding,
#       output_dir = temp_dir,
#       ndim_range = c(2, min(10, dims_for_90_percent + 3)),
#       n_initial_samples = 20,  # Reduced for vignette speed
#       n_adaptive_samples = 40,  # Reduced for vignette speed
#       folds = 20,
#       mapping_max_iter = 300,  # Reduced for speed
#       clean_intermediate = TRUE,
#       verbose = "off",
#       fallback_to_defaults = TRUE,
#       save_results = FALSE
#     )
#   }, error = function(e) {
#     cat("Euclidify failed, using default parameters\n")
#     NULL
#   })
# 
#   # Clean up temp directory
#   unlink(temp_dir, recursive = TRUE)
# 
#   if(!is.null(euclidify_results)) {
#     optimal_params <- euclidify_results$optimal_params
#     euclidify_positions <- euclidify_results$positions
#     target_dims <- optimal_params$ndim
#     cat("Optimal parameters found - dimensions:", target_dims, "\n")
#   } else {
#     # Fallback parameters
#     target_dims <- max(2, min(5, dims_for_90_percent))
#     optimal_params <- list(
#       ndim = target_dims,
#       k0 = 5.0,
#       cooling_rate = 0.01,
#       c_repulsion = 0.02
#     )
#     euclidify_positions <- NULL
#     cat("Using fallback parameters - dimensions:", target_dims, "\n")
#   }
# 
#   # Step 4: Three-Method Comparison
#   cat("\n4. Running three-method comparison...\n")
# 
#   # Initialize storage
#   topolow_results <- vector("list", n_runs)
#   iterative_mds_results <- vector("list", n_runs)
#   classical_mds_result <- NULL
# 
#   topolow_errors <- numeric(n_runs)
#   iterative_mds_errors <- numeric(n_runs)
#   classical_mds_error <- NA
# 
#   # Topolow runs
#   cat("Running Topolow embeddings...\n")
#   for(i in 1:n_runs) {
#     if(i == 1 && !is.null(euclidify_positions)) {
#       # Use Euclidify result for first run
#       topolow_coords <- euclidify_positions
#     } else {
#       # Run new embedding
#       topolow_result <- tryCatch({
#         euclidean_embedding(
#           dissimilarity_matrix = incomplete_distances_for_embedding,
#           ndim = optimal_params$ndim,
#           mapping_max_iter = 300,
#           k0 = optimal_params$k0,
#           cooling_rate = optimal_params$cooling_rate,
#           c_repulsion = optimal_params$c_repulsion,
#           relative_epsilon = 1e-6,
#           convergence_counter = 3,
#           verbose = FALSE
#         )
#       }, error = function(e) NULL)
# 
#       if(!is.null(topolow_result)) {
#         topolow_coords <- topolow_result$positions
#       } else {
#         topolow_coords <- NULL
#       }
#     }
# 
#     if(!is.null(topolow_coords)) {
#       topolow_coords <- topolow_coords[order(row.names(topolow_coords)), ]
# 
#       if(validate_coordinates(topolow_coords, "Topolow", n_objects, target_dims)) {
#         topolow_coords <- scale(topolow_coords, center = TRUE, scale = FALSE)
#         topolow_results[[i]] <- topolow_coords
# 
#         embedded_distances <- as.matrix(dist(topolow_coords))
#         rownames(embedded_distances) <- rownames(topolow_coords)
#         colnames(embedded_distances) <- rownames(topolow_coords)
# 
#         valid_mask <- !is.na(complete_distances_for_evaluation)
#         distance_errors <- abs(complete_distances_for_evaluation[valid_mask] -
#                              embedded_distances[valid_mask])
#         topolow_errors[i] <- sum(distance_errors)
#       } else {
#         topolow_results[[i]] <- NULL
#         topolow_errors[i] <- NA
#       }
#     } else {
#       topolow_results[[i]] <- NULL
#       topolow_errors[i] <- NA
#     }
#   }
# 
#   # Classical MDS
#   cat("Running Classical MDS...\n")
#   imputation_result <- improved_missing_data_imputation(incomplete_distances_for_embedding)
#   classical_mds_matrix <- imputation_result$matrix
# 
#   tryCatch({
#     classical_mds_result_raw <- cmdscale(classical_mds_matrix, k = target_dims, eig = TRUE)
#     classical_mds_coords <- classical_mds_result_raw$points
#     rownames(classical_mds_coords) <- object_names
#     classical_mds_coords <- classical_mds_coords[order(row.names(classical_mds_coords)), ]
# 
#     if(validate_coordinates(classical_mds_coords, "Classical MDS", n_objects, target_dims)) {
#       classical_mds_coords <- scale(classical_mds_coords, center = TRUE, scale = FALSE)
#       classical_mds_result <- classical_mds_coords
# 
#       embedded_distances <- as.matrix(dist(classical_mds_coords))
#       rownames(embedded_distances) <- rownames(classical_mds_coords)
#       colnames(embedded_distances) <- rownames(classical_mds_coords)
# 
#       valid_mask <- !is.na(complete_distances_for_evaluation)
#       distance_errors <- abs(complete_distances_for_evaluation[valid_mask] -
#                            embedded_distances[valid_mask])
#       classical_mds_error <- sum(distance_errors)
#     }
#   }, error = function(e) {
#     cat("Classical MDS failed:", e$message, "\n")
#   })
# 
#   # Iterative MDS
#   cat("Running Iterative MDS...\n")
#   for(i in 1:n_runs) {
#     tryCatch({
#       if(requireNamespace("smacof", quietly = TRUE)) {
#         max_dist <- max(classical_mds_matrix, na.rm = TRUE)
#         scaled_matrix <- classical_mds_matrix / max_dist
# 
#         iterative_mds_result_raw <- smacof::smacofSym(
#           delta = scaled_matrix,
#           ndim = target_dims,
#           type = "interval",
#           init = "torgerson",
#           verbose = FALSE,
#           itmax = 1000,
#           eps = 1e-5
#         )
# 
#         iterative_mds_coords <- iterative_mds_result_raw$conf
#         current_max_dist <- max(dist(iterative_mds_coords))
#         if(current_max_dist > 0) {
#           scale_factor <- max_dist / current_max_dist
#           iterative_mds_coords <- iterative_mds_coords * scale_factor
#         }
#       } else {
#         # Fallback to isoMDS
#         iterative_mds_result_raw <- MASS::isoMDS(classical_mds_matrix, k = target_dims, trace = FALSE)
#         iterative_mds_coords <- iterative_mds_result_raw$points
#       }
# 
#       rownames(iterative_mds_coords) <- object_names
#       iterative_mds_coords <- iterative_mds_coords[order(row.names(iterative_mds_coords)), ]
# 
#       if(validate_coordinates(iterative_mds_coords, "Iterative MDS", n_objects, target_dims)) {
#         iterative_mds_coords <- scale(iterative_mds_coords, center = TRUE, scale = FALSE)
#         iterative_mds_results[[i]] <- iterative_mds_coords
# 
#         embedded_distances <- as.matrix(dist(iterative_mds_coords))
#         rownames(embedded_distances) <- rownames(iterative_mds_coords)
#         colnames(embedded_distances) <- rownames(iterative_mds_coords)
# 
#         valid_mask <- !is.na(complete_distances_for_evaluation)
#         distance_errors <- abs(complete_distances_for_evaluation[valid_mask] -
#                              embedded_distances[valid_mask])
#         iterative_mds_errors[i] <- sum(distance_errors)
#       } else {
#         iterative_mds_results[[i]] <- NULL
#         iterative_mds_errors[i] <- NA
#       }
# 
#     }, error = function(e) {
#       iterative_mds_results[[i]] <- NULL
#       iterative_mds_errors[i] <- NA
#     })
#   }
# 
#   # Step 5: Compile Results
#   valid_topolow_results <- !is.na(topolow_errors)
#   valid_iterative_results <- !is.na(iterative_mds_errors)
# 
#   results <- list(
#     dataset_name = dataset_name,
#     data_characteristics = list(
#       n_objects = n_objects,
#       missing_percentage = actual_missing_percentage,
#       deviation_score = deviation_score,
#       dims_90_percent = dims_for_90_percent,
#       negative_variance_fraction = negative_variance_fraction
#     ),
#     optimal_params = optimal_params,
#     topolow_errors = topolow_errors,
#     topolow_results = topolow_results,
#     valid_topolow_results = valid_topolow_results,
#     classical_mds_error = classical_mds_error,
#     classical_mds_result = classical_mds_result,
#     iterative_mds_errors = iterative_mds_errors,
#     iterative_mds_results = iterative_mds_results,
#     valid_iterative_results = valid_iterative_results,
#     complete_distances = complete_distances_for_evaluation,
#     incomplete_distances = incomplete_distances_for_embedding,
#     target_dims = target_dims
#   )
# 
#   return(results)
# }

## ----distorted_analysis, eval=FALSE, results='hide', fig.show='hold'----------
# # Run analysis for Synthesized non-Euclidean data
# distorted_results <- run_comprehensive_analysis(
#   generate_distorted_euclidean_data,
#   "Synthesized non-Euclidean",
#   n_objects = 50,
#   n_runs = 50
# )

## ----distorted_summary, eval=FALSE, echo=FALSE--------------------------------
# cat("=== Synthesized non-Euclidean DATA ANALYSIS SUMMARY ===\n")
# cat("Dataset characteristics:\n")
# cat("- Objects:", distorted_results$data_characteristics$n_objects, "\n")
# cat("- Missing data:", round(distorted_results$data_characteristics$missing_percentage, 1), "%\n")
# cat("- Non-Euclidean deviation score:", round(distorted_results$data_characteristics$deviation_score, 4), "\n")
# cat("- Dimensions for 90% variance:", distorted_results$data_characteristics$dims_90_percent, "\n")
# cat("- Target embedding dimensions:", distorted_results$target_dims, "\n\n")
# 
# cat("Performance Summary:\n")
# if(sum(distorted_results$valid_topolow_results) > 0) {
#   cat("- Topolow mean error:",
#       round(mean(distorted_results$topolow_errors[distorted_results$valid_topolow_results]), 2),
#       "±", round(sd(distorted_results$topolow_errors[distorted_results$valid_topolow_results]), 2), "\n")
# }
# if(!is.na(distorted_results$classical_mds_error)) {
#   cat("- Classical MDS error:", round(distorted_results$classical_mds_error, 2), "\n")
# }
# if(sum(distorted_results$valid_iterative_results) > 0) {
#   cat("- Iterative MDS mean error:", round(mean(distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results]), 2),
#       "±", round(sd(distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results]), 2), "\n")
# }

## ----distorted_eigenvalues, eval=FALSE, echo=FALSE----------------------------
# # Create eigenvalue visualization for distorted data
# D_squared <- distorted_results$complete_distances^2
# n <- nrow(D_squared)
# J <- diag(n) - (1/n) * matrix(1, n, n)
# B <- -0.5 * J %*% D_squared %*% J
# eigenvals <- eigen(B, symmetric = TRUE)$values
# 
# eigenval_df <- data.frame(
#   index = 1:length(eigenvals),
#   eigenvalue = eigenvals,
#   type = ifelse(eigenvals > 1e-12, "Positive",
#                ifelse(eigenvals < -1e-12, "Negative", "Zero"))
# )
# 
# p_eigen_distorted <- ggplot(eigenval_df, aes(x = index, y = eigenvalue, fill = type)) +
#   geom_bar(stat = "identity", alpha = 0.8) +
#   geom_hline(yintercept = 0, linetype = "dashed", linewidth = 1, color = "black") +
#   scale_fill_manual(values = c("Positive" = "steelblue", "Negative" = "red", "Zero" = "gray")) +
#   labs(
#     title = "Eigenvalue Spectrum: Synthesized non-Euclidean Data",
#     subtitle = paste("Deviation Score:", round(distorted_results$data_characteristics$deviation_score, 4),
#                      "| Non-Euclidean Variance:", round(distorted_results$data_characteristics$negative_variance_fraction * 100, 1), "%"),
#     x = "Eigenvalue Index (sorted descending)",
#     y = "Eigenvalue",
#     fill = "Eigenvalue Type"
#   ) +
#   theme_minimal() +
#   theme(
#     legend.position = "bottom",
#     plot.title = element_text(size = 12, face = "bold"),
#     panel.grid.minor = element_blank()
#   )
# 
# print(p_eigen_distorted)

## ----distorted_eigenvalues-1, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distorted_eigenvalues-1.png")

## ----distorted_performance, eval=FALSE, echo=FALSE----------------------------
# # Create performance comparison for distorted data
# results_df_distorted <- data.frame()
# 
# if(sum(distorted_results$valid_topolow_results) > 0) {
#   topolow_df <- data.frame(
#     Run = which(distorted_results$valid_topolow_results),
#     Method = "Topolow",
#     Total_Error = distorted_results$topolow_errors[distorted_results$valid_topolow_results],
#     Method_Type = "Stochastic",
#     Dataset = "Synthesized non-Euclidean"
#   )
#   results_df_distorted <- rbind(results_df_distorted, topolow_df)
# }
# 
# if(!is.na(distorted_results$classical_mds_error)) {
#   classical_df <- data.frame(
#     Run = 1,
#     Method = "Classical MDS",
#     Total_Error = distorted_results$classical_mds_error,
#     Method_Type = "Deterministic",
#     Dataset = "Synthesized non-Euclidean"
#   )
#   results_df_distorted <- rbind(results_df_distorted, classical_df)
# }
# 
# if(sum(distorted_results$valid_iterative_results) > 0) {
#   iterative_df <- data.frame(
#     Run = which(distorted_results$valid_iterative_results),
#     Method = "Iterative MDS",
#     Total_Error = distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results],
#     Method_Type = "Stochastic",
#     Dataset = "Synthesized non-Euclidean"
#   )
#   results_df_distorted <- rbind(results_df_distorted, iterative_df)
# }
# 
# if(nrow(results_df_distorted) > 0) {
#   p_performance_distorted <- ggplot(results_df_distorted, aes(x = Method, y = Total_Error, fill = Method_Type)) +
#     geom_boxplot(alpha = 0.7, outlier.shape = NA) +
#     geom_jitter(width = 0.2, alpha = 0.7, size = 2.5) +
#     scale_fill_manual(values = c("Stochastic" = "steelblue", "Deterministic" = "darkorange")) +
#     labs(
#       title = "Embedding Performance: Synthesized non-Euclidean Data",
#       subtitle = paste("Non-Euclidean Score:", round(distorted_results$data_characteristics$deviation_score, 4),
#                        "| Missing Data:", round(distorted_results$data_characteristics$missing_percentage, 1), "%"),
#       y = "Sum of Absolute Distance Errors (L1 Norm)",
#       x = "Method",
#       fill = "Method Type"
#     ) +
#     theme_minimal() +
#     theme(
#       plot.title = element_text(size = 12, face = "bold"),
#       panel.grid.minor = element_blank(),
#       legend.position = "bottom"
#     )
# 
#   print(p_performance_distorted)
# }

## ----distorted_performance-1, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distorted_performance-1.png")

## ----swiss_analysis, eval=FALSE, results='hide', fig.show='hold'--------------
# # Run analysis for Swiss Roll data
# swiss_results <- run_comprehensive_analysis(
#   generate_swiss_roll_data,
#   "Swiss Roll Manifold",
#   n_objects = 50,
#   n_runs = 50
# )

## ----swiss_summary, eval=FALSE, echo=FALSE------------------------------------
# cat("=== SWISS ROLL MANIFOLD DATA ANALYSIS SUMMARY ===\n")
# cat("Dataset characteristics:\n")
# cat("- Objects:", swiss_results$data_characteristics$n_objects, "\n")
# cat("- Missing data:", round(swiss_results$data_characteristics$missing_percentage, 1), "%\n")
# cat("- Non-Euclidean deviation score:", round(swiss_results$data_characteristics$deviation_score, 4), "\n")
# cat("- Dimensions for 90% variance:", swiss_results$data_characteristics$dims_90_percent, "\n")
# cat("- Target embedding dimensions:", swiss_results$target_dims, "\n\n")
# 
# cat("Performance Summary:\n")
# if(sum(swiss_results$valid_topolow_results) > 0) {
#   cat("- Topolow mean error:", round(mean(swiss_results$topolow_errors[swiss_results$valid_topolow_results]), 2),
#       "±", round(sd(swiss_results$topolow_errors[swiss_results$valid_topolow_results]), 2), "\n")
# }
# if(!is.na(swiss_results$classical_mds_error)) {
#   cat("- Classical MDS error:", round(swiss_results$classical_mds_error, 2), "\n")
# }
# if(sum(swiss_results$valid_iterative_results) > 0) {
#   cat("- Iterative MDS mean error:", round(mean(swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results]), 2),
#       "±", round(sd(swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results]), 2), "\n")
# }

## ----swiss_eigenvalues, eval=FALSE, echo=FALSE--------------------------------
# # Create eigenvalue visualization for Swiss Roll data
# D_squared <- swiss_results$complete_distances^2
# n <- nrow(D_squared)
# J <- diag(n) - (1/n) * matrix(1, n, n)
# B <- -0.5 * J %*% D_squared %*% J
# eigenvals <- eigen(B, symmetric = TRUE)$values
# 
# eigenval_df <- data.frame(
#   index = 1:length(eigenvals),
#   eigenvalue = eigenvals,
#   type = ifelse(eigenvals > 1e-12, "Positive",
#                ifelse(eigenvals < -1e-12, "Negative", "Zero"))
# )
# 
# p_eigen_swiss <- ggplot(eigenval_df, aes(x = index, y = eigenvalue, fill = type)) +
#   geom_bar(stat = "identity", alpha = 0.8) +
#   geom_hline(yintercept = 0, linetype = "dashed", linewidth = 1, color = "black") +
#   scale_fill_manual(values = c("Positive" = "steelblue", "Negative" = "red", "Zero" = "gray")) +
#   labs(
#     title = "Eigenvalue Spectrum: Swiss Roll Manifold Data",
#     subtitle = paste("Deviation Score:", round(swiss_results$data_characteristics$deviation_score, 4),
#                      "| Non-Euclidean Variance:", round(swiss_results$data_characteristics$negative_variance_fraction * 100, 1), "%"),
#     x = "Eigenvalue Index (sorted descending)",
#     y = "Eigenvalue",
#     fill = "Eigenvalue Type"
#   ) +
#   theme_minimal() +
#   theme(
#     legend.position = "bottom",
#     plot.title = element_text(size = 12, face = "bold"),
#     panel.grid.minor = element_blank()
#   )
# 
# print(p_eigen_swiss)

## ----swiss_eigenvalues-1, echo=FALSE, out.width="100%"------------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/swiss_eigenvalues-1.png")

## ----swiss_performance, eval=FALSE, echo=FALSE--------------------------------
# # Create performance comparison for Swiss Roll data
# results_df_swiss <- data.frame()
# 
# if(sum(swiss_results$valid_topolow_results) > 0) {
#   topolow_df <- data.frame(
#     Run = which(swiss_results$valid_topolow_results),
#     Method = "Topolow",
#     Total_Error = swiss_results$topolow_errors[swiss_results$valid_topolow_results],
#     Method_Type = "Stochastic",
#     Dataset = "Swiss Roll"
#   )
#   results_df_swiss <- rbind(results_df_swiss, topolow_df)
# }
# 
# if(!is.na(swiss_results$classical_mds_error)) {
#   classical_df <- data.frame(
#     Run = 1,
#     Method = "Classical MDS",
#     Total_Error = swiss_results$classical_mds_error,
#     Method_Type = "Deterministic",
#     Dataset = "Swiss Roll"
#   )
#   results_df_swiss <- rbind(results_df_swiss, classical_df)
# }
# 
# if(sum(swiss_results$valid_iterative_results) > 0) {
#   iterative_df <- data.frame(
#     Run = which(swiss_results$valid_iterative_results),
#     Method = "Iterative MDS",
#     Total_Error = swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results],
#     Method_Type = "Stochastic",
#     Dataset = "Swiss Roll"
#   )
#   results_df_swiss <- rbind(results_df_swiss, iterative_df)
# }
# 
# if(nrow(results_df_swiss) > 0) {
#   p_performance_swiss <- ggplot(results_df_swiss, aes(x = Method, y = Total_Error, fill = Method_Type)) +
#     geom_boxplot(alpha = 0.7, outlier.shape = NA) +
#     geom_jitter(width = 0.2, alpha = 0.7, size = 2.5) +
#     scale_fill_manual(values = c("Stochastic" = "steelblue", "Deterministic" = "darkorange")) +
#     labs(
#       title = "Embedding Performance: Swiss Roll Manifold Data",
#       subtitle = paste("Non-Euclidean Score:", round(swiss_results$data_characteristics$deviation_score, 4),
#                        "| Missing Data:", round(swiss_results$data_characteristics$missing_percentage, 1), "%"),
#       y = "Sum of Absolute Distance Errors (L1 Norm)",
#       x = "Method",
#       fill = "Method Type"
#     ) +
#     theme_minimal() +
#     theme(
#       plot.title = element_text(size = 12, face = "bold"),
#       panel.grid.minor = element_blank(),
#       legend.position = "bottom"
#     )
# 
#   print(p_performance_swiss)
# }

## ----swiss_performance-1, echo=FALSE, out.width="100%"------------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/swiss_performance-1.png")

## ----combined_analysis, eval=FALSE, echo=FALSE--------------------------------
# # Combine results from both datasets
# combined_results <- rbind(
#   if(nrow(results_df_distorted) > 0) results_df_distorted else data.frame(),
#   if(nrow(results_df_swiss) > 0) results_df_swiss else data.frame()
# )
# 
# if(nrow(combined_results) > 0) {
#   # Overall performance comparison
#   p_combined_performance <- ggplot(combined_results, aes(x = Method, y = Total_Error)) +
#     geom_boxplot(alpha = 0.7, fill = "lightblue") +
#     geom_point(position = position_jitter(width = 0.2),
#                size = 2, alpha = 0.8, color = "darkblue") +
#     facet_wrap(~ Dataset, scales = "free_y") +
#     labs(
#       title = "Comparative Performance Across Dataset Types",
#       subtitle = "Error comparison for three methods on Synthesized non-Euclidean and Swiss Roll manifold data",
#       y = "Sum of Absolute Distance Errors (L1 Norm)",
#       x = "Method"
#     ) +
#     theme_minimal() +
#     theme(
#       plot.title = element_text(size = 12, face = "bold"),
#       panel.grid.minor = element_blank(),
#       strip.text = element_text(face = "bold", size = 10),
#       axis.text.x = element_text(angle = 45, hjust = 1)
#     )
# 
#   print(p_combined_performance)
# 
#   # Summary statistics by method and dataset
#   summary_stats <- combined_results %>%
#     group_by(Method, Dataset, Method_Type) %>%
#     summarise(
#       Count = n(),
#       Mean_Error = mean(Total_Error),
#       SD_Error = sd(Total_Error),
#       Min_Error = min(Total_Error),
#       Max_Error = max(Total_Error),
#       .groups = 'drop'
#     )
# 
#   cat("\n=== COMBINED PERFORMANCE SUMMARY ===\n")
#   print(summary_stats)
# }

## ----combined_analysis-1, echo=FALSE, out.width="100%"------------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/combined_analysis-1.png")

## ----statistical_analysis, eval=FALSE, echo=FALSE-----------------------------
# cat("\n=== STATISTICAL ANALYSIS ===\n")
# 
# # Statistical comparisons for Synthesized non-Euclidean data
# if(sum(distorted_results$valid_topolow_results) > 0 &&
#    sum(distorted_results$valid_iterative_results) > 0) {
#   cat("\nSynthesized non-Euclidean Data:\n")
#   enhanced_statistical_comparison(
#     distorted_results$topolow_errors[distorted_results$valid_topolow_results],
#     distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results],
#     "Iterative MDS"
#   )
# }
# 
# # Statistical comparisons for Swiss Roll data
# if(sum(swiss_results$valid_topolow_results) > 0 &&
#    sum(swiss_results$valid_iterative_results) > 0) {
#   cat("\nSwiss Roll Manifold Data:\n")
#   enhanced_statistical_comparison(
#     swiss_results$topolow_errors[swiss_results$valid_topolow_results],
#     swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results],
#     "Iterative MDS"
#   )
# }

## ----distance_preservation, eval=FALSE, echo=FALSE----------------------------
# # Distance preservation analysis for both datasets
# analyze_distance_preservation <- function(results, dataset_name) {
#   cat("\n=== DISTANCE PRESERVATION:", toupper(dataset_name), "===\n")
# 
#   # Get best performing runs
#   best_methods <- list()
# 
#   if(sum(results$valid_topolow_results) > 0) {
#     best_topolow_idx <- which.min(results$topolow_errors[results$valid_topolow_results])
#     actual_topolow_idx <- which(results$valid_topolow_results)[best_topolow_idx]
#     best_methods$topolow <- results$topolow_results[[actual_topolow_idx]]
#   }
# 
#   if(!is.na(results$classical_mds_error)) {
#     best_methods$classical <- results$classical_mds_result
#   }
# 
#   if(sum(results$valid_iterative_results) > 0) {
#     best_iterative_idx <- which.min(results$iterative_mds_errors[results$valid_iterative_results])
#     actual_iterative_idx <- which(results$valid_iterative_results)[best_iterative_idx]
#     best_methods$iterative <- results$iterative_mds_results[[actual_iterative_idx]]
#   }
# 
#   if(length(best_methods) > 0) {
#     # Calculate correlations
#     upper_tri_mask <- upper.tri(results$complete_distances)
#     true_distances_vector <- results$complete_distances[upper_tri_mask]
# 
#     distance_comparison_list <- list()
#     preservation_stats <- data.frame()
# 
#     for(method_name in names(best_methods)) {
#       embedded_distances <- as.matrix(dist(best_methods[[method_name]]))
#       embedded_distances_vector <- embedded_distances[upper_tri_mask]
# 
#       correlation <- cor(true_distances_vector, embedded_distances_vector, use = "complete.obs")
#       rsq <- summary(lm(embedded_distances_vector ~ true_distances_vector))$r.squared
# 
#       method_display_name <- switch(method_name,
#                                    "topolow" = "Topolow",
#                                    "classical" = "Classical MDS",
#                                    "iterative" = "Iterative MDS")
# 
#       preservation_stats <- rbind(preservation_stats, data.frame(
#         Method = method_display_name,
#         Dataset = dataset_name,
#         Correlation = correlation,
#         R_squared = rsq
#       ))
# 
#       distance_comparison_list[[method_name]] <- data.frame(
#         True_Distance = true_distances_vector,
#         Embedded_Distance = embedded_distances_vector,
#         Method = method_display_name,
#         Dataset = dataset_name
#       )
#     }
# 
#     distance_comparison_df <- do.call(rbind, distance_comparison_list)
# 
#     # Create distance preservation plot
#     p_distance_preservation <- ggplot(distance_comparison_df,
#                                      aes(x = True_Distance, y = Embedded_Distance, color = Method)) +
#       geom_point(alpha = 0.6, size = 1.5) +
#       geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
#       geom_abline(intercept = 0, slope = 1, color = "black", linetype = "dashed", alpha = 0.7) +
#       facet_wrap(~Method, scales = "free_x") +
#       coord_cartesian(ylim = c(0, NA)) +
#       labs(
#         title = paste("Distance Preservation:", dataset_name),
#         x = "Original Non-Euclidean Distance",
#         y = "Embedded Euclidean Distance"
#       ) +
#       theme_minimal() +
#       theme(
#         plot.title = element_text(size = 12, face = "bold"),
#         legend.position = "none",
#         panel.grid.minor = element_blank()
#       )
# 
#     print(p_distance_preservation)
# 
#     cat("\nDistance Preservation Statistics:\n")
#     print(preservation_stats)
# 
#     return(preservation_stats)
#   }
# }
# 
# # Analyze both datasets
# distorted_preservation <- analyze_distance_preservation(distorted_results, "Synthesized non-Euclidean")
# swiss_preservation <- analyze_distance_preservation(swiss_results, "Swiss Roll")

## ----distance_preservation-1, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distance_preservation-1.png")

## ----distance_preservation-2, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distance_preservation-2.png")

## ----preservation_summary, eval=FALSE, echo=FALSE-----------------------------
# if(exists("distorted_preservation") && exists("swiss_preservation")) {
#   combined_preservation <- rbind(distorted_preservation, swiss_preservation)
# 
#   cat("Distance Preservation Summary (Correlation with Original Distances):\n")
#   print(combined_preservation)
# 
#   # Calculate mean correlations by method
#   mean_correlations <- combined_preservation %>%
#     group_by(Method) %>%
#     summarise(Mean_Correlation = mean(Correlation, na.rm = TRUE),
#               Mean_R_squared = mean(R_squared, na.rm = TRUE),
#               .groups = 'drop')
# 
#   cat("\nMean Performance Across Dataset Types:\n")
#   print(mean_correlations)
# }

## ----session_info, echo=FALSE-------------------------------------------------
sessionInfo()

Try the topolow package in your browser

Any scripts or data that you put into this service are public.

topolow documentation built on Aug. 31, 2025, 1:07 a.m.