Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 8,
fig.height = 6,
collapse = TRUE,
comment = "#>",
cache = FALSE
)
library(topolow)
# Function to check and load packages with informative messages
check_and_load_package <- function(package_name) {
if (!requireNamespace(package_name, quietly = TRUE)) {
message("Package ", package_name, " not found. Please install it with:")
message("install.packages('", package_name, "')")
return(FALSE)
} else {
library(package_name, character.only = TRUE)
return(TRUE)
}
}
# Load required libraries
required_packages <- c("topolow", "ggplot2", "dplyr", "reshape2",
"scales", "MASS")
# Check for optional packages
optional_packages <- c("smacof", "RANN")
missing_packages <- character(0)
for(pkg in required_packages) {
if(!check_and_load_package(pkg)) {
missing_packages <- c(missing_packages, pkg)
}
}
# Load optional packages quietly
for(pkg in optional_packages) {
check_and_load_package(pkg)
}
if(length(missing_packages) > 0) {
stop("Please install missing packages: ", paste(missing_packages, collapse = ", "))
}
# Set seed for reproducibility
set.seed(42)
## ----utility_functions, eval=FALSE--------------------------------------------
# # Enhanced statistical testing with effect sizes
# enhanced_statistical_comparison <- function(topolow_errors, other_errors, method_name) {
# valid_topolow <- topolow_errors[!is.na(topolow_errors)]
# valid_other <- other_errors[!is.na(other_errors)]
#
# if(length(valid_other) < 2 || length(valid_topolow) < 2) {
# cat("Insufficient data for statistical comparison with", method_name, "\n")
# return(NULL)
# }
#
# # T-test
# t_result <- t.test(valid_topolow, valid_other)
#
# # Effect size (Cohen's d)
# pooled_sd <- sqrt(((length(valid_topolow) - 1) * var(valid_topolow) +
# (length(valid_other) - 1) * var(valid_other)) /
# (length(valid_topolow) + length(valid_other) - 2))
#
# cohens_d <- (mean(valid_topolow) - mean(valid_other)) / pooled_sd
#
# cat("\n--- Statistical Comparison (Topolow vs", method_name, ") ---\n")
# cat("- Welch's t-statistic:", round(t_result$statistic, 3), "\n")
# cat("- p-value:", format(t_result$p.value, scientific = TRUE, digits = 3), "\n")
# cat("- Cohen's d (effect size):", round(cohens_d, 3), "\n")
# cat("- Effect size interpretation:",
# if(abs(cohens_d) < 0.2) "Negligible"
# else if(abs(cohens_d) < 0.5) "Small"
# else if(abs(cohens_d) < 0.8) "Medium"
# else "Large", "\n")
#
# return(list(t_test = t_result, cohens_d = cohens_d))
# }
#
# # Calculate data quality metrics
# calculate_data_quality_metrics <- function(distance_matrix) {
# total_possible <- nrow(distance_matrix) * (nrow(distance_matrix) - 1) / 2
# total_available <- sum(!is.na(distance_matrix[upper.tri(distance_matrix)]))
# completeness <- total_available / total_possible
#
# available_distances <- distance_matrix[!is.na(distance_matrix)]
#
# metrics <- list(
# completeness = completeness,
# n_objects = nrow(distance_matrix),
# n_available_distances = total_available,
# distance_range = range(available_distances),
# distance_mean = mean(available_distances),
# distance_sd = sd(available_distances),
# distance_cv = sd(available_distances) / mean(available_distances)
# )
# return(metrics)
# }
#
# # Coordinate validation
# validate_coordinates <- function(coords, method_name, n_objects, target_dims) {
# if(is.null(coords)) {
# cat("WARNING:", method_name, "returned NULL coordinates\n")
# return(FALSE)
# }
# if(any(!is.finite(coords))) {
# cat("WARNING:", method_name, "has non-finite coordinates\n")
# return(FALSE)
# }
# if(nrow(coords) != n_objects || ncol(coords) != target_dims) {
# cat("WARNING:", method_name, "has incorrect dimensions\n")
# return(FALSE)
# }
# return(TRUE)
# }
#
# # Missing data imputation
# improved_missing_data_imputation <- function(dist_matrix) {
# available_distances <- dist_matrix[!is.na(dist_matrix)]
# if(length(available_distances) == 0) {
# median_distance <- 1.0
# } else {
# median_distance <- median(available_distances, na.rm = TRUE)
# }
# dist_matrix[is.na(dist_matrix)] <- median_distance
# return(list(matrix = dist_matrix, imputation_value = median_distance))
# }
## ----distorted_euclidean_generator, eval=FALSE--------------------------------
# #' Generate Non-Euclidean Data by Distorting Clustered Points
# #'
# #' Creates non-Euclidean data through systematic distortion of clustered high-dimensional points.
# #' This method is particularly effective for testing robustness to violations of the triangle inequality.
# #'
# #' @param n_objects Number of objects to generate
# #' @param initial_dims Dimensionality of the initial latent space
# #' @param n_clusters Number of clusters to form
# #' @param noise_factor Magnitude of asymmetric noise to add
# #' @param missing_fraction Proportion of distances to set to NA
# #' @return List containing complete and incomplete distance matrices
# generate_distorted_euclidean_data <- function(n_objects = 50,
# initial_dims = 10,
# n_clusters = 8,
# noise_factor = 0.3,
# missing_fraction = 0.30) {
#
# object_names <- paste0("Object_", sprintf("%02d", 1:n_objects))
#
# # Generate structured high-dimensional coordinates
# cluster_size <- n_objects %/% n_clusters
# cluster_centers <- matrix(rnorm(n_clusters * initial_dims, mean = 0, sd = 4),
# nrow = n_clusters, ncol = initial_dims)
# initial_coords <- matrix(0, nrow = n_objects, ncol = initial_dims)
# rownames(initial_coords) <- object_names
#
# for(i in 1:n_objects) {
# cluster_id <- ((i - 1) %/% cluster_size) + 1
# if(cluster_id > n_clusters) cluster_id <- n_clusters
# initial_coords[i, ] <- cluster_centers[cluster_id, ] +
# rnorm(initial_dims, mean = 0, sd = 1.5)
# }
#
# # Calculate foundational Euclidean distances
# euclidean_distances <- as.matrix(dist(initial_coords, method = "euclidean"))
#
# # Apply non-linear transformations
# dist_quantiles <- quantile(euclidean_distances[upper.tri(euclidean_distances)],
# c(0.33, 0.66))
# transform_1 <- euclidean_distances
# transform_1[euclidean_distances <= dist_quantiles[1]] <-
# euclidean_distances[euclidean_distances <= dist_quantiles[1]]^1.3
# transform_1[euclidean_distances > dist_quantiles[1] &
# euclidean_distances <= dist_quantiles[2]] <-
# euclidean_distances[euclidean_distances > dist_quantiles[1] &
# euclidean_distances <= dist_quantiles[2]]^1.6
# transform_1[euclidean_distances > dist_quantiles[2]] <-
# euclidean_distances[euclidean_distances > dist_quantiles[2]]^1.8
#
# # Add asymmetric noise
# asymmetric_noise <- transform_1 * noise_factor *
# matrix(runif(n_objects^2, -1, 1), nrow = n_objects)
# asymmetric_noise <- (asymmetric_noise + t(asymmetric_noise)) / 2
# transform_2 <- transform_1 + asymmetric_noise
# transform_2[transform_2 < 0] <- 0.01
#
# # Create final non-Euclidean distance matrix
# complete_non_euclidean_distances <- transform_2
# diag(complete_non_euclidean_distances) <- 0
#
# # Introduce missing values
# total_unique_pairs <- n_objects * (n_objects - 1) / 2
# target_missing_pairs <- round(total_unique_pairs * missing_fraction)
#
# upper_tri_indices <- which(upper.tri(complete_non_euclidean_distances), arr.ind = TRUE)
# missing_pair_indices <- sample(nrow(upper_tri_indices), target_missing_pairs)
#
# incomplete_distances <- complete_non_euclidean_distances
# incomplete_distances[upper_tri_indices[missing_pair_indices,]] <- NA
# incomplete_distances[upper_tri_indices[missing_pair_indices, c(2,1)]] <- NA
#
# return(list(
# complete_distances = complete_non_euclidean_distances,
# incomplete_distances = incomplete_distances,
# object_names = object_names,
# method = "Synthesized non-Euclidean"
# ))
# }
## ----swiss_roll_generator, eval=FALSE-----------------------------------------
# #' Generate Non-Euclidean Data from a Swiss Roll Manifold
# #'
# #' Creates data points on a Swiss Roll manifold where geodesic distances
# #' are inherently non-Euclidean. Includes "tunneling" effects common in real-world manifold data.
# #'
# #' @param n_objects Number of points on the manifold
# #' @param noise Standard deviation of Gaussian noise added to points
# #' @param tunnel_fraction Fraction of distances to replace with Euclidean shortcuts
# #' @param missing_fraction Proportion of final distances to set to NA
# #' @return List containing complete and incomplete distance matrices
# generate_swiss_roll_data <- function(n_objects = 50,
# noise = 0.05,
# tunnel_fraction = 0.05,
# missing_fraction = 0.30) {
# # Check if required packages are available
# if(!requireNamespace("RANN", quietly = TRUE)) {
# stop("RANN package is required for this function. Please install it.")
# }
# if(!requireNamespace("igraph", quietly = TRUE)) {
# stop("igraph package is required for this function. Please install it.")
# }
#
# # Generate points on the Swiss Roll
# t <- 1.5 * pi * (1 + 2 * runif(n_objects))
# height <- 21 * runif(n_objects)
#
# x <- t * cos(t)
# y <- height
# z <- t * sin(t)
#
# points_3d <- data.frame(x = x, y = y, z = z) + rnorm(n_objects * 3, sd = noise)
# object_names <- paste0("Object_", sprintf("%02d", 1:n_objects))
# rownames(points_3d) <- object_names
#
# # Calculate geodesic distances via k-NN graph
# if(requireNamespace("RANN", quietly = TRUE)) {
# knn_graph <- RANN::nn2(points_3d, k = min(10, n_objects-1))$nn.idx
# adj_matrix <- matrix(0, n_objects, n_objects)
# for (i in 1:n_objects) {
# for (j_idx in 2:min(10, n_objects)) {
# if(j_idx <= ncol(knn_graph)) {
# j <- knn_graph[i, j_idx]
# dist_ij <- dist(rbind(points_3d[i,], points_3d[j,]))
# adj_matrix[i, j] <- dist_ij
# adj_matrix[j, i] <- dist_ij
# }
# }
# }
#
# g <- igraph::graph_from_adjacency_matrix(adj_matrix, mode = "undirected", weighted = TRUE)
# geodesic_distances <- igraph::distances(g)
# geodesic_distances[is.infinite(geodesic_distances)] <-
# max(geodesic_distances[is.finite(geodesic_distances)]) * 1.5
# } else {
# # Fallback to Euclidean if RANN not available
# geodesic_distances <- as.matrix(dist(points_3d))
# warning("RANN package not available. Using Euclidean distances as approximation.")
# }
#
# # Introduce "tunnels" or "short-circuits"
# euclidean_distances <- as.matrix(dist(points_3d))
# n_tunnels <- round(tunnel_fraction * n_objects * (n_objects - 1) / 2)
#
# upper_tri_indices <- which(upper.tri(geodesic_distances), arr.ind = TRUE)
# tunnel_indices <- sample(nrow(upper_tri_indices), min(n_tunnels, nrow(upper_tri_indices)))
#
# complete_distances <- geodesic_distances
# if(length(tunnel_indices) > 0) {
# for (k in tunnel_indices) {
# i <- upper_tri_indices[k, 1]
# j <- upper_tri_indices[k, 2]
# complete_distances[i, j] <- complete_distances[j, i] <- euclidean_distances[i, j]
# }
# }
# diag(complete_distances) <- 0
#
# # Introduce missing values
# total_unique_pairs <- n_objects * (n_objects - 1) / 2
# target_missing_pairs <- round(total_unique_pairs * missing_fraction)
#
# missing_pair_indices <- sample(nrow(upper_tri_indices),
# min(target_missing_pairs, nrow(upper_tri_indices)))
#
# incomplete_distances <- complete_distances
# if(length(missing_pair_indices) > 0) {
# incomplete_distances[upper_tri_indices[missing_pair_indices,]] <- NA
# incomplete_distances[upper_tri_indices[missing_pair_indices, c(2,1)]] <- NA
# }
#
# rownames(complete_distances) <- object_names
# colnames(complete_distances) <- object_names
# rownames(incomplete_distances) <- object_names
# colnames(incomplete_distances) <- object_names
#
# return(list(
# complete_distances = complete_distances,
# incomplete_distances = incomplete_distances,
# object_names = object_names,
# method = "Swiss Roll Manifold"
# ))
# }
## ----analysis_pipeline, eval=FALSE--------------------------------------------
# #' Comprehensive analysis pipeline for embedding method comparison
# #'
# #' @param data_gen_func Function to generate the dataset
# #' @param dataset_name Name for the dataset (for labeling)
# #' @param n_objects Number of objects in the dataset
# #' @param n_runs Number of runs for stochastic methods
# run_comprehensive_analysis <- function(data_gen_func, dataset_name, n_objects = 50, n_runs = 50) {
#
# cat("=== ANALYSIS FOR", toupper(dataset_name), "===\n")
#
# # Step 1: Data Generation
# cat("\n1. Generating", dataset_name, "dataset...\n")
# data_gen_output <- data_gen_func(n_objects = n_objects, missing_fraction = 0.3)
#
# complete_distances_for_evaluation <- data_gen_output$complete_distances
# incomplete_distances_for_embedding <- data_gen_output$incomplete_distances
# object_names <- data_gen_output$object_names
#
# actual_missing_percentage <- sum(is.na(incomplete_distances_for_embedding)) /
# (n_objects * (n_objects-1)) * 100
#
# data_quality <- calculate_data_quality_metrics(incomplete_distances_for_embedding)
#
# cat("Generated dataset with", n_objects, "objects and",
# round(actual_missing_percentage, 1), "% missing values\n")
#
# # Step 2: Assess Non-Euclidean Character
# cat("\n2. Assessing non-Euclidean character...\n")
#
# D_squared <- complete_distances_for_evaluation^2
# n <- nrow(D_squared)
# J <- diag(n) - (1/n) * matrix(1, n, n)
# B <- -0.5 * J %*% D_squared %*% J
# eigenvals <- eigen(B, symmetric = TRUE)$values
#
# numerical_tolerance <- 1e-12
# positive_eigenvals <- eigenvals[eigenvals > numerical_tolerance]
# negative_eigenvals <- eigenvals[eigenvals < -numerical_tolerance]
#
# total_variance <- sum(abs(eigenvals))
# negative_variance <- sum(abs(negative_eigenvals))
# positive_variance <- sum(positive_eigenvals)
#
# if(positive_variance > 0) {
# deviation_score <- negative_variance / positive_variance
# } else {
# deviation_score <- 1.0
# }
#
# negative_variance_fraction <- negative_variance / total_variance
#
# cumulative_positive_variance <- cumsum(positive_eigenvals) / positive_variance
# dims_for_90_percent <- which(cumulative_positive_variance >= 0.90)[1]
# if(is.na(dims_for_90_percent)) dims_for_90_percent <- length(positive_eigenvals)
#
# cat("Non-Euclidean deviation score:", round(deviation_score, 4), "\n")
# cat("Dimensions for 90% variance:", dims_for_90_percent, "\n")
#
# # Step 3: Parameter Optimization using Euclidify
# cat("\n3. Running automated parameter optimization...\n")
#
# # Create temporary directory for optimization
# temp_dir <- tempfile()
# dir.create(temp_dir, recursive = TRUE)
#
# euclidify_results <- tryCatch({
# Euclidify(
# dissimilarity_matrix = incomplete_distances_for_embedding,
# output_dir = temp_dir,
# ndim_range = c(2, min(10, dims_for_90_percent + 3)),
# n_initial_samples = 20, # Reduced for vignette speed
# n_adaptive_samples = 40, # Reduced for vignette speed
# folds = 20,
# mapping_max_iter = 300, # Reduced for speed
# clean_intermediate = TRUE,
# verbose = "off",
# fallback_to_defaults = TRUE,
# save_results = FALSE
# )
# }, error = function(e) {
# cat("Euclidify failed, using default parameters\n")
# NULL
# })
#
# # Clean up temp directory
# unlink(temp_dir, recursive = TRUE)
#
# if(!is.null(euclidify_results)) {
# optimal_params <- euclidify_results$optimal_params
# euclidify_positions <- euclidify_results$positions
# target_dims <- optimal_params$ndim
# cat("Optimal parameters found - dimensions:", target_dims, "\n")
# } else {
# # Fallback parameters
# target_dims <- max(2, min(5, dims_for_90_percent))
# optimal_params <- list(
# ndim = target_dims,
# k0 = 5.0,
# cooling_rate = 0.01,
# c_repulsion = 0.02
# )
# euclidify_positions <- NULL
# cat("Using fallback parameters - dimensions:", target_dims, "\n")
# }
#
# # Step 4: Three-Method Comparison
# cat("\n4. Running three-method comparison...\n")
#
# # Initialize storage
# topolow_results <- vector("list", n_runs)
# iterative_mds_results <- vector("list", n_runs)
# classical_mds_result <- NULL
#
# topolow_errors <- numeric(n_runs)
# iterative_mds_errors <- numeric(n_runs)
# classical_mds_error <- NA
#
# # Topolow runs
# cat("Running Topolow embeddings...\n")
# for(i in 1:n_runs) {
# if(i == 1 && !is.null(euclidify_positions)) {
# # Use Euclidify result for first run
# topolow_coords <- euclidify_positions
# } else {
# # Run new embedding
# topolow_result <- tryCatch({
# euclidean_embedding(
# dissimilarity_matrix = incomplete_distances_for_embedding,
# ndim = optimal_params$ndim,
# mapping_max_iter = 300,
# k0 = optimal_params$k0,
# cooling_rate = optimal_params$cooling_rate,
# c_repulsion = optimal_params$c_repulsion,
# relative_epsilon = 1e-6,
# convergence_counter = 3,
# verbose = FALSE
# )
# }, error = function(e) NULL)
#
# if(!is.null(topolow_result)) {
# topolow_coords <- topolow_result$positions
# } else {
# topolow_coords <- NULL
# }
# }
#
# if(!is.null(topolow_coords)) {
# topolow_coords <- topolow_coords[order(row.names(topolow_coords)), ]
#
# if(validate_coordinates(topolow_coords, "Topolow", n_objects, target_dims)) {
# topolow_coords <- scale(topolow_coords, center = TRUE, scale = FALSE)
# topolow_results[[i]] <- topolow_coords
#
# embedded_distances <- as.matrix(dist(topolow_coords))
# rownames(embedded_distances) <- rownames(topolow_coords)
# colnames(embedded_distances) <- rownames(topolow_coords)
#
# valid_mask <- !is.na(complete_distances_for_evaluation)
# distance_errors <- abs(complete_distances_for_evaluation[valid_mask] -
# embedded_distances[valid_mask])
# topolow_errors[i] <- sum(distance_errors)
# } else {
# topolow_results[[i]] <- NULL
# topolow_errors[i] <- NA
# }
# } else {
# topolow_results[[i]] <- NULL
# topolow_errors[i] <- NA
# }
# }
#
# # Classical MDS
# cat("Running Classical MDS...\n")
# imputation_result <- improved_missing_data_imputation(incomplete_distances_for_embedding)
# classical_mds_matrix <- imputation_result$matrix
#
# tryCatch({
# classical_mds_result_raw <- cmdscale(classical_mds_matrix, k = target_dims, eig = TRUE)
# classical_mds_coords <- classical_mds_result_raw$points
# rownames(classical_mds_coords) <- object_names
# classical_mds_coords <- classical_mds_coords[order(row.names(classical_mds_coords)), ]
#
# if(validate_coordinates(classical_mds_coords, "Classical MDS", n_objects, target_dims)) {
# classical_mds_coords <- scale(classical_mds_coords, center = TRUE, scale = FALSE)
# classical_mds_result <- classical_mds_coords
#
# embedded_distances <- as.matrix(dist(classical_mds_coords))
# rownames(embedded_distances) <- rownames(classical_mds_coords)
# colnames(embedded_distances) <- rownames(classical_mds_coords)
#
# valid_mask <- !is.na(complete_distances_for_evaluation)
# distance_errors <- abs(complete_distances_for_evaluation[valid_mask] -
# embedded_distances[valid_mask])
# classical_mds_error <- sum(distance_errors)
# }
# }, error = function(e) {
# cat("Classical MDS failed:", e$message, "\n")
# })
#
# # Iterative MDS
# cat("Running Iterative MDS...\n")
# for(i in 1:n_runs) {
# tryCatch({
# if(requireNamespace("smacof", quietly = TRUE)) {
# max_dist <- max(classical_mds_matrix, na.rm = TRUE)
# scaled_matrix <- classical_mds_matrix / max_dist
#
# iterative_mds_result_raw <- smacof::smacofSym(
# delta = scaled_matrix,
# ndim = target_dims,
# type = "interval",
# init = "torgerson",
# verbose = FALSE,
# itmax = 1000,
# eps = 1e-5
# )
#
# iterative_mds_coords <- iterative_mds_result_raw$conf
# current_max_dist <- max(dist(iterative_mds_coords))
# if(current_max_dist > 0) {
# scale_factor <- max_dist / current_max_dist
# iterative_mds_coords <- iterative_mds_coords * scale_factor
# }
# } else {
# # Fallback to isoMDS
# iterative_mds_result_raw <- MASS::isoMDS(classical_mds_matrix, k = target_dims, trace = FALSE)
# iterative_mds_coords <- iterative_mds_result_raw$points
# }
#
# rownames(iterative_mds_coords) <- object_names
# iterative_mds_coords <- iterative_mds_coords[order(row.names(iterative_mds_coords)), ]
#
# if(validate_coordinates(iterative_mds_coords, "Iterative MDS", n_objects, target_dims)) {
# iterative_mds_coords <- scale(iterative_mds_coords, center = TRUE, scale = FALSE)
# iterative_mds_results[[i]] <- iterative_mds_coords
#
# embedded_distances <- as.matrix(dist(iterative_mds_coords))
# rownames(embedded_distances) <- rownames(iterative_mds_coords)
# colnames(embedded_distances) <- rownames(iterative_mds_coords)
#
# valid_mask <- !is.na(complete_distances_for_evaluation)
# distance_errors <- abs(complete_distances_for_evaluation[valid_mask] -
# embedded_distances[valid_mask])
# iterative_mds_errors[i] <- sum(distance_errors)
# } else {
# iterative_mds_results[[i]] <- NULL
# iterative_mds_errors[i] <- NA
# }
#
# }, error = function(e) {
# iterative_mds_results[[i]] <- NULL
# iterative_mds_errors[i] <- NA
# })
# }
#
# # Step 5: Compile Results
# valid_topolow_results <- !is.na(topolow_errors)
# valid_iterative_results <- !is.na(iterative_mds_errors)
#
# results <- list(
# dataset_name = dataset_name,
# data_characteristics = list(
# n_objects = n_objects,
# missing_percentage = actual_missing_percentage,
# deviation_score = deviation_score,
# dims_90_percent = dims_for_90_percent,
# negative_variance_fraction = negative_variance_fraction
# ),
# optimal_params = optimal_params,
# topolow_errors = topolow_errors,
# topolow_results = topolow_results,
# valid_topolow_results = valid_topolow_results,
# classical_mds_error = classical_mds_error,
# classical_mds_result = classical_mds_result,
# iterative_mds_errors = iterative_mds_errors,
# iterative_mds_results = iterative_mds_results,
# valid_iterative_results = valid_iterative_results,
# complete_distances = complete_distances_for_evaluation,
# incomplete_distances = incomplete_distances_for_embedding,
# target_dims = target_dims
# )
#
# return(results)
# }
## ----distorted_analysis, eval=FALSE, results='hide', fig.show='hold'----------
# # Run analysis for Synthesized non-Euclidean data
# distorted_results <- run_comprehensive_analysis(
# generate_distorted_euclidean_data,
# "Synthesized non-Euclidean",
# n_objects = 50,
# n_runs = 50
# )
## ----distorted_summary, eval=FALSE, echo=FALSE--------------------------------
# cat("=== Synthesized non-Euclidean DATA ANALYSIS SUMMARY ===\n")
# cat("Dataset characteristics:\n")
# cat("- Objects:", distorted_results$data_characteristics$n_objects, "\n")
# cat("- Missing data:", round(distorted_results$data_characteristics$missing_percentage, 1), "%\n")
# cat("- Non-Euclidean deviation score:", round(distorted_results$data_characteristics$deviation_score, 4), "\n")
# cat("- Dimensions for 90% variance:", distorted_results$data_characteristics$dims_90_percent, "\n")
# cat("- Target embedding dimensions:", distorted_results$target_dims, "\n\n")
#
# cat("Performance Summary:\n")
# if(sum(distorted_results$valid_topolow_results) > 0) {
# cat("- Topolow mean error:",
# round(mean(distorted_results$topolow_errors[distorted_results$valid_topolow_results]), 2),
# "±", round(sd(distorted_results$topolow_errors[distorted_results$valid_topolow_results]), 2), "\n")
# }
# if(!is.na(distorted_results$classical_mds_error)) {
# cat("- Classical MDS error:", round(distorted_results$classical_mds_error, 2), "\n")
# }
# if(sum(distorted_results$valid_iterative_results) > 0) {
# cat("- Iterative MDS mean error:", round(mean(distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results]), 2),
# "±", round(sd(distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results]), 2), "\n")
# }
## ----distorted_eigenvalues, eval=FALSE, echo=FALSE----------------------------
# # Create eigenvalue visualization for distorted data
# D_squared <- distorted_results$complete_distances^2
# n <- nrow(D_squared)
# J <- diag(n) - (1/n) * matrix(1, n, n)
# B <- -0.5 * J %*% D_squared %*% J
# eigenvals <- eigen(B, symmetric = TRUE)$values
#
# eigenval_df <- data.frame(
# index = 1:length(eigenvals),
# eigenvalue = eigenvals,
# type = ifelse(eigenvals > 1e-12, "Positive",
# ifelse(eigenvals < -1e-12, "Negative", "Zero"))
# )
#
# p_eigen_distorted <- ggplot(eigenval_df, aes(x = index, y = eigenvalue, fill = type)) +
# geom_bar(stat = "identity", alpha = 0.8) +
# geom_hline(yintercept = 0, linetype = "dashed", linewidth = 1, color = "black") +
# scale_fill_manual(values = c("Positive" = "steelblue", "Negative" = "red", "Zero" = "gray")) +
# labs(
# title = "Eigenvalue Spectrum: Synthesized non-Euclidean Data",
# subtitle = paste("Deviation Score:", round(distorted_results$data_characteristics$deviation_score, 4),
# "| Non-Euclidean Variance:", round(distorted_results$data_characteristics$negative_variance_fraction * 100, 1), "%"),
# x = "Eigenvalue Index (sorted descending)",
# y = "Eigenvalue",
# fill = "Eigenvalue Type"
# ) +
# theme_minimal() +
# theme(
# legend.position = "bottom",
# plot.title = element_text(size = 12, face = "bold"),
# panel.grid.minor = element_blank()
# )
#
# print(p_eigen_distorted)
## ----distorted_eigenvalues-1, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distorted_eigenvalues-1.png")
## ----distorted_performance, eval=FALSE, echo=FALSE----------------------------
# # Create performance comparison for distorted data
# results_df_distorted <- data.frame()
#
# if(sum(distorted_results$valid_topolow_results) > 0) {
# topolow_df <- data.frame(
# Run = which(distorted_results$valid_topolow_results),
# Method = "Topolow",
# Total_Error = distorted_results$topolow_errors[distorted_results$valid_topolow_results],
# Method_Type = "Stochastic",
# Dataset = "Synthesized non-Euclidean"
# )
# results_df_distorted <- rbind(results_df_distorted, topolow_df)
# }
#
# if(!is.na(distorted_results$classical_mds_error)) {
# classical_df <- data.frame(
# Run = 1,
# Method = "Classical MDS",
# Total_Error = distorted_results$classical_mds_error,
# Method_Type = "Deterministic",
# Dataset = "Synthesized non-Euclidean"
# )
# results_df_distorted <- rbind(results_df_distorted, classical_df)
# }
#
# if(sum(distorted_results$valid_iterative_results) > 0) {
# iterative_df <- data.frame(
# Run = which(distorted_results$valid_iterative_results),
# Method = "Iterative MDS",
# Total_Error = distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results],
# Method_Type = "Stochastic",
# Dataset = "Synthesized non-Euclidean"
# )
# results_df_distorted <- rbind(results_df_distorted, iterative_df)
# }
#
# if(nrow(results_df_distorted) > 0) {
# p_performance_distorted <- ggplot(results_df_distorted, aes(x = Method, y = Total_Error, fill = Method_Type)) +
# geom_boxplot(alpha = 0.7, outlier.shape = NA) +
# geom_jitter(width = 0.2, alpha = 0.7, size = 2.5) +
# scale_fill_manual(values = c("Stochastic" = "steelblue", "Deterministic" = "darkorange")) +
# labs(
# title = "Embedding Performance: Synthesized non-Euclidean Data",
# subtitle = paste("Non-Euclidean Score:", round(distorted_results$data_characteristics$deviation_score, 4),
# "| Missing Data:", round(distorted_results$data_characteristics$missing_percentage, 1), "%"),
# y = "Sum of Absolute Distance Errors (L1 Norm)",
# x = "Method",
# fill = "Method Type"
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(size = 12, face = "bold"),
# panel.grid.minor = element_blank(),
# legend.position = "bottom"
# )
#
# print(p_performance_distorted)
# }
## ----distorted_performance-1, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distorted_performance-1.png")
## ----swiss_analysis, eval=FALSE, results='hide', fig.show='hold'--------------
# # Run analysis for Swiss Roll data
# swiss_results <- run_comprehensive_analysis(
# generate_swiss_roll_data,
# "Swiss Roll Manifold",
# n_objects = 50,
# n_runs = 50
# )
## ----swiss_summary, eval=FALSE, echo=FALSE------------------------------------
# cat("=== SWISS ROLL MANIFOLD DATA ANALYSIS SUMMARY ===\n")
# cat("Dataset characteristics:\n")
# cat("- Objects:", swiss_results$data_characteristics$n_objects, "\n")
# cat("- Missing data:", round(swiss_results$data_characteristics$missing_percentage, 1), "%\n")
# cat("- Non-Euclidean deviation score:", round(swiss_results$data_characteristics$deviation_score, 4), "\n")
# cat("- Dimensions for 90% variance:", swiss_results$data_characteristics$dims_90_percent, "\n")
# cat("- Target embedding dimensions:", swiss_results$target_dims, "\n\n")
#
# cat("Performance Summary:\n")
# if(sum(swiss_results$valid_topolow_results) > 0) {
# cat("- Topolow mean error:", round(mean(swiss_results$topolow_errors[swiss_results$valid_topolow_results]), 2),
# "±", round(sd(swiss_results$topolow_errors[swiss_results$valid_topolow_results]), 2), "\n")
# }
# if(!is.na(swiss_results$classical_mds_error)) {
# cat("- Classical MDS error:", round(swiss_results$classical_mds_error, 2), "\n")
# }
# if(sum(swiss_results$valid_iterative_results) > 0) {
# cat("- Iterative MDS mean error:", round(mean(swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results]), 2),
# "±", round(sd(swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results]), 2), "\n")
# }
## ----swiss_eigenvalues, eval=FALSE, echo=FALSE--------------------------------
# # Create eigenvalue visualization for Swiss Roll data
# D_squared <- swiss_results$complete_distances^2
# n <- nrow(D_squared)
# J <- diag(n) - (1/n) * matrix(1, n, n)
# B <- -0.5 * J %*% D_squared %*% J
# eigenvals <- eigen(B, symmetric = TRUE)$values
#
# eigenval_df <- data.frame(
# index = 1:length(eigenvals),
# eigenvalue = eigenvals,
# type = ifelse(eigenvals > 1e-12, "Positive",
# ifelse(eigenvals < -1e-12, "Negative", "Zero"))
# )
#
# p_eigen_swiss <- ggplot(eigenval_df, aes(x = index, y = eigenvalue, fill = type)) +
# geom_bar(stat = "identity", alpha = 0.8) +
# geom_hline(yintercept = 0, linetype = "dashed", linewidth = 1, color = "black") +
# scale_fill_manual(values = c("Positive" = "steelblue", "Negative" = "red", "Zero" = "gray")) +
# labs(
# title = "Eigenvalue Spectrum: Swiss Roll Manifold Data",
# subtitle = paste("Deviation Score:", round(swiss_results$data_characteristics$deviation_score, 4),
# "| Non-Euclidean Variance:", round(swiss_results$data_characteristics$negative_variance_fraction * 100, 1), "%"),
# x = "Eigenvalue Index (sorted descending)",
# y = "Eigenvalue",
# fill = "Eigenvalue Type"
# ) +
# theme_minimal() +
# theme(
# legend.position = "bottom",
# plot.title = element_text(size = 12, face = "bold"),
# panel.grid.minor = element_blank()
# )
#
# print(p_eigen_swiss)
## ----swiss_eigenvalues-1, echo=FALSE, out.width="100%"------------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/swiss_eigenvalues-1.png")
## ----swiss_performance, eval=FALSE, echo=FALSE--------------------------------
# # Create performance comparison for Swiss Roll data
# results_df_swiss <- data.frame()
#
# if(sum(swiss_results$valid_topolow_results) > 0) {
# topolow_df <- data.frame(
# Run = which(swiss_results$valid_topolow_results),
# Method = "Topolow",
# Total_Error = swiss_results$topolow_errors[swiss_results$valid_topolow_results],
# Method_Type = "Stochastic",
# Dataset = "Swiss Roll"
# )
# results_df_swiss <- rbind(results_df_swiss, topolow_df)
# }
#
# if(!is.na(swiss_results$classical_mds_error)) {
# classical_df <- data.frame(
# Run = 1,
# Method = "Classical MDS",
# Total_Error = swiss_results$classical_mds_error,
# Method_Type = "Deterministic",
# Dataset = "Swiss Roll"
# )
# results_df_swiss <- rbind(results_df_swiss, classical_df)
# }
#
# if(sum(swiss_results$valid_iterative_results) > 0) {
# iterative_df <- data.frame(
# Run = which(swiss_results$valid_iterative_results),
# Method = "Iterative MDS",
# Total_Error = swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results],
# Method_Type = "Stochastic",
# Dataset = "Swiss Roll"
# )
# results_df_swiss <- rbind(results_df_swiss, iterative_df)
# }
#
# if(nrow(results_df_swiss) > 0) {
# p_performance_swiss <- ggplot(results_df_swiss, aes(x = Method, y = Total_Error, fill = Method_Type)) +
# geom_boxplot(alpha = 0.7, outlier.shape = NA) +
# geom_jitter(width = 0.2, alpha = 0.7, size = 2.5) +
# scale_fill_manual(values = c("Stochastic" = "steelblue", "Deterministic" = "darkorange")) +
# labs(
# title = "Embedding Performance: Swiss Roll Manifold Data",
# subtitle = paste("Non-Euclidean Score:", round(swiss_results$data_characteristics$deviation_score, 4),
# "| Missing Data:", round(swiss_results$data_characteristics$missing_percentage, 1), "%"),
# y = "Sum of Absolute Distance Errors (L1 Norm)",
# x = "Method",
# fill = "Method Type"
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(size = 12, face = "bold"),
# panel.grid.minor = element_blank(),
# legend.position = "bottom"
# )
#
# print(p_performance_swiss)
# }
## ----swiss_performance-1, echo=FALSE, out.width="100%"------------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/swiss_performance-1.png")
## ----combined_analysis, eval=FALSE, echo=FALSE--------------------------------
# # Combine results from both datasets
# combined_results <- rbind(
# if(nrow(results_df_distorted) > 0) results_df_distorted else data.frame(),
# if(nrow(results_df_swiss) > 0) results_df_swiss else data.frame()
# )
#
# if(nrow(combined_results) > 0) {
# # Overall performance comparison
# p_combined_performance <- ggplot(combined_results, aes(x = Method, y = Total_Error)) +
# geom_boxplot(alpha = 0.7, fill = "lightblue") +
# geom_point(position = position_jitter(width = 0.2),
# size = 2, alpha = 0.8, color = "darkblue") +
# facet_wrap(~ Dataset, scales = "free_y") +
# labs(
# title = "Comparative Performance Across Dataset Types",
# subtitle = "Error comparison for three methods on Synthesized non-Euclidean and Swiss Roll manifold data",
# y = "Sum of Absolute Distance Errors (L1 Norm)",
# x = "Method"
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(size = 12, face = "bold"),
# panel.grid.minor = element_blank(),
# strip.text = element_text(face = "bold", size = 10),
# axis.text.x = element_text(angle = 45, hjust = 1)
# )
#
# print(p_combined_performance)
#
# # Summary statistics by method and dataset
# summary_stats <- combined_results %>%
# group_by(Method, Dataset, Method_Type) %>%
# summarise(
# Count = n(),
# Mean_Error = mean(Total_Error),
# SD_Error = sd(Total_Error),
# Min_Error = min(Total_Error),
# Max_Error = max(Total_Error),
# .groups = 'drop'
# )
#
# cat("\n=== COMBINED PERFORMANCE SUMMARY ===\n")
# print(summary_stats)
# }
## ----combined_analysis-1, echo=FALSE, out.width="100%"------------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/combined_analysis-1.png")
## ----statistical_analysis, eval=FALSE, echo=FALSE-----------------------------
# cat("\n=== STATISTICAL ANALYSIS ===\n")
#
# # Statistical comparisons for Synthesized non-Euclidean data
# if(sum(distorted_results$valid_topolow_results) > 0 &&
# sum(distorted_results$valid_iterative_results) > 0) {
# cat("\nSynthesized non-Euclidean Data:\n")
# enhanced_statistical_comparison(
# distorted_results$topolow_errors[distorted_results$valid_topolow_results],
# distorted_results$iterative_mds_errors[distorted_results$valid_iterative_results],
# "Iterative MDS"
# )
# }
#
# # Statistical comparisons for Swiss Roll data
# if(sum(swiss_results$valid_topolow_results) > 0 &&
# sum(swiss_results$valid_iterative_results) > 0) {
# cat("\nSwiss Roll Manifold Data:\n")
# enhanced_statistical_comparison(
# swiss_results$topolow_errors[swiss_results$valid_topolow_results],
# swiss_results$iterative_mds_errors[swiss_results$valid_iterative_results],
# "Iterative MDS"
# )
# }
## ----distance_preservation, eval=FALSE, echo=FALSE----------------------------
# # Distance preservation analysis for both datasets
# analyze_distance_preservation <- function(results, dataset_name) {
# cat("\n=== DISTANCE PRESERVATION:", toupper(dataset_name), "===\n")
#
# # Get best performing runs
# best_methods <- list()
#
# if(sum(results$valid_topolow_results) > 0) {
# best_topolow_idx <- which.min(results$topolow_errors[results$valid_topolow_results])
# actual_topolow_idx <- which(results$valid_topolow_results)[best_topolow_idx]
# best_methods$topolow <- results$topolow_results[[actual_topolow_idx]]
# }
#
# if(!is.na(results$classical_mds_error)) {
# best_methods$classical <- results$classical_mds_result
# }
#
# if(sum(results$valid_iterative_results) > 0) {
# best_iterative_idx <- which.min(results$iterative_mds_errors[results$valid_iterative_results])
# actual_iterative_idx <- which(results$valid_iterative_results)[best_iterative_idx]
# best_methods$iterative <- results$iterative_mds_results[[actual_iterative_idx]]
# }
#
# if(length(best_methods) > 0) {
# # Calculate correlations
# upper_tri_mask <- upper.tri(results$complete_distances)
# true_distances_vector <- results$complete_distances[upper_tri_mask]
#
# distance_comparison_list <- list()
# preservation_stats <- data.frame()
#
# for(method_name in names(best_methods)) {
# embedded_distances <- as.matrix(dist(best_methods[[method_name]]))
# embedded_distances_vector <- embedded_distances[upper_tri_mask]
#
# correlation <- cor(true_distances_vector, embedded_distances_vector, use = "complete.obs")
# rsq <- summary(lm(embedded_distances_vector ~ true_distances_vector))$r.squared
#
# method_display_name <- switch(method_name,
# "topolow" = "Topolow",
# "classical" = "Classical MDS",
# "iterative" = "Iterative MDS")
#
# preservation_stats <- rbind(preservation_stats, data.frame(
# Method = method_display_name,
# Dataset = dataset_name,
# Correlation = correlation,
# R_squared = rsq
# ))
#
# distance_comparison_list[[method_name]] <- data.frame(
# True_Distance = true_distances_vector,
# Embedded_Distance = embedded_distances_vector,
# Method = method_display_name,
# Dataset = dataset_name
# )
# }
#
# distance_comparison_df <- do.call(rbind, distance_comparison_list)
#
# # Create distance preservation plot
# p_distance_preservation <- ggplot(distance_comparison_df,
# aes(x = True_Distance, y = Embedded_Distance, color = Method)) +
# geom_point(alpha = 0.6, size = 1.5) +
# geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
# geom_abline(intercept = 0, slope = 1, color = "black", linetype = "dashed", alpha = 0.7) +
# facet_wrap(~Method, scales = "free_x") +
# coord_cartesian(ylim = c(0, NA)) +
# labs(
# title = paste("Distance Preservation:", dataset_name),
# x = "Original Non-Euclidean Distance",
# y = "Embedded Euclidean Distance"
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(size = 12, face = "bold"),
# legend.position = "none",
# panel.grid.minor = element_blank()
# )
#
# print(p_distance_preservation)
#
# cat("\nDistance Preservation Statistics:\n")
# print(preservation_stats)
#
# return(preservation_stats)
# }
# }
#
# # Analyze both datasets
# distorted_preservation <- analyze_distance_preservation(distorted_results, "Synthesized non-Euclidean")
# swiss_preservation <- analyze_distance_preservation(swiss_results, "Swiss Roll")
## ----distance_preservation-1, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distance_preservation-1.png")
## ----distance_preservation-2, echo=FALSE, out.width="100%"--------------------
# Include pre-generated figure from man/figures
knitr::include_graphics("../man/figures/distance_preservation-2.png")
## ----preservation_summary, eval=FALSE, echo=FALSE-----------------------------
# if(exists("distorted_preservation") && exists("swiss_preservation")) {
# combined_preservation <- rbind(distorted_preservation, swiss_preservation)
#
# cat("Distance Preservation Summary (Correlation with Original Distances):\n")
# print(combined_preservation)
#
# # Calculate mean correlations by method
# mean_correlations <- combined_preservation %>%
# group_by(Method) %>%
# summarise(Mean_Correlation = mean(Correlation, na.rm = TRUE),
# Mean_R_squared = mean(R_squared, na.rm = TRUE),
# .groups = 'drop')
#
# cat("\nMean Performance Across Dataset Types:\n")
# print(mean_correlations)
# }
## ----session_info, echo=FALSE-------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.