dataSDA_intro.R
In dataSDA: Datasets and Basic Statistics for Symbolic Data Analysis

## ----setup, include = FALSE---------------------------------------------------
not_cran <- identical(Sys.getenv("NOT_CRAN"), "true")

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  echo = TRUE,
  eval = not_cran,
  warning = FALSE,
  message = FALSE,
  out.width = "100%",
  fig.align = "center"
)
library(knitr)
library(dataSDA)
library(RSDA)
library(HistDAWass)

has_symbolicDA <- requireNamespace("symbolicDA", quietly = TRUE)
has_MAINT <- requireNamespace("MAINT.Data", quietly = TRUE)
has_e1071 <- requireNamespace("e1071", quietly = TRUE)
has_ggInterval <- requireNamespace("ggInterval", quietly = TRUE) && not_cran

## ----dataset-summary, echo = FALSE--------------------------------------------
# # Use installed package data dir; fall back to source tree when building locally
# data_dir <- system.file("data", package = "dataSDA")
# if (!nzchar(data_dir) || length(list.files(data_dir, pattern = "[.]rda$")) == 0) {
#   data_dir <- file.path("..", "data")
# }
# 
# data_files <- list.files(data_dir, pattern = "[.]rda$")
# data_names <- sub("[.]rda$", "", data_files)
# 
# classify_type <- function(name) {
#   if (grepl("[.]its$", name)) {
#     return("Interval Time Series (.its)")
#   }
#   if (grepl("[.]int[.]mm$", name)) {
#     return("Interval (.int.mm)")
#   }
#   if (grepl("[.]int$", name)) {
#     return("Interval (.int)")
#   }
#   if (grepl("[.]iGAP$", name)) {
#     return("Interval (.iGAP)")
#   }
#   if (grepl("[.]hist$", name)) {
#     return("Histogram (.hist)")
#   }
#   if (grepl("[.]distr$", name)) {
#     return("Distributional (.distr)")
#   }
#   if (grepl("[.]mix$", name)) {
#     return("Mixed (.mix)")
#   }
#   if (grepl("[.]modal$", name)) {
#     return("Modal (.modal)")
#   }
#   "Other"
# }
# 
# types <- sapply(data_names, classify_type, USE.NAMES = FALSE)
# type_tbl <- as.data.frame(table(Type = types), stringsAsFactors = FALSE)
# type_tbl <- type_tbl[order(-type_tbl$Freq), ]
# names(type_tbl)[2] <- "Datasets"
# #kable(type_tbl,
# #  row.names = FALSE,
# #  caption = "Dataset counts by type"
# #)

## -----------------------------------------------------------------------------
# data(mushroom.int)
# head(mushroom.int, 3)
# class(mushroom.int)

## -----------------------------------------------------------------------------
# data(abalone.int)
# head(abalone.int, 3)
# class(abalone.int)

## -----------------------------------------------------------------------------
# data(abalone.iGAP)
# head(abalone.iGAP, 3)
# class(abalone.iGAP)

## -----------------------------------------------------------------------------
# int_detect_format(mushroom.int)
# int_detect_format(abalone.int)
# int_detect_format(abalone.iGAP)

## -----------------------------------------------------------------------------
# int_list_conversions()

## -----------------------------------------------------------------------------
# # RSDA to MM
# mushroom.MM <- int_convert_format(mushroom.int, to = "MM")
# head(mushroom.MM, 3)

## -----------------------------------------------------------------------------
# # iGAP to MM
# abalone.MM <- int_convert_format(abalone.iGAP, to = "MM")
# head(abalone.MM, 3)

## -----------------------------------------------------------------------------
# # iGAP to RSDA
# data(face.iGAP)
# face.RSDA <- int_convert_format(face.iGAP, to = "RSDA")
# head(face.RSDA, 3)

## -----------------------------------------------------------------------------
# # RSDA to MM
# mushroom.MM <- RSDA_to_MM(mushroom.int, RSDA = TRUE)
# head(mushroom.MM, 3)

## -----------------------------------------------------------------------------
# # MM to iGAP
# mushroom.iGAP <- MM_to_iGAP(mushroom.MM)
# head(mushroom.iGAP, 3)

## -----------------------------------------------------------------------------
# # iGAP to MM
# data(face.iGAP)
# face.MM <- iGAP_to_MM(face.iGAP, location = 1:6)
# head(face.MM, 3)

## -----------------------------------------------------------------------------
# # MM to RSDA
# face.RSDA <- MM_to_RSDA(face.MM)
# head(face.RSDA, 3)
# class(face.RSDA)

## -----------------------------------------------------------------------------
# # iGAP to RSDA (direct, one-step)
# abalone.RSDA <- iGAP_to_RSDA(abalone.iGAP, location = 1:7)
# head(abalone.RSDA, 3)
# class(abalone.RSDA)

## -----------------------------------------------------------------------------
# # RSDA to iGAP
# mushroom.iGAP2 <- RSDA_to_iGAP(mushroom.int)
# head(mushroom.iGAP2, 3)

## -----------------------------------------------------------------------------
# data(mushroom.int.mm)
# head(mushroom.int.mm, 3)

## -----------------------------------------------------------------------------
# mushroom_set <- set_variable_format(
#   data = mushroom.int.mm, location = 8,
#   var = "Species"
# )
# head(mushroom_set, 3)

## -----------------------------------------------------------------------------
# mushroom_tmp <- RSDA_format(
#   data = mushroom_set,
#   sym_type1 = c("I", "I", "I", "S"),
#   location = c(25, 27, 29, 31),
#   sym_type2 = c("S"),
#   var = c("Species")
# )
# head(mushroom_tmp, 3)

## -----------------------------------------------------------------------------
# mushroom_clean <- clean_colnames(data = mushroom_tmp)
# head(mushroom_clean, 3)

## -----------------------------------------------------------------------------
# write_symbolic_csv(mushroom_clean, file = "mushroom_interval.csv")
# mushroom_int <- read_symbolic_csv(file = "mushroom_interval.csv")
# head(mushroom_int, 3)
# class(mushroom_int)

## ----include=FALSE------------------------------------------------------------
# file.remove("mushroom_interval.csv")

## -----------------------------------------------------------------------------
# BLOOD[1:3, 1:2]

## -----------------------------------------------------------------------------
# A1 <- c(50, 60, 70, 80, 90, 100, 110, 120)
# B1 <- c(0.00, 0.02, 0.08, 0.32, 0.62, 0.86, 0.92, 1.00)
# A2 <- c(50, 60, 70, 80, 90, 100, 110, 120)
# B2 <- c(0.00, 0.05, 0.12, 0.42, 0.68, 0.88, 0.94, 1.00)
# A3 <- c(50, 60, 70, 80, 90, 100, 110, 120)
# B3 <- c(0.00, 0.03, 0.24, 0.36, 0.75, 0.85, 0.98, 1.00)
# 
# ListOfWeight <- list(
#   distributionH(A1, B1),
#   distributionH(A2, B2),
#   distributionH(A3, B3)
# )
# 
# Weight <- methods::new("MatH",
#   nrows = 3, ncols = 1, ListOfDist = ListOfWeight,
#   names.rows = c("20s", "30s", "40s"),
#   names.cols = c("weight"), by.row = FALSE
# )
# Weight

## -----------------------------------------------------------------------------
# data(mushroom.int)
# var_name <- c("Stipe.Length", "Stipe.Thickness")
# int_mean(mushroom.int, var_name, method = c("CM", "FV", "EJD"))

## -----------------------------------------------------------------------------
# data(mushroom.int)
# var_name <- c("Pileus.Cap.Width", "Stipe.Length")
# method <- c("CM", "VM", "QM", "SE", "FV", "EJD", "GQ", "SPT")
# 
# mean_mat <- int_mean(mushroom.int, var_name, method)
# mean_mat
# 
# var_mat <- int_var(mushroom.int, var_name, method)
# var_mat

## ----fig.width=7, fig.height=8------------------------------------------------
# cols <- c("#4E79A7", "#F28E2B")
# par(mfrow = c(2, 1), mar = c(5, 4, 3, 6), las = 2, xpd = TRUE)
# 
# # --- Mean across eight methods ---
# bp <- barplot(t(mean_mat),
#   beside = TRUE, col = cols,
#   main = "Interval Mean by Method (mushroom.int)",
#   ylab = "Mean",
#   ylim = c(0, max(mean_mat) * 1.25)
# )
# legend("topright",
#   inset = c(-0.18, 0),
#   legend = colnames(mean_mat), fill = cols, bty = "n", cex = 0.85
# )
# 
# # --- Variance across eight methods ---
# bp <- barplot(t(var_mat),
#   beside = TRUE, col = cols,
#   main = "Interval Variance by Method (mushroom.int)",
#   ylab = "Variance",
#   ylim = c(0, max(var_mat) * 1.25)
# )
# legend("topright",
#   inset = c(-0.18, 0),
#   legend = colnames(var_mat), fill = cols, bty = "n", cex = 0.85
# )

## -----------------------------------------------------------------------------
# cov_list <- int_cov(mushroom.int, "Pileus.Cap.Width", "Stipe.Length", method)
# cor_list <- int_cor(mushroom.int, "Pileus.Cap.Width", "Stipe.Length", method)
# 
# # Collect scalar values into named vectors for display and plotting
# cov_vec <- sapply(cov_list, function(x) x[1, 1])
# cor_vec <- sapply(cor_list, function(x) x[1, 1])
# 
# data.frame(
#   Method = names(cov_vec), Covariance = round(cov_vec, 4),
#   Correlation = round(cor_vec, 4), row.names = NULL
# )

## ----fig.width=7, fig.height=8------------------------------------------------
# par(mfrow = c(2, 1), mar = c(5, 4, 3, 1), las = 2)
# 
# # --- Covariance across eight methods ---
# bar_cols <- c(
#   "#4E79A7", "#59A14F", "#F28E2B", "#E15759",
#   "#76B7B2", "#EDC948", "#B07AA1", "#FF9DA7"
# )
# bp <- barplot(cov_vec,
#   col = bar_cols, border = NA,
#   main = "Cov(Pileus.Cap.Width, Stipe.Length) by Method",
#   ylab = "Covariance",
#   ylim = c(0, max(cov_vec) * 1.25)
# )
# text(bp, cov_vec, labels = round(cov_vec, 2), pos = 3, cex = 0.8)
# 
# # --- Correlation across eight methods ---
# bp <- barplot(cor_vec,
#   col = bar_cols, border = NA,
#   main = "Cor(Pileus.Cap.Width, Stipe.Length) by Method",
#   ylab = "Correlation",
#   ylim = c(0, 1.15)
# )
# text(bp, cor_vec, labels = round(cor_vec, 2), pos = 3, cex = 0.8)
# abline(h = 1, lty = 2, col = "grey50")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Width = upper - lower
# head(int_width(mushroom.int, "Stipe.Length"))
# 
# # Radius = width / 2
# head(int_radius(mushroom.int, "Stipe.Length"))
# 
# # Center = (lower + upper) / 2
# head(int_center(mushroom.int, "Stipe.Length"))
# 
# # Midrange
# head(int_midrange(mushroom.int, "Stipe.Length"))

## -----------------------------------------------------------------------------
# # Overlap between two interval variables
# head(int_overlap(mushroom.int, "Stipe.Length", "Stipe.Thickness"))
# 
# # Containment: proportion of var_name2 contained within var_name1
# head(int_containment(mushroom.int, "Stipe.Length", "Stipe.Thickness"))

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Median (default method = "CM")
# int_median(mushroom.int, "Stipe.Length")
# 
# # Quantiles
# int_quantile(mushroom.int, "Stipe.Length", probs = c(0.25, 0.5, 0.75))
# 
# # Compare median across methods
# int_median(mushroom.int, "Stipe.Length", method = c("CM", "FV"))

## -----------------------------------------------------------------------------
# # Range (max - min)
# int_range(mushroom.int, "Stipe.Length")
# 
# # Interquartile range (Q3 - Q1)
# int_iqr(mushroom.int, "Stipe.Length")
# 
# # Median absolute deviation
# int_mad(mushroom.int, "Stipe.Length")
# 
# # Mode (histogram-based estimation)
# int_mode(mushroom.int, "Stipe.Length")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Compare standard mean vs trimmed mean (10% trim)
# int_mean(mushroom.int, "Stipe.Length", method = "CM")
# int_trimmed_mean(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")
# 
# # Winsorized mean: extreme values are replaced (not removed)
# int_winsorized_mean(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")

## -----------------------------------------------------------------------------
# int_var(mushroom.int, "Stipe.Length", method = "CM")
# int_trimmed_var(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")
# int_winsorized_var(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Skewness: asymmetry of the distribution
# int_skewness(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Kurtosis: tail heaviness
# int_kurtosis(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Symmetry coefficient
# int_symmetry(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Tailedness (related to kurtosis)
# int_tailedness(mushroom.int, "Stipe.Length", method = "CM")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# int_jaccard(mushroom.int, "Stipe.Length", "Stipe.Thickness")
# int_dice(mushroom.int, "Stipe.Length", "Stipe.Thickness")
# int_cosine(mushroom.int, "Stipe.Length", "Stipe.Thickness")
# int_overlap_coefficient(mushroom.int, "Stipe.Length", "Stipe.Thickness")

## -----------------------------------------------------------------------------
# int_tanimoto(mushroom.int, "Stipe.Length", "Stipe.Thickness")

## -----------------------------------------------------------------------------
# int_similarity_matrix(mushroom.int, method = "jaccard")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Shannon entropy (higher = more uncertainty)
# int_entropy(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Coefficient of variation (SD / mean)
# int_cv(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Dispersion index
# int_dispersion(mushroom.int, "Stipe.Length", method = "CM")

## -----------------------------------------------------------------------------
# # Imprecision: based on interval widths
# int_imprecision(mushroom.int, "Stipe.Length")
# 
# # Granularity: variability in interval sizes
# int_granularity(mushroom.int, "Stipe.Length")
# 
# # Uniformity: inverse of granularity (higher = more uniform)
# int_uniformity(mushroom.int, "Stipe.Length")
# 
# # Normalized information content (between 0 and 1)
# int_information_content(mushroom.int, "Stipe.Length", method = "CM")

## -----------------------------------------------------------------------------
# data(car.int)
# car_num <- car.int[, 2:5]
# head(car_num, 3)

## -----------------------------------------------------------------------------
# # Euclidean distance between observations
# int_dist(car_num, method = "euclidean")

## -----------------------------------------------------------------------------
# # Return as a full matrix
# dm <- int_dist_matrix(car_num, method = "hausdorff")
# dm[1:5, 1:5]

## -----------------------------------------------------------------------------
# int_pairwise_dist(car_num, "Price", "Max_Velocity", method = "euclidean")

## -----------------------------------------------------------------------------
# all_dists <- int_dist_all(car_num)
# names(all_dists)

## -----------------------------------------------------------------------------
# all_methods <- c("BG", "BD", "B", "L2W")
# var_names <- c("Cholesterol", "Hemoglobin")
# 
# # Compute mean for each variable and method
# mean_mat <- sapply(all_methods, function(m) {
#   sapply(var_names, function(v) hist_mean(BLOOD, v, method = m))
# })
# rownames(mean_mat) <- var_names
# mean_mat
# 
# # Compute variance for each variable and method
# var_mat <- sapply(all_methods, function(m) {
#   sapply(var_names, function(v) hist_var(BLOOD, v, method = m))
# })
# rownames(var_mat) <- var_names
# var_mat

## ----fig.width=7, fig.height=8------------------------------------------------
# bar_cols <- c("#4E79A7", "#59A14F", "#F28E2B", "#E15759")
# par(mfrow = c(2, 2), mar = c(4, 5, 3, 1), las = 1)
# 
# # --- Mean: Cholesterol ---
# bp <- barplot(mean_mat["Cholesterol", ],
#   col = bar_cols, border = NA,
#   main = "Mean of Cholesterol", ylab = "Mean",
#   ylim = c(0, max(mean_mat["Cholesterol", ]) * 1.15)
# )
# text(bp, mean_mat["Cholesterol", ],
#   labels = round(mean_mat["Cholesterol", ], 2), pos = 3, cex = 0.8
# )
# 
# # --- Mean: Hemoglobin ---
# bp <- barplot(mean_mat["Hemoglobin", ],
#   col = bar_cols, border = NA,
#   main = "Mean of Hemoglobin", ylab = "Mean",
#   ylim = c(0, max(mean_mat["Hemoglobin", ]) * 1.15)
# )
# text(bp, mean_mat["Hemoglobin", ],
#   labels = round(mean_mat["Hemoglobin", ], 2), pos = 3, cex = 0.8
# )
# 
# # --- Variance: Cholesterol ---
# bp <- barplot(var_mat["Cholesterol", ],
#   col = bar_cols, border = NA,
#   main = "Variance of Cholesterol", ylab = "Variance",
#   ylim = c(0, max(var_mat["Cholesterol", ]) * 1.25)
# )
# text(bp, var_mat["Cholesterol", ],
#   labels = round(var_mat["Cholesterol", ], 1), pos = 3, cex = 0.8
# )
# 
# # --- Variance: Hemoglobin ---
# bp <- barplot(var_mat["Hemoglobin", ],
#   col = bar_cols, border = NA,
#   main = "Variance of Hemoglobin", ylab = "Variance",
#   ylim = c(0, max(var_mat["Hemoglobin", ]) * 1.25)
# )
# text(bp, var_mat["Hemoglobin", ],
#   labels = round(var_mat["Hemoglobin", ], 4), pos = 3, cex = 0.8
# )

## -----------------------------------------------------------------------------
# cov_vec <- sapply(all_methods, function(m) {
#   hist_cov(BLOOD, "Cholesterol", "Hemoglobin", method = m)
# })
# cor_vec <- sapply(all_methods, function(m) {
#   hist_cor(BLOOD, "Cholesterol", "Hemoglobin", method = m)
# })
# 
# data.frame(
#   Method = all_methods,
#   Covariance = round(cov_vec, 4),
#   Correlation = round(cor_vec, 4),
#   row.names = NULL
# )

## ----fig.width=7, fig.height=4------------------------------------------------
# par(mfrow = c(1, 2), mar = c(4, 5, 3, 1), las = 1)
# 
# # --- Covariance ---
# bp <- barplot(cov_vec,
#   col = bar_cols, border = NA,
#   main = "Cov(Cholesterol, Hemoglobin)",
#   ylab = "Covariance",
#   ylim = c(min(cov_vec) * 1.35, 0)
# )
# text(bp, cov_vec, labels = round(cov_vec, 2), pos = 1, cex = 0.8)
# 
# # --- Correlation ---
# bp <- barplot(cor_vec,
#   col = bar_cols, border = NA,
#   main = "Cor(Cholesterol, Hemoglobin)",
#   ylab = "Correlation",
#   ylim = c(min(cor_vec) * 1.4, 0)
# )
# text(bp, cor_vec, labels = round(cor_vec, 2), pos = 1, cex = 0.8)
# abline(h = -1, lty = 2, col = "grey50")

## ----eda-aggregate------------------------------------------------------------
# set.seed(42)
# iris_int <- aggregate_to_symbolic(
#   iris,
#   type = "int",
#   group_by = "kmeans",
#   stratify_var = "Species",
#   K = 10,
#   zero_width = "remove"
# )
# iris_int

## ----eda-indeximage, eval = has_ggInterval, fig.width = 8, fig.height = 5-----
# library(ggInterval)
# library(ggplot2)
# ggInterval_indexImage(iris_int[, 2:5], plotAll = TRUE, full_strip = FALSE,
#                       column_condition = FALSE) +
#   scale_colour_distiller(palette = "Spectral")
# 
# 

## ----eda-pca, eval = has_ggInterval, fig.width = 6, fig.height = 5------------
# ggInterval_PCA(iris_int[, 2:5])

## ----eda-radar, eval = has_ggInterval, fig.width = 7, fig.height = 7----------
# data(environment.mix)
# ggInterval_radarplot(environment.mix,
#   plotPartial = c(4, 6),
#   showLegend = FALSE, addText = FALSE
# )

## ----eda-ts, fig.width = 12, fig.height = 4-----------------------------------
# library(ggplot2)
# data(irish_wind.its)
# wind_sub <- irish_wind.its[1:12, ]
# 
# # Reshape to long format
# stations <- c("BIR", "DUB", "KIL", "SHA", "VAL")
# wind_long <- do.call(rbind, lapply(stations, function(st) {
#   data.frame(
#     month_num = seq_len(12),
#     Station   = st,
#     lower     = wind_sub[[paste0(st, "_l")]],
#     upper     = wind_sub[[paste0(st, "_u")]],
#     mid       = (wind_sub[[paste0(st, "_l")]] + wind_sub[[paste0(st, "_u")]]) / 2
#   )
# }))
# wind_long$Station <- factor(wind_long$Station, levels = stations)
# 
# # Dodge bars for each station within each month
# n_st <- length(stations)
# bar_w <- 0.6 / n_st
# wind_long$st_idx <- as.numeric(wind_long$Station)
# wind_long$x <- wind_long$month_num +
#   (wind_long$st_idx - (n_st + 1) / 2) * bar_w
# 
# ggplot(wind_long) +
#   geom_rect(
#     aes(
#       xmin = x - bar_w / 2, xmax = x + bar_w / 2,
#       ymin = lower, ymax = upper, fill = Station
#     ),
#     alpha = 0.4, color = NA
#   ) +
#   geom_line(aes(x = x, y = mid, color = Station, group = Station),
#     linewidth = 0.5
#   ) +
#   geom_point(aes(x = x, y = mid, color = Station), size = 1) +
#   scale_x_continuous(breaks = 1:12, labels = month.abb) +
#   labs(
#     title = "Irish Wind Speed Intervals (1961)",
#     x = "Month", y = "Wind Speed (knots)"
#   ) +
#   theme_grey(base_size = 12)

## ----clust-int-helpers, include = FALSE---------------------------------------
# # Helper: extract interval-only columns as symbolic_tbl
# .get_interval_cols <- function(x) {
#   int_cols <- sapply(x, function(col) inherits(col, "symbolic_interval"))
#   if (sum(int_cols) == 0) {
#     return(x)
#   }
#   out <- x[, int_cols, drop = FALSE]
#   class(out) <- c("symbolic_tbl", class(out))
#   out
# }
# 
# # Helper: convert symbolic_tbl to 3D array [n, p, 2] for symbolicDA
# .to_3d_array <- function(x) {
#   n <- nrow(x)
#   p <- ncol(x)
#   arr <- array(0, dim = c(n, p, 2))
#   for (j in seq_len(p)) {
#     cv <- unclass(x[[j]])
#     arr[, j, 1] <- Re(cv)
#     arr[, j, 2] <- Im(cv)
#   }
#   arr
# }
# 
# # Helper: compute clustering quality (1 - WSS/TSS) from distance matrix
# .clust_quality <- function(d, cl) {
#   d <- as.matrix(d)
#   n <- nrow(d)
#   TSS <- sum(d^2) / (2 * n)
#   WSS <- 0
#   for (k in unique(cl)) {
#     idx <- which(cl == k)
#     nk <- length(idx)
#     if (nk > 1) WSS <- WSS + sum(d[idx, idx]^2) / (2 * nk)
#   }
#   1 - WSS / TSS
# }
# 
# # Helper: find optimal k via n-adaptive elbow method with 2-step lookahead
# .find_optimal_k <- function(qualities, n) {
#   ks <- as.integer(names(qualities))
#   qs <- qualities
#   valid <- !is.na(qs)
#   if (sum(valid) < 2) {
#     return(ks[which(valid)[1]])
#   }
#   valid_ks <- ks[valid]
#   valid_qs <- qs[valid]
#   gains <- diff(valid_qs)
#   max_gain <- max(gains, na.rm = TRUE)
#   if (max_gain <= 0) {
#     return(valid_ks[1])
#   }
#   threshold <- max_gain / (1 + n / 100)
#   for (i in seq_along(gains)) {
#     if (gains[i] < threshold) {
#       look <- (i + 1):min(i + 2, length(gains))
#       look <- look[look >= i + 1 & look <= length(gains)]
#       ahead <- gains[look]
#       if (length(ahead) > 0 && any(!is.na(ahead) & ahead >= threshold)) next
#       return(valid_ks[i])
#     }
#   }
#   valid_ks[length(valid_ks)]
# }

## ----clust-int, eval = has_symbolicDA && not_cran, results = "hide"-----------
# library(symbolicDA)
# 
# set.seed(123)
# 
# datasets_clust_int <- list(
#   list(name = "face.iGAP", data = "face.iGAP"),
#   list(name = "prostate.int", data = "prostate.int"),
#   list(name = "nycflights.int", data = "nycflights.int"),
#   list(name = "china_temp.int", data = "china_temp.int"),
#   list(name = "lisbon_air_quality.int", data = "lisbon_air_quality.int")
# )
# 
# clust_int_results <- do.call(rbind, lapply(datasets_clust_int, function(ds) {
#   tryCatch(
#     {
#       data(list = ds$data)
#       x <- get(ds$data)
#       if (!inherits(x, "symbolic_tbl")) {
#         x <- tryCatch(int_convert_format(x, to = "RSDA"), error = function(e) x)
#         for (i in seq_along(x)) {
#           if (is.complex(x[[i]]) && !inherits(x[[i]], "symbolic_interval")) {
#             class(x[[i]]) <- c("symbolic_interval", "vctrs_vctr")
#           }
#         }
#         if (!inherits(x, "symbolic_tbl")) {
#           class(x) <- c("symbolic_tbl", class(x))
#         }
#       }
#       x_int <- .get_interval_cols(x)
#       n <- nrow(x_int)
#       p <- ncol(x_int)
#       k_max <- min(n - 1, 10, max(3, floor(n / 5)))
# 
#       d <- int_dist_matrix(x_int, method = "hausdorff")
#       so <- simple2SO(.to_3d_array(x_int))
# 
#       km_qs <- dc_qs <- sc_qs <- setNames(
#         rep(NA_real_, k_max - 1),
#         as.character(2:k_max)
#       )
#       for (k in 2:k_max) {
#         set.seed(123)
#         km_qs[as.character(k)] <- tryCatch(
#           {
#             res <- sym.kmeans(x_int, k = k)
#             1 - res$tot.withinss / res$totss
#           },
#           error = function(e) NA
#         )
# 
#         set.seed(123)
#         dc_qs[as.character(k)] <- tryCatch(
#           {
#             cl <- DClust(d, cl = k, iter = 100)
#             .clust_quality(d, cl)
#           },
#           error = function(e) NA
#         )
# 
#         set.seed(123)
#         sc_qs[as.character(k)] <- tryCatch(
#           {
#             cl <- SClust(so, cl = k, iter = 100)
#             .clust_quality(d, cl)
#           },
#           error = function(e) NA
#         )
#       }
# 
#       km_k <- .find_optimal_k(km_qs, n)
#       km_q <- km_qs[as.character(km_k)]
#       dc_k <- .find_optimal_k(dc_qs, n)
#       dc_q <- dc_qs[as.character(dc_k)]
#       sc_k <- .find_optimal_k(sc_qs, n)
#       sc_q <- sc_qs[as.character(sc_k)]
# 
#       data.frame(
#         Dataset = ds$name, n = n, p = p,
#         sym.kmeans = sprintf("%.4f (%d)", km_q, km_k),
#         DClust = sprintf("%.4f (%d)", dc_q, dc_k),
#         SClust = sprintf("%.4f (%d)", sc_q, sc_k)
#       )
#     },
#     error = function(e) NULL
#   )
# }))

## ----clust-int-table, eval = has_symbolicDA && not_cran-----------------------
# kable(clust_int_results,
#   row.names = FALSE,
#   caption = "Table 4: Interval clustering quality (1 - WSS/TSS) with optimal k in parentheses"
# )

## ----clust-hist-helpers, include = FALSE--------------------------------------
# # Helper: parse a dataSDA histogram string into a HistDAWass distributionH
# # Format: "{[lo, hi), prob; [lo, hi], prob; ...}"
# .parse_hist_to_distH <- function(s) {
#   s <- trimws(sub("^\\{", "", sub("\\}$", "", s)))
#   bins <- trimws(strsplit(s, ";")[[1]])
#   xs <- numeric(0)
#   ps <- numeric(0)
#   for (b in bins) {
#     b_clean <- gsub("\\[|\\]|\\(|\\)", "", b) # strip brackets
#     parts <- as.numeric(trimws(strsplit(b_clean, ",")[[1]]))
#     lo <- parts[1]
#     hi <- parts[2]
#     p <- parts[3]
#     if (length(xs) == 0) xs <- lo
#     xs <- c(xs, hi)
#     ps <- c(ps, p)
#   }
#   cp <- c(0, cumsum(ps))
#   cp[length(cp)] <- 1 # ensure exact 1
#   distributionH(xs, cp)
# }
# 
# # Helper: convert a dataSDA histogram data frame to a HistDAWass MatH object
# # Keeps histogram columns with >50% non-NA, then drops incomplete rows
# .dataSDA_hist_to_MatH <- function(df) {
#   df <- as.data.frame(df)
#   hist_cols <- names(df)[sapply(df, is.character)]
#   hist_cols <- hist_cols[sapply(hist_cols, function(cn) {
#     any(grepl("^\\{\\[", na.omit(df[[cn]])))
#   })]
#   # Keep only columns where >50% of values are non-NA (handles conditional vars)
#   hist_cols <- hist_cols[sapply(hist_cols, function(cn) {
#     mean(!is.na(df[[cn]])) > 0.5
#   })]
#   # Drop rows with remaining NAs
#   complete <- complete.cases(df[, hist_cols, drop = FALSE])
#   df <- df[complete, , drop = FALSE]
#   n <- nrow(df)
#   p <- length(hist_cols)
#   dists <- vector("list", n * p)
#   for (j in seq_along(hist_cols)) {
#     for (i in seq_len(n)) {
#       dists[[(j - 1) * n + i]] <- .parse_hist_to_distH(df[[hist_cols[j]]][i])
#     }
#   }
#   rn <- if (!is.null(rownames(df))) rownames(df) else paste0("I", seq_len(n))
#   methods::new("MatH",
#     nrows = n, ncols = p,
#     ListOfDist = dists,
#     names.rows = rn,
#     names.cols = hist_cols,
#     by.row = FALSE
#   )
# }

## ----clust-hist, results = "hide"---------------------------------------------
# set.seed(123)
# 
# datasets_clust_hist <- list(
#   list(name = "age_pyramids.hist"),
#   list(name = "ozone.hist"),
#   list(name = "china_climate_season.hist"),
#   list(name = "french_agriculture.hist"),
#   list(name = "flights_detail.hist")
# )
# 
# clust_hist_results <- do.call(rbind, lapply(datasets_clust_hist, function(ds) {
#   tryCatch(
#     {
#       data(list = ds$name, package = "dataSDA")
#       raw <- get(ds$name)
#       x <- .dataSDA_hist_to_MatH(raw)
#       n <- nrow(x@M)
#       p <- ncol(x@M)
#       k_max <- min(n - 1, 10, max(3, floor(n / 5)))
# 
#       # Precompute Wasserstein distance matrix and hclust tree (shared across k)
#       dm <- WH_MAT_DIST(x)
#       set.seed(123)
#       hc <- WH_hclust(x, simplify = TRUE)
# 
#       km_qs <- fc_qs <- hc_qs <- setNames(
#         rep(NA_real_, k_max - 1),
#         as.character(2:k_max)
#       )
#       for (k in 2:k_max) {
#         set.seed(123)
#         km_qs[as.character(k)] <- tryCatch(
#           {
#             res <- WH_kmeans(x, k = k)
#             res$quality
#           },
#           error = function(e) NA
#         )
# 
#         set.seed(123)
#         fc_qs[as.character(k)] <- tryCatch(
#           {
#             res <- WH_fcmeans(x, k = k)
#             res$quality
#           },
#           error = function(e) NA
#         )
# 
#         set.seed(123)
#         hc_qs[as.character(k)] <- tryCatch(
#           {
#             cl <- cutree(hc, k = k)
#             .clust_quality(dm, cl)
#           },
#           error = function(e) NA
#         )
#       }
# 
#       km_k <- .find_optimal_k(km_qs, n)
#       km_q <- km_qs[as.character(km_k)]
#       fc_k <- .find_optimal_k(fc_qs, n)
#       fc_q <- fc_qs[as.character(fc_k)]
#       hc_k <- .find_optimal_k(hc_qs, n)
#       hc_q <- hc_qs[as.character(hc_k)]
# 
#       data.frame(
#         Dataset = ds$name, n = n, p = p,
#         WH_kmeans = sprintf("%.4f (%d)", km_q, km_k),
#         WH_fcmeans = sprintf("%.4f (%d)", fc_q, fc_k),
#         WH_hclust = sprintf("%.4f (%d)", hc_q, hc_k)
#       )
#     },
#     error = function(e) NULL
#   )
# }))

## ----clust-hist-table---------------------------------------------------------
# kable(clust_hist_results,
#   row.names = FALSE,
#   caption = "Table 5: Histogram clustering quality (1 - WSS/TSS) with optimal k in parentheses"
# )

## ----class-helpers, include = FALSE-------------------------------------------
# # Helper: extract class labels from symbolic_set or character/factor column
# .get_class_labels <- function(x, col) {
#   cls <- x[[col]]
#   if (inherits(cls, "symbolic_set")) {
#     factor(vapply(cls, function(v) paste(v, collapse = ","), character(1)))
#   } else {
#     factor(cls)
#   }
# }
# 
# # Helper: build IData from interval columns of a symbolic_tbl
# .build_IData <- function(x) {
#   int_cols <- sapply(x, function(col) inherits(col, "symbolic_interval"))
#   df <- data.frame(row.names = seq_len(nrow(x)))
#   for (v in names(x)[int_cols]) {
#     cv <- unclass(x[[v]])
#     df[[paste0(v, "_l")]] <- Re(cv)
#     df[[paste0(v, "_u")]] <- Im(cv)
#   }
#   MAINT.Data::IData(df)
# }

## ----class-int, eval = has_MAINT && has_e1071 && not_cran, results = "hide"----
# library(MAINT.Data)
# library(e1071)
# 
# datasets_class <- list(
#   list(
#     name = "cars.int", data = "cars.int",
#     class_col = "class",
#     class_desc = "class: Utilitarian(7), Berlina(8), Sportive(8), Luxury(4)"
#   ),
#   list(
#     name = "china_temp.int", data = "china_temp.int",
#     class_col = "GeoReg",
#     class_desc = "GeoReg: 6 regions"
#   ),
#   list(
#     name = "mushroom.int", data = "mushroom.int",
#     class_col = "Edibility",
#     class_desc = "Edibility: T(4), U(2), Y(17)"
#   ),
#   list(
#     name = "ohtemp.int", data = "ohtemp.int",
#     class_col = "STATE",
#     class_desc = "STATE: 10 groups"
#   ),
#   list(
#     name = "wine.int", data = "wine.int",
#     class_col = "class",
#     class_desc = "class: 1(21), 2(12)"
#   )
# )
# 
# class_results <- do.call(rbind, lapply(datasets_class, function(ds) {
#   tryCatch(
#     {
#       data(list = ds$data)
#       x <- get(ds$data)
#       grp <- .get_class_labels(x, ds$class_col)
# 
#       idata <- .build_IData(x)
# 
#       int_cols <- sapply(x, function(col) inherits(col, "symbolic_interval"))
#       svm_df <- data.frame(row.names = seq_len(nrow(x)))
#       for (v in names(x)[int_cols]) {
#         cv <- unclass(x[[v]])
#         svm_df[[paste0(v, "_l")]] <- Re(cv)
#         svm_df[[paste0(v, "_u")]] <- Im(cv)
#       }
# 
#       set.seed(123)
#       lda_acc <- tryCatch(
#         {
#           res <- MAINT.Data::lda(idata, grouping = grp)
#           pred <- predict(res, idata)
#           mean(pred$class == grp)
#         },
#         error = function(e) NA
#       )
# 
#       set.seed(123)
#       qda_acc <- tryCatch(
#         {
#           res <- MAINT.Data::qda(idata, grouping = grp)
#           pred <- predict(res, idata)
#           mean(pred$class == grp)
#         },
#         error = function(e) NA
#       )
# 
#       set.seed(123)
#       svm_acc <- tryCatch(
#         {
#           svm_df$class <- grp
#           res <- svm(class ~ ., data = svm_df, kernel = "radial")
#           pred <- predict(res, svm_df)
#           mean(pred == grp)
#         },
#         error = function(e) NA
#       )
# 
#       data.frame(
#         Dataset = ds$name, Response = ds$class_desc,
#         LDA = lda_acc, QDA = qda_acc, SVM = svm_acc
#       )
#     },
#     error = function(e) NULL
#   )
# }))

## ----class-int-table, eval = has_MAINT && has_e1071 && not_cran---------------
# kable(class_results,
#   digits = 4, row.names = FALSE,
#   caption = "Table 6: Classification accuracy (resubstitution)"
# )

## ----reg-int, results = "hide"------------------------------------------------
# datasets_reg <- list(
#   list(
#     name = "abalone.iGAP", data = "abalone.iGAP",
#     response = "Length", n_x = 6
#   ),
#   list(
#     name = "cardiological.int", data = "cardiological.int",
#     response = "pulse", n_x = 4
#   ),
#   list(
#     name = "nycflights.int", data = "nycflights.int",
#     response = "distance", n_x = 3
#   ),
#   list(
#     name = "oils.int", data = "oils.int",
#     response = "specific_gravity", n_x = 3
#   ),
#   list(
#     name = "prostate.int", data = "prostate.int",
#     response = "lpsa", n_x = 8
#   )
# )
# 
# reg_results <- do.call(rbind, lapply(datasets_reg, function(ds) {
#   tryCatch(
#     {
#       data(list = ds$data)
#       x <- get(ds$data)
# 
#       if (!inherits(x, "symbolic_tbl")) {
#         x2 <- tryCatch(int_convert_format(x, to = "RSDA"), error = function(e) NULL)
#         if (!is.null(x2)) {
#           x <- x2
#           for (i in seq_along(x)) {
#             if (is.complex(x[[i]]) && !inherits(x[[i]], "symbolic_interval")) {
#               class(x[[i]]) <- c("symbolic_interval", "vctrs_vctr")
#             }
#           }
#           if (!inherits(x, "symbolic_tbl")) {
#             class(x) <- c("symbolic_tbl", class(x))
#           }
#         } else {
#           cn <- colnames(x)
#           l_cols <- grep("_l$", cn, value = TRUE)
#           vars <- sub("_l$", "", l_cols)
#           out <- data.frame(row.names = seq_len(nrow(x)))
#           for (v in vars) {
#             lv <- x[[paste0(v, "_l")]]
#             uv <- x[[paste0(v, "_u")]]
#             si <- complex(real = lv, imaginary = uv)
#             class(si) <- c("symbolic_interval", "vctrs_vctr")
#             out[[v]] <- si
#           }
#           class(out) <- c("symbolic_tbl", class(out))
#           x <- out
#         }
#       }
#       x_int <- .get_interval_cols(x)
# 
#       fml <- as.formula(paste(ds$response, "~ ."))
#       nc <- data.frame(row.names = seq_len(nrow(x_int)))
#       for (v in names(x_int)) {
#         cv <- unclass(x_int[[v]])
#         nc[[v]] <- (Re(cv) + Im(cv)) / 2
#       }
#       actual <- nc[[ds$response]]
#       resp_idx <- which(names(x_int) == ds$response)
#       .r2 <- function(a, p) 1 - sum((a - p)^2) / sum((a - mean(a))^2)
# 
#       set.seed(123)
#       lm_r2 <- tryCatch(
#         {
#           res <- sym.lm(fml, sym.data = x_int, method = "cm")
#           summary(res)$r.squared
#         },
#         error = function(e) NA
#       )
# 
#       set.seed(123)
#       glm_r2 <- tryCatch(
#         {
#           res <- sym.glm(sym.data = x_int, response = resp_idx, method = "cm")
#           pred <- as.numeric(predict(res,
#             newx = as.matrix(nc[, -resp_idx]),
#             s = "lambda.min"
#           ))
#           .r2(actual, pred)
#         },
#         error = function(e) NA
#       )
# 
#       set.seed(123)
#       rf_r2 <- tryCatch(
#         {
#           res <- sym.rf(fml, sym.data = x_int, method = "cm")
#           tail(res$rsq, 1)
#         },
#         error = function(e) NA
#       )
# 
#       set.seed(123)
#       rt_r2 <- tryCatch(
#         {
#           res <- sym.rt(fml, sym.data = x_int, method = "cm")
#           .r2(actual, predict(res))
#         },
#         error = function(e) NA
#       )
# 
#       set.seed(123)
#       nnet_r2 <- tryCatch(
#         {
#           res <- sym.nnet(fml, sym.data = x_int, method = "cm")
#           pred_sc <- as.numeric(res$net.result[[1]])
#           pred <- pred_sc * res$data_c_sds[resp_idx] + res$data_c_means[resp_idx]
#           .r2(actual, pred)
#         },
#         error = function(e) NA
#       )
# 
#       data.frame(
#         Dataset = ds$name, Response = ds$response, p = ds$n_x,
#         sym.lm = lm_r2, sym.glm = glm_r2, sym.rf = rf_r2,
#         sym.rt = rt_r2, sym.nnet = nnet_r2
#       )
#     },
#     error = function(e) NULL
#   )
# }))

## ----reg-int-table------------------------------------------------------------
# kable(reg_results,
#   digits = 4, row.names = FALSE,
#   caption = "Table 7: Regression R-squared"
# )
Any scripts or data that you put into this service are public.
dataSDA documentation built on June 12, 2026, 9:06 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
dataSDA
Datasets and Basic Statistics for Symbolic Data Analysis

inst/doc/dataSDA_intro.R
In dataSDA: Datasets and Basic Statistics for Symbolic Data Analysis

Try the dataSDA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataSDA Datasets and Basic Statistics for Symbolic Data Analysis

inst/doc/dataSDA_intro.R In dataSDA: Datasets and Basic Statistics for Symbolic Data Analysis

Try the dataSDA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataSDA
Datasets and Basic Statistics for Symbolic Data Analysis

inst/doc/dataSDA_intro.R
In dataSDA: Datasets and Basic Statistics for Symbolic Data Analysis