inst/doc/data-preparation.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----include=FALSE------------------------------------------------------------
library(nuggets)
library(dplyr)
library(ggplot2)
library(tidyr)

options(tibble.width = Inf)

## -----------------------------------------------------------------------------
# Create a copy to avoid modifying the original dataset
mtcars_example <- mtcars
mtcars_example$cyl <- factor(mtcars_example$cyl,
                     levels= c(4, 6, 8),
                     labels = c("four", "six", "eight"))
head(mtcars_example)

## -----------------------------------------------------------------------------
partition(mtcars_example, cyl)

## -----------------------------------------------------------------------------
partition(mtcars_example, vs:gear, .method = "dummy")

## -----------------------------------------------------------------------------
partition(mtcars_example, mpg, .method = "crisp", .breaks = c(-Inf, 15, 20, 30, Inf))

## -----------------------------------------------------------------------------
partition(mtcars_example, disp, .method = "crisp", .breaks = 3)

## -----------------------------------------------------------------------------
crisp_mtcars <- mtcars_example |>
    partition(cyl, vs:gear, .method = "dummy") |>
    partition(mpg, .method = "crisp", .breaks = c(-Inf, 15, 20, 30, Inf)) |>
    partition(disp:carb, .method = "crisp", .breaks = 3) 

head(crisp_mtcars, n = 3)

## -----------------------------------------------------------------------------
# Equal-width intervals (default)
partition(CO2, conc, .method = "crisp", .breaks = 4, .style = "equal")

## -----------------------------------------------------------------------------
# Quantile-based intervals (equal frequency in each interval)
partition(CO2, conc, .method = "crisp", .breaks = 4, .style = "quantile")

## -----------------------------------------------------------------------------
# K-means clustering to find natural breakpoints
partition(CO2, conc, .method = "crisp", .breaks = 4, .style = "kmeans")

## -----------------------------------------------------------------------------
# Standard deviation-based intervals
partition(CO2, conc, .method = "crisp", .breaks = 4, .style = "sd")

## -----------------------------------------------------------------------------
# Use Lloyd's algorithm for k-means
partition(CO2, conc, .method = "crisp", .breaks = 4, 
          .style = "kmeans", 
          .style_params = list(algorithm = "Lloyd"))

## -----------------------------------------------------------------------------
# Use different quantile types (see ?quantile for details)
partition(CO2, conc, .method = "crisp", .breaks = 4, 
          .style = "quantile", 
          .style_params = list(type = 7))

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = c(-10, 0, 10), .labels = "triangle", .keep = TRUE) |>
    partition(x, .method = "raisedcos", .breaks = c(-10, 0, 10), .labels = "raisedcos", .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "method", values_to = "value") |>
    mutate(method = gsub("x=", "", method)) |>
    ggplot() +
        aes(x = x, y = value, color = method) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".breaks = c(-10, 0, 10)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = c(-10, -5, 0, 5, 10), .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".breaks = c(-10, -5, 0, 5, 10)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = c(-Inf, -5, 0, 5, Inf), .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".breaks = c(-Inf, -5, 0, 5, Inf)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = 4, .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".breaks = 4") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "raisedcos", .breaks = c(-Inf, -10, -5, 0, 5, 10, Inf), .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".breaks = c(-Inf, -10, -5, 0, 5, 10, Inf)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## ----message=FALSE------------------------------------------------------------
# Start with a fresh copy of mtcars
fuzzy_mtcars <- mtcars |>
    mutate(cyl = factor(cyl, levels = c(4, 6, 8), labels = c("four", "six", "eight"))) |>
    partition(cyl, vs:gear, .method = "dummy") |>
    partition(mpg, .method = "triangle", .breaks = c(-Inf, 15, 20, 30, Inf)) |>
    partition(disp:carb, .method = "triangle", .breaks = 3) 

head(fuzzy_mtcars, n = 3)

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = c(-10, -5, 5, 10), .span = 2, .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".span = 2, .breaks = c(-10, -5, 5, 10)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = c(-15, -10, -5, 0, 5, 10, 15), .inc = 1, .span = 2, .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".inc = 1, .span = 2, .breaks = c(-15, -10, -5, 0, 5, 10, 15)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
data.frame(x = seq(-15, 15, length.out = 1000)) |>
    partition(x, .method = "triangle", .breaks = c(-15, -10, -5, 0, 5, 10, 15), .inc = 3, .span = 2, .keep = TRUE) |>
    pivot_longer(starts_with("x="), names_to = "fuzzy set", values_to = "value") |>
    ggplot() +
        aes(x = x, y = value, color = `fuzzy set`) +
        geom_line(size = 1.2) +
        labs(x = "x", y = "membership degree", title = ".inc = 3, .span = 2, .breaks = c(-15, -10, -5, 0, 5, 10, 15)") +
        theme_gray(base_size = 16) +
        theme(legend.position = "right")

## -----------------------------------------------------------------------------
# Completely constant vector
is_almost_constant(c(1, 1, 1, 1, 1))

# Variable vector
is_almost_constant(c(1, 2, 3, 4, 5))

# Almost constant (80% are the same value)
is_almost_constant(c(1, 1, 1, 1, 2), threshold = 0.8)

# Not almost constant with threshold 0.8
is_almost_constant(c(1, 1, 1, 2, 2), threshold = 0.8)

## -----------------------------------------------------------------------------
# With NA values - by default NA is treated as a regular value
is_almost_constant(c(NA, NA, NA, 1, 2), threshold = 0.5)

# With NA removed before computing proportions
is_almost_constant(c(NA, NA, NA, 1, 2), threshold = 0.5, na_rm = TRUE)

## -----------------------------------------------------------------------------
# Create a data frame with some constant and variable columns
d <- data.frame(
  a1 = 1:10,              # variable
  a2 = c(1:9, NA),        # variable
  b1 = "b",               # constant
  b2 = NA,                # constant (all NA)
  c1 = rep(c(TRUE, FALSE), 5),  # variable
  c2 = rep(c(TRUE, NA), 5),     # 50% TRUE, 50% NA
  d  = c(rep(TRUE, 4), rep(FALSE, 4), NA, NA)  # 40% TRUE, 40% FALSE, 20% NA
)

# Remove columns that are completely constant
remove_almost_constant(d, .threshold = 1.0, .na_rm = FALSE)

# Remove columns where the majority value occurs in >= 50% of rows
remove_almost_constant(d, .threshold = 0.5, .na_rm = FALSE)

# Same as above, but removing NA before computing proportions
remove_almost_constant(d, .threshold = 0.5, .na_rm = TRUE)

## -----------------------------------------------------------------------------
# Only check columns a1 through b2
remove_almost_constant(d, a1:b2, .threshold = 0.5, .na_rm = TRUE)

## -----------------------------------------------------------------------------
# Prepare mtcars data with partition - use fresh copy
prepared_data <- mtcars |>
    mutate(cyl = factor(cyl, levels = c(4, 6, 8), labels = c("four", "six", "eight"))) |>
    partition(cyl, vs:gear, .method = "dummy") |>
    partition(mpg:carb, .method = "crisp", .breaks = 3)

# Check for and remove any almost constant columns
prepared_data <- remove_almost_constant(prepared_data, 
                                       .threshold = 0.95, 
                                       .verbose = TRUE)

## -----------------------------------------------------------------------------
# Prepare fuzzy data - use fresh copy of mtcars
fuzzy_mtcars <- mtcars |>
    mutate(cyl = factor(cyl, levels = c(4, 6, 8), labels = c("four", "six", "eight"))) |>
    partition(cyl, vs:gear, .method = "dummy") |>
    partition(mpg:carb, .method = "triangle", .breaks = 3)

# Create disjoint vector
disj <- var_names(colnames(fuzzy_mtcars))

# Find tautologies with very high confidence
tautologies <- dig_tautologies(
    fuzzy_mtcars,
    antecedent = everything(),
    consequent = everything(),
    disjoint = disj,
    min_confidence = 0.95,
    min_support = 0.1,
    max_length = 3,
    t_norm = "goguen"
)

print(tautologies)

## ----eval=FALSE---------------------------------------------------------------
# # Convert tautologies to excluded format
# excluded_conditions <- parse_condition(tautologies$antecedent)
# 
# # Use in subsequent pattern search
# results <- dig_associations(
#     fuzzy_mtcars,
#     antecedent = !starts_with("am"),
#     consequent = starts_with("am"),
#     disjoint = disj,
#     excluded = excluded_conditions,  # Exclude tautological patterns
#     min_support = 0.1,
#     min_confidence = 0.8
# )

Try the nuggets package in your browser

Any scripts or data that you put into this service are public.

nuggets documentation built on Nov. 5, 2025, 6:25 p.m.