Data Processing & Transformation

# Chunk defaults applied to every code block in this document: output is
# collapsed into its source block and prefixed with "#>", figures are 7 x 5
# and centered, and warnings and messages are suppressed.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  warning = FALSE,
  message = FALSE
)

🔄 Data Processing & Transformation with evanverse

This guide covers the comprehensive data processing and transformation capabilities of evanverse, with special focus on void value handling and data manipulation utilities.

library(evanverse)
library(dplyr)

🕳️ Understanding Void Values

In data analysis, "void" values are elements that represent missing or absent data. The evanverse package provides a comprehensive system for handling these values.

What Are Void Values?

Void values in evanverse include:

- `NA` (missing values)
- `NULL` (null values)
- `""` (empty strings)

# Build the example list one element at a time; every vector carries at
# least one NA or empty string
void_examples <- list()
void_examples$numbers <- c(1, NA, 3, 4)
void_examples$strings <- c("A", "", "C", NA)
void_examples$mixed <- c("text", NA, "", "data")

print("Examples of data with void values:")
str(void_examples)

🔍 Detecting Void Values

Single Value Checks

# Check if individual values are void
# (the expected result is noted at the end of each line)
print(is_void(NA))           # TRUE  - missing value
print(is_void(""))           # TRUE  - empty string
print(is_void(NULL))         # TRUE  - NULL
print(is_void("hello"))      # FALSE - non-empty string
print(is_void(0))            # FALSE - zero is a real value, not a void

Vector Checks

# Check if any element in a vector is void
# (this vector holds both an empty string and an NA)
test_vector <- c("A", "", "C", NA, "E")
print(any_void(test_vector))  # TRUE

# Example with no void values
# (every element is a non-empty string)
clean_vector <- c("A", "B", "C")
print(any_void(clean_vector))  # FALSE

Data Frame Analysis

# Build a small data frame whose name, age and city columns contain NA
# values and empty strings
sample_data <- data.frame(
  id = 1:6,
  name = c("Alice", "", "Charlie", NA, "Eve", "Frank"),
  age = c(25, 30, NA, 35, 28, 32),
  city = c("NYC", "LA", "", "Chicago", NA, "Boston"),
  stringsAsFactors = FALSE
)

print("Sample data with void values:")
print(sample_data)

# Report which columns contain void values as one comma-separated line
void_cols <- cols_with_void(sample_data)
print(paste("Columns with void values:", toString(void_cols)))

# Report which rows contain void values in the same format
void_rows <- rows_with_void(sample_data)
print(paste("Rows with void values:", toString(void_rows)))

🔧 Replacing Void Values

Basic Replacement

# Replace all void values with a single replacement
# (the input contains one empty string and one NA)
messy_vector <- c("A", "", "C", NA, "E")
clean_vector <- replace_void(messy_vector, value = "MISSING")

# Print the input and the result one after the other for comparison
print("Original vector:")
print(messy_vector)
print("After replacement:")
print(clean_vector)

Selective Replacement

# Replace only specific types of void values
# (the same input is reused for both calls so the results can be compared)
mixed_data <- c("A", "", "C", NA, "E")

# Replace only empty strings: NA handling is switched off, so the NA
# element is meant to pass through unchanged
only_empty <- replace_void(mixed_data,
                          value = "EMPTY",
                          include_na = FALSE,
                          include_empty_str = TRUE)

print("Replace only empty strings:")
print(only_empty)

# Replace only NA values: empty-string handling is switched off, so the ""
# element is meant to pass through unchanged
only_na <- replace_void(mixed_data,
                       value = "NOT_AVAILABLE",
                       include_na = TRUE,
                       include_empty_str = FALSE)

print("Replace only NA values:")
print(only_na)

Data Frame Replacement

# Start from a copy of the sample data, then fill voids in both text
# columns with the same placeholder in a single pass
clean_data <- sample_data
clean_data[c("name", "city")] <- lapply(
  sample_data[c("name", "city")],
  replace_void,
  value = "UNKNOWN"
)

print("Data after void replacement:")
print(clean_data)

✂️ Removing Void Values

Drop Elements with Void Values

# For vectors, drop_void removes void elements
# (here the input contains one empty string and one NA)
test_vector <- c("A", "", "C", NA, "E")
clean_vector <- drop_void(test_vector)

# Print the input and the result one after the other for comparison
print("Original vector:")
print(test_vector)
print("After dropping void elements:")
print(clean_vector)

# For data analysis, we can identify problematic rows/columns
# (sample_data is the data frame built earlier in this guide)
print("Rows with void values:")
print(rows_with_void(sample_data))
print("Columns with void values:")
print(cols_with_void(sample_data))

📊 Data Transformation

Converting Data Frames to Lists

# Group data by a key column and create lists
# Work on the first 12 rows of the built-in mtcars data, keeping four columns
mtcars_subset <- mtcars[1:12, c("cyl", "mpg", "hp", "wt")]

# Group by cylinder count, focusing on MPG values
# (only "cyl" and "mpg" are passed on; "hp" and "wt" are not used here)
grouped_cars <- df2list(
  data = mtcars_subset,
  key_col = "cyl",
  value_col = "mpg"
)

print("Cars grouped by cylinder count (MPG values):")
str(grouped_cars)

# Access specific groups
# (the group is looked up by name, using the cylinder count as a string)
print("4-cylinder cars MPG values:")
print(grouped_cars[["4"]])

Column Mapping

# Map values in a column using a named vector.
# stringsAsFactors = FALSE keeps both columns as character on every R
# version, matching the other data frames in this guide, so the mapping is
# keyed by the grade labels rather than by factor codes.
grades_data <- data.frame(
  student = c("Alice", "Bob", "Charlie", "Diana"),
  grade_letter = c("A", "B", "A", "C"),
  stringsAsFactors = FALSE
)

# Create mapping for letter grades to numbers
# (names are the letter grades, values are the grade points)
grade_mapping <- c("A" = 4.0, "B" = 3.0, "C" = 2.0, "D" = 1.0, "F" = 0.0)

# Apply mapping using the correct parameters:
# `by` names the column to look up, `to` names the column that receives
# the mapped values
result <- map_column(
  query = grades_data,
  by = "grade_letter",
  map = grade_mapping,
  to = "grade_numeric"
)

print("Grades with numeric mapping:")
print(result)

💾 Advanced File Operations

Flexible Table Reading

# Read various file formats with automatic detection
# NOTE(review): the paths below are placeholders; none of these files is
# created anywhere in this document — confirm this snippet is not evaluated
data1 <- read_table_flex("data.csv")
data2 <- read_table_flex("data.tsv", sep = "\t")
data3 <- read_table_flex("data.txt", header = TRUE)

# Read Excel files with flexibility
# (the sheet is selected by name)
excel_data <- read_excel_flex("workbook.xlsx", sheet = "Sheet1")

File Information and Management

# Get comprehensive file information
# NOTE(review): "myfile.csv" is a placeholder path that is not created
# anywhere in this document
info <- file_info("myfile.csv")
print(info)

# Extract file extensions; vapply() pins the result to exactly one string
# per file, so an unexpected return shape fails loudly instead of silently
# changing the result type
files <- c("data.csv", "analysis.R", "report.pdf")
extensions <- vapply(files, get_ext, character(1))
print(extensions)

# Display directory structure
file_tree(".", max_depth = 2)

⚡ Custom Operators for Data Processing

String Operations

# Paste operator for clean string concatenation
# NOTE(review): both examples assume %p% joins its operands with no
# separator of its own — confirm against the operator's documentation,
# otherwise the explicit " " and "/" pieces produce extra spacing
full_name <- "John" %p% " " %p% "Doe"
print(full_name)

file_path <- "data" %p% "/" %p% "analysis" %p% ".csv"
print(file_path)

Logical Operations

# Enhanced "not in" operator
# (fruits is the reference set; check_fruits holds the candidates to test)
fruits <- c("apple", "banana", "orange")
check_fruits <- c("apple", "grape", "banana", "kiwi")

# Find fruits not in our list
# (the %nin% result is used to subset check_fruits)
missing_fruits <- check_fruits[check_fruits %nin% fruits]
print(paste("Missing fruits:", paste(missing_fruits, collapse = ", ")))

# Enhanced identity checking
# (the last case compares a number with a string and is expected to be FALSE)
print(5 %is% 5)        # TRUE
print("a" %is% "a")    # TRUE
print(5 %is% "5")      # FALSE

Combinatorial Operations

# Generate combinations and permutations
# NOTE(review): `items` is not referenced by the calls below; it only
# illustrates the four-element set being counted
items <- c("A", "B", "C", "D")

# Calculate combination numbers
# (order does not matter)
combinations_count <- comb(4, 2)  # C(4,2) = 6
print(paste("Number of ways to choose 2 items from 4:", combinations_count))

# Calculate permutation numbers
# (order matters)
permutations_count <- perm(4, 2)  # P(4,2) = 12
print(paste("Number of ways to arrange 2 items from 4:", permutations_count))

🔗 Complex Data Processing Workflows

Real-world Example: Survey Data Cleaning

# Simulate messy survey data
# Each of age, income and satisfaction mixes numbers with strings, so c()
# coerces the whole vector to character; the columns therefore arrive as
# text and include empty strings, NA, a zero age and the non-numeric
# income "invalid"
survey_data <- data.frame(
  id = 1:8,
  age = c(25, "", 30, NA, "35", 28, 0, 45),
  income = c("50000", "", NA, "75000", "60000", "invalid", "80000", ""),
  satisfaction = c(5, 4, "", 3, NA, 5, 4, 2),
  stringsAsFactors = FALSE
)

print("Original messy survey data:")
print(survey_data)

# Step 1: Identify problematic data
# Print one comma-separated line for the affected columns and one for the
# affected rows
cat("\nData quality assessment:\n")
cat("Columns with void values:", toString(cols_with_void(survey_data)), "\n")
cat("Rows with void values:", toString(rows_with_void(survey_data)), "\n")

# Step 2: Clean the data
# Replace void values with appropriate defaults
# (the age and income defaults are given as strings because those columns
# are still character at this point)
survey_clean <- survey_data
survey_clean$age <- replace_void(survey_clean$age, value = "25")
survey_clean$income <- replace_void(survey_clean$income, value = "50000")
survey_clean$satisfaction <- replace_void(survey_clean$satisfaction, value = 3)

# Convert to appropriate types
# (the income entry "invalid" cannot be parsed as a number and becomes NA
# in this step)
survey_clean$age <- as.numeric(survey_clean$age)
survey_clean$income <- as.numeric(survey_clean$income)
survey_clean$satisfaction <- as.numeric(survey_clean$satisfaction)

# Handle special cases (e.g., age = 0, income = "invalid")
# (a zero age is replaced with the default age; the NA left behind by
# "invalid" receives the same default income used above)
survey_clean$age[survey_clean$age == 0] <- 25
survey_clean$income[is.na(survey_clean$income)] <- 50000

print("Cleaned survey data:")
print(survey_clean)

📈 Performance Tips

Efficient Void Handling

# For large datasets, check specific columns rather than entire data frame
# col1 is drawn from 1..100 plus NA, col2 from the lowercase letters plus
# the empty string, and col3 is uniform random noise.
# stringsAsFactors = FALSE keeps col2 as character on every R version,
# matching the other data frames in this guide.
large_data <- data.frame(
  col1 = sample(c(1:100, NA), 1000, replace = TRUE),
  col2 = sample(c(letters, ""), 1000, replace = TRUE),
  col3 = runif(1000),
  stringsAsFactors = FALSE
)

# Check only columns likely to have voids; vapply() guarantees a named
# logical vector with exactly one entry per column, and fails loudly if a
# check ever returns anything else
critical_cols <- c("col1", "col2")
void_status <- vapply(
  critical_cols,
  function(col) any_void(large_data[[col]]),
  logical(1)
)
print("Void status for critical columns:")
print(void_status)

🎯 Best Practices

  1. Always inspect your data before processing using cols_with_void() and rows_with_void()

  2. Choose appropriate replacement values that make sense in your domain context

  3. Document your void handling strategy for reproducibility

  4. Use selective replacement when different types of voids should be handled differently

  5. Validate your results after transformation to ensure data integrity

🔗 Next Steps


The evanverse data processing tools provide a robust foundation for handling real-world messy data with confidence and efficiency.



Try the evanverse package in your browser

Any scripts or data that you put into this service are public.

evanverse documentation built on March 10, 2026, 5:07 p.m.