# inst/doc/data-processing.R

## ----setup, include = FALSE---------------------------------------------------
# Default chunk options for every code chunk in this document.
knitr::opts_chunk$set(
  # Text output: prefix results with "#>" and collapse them into the source block
  comment = "#>",
  collapse = TRUE,
  # Figures: 7 wide by 5 high, centred
  fig.align = "center",
  fig.height = 5,
  fig.width = 7,
  # Diagnostics: keep messages and warnings out of the rendered output
  message = FALSE,
  warning = FALSE
)

## ----load-package-------------------------------------------------------------
library(evanverse)
library(dplyr)

## ----void-concepts------------------------------------------------------------
# Three small vectors, each containing NA and/or empty-string entries
void_examples <- list(
  numbers = c(1, NA_real_, 3, 4),
  strings = c("A", "", "C", NA_character_),
  mixed = c("text", NA_character_, "", "data")
)

print("Examples of data with void values:")
utils::str(void_examples)

## ----void-detection-single----------------------------------------------------
# Check if individual values are void.
# Each line passes one value to is_void() and prints the result; the trailing
# annotation records the result this vignette expects for that value.
print(is_void(NA))           # TRUE: NA counts as void
print(is_void(""))           # TRUE: the empty string counts as void
print(is_void(NULL))         # TRUE: NULL counts as void
print(is_void("hello"))      # FALSE: a non-empty string is not void
print(is_void(0))            # FALSE: zero is an ordinary value, not void

## ----void-detection-vector----------------------------------------------------
# any_void() gives one answer for the whole vector; this one holds "" and NA
test_vector <- c("A", "", "C", NA_character_, "E")
print(any_void(test_vector))  # TRUE

# A vector made of non-empty strings only
clean_vector <- LETTERS[1:3]
print(any_void(clean_vector))  # FALSE

## ----void-detection-dataframe-------------------------------------------------
# Six records; name, age and city each contain NA and/or "" entries
sample_data <- data.frame(
  id = seq_len(6),
  name = c("Alice", "", "Charlie", NA_character_, "Eve", "Frank"),
  age = c(25, 30, NA_real_, 35, 28, 32),
  city = c("NYC", "LA", "", "Chicago", NA_character_, "Boston"),
  stringsAsFactors = FALSE
)

print("Sample data with void values:")
print(sample_data)

# Report the columns flagged by cols_with_void() as one comma-separated string
void_cols <- cols_with_void(sample_data)
print(paste0("Columns with void values: ", toString(void_cols)))

# Report the rows flagged by rows_with_void() the same way
void_rows <- rows_with_void(sample_data)
print(paste0("Rows with void values: ", toString(void_rows)))

## ----void-replacement-basic---------------------------------------------------
# Swap every void entry of the vector for one common placeholder
messy_vector <- c("A", "", "C", NA_character_, "E")
clean_vector <- replace_void(messy_vector, value = "MISSING")

# Show the vector before and after the replacement
print("Original vector:")
print(messy_vector)
print("After replacement:")
print(clean_vector)

## ----void-replacement-selective-----------------------------------------------
# The include_* flags select which kinds of void value are replaced
mixed_data <- c("A", "", "C", NA_character_, "E")

# Empty strings only: NA replacement is switched off
only_empty <- replace_void(
  mixed_data,
  value = "EMPTY",
  include_na = FALSE,
  include_empty_str = TRUE
)

print("Replace only empty strings:")
print(only_empty)

# NA only: empty-string replacement is switched off
only_na <- replace_void(
  mixed_data,
  value = "NOT_AVAILABLE",
  include_na = TRUE,
  include_empty_str = FALSE
)

print("Replace only NA values:")
print(only_na)

## ----void-replacement-dataframe-----------------------------------------------
# Work on a copy and overwrite the two text columns one at a time,
# reading the source values from the untouched sample_data
clean_data <- sample_data
clean_data[["name"]] <- replace_void(sample_data[["name"]], value = "UNKNOWN")
clean_data[["city"]] <- replace_void(sample_data[["city"]], value = "UNKNOWN")

print("Data after void replacement:")
print(clean_data)

## ----drop-elements------------------------------------------------------------
# Applied to a vector, drop_void() removes the void elements
test_vector <- c("A", "", "C", NA_character_, "E")
clean_vector <- drop_void(test_vector)

# Show the vector before and after dropping
print("Original vector:")
print(test_vector)
print("After dropping void elements:")
print(clean_vector)

# For a data frame, report which rows and columns contain void values
print("Rows with void values:")
print(rows_with_void(sample_data))
print("Columns with void values:")
print(cols_with_void(sample_data))

## ----df-to-list---------------------------------------------------------------
# First 12 cars of mtcars, restricted to the four columns used here
mtcars_subset <- head(mtcars[c("cyl", "mpg", "hp", "wt")], 12)

# Split by cylinder count (key) and keep the MPG values (value) per group
grouped_cars <- df2list(
  data = mtcars_subset,
  key_col = "cyl",
  value_col = "mpg"
)

print("Cars grouped by cylinder count (MPG values):")
str(grouped_cars)

# Pull out a single group by its key
print("4-cylinder cars MPG values:")
print(grouped_cars[["4"]])

## ----column-mapping-----------------------------------------------------------
# Four students with letter grades
grades_data <- data.frame(
  student = c("Alice", "Bob", "Charlie", "Diana"),
  grade_letter = c("A", "B", "A", "C")
)

# Lookup table: letter grade (name) -> grade points (value)
grade_mapping <- c(A = 4, B = 3, C = 2, D = 1, F = 0)

# Look up grade_letter in the table and store the result in grade_numeric
result <- map_column(
  query = grades_data,
  by = "grade_letter",
  map = grade_mapping,
  to = "grade_numeric"
)

print("Grades with numeric mapping:")
print(result)

## ----file-reading, eval=FALSE-------------------------------------------------
# # Read various file formats with automatic detection
# data1 <- read_table_flex("data.csv")
# data2 <- read_table_flex("data.tsv", sep = "\t")
# data3 <- read_table_flex("data.txt", header = TRUE)
# 
# # Read Excel files with flexibility
# excel_data <- read_excel_flex("workbook.xlsx", sheet = "Sheet1")

## ----file-info, eval=FALSE----------------------------------------------------
# # Get comprehensive file information
# info <- file_info("myfile.csv")
# print(info)
# 
# # Extract file extensions
# files <- c("data.csv", "analysis.R", "report.pdf")
# extensions <- sapply(files, get_ext)
# print(extensions)
# 
# # Display directory structure
# file_tree(".", max_depth = 2)

## ----string-operators---------------------------------------------------------
# Paste operator for clean string concatenation.
# User-defined %...% operators group left to right, so each chain below is
# built up one operand at a time in the order written.
# NOTE(review): whether %p% inserts a separator of its own is not visible
# here -- confirm against the evanverse documentation.
full_name <- "John" %p% " " %p% "Doe"
print(full_name)

# Assemble a file path from its pieces with the same operator
file_path <- "data" %p% "/" %p% "analysis" %p% ".csv"
print(file_path)

## ----logical-operators--------------------------------------------------------
# %nin%: the "not in" operator
fruits <- c("apple", "banana", "orange")
check_fruits <- c("apple", "grape", "banana", "kiwi")

# Keep the candidates that %nin% reports as absent from `fruits`
missing_fruits <- check_fruits[check_fruits %nin% fruits]
print(paste0("Missing fruits: ", toString(missing_fruits)))

# %is%: identity check between two values
print(5 %is% 5)        # TRUE
print("a" %is% "a")    # TRUE
print(5 %is% "5")      # FALSE

## ----combinatorial------------------------------------------------------------
# A pool of four items. It is illustrative only: the counts below are
# computed from the literal sizes 4 and 2, not from this vector.
items <- c("A", "B", "C", "D")

# Number of combinations of 2 out of 4
combinations_count <- comb(4, 2)  # C(4,2) = 6
print(paste0("Number of ways to choose 2 items from 4: ", combinations_count))

# Number of permutations of 2 out of 4
permutations_count <- perm(4, 2)  # P(4,2) = 12
print(paste0("Number of ways to arrange 2 items from 4: ", permutations_count))

## ----survey-example-----------------------------------------------------------
# Build a deliberately messy survey table. Mixing numbers, strings, "" and NA
# inside c() makes age, income and satisfaction character columns.
survey_data <- data.frame(
  id = seq_len(8),
  age = c(25, "", 30, NA, "35", 28, 0, 45),
  income = c("50000", "", NA, "75000", "60000", "invalid", "80000", ""),
  satisfaction = c(5, 4, "", 3, NA, 5, 4, 2),
  stringsAsFactors = FALSE
)

print("Original messy survey data:")
print(survey_data)

# Step 1: report which columns and rows contain void values
cat("\nData quality assessment:\n")
cat("Columns with void values:", toString(cols_with_void(survey_data)), "\n")
cat("Rows with void values:", toString(rows_with_void(survey_data)), "\n")

# Step 2: fill void entries with a default chosen per column
survey_clean <- survey_data
survey_clean[["age"]] <- replace_void(survey_clean[["age"]], value = "25")
survey_clean[["income"]] <- replace_void(
  survey_clean[["income"]],
  value = "50000"
)
survey_clean[["satisfaction"]] <- replace_void(
  survey_clean[["satisfaction"]],
  value = 3
)

# Step 3: convert the three columns to numeric
survey_clean[["age"]] <- as.numeric(survey_clean[["age"]])
survey_clean[["income"]] <- as.numeric(survey_clean[["income"]])
survey_clean[["satisfaction"]] <- as.numeric(survey_clean[["satisfaction"]])

# Step 4: patch values that are present but unusable
#   - an age of 0 is replaced by 25
#   - income that did not convert (e.g. "invalid") is NA and is replaced by 50000
survey_clean[["age"]] <- replace(
  survey_clean[["age"]],
  survey_clean[["age"]] == 0,
  25
)
survey_clean[["income"]] <- replace(
  survey_clean[["income"]],
  is.na(survey_clean[["income"]]),
  50000
)

print("Cleaned survey data:")
print(survey_clean)

## ----performance-tips---------------------------------------------------------
# For large datasets, check specific columns rather than entire data frame.
# The demo data is random, so fix the RNG seed first: the simulated table,
# and therefore the output of this chunk, is then the same on every run.
set.seed(42)
large_data <- data.frame(
  col1 = sample(c(1:100, NA), 1000, replace = TRUE),
  col2 = sample(c(letters, ""), 1000, replace = TRUE),
  col3 = runif(1000)
)

# Check only columns likely to have voids.
# vapply() pins the result to exactly one logical per column name, so an
# unexpected return from any_void() raises an error instead of silently
# changing the type of void_status (as sapply() would).
critical_cols <- c("col1", "col2")
void_status <- vapply(
  critical_cols,
  function(col) any_void(large_data[[col]]),
  logical(1)
)
print("Void status for critical columns:")
print(void_status)

# Try the evanverse package in your browser

# Any scripts or data that you put into this service are public.

# evanverse documentation built on March 10, 2026, 5:07 p.m.