data-processing.R
In evanverse: Utility Functions for Data Analysis and Visualization

## ----setup, include = FALSE---------------------------------------------------
# Vignette-wide knitr chunk defaults, grouped by what they affect.
knitr::opts_chunk$set(
  # Console output: merge source and output, prefix output lines with "#>".
  comment = "#>",
  collapse = TRUE,
  # Diagnostics: hide warnings and messages in the rendered document.
  message = FALSE,
  warning = FALSE,
  # Figures: 7 x 5, centered.
  fig.align = "center",
  fig.height = 5,
  fig.width = 7
)

## ----load-package-------------------------------------------------------------
library(evanverse)
library(dplyr)

## ----void-concepts------------------------------------------------------------
# Three small vectors that each contain at least one NA and/or empty string.
void_examples <- list()
void_examples$numbers <- c(1, NA, 3, 4)
void_examples$strings <- c("A", "", "C", NA)
void_examples$mixed <- c("text", NA, "", "data")

# Show the structure of the example list.
print("Examples of data with void values:")
str(void_examples)

## ----void-detection-single----------------------------------------------------
# Probe is_void() with one value at a time, in this order:
#   NA, "", NULL  -> expected TRUE
#   "hello", 0    -> expected FALSE
# A list is used so that NULL survives as an element of the probe set.
for (candidate in list(NA, "", NULL, "hello", 0)) {
  print(is_void(candidate))
}

## ----void-detection-vector----------------------------------------------------
# any_void() answers a single question for the whole vector:
# does at least one element count as void?
test_vector <- c("A", "", "C", NA, "E")
test_has_void <- any_void(test_vector)
print(test_has_void)  # TRUE

# A vector without NA or "" entries
clean_vector <- c("A", "B", "C")
clean_has_void <- any_void(clean_vector)
print(clean_has_void)  # FALSE

## ----void-detection-dataframe-------------------------------------------------
# Six-row demo table: `name` and `city` mix "" and NA, `age` has one NA.
sample_data <- data.frame(
  id = seq_len(6),
  name = c("Alice", "", "Charlie", NA, "Eve", "Frank"),
  age = c(25, 30, NA, 35, 28, 32),
  city = c("NYC", "LA", "", "Chicago", NA, "Boston"),
  stringsAsFactors = FALSE
)

print("Sample data with void values:")
print(sample_data)

# Which columns contain at least one void value?
void_cols <- cols_with_void(sample_data)
print(paste("Columns with void values:", toString(void_cols)))

# Which rows contain at least one void value?
# NOTE(review): confirm whether rows_with_void() returns row indices or a
# per-row logical mask; with a mask, the message below lists TRUE/FALSE
# values rather than row numbers.
void_rows <- rows_with_void(sample_data)
print(paste("Rows with void values:", toString(void_rows)))

## ----void-replacement-basic---------------------------------------------------
# Swap every void entry of a vector for one placeholder value.
messy_vector <- c("A", "", "C", NA, "E")

print("Original vector:")
print(messy_vector)

clean_vector <- replace_void(messy_vector, value = "MISSING")

print("After replacement:")
print(clean_vector)

## ----void-replacement-selective-----------------------------------------------
# The include_* switches restrict which kinds of void values get replaced.
mixed_data <- c("A", "", "C", NA, "E")

# Target "" only; NA entries are excluded from replacement
only_empty <- replace_void(
  mixed_data,
  value = "EMPTY",
  include_na = FALSE,
  include_empty_str = TRUE
)

# Target NA only; "" entries are excluded from replacement
only_na <- replace_void(
  mixed_data,
  value = "NOT_AVAILABLE",
  include_na = TRUE,
  include_empty_str = FALSE
)

print("Replace only empty strings:")
print(only_empty)

print("Replace only NA values:")
print(only_na)

## ----void-replacement-dataframe-----------------------------------------------
# Fill the void entries of the two text columns in a single pass; the
# numeric columns (`id`, `age`) are carried over unchanged.
text_cols <- c("name", "city")
clean_data <- sample_data
clean_data[text_cols] <- lapply(
  sample_data[text_cols],
  replace_void,
  value = "UNKNOWN"
)

print("Data after void replacement:")
print(clean_data)

## ----drop-elements------------------------------------------------------------
# drop_void() on a vector: show the input, then the result of the call.
test_vector <- c("A", "", "C", NA, "E")

print("Original vector:")
print(test_vector)

clean_vector <- drop_void(test_vector)

print("After dropping void elements:")
print(clean_vector)

# For a data frame, report which rows / columns are affected instead.
flagged_rows <- rows_with_void(sample_data)
print("Rows with void values:")
print(flagged_rows)

flagged_cols <- cols_with_void(sample_data)
print("Columns with void values:")
print(flagged_cols)

## ----df-to-list---------------------------------------------------------------
# Work on the first 12 rows of mtcars, restricted to four columns.
keep_cols <- c("cyl", "mpg", "hp", "wt")
mtcars_subset <- mtcars[seq_len(12), keep_cols]

# Split the `mpg` values into a list keyed by `cyl`.
grouped_cars <- df2list(
  data = mtcars_subset,
  key_col = "cyl",
  value_col = "mpg"
)

print("Cars grouped by cylinder count (MPG values):")
str(grouped_cars)

# Pull out one group by its key (keys are looked up as strings here).
four_cyl_mpg <- grouped_cars[["4"]]
print("4-cylinder cars MPG values:")
print(four_cyl_mpg)

## ----column-mapping-----------------------------------------------------------
# Students and their letter grades.
grades_data <- data.frame(
  student = c("Alice", "Bob", "Charlie", "Diana"),
  grade_letter = c("A", "B", "A", "C")
)

# Lookup table: letter grade -> grade points, as a named numeric vector.
grade_mapping <- setNames(
  c(4.0, 3.0, 2.0, 1.0, 0.0),
  c("A", "B", "C", "D", "F")
)

# Translate `grade_letter` through the lookup into a `grade_numeric` column.
result <- map_column(
  query = grades_data,
  by = "grade_letter",
  map = grade_mapping,
  to = "grade_numeric"
)

print("Grades with numeric mapping:")
print(result)

## ----file-reading, eval=FALSE-------------------------------------------------
# # Read various file formats with automatic detection
# data1 <- read_table_flex("data.csv")
# data2 <- read_table_flex("data.tsv", sep = "\t")
# data3 <- read_table_flex("data.txt", header = TRUE)
# 
# # Read Excel files with flexibility
# excel_data <- read_excel_flex("workbook.xlsx", sheet = "Sheet1")

## ----file-info, eval=FALSE----------------------------------------------------
# # Get comprehensive file information
# info <- file_info("myfile.csv")
# print(info)
# 
# # Extract file extensions
# files <- c("data.csv", "analysis.R", "report.pdf")
# extensions <- sapply(files, get_ext)
# print(extensions)
# 
# # Display directory structure
# file_tree(".", max_depth = 2)

## ----string-operators---------------------------------------------------------
# Paste operator for readable string building.
# `%p%` joins its two operands with a single space, so no explicit " "
# operand is needed (adding one would produce extra spaces).
full_name <- "John" %p% "Doe"
print(full_name)

# Because `%p%` always inserts a space, it is not suited to building file
# paths; base R's file.path() joins components with "/" instead.
file_path <- file.path("data", "analysis.csv")
print(file_path)

## ----logical-operators--------------------------------------------------------
# "Not in" operator: flag the candidates that are absent from a reference set.
fruits <- c("apple", "banana", "orange")
check_fruits <- c("apple", "grape", "banana", "kiwi")

# Build the mask first, then subset with it.
is_missing <- check_fruits %nin% fruits
missing_fruits <- check_fruits[is_missing]
print(paste("Missing fruits:", toString(missing_fruits)))

# Identity checks with %is%
print(5 %is% 5)        # TRUE  (same value, same type)
print("a" %is% "a")    # TRUE
print(5 %is% "5")      # FALSE (numeric vs. character)

## ----combinatorial------------------------------------------------------------
# Count combinations and permutations for a concrete set of items.
# The set size is derived from `items` so the printed counts always match
# the data instead of relying on separately hard-coded numbers.
items <- c("A", "B", "C", "D")
n_items <- length(items)  # 4
n_pick <- 2

# Calculate combination numbers (order does not matter)
combinations_count <- comb(n_items, n_pick)  # C(4,2) = 6
print(paste(
  "Number of ways to choose", n_pick, "items from",
  paste0(n_items, ":"), combinations_count
))

# Calculate permutation numbers (order matters)
permutations_count <- perm(n_items, n_pick)  # P(4,2) = 12
print(paste(
  "Number of ways to arrange", n_pick, "items from",
  paste0(n_items, ":"), permutations_count
))

## ----survey-example-----------------------------------------------------------
# Simulate messy survey data.
# Mixing numbers with "" inside c() coerces each of these columns to
# character, which is exactly the kind of input the cleaning steps target.
survey_data <- data.frame(
  id = 1:8,
  age = c(25, "", 30, NA, "35", 28, 0, 45),
  income = c("50000", "", NA, "75000", "60000", "invalid", "80000", ""),
  satisfaction = c(5, 4, "", 3, NA, 5, 4, 2),
  stringsAsFactors = FALSE
)

print("Original messy survey data:")
print(survey_data)

# Step 1: Identify problematic data
cat("\nData quality assessment:\n")
cat(
  "Columns with void values:",
  paste(cols_with_void(survey_data), collapse = ", "),
  "\n"
)
cat(
  "Rows with void values:",
  paste(rows_with_void(survey_data), collapse = ", "),
  "\n"
)

# Step 2: Clean the data
# Replace void values with defaults. All three columns are still character
# at this point, so every replacement is given as a string to keep the
# column type unchanged until the explicit conversion below.
survey_clean <- survey_data
survey_clean$age <- replace_void(survey_clean$age, value = "25")
survey_clean$income <- replace_void(survey_clean$income, value = "50000")
survey_clean$satisfaction <- replace_void(survey_clean$satisfaction, value = "3")

# Convert to appropriate types
survey_clean$age <- as.numeric(survey_clean$age)
# "invalid" cannot be parsed and becomes NA; that NA is expected and is
# filled in below, so only this one coercion warning is silenced.
survey_clean$income <- suppressWarnings(as.numeric(survey_clean$income))
survey_clean$satisfaction <- as.numeric(survey_clean$satisfaction)

# Handle special cases (e.g., age = 0, income = "invalid")
# %in% never yields NA, so the mask stays valid even if an NA age slips in.
survey_clean$age[survey_clean$age %in% 0] <- 25
survey_clean$income[is.na(survey_clean$income)] <- 50000

print("Cleaned survey data:")
print(survey_clean)

## ----performance-tips---------------------------------------------------------
# For large datasets, check specific columns rather than entire data frame
large_data <- data.frame(
  col1 = sample(c(1:100, NA), 1000, replace = TRUE),
  col2 = sample(c(letters, ""), 1000, replace = TRUE),
  col3 = runif(1000)
)

# Check only columns likely to have voids.
# vapply() pins the result to one logical per column, so an unexpected
# return shape fails loudly instead of silently changing the output type
# (as sapply() would). Names are taken from `critical_cols`.
critical_cols <- c("col1", "col2")
void_status <- vapply(
  critical_cols,
  function(col) any_void(large_data[[col]]),
  logical(1)
)
print("Void status for critical columns:")
print(void_status)

Any scripts or data that you put into this service are public.

evanverse documentation built on March 10, 2026, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

evanverse
Utility Functions for Data Analysis and Visualization

inst/doc/data-processing.R
In evanverse: Utility Functions for Data Analysis and Visualization

Try the evanverse package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

evanverse Utility Functions for Data Analysis and Visualization

inst/doc/data-processing.R In evanverse: Utility Functions for Data Analysis and Visualization

Try the evanverse package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

evanverse
Utility Functions for Data Analysis and Visualization

inst/doc/data-processing.R
In evanverse: Utility Functions for Data Analysis and Visualization