# Default knitr chunk options for every chunk in this guide:
# output formatting first, then figure geometry, then diagnostics.
knitr::opts_chunk$set(
  collapse   = TRUE,
  comment    = "#>",
  fig.width  = 7,
  fig.height = 5,
  fig.align  = "center",
  warning    = FALSE,
  message    = FALSE
)
This guide covers the comprehensive data processing and transformation capabilities of evanverse, with special focus on void value handling and data manipulation utilities.
# Attach the packages used throughout this guide.
# Each library() call must be its own statement; two calls on one line
# without a separator do not parse.
library(evanverse)
library(dplyr)
In data analysis, "void" values are elements that represent missing or absent data. The evanverse package provides a comprehensive system for handling these values.
Void values in evanverse include:
- NA (missing values)
- NULL (null values)
- "" (empty strings)
# Examples of void values
# Each element mixes regular values with NA and/or empty strings.
void_examples <- list(
  numbers = c(1, NA, 3, 4),
  strings = c("A", "", "C", NA),
  mixed = c("text", NA, "", "data")
)

print("Examples of data with void values:")
str(void_examples)
# Check if individual values are void
# The trailing comments record the result the guide expects for each call.
print(is_void(NA))       # TRUE
print(is_void(""))       # TRUE
print(is_void(NULL))     # TRUE
print(is_void("hello"))  # FALSE
print(is_void(0))        # FALSE
# Check if any element in a vector is void
test_vector <- c("A", "", "C", NA, "E")
print(any_void(test_vector))   # TRUE

# Example with no void values
clean_vector <- c("A", "B", "C")
print(any_void(clean_vector))  # FALSE
# Create sample data with various void patterns
# `name` and `city` contain both "" and NA; `age` contains NA only.
sample_data <- data.frame(
  id = 1:6,
  name = c("Alice", "", "Charlie", NA, "Eve", "Frank"),
  age = c(25, 30, NA, 35, 28, 32),
  city = c("NYC", "LA", "", "Chicago", NA, "Boston"),
  stringsAsFactors = FALSE
)

print("Sample data with void values:")
print(sample_data)

# Identify columns with void values
void_cols <- cols_with_void(sample_data)
print(paste("Columns with void values:", paste(void_cols, collapse = ", ")))

# Identify rows with void values
void_rows <- rows_with_void(sample_data)
print(paste("Rows with void values:", paste(void_rows, collapse = ", ")))
# Replace all void values with a single replacement
messy_vector <- c("A", "", "C", NA, "E")
clean_vector <- replace_void(messy_vector, value = "MISSING")

print("Original vector:")
print(messy_vector)

print("After replacement:")
print(clean_vector)
# Replace only specific types of void values
mixed_data <- c("A", "", "C", NA, "E")

# Replace only empty strings (NA is left untouched via include_na = FALSE)
only_empty <- replace_void(
  mixed_data,
  value = "EMPTY",
  include_na = FALSE,
  include_empty_str = TRUE
)
print("Replace only empty strings:")
print(only_empty)

# Replace only NA values ("" is left untouched via include_empty_str = FALSE)
only_na <- replace_void(
  mixed_data,
  value = "NOT_AVAILABLE",
  include_na = TRUE,
  include_empty_str = FALSE
)
print("Replace only NA values:")
print(only_na)
# Apply replacement column by column
# Work on a copy so `sample_data` stays available for later examples.
clean_data <- sample_data
clean_data$name <- replace_void(sample_data$name, value = "UNKNOWN")
clean_data$city <- replace_void(sample_data$city, value = "UNKNOWN")

print("Data after void replacement:")
print(clean_data)
# For vectors, drop_void removes void elements
test_vector <- c("A", "", "C", NA, "E")
clean_vector <- drop_void(test_vector)

print("Original vector:")
print(test_vector)

print("After dropping void elements:")
print(clean_vector)

# For data analysis, we can identify problematic rows/columns
print("Rows with void values:")
print(rows_with_void(sample_data))

print("Columns with void values:")
print(cols_with_void(sample_data))
# Group data by a key column and create lists
# Uses the first 12 rows of the built-in mtcars data set.
mtcars_subset <- mtcars[1:12, c("cyl", "mpg", "hp", "wt")]

# Group by cylinder count, focusing on MPG values
grouped_cars <- df2list(
  data = mtcars_subset,
  key_col = "cyl",
  value_col = "mpg"
)

print("Cars grouped by cylinder count (MPG values):")
str(grouped_cars)

# Access specific groups
print("4-cylinder cars MPG values:")
print(grouped_cars[["4"]])
# Map values in a column using a named vector
grades_data <- data.frame(
  student = c("Alice", "Bob", "Charlie", "Diana"),
  grade_letter = c("A", "B", "A", "C")
)

# Create mapping for letter grades to numbers
grade_mapping <- c("A" = 4.0, "B" = 3.0, "C" = 2.0, "D" = 1.0, "F" = 0.0)

# Apply mapping using the correct parameters:
# look up `grade_letter` in `grade_mapping`, store the result in `grade_numeric`
result <- map_column(
  query = grades_data,
  by = "grade_letter",
  map = grade_mapping,
  to = "grade_numeric"
)

print("Grades with numeric mapping:")
print(result)
# Read various file formats with automatic detection
# NOTE(review): the file names below look like placeholders; point them at
# real files before running this chunk.
data1 <- read_table_flex("data.csv")
data2 <- read_table_flex("data.tsv", sep = "\t")
data3 <- read_table_flex("data.txt", header = TRUE)

# Read Excel files with flexibility
excel_data <- read_excel_flex("workbook.xlsx", sheet = "Sheet1")
# Get comprehensive file information
# NOTE(review): "myfile.csv" looks like a placeholder path; confirm it exists.
info <- file_info("myfile.csv")
print(info)

# Extract file extensions (one get_ext() call per file name)
files <- c("data.csv", "analysis.R", "report.pdf")
extensions <- sapply(files, get_ext)
print(extensions)

# Display directory structure
file_tree(".", max_depth = 2)
# Paste operator for clean string concatenation
full_name <- "John" %p% " " %p% "Doe"
print(full_name)

file_path <- "data" %p% "/" %p% "analysis" %p% ".csv"
print(file_path)
# Enhanced "not in" operator
fruits <- c("apple", "banana", "orange")
check_fruits <- c("apple", "grape", "banana", "kiwi")

# Find fruits not in our list
missing_fruits <- check_fruits[check_fruits %nin% fruits]
print(paste("Missing fruits:", paste(missing_fruits, collapse = ", ")))

# Enhanced identity checking
# The trailing comments record the result the guide expects for each call.
print(5 %is% 5)      # TRUE
print("a" %is% "a")  # TRUE
print(5 %is% "5")    # FALSE
# Generate combinations and permutations
items <- c("A", "B", "C", "D")

# Calculate combination numbers
combinations_count <- comb(4, 2)  # C(4,2) = 6
print(paste("Number of ways to choose 2 items from 4:", combinations_count))

# Calculate permutation numbers
permutations_count <- perm(4, 2)  # P(4,2) = 12
print(paste("Number of ways to arrange 2 items from 4:", permutations_count))
# Simulate messy survey data
# Mixing numbers with "" / quoted values makes c() coerce each of these
# columns to character, which is what the cleaning steps below rely on.
survey_data <- data.frame(
  id = 1:8,
  age = c(25, "", 30, NA, "35", 28, 0, 45),
  income = c("50000", "", NA, "75000", "60000", "invalid", "80000", ""),
  satisfaction = c(5, 4, "", 3, NA, 5, 4, 2),
  stringsAsFactors = FALSE
)

print("Original messy survey data:")
print(survey_data)

# Step 1: Identify problematic data
cat("\nData quality assessment:\n")
cat(
  "Columns with void values:",
  paste(cols_with_void(survey_data), collapse = ", "),
  "\n"
)
cat(
  "Rows with void values:",
  paste(rows_with_void(survey_data), collapse = ", "),
  "\n"
)

# Step 2: Clean the data
# Replace void values with appropriate defaults
survey_clean <- survey_data
survey_clean$age <- replace_void(survey_clean$age, value = "25")
survey_clean$income <- replace_void(survey_clean$income, value = "50000")
survey_clean$satisfaction <- replace_void(survey_clean$satisfaction, value = 3)

# Convert to appropriate types
# as.numeric() turns unparseable text such as "invalid" into NA (with a
# coercion warning); those NAs are filled in by the special-case step below.
survey_clean$age <- as.numeric(survey_clean$age)
survey_clean$income <- as.numeric(survey_clean$income)
survey_clean$satisfaction <- as.numeric(survey_clean$satisfaction)

# Handle special cases (e.g., age = 0, income = "invalid")
# which() drops NA comparisons, so the assignment only touches true zeros.
survey_clean$age[which(survey_clean$age == 0)] <- 25
survey_clean$income[is.na(survey_clean$income)] <- 50000

print("Cleaned survey data:")
print(survey_clean)
# For large datasets, check specific columns rather than entire data frame
# col1 may contain NA, col2 may contain "", col3 is random numeric data.
large_data <- data.frame(
  col1 = sample(c(1:100, NA), 1000, replace = TRUE),
  col2 = sample(c(letters, ""), 1000, replace = TRUE),
  col3 = runif(1000)
)

# Check only columns likely to have voids
critical_cols <- c("col1", "col2")

# vapply() pins the result to one logical per column, so the output type
# cannot silently change the way sapply()'s can.
void_status <- vapply(
  critical_cols,
  function(col) any_void(large_data[[col]]),
  logical(1)
)

print("Void status for critical columns:")
print(void_status)
- Always inspect your data before processing, using cols_with_void() and rows_with_void().
- Choose appropriate replacement values that make sense in your domain context.
- Document your void handling strategy for reproducibility.
- Use selective replacement when different types of voids should be handled differently.
- Validate your results after transformation to ensure data integrity.
The evanverse data processing tools provide a robust foundation for handling real-world messy data with confidence and efficiency.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.