In holmesjoli/validR: Data Validation Tools for R

# Chunk defaults for this vignette: collapse each chunk's source and output
# into one block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

The validR package provides a set of functions for testing and validating data. The main functions are:

test_range: tests if column values are within a certain range
test_values: tests if column values are certain categories
test_unique: tests if column values are unique
test_na: tests if column values don't contain NAs
test_orphan_rec: tests if a merge between the primary and related tables doesn't result in any orphaned keys

To use the testing functions, first create the test setup. How each test class should be set up is described in the Details section of the corresponding help page, e.g. ?setup_test_range.

setup_test_range
setup_test_values
setup_test_unique
setup_test_na
setup_test_orphan_rec

Column Tests

library(validR)

# Example data for the column tests below. The columns differ on purpose:
#   w - starts with an NA, followed by 2 to 10
#   x - the integers 1 to 10, all distinct
#   y - character categories "X", "Y" and "Z"
#   z - the value 1 appears twice, followed by 3 to 10
df <- data.frame(
  w = c(NA, 2:10),
  x = 1:10,
  y = c(rep("X", 4), rep("Y", 4), rep("Z", 2)),
  z = c(1, 1, 3:10),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)

# Setup the test classes
#
# Every setup names the data frame ("df") and the column it applies to.

# Range tests: x and z get a lower and an upper bound, both flagged inclusive;
# w gets a non-inclusive lower bound of 2 and NULL for the upper bound.
setup1 <- setup_test_range(
  df_name = "df", col_name = "x", int = TRUE,
  lower = 1, lower_inclu = TRUE,
  upper = 10, upper_inclu = TRUE
)
setup2 <- setup_test_range(
  df_name = "df", col_name = "z", int = TRUE,
  lower = 3, lower_inclu = TRUE,
  upper = 10, upper_inclu = TRUE
)
setup3 <- setup_test_range(
  df_name = "df", col_name = "w", int = TRUE,
  lower = 2, lower_inclu = FALSE,
  upper = NULL, upper_inclu = NULL
)

# Values test: y is checked against the single category "X".
setup4 <- setup_test_values(df_name = "df", col_name = "y", values = c("X"))

# Uniqueness tests on x and z.
setup5 <- setup_test_unique(df_name = "df", col_name = "x")
setup6 <- setup_test_unique(df_name = "df", col_name = "z")

# NA tests on y and w.
setup7 <- setup_test_na(df_name = "df", col_name = "y")
setup8 <- setup_test_na(df_name = "df", col_name = "w")

# Collect the setups so they can be applied in one pass.
setup <- list(
  setup1, setup2, setup3, setup4,
  setup5, setup6, setup7, setup8
)

# When every setup refers to the same data frame, a single lapply() call can
# run all of the tests against it.
tests_all <- lapply(setup, function(one_setup) test(one_setup, df = df))

# Stack the per-test results of test_summary() into one object.
col_tests <- do.call(rbind, lapply(tests_all, test_summary))

# Stack the per-test results of wrong_rows() the same way.
issues <- do.call(rbind, lapply(tests_all, wrong_rows))

Merge Tests

library(validR)

# Both tables carry the composite key (id1, id2). The key pair (2, 2) occurs
# in primary_df but not in related_df.
primary_df <- data.frame(
  id1 = c(1, 2, 2, 3),
  id2 = c(1, 1, 2, 1),
  y = c(1, 1, 2, 5)
)
related_df <- data.frame(
  id1 = c(1, 2, 3),
  id2 = c(1, 1, 1),
  x = c(1, 2, 3)
)

# NOTE(review): the tables are named "df1"/"df2" here although the objects are
# called primary_df/related_df -- presumably these are only labels; confirm.
setup1 <- setup_test_orphan_rec(
  primary_df = "df1",
  related_df = "df2",
  primary_key = c("id1", "id2"),
  foreign_key = c("id1", "id2")
)

# Run the test on the two tables, then extract its summary and the rows
# returned by wrong_rows().
test1 <- test(setup1, primary_df, related_df)
merge_test1 <- test_summary(test1)
issues1 <- wrong_rows(test1)


# Second example: the first key column is named differently in each table
# (id in primary_df, id1 in related_df). The pair (2, 2) occurs only in
# primary_df and the pair (15, 1) only in related_df.
primary_df <- data.frame(
  id = c(1, 2, 2),
  id2 = c(1, 1, 2),
  y = c(1, 1, 2)
)
related_df <- data.frame(
  id1 = c(1, 2, 15),
  id2 = c(1, 1, 1),
  x = c(1, 2, 3)
)

# The primary and foreign key vectors list the differing column names.
setup2 <- setup_test_orphan_rec(
  primary_df = "df1",
  related_df = "df2",
  primary_key = c("id", "id2"),
  foreign_key = c("id1", "id2")
)

# Run the test on the two tables, then extract its summary and the rows
# returned by wrong_rows().
test2 <- test(setup2, primary_df, related_df)
merge_test2 <- test_summary(test2)
issues2 <- wrong_rows(test2)

# Merge tests
# Combine the results of the two merge tests above: the test_summary() outputs
# go into merge_tests and the wrong_rows() outputs into merge_problems.

merge_tests <- rbind(merge_test1, merge_test2)
merge_problems <- rbind(issues1, issues2)