# Nothing
# PREPARATION ####
# Set the joyn.verbose option to FALSE; withr restores it when the scope ends
withr::local_options(joyn.verbose = FALSE)
library(data.table)
# options(possible_ids.verbose = FALSE)

# Small hand-written tables; x1-x4 are the ones used by the tests below.
x1 <- data.table(
  id = c(1L, 1L, 2L, 3L, NA_integer_),
  t  = c(1L, 2L, 1L, 2L, NA_integer_),
  x  = 11:15
)

y1 <- data.table(
  id = c(1, 2, 4),
  y  = c(11L, 15L, 16)
)

x2 <- data.table(
  id = c(1, 4, 2, 3, NA),
  t  = c(1L, 2L, 1L, 2L, NA_integer_),
  x  = c(16, 12, NA, NA, 15)
)

y2 <- data.table(
  id = c(1, 2, 5, 6, 3),
  yd = c(1, 2, 5, 6, 3),
  y  = c(11L, 15L, 20L, 13L, 10L),
  x  = 16:20
)

# Character-keyed pair
y3 <- data.table(
  id = c("c", "b", "c", "a"),
  y  = c(11L, 15L, 18L, 20L)
)

x3 <- data.table(
  id  = c("c", "b", "d"),
  v   = 8:10,
  foo = c(4, 2, 7)
)

# Tables with two id-like columns
x4 <- data.table(
  id1 = c(1, 1, 2, 3, 3),
  id2 = c(1, 1, 2, 3, 4),
  t   = c(1L, 2L, 1L, 2L, NA_integer_),
  x   = c(16, 12, NA, NA, 15)
)

y4 <- data.table(
  id  = c(1, 2, 5, 6, 3),
  id2 = c(1, 1, 2, 3, 4),
  y   = c(11L, 15L, 20L, 13L, 10L),
  x   = 16:20
)
# Auxiliary data: Big data table--------------------
# Fix the RNG seed so the generated table is reproducible
set.seed(123)
# Dimensions of the table
n_rows <- 1e4 # 10,000 rows
n_vars <- 50 # Total variables
# Start from a table that only holds a sequential row id
dt_large <- data.table(id = seq_len(n_rows))
## Manually create three candidate identifier variables ####
# values 1..10, each repeated 1,000 times
set(dt_large, j = "unique_id1", value = rep(1:10, each = 1000))
# random lowercase letter
set(dt_large, j = "unique_id2", value = sample(letters, n_rows, replace = TRUE))
# random integer between 1 and 1000
set(dt_large, j = "unique_id3", value = sample(1:1000, n_rows, replace = TRUE))
# Generate one vector of random data.
#
# Args:
#   n:    Number of values to draw (a non-negative whole number).
#   type: One of "numeric_int", "numeric_double", "character", "factor",
#         "logical", "date" or "datetime".
#
# Returns: a vector of length `n`; its class depends on `type`.
# Raises:  an error when `type` is not one of the labels listed above.
generate_random_data <- function(n, type) {
  switch(
    type,
    "numeric_int" = sample(1:1e6, n, replace = TRUE),
    "numeric_double" = rnorm(n),
    # vapply() (instead of replicate()) keeps the result a character vector
    # even when n is 0; the random draws still happen in the same order.
    "character" = vapply(
      seq_len(n),
      function(i) paste0(sample(letters, 5, replace = TRUE), collapse = ""),
      character(1)
    ),
    "factor" = factor(sample(letters[1:10], n, replace = TRUE)),
    "logical" = sample(c(TRUE, FALSE), n, replace = TRUE),
    # random day offset (0..3650) from 2000-01-01
    "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE),
    # random offset in seconds, up to 3650 days, from 2000-01-01
    "datetime" = as.POSIXct("2000-01-01") +
      sample(0:(3650 * 24 * 60 * 60), n, replace = TRUE),
    # Fail loudly instead of silently returning NULL for an unknown label
    stop("Unsupported `type`: ", type, call. = FALSE)
  )
}
# Column types to generate and how many columns of each
var_types <- c(
  "numeric_int", "numeric_double", "character", "factor",
  "logical", "date", "datetime"
)
vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # Total should sum to 50
# Append the random columns to dt_large, one type at a time
var_count <- 0
for (i in seq_along(var_types)) {
  type <- var_types[i]
  n_vars_type <- vars_per_type[i]
  for (j in seq_len(n_vars_type)) {
    var_count <- var_count + 1
    # columns are named <type>_<index within type>
    var_name <- paste0(type, "_", j)
    set(dt_large, j = var_name, value = generate_random_data(n_rows, type))
  }
}
## Introduce duplicates in some columns that are NOT the unique identifiers ####
# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns
# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)])
# Put the rows in random order so the data are not sorted
dt_large <- dt_large[sample(nrow(dt_large))]
# dt_large[, id := .I]
# Independent copy, taken while the `id` column is still present
dt <- copy(dt_large)
# Exploratory calls on the generated table (the `id` column is still present).
# Return values are discarded unless they are assigned.
possible_ids(
  dt              = dt_large,
  exclude_classes = "numeric",
  verbose         = TRUE
)

possible_ids(
  dt              = dt_large,
  exclude_classes = "numeric",
  exclude         = "id",
  verbose         = TRUE
)

# Names of the three manually created candidate id columns
uniq_vars <- grep("unique_id", names(dt_large), value = TRUE)

pids <- possible_ids(
  dt                   = dt_large,
  #exclude_classes = c("logical", "date", "datetime", "numeric"),
  exclude              = "id",
  #vars = uniq_vars,
  verbose              = TRUE,
  min_combination_size = 3,
  # max_combination_size = 3,
  max_processing_time  = 240,
  get_all              = TRUE
)

possible_ids(
  dt      = dt_large,
  verbose = TRUE
)

## Remove the 'id' column to simulate data without a clear unique identifier ####
# By-reference removal: dt_large loses `id`; the earlier copy `dt` keeps it
dt_large[, id := NULL]

possible_ids_list <- possible_ids(
  dt              = dt_large,
  # Exclude some types for efficiency
  exclude_classes = c("logical", "date", "datetime"),
  verbose         = TRUE
)
possible_ids_list

possible_ids_list <- possible_ids(
  dt                  = dt_large,
  # Exclude some types for efficiency
  exclude_classes     = c("logical", "date", "datetime", "numeric"),
  max_processing_time = 120,
  verbose             = TRUE
)
possible_ids_list
# TESTS ####
## Test create ids --------------------- ####
test_that("create_ids works as intended", {
  base_dt <- data.table(a = 1:50)

  # a single id, coerced to a data.frame: 50 distinct rows
  one_id_df <- as.data.frame(create_ids(n_rows = 50, n_ids = 1))
  expect_equal(nrow(unique(one_id_df)), 50)
  expect_equal(class(one_id_df), "data.frame")

  # class and length of the raw return value
  one_id <- create_ids(n_rows = 50, n_ids = 1)
  expect_equal(class(one_id), "list")
  expect_equal(length(one_id), 1)

  # `prefix` drives the element names
  prefixed <- create_ids(n_rows = 40, n_ids = 4, prefix = "unique_id")
  expect_equal(names(prefixed), paste0("unique_id", 1:4))

  # more than one id: no combination of the three may repeat
  vars <- c("var1", "var2", "var3")
  dt <- copy(base_dt)
  res <- dt[, (vars) := create_ids(.N, n_ids = 3)]
  expect_equal(nrow(res[, .N, by = vars][N > 1]), 0)

  # same check with the default names id1, id2, id3
  res <- base_dt[, .(as.data.table(create_ids(.N, n_ids = 3)))]
  expect_equal(nrow(res[, .N, by = c("id1", "id2", "id3")][N > 1]), 0)

  # Bind freshly created ids to a copy of the large table
  dt <- copy(dt_large)
  new_ids <- as.data.table(
    create_ids(nrow(dt), n_ids = 3, prefix = "new_id")
  )
  dt <- cbind(dt, new_ids)
})
## Test possible_ids ------------------- ####
test_that("convert to data.table", {
  # a plain data.frame must give the same result as the data.table it came from
  x1_df <- as.data.frame(x1)
  possible_ids(x1) |>
    expect_equal(possible_ids(x1_df))
})
test_that("store checked ids", {
  # The checked variables must be available both as an attribute of the
  # result and in the `.joynenv` environment.
  expect_checked <- function(res, expected) {
    expect_setequal(attributes(res)$checked_ids, expected)
    expect_setequal(.joynenv$checked_ids, expected)
  }

  int_vars <- paste0("numeric_int_", 1:4)

  # small table, get_all = TRUE
  expect_checked(possible_ids(x4, get_all = TRUE), names(x4))

  # with get_all = FALSE
  expect_checked(possible_ids(x4), names(x4))

  # with large dt -get_all FALSE
  expect_checked(possible_ids(dt, vars = int_vars), int_vars)

  # with large dt -get_all TRUE
  expect_checked(possible_ids(dt, vars = int_vars, get_all = TRUE), int_vars)
})
test_that("error if not dataframe", {
  # a matrix is not accepted as input
  x1_matrix <- as.matrix(x1)
  possible_ids(x1_matrix) |>
    expect_error()
})
test_that("vars provided by user", {
  # single var -raise error
  expect_error(possible_ids(x4, vars = c("t")))

  # one or more vars not included in dt
  expect_error(possible_ids(x4, vars = c("id3", "id2")))
  expect_error(possible_ids(dt, vars = c("id", "id3", "id2")))

  # all vars present: no error
  expect_no_error(
    possible_ids(dt,
                 vars = c("id", "numeric_int_1", "character_1"),
                 verbose = TRUE)
  )

  # ids found within `vars` must also appear in the unrestricted result
  ids_dt <- possible_ids(dt)
  vars <- c("id", "numeric_double_1", "numeric_double_2")
  use_ids_dt <- possible_ids(dt, vars = vars)

  expect_equal(
    all(sapply(use_ids_dt, function(x) { x %in% ids_dt })),
    TRUE
  )
  expect_equal(all(unlist(use_ids_dt) %in% vars), TRUE)

  # Check if the combination of id, unique_id2, and unique_id3 uniquely identifies rows
  vars <- c("id", "unique_id2", "unique_id3")
  res <- nrow(dt[, .N, by = vars][N > 1])

  # --if it does, possible_ids should return those vars
  if (res == 0) { # if no duplicate rows
    found <- unlist(possible_ids(dt, vars = vars, min_combination_size = 3))
    expect_setequal(found, vars)
  }
})
test_that("relationship include and vars", {
  # none of these vars / include combinations may raise an error
  expect_no_error(
    possible_ids(x4, vars = c("id1", "id2"), include = c("t"))
  )
  expect_no_error(
    possible_ids(dt,
                 vars = c("id", "unique_id2"),
                 include = c("id", "factor_1", "factor_2"))
  )
  expect_no_error(
    possible_ids(x4, vars = NULL, include = c("t", "x"))
  )
  expect_no_error(
    possible_ids(x4, vars = c("t", "x"), include = NULL)
  )

  # test checked vars are at least `vars` plus those in `include`
  res_small <- possible_ids(x4,
                            vars = c("id1", "id2", "t"),
                            include = "t")
  expect_setequal(attributes(res_small)$checked_ids,
                  c("id1", "id2", "t"))

  res_large <- possible_ids(
    dt,
    vars = c("logical_1", "logical_2", "factor_1", "factor_2"),
    include = "unique_id1"
  )
  expect_setequal(
    attributes(res_large)$checked_ids,
    c("logical_1", "logical_2", "factor_1", "factor_2", "unique_id1")
  )
})
test_that("relationship exclude and vars", {
  # each call combines `vars` with an exclusion and must emit a message
  expect_message(
    possible_ids(x4, vars = c("t", "x"), exclude_classes = "character")
  )
  expect_message(
    possible_ids(x4, vars = c("id1", "x"), exclude = "x")
  )
  expect_message(
    possible_ids(dt,
                 vars = paste0("character_", 1:10),
                 exclude = c("character_1", "character_2"))
  )
})
# test_that("inconsistent use of `include`", {
#
# # expect_warning(possible_ids(x1,
# # include = "x"))
# #
# # possible_ids(x1,
# # include = c("id", "x")) |>
# # expect_no_error()
#
# })
test_that("exclude and include", {
  # `include` wins over an excluded class
  ids_x3 <- possible_ids(x3,
                         exclude_classes = c("numeric", "integer"),
                         include = "foo")
  expect_equal(unlist(ids_x3), c("id", "foo"))

  ## Test combination between include class and exclude vars ####
  res <- possible_ids(dt,
                      get_all = TRUE,
                      include_classes = c("integer"),
                      exclude = paste0("numeric_int_", 1:5))
  checked_vars <- attributes(res)$checked_ids

  expect_equal(any(paste0("numeric_int_", 1:5) %in% checked_vars), FALSE)
  expect_equal(all(paste0("numeric_int_", 6:10) %in% checked_vars), TRUE)

  ## Test combination between include vars and exclude class ####
  res <- possible_ids(dt,
                      include = c("numeric_double_1", "numeric_double_2"),
                      exclude_classes = "numeric")
  checked_vars <- attributes(res)$checked_ids

  expect_equal(all(paste0("numeric_double_", 1:2) %in% checked_vars), TRUE)
  expect_equal(any(paste0("numeric_double_", 3:10) %in% checked_vars), FALSE)

  res_ids <- possible_ids(x2,
                          include = "x",
                          exclude_classes = "numeric")
  expect_equal(unlist(res_ids), c("t", "x"))

  res_ids <- possible_ids(x3,
                          include = "id",
                          exclude_classes = "character")
  expect_setequal(unlist(res_ids), c("foo", "id", "v"))

  # alert if include and exclude same class ####
  expect_message(
    possible_ids(dt,
                 include_classes = "numeric",
                 exclude_classes = "numeric")
  )

  # alert if include and exclude same vars ####
  expect_message(
    possible_ids(dt,
                 include = c("id", "unique_id1"),
                 exclude = c("id", "unique_id1"))
  )

  # excluding the integer class while including one integer column
  res <- possible_ids(dt,
                      exclude_classes = c("integer"),
                      include = c("numeric_int_1"))
  not_checked <- c(paste0("numeric_int_", 2:10),
                   "id",
                   "unique_id1", "unique_id3")
  expect_setequal(attributes(res)$checked_ids,
                  setdiff(names(dt), not_checked))
})
test_that("get length 0 -error", {
  # this exclude / include combination on x1 is expected to raise an error
  possible_ids(x1,
               exclude_classes = c("numeric", "integer"),
               include = "t") |>
    expect_error()
})
test_that("get all works", {
  # no error
  expect_no_error(possible_ids(x4, get_all = TRUE))

  # get all with user selected vars
  expect_no_error(
    possible_ids(x4, vars = c("id1", "t"), get_all = TRUE)
  )

  # get all with max number of combinations
  expect_no_error(
    possible_ids(x4, max_combination_size = 3, get_all = TRUE)
  )

  # check get all combs
  all_x3 <- unlist(possible_ids(x3, get_all = TRUE))
  expect_setequal(all_x3, c("id", "v", "foo"))
})
test_that("Max combination size", {
  # Keep each limit in one variable so the value passed to possible_ids() and
  # the value asserted cannot drift apart (the first check used to pass 5 but
  # assert <= 3).
  max_size <- 5
  res <- possible_ids(dt,
                      vars = c("unique_id1", "unique_id2", "unique_id3",
                               "character_1", "character_2", "character_3",
                               "character_4"),
                      max_combination_size = max_size)
  # no returned combination may hold more variables than the limit
  vapply(res, function(sublist) length(sublist) <= max_size, logical(1)) |>
    all() |>
    expect_true()

  max_size <- 2
  res <- possible_ids(x1,
                      get_all = TRUE,
                      max_combination_size = max_size)
  vapply(res, function(sublist) length(sublist) <= max_size, logical(1)) |>
    all() |>
    expect_true()
})
test_that("Min combination size", {
  # the result holds at least `min_combination_size` variables
  ids_x4 <- unlist(
    possible_ids(x4, min_combination_size = 1, get_all = FALSE)
  )
  expect_true(length(ids_x4) >= 1)

  ids_dt <- unlist(
    possible_ids(dt, min_combination_size = 3, get_all = FALSE)
  )
  expect_true(length(ids_dt) >= 3)

  # max_combination_size = 1 with the default minimum raises an error
  expect_error(
    possible_ids(x4,
                 #min_combination_size = 1,
                 max_combination_size = 1)
  )

  # a minimum above the maximum raises an error
  expect_error(
    possible_ids(x4, min_combination_size = 3, max_combination_size = 2)
  )
})
test_that("Exclude nothing", {
  # excluding a name that is not a column of x1 must not change the result
  baseline <- possible_ids(x1)
  with_unknown_exclude <- possible_ids(x1, exclude = "rer")
  expect_equal(baseline, with_unknown_exclude)
})
test_that("Exclude type and variable", {
  # turn id2 into a character column on a copy, then exclude both the
  # character class and the variable `x`
  x4_chr <- copy(x4)
  x4_chr[, id2 := as.character(id2)]
  ids <- possible_ids(x4_chr,
                      exclude_classes = c("character"),
                      exclude = "x")
  expect_equal(c("id1", "t"), unlist(ids))
})
test_that("Exclude more than one variable", {
  # with id2 and x excluded, the result is id1 and t
  ids <- possible_ids(x4, exclude = c("id2", "x"))
  expect_equal(c("id1", "t"), unlist(ids))
})
test_that("duplicated names", {
  # renaming `t` to `x` on a copy leaves two columns called `x`
  dup_dt <- copy(x4)
  setnames(dup_dt, old = "t", new = "x")
  possible_ids(dup_dt) |>
    expect_error()
})
test_that("identifies ids", {
  vars <- c("var1", "var2", "var3")
  # `:=` modifies a data.table by reference, so work on a copy: the shared
  # top-level `dt` fixture must not gain columns as a side effect of this test
  dt_ids <- copy(dt)
  dt_ids[, (vars) := create_ids(.N, n_ids = 3)]

  # the three created ids are returned when the search is limited to them
  possible_ids(dt_ids,
               vars = vars) |>
    unlist() |>
    expect_equal(vars)

  df_test <- as.data.frame(
    create_ids(n_rows = 50,
               n_ids = 3)
  )

  # two ids passed through `vars`, the third through `include`
  possible_ids(df_test,
               vars = c("id1", "id2"),
               include = "id3") |>
    unlist() |>
    expect_equal(c("id1", "id2", "id3"))

  # all three passed through `include` while the integer class is excluded
  possible_ids(df_test,
               exclude_classes = "integer",
               include = c("id1", "id2", "id3")) |>
    unlist() |>
    expect_equal(c("id1", "id2", "id3"))
})
# Auxiliary data: Big data table--------------------
# Fix the RNG seed so the generated table is reproducible
set.seed(123)
# Dimensions of the table
n_rows <- 1e4 # 10,000 rows
n_vars <- 50 # Total variables
# Start from a table that only holds a sequential row id
dt_large <- data.table(id = seq_len(n_rows))
## Manually create three candidate identifier variables ####
# values 1..10, each repeated 1,000 times
set(dt_large, j = "unique_id1", value = rep(1:10, each = 1000))
# random lowercase letter
set(dt_large, j = "unique_id2", value = sample(letters, n_rows, replace = TRUE))
# random integer between 1 and 1000
set(dt_large, j = "unique_id3", value = sample(1:1000, n_rows, replace = TRUE))
# Generate one vector of random data.
#
# Args:
#   n:    Number of values to draw (a non-negative whole number).
#   type: One of "numeric_int", "numeric_double", "character", "factor",
#         "logical", "date" or "datetime".
#
# Returns: a vector of length `n`; its class depends on `type`.
# Raises:  an error when `type` is not one of the labels listed above.
generate_random_data <- function(n, type) {
  switch(
    type,
    "numeric_int" = sample(1:1e6, n, replace = TRUE),
    "numeric_double" = rnorm(n),
    # vapply() (instead of replicate()) keeps the result a character vector
    # even when n is 0; the random draws still happen in the same order.
    "character" = vapply(
      seq_len(n),
      function(i) paste0(sample(letters, 5, replace = TRUE), collapse = ""),
      character(1)
    ),
    "factor" = factor(sample(letters[1:10], n, replace = TRUE)),
    "logical" = sample(c(TRUE, FALSE), n, replace = TRUE),
    # random day offset (0..3650) from 2000-01-01
    "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE),
    # random offset in seconds, up to 3650 days, from 2000-01-01
    "datetime" = as.POSIXct("2000-01-01") +
      sample(0:(3650 * 24 * 60 * 60), n, replace = TRUE),
    # Fail loudly instead of silently returning NULL for an unknown label
    stop("Unsupported `type`: ", type, call. = FALSE)
  )
}
# Column types to generate and how many columns of each
var_types <- c(
  "numeric_int", "numeric_double", "character", "factor",
  "logical", "date", "datetime"
)
vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # Total should sum to 50
# Append the random columns to dt_large, one type at a time
var_count <- 0
for (i in seq_along(var_types)) {
  type <- var_types[i]
  n_vars_type <- vars_per_type[i]
  for (j in seq_len(n_vars_type)) {
    var_count <- var_count + 1
    # columns are named <type>_<index within type>
    var_name <- paste0(type, "_", j)
    set(dt_large, j = var_name, value = generate_random_data(n_rows, type))
  }
}
## Introduce duplicates in some columns that are NOT the unique identifiers ####
# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns
# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)])
# Put the rows in random order so the data are not sorted
dt_large <- dt_large[sample(nrow(dt_large))]
# dt_large[, id := .I]
# Independent copy, taken while the `id` column is still present
dt <- copy(dt_large)
# Exploratory calls on the generated table (the `id` column is still present).
# Return values are discarded unless they are assigned.
possible_ids(
  dt              = dt_large,
  exclude_classes = "numeric",
  verbose         = TRUE
)

possible_ids(
  dt              = dt_large,
  exclude_classes = "numeric",
  exclude         = "id",
  verbose         = TRUE
)

# Names of the three manually created candidate id columns
uniq_vars <- grep("unique_id", names(dt_large), value = TRUE)

pids <- possible_ids(
  dt                   = dt_large,
  #exclude_classes = c("logical", "date", "datetime", "numeric"),
  exclude              = "id",
  #vars = uniq_vars,
  verbose              = TRUE,
  min_combination_size = 3,
  # max_combination_size = 3,
  max_processing_time  = 240,
  get_all              = TRUE
)

possible_ids(
  dt      = dt_large,
  verbose = TRUE
)

## Remove the 'id' column to simulate data without a clear unique identifier ####
# By-reference removal: dt_large loses `id`; the earlier copy `dt` keeps it
dt_large[, id := NULL]

possible_ids_list <- possible_ids(
  dt              = dt_large,
  # Exclude some types for efficiency
  exclude_classes = c("logical", "date", "datetime"),
  verbose         = TRUE
)
possible_ids_list

possible_ids_list <- possible_ids(
  dt                  = dt_large,
  # Exclude some types for efficiency
  exclude_classes     = c("logical", "date", "datetime", "numeric"),
  max_processing_time = 120,
  verbose             = TRUE
)
possible_ids_list
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.