joyn: Tool for Diagnosis of Tables Joins and Complementary Join Features

# PREPARATION ####
withr::local_options(joyn.verbose = FALSE)
library(data.table)
# options(possible_ids.verbose = FALSE)
x1 = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
                t  = c(1L, 2L, 1L, 2L, NA_integer_),
                x  = 11:15)

y1 = data.table(id = c(1,2, 4),
                y  = c(11L, 15L, 16))


x2 = data.table(id = c(1, 4, 2, 3, NA),
                t  = c(1L, 2L, 1L, 2L, NA_integer_),
                x  = c(16, 12, NA, NA, 15))


y2 = data.table(id = c(1, 2, 5, 6, 3),
                yd = c(1, 2, 5, 6, 3),
                y  = c(11L, 15L, 20L, 13L, 10L),
                x  = c(16:20))


y3 <- data.table(id = c("c","b", "c", "a"),
                 y  = c(11L, 15L, 18L, 20L))

x3 <- data.table(id=c("c","b", "d"),
                 v=8:10,
                 foo=c(4,2, 7))

x4 = data.table(id1 = c(1, 1, 2, 3, 3),
                id2 = c(1, 1, 2, 3, 4),
                t   = c(1L, 2L, 1L, 2L, NA_integer_),
                x   = c(16, 12, NA, NA, 15))


y4 = data.table(id  = c(1, 2, 5, 6, 3),
                id2 = c(1, 1, 2, 3, 4),
                y   = c(11L, 15L, 20L, 13L, 10L),
                x   = c(16:20))

# Auxiliary data: Big data table--------------------

# Set seed for reproducibility
set.seed(123)

# Number of rows and variables
n_rows <- 1e4        # 10,000 rows
n_vars <- 50         # Total variables

# Initialize an empty data.table
dt_large <- data.table(id = 1:n_rows)

## Manually create three variables that uniquely identify the data ####
dt_large[, unique_id1 := rep(1:10, each = 1000)]  # 1000 unique values repeated 100 times
dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)]  # Random character variable
dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)]   # Random integer

# Function to generate random data
generate_random_data <- function(n, type) {
  switch(type,
         "numeric_int" = sample(1:1e6, n, replace = TRUE),
         "numeric_double" = rnorm(n),
         "character" = replicate(n, paste0(sample(letters, 5, replace = TRUE), collapse = "")),
         "factor" = factor(sample(letters[1:10], n, replace = TRUE)),
         "logical" = sample(c(TRUE, FALSE), n, replace = TRUE),
         "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE),
         "datetime" = as.POSIXct("2000-01-01") + sample(0:(3650*24*60*60), n, replace = TRUE)
  )
}

# Variable types and counts
var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime")
vars_per_type <- c(10, 10, 10, 10, 5, 3, 2)  # Total should sum to 50

# Generate variables and add to the data.table
var_count <- 0
for (i in seq_along(var_types)) {
  type <- var_types[i]
  n_vars_type <- vars_per_type[i]
  for (j in 1:n_vars_type) {
    var_count <- var_count + 1
    var_name <- paste0(type, "_", j)
    dt_large[, (var_name) := generate_random_data(n_rows, type)]
  }
}

## Introduce duplicates in some columns that are NOT the unique identifiers ####
# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns
# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)])

# Shuffle the data to avoid ordered data
dt_large <- dt_large[sample(.N)]



# dt_large[, id := .I]
dt <- copy(dt_large)


possible_ids(
  dt = dt_large,
  exclude_classes = c("numeric"),
  verbose = TRUE
)

possible_ids(
  dt = dt_large,
  exclude_classes = c("numeric"),
  exclude = "id",
  verbose = TRUE
)

uniq_vars <- grep("unique_id", names(dt_large), value = TRUE)
pids <- possible_ids(
  dt = dt_large,
  #exclude_classes = c("logical", "date", "datetime", "numeric"),
  exclude = "id",
  #vars = uniq_vars,
  verbose = TRUE,
  min_combination_size = 3,
  # max_combination_size = 3,
  max_processing_time = 240,
  get_all = TRUE
)

possible_ids(
  dt = dt_large,
  verbose = TRUE
)

## Remove the 'id' column to simulate data without a clear unique identifier ####
dt_large[, id := NULL]

possible_ids_list <- possible_ids(
  dt = dt_large,
  exclude_classes = c("logical", "date", "datetime"),  # Exclude some types for efficiency
  verbose = TRUE
)
possible_ids_list

possible_ids_list <- possible_ids(
  dt = dt_large,
  exclude_classes = c("logical", "date", "datetime", "numeric"),  # Exclude some types for efficiency
  max_processing_time = 120,
  verbose = TRUE
)
possible_ids_list



# TESTS ####
## Test create ids --------------------- ####

test_that("create_ids works as intended", {
  df_test <- data.table(a = 1:50)

  # with a single id
  res <- as.data.frame(create_ids(n_rows = 50,
                                  n_ids = 1))

  nrow(unique(res)) |>
    expect_equal(50)

  class(res) |>
    expect_equal("data.frame")


  # ret class
  res <- create_ids(n_rows = 50,
                    n_ids = 1)

  class(res) |>
    expect_equal("list")

  length(res) |>
    expect_equal(1)

  # prefix
  create_ids(n_rows = 40, n_ids = 4, prefix = "unique_id") |>
    names() |>
    expect_equal(paste0("unique_id", 1:4))

  # with more than an id
  vars <- c("var1", "var2", "var3")

  dt <- df_test |> copy()

  res <- dt[, (vars) := create_ids(.N,
                                        n_ids = 3)]

  nrow(res[, .N,

     by = vars][N > 1]) |>
    expect_equal(0)

  res <- df_test[, .(as.data.table(
    create_ids(.N, n_ids = 3)
    ))]

  nrow(res[, .N,

           by = c("id1", "id2", "id3")][N > 1]) |>
    expect_equal(0)

  dt <- dt_large |> copy()

  new_ids <- as.data.table(create_ids(nrow(dt),
                                      n_ids = 3,
                                      prefix = "new_id"))

  # Bind the new IDs to the existing data.table
  dt <- cbind(dt, new_ids)


})



## Test possible_ids ------------------- ####
test_that("convert to data.table", {
  xx1 <- as.data.frame(x1)
  expect_equal(possible_ids(x1), possible_ids(xx1))
})


test_that("store checked ids", {

  res <- possible_ids(x4,
                      get_all = TRUE)

  attributes(res)$checked_ids |>
    expect_setequal(names(x4))

  .joynenv$checked_ids |>
    expect_setequal(names(x4))

  # with get_all = FALSE
  res <- possible_ids(x4)

  attributes(res)$checked_ids |>
    expect_setequal(names(x4))

  .joynenv$checked_ids |>
    expect_setequal(names(x4))

  # with large dt -get_all FALSE
  res <- possible_ids(dt,
                      vars = paste0("numeric_int_", 1:4))

  attributes(res)$checked_ids |>
    expect_setequal(paste0("numeric_int_", 1:4))

  .joynenv$checked_ids |>
    expect_setequal(paste0("numeric_int_", 1:4))

  # with large dt -get_all TRUE
  res <- possible_ids(dt,
                      vars = paste0("numeric_int_", 1:4),
                      get_all = TRUE)

  attributes(res)$checked_ids |>
    expect_setequal(paste0("numeric_int_", 1:4))

  .joynenv$checked_ids |>
    expect_setequal(paste0("numeric_int_", 1:4))

})

test_that("error if not dataframe", {

  m1 <- as.matrix(x1)
  expect_error(possible_ids(m1))

})

test_that("vars provided by user", {

  # single var -raise error
  possible_ids(x4,
               vars = c("t")) |>
    expect_error()

  # one or more vars not included in dt
  possible_ids(x4,
               vars = c("id3", "id2")) |>
    expect_error()

  possible_ids(dt,
               vars = c("id", "id3", "id2")) |>
    expect_error()

  possible_ids(dt,
               vars = c("id", "numeric_int_1", "character_1"),
               verbose = TRUE) |>
    expect_no_error()

  ids_dt <- possible_ids(dt)

  vars <- c("id", "numeric_double_1", "numeric_double_2")

  use_ids_dt <- possible_ids(dt,
               vars = vars)

  all(sapply(use_ids_dt,
             function(x) { x %in% ids_dt })) |>
    expect_equal(TRUE)

  all(unlist(use_ids_dt) %in% vars) |>
    expect_equal(TRUE)


  # Check if the combination of unique_id1, unique_id2, and unique_id3 uniquely identifies rows
  vars <- c("id", "unique_id2", "unique_id3")

  res <- dt[, .N,
           by = vars][N > 1] |>
    nrow()

  # --if it does, possible_ids should return those vars

  if (res == 0) {  # if no duplicate rows
    possible_ids(dt,
                 vars = vars,
                 min_combination_size = 3) |>
      unlist() |>
      expect_setequal(vars)
  }

})

test_that("relationship include and vars", {

  possible_ids(x4,
               vars = c("id1", "id2"),
               include = c("t")) |>
    expect_no_error()

  possible_ids(dt,
               vars = c("id", "unique_id2"),
               include = c("id", "factor_1", "factor_2")) |>
    expect_no_error()

  possible_ids(x4,
               vars = NULL,
               include = c("t", "x")) |>
    expect_no_error()

  possible_ids(x4,
               vars = c("t", "x"),
               include = NULL) |>
    expect_no_error()

  # test checked vars are at least `vars` plus those in `include`
  res <- possible_ids(x4,
                      vars = c("id1", "id2", "t"),
                      include = "t")

  checked_ids <- attributes(res)$checked_ids

  checked_ids |>
    expect_setequal(c("id1", "id2", "t"))

  res <- possible_ids(dt,
                      vars = c("logical_1", "logical_2", "factor_1", "factor_2"),
                      include = "unique_id1")

  checked_ids <- attributes(res)$checked_ids

  checked_ids |>
    expect_setequal(c("logical_1", "logical_2", "factor_1", "factor_2", "unique_id1"))


})

test_that("relationship exclude and vars", {

  possible_ids(x4,
               vars = c("t", "x"),
               exclude_classes = "character") |>
    expect_message()

  possible_ids(x4,
               vars = c("id1", "x"),
               exclude = "x") |>
    expect_message()

  possible_ids(dt,
               vars = paste0("character_", 1:10),
               exclude = c("character_1", "character_2")) |>
    expect_message()
  })

# test_that("inconsistent use of `include`", {
#
#   #  expect_warning(possible_ids(x1,
#   #                              include = "x"))
#   #
#   # possible_ids(x1,
#   #              include = c("id", "x")) |>
#   #   expect_no_error()
#
# })

test_that("exclude and include", {

  dd <- possible_ids(x3,
               exclude_classes =  c("numeric", "integer"),
               include = "foo")

  expect_equal(unlist(dd),
               c("id", "foo"))

  ## Test combination between include class and exclude vars ####

  res <- possible_ids(dt,
                               get_all = TRUE,
                               include_classes     = c("integer"),
                               exclude             = paste0("numeric_int_", 1:5))

  checked_vars <- attributes(res)$checked_ids

  any(
    paste0("numeric_int_", 1:5) %in% checked_vars
    ) |>
    expect_equal(FALSE)

  all(
    paste0("numeric_int_", 6:10) %in% checked_vars
  ) |>
    expect_equal(TRUE)

  ## Test combination between include vars and exclude class ####
  res <- possible_ids(dt,
                      include     = c("numeric_double_1",
                                      "numeric_double_2"),
                      exclude_classes = "numeric")

  checked_vars <- attributes(res)$checked_ids

  all(
    paste0("numeric_double_", 1:2) %in% checked_vars
  ) |>
    expect_equal(TRUE)

  any(
    paste0("numeric_double_", 3:10) %in% checked_vars
  ) |>
    expect_equal(FALSE)

  res_ids <- possible_ids(x2,
                               include = "x",
                               exclude_classes = "numeric")

  res_ids |>
    unlist() |>
    expect_equal(c("t", "x"))

  res_ids <- possible_ids(x3,
                          include = "id",
                          exclude_classes = "character")

  res_ids |>
    unlist() |>
    expect_setequal(c("foo", "id", "v"))

  # alert if include and exclude same class ####
  possible_ids(dt,
               include_classes = "numeric",
               exclude_classes = "numeric") |>
    expect_message()

  # alert if include and exclude same vars ####
  possible_ids(dt,
               include = c("id", "unique_id1"),
               exclude = c("id", "unique_id1")) |>
    expect_message()

  res <- possible_ids(dt,
                      exclude_classes = c("integer"),
                      include = c("numeric_int_1"))

  attributes(res)$checked_ids |>
    expect_setequal(setdiff(names(dt), c(paste0("numeric_int_", 2:10),
                                         "id",
                                         "unique_id1", "unique_id3")))

})


test_that("get length 0 -error", {

  expect_error(possible_ids(x1,
                           exclude_classes = c("numeric", "integer"),
                           include = "t"))

})

test_that("get all works", {

  # no error
  possible_ids(x4,
               get_all = TRUE) |>
    expect_no_error()

  # get all with user selected vars
  possible_ids(x4,
               vars = c("id1", "t"),
               get_all = TRUE) |>
    expect_no_error()

  # get all with max number of combinations
  possible_ids(x4,
               max_combination_size = 3,
               get_all = TRUE) |>
    expect_no_error()

  # check get all combs
  possible_ids(x3,
               get_all = TRUE) |>
    unlist() |>
    expect_setequal(c("id", "v", "foo"))



})


test_that("Max combination size", {

  res <- possible_ids(dt,
               vars = c( "unique_id1", "unique_id2", "unique_id3",
                        "character_1", "character_2", "character_3", "character_4"),
               max_combination_size = 5)


  sapply(res, function(sublist) {
    length(sublist) <= 3}) |>
    all() |>
    expect_true()

  res <- possible_ids(x1,
                      get_all = TRUE,
                      max_combination_size = 2)

  sapply(res, function(sublist) {
    length(sublist) <= 2}) |>
    all() |>
    expect_true()

})

test_that("Min combination size", {

  res <- possible_ids(x4,
               min_combination_size = 1,
               get_all = FALSE) |>
    unlist()

  expect_true(
    length(res) >= 1)

  res <- possible_ids(dt,
               min_combination_size = 3,
               get_all = FALSE) |>
    unlist()

  expect_true(length(res) >= 3)


  possible_ids(x4,
               #min_combination_size = 1,
               max_combination_size = 1) |>
    expect_error()


  possible_ids(x4,
               min_combination_size = 3,
               max_combination_size = 2) |>
    expect_error()

})


test_that("Exclude nothing", {
  p1 <- possible_ids(x1)
  p2 <- possible_ids(x1, exclude = "rer")

  expect_equal(p1, p2)

})


test_that("Exclude type and variable", {

  xx4 <- copy(x4)

  xx4[, id2 := as.character(id2)]
  dd <- possible_ids(xx4,
               exclude_classes = c("character"),
               exclude = "x")

  expect_equal(c("id1", "t"), unlist(dd))

})


test_that("Exclude more than one variable", {


  dd <- possible_ids(x4,
             exclude = c("id2", "x"))

  expect_equal(c("id1", "t"), unlist(dd))

})


test_that("duplicated names", {
  xx4 <- copy(x4)
  setnames(xx4, "t", "x")

  expect_error(possible_ids(xx4))

})

test_that("identifies ids", {

  vars <- c("var1", "var2", "var3")
  dt[, (vars) := create_ids(.N, n_ids = 3)]

  possible_ids(dt,
               vars = vars) |>
    unlist() |>
    expect_equal(vars)

  df_test <- as.data.frame(
    create_ids(n_rows = 50,
               n_ids = 3)
  )

  possible_ids(df_test,
               vars = c("id1", "id2"),
               include = "id3") |>
    unlist() |>
    expect_equal(c("id1", "id2", "id3"))

  possible_ids(df_test,
               exclude_classes = "integer",
               include = c("id1", "id2", "id3")) |>
    unlist() |>
    expect_equal(c("id1", "id2", "id3"))

})

# Auxiliary data: Big data table--------------------

# Set seed for reproducibility
set.seed(123)

# Number of rows and variables
n_rows <- 1e4        # 10,000 rows
n_vars <- 50         # Total variables

# Initialize an empty data.table
dt_large <- data.table(id = 1:n_rows)

## Manually create three variables that uniquely identify the data ####
dt_large[, unique_id1 := rep(1:10, each = 1000)]  # 1000 unique values repeated 100 times
dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)]  # Random character variable
dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)]   # Random integer

# Function to generate random data
generate_random_data <- function(n, type) {
  switch(type,
         "numeric_int" = sample(1:1e6, n, replace = TRUE),
         "numeric_double" = rnorm(n),
         "character" = replicate(n, paste0(sample(letters, 5, replace = TRUE), collapse = "")),
         "factor" = factor(sample(letters[1:10], n, replace = TRUE)),
         "logical" = sample(c(TRUE, FALSE), n, replace = TRUE),
         "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE),
         "datetime" = as.POSIXct("2000-01-01") + sample(0:(3650*24*60*60), n, replace = TRUE)
  )
}

# Variable types and counts
var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime")
vars_per_type <- c(10, 10, 10, 10, 5, 3, 2)  # Total should sum to 50

# Generate variables and add to the data.table
var_count <- 0
for (i in seq_along(var_types)) {
  type <- var_types[i]
  n_vars_type <- vars_per_type[i]
  for (j in 1:n_vars_type) {
    var_count <- var_count + 1
    var_name <- paste0(type, "_", j)
    dt_large[, (var_name) := generate_random_data(n_rows, type)]
  }
}

## Introduce duplicates in some columns that are NOT the unique identifiers ####
# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns
# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)])

# Shuffle the data to avoid ordered data
dt_large <- dt_large[sample(.N)]



# dt_large[, id := .I]
dt <- copy(dt_large)


possible_ids(
  dt = dt_large,
  exclude_classes = c("numeric"),
  verbose = TRUE
)

possible_ids(
  dt = dt_large,
  exclude_classes = c("numeric"),
  exclude = "id",
  verbose = TRUE
)

uniq_vars <- grep("unique_id", names(dt_large), value = TRUE)
pids <- possible_ids(
  dt = dt_large,
  #exclude_classes = c("logical", "date", "datetime", "numeric"),
  exclude = "id",
  #vars = uniq_vars,
  verbose = TRUE,
  min_combination_size = 3,
  # max_combination_size = 3,
  max_processing_time = 240,
  get_all = TRUE
)

possible_ids(
  dt = dt_large,
  verbose = TRUE
)

## Remove the 'id' column to simulate data without a clear unique identifier ####
dt_large[, id := NULL]

possible_ids_list <- possible_ids(
  dt = dt_large,
  exclude_classes = c("logical", "date", "datetime"),  # Exclude some types for efficiency
  verbose = TRUE
)
possible_ids_list

possible_ids_list <- possible_ids(
  dt = dt_large,
  exclude_classes = c("logical", "date", "datetime", "numeric"),  # Exclude some types for efficiency
  max_processing_time = 120,
  verbose = TRUE
)
possible_ids_list
randrescastaneda/joyn documentation built on Dec. 20, 2024, 6:51 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
randrescastaneda/joyn
Tool for Diagnosis of Tables Joins and Complementary Join Features

tests/testthat/test-possible_ids.R
In randrescastaneda/joyn: Tool for Diagnosis of Tables Joins and Complementary Join Features

R Package Documentation

Browse R Packages

We want your feedback!

randrescastaneda/joyn Tool for Diagnosis of Tables Joins and Complementary Join Features

tests/testthat/test-possible_ids.R In randrescastaneda/joyn: Tool for Diagnosis of Tables Joins and Complementary Join Features

R Package Documentation

Browse R Packages

We want your feedback!

randrescastaneda/joyn
Tool for Diagnosis of Tables Joins and Complementary Join Features

tests/testthat/test-possible_ids.R
In randrescastaneda/joyn: Tool for Diagnosis of Tables Joins and Complementary Join Features