tests/testthat/test_semi_join.R

skip_if_no_db()

# Shared connection object
conn = get_scidb_connection()

schemaTemplate = conn$array_from_schema(
  "<lower:string COMPRESSION 'zlib', upper:string, f_int32:int32, f_int64:int64, f_bool: bool, f_double: double> 
      [da=0:*:0:*; db=0:*:0:*]"
)

ArrayContent = data.frame(
  #da=1:20, db=101:120,
  da = c(1:10, 1:10),
  db = c(101:110, 201:210),
  lower = letters[1:20], 
  upper = LETTERS[1:20],
  f_int32 = -20:-1, 
  f_int64 = 1:20 * 10.0, 
  f_bool = c(T,NA,F,NA,F), 
  f_double = c(3.14, 2.0, NA, 0, -99),
  stringsAsFactors = FALSE
)

RefArray = conn$
  array_from_df(ArrayContent, schemaTemplate, force_template_schema = T)$
  persist(.gc = FALSE)


assert_df_match = function(result_op, expected_df, afl_patterns) {
  resultAfl = result_op$to_afl()
  sapply(afl_patterns, function(x) expect_match(resultAfl, x))
  expect_identical(result_op$to_schema_str(), RefArray$to_schema_str())
  # Scidb doesn't have a deterministic ordering rule, so we need to sort the result data frame and then compare
  expect_equal(
    dplyr::arrange(data.frame(result_op$to_df_all()), da, db),
    dplyr::arrange(expected_df, da, db)
  )
}

# Filter mode ----

test_that("filter, 1-dim", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(3,5,8,11))
  
  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20),
    dplyr::semi_join(ArrayContent, df),
    "filter"
  )
})

test_that("filter, 1-attr", {
  withr::local_options(stringsAsFactors = FALSE)

  values = c(-10, -11, -12)
  df = data.frame(f_int32 = values)

  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20),
    dplyr::semi_join(ArrayContent, df),
    "filter"
  )
})

test_that("filter, 2-attrs", {
  withr::local_options(stringsAsFactors = FALSE)

  values1 = c(-20, -17, -16)
  values2 = c("no_match", "d", "e")
  df = data.frame(f_int32 = values1, lower = values2)

  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20),
    dplyr::semi_join(ArrayContent, df),
    "filter"
  )
})

test_that("filter, 1-dim + 1-attr", {
  withr::local_options(stringsAsFactors = FALSE)

  values1 = c(-1, 4, 5)
  values2 = c("no_match", "d", "e")
  df = data.frame(da = values1, lower = values2)

  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20),
    dplyr::semi_join(ArrayContent, df),
    "filter"
  )
})

test_that("filter, lower/upper bounds on same dimension", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da_low = c(1,3), da_hi = c(5, 8))
  
  assert_df_match(
    RefArray$semi_join(df, 
                   lower_bound = list('da' = 'da_low'), 
                   upper_bound = list('da' = 'da_hi'),
                   filter_threshold=10, upload_threshold=20),
    dplyr::filter(ArrayContent, (da >= 1 & da <= 5) | (da >=3 & da <=8)),
    "filter"
  )
})

test_that("filter, lower/upper bounds on different dimensions", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1,3), db = c(105, 108))
  
  assert_df_match(
    RefArray$semi_join(df, 
                   lower_bound = list(da = 'da'), 
                   upper_bound = list(db = 'db'),
                   filter_threshold=10, upload_threshold=20),
    dplyr::filter(ArrayContent, (da >= 1 & db <= 105) | (da >=3 & db <=108)),
    "filter"
  )
})

test_that("filter mode cannot take array_op as data source", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1,3), db = c(105, 108))
  dataArr = conn$array_from_df(df, RefArray)
  
  expect_error(RefArray$semi_join(dataArr, mode = 'filter'), "data.frame")
})

# Cross_between mode ----

test_that("cross_between, 2 dimension, build", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1:5, -1), db = c(101:105, -1))

  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20, mode="cross_between"),
    dplyr::semi_join(ArrayContent, df),
    c("cross_between", "build")
  )

  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, RefArray), mode="cross_between"),
    dplyr::semi_join(ArrayContent, df),
    c("cross_between", "build")
  )
})

test_that("cross_between, error, no attribute allowed", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1:5, -1), db = c(101:105, -1), lower = letters[1:6])
  expect_error(
    RefArray$semi_join(data.frame(f_int32 = -1, f_int64 = 200), mode = "cross_between"),
    "cross_between"
  )
})


test_that("cross_between, lower/upper bounds on same dimension", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da_low = c(1,3), da_hi = c(5, 8))
  
  assert_df_match(
    RefArray$semi_join(df, 
                   lower_bound = list(da = 'da_low'), 
                   upper_bound = list(da = 'da_hi'),
                   filter_threshold=2, upload_threshold=20, mode="cross_between"),
    dplyr::filter(ArrayContent, (da >= 1 & da <= 5) | (da >=3 & da <=8)),
    "cross_between"
  )
  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, "<da_low:int64, da_hi:int64>[anything]"), 
                   lower_bound = list(da = 'da_low'), 
                   upper_bound = list(da = 'da_hi'),
                   mode="cross_between"),
    dplyr::filter(ArrayContent, (da >= 1 & da <= 5) | (da >=3 & da <=8)),
    "cross_between"
  )
})

test_that("cross_between, lower/upper bounds on different dimensions", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1,3), db = c(105, 108))
  
  assert_df_match(
    RefArray$semi_join(df, 
                   lower_bound = list(da = 'da'), 
                   upper_bound = list(db = 'db'),
                   field_mapping = list(),
                   filter_threshold=2, upload_threshold=20, mode="cross_between"),
    dplyr::filter(ArrayContent, (da >= 1 & db <= 105) | (da >=3 & db <=108)),
    "cross_between"
  )
  
  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, RefArray), 
                   lower_bound = list(da = 'da'), 
                   upper_bound = list(db = 'db'),
                   field_mapping = list(),
                   upload_threshold=0, mode="cross_between"),
    dplyr::filter(ArrayContent, (da >= 1 & db <= 105) | (da >=3 & db <=108)),
    "cross_between"
  )
})

# index_lookup mode ----

test_that("index_lookup, 1 dimension, build", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = 1:15)
  
  assert_df_match(
    RefArray$semi_join(df, filter_threshold=5, upload_threshold=20),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "build")
  )
  
  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, RefArray), upload_threshold=0),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "build")
  )
})


test_that("index_lookup, 1 attribute, build", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(lower = letters[5:15])
  
  assert_df_match(
    RefArray$semi_join(df, filter_threshold=5, upload_threshold=20),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "build")
  )
  
  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, RefArray), upload_threshold=0),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "build")
  )
})

test_that("index_lookup, 1 dimension, upload", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = 1:15)
  
  assert_df_match(
    RefArray$semi_join(df, filter_threshold = 5, upload_threshold = 10),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "Rarrayop|R_array")
  )
  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, RefArray), upload_threshold=0),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "build")
  )
})


test_that("index_lookup, 1 attribute, upload", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(lower = letters[5:15])
  
  assert_df_match(
    RefArray$semi_join(df, filter_threshold = 2, upload_threshold = 5),
    dplyr::semi_join(ArrayContent, df),
    "index_lookup"
  )
  assert_df_match(
    RefArray$semi_join(conn$array_from_df(df, RefArray), upload_threshold=0),
    dplyr::semi_join(ArrayContent, df),
    c("index_lookup", "build")
  )
})

test_that("index_lookup, error cases", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1:5, -1), db = c(101:105, -1))
  dataArray1 = conn$compile_df(df, RefArray)
  dataArray2 = conn$compile_df(df %>% dplyr::select(da), RefArray)
  
  # only 1-dim and 1-attr array is allowed
  expect_error(
    RefArray$semi_join(dataArray1, mode = "index_lookup"),
    "only one dimension and one attribute"
  )
  # only field_mapping allowed
  expect_error(
    RefArray$semi_join(dataArray2, mode = "index_lookup", lower_bound = list(da="da")),
    "lower_bound"
  )
})

test_that("equi_join, 2-attrs", {
  withr::local_options(stringsAsFactors = FALSE)

  values1 = c(-20, -17, -16)
  values2 = c("no_match", "d", "e")
  df = data.frame(f_int32 = values1, lower = values2)

  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20, mode="equi_join"),
    dplyr::semi_join(ArrayContent, df),
    "equi_join"
  )
})

test_that("equi_join, 1-dim + 1-attr", {
  withr::local_options(stringsAsFactors = FALSE)

  values1 = c(-1, 4, 5)
  values2 = c("no_match", "d", "e")
  df = data.frame(da = values1, lower = values2)

  assert_df_match(
    RefArray$semi_join(df, filter_threshold=10, upload_threshold=20, mode="equi_join"),
    dplyr::semi_join(ArrayContent, df),
    "equi_join"
  )
})

test_that("param format error checking", {
  withr::local_options(stringsAsFactors = FALSE)

  df = data.frame(da = c(1:5, -1), db = c(101:105, -1))
  dataArray = conn$compile_df(df, RefArray)
  
  expect_error(
    RefArray$semi_join(df, mode = "non-existent"),
    "non-existent"
  )
  expect_error(RefArray$semi_join(df, field_mapping = list(non_field = "da")), "non_field")
  expect_error(RefArray$semi_join(list()), "list")
})

test_that("all modes return equal results 1 dimension", {
  withr::local_options(stringsAsFactors = FALSE)

  filter_df = data.frame(da = c(3, 5, 8, 11))
  expected_df = dplyr::semi_join(ArrayContent, filter_df)

  semi_join_result = RefArray$semi_join(filter_df, mode = "filter")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "index_lookup")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "cross_between")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "equi_join")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )
})

test_that("all modes return equal results 1 attribute", {
  withr::local_options(stringsAsFactors = FALSE)

  filter_df = data.frame(f_int32 = c(-10, -11, -12))
  expected_df = dplyr::semi_join(ArrayContent, filter_df)

  semi_join_result = RefArray$semi_join(filter_df, mode = "filter")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "index_lookup")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  expect_error(
    RefArray$semi_join(filter_df, mode = "cross_between")$to_df_all(),
    "Only dimensions are matched in this mode"
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "equi_join")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

})

test_that("all modes return equal results 2 dimensions", {
  withr::local_options(stringsAsFactors = FALSE)

  filter_df = data.frame(da = c(1:5, -1), db = c(101, 202, 103, 204, 105, 110))
  expected_df = dplyr::semi_join(ArrayContent, filter_df)

  semi_join_result = RefArray$semi_join(filter_df, mode = "filter")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da),
    dplyr::arrange(expected_df, da)
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "cross_between")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da),
    dplyr::arrange(expected_df, da)
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "equi_join")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da),
    dplyr::arrange(expected_df, da)
  )
})

test_that("all modes return equal results 2 attributes", {
  withr::local_options(stringsAsFactors = FALSE)

  filter_df = data.frame(f_int32 = c(-20, -17, -5), lower = c("no_match", "d", "p"))
  expected_df = dplyr::semi_join(ArrayContent, filter_df)

  semi_join_result = RefArray$semi_join(filter_df, mode = "filter")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  expect_error(
    RefArray$semi_join(filter_df, mode = "cross_between")$to_df_all(),
    "Only dimensions are matched in this mode"
  )

  semi_join_result = RefArray$semi_join(filter_df, mode = "equi_join")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )  
})

test_that("all modes return equal results 1 dimension, 1 attribute", {
  withr::local_options(stringsAsFactors = FALSE)

  filter_df = data.frame(da = c(20, 4, 6), lower = c("no_match", "d", "p"))
  expected_df = dplyr::semi_join(ArrayContent, filter_df)

  semi_join_result = RefArray$semi_join(filter_df, mode = "filter")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  )

  # Not testing cross_between, results won't be the same as it ignores attributes.
  # TODO: put in a error condition if you pass attributes to cross_between mode

  semi_join_result = RefArray$semi_join(filter_df, mode = "equi_join")$to_df_all()
  expect_equal(
    dplyr::arrange(semi_join_result, da, db),
    dplyr::arrange(expected_df, da, db)
  ) 
})

RefArray$remove_array()
Paradigm4/ArrayOpR documentation built on Dec. 11, 2023, 5:59 a.m.