Create an array

We first create a RefArray with content from the ArrayContent data frame.

# Source content for the demo array: 20 rows. `da` and `db` are used as the
# array dimensions by the template further down; the remaining columns become
# attributes.
ArrayContent <- data.frame(
  da = 1:20, db = 101:120,
  lower = letters[1:20],
  upper = LETTERS[1:20],
  f_int32 = -20:-1,
  f_int64 = 1:20 * 10.0,
  # The two 5-element vectors below are recycled by data.frame() to 20 rows.
  # Spell out TRUE/FALSE: `T` and `F` are ordinary variables that can be
  # reassigned, which would silently change this column.
  f_bool = c(TRUE, NA, FALSE, NA, FALSE),
  f_double = c(3.14, 2.0, NA, 0, -99)
)

# Build RefArray from ArrayContent using an explicit schema template, then
# persist it. The template string is passed through unchanged.
RefArray <- conn$
  array_from_df(
    ArrayContent,
    template = "<lower:string COMPRESSION 'zlib', upper:string, f_int32:int32, f_int64:int64, f_bool: bool, f_double: double> 
      [da=0:*:0:*; db=0:*:0:*]", 
    # Ensure RefArray has the exact schema as the template.
    # TRUE rather than `T`: `T` is a plain variable and can be reassigned.
    force_template_schema = TRUE
  )$
  # NOTE(review): `.gc = FALSE` presumably keeps the persisted array from being
  # removed automatically; it is removed explicitly at the end of this
  # document. Confirm against the arrayop documentation.
  persist(.gc = FALSE)

# Inspect the AFL expression and download the content (dimensions included).
RefArray$to_afl()
RefArray$to_df_all()

Overview

We can get an arrayOp with the same schema as RefArray but filtered content. The filtering criteria can be from either an R data frame or an arrayOp instance.

Let's define a helper function that prints the AFL of the arrayOp returned by semi_join and downloads its content as a data frame.

# Print the AFL expression of an arrayOp, then download and return its
# content as a data frame (dimensions included).
#
# result_array_op: an object exposing `to_afl()` and `to_df_all()` methods.
show <- function(result_array_op) {
  afl_text <- result_array_op$to_afl()
  print(afl_text)
  result_array_op$to_df_all()
}

Filter mode

When the filtering criteria come from an R data frame whose cell count is small enough to fit in a SciDB build literal, we can use mode = 'filter'.

By one field

# Match on the single field `da`. The query has 4 cells, which is within
# filter_threshold.
show(
  RefArray$semi_join(
    data.frame(da = c(3, 5, 8, 11)),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

# None of these `da` values occur in RefArray, so no cells are matched.
show(
  RefArray$semi_join(
    data.frame(da = c(-10, -11, -12)),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

By two fields

# Query on two attributes: `f_int32` and `lower`.
show(
  RefArray$semi_join(
    data.frame(
      f_int32 = c(-20, -17, -16),
      lower = c("no_match", "d", "e")
    ),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

# Query on one dimension (`da`) and one attribute (`lower`).
show(
  RefArray$semi_join(
    data.frame(
      da = c(-1, 4, 5),
      lower = c("no_match", "d", "e")
    ),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

# Range query on an array dimension: the query columns `da_low` and `da_hi`
# are used as the lower and upper bounds of `da`.
show(
  RefArray$semi_join(
    data.frame(da_low = c(1, 3), da_hi = c(5, 8)),
    lower_bound = list("da" = "da_low"),
    upper_bound = list("da" = "da_hi"),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

# Lower and upper bounds may be set on different dimensions: here the lower
# bound applies to `da` and the upper bound to `db`.
show(
  RefArray$semi_join(
    data.frame(da = c(1, 3), db = c(105, 108)),
    lower_bound = list("da" = "da"),
    upper_bound = list("db" = "db"),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

cross_between mode

When the query data frame has a cell count greater than filter_threshold but smaller than upload_threshold, the data frame is converted to a build literal. If its cell count is greater than upload_threshold, the data frame is first uploaded as a persistent SciDB array.

In both cases, if the arrayOp from the query data frame has more than one attribute, cross_between mode is automatically chosen.

# The query has 12 cells (6 rows x 2 columns): more than filter_threshold,
# fewer than upload_threshold.
show(
  RefArray$semi_join(
    data.frame(da = c(1:5, -1), db = c(101:105, -1)),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

# Same thresholds, with the query columns used as lower/upper bounds of `da`.
show(
  RefArray$semi_join(
    data.frame(da_low = c(1, 3), da_hi = c(5, 8)),
    lower_bound = list(da = "da_low"),
    upper_bound = list(da = "da_hi"),
    filter_threshold = 10,
    upload_threshold = 20
  )
)

# The query is already an arrayOp, so the thresholds are irrelevant.
# cross_between mode is chosen if the query arrayOp has more than one
# attribute.
RefArray$semi_join(
  conn$array_from_df(
    data.frame(da_low = c(1, 3), da_hi = c(5, 8)),
    "<da_low:int64, da_hi:int64> [anything]"
  ),
  lower_bound = list(da = "da_low"),
  # No trailing comma after the last argument: a trailing comma passes an
  # extra empty argument to semi_join, which only works by accident of
  # argument matching.
  upper_bound = list(da = "da_hi")
) %>%
  show


# When the query fields share names with the target's fields but are meant to
# act as bounds rather than exact-match keys, pass `field_mapping = list()`
# explicitly.
show(
  RefArray$semi_join(
    data.frame(da = c(1, 3), db = c(105, 108)),
    lower_bound = list(da = "da"),
    upper_bound = list(db = "db"),
    field_mapping = list(),
    filter_threshold = 2,
    upload_threshold = 20
  )
)

# Same query, supplied as an arrayOp built with RefArray as the template.
show(
  RefArray$semi_join(
    conn$array_from_df(
      data.frame(da = c(1, 3), db = c(105, 108)),
      template = RefArray
    ),
    lower_bound = list(da = "da"),
    upper_bound = list(db = "db"),
    field_mapping = list()
  )
)

index_lookup mode

If the query arrayOp has only one attribute, 'index_lookup' mode is automatically chosen unless otherwise provided.

# Look up by an attribute (`lower`); the query has a single column.
show(
  RefArray$semi_join(
    data.frame(lower = letters[5:15]),
    filter_threshold = 5,
    upload_threshold = 20
  )
)

# Look up by a dimension (`da`); the query has a single column.
show(
  RefArray$semi_join(
    data.frame(da = 1:15),
    filter_threshold = 5,
    upload_threshold = 20
  )
)

A note about uploaded data and temp arrays

When the query data is larger than upload_threshold, a persistent array is created in SciDB. The reference to this array is contained in the resulting arrayOp instance. However, this reference isn't passed along to additional chained operations that may be called. In order to prevent the query array from being garbage collected by R before the arrayOp is executed, you should hold onto a reference to the arrayOp that semi_join returns.

# The following will work: there is only a single semi_join call, and its
# result stays bound to `result_array` until it has been executed.
result_array <- RefArray$semi_join(
  data.frame(da = c(1:5, -1), db = c(101:105, -1)),
  filter_threshold = 10,
  upload_threshold = 20
)

show(result_array)
rm(result_array)

# The following _would_ fail if we didn't keep `temp_arr_1` around as a second
# reference to the result of the first semi_join.
result_array <- temp_arr_1 <- RefArray$semi_join(
                                data.frame(da = c(1:5, -1), db = c(101:105, -1)),
                                filter_threshold = 10,
                                upload_threshold = 20)

# Because we are assigning the result of this second `semi_join` to the same
#   `result_array` variable, the reference to the original uploaded query data
#   goes out of scope and could be garbage collected, resulting in an
#   'array not found' error when you finally execute the arrayOp chain, unless
#   we still have a reference to it (with `temp_arr_1` above).
result_array <- result_array$semi_join(data.frame(da = c(1:5)),
                                       filter_threshold = 10,
                                       upload_threshold = 20)
result_array %>% show
rm(result_array)
rm(temp_arr_1)
# Clean up: remove the RefArray created at the start of this document.
RefArray$remove_array()


Paradigm4/ArrayOpR documentation built on Dec. 11, 2023, 5:59 a.m.