# A base class for other specialized array operands/operations, e.g. SubsetOp, JoinOp, ArraySchema
# SciDB dimension bound sentinels used when building 'cross_between' region arrays.
# Kept as strings (not numerics) because the values are near +/- 2^62 and would
# lose precision if stored as R doubles.
MAX_DIM = '4611686018427387903'
MIN_DIM = '-4611686018427387902'
#' A string representation of ArrayOp instance
#'
#' Displays "<afl> <schema>" for the given ArrayOp.
#' @param object the ArrayOp to get a string representation of
#' @param ... Additional arguments (accepted for S3 compatibility; ignored)
#' @return The formatted string, invisibly.
#' @export
str.ArrayOpBase = function(object, ...) {
  # str() is expected to *display* its subject. The previous implementation only
  # returned the sprintf() result, so nothing was shown when dispatched via the
  # str() generic. Print the representation, then return it invisibly so any
  # caller that captured the return value keeps working.
  repr = sprintf("%s %s", object$to_afl(), object$to_schema_str())
  cat(repr, "\n", sep = "")
  invisible(repr)
}
#' Print an ArrayOp instance
#'
#' Displays "<afl> <schema>" for the given ArrayOp.
#' @param x The ArrayOp to print
#' @param ... Additional arguments (accepted for S3 compatibility; ignored)
#' @return `x`, invisibly, following the print-method convention.
#' @export
print.ArrayOpBase = function(x, ...) {
  # The previous implementation returned sprintf()'s value without printing it.
  # During auto-printing the dispatch discards that value, so an ArrayOp at the
  # top level displayed nothing. Print methods must cat() their output and
  # return the object invisibly.
  cat(sprintf("%s %s\n", x$to_afl(), x$to_schema_str()))
  invisible(x)
}
#' ArrayOp base class that encapsulates a scidb array or array operations
#'
#' @description
#' ArrayOp class instances denote scidb array operations and operands, hence the name.
#'
#' Operands can be plain scidb array names or (potentially nested) operations on arrays.
#'
#' Most ArrayOp class methods return a new ArrayOp instance and the original
#' instance on which methods are invoked from remains the same,
#' i.e. ArrayOp instances are immutable.
#'
#' @details
#' One ArrayOp operation may involve one or multiple scidb operators and any number of operands,
#' and one operation's result can itself serve as an operand in another operation. Operands and operation results can all be denoted by ArrayOp.
#'
#' Sub-classes of ArrayOpBase deal with any syntax or operator changes in different
#' SciDB version so that the ArrayOpBase class can provide a unified API on all
#' supported SciDB versions. Currently SciDB V18 and V19 are supported.
#'
#' Users of `arrayop` package shouldn't be concerned with a specific sub-class since
#' the `ScidbConnection` object automatically chooses the correct class version
#' and creates instances based on the scidb version it connects to.
#'
#' Get arrayOp instances from the default ScidbConnection object.
#' See `arrayop::get_default_connection` for details.
ArrayOpBase <- R6::R6Class(
"ArrayOpBase",
cloneable = FALSE,
# ArrayOpBase class internally maintains three private fields to represent its state.
private = list(
# Raw AFL encapsulated in the ArrayOp instance.
# A string of either an array name or potentially nested AFL operations.
raw_afl = NULL
,
# A list that stores other meta including: field types, dimension specs.
metaList = NULL
,
# A ScidbConnection instance which determines where and how AFL is executed.
conn = NULL
,
# Generalized function to set or update a metadata entry with a (key, value) pair.
# Mutates private$metaList in place. Note: assigning NULL removes the entry
# entirely (standard R `[[<-` list semantics).
set_meta = function(key, value) {
  private$metaList[[key]] <- value
}
,
# Generalized function to return a metadata entry by key.
# Return NULL if key doesn't exist.
get_meta = function(key) {
  private$metaList[[key]]
}
,
# todo: move connection setup to initialize function
# Attach the ScidbConnection that determines where and how this
# ArrayOp's AFL is executed.
set_conn = function(connection) {
  private$conn = connection
}
,
# Keep references to `objs` alive for the lifetime of this ArrayOp.
# References accumulate across calls. This prevents R's garbage collector
# from removing scidbr-loaded arrays that this ArrayOp's AFL still depends on.
hold_refs = function(objs){
  refKey = ".refs"
  # NULL on the first call; c(NULL, objs) is simply objs
  alreadyHeld = private$get_meta(refKey)
  private$set_meta(refKey, c(alreadyHeld, objs))
}
,
# Set gc to TRUE or FALSE
# Records whether this arrayOp should be gc'ed; read back by is_gc().
set_gc = function(if_gc){
  assert_single_bool(if_gc)
  key = ".gc"
  private$set_meta(key, if_gc)
}
,
# Whether the arrayOp should be gc'ed. Default FALSE.
is_gc = function(){
  key = ".gc"
  # get_meta returns NULL when the flag was never set; %?% coalesces to FALSE
  private$get_meta(key) %?% FALSE
}
,
# set the metadata to indicate the schema from this array_op instance is
# returned from SciDB rather than locally inferred in R
# Returns self invisibly so calls can be chained.
confirm_schema_synced = function(){
  private$set_meta('is_schema_from_scidb', TRUE)
  invisible(self)
}
,
# @description
# Return ArrayOp field types as a list
#
# @param field_names R character. If NULL, defaults to `self$dims_n_attrs`, ie. dimensions and attributes.
# @param .raw Default FALSE, full data types are returned; if set TRUE, only the raw data types are returned
# (raw data types are string, int32, int64, bool, etc, without scidb attribute specs such as: string compression 'zlib')
# @return a named list where keys are field names,
# and values are scidb data types (single word or multiple words).
#
# Throw an error if field_names are not all valid.
get_field_types = function(field_names = NULL, .raw = FALSE){
  # NULL defaults to every dimension and attribute
  field_names = field_names %?% self$dims_n_attrs
  assert_empty(field_names %-% self$dims_n_attrs, "Invalid field(s): [{.value}]")
  fullTypes = new_named_list(self$dtypes[field_names], field_names)
  if(!.raw) return(fullTypes)
  # Raw mode: strip scidb attribute specs, keeping only the leading type word
  new_named_list(regmatches(fullTypes, regexpr("^\\w+", fullTypes)), names = field_names)
}
,
# @description
# Get dimension specifications as a list
#
# A dimension spec str is formatted as "lower bound : upper bound : overlap : chunk_length",
# as seen in scidb `show(array_name)` operator.
# All dimensions' data types are int64, as returned by `get_field_types` function.
# @param dim_names Default NULL equals all dimensions.
# @return A named list where the name is dimension name and value is a dimension spec string
#
# Throw an error if dim_names are not all valid dimension names.
get_dim_specs = function(dim_names = NULL) {
  dim_names = dim_names %?% self$dims
  assert_empty(dim_names %-% self$dims, "Invalid dimension(s): [{.value}]")
  # Specs were recorded in metadata under 'dim_specs'; subsetting preserves dim_names order
  private$get_meta('dim_specs')[dim_names]
}
,
# If self has no selected fields, select all of them (dims + attrs);
# otherwise return self unchanged.
auto_select = function() {
  if(length(self$selected) > 0L) return(self)
  self$select(self$dims_n_attrs)
}
,
# Validate one side of a join: the operand must be an ArrayOpBase and the
# keys must be a non-empty character vector of the operand's dims/attrs.
# @param side 'left' or 'right'; used only for error messages
validate_join_operand = function(side, operand, keys){
  assert(inherits(operand, "ArrayOpBase"),
    "JoinOp arg '%s' must be class of [%s], but got '%s' instead.",
    side, 'ArrayOpBase', paste(class(operand), collapse=","))
  assert(is.character(keys) && .not_empty(keys),
    "Join params 'on_%s' must be a non-empty R character, but got '%s' instead",
    side, paste(class(keys), collapse = ","))
  assert_empty(keys %-% operand$dims_n_attrs,
    sprintf("Invalid key field(s): [{.value}] on the %s operand", side))
}
,
# Validate both join operands, matching key lengths, and the optional
# settings list. Raises on the first violation; no meaningful return value.
validate_join_params = function(left, right, by.x, by.y, settings) {
  private$validate_join_operand('left', left, by.x)
  private$validate_join_operand('right', right, by.y)
  # Key vectors must pair up one-to-one
  assert(length(by.x) == length(by.y),
    "ArrayOp join: by.x[%s field(s)] and by.y[%s field(s)] must have the same length.",
    length(by.x), length(by.y))
  # Settings, when present, must be a list where every item carries a name
  if(.not_empty(settings) &&
     (.is_empty(names(settings)) || any(names(settings) == ''))){
    stop("ArrayOp join settings must be a named list and each setting item must have a non-empty name")
  }
}
,
# @description
# Generate afl for ArrayOp used in a join context (equi_join).
#
# Prerequisites include 1. dimensions/attributes/selected field names 2. .to_afl
# Joined field(s) (attr or dimension) is converted to an attribute in equi_join result
# All attributes and joined dimensions will be kept in equi_join result, which may not be optimal.
# All non-joined dimensions will be dropped.
# Detailed design and logic flow can be found at:
# https://docs.google.com/spreadsheets/d/1kN7QgvQXXxcovW9q25d4TNb6tsf888-xhWdZW-WELWw/edit?usp=sharing
# @param keyFields Field names as join keys
# @param keep_dimensions Whether `keep_dimensions` is specified in the scidb `equi_join` operator
# @param artificial_field Attribute name used when a placeholder attribute must be created
# @return An AFL string
to_join_operand_afl = function(keyFields, keep_dimensions = FALSE, artificial_field = dbutils$random_field_name()) {
  selectedFields = self$selected
  # Nothing selected: pass the operand through untouched
  if(.is_empty(selectedFields))
    return(self$to_afl())
  dimensions = self$dims
  attributes = self$attrs
  arrName = self$to_afl()
  # selected, non-key dimensions must be 'apply'-ed to become attributes
  applyList <- selectedFields %n% (dimensions %-% keyFields)
  projectList <- applyList %u% (attributes %n% selectedFields) %u% (keyFields %-% dimensions)
  specialList <- dimensions %n% keyFields # for joined-on dimensions which we also want to keep in result
  if(keep_dimensions){
    # equi_join retains dimensions itself, so don't convert them to attributes
    applyList = applyList %-% self$dims
    projectList = projectList %-% self$dims
    # specialList = specialList %-% self$dims
  }
  # case-1: When no attrs projected && selected dimensions, we need to create an artificial attr to project on.
  if(.not_empty(specialList) && .is_empty(projectList) && .not_empty(selectedFields)){
    res = afl(arrName | apply(artificial_field, 'null') | project(artificial_field))
  }
  # case-2: Regular mode. Just 'apply'/'project' for dimensions/attributes when needed.
  else{
    applyExpr = if(.not_empty(applyList))
      afl(arrName | apply(aflutils$join_fields(applyList, applyList)))
    else arrName
    res = if(.not_empty(projectList))
      afl(applyExpr | project(projectList))
    else applyExpr
  }
  return(res)
}
,
# @description
# Create a new ArrayOp instance by joining with another ArrayOp
#
# Dispatches to scidb `equi_join` or `cross_join` depending on `join_mode`.
# @param left The left ArrayOp operand.
# @param right The right ArrayOp operand to join with.
# @param by.x R character vector. Join keys from `left`.
# @param by.y R character vector. Join keys from `right`. Must be of the same length as `by.x`.
# @param by Join keys shared by both operands; unioned into both by.x and by.y.
#   When by, by.x and by.y are all empty, keys default to fields common to both operands.
# @param join_mode One of ['equi_join', 'cross_join'].
# @param settings Join settings, a named list where both keys and values are strings.
# @param left_alias,right_alias Operand aliases used to disambiguate clashing field names.
# @return A new arrayOp
join = function(left, right,
                by.x = NULL, by.y = NULL, by = NULL,
                join_mode = 'equi_join',
                settings = NULL,
                left_alias = '_L', right_alias = '_R') {
  # No keys given at all: infer them as fields present on both operands
  if(.is_empty(by.x) && .is_empty(by.y) && .is_empty(by)){
    by = left$dims_n_attrs %n% right$dims_n_attrs
  }
  if(.not_empty(by)){
    by.x = by %u% by.x
    by.y = by %u% by.y
  }
  # switch() selects the implementation function, which is then immediately
  # invoked with both operands auto-selected (all fields if none were selected)
  switch(join_mode,
    'equi_join' = private$equi_join,
    'cross_join' = private$cross_join,
    stopf("ERROR: ArrayOp$join: Invalid param 'join_mode' %s. Must be one of [equi_join, cross_join]", join_mode)
  )(
    left$.private$auto_select(),
    right$.private$auto_select(),
    by.x, by.y, settings = settings,
    .left_alias = left_alias, .right_alias = right_alias)
}
,
# @description
# Disambiguate selected fields of join
#
# If no name clashes between selected fields of both operands, nothing changes.
# Otherwise every clashing field is referenced by its alias-qualified scidb
# expression (e.g. '_L.value') while being exposed under an alias-suffixed
# output name (e.g. 'value_L'), on both operands.
# @return A named character vector: names are result field names, values are
#   the (possibly alias-qualified) field expressions.
disambiguate_join_fields = function(left_fields, right_fields, .left_alias, .right_alias) {
  rawFields = c(left_fields, right_fields)
  duplicatedFields = rawFields[duplicated(rawFields)]
  if(length(duplicatedFields) > 0L){
    # values: alias-qualified references usable inside AFL expressions
    selectedLeftFields = Map(function(x) if(x %in% duplicatedFields) sprintf("%s.%s", .left_alias, x) else x,
      left_fields)
    selectedRightFields = Map(function(x) if(x %in% duplicatedFields) sprintf("%s.%s", .right_alias, x) else x,
      right_fields)
    # names: suffixed output names so the result schema contains no duplicates
    selectedLeftNames = Map(function(x) if(x %in% duplicatedFields) sprintf("%s%s", x, .left_alias) else x,
      left_fields)
    selectedRightNames = Map(function(x) if(x %in% duplicatedFields) sprintf("%s%s", x, .right_alias) else x,
      right_fields)
    .new_named_char_vec(
      values = selectedLeftFields %u% selectedRightFields,
      names = selectedLeftNames %u% selectedRightNames
    )
  } else {
    .new_named_char_vec(rawFields)
  }
}
,
# Join two operands with scidb `equi_join`.
# The result's dimensions are equi_join's artificial [instance_id, value_no];
# selected fields of both sides become attributes, disambiguated on name clash.
equi_join = function(left, right, by.x, by.y, settings = NULL,
                     .left_alias = '_L', .right_alias = '_R') {
  # Validate join params
  private$validate_join_params(left, right, by.x, by.y, settings)
  # Validate selected fields
  hasSelected = .not_empty(left$selected) || .not_empty(right$selected)
  assertf(hasSelected, "There should be selected fields of equi_join operands.")
  # Create setting items; join keys travel as equi_join's left_names/right_names settings
  mergedSettings <- c(list(left_names = by.x, right_names = by.y), settings)
  # Values of setting items cannot be quoted.
  # But the whole 'key=value' needs single quotation according to equi_join plugin specs
  settingItems = mapply(function(k, v) private$to_equi_join_setting_item_str(k, v, .left_alias, .right_alias),
    names(mergedSettings), mergedSettings)
  # keep_dimensions is truthy when the setting equals 1 or coerces to TRUE
  keep_dimensions = .not_empty(settings[['keep_dimensions']]) &&
    (settings[['keep_dimensions']] == 1 || settings[['keep_dimensions']])
  # Join two operands
  joinExpr <- sprintf(private$equi_join_template(.left_alias, .right_alias),
    left$.private$to_join_operand_afl(by.x, keep_dimensions = keep_dimensions),
    right$.private$to_join_operand_afl(by.y, keep_dimensions = keep_dimensions),
    paste(settingItems, collapse = ', '))
  dims = list(instance_id = 'int64', value_no = 'int64')
  # right-side join keys collapse into the left's, hence excluded from attrs
  attrs = as.character(unique(c(left$selected, right$selected %-% by.y)))
  dtypes = .remove_null_values(c(dims, c(left$dtypes, right$dtypes)[attrs]))
  selectedFields = private$disambiguate_join_fields(
    left$selected, right$selected %-% by.y,
    .left_alias, .right_alias
  )
  joinedOp = private$create_new(joinExpr, names(dims), attrs, dtypes = dtypes)
  return(
    # selectedFields names and values may be different due to disambiguation
    joinedOp$.private$reshape_fields(selectedFields)$select(names(selectedFields))
  )
}
,
# Join two operands with scidb `cross_join` (dimension-only join keys).
# Unlike equi_join, operand dimensions are preserved in the result.
cross_join = function(left, right, by.x, by.y, settings = NULL,
                      .left_alias = '_L', .right_alias = '_R') {
  # Scidb cross_join operator only allows join on operands' dimensions
  assert_keys_are_all_dimensions = function(side, operand, keys){
    nonDimKeys = keys %-% operand$dims
    assert_not_has_len(nonDimKeys,
      "ERROR: ArrayOp$join: cross_join mode: All join keys must be dimensions, but %s key(s) '%s' are not.",
      side, paste(nonDimKeys, collapse = ','))
  }
  # Validate join params
  private$validate_join_params(left, right, by.x, by.y, settings)
  assert_keys_are_all_dimensions('left', left, by.x)
  assert_keys_are_all_dimensions('right', right, by.y)
  # Construct the AFL: trailing ", alias.dim, alias.dim" pairs declare the join dimensions
  joinDims = paste(sprintf(", %s.%s, %s.%s", .left_alias, by.x, .right_alias, by.y), collapse = '')
  aflStr = sprintf("cross_join(%s as %s, %s as %s %s)",
    left$.private$to_join_operand_afl(by.x, keep_dimensions = TRUE),
    .left_alias,
    right$.private$to_join_operand_afl(by.y, keep_dimensions = TRUE),
    .right_alias,
    joinDims)
  # Result schema: right-side join dimensions collapse into the left's
  attrs = c(left$attrs, right$attrs)
  dims = c(left$dims, right$dims %-% by.y)
  dtypes = c(left$.private$get_field_types(),
    right$.private$get_field_types(right$dims_n_attrs %-% by.y))
  dim_specs = c(left$.private$get_dim_specs(),
    right$.private$get_dim_specs(right$dims %-% by.y))
  joinedOp = private$create_new(aflStr, dims = dims, attrs = attrs, dtypes = dtypes, dim_specs = dim_specs)
  selectedFields = private$disambiguate_join_fields(
    left$selected, right$selected %-% by.y,
    .left_alias, .right_alias
  )
  # cross_join keeps operands' dimensions, no need to apply dimensions as attributes
  selectedAttrs = Filter(function(x) !x %in% left$dims && !x %in% right$dims, selectedFields)
  joinedOp$.private$reshape_fields(selectedAttrs)$select(names(selectedFields))
}
,
# TODO: Move logic from `match` method
# Thin wrapper: semi-join self against a data.frame via `match` in 'filter' mode.
semi_join_by_df_filter = function(df, field_mapping, lower_bound, upper_bound){
  private$match(
    df, "filter",
    field_mapping = field_mapping,
    lower_bound = lower_bound,
    upper_bound = upper_bound
  )
}
,
# @description
# Create a new ArrayOp by matching a template against a source (self).
#
# The result has the same schema as the source.
# All fields in the template are compared to their matching source fields by equality, except for those in
# lower_bound/upper_bound which will be used as a range `[lower_bound, upper_bound]`.
# @param template A data.frame or ArrayOp used to reduce the number of source cells without changing its schema
# @param op_mode One of ['filter', 'cross_between', 'index_lookup', 'equi_join']
# @param lower_bound Field names as lower bounds.
# @param upper_bound Field names as upper bounds.
# @param field_mapping A named list where name is source field name and value is template field name.
#   Default NULL: fields are mapped between template and source by field names only.
#   If there are mapping fields in the template which are intended for lower or upper bound,
#   provide an empty list or a list with matching fields
# @return A new ArrayOp instance which has the same schema as the source.
match = function(template, op_mode, lower_bound = NULL, upper_bound = NULL, field_mapping = NULL){
  # Bound fields and mapped fields must not overlap, or the match semantics would be ambiguous
  assert_not_has_len(names(lower_bound) %n% names(field_mapping),
    "ERROR: ArrayOp$semi_join: Field names in param 'lower_bound' and 'field_mapping' cannot overlap: '%s'",
    paste(names(lower_bound) %n% names(field_mapping), collapse = ','))
  assert_not_has_len(names(upper_bound) %n% names(field_mapping),
    "ERROR: ArrayOp$semi_join: Field names in param 'upper_bound' and 'field_mapping' cannot overlap: '%s'",
    paste(names(upper_bound) %n% names(field_mapping), collapse = ','))
  if(.not_empty(lower_bound))
    assert_named_list(lower_bound, "ERROR: ArrayOp$semi_join: lower_bound if provided must be a named list.")
  if(.not_empty(upper_bound))
    assert_named_list(upper_bound, "ERROR: ArrayOp$semi_join: upper_bound if provided must be a named list.")
  # Mode: template is a data.frame; each row becomes one filter() disjunct
  filter_mode = function(){
    assert(inherits(template, 'data.frame'),
      "ERROR: ArrayOp$semi_join: filter mode: template must be a data.frame, but got: %s", class(template))
    unmatchedCols = names(template) %-% self$dims_n_attrs %-% lower_bound %-% upper_bound
    assert_not_has_len(unmatchedCols,
      "ERROR: ArrayOp$semi_join: filter mode: template field(s) not matching the source: '%s'",
      paste(unmatchedCols, collapse = ','))
    colTypes = sapply(template, class)
    # non-numeric values are single-quoted in the AFL expression
    needQuotes = !(colTypes %in% c('numeric', 'integer', 'integer64'))
    valueStrTemplates = lapply(needQuotes, .ifelse, "'%s'", "%s")
    # Iterate on the template data frame
    convertRow = function(eachRow, colNames) {
      rowValues = mapply(sprintf, valueStrTemplates, eachRow)
      # Each filter item per row per field; a field may contribute both a lower
      # and an upper bound comparison, hence the vector accumulation
      rowItems = mapply(function(name, val){
        operator = as.character(NULL)
        sourceFieldName = as.character(NULL)
        if(name %in% lower_bound) {
          operator = c(operator, '>=')
          sourceFieldName = c(sourceFieldName, names(lower_bound)[lower_bound == name][[1]])
        }
        if(name %in% upper_bound) {
          operator = c(operator, '<=')
          sourceFieldName = c(sourceFieldName, names(upper_bound)[upper_bound == name][[1]])
        }
        if(!name %in% lower_bound && !name %in% upper_bound){
          operator = c(operator, '=')
          sourceFieldName = c(sourceFieldName, name)
        }
        sprintf("%s%s%s", sourceFieldName, operator, val)
      }, colNames, rowValues)
      if(is.list(rowItems))
        rowItems = do.call(c, rowItems) # in case of mixed vector and string
      sprintf(
        .ifelse(length(rowItems) > 1, "(%s)", "%s"), # String template for a row
        paste(rowItems, collapse = ' and ')
      )
    }
    # NOTE(review): apply() on a data.frame coerces each row to a common type;
    # verify mixed-type templates still format values as intended
    filter_afl = paste( apply(template, 1, convertRow, names(template)), collapse = ' or ' )
    return(afl(self | filter(filter_afl)))
  }
  # Mode: template is an ArrayOp; build a region array consumed by scidb cross_between
  cross_between_mode = function(){
    assert(inherits(template, 'ArrayOpBase'),
      "ERROR: ArrayOp$semi_join: cross_between mode: template must be a ArrayOp instance, but got: %s", class(template))
    if(is.null(field_mapping)){
      dimMatchMarks = self$dims %in% template$dims_n_attrs
      matchedDims = template$dims_n_attrs %n% self$dims
      field_mapping = as.list(structure(matchedDims, names = matchedDims))
    }
    else {
      matchedDims = names(field_mapping) %n% self$dims
      dimMatchMarks = self$dims %in% names(field_mapping)
    }
    assert_has_len(matchedDims %u% names(lower_bound) %u% names(upper_bound),
      "ERROR: ArrayOp$semi_join: cross_between mode: none of the template fields '%s' matches the source's dimensions: '%s'.
Only dimensions are matched in this mode. Attributes are ignored even if they are provided.",
      paste(template$dims_n_attrs, collapse = ','), paste(self$dims, collapse = ','))
    # get region array's attr values: one low/high value per source dimension
    getRegionArrayAttrValue = function(default, low){
      res = rep(default, length(self$dims))
      # NOTE(review): 1:length(...) misbehaves when dims is empty; seq_along would be safer
      for(i in 1:length(self$dims)){
        mainDimKeyName = self$dims[[i]]
        if(low && mainDimKeyName %in% names(lower_bound)){
          res[[i]] <- lower_bound[[mainDimKeyName]]
        }
        else if(!low && mainDimKeyName %in% names(upper_bound)){
          res[[i]] <- upper_bound[[mainDimKeyName]]
        }
        else if(dimMatchMarks[[i]]){
          # Handle the open case - make sure it's not in either of them, otherwise set it to its own value
          if( !( (mainDimKeyName %in% names(upper_bound)) || (mainDimKeyName %in% names(lower_bound)))) {
            res[[i]] <- sprintf("int64(%s)", field_mapping[[mainDimKeyName]]) # handle the open/closed cases.
          }
        }
      }
      return(res)
    }
    regionLowAttrValues = getRegionArrayAttrValue(MIN_DIM, low = TRUE)
    regionHighAttrValues = getRegionArrayAttrValue(MAX_DIM, low = FALSE)
    regionLowAttrNames = sprintf('_%s_low', self$dims)
    regionHighAttrNames = sprintf('_%s_high', self$dims)
    # apply new attributes as the region array in 'cross_between'
    applyExpr = aflutils$join_fields(regionLowAttrNames, regionLowAttrValues, regionHighAttrNames, regionHighAttrValues)
    afl_literal = afl(
      self | cross_between(
        template | apply(applyExpr) |
        project(regionLowAttrNames, regionHighAttrNames))
    )
    return(afl_literal)
  }
  # Mode: single-attribute template; scidb index_lookup then filter on a non-null index
  index_lookup_mode = function() {
    assert(inherits(template, 'ArrayOpBase'),
      "ERROR: ArrayOp$semi_join: index_lookup mode: template must be an ArrayOp instance, but got: [%s]", paste(class(template), collapse = ","))
    assert(length(template$attrs) == 1 && length(template$dims) == 1,
      "ERROR: ArrayOp$semi_join: index_lookup mode: template must have only one dimension and one attribute")
    if(is.null(field_mapping)){
      matchedFields = template$attrs %n% self$dims_n_attrs # find matched fields from template's attrs only (not in dims)
      assert(length(matchedFields) == 1,
        "ERROR: ArrayOp$semi_join: index_lookup mode: param 'field_mapping' == NULL, but template field '%s' does not match any source fields",
        template$attrs)
      field_mapping = new_named_list(matchedFields, names = matchedFields)
    }
    assert(length(field_mapping) == 1,
      "ERROR: ArrayOp$semi_join: index_lookup mode: there should be exactly one template attribute that matches source's fields, but %d field(s) found: %s",
      length(field_mapping), paste(field_mapping, collapse = ","))
    assert_not_has_len(lower_bound,
      "ERROR: ArrayOp$semi_join: index_lookup mode: param 'lower_bound' is not allowed in this mode.")
    assert_not_has_len(upper_bound,
      "ERROR: ArrayOp$semi_join: index_lookup mode: param 'upper_bound' is not allowed in this mode.")
    if(is.null(names(field_mapping))){
      # e.g. field_mapping = 'field_a'
      sourceField = as.character(field_mapping)
      templateField = template$attrs
    } else {
      sourceField = names(field_mapping)
      templateField = as.character(field_mapping)
    }
    assert(sourceField %in% self$dims_n_attrs,
      "ERROR: ArrayOp$semi_join: index_lookup mode: '%s' is not a valid source field.", sourceField)
    assert(templateField %in% template$attrs,
      "ERROR: ArrayOp$semi_join: index_lookup mode: '%s' is not a valid template field.", templateField)
    # index_lookup requires an attribute on the source; apply a dimension as attr if needed
    isSourceAttrMatched = sourceField %in% self$attrs
    sourceMatchField = if(isSourceAttrMatched) sourceField else sprintf("attr_%s", sourceField)
    sourceOp = if(isSourceAttrMatched) self else {
      afl(self | apply(sourceMatchField, sourceField))
    }
    indexName = sprintf("index_%s", sourceField)
    templateOp = template
    if(templateField %in% self$dims_n_attrs) {
      # if the template field name exists on the source too, there will be field name conflicts when we 'project'
      templateOp = template$.private$reshape_fields(structure(templateField, names = sprintf("alt_%s_", templateField)))
    }
    afl(
      sourceOp |
      index_lookup(templateOp, sourceMatchField, indexName) |
      filter(sprintf("%s is not null", indexName)) |
      project(self$attrs)
    )
  }
  # Mode: full equi_join, then coerce back to the source's schema
  equi_join_mode = function(){
    assert(inherits(template, 'ArrayOpBase'),
      "ERROR: ArrayOp$semi_join: equi_join mode: template must be a ArrayOp instance, but got: %s",
      class(template))
    # random aliases avoid collisions with user-visible field names
    rand_id <- paste0(sample(c(0:9, LETTERS[1:6]), 6, T), collapse = '')
    return(private$join(self, template,
      left_alias = paste0("_L_semi_join_", rand_id),
      right_alias = paste0("_R_semi_join_", rand_id),
      join_mode = "equi_join",
      settings = NULL
    )$change_schema(self))
  }
  # Select the mode function; it returns either an AFL string or an ArrayOp
  aflExpr = switch(op_mode,
    'filter' = filter_mode,
    'cross_between' = cross_between_mode,
    'equi_join' = equi_join_mode,
    'index_lookup' = index_lookup_mode,
    stopf("ERROR: ArrayOp$semi_join: unknown op_mode '%s'.", op_mode)
  )()
  if(inherits(aflExpr, 'ArrayOpBase')){
    aflExpr
  } else {
    self$spawn(aflExpr)
  }
}
,
# @description
# Return a new arrayOp instance with the same version as `self`
#
# Works in sub-classes without requiring class names or constructor function:
# the constructor of self's most-derived class is looked up by name via get().
# @param ... The same params as Repo$ArrayOp(...)
# @return A new arrayOp, sharing self's connection
create_new = function(...){
  classConstructor = get(class(self)[1])$new
  result = classConstructor(...)
  result$.private$set_conn(private$conn)
  result
}
,
# @description
# Create a new ArrayOp instance from 'build'ing a data.frame
#
# All matching fields are built as attributes of the result ArrayOp.
# Build operator accepts compound attribute types, so the result may have something like "build(<aa:string not null, ...)"
# @param df a data.frame, where all column names must be valid template fields.
# @param artificial_field A field name used as the artificial dimension name in `build` scidb operator
#   By default, a random string is generated, and the dimension starts from 0.
# @param as_scidb_data_frame If TRUE, emit a dimension-less scidb data frame literal ('[[...]]') instead.
# @return A new ArrayOp instance whose attributes share the same name and data types with the template's fields.
build_new = function(df, artificial_field = .random_attr_name(), as_scidb_data_frame = FALSE) {
  assert_inherits(df, "data.frame")
  builtAttrs = names(df)
  dfNonMatchingCols = builtAttrs %-% self$dims_n_attrs
  assert_not_has_len(dfNonMatchingCols, "ERROR: ArrayOp$build_new: df column(s) '%s' not found in template %s",
    paste(dfNonMatchingCols, collapse = ','), self$to_afl())
  builtDtypes = private$get_field_types(builtAttrs)
  builtDtypesRaw = private$get_field_types(builtAttrs, .raw = TRUE)
  attrStr = paste(builtAttrs, builtDtypes, collapse = ',', sep = ':')
  # convert columns to escaped strings
  colStrs = lapply(builtAttrs, function(x) {
    colScidbType = builtDtypesRaw[[x]]
    vec = df[[x]]
    switch(
      colScidbType,
      # escape single quotes, backslashes and ampersands, then single-quote;
      # NA becomes the empty string (scidb null in the build literal)
      "string" = sapply(
        gsub("(['\\&])", "\\\\\\\\\\1", vec),
        function(singleStr) if(is.na(singleStr)) "" else sprintf("\\'%s\\'", singleStr)
      ),
      "datetime" = sprintf("\\'%s\\'", vec),
      "bool" = tolower(vec),
      vec # should be numeric types
    )
  })
  names(colStrs) = builtAttrs
  # one glue template per row: "({col1},{col2},...)"
  glueTemplate = sprintf("(%s)", paste("{", names(df), "}", sep = '', collapse = ","))
  # Use the transformer to make sure we don't apply scientific notation
  glue_transformer <- function(text, envir) {
    if(is.numeric(get(text, envir))) {
      x <- get(text, envir)
      ifelse(is.na(x), NA, format(x, scientific = F))
    } else {
      get(text, envir)
    }
  }
  contentStr = glue::glue_collapse(
    glue::glue_data(colStrs, glueTemplate, .sep = ',', .na = '', .transformer = glue_transformer),
    sep = ','
  )
  if(!as_scidb_data_frame){
    # regular array: one artificial int64 dimension
    builtDtypes[[artificial_field]] = 'int64'
    private$create_new(
      sprintf("build(<%s>[%s], '[%s]', true)",
        attrStr, artificial_field, contentStr),
      dims = artificial_field,
      attrs = builtAttrs,
      dtypes = builtDtypes
    )
  } else { # build as a scidb data frame
    private$create_new(
      sprintf("build(<%s>, '[[%s]]', true)",
        attrStr, contentStr),
      dims = c("$inst", "$seq"),
      attrs = builtAttrs,
      dtypes = builtDtypes
    )
  }
}
,
# @description
# Create a new ArrayOp instance by loading a file and checking it against an ArrayOp template (self).
#
# The ArrayOp instance where this function is called from serves as a template. By default, it assumes file
# column headers match the template's dims and attrs; otherwise an explicit file_headers can be provided and will
# be used to match the template's schema.
# @param filepath A single file path
# @param aio_settings Customized settings of aio_input
# @param field_conversion If NULL (default), use template's field type to convert aio_input attributes; Or provide
#   a list for customized field conversion ('@' in each entry is a placeholder for the raw aio attribute)
# @param file_headers Column headers of the input file regardless of whether there is a header line in the file.
#   If the headers order differs or there are columns to skip, provide a string vector, in which case only
#   columns with matching template fields are loaded. Names of the unmatching column headers are irrelevant.
#
# @return A new ArrayOp instance with matching fields
load_file = function(filepath, aio_settings = NULL, field_conversion = NULL, file_headers){
  # map each template field to its 0-based column index in the file
  # NOTE(review): assumes file_headers is non-empty — 0:(length-1) misbehaves otherwise
  lookup = structure(0:(length(file_headers) - 1), names = file_headers)
  colIndexes = vapply(self$dims_n_attrs, function(x) lookup[x], integer(1))
  colIndexes = colIndexes[!is.na(colIndexes)]
  fieldTypes = private$get_field_types(names(colIndexes), .raw = TRUE)
  # Populate aio settings
  aio_settings = c(list(path = filepath, num_attributes = max(colIndexes) + 1), aio_settings)
  settingItems = mapply(private$to_aio_setting_item_str, names(aio_settings), aio_settings)
  # cast raw attributes (aio_input exposes them positionally as a0, a1, ...)
  castedItems = mapply(function(ft, index, name){
    fmt = if(ft == 'string') "%s" else paste0(ft, "(%s)")
    attrName = sprintf("a%s", index)
    if(!is.null(field_conversion) && !is.null(field_conversion[[name]]))
      # If customized field conversion defined for 'name', then use it
      gsub('@', attrName, field_conversion[[name]]) # Replace all @ occurrences in template
    else # Otherwise just directly 'cast' it to the right data type if needed.
      sprintf(fmt, attrName)
  }, fieldTypes, colIndexes, names(fieldTypes))
  aioExpr = afl(aflutils$join_fields(settingItems) | aio_input)
  applyExpr = afl(aioExpr | apply(aflutils$join_fields(names(fieldTypes), castedItems)))
  projectedExpr = afl(applyExpr | project(names(fieldTypes)))
  return(private$create_new(projectedExpr, c(), names(fieldTypes), dtypes = fieldTypes))
}
,
# reshape array attributes by name and expression
#
# Does not infer field data types because a field can be mutated to a different dtype.
# Does not alter dimensions.
# @param field_vec A named string vector: names are result attribute names,
#   values are AFL expressions producing them
# @return A new transient array_op
reshape_fields = function(field_vec) {
  field_names = names(field_vec)
  assert_inherits(field_vec, 'character')
  assert_not_empty(field_names)
  assertf(!any(is.na(field_vec)) && !any(is.na(field_names)),
    glue("No NA allowed in either field names or expressions: {field_vec}"))
  # existing dims are also regarded as new fields
  newFileds = field_vec[field_names %-% self$attrs]
  existingFields = field_vec[field_names %n% self$attrs]
  # attrs redefined by a different expression vs. attrs passed through unchanged
  replacedFields = existingFields[existingFields != names(existingFields)]
  passedOnFields = existingFields[existingFields == names(existingFields)]
  newAfl = if(.not_empty(replacedFields)){
    # Replaced attrs cannot be redefined in place: first write them to
    # temporary '_<name>' attrs, project the originals away, then rename back.
    replaceFieldsNamesAlt = sprintf("_%s", names(replacedFields))
    afl(self |
      apply(aflutils$join_fields(replaceFieldsNamesAlt, replacedFields)) |
      project(c(replaceFieldsNamesAlt, names(passedOnFields))) |
      apply(aflutils$join_fields(
        c(names(replacedFields), names(newFileds)),
        c(replaceFieldsNamesAlt, newFileds))) |
      project(field_names)
    )
  } else if(.not_empty(newFileds)){
    # Only brand-new fields: a single apply/project pass suffices
    afl(self |
      apply(aflutils$join_fields(names(newFileds), newFileds)) |
      project(field_names)
    )
  } else {
    # Pure projection: every requested field already exists unchanged
    afl(self | project(field_names))
  }
  private$create_new(newAfl, dims = self$dims, attrs = field_names,
    dim_specs = private$get_dim_specs())
}
,
# Reshape an arrayOp on a compound key (consisting of one or multiple fields) according to the template (self)
#
# If the operand has the same dimensions as the template, then only 'project' fields
# If the key contains all template's dimensions, then 'redimension' the operand and 'project' away unrelated fields
# Otherwise, 'equi_join' is needed
#
# @param operand The arrayOp operand
# @param keys Which fields identify matching cells (optionally named: name = operand field, value = template field)
# @param reserved_fields Which fields of `operand` should be preserved
# @param .redimension_setting A list of settings if scidb redimension is needed.
# @param .join_setting A list of settings if ArrayOp$join is needed.
#
# @return an arrayOp that has the same dimensions as the template (self); only reserved fields are retained.
key_to_coordinates = function(operand, keys, reserved_fields,
                              .redimension_setting = NULL, .join_setting = NULL){
  if(length(operand$dims) == length(self$dims) && all(operand$dims == self$dims))
    # If all dimensions match, no need to do anything about dimensions.
    return(operand$.private$afl_project(reserved_fields))
  operandKeyFields = .get_element_names(keys)
  templateKeyFields = as.character(keys)
  extraFields = operand$attrs %-% keys %-% reserved_fields
  if(all(self$dims %in% operandKeyFields)){
    # Keys cover every template dimension: a redimension is sufficient
    if(.not_empty(extraFields))
      operand = operand$.private$afl_project(c(keys, reserved_fields))
    return(operand$
      change_schema(self, strict = FALSE, .setting = .redimension_setting)$
      .private$afl_project(reserved_fields))
  }
  # Keys don't cover all dims: equi_join against the template's dimensions,
  # then redimension the joined result into the template's schema
  joinOp = private$join(
    operand$select(reserved_fields %u% (self$dims %n% operand$dims_n_attrs)),
    self$select(self$dims),
    by.x = operandKeyFields,
    by.y = templateKeyFields,
    settings = .join_setting)
  joinOp$change_schema(self, strict = FALSE, .setting = .redimension_setting)
}
,
# @description
# Create a new ArrayOp instance with an auto-incremented id field
#
# The new id is derived from the current max of `ref_field` in `reference`:
# when no max exists (empty reference) the id is offset by (ref_start - source_start);
# otherwise it continues past the max (see newFieldExpr below).
#
# @param reference ArrayOp instance to draw existing max id from
# @param source_field Name of the self field holding the source ids
#   (assumed to be numbered starting at source_start — TODO confirm with callers)
# @param ref_field Name of the reference field holding the existing ids
# @param source_start Starting value of source_field's numbering
# @param ref_start Starting value for new ids when the reference is empty
# @param new_field Name of the generated field; defaults to ref_field
# @return A new arrayOp
set_auto_increment_field = function(reference, source_field, ref_field, source_start, ref_start, new_field = NULL) {
# ref_field may be a dimension of the reference; promote it to an attribute
# with `apply` so `aggregate(max(...))` can see it (see forAggregate below).
refDims = ref_field %-% reference$attrs
maxRefFields = sprintf("_max_%s", ref_field)
aggFields = sprintf("max(%s) as %s", ref_field, maxRefFields)
defaultOffset = ref_start - source_start
nonDefaultOffset = 1 - source_start
if(.is_empty(new_field)) new_field = ref_field
# iif(max is null, source + (ref_start - source_start), max + source + (1 - source_start))
newFieldExpr = sprintf("iif(%s is null, %s%s, %s+%s%s)", maxRefFields,
source_field, .to_signed_integer_str(defaultOffset),
maxRefFields, source_field, .to_signed_integer_str(nonDefaultOffset))
forAggregate = if(.not_empty(refDims)) afl(reference | apply(aflutils$join_fields(refDims, refDims))) else reference
aggregated = afl(forAggregate | aggregate(aggFields))
# cross_join the aggregate onto every cell of self, then compute the new id per cell.
crossJoined = afl(
self |
cross_join(aggregated) |
apply(
aflutils$join_fields(new_field, newFieldExpr)
)
)
result = self$spawn(crossJoined,
added = new_field,
dtypes = new_named_list(reference$.private$get_field_types(ref_field), new_field))
# Project on the result's own attrs; presumably drops the transient max
# attribute introduced by the cross_join — confirm spawn/attrs semantics.
result$.private$afl_project(result$attrs)
}
,
# @description
# Create a new ArrayOp instance that has an anti-collision field set according to a template arrayOp
#
# The source (self) operand should have N fields given the target has N+1 dimensions. The one missing field is
# treated as the anti-collision field.
#
# @param target A target arrayOp that the source draws the anti-collision dimension from.
# @param anti_collision_field a target dimension name which exists only to resolve cell collision
# (i.e. cells with the same dimension coordinate). Inferred from target's dims when NULL.
# @param join_setting optional named list merged into the left-outer join settings used below
# @param source_anti_collision_dim_spec optional dimension spec overriding the
# template's spec for the renamed source anti-collision dimension
# @return A new arrayOp
set_anti_collision_field = function(target, anti_collision_field = NULL, join_setting = NULL, source_anti_collision_dim_spec = NULL) {
matchedFieldsOfTargetDimensions = self$dims_n_attrs %n% target$dims
assert(length(matchedFieldsOfTargetDimensions) + 1 == length(target$dims),
"ERROR: ArrayOp$set_anti_collision_field: incorrect number of matching fields: source has %d field(s) (SHOULD BE %d) that match %d target dimension(s).",
length(matchedFieldsOfTargetDimensions), length(target$dims) - 1, length(target$dims))
# Default: the single target dimension the source does not have.
if(is.null(anti_collision_field))
anti_collision_field = target$dims %-% matchedFieldsOfTargetDimensions
assert(is.character(anti_collision_field) && length(anti_collision_field) == 1,
"ERROR: ArrayOp$set_anti_collision_field: anti_collission_field should be a single string, but got '%s'",
str(anti_collision_field))
assert(anti_collision_field %in% target$dims_n_attrs,
"ERROR: ArrayOp$set_anti_collision_field: param 'anti_collision_field' [%s] is not a valid field of target.", anti_collision_field)
# Target dimensions other than the anti_collision_field
regularTargetDims = target$dims %-% anti_collision_field
# Redimension source with a different field name for the anti-collision-field
srcAltId = sprintf("src_%s_", anti_collision_field) # this field is to avoid dimension collision within source
renamedList = as.list(structure(srcAltId, names = anti_collision_field))
renamedTarget = target$spawn(renamed = renamedList)
# Dim specs for the redimension template; the renamed dim's spec may be overridden by the caller.
redimensionTemplateDimSpecs =
if (is.null(source_anti_collision_dim_spec))
renamedTarget$.private$get_dim_specs()
else
utils::modifyList(renamedTarget$.private$get_dim_specs(),
as.list(structure(source_anti_collision_dim_spec, names = srcAltId)))
redimensionTemplate = private$create_new("TEMPLATE",
dims = renamedTarget$dims,
attrs = self$attrs %-% renamedTarget$dims,
dtypes = utils::modifyList(private$get_field_types(), renamedTarget$.private$get_field_types(renamedTarget$dims)),
dim_specs = redimensionTemplateDimSpecs)
# Redimension self on the renamed schema, then re-expose srcAltId as an attribute.
redimenedSource = redimensionTemplate$spawn(afl(
self | redimension(redimensionTemplate$to_schema_str()) | apply(srcAltId, srcAltId)
))
# Get the max anti-collision-field from group aggregating the target on the remainder of target dimensions
targetAltIdMax = sprintf("max_%s_", anti_collision_field)
groupedTarget = private$create_new('',
attrs = c(regularTargetDims, targetAltIdMax), dims = c('instance_id', 'value_no'),
dtypes = c(invert.list(list("int64" = c('instance_id', 'value_no', regularTargetDims, targetAltIdMax))))
)$spawn(afl(
target | apply(anti_collision_field, anti_collision_field) | grouped_aggregate(
sprintf("max(%s) as %s", anti_collision_field, targetAltIdMax), regularTargetDims)
))
# Left join on the remainder of the target dimensions
joinSetting = if(is.null(join_setting)) list(left_outer=1,keep_dimensions=TRUE) else {
utils::modifyList(as.list(join_setting), list(left_outer=1, keep_dimensions=TRUE))
}
joined = private$join(redimenedSource$select(redimenedSource$dims_n_attrs),
groupedTarget$select(targetAltIdMax),
by = regularTargetDims,
settings = joinSetting)
# Finally calculate the values of anti_collision_field: use the source's own
# alt id when the target has no existing max, otherwise offset past the max.
# src's attributes (that are target dimensions) are converted to dimensions according to the target
resultTemplate = redimensionTemplate$
spawn(renamed = invert.list(renamedList))
resultTemplate = resultTemplate$.private$afl_unpack()
result = resultTemplate$
spawn(afl(
joined | apply(anti_collision_field, sprintf(
"iif(%s is null, %s, %s + %s + 1)", targetAltIdMax, srcAltId, srcAltId, targetAltIdMax
)))
# )$reshape(resultTemplate$attrs)
)$.private$afl_project(resultTemplate$attrs)
return(result)
}
,
# @description
# Return AFL suitable for retrieving a data.frame.
#
# scidb::iquery has a param `only_attributes`, which, if set TRUE, will effectively drop all dims.
# @param drop_dims Whether self's dimensions are dropped when generating AFL for data.frame conversion
# @param artificial_field Placeholder attribute name used when no real attribute is selected
# @return An AFL string
to_df_afl = function(drop_dims = FALSE, artificial_field = .random_attr_name()) {
# Delegate to the explicit form, pinning the currently selected fields.
private$to_afl_explicit(
drop_dims = drop_dims,
selected_fields = self$selected,
artificial_field = artificial_field
)
}
,
# @description
# Returns AFL when self is used as an operand in another parent operation.
#
# By default, 1. dimensions are not dropped in the parent operation; 2. no intent to select fields
# @param drop_dims Whether self's dimensions will be dropped in parent operations
# @param selected_fields which fields are selected no matter what the parent operation is.
# If NULL, self's fields will pass on by default depending on the parent operation.
# @param artificial_field Placeholder attribute name used when only dimensions are selected
# @return An AFL string
to_afl_explicit = function(drop_dims = FALSE, selected_fields = NULL, artificial_field = .random_attr_name()){
# No intent to select fields
if(.is_empty(selected_fields))
return(self$to_afl())
# 'selected_fields' is specified in sub-classes.
# We need to ensure selected fields are passed on in parent operations.
# dims are dropped in parent operations.
if(drop_dims){
# Promote any selected dims to attributes first (via apply), because
# 'project' only works on attributes; then project the full selection.
selectedDims = base::intersect(selected_fields, self$dims)
inner = if(.not_empty(selectedDims))
afl(self | apply(aflutils$join_fields(selectedDims, selectedDims)))
else self$to_afl()
return(afl(inner | project(aflutils$join_fields(selected_fields))))
}
# drop_dims = F. All dims are preserved in parent operations, we can only select/drop attrs.
selectedAttrs = base::intersect(selected_fields, self$attrs)
if(.not_empty(selectedAttrs)) # If some attributes selected
return(afl(self | project(selectedAttrs)))
# If no attributes selected, we have to create an artificial attribute, then project it.
# Because 'apply' doesn't work on array dimensions
return(afl(self | apply(artificial_field, 'null') |
project(artificial_field)))
}
### Implement raw AFL functions ----
# Functions prefixed with 'afl_' are implemented according to scidb operators with sanity checks.
,
# Project a list of the array's attributes (scidb `project` operator).
#
# Return a result array instance with the same dimensions and a subset of (projected) attributes.
# If c(...) is empty, return self unchanged.
# Throw an error if there are non-attribute fields because scidb only allows project'ing on attributes;
# dimensions cannot be projected.
#
# @param ... c(...) is used as the projected fields. NULL is discarded.
afl_project = function(...) {
fields = c(...)
if(.is_empty(fields)) return(self)
assert_inherits(fields, "character")
assert_empty(fields %-% self$attrs, "afl_project: invalid field(s) to project: [{.value}]")
# assert_not_has_len(nonAttrs, "ERROR: afl_project: %d non-attribute field(s) found: %s", length(nonAttrs), paste(nonAttrs, collapse = ', '))
private$create_new(afl(self | project(fields)), dims = self$dims, attrs = fields,
dtypes = private$get_field_types(c(self$dims, fields)), dim_specs = private$get_dim_specs())
}
,
# AFL unpack operator
#
# Flatten self into a 1-d array: the result has a single new dimension
# `dim_name`, and self's dims + attrs all become attributes.
#
# @param dim_name Name of the new (single) dimension; random by default
# @param chunk_size Optional chunk size appended to the scidb `unpack` call
afl_unpack = function(dim_name = dbutils$random_field_name(), chunk_size = NULL) {
private$create_new(afl(self | unpack(c(dim_name, chunk_size))),
dims = dim_name,
attrs = self$dims_n_attrs,
dtypes = private$get_field_types())
}
# ,
# # Apply new attributes to an existing array.
# # Return a result array with added (applied) attributes
# # If fields are existing dimensions, data types are inherited; otherwise new attributes require data types
# # @param fields: a named list or character. Cannot contain existing attributes because it creates conflicts.
# afl_apply = function(fields, dtypes = NULL) {
# fieldNames = .get_element_names(fields)
# fieldExprs = as.character(fields)
# conflictFields = fields %n% self$attrs
# assert_not_has_len(conflictFields, "ERROR: afl_apply: cannot apply existing attribute(s): %s", paste(conflictFields, collapse = ', '))
#
# newDTypes = utils::modifyList(private$get_field_types(), as.list(dtypes))
# private$create_new(afl(self | apply(aflutils$join_fields(fieldNames, fieldExprs))), dims = self$dims,
# attrs = self$attrs %u% fields, dtypes = newDTypes, dim_specs = private$get_dim_specs())
# }
,
# Redimension `self` according to a template (scidb `redimension` operator).
#
# Every field of the template must exist in the source, otherwise an error is thrown.
#
# @param template An arrayOp whose schema the result takes on
# @param .setting a string vector, where each item will be appended to the redimension operand.
# E.g. .setting = c('false', 'cells_per_chunk: 1234') ==> redimension(source, template, false, cells_per_chunk: 1234)
# @return A new arrayOp with the template's schema
afl_redimension = function(template, .setting = NULL) {
assert_no_fields(template$dims_n_attrs %-% self$dims_n_attrs,
"ERROR:ArrayOp$afl_redimension:Field(s) '%s' of the template not found in the source.")
return(template$spawn(afl(
self | redimension(c(template$to_schema_str(), as.character(.setting)))
)))
}
,
# Insert into a target array (scidb `insert` operator)
#
# Requires that the numbers of attributes and dimensions of self and target match.
# Field names are irrelevant.
#
# @param target Target array
# @return A new arrayOp (with the target's schema) wrapping the scidb insert operation
afl_insert = function(target) {
assertf(length(self$dims) == length(target$dims),
glue("Number of dimensions mismatch between ",
"source array {length(self$dims)} [{paste(self$dims, collapse=',')}] and ",
"target array {length(target$dims)} [{paste(target$dims, collapse=',')}]"),
.nframe = 1)
assertf(length(self$attrs) == length(target$attrs),
glue("Number of attributes mismatch between ",
"source array {length(self$attrs)} [{paste(self$attrs, collapse=',')}] and ",
"target array {length(target$attrs)} [{paste(target$attrs, collapse=',')}]"),
.nframe = 1)
target$spawn(afl(self | insert(target)))
}
,
# Overwrite a target array (scidb `store` operator)
#
# Requires that the numbers of attributes and dimensions of self and target match.
# Field names are irrelevant.
#
# @param target Target array
# @param new_target NOTE(review): currently unused by this implementation —
#   confirm whether it should affect the generated AFL or the schema checks below.
# @return A new arrayOp (with the target's schema) wrapping the scidb store operation
afl_store = function(target, new_target = FALSE) {
assertf(length(self$dims) == length(target$dims),
glue("Number of dimensions mismatch between ",
"source array {length(self$dims)} [{paste(self$dims, collapse=',')}] and ",
"target array {length(target$dims)} [{paste(target$dims, collapse=',')}]"),
.nframe = 1)
assertf(length(self$attrs) == length(target$attrs),
glue("Number of attributes mismatch between ",
"source array {length(self$attrs)} [{paste(self$attrs, collapse=',')}] and ",
"target array {length(target$attrs)} [{paste(target$attrs, collapse=',')}]"),
.nframe = 1)
target$spawn(afl(self | store(target)))
}
,
# Remove scidbr ('scidb' class) objects from the '.refs' metadata, e.g. on
# remove_array or finalize, so their remote resources can be released.
#
# @param force_gc If TRUE, call gc() — but only when at least one scidb
#   reference was actually dropped, to avoid needless full collections.
remove_scidbr_from_refs = function(force_gc = FALSE){
refObj = self$.private$get_meta('.refs')
# Keep only the non-scidb references; dropping the scidb ones releases the
# last local handle to them.
newRefObj = Filter(function(obj) !inherits(obj, "scidb"), refObj)
# Bug fix: the original compared lengths in the wrong direction
# (length(refObj) < length(newRefObj)), which can never be TRUE after
# filtering, so gc() was never triggered.
should_gc = length(newRefObj) < length(refObj)
if(length(newRefObj) == 0) {
newRefObj = NULL
}
self$.private$set_meta('.refs', newRefObj)
# Only call gc() if we actually removed something.
if(force_gc && should_gc) {
gc()
}
}
)
,
# Active bindings ----
# Read-only computed properties; each delegates to the private metadata store.
active = list(
#' @field dims Dimension names
dims = function() private$get_meta('dims'),
#' @field attrs Attribute names
attrs = function() private$get_meta('attrs'),
#' @field selected Selected dimension and/or attribute names
selected = function() private$get_meta('selected'),
#' @field dtypes A named list, where key is dim/attr name and value is the respective SciDB data type as string
dtypes = function() private$get_meta('dtypes'),
#' @field raw_dtypes A named list, where key is dim/attr name and value is the first
#' whitespace-delimited token of the respective SciDB data type as string
#' (e.g. "string NOT NULL" yields "string")
raw_dtypes = function() lapply(private$get_meta('dtypes'), function(x) gsub("^(\\S+).*$", "\\1", x)),
#' @field dims_n_attrs Dimension and attribute names (dimensions first)
dims_n_attrs = function() c(self$dims, self$attrs),
#' @field attrs_n_dims Attribute and dimension names (attributes first)
attrs_n_dims = function() c(self$attrs, self$dims),
#' @field is_schema_from_scidb If the array schema is retrieved from SciDB or inferred locally in R; defaults to FALSE
is_schema_from_scidb = function() private$get_meta('is_schema_from_scidb') %?% FALSE,
#' @field is_scidb_data_frame Whether current array_op is a regular array or
#' SciDB data frame (array with hidden dimensions; not to be confused with R data frames)
is_scidb_data_frame = function() {
# True if any dimension name starts with '$' (hidden dimensions)
any(grepl("^\\$", self$sync_schema()$dims))
},
#' @field .private For internal testing only. Do not access this field to avoid unintended consequences!!!
.private = function() private
),
# Public ----------------------------------------------------------------------------------------------------------
public = list(
#' @description
#' Base class initialize function, to be called in sub-class internally.
#'
#' Always use `ScidbConnection` to get array_op instances.
#' @param raw_afl AFL expression (array name or operations) as string
#' @param dims A string vector used as dimension names
#' @param attrs A string vector used as attribute names
#' @param dtypes A named list of strings, where names are attribute names and
#' values are full scidb data types.
#' E.g. `dtypes = list(field_str = "string NOT NULL", field_int32 = "int32")`
#' @param dim_specs A named list of string, where names are dimension names
#' and values are dimension specs.
#' E.g. `dim_specs = list(da = "0:*:0:*", chrom = "1:24:0:1")`.
#' @param ... A named list of metadata items, where names are used as keys
#' in `private$set_meta` and `private$get_meta` functions.
#' @param meta_list A list that stores ArrayOp meta data, e.g. field types.
#' If provided, other regular params are not allowed.
initialize = function(
raw_afl,
dims = as.character(c()),
attrs = as.character(c()),
dtypes = list(),
dim_specs = list(),
...,
meta_list
) {
# Either a prebuilt meta_list OR the individual schema args — never both.
assert(
xor(methods::hasArg('meta_list'),
methods::hasArg('dims') || methods::hasArg('attrs') || methods::hasArg('dtypes')
),
# Bug fix: error message previously misspelled 'initialize' as 'initialze'.
"ERROR: ArrayOp:initialize: meta_list cannot be provided with any of the args: dims, attrs, dtypes")
assert_single_str(raw_afl, "character")
private$raw_afl = raw_afl
# Dimensions are always int64; merge that into the provided dtypes.
private$metaList = if(methods::hasArg('meta_list')) meta_list else
list(dims = dims, attrs = attrs,
dtypes = utils::modifyList(dtypes, invert.list(list("int64" = dims))),
dim_specs = dim_specs, ...)
}
# Functions that create new ArrayOps -------------------------------------------------------------------------------
,
#' @description
#' Create a new ArrayOp instance with filter expressions
#'
#' Similar to `dplyr::filter`, fields are not quoted.
#'
#' Operators for any type of fields include `==`, `!=`,
#' `%in%`, `%not_in%`.
#' To test whether a field is null, use unary operators: `is_null`, `not_null`.
#'
#' Special binary operators for string fields include:
#' `%contains%`, `%starts_with%`, `%ends_with%`, `%like%`, where
#' only `%like%` takes a regular expression and other operators escape any special
#' characters in the right operand.
#'
#' Operators for numeric fields include: `>`, `<`, `>=`, `<=`
#'
#' @param ... Filter expression(s) in R syntax.
#' These expression(s) are not evaluated in R but first captured then converted to scidb expressions with appropriate syntax.
#' @param .expr A single R expression, or a list of R exprs, or NULL.
#' If provided, `...` is ignored. Multiple exprs are joined by 'and'.
#' This param is useful when we want to pass an already captured R expression.
#' @param .validate_fields Boolean, default TRUE, whether to validate fields
#' in filter expressions. Throw an error if invalid fields exist when set to TRUE.
#' @param .regex_func A string of regex function implementation, default 'regex'.
#' Due to scidb compatibility issues with its dependencies, the regex function from the boost library may not be available.
#' Currently supported options include 'rsub' and 'regex'.
#' @param .ignore_case A Boolean, default TRUE. If TRUE, ignore case in string match patterns.
#' Otherwise, perform case-sensitive regex matches.
#' @return A new arrayOp
filter = function(..., .expr = NULL, .validate_fields = TRUE,
.regex_func = getOption('arrayop.regex_func', default = 'regex'),
.ignore_case = getOption('arrayop.ignore_case', default = TRUE)) {
filterExpr = .expr %?% aflutils$e_merge(aflutils$e(...))
# No actual filter expression: return self unchanged
if(.is_empty(filterExpr)) return(self)
# If we need to validate every field is actually one of self's fields
if(.validate_fields){
status = aflutils$validate_filter_expr(filterExpr, self$dims_n_attrs)
if(!status$success){
# either invalid fields or R expression syntax error
assert_empty(status$absent_fields,
"invalid field(s) [{.value}] in filter expression")
stop(paste(status$error_msgs, collapse = '\n'))
}
}
self$spawn(afl(self |
filter(aflutils$e_to_afl(filterExpr,
regex_func = .regex_func,
ignore_case = .ignore_case))))
}
,
#' @description
#' Create a new ArrayOp instance with mutated fields
#'
#' Similar to `dplyr::mutate`, fields of the source (self) can be removed or added to the result arrayOp.
#' Any fields that are not in the mutate expressions remain unchanged.
#'
#' @param ... Named R expressions. Names are field names in the result arrayOp and must not be empty.
#' Set field = NULL to remove existing fields. E.g. `abcd = NULL, def = def` removes
#' field 'abcd' and keeps field 'def'.
#'
#' Values are R expressions similar to the `filter` method.
#' E.g. `a = b + 2, name = first + "-" + last, chrom = if(chrom == 'x') 23 else if(chrom == 'y') 24 else chrom`
#' @param .dots A named list of R expressions or NULL. If provided,
#' the `...` param is ignored. Useful when a list of mutation expressions is already
#' created and can be passed around.
#' @param .sync_schema Whether to get the exact schema from scidb. The default
#' TRUE will cause a scidb query to get the schema. Set to FALSE to avoid schema checking.
#'
#' @return a new ArrayOp instance
mutate = function(..., .dots = NULL, .sync_schema = TRUE){
# Capture the expressions unevaluated, then convert each to an AFL string.
paramFieldExprs = lapply(.dots %?% rlang::exprs(...), aflutils$e_to_afl)
assert_unique_named_list(
paramFieldExprs,
"'mutate fields' should be a named list where each element has a unique name. E.g. mutate(field_name = 'field_expression') "
)
assertf(all(sapply(paramFieldExprs, function(x) is.null(x) || is.character(x))),
"Each element should be a string or NULL")
# NULL-valued entries request removal; only existing attributes can be removed.
assert_empty(
names(Filter(is.null, paramFieldExprs)) %-% self$attrs,
"mutate: Can only remove array attributes, but got invalid field(s) for removal: [{.value}]"
)
# Overlay the mutations onto the existing attributes (via new_named_list),
# so untouched attrs pass through unchanged.
fieldList = utils::modifyList(new_named_list(self$attrs), paramFieldExprs)
assert_not_empty(fieldList,
paste0("mutate: no valid fields found.",
"Please set at least one field, e.g. array_op$mutate(existing_field = NULL, placeholder=bool(null))"))
fields_vec = structure(as.character(fieldList), names = names(fieldList))
result = private$reshape_fields(fields_vec)
if(.sync_schema) result$sync_schema() else result
}
,
#' @description
#' Create a new ArrayOp instance with mutated fields
#'
#' Similar to `dplyr::transmute`, only the listed fields are retained in the result arrayOp.
#' NOTE: Any fields that are not in the mutate expressions will be discarded.
#'
#' @param ... R expressions. Names are optional.
#' For each named expression, the name is used as the field name in the result arrayOp.
#' Unnamed expressions must be existing field names, unquoted, which result in unchanged source fields of self.
#'
#' Values are R expressions similar to the `filter` method.
#' E.g. `a = b + 2, name = first + "-" + last, chrom = if(chrom == 'x') 23 else if(chrom == 'y') 24 else chrom`
#' @param .dots A named list of R expressions or NULL. If provided,
#' the `...` param is ignored. Useful when a list of mutation expressions is already
#' created and can be passed around.
#' @param .sync_schema Whether to get the exact schema from scidb. The default
#' TRUE will cause a scidb query to get the schema. Set to FALSE to avoid schema checking.
#'
#' @return a new ArrayOp instance
transmute = function(..., .dots = NULL, .sync_schema = TRUE){
# Capture the expressions unevaluated, then convert each to an AFL string.
paramFieldExprs = lapply(.dots %?% rlang::exprs(...), aflutils$e_to_afl)
assertf(all(sapply(paramFieldExprs, is.character)),
glue(
"'transmute fields' should be a list of non-nullable strings with optional names. E.g. transmute('a', b='a + 2').",
" But got: {deparse(paramFieldExprs)}"
))
# Unnamed expressions use their own text as the result field name.
paramFieldNames = names(paramFieldExprs) %?% paramFieldExprs
names(paramFieldExprs) = ifelse(paramFieldNames == "", paramFieldExprs, paramFieldNames)
assert_unique_named_list(
paramFieldExprs,
glue(
# Bug fix: the example in this error message previously said 'mutate(...)'.
"'transmute fields' should be a named list where each element has a unique name. E.g. transmute(field_name = 'field_expression') ",
" But got: {deparse(paramFieldExprs)}"
)
)
fields_vec = structure(as.character(paramFieldExprs), names = names(paramFieldExprs))
result = private$reshape_fields(fields_vec)
if(.sync_schema) result$sync_schema() else result
}
,
#' @description
#' Create an ArrayOp instance with the same schema as self, but with cell content
#' of the 'updated_fields' taken from 'data_array'.
#'
#' @param data_array An ArrayOp instance that has at least two overlapping fields with self.
#' @param keys Field names in both self and data_array. Cell content of these
#' fields are from the 'self' arrayOp rather than 'data_array'.
#' @param updated_fields Field names in both self and data_array. Cell content
#' of these fields are from the 'data_array', NOT 'self'.
#' @param .redimension_setting A list of strings used as the settings of scidb
#' 'redimension' operator. Only applicable when a 'redimension' is needed.
#' @param .join_setting A list of strings used as the settings of scidb
#' 'join' operator. Only applicable when a 'join' is needed.
#' @return A new arrayOp with the same schema as self
mutate_by = function(data_array,
keys = NULL,
updated_fields = NULL,
.redimension_setting = NULL,
.join_setting = NULL) {
assert_inherits(data_array, "ArrayOpBase")
data_source = data_array
# updateFields default to the overlapping attrs between source and self
# updatedFields = if(.not_empty(updated_fields)) updated_fields else data_source$attrs %n% self$attrs
# NOTE(review): custom %xyz% operators share one precedence level and associate
# left-to-right, so this parses as (updated_fields %?% data_source$attrs) %n% self$attrs.
# An explicit 'updated_fields' is therefore also intersected with self$attrs, even
# though the assertion below only requires membership in self$dims_n_attrs — confirm intended.
updatedFields = updated_fields %?% data_source$attrs %n% self$attrs
assert_not_empty(updatedFields, "param 'data_source' does not have any target attributes to mutate.")
reservedFields = self$attrs %-% updatedFields
sameDims = length(data_source$dims) == length(self$dims) && all(data_source$dims == self$dims)
if(!sameDims){ # not the same dimensions, transform the data_source first
assert_not_empty(keys, "param 'keys' cannot be empty")
assert_not_empty(updated_fields, "param 'updated_fields' cannot be empty")
assert_no_fields(
(keys %-% self$dims_n_attrs) %u% (keys %-% data_source$dims_n_attrs),
"param 'keys' has invalid field(s) [%s]")
assert_no_fields(
(updated_fields %-% self$dims_n_attrs) %u% (updated_fields %-% data_source$dims_n_attrs),
"param 'updated_fields' has invalid field(s) [%s]")
# Reshape data_source so its cells line up with self's coordinates.
data_source = private$key_to_coordinates(data_source, keys = keys, reserved_fields = updatedFields,
.redimension_setting = .redimension_setting, .join_setting = .join_setting)
} else {
if(.not_empty(keys %-% self$dims))
warning(sprintf("Extra key(s) [%s] ignored because source/target dimensions match.",
keys %-% self$dims))
}
if(.is_empty(reservedFields)){
# No need to join with self. Just ensure ordering of data_source attributes conforms to 'self'
data_source$.private$afl_project(self$attrs)
} else {
# Join updated fields (from data_source) with reserved fields (from self),
# then restore self's attribute ordering.
self$spawn(
afl(
data_source$.private$afl_project(updatedFields) |
join(private$afl_project(reservedFields))
)
)$.private$afl_project(self$attrs)
}
}
,
#' @description
#' Inner join two arrays: 'self' (left) and 'right'
#'
#' Similar to `dplyr::inner_join`: only cells matching on the join keys appear
#' in the result. For both operands only the selected fields are included in
#' the result arrayOp; with no selection, all fields count as selected.
#'
#' @param right An arrayOp instance
#' @param by.x NULL or a string vector of left-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by.y NULL or a string vector of right-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by NULL or a string vector of join keys. If NULL, join keys are
#' inferred as the shared fields of 'left' and 'right'. If not NULL, they
#' must be fields of both operands.
#' @param left_alias Alias for the left array to resolve potential conflicting fields in the result
#' @param right_alias Alias for the right array to resolve potential conflicting fields in the result
#' @param join_mode String 'equi_join' or 'cross_join'. The latter requires the
#' join keys to be dimensions of both operands — stricter than 'equi_join',
#' with the benefit of a non-materialized result in scidb.
#' @param settings A named list as join settings. E.g. `list(algorithm = "'hash_replicate_right'")`
#' @return A new arrayOp instance
inner_join = function(right,
by.x = NULL, by.y = NULL, by = NULL,
left_alias = '_L', right_alias = '_R',
join_mode = 'equi_join',
settings = NULL
) {
# An inner join is the private join helper with no outer flags set.
private$join(
self, right,
by = by, by.x = by.x, by.y = by.y,
join_mode = join_mode,
left_alias = left_alias, right_alias = right_alias,
settings = settings
)
}
,
#' @description
#' Left join two arrays: 'self' (left) and 'right'
#'
#' Similar to `dplyr::left_join`: every matching cell plus all unmatched left
#' cells appear in the result. For both operands only the selected fields are
#' included in the result arrayOp; with no selection, all fields count as selected.
#'
#' @param right An arrayOp instance
#' @param by.x NULL or a string vector of left-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by.y NULL or a string vector of right-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by NULL or a string vector of join keys. If NULL, join keys are
#' inferred as the shared fields of 'left' and 'right'. If not NULL, they
#' must be fields of both operands.
#' @param left_alias Alias for the left array to resolve potential conflicting fields in the result
#' @param right_alias Alias for the right array to resolve potential conflicting fields in the result
#' @param settings A named list as join settings. E.g. `list(algorithm = "'hash_replicate_right'")`
#' @return A new arrayOp instance
left_join = function(right,
by.x = NULL, by.y = NULL, by = NULL,
left_alias = '_L', right_alias = '_R',
settings = NULL
) {
# Overlay the left_outer flag onto any caller-supplied settings.
outerSettings = utils::modifyList(as.list(settings), list(left_outer = 1))
private$join(
self, right,
by = by, by.x = by.x, by.y = by.y,
join_mode = 'equi_join',
left_alias = left_alias, right_alias = right_alias,
settings = outerSettings
)
}
,
#' @description
#' Right join two arrays: 'self' (left) and 'right'
#'
#' Similar to `dplyr::right_join`: every matching cell plus all unmatched right
#' cells appear in the result. For both operands only the selected fields are
#' included in the result arrayOp; with no selection, all fields count as selected.
#'
#' @param right An arrayOp instance
#' @param by.x NULL or a string vector of left-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by.y NULL or a string vector of right-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by NULL or a string vector of join keys. If NULL, join keys are
#' inferred as the shared fields of 'left' and 'right'. If not NULL, they
#' must be fields of both operands.
#' @param left_alias Alias for the left array to resolve potential conflicting fields in the result
#' @param right_alias Alias for the right array to resolve potential conflicting fields in the result
#' @param settings A named list as join settings. E.g. `list(algorithm = "'hash_replicate_right'")`
#' @return A new arrayOp instance
right_join = function(right,
by.x = NULL, by.y = NULL, by = NULL,
left_alias = '_L', right_alias = '_R',
settings = NULL
) {
# Overlay the right_outer flag onto any caller-supplied settings.
outerSettings = utils::modifyList(as.list(settings), list(right_outer = 1))
private$join(
self, right,
by = by, by.x = by.x, by.y = by.y,
join_mode = 'equi_join',
left_alias = left_alias, right_alias = right_alias,
settings = outerSettings
)
}
,
#' @description
#' Full join two arrays: 'self' (left) and 'right'
#'
#' Similar to `dplyr::full_join`: all cells of both operands appear in the
#' result, matched where possible. For both operands only the selected fields
#' are included in the result arrayOp; with no selection, all fields count as selected.
#'
#' @param right An arrayOp instance
#' @param by.x NULL or a string vector of left-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by.y NULL or a string vector of right-side join keys. If NULL, join
#' keys are inferred as the shared fields of 'left' and 'right'.
#' @param by NULL or a string vector of join keys. If NULL, join keys are
#' inferred as the shared fields of 'left' and 'right'. If not NULL, they
#' must be fields of both operands.
#' @param left_alias Alias for the left array to resolve potential conflicting fields in the result
#' @param right_alias Alias for the right array to resolve potential conflicting fields in the result
#' @param settings A named list as join settings. E.g. `list(algorithm = "'hash_replicate_right'")`
#' @return A new arrayOp instance
full_join = function(right,
by.x = NULL, by.y = NULL, by = NULL,
left_alias = '_L', right_alias = '_R',
settings = NULL
) {
# Overlay both outer flags onto any caller-supplied settings.
outerSettings = utils::modifyList(as.list(settings), list(left_outer = 1, right_outer = 1))
private$join(
self, right,
by = by, by.x = by.x, by.y = by.y,
join_mode = 'equi_join',
left_alias = left_alias, right_alias = right_alias,
settings = outerSettings
)
}
,
#' @description
#' Return an arrayOp instance with same schema as self and content cells that
#' match the cells of 'df_or_arrayop'.
#'
#' Similar to `dplyr::semi_join`, the result has the same schema as the left
#' operand 'self' and with content filtered by 'df_or_arrayop'.
#'
#' params `field_mapping`, `lower_bound` and `upper_bound`, if provided, must be named list,
#' where the names are from the source array (i.e. self), and values are from
#' the right operand `df_or_arrayop`
#' @param field_mapping NULL or a named list of strings. Only applicable when
#' mode is 'cross_between' mode, ignored in other modes.
#' @param lower_bound NULL or a named list of strings. Only applicable when
#' mode is 'filter' or 'cross_between'. Names of the list are fields of self,
#' and value strings are fields or columns of the `df_or_arrayop` which are
#' treated as lower bound to matching fields rather than exact match.
#'
#' In 'filter' mode, the self fields in lower_bound can be any numeric fields.
#' In 'cross_between' mode, the self fields in lower_bound must be array dimensions.
#' @param upper_bound NULL or a named list of strings. Only applicable when
#' mode is 'filter' or 'cross_between'. Names of the list are fields of self,
#' and value strings are fields or columns of the `df_or_arrayop` which are
#' treated as upper bound to matching fields rather than exact match.
#'
#' In 'filter' mode, the self fields in upper_bound can be any numeric fields.
#' In 'cross_between' mode, the self fields in upper_bound must be array dimensions.
#' @param df_or_arrayop An R data frame or arrayOp instance.
#' @param mode String of 'filter', 'cross_between', 'index_lookup', 'equi_join' or 'auto'
#' @param filter_threshold A number below which the 'filter' mode is used
#' unless a mode other than 'auto' is provided.
#' @param upload_threshold A number below which the 'df_or_arrayop' data frame
#' is compiled into a build literal array; otherwise uploaded to scidb as a regular array.
#' Only applicable when 'df_or_arrayop' is an R data frame.
#' @return An arrayOp instance
# Filter self's cells by matching cells of 'df_or_arrayop' (a semi-join).
# The result keeps self's schema; 'df_or_arrayop' only restricts content.
semi_join = function(df_or_arrayop,
field_mapping = NULL,
lower_bound = NULL,
upper_bound = NULL,
mode = "auto",
filter_threshold = 200L,
upload_threshold = 6000L
){
assert_inherits(df_or_arrayop, c("data.frame", "ArrayOpBase"))
# Every field referenced by the mapping/bound params must exist on self.
assert_empty(
(names(field_mapping) %u% names(lower_bound) %u% names(upper_bound)) %-% self$dims_n_attrs,
"Field(s) not found in the reference array: [{.value}]"
)
VALID_MODES = c("auto", "filter", "cross_between", "index_lookup", "equi_join")
assertf(
mode %in% VALID_MODES,
sprintf("Invalid param 'mode': [%s]. Should be one of [%s]",
mode, paste(VALID_MODES, collapse = ","))
)
# Resolve 'auto' into a concrete operation mode. The heuristics differ for
# a data-frame operand vs an arrayOp operand.
op_mode =
if(is.data.frame(df_or_arrayop)){
df = df_or_arrayop
# Each df column must map onto a self field, either implicitly (same
# name) or explicitly via field_mapping / lower_bound / upper_bound.
assert_empty(
names(df) %-% self$dims_n_attrs %-% as.character(field_mapping) %-% as.character(lower_bound) %-% as.character(upper_bound),
"Param 'df' has invalid column(s): [{.value}]"
)
numCells = base::nrow(df) * length(names(df))
numCols = length(names(df))
if(mode == "auto") {
# Small payloads compile into a plain filter expression; a single
# column uses index_lookup; all-dimension keys use cross_between;
# anything else falls back to equi_join.
if (numCells <= filter_threshold) {
"filter"
} else if (numCols == 1) {
"index_lookup"
} else if (all(names(df) %in% self$dims)) {
"cross_between"
} else {
"equi_join"
}
} else {
mode
}
} else { # rhs is array_op
if(mode == "auto"){
if(length(df_or_arrayop$dims_n_attrs) == 2L) {
"index_lookup"
} else if (!any(df_or_arrayop$dims_n_attrs %in% self$attrs)) {
"cross_between"
} else {
"equi_join"
}
} else {
# 'filter' needs literal values, which only a data frame can provide.
assertf(mode != "filter", "semi_join 'filter' mode only works with data.frame, but array_op is provided.")
mode
}
}
if(is.data.frame(df_or_arrayop)){
df = df_or_arrayop
if(op_mode == "filter"){
# Filter mode short-circuits: compile df rows directly into a filter
# expression without uploading anything to SciDB.
return(
private$semi_join_by_df_filter(
df,
field_mapping = field_mapping,
lower_bound = lower_bound,
upper_bound = upper_bound
)
)
}
# Build a one-dimensional template whose attribute types are copied from
# the matching self fields, then materialize df in SciDB with it.
explicitFields = as.list(c(field_mapping, lower_bound, upper_bound))
# if(.not_empty(as.character(explicitFields) %-% names(explicitFields))) browser()
implicitFields = names(df) %-% as.character(explicitFields)
implicitFields = new_named_list(implicitFields, implicitFields)
dfFields = as.character(explicitFields) %u% names(implicitFields)
refFields = c(names(explicitFields), as.character(implicitFields)) # allow duplicates here
templateDtypes = new_named_list(
private$get_field_types(refFields, .raw = T),
names = dfFields
)
arrayTemplate = private$create_new(
"", dims = "x",
attrs = dfFields,
dtypes = templateDtypes
)
# Small data frames become build literals, larger ones are uploaded
# (threshold controlled by 'upload_threshold').
dataArray = private$conn$array_from_df(df, arrayTemplate, build_or_upload_threshold = upload_threshold)
} else {
dataArray = df_or_arrayop
}
result = private$match(dataArray, op_mode = op_mode,
field_mapping = field_mapping,
lower_bound = lower_bound,
upper_bound = upper_bound)
# Hold a reference so the (possibly uploaded) data array is not removed
# by R's GC while the result is still alive.
result$.private$hold_refs(dataArray)
return(result)
}
,
#' @description
#' Create a new arrayOp with 'group by' fields
#'
#' The result arrayOp is identical to self except for the 'group_by' fields.
#' When called before `summarize` function, result arrayOp will be converted into `grouped_aggregate` operation.
#' @param ... field names as strings or string vectors, which will be merged
#' into a single string vector with `c(...)`
#' @return An arrayOp instance with group_by fields
group_by = function(...) {
  group_fields = c(...)
  assert_inherits(group_fields, "character", .symbol = "group_by_fields")
  assert_empty(group_fields %-% self$dims_n_attrs,
               "invalid group_by field(s): [{.value}]")
  # Spawn a copy with the same AFL; only the 'group_by_fields' meta differs.
  # A later `summarize` call picks this meta up and emits grouped_aggregate.
  grouped = self$spawn(self$to_afl())
  grouped$.private$set_meta('group_by_fields', unique(group_fields))
  grouped
}
,
#' @description
#' Create a new arrayOp with aggregated fields
#'
#' @param ... aggregation expressions in R syntax. Names of expressions are optional.
#' If provided, names will be the fields of result arrayOp; otherwise field
#' names are auto generated by scidb.
#' Same syntax as `...` in 'filter' and 'mutate' functions.
#' @param .dots a list of aggregation expressions. Similar to '.dots' in
#' 'mutate' and 'transmute'.
#' @return A new arrayOp instance
# Aggregate self's cells; grouped when `group_by()` was called beforehand.
summarize = function(..., .dots = NULL){
group_by_fields = private$get_meta("group_by_fields")
# expression list converted to a string list (R exprs -> AFL strings)
rawFieldsList = lapply(.dots %?% rlang::exprs(...), aflutils$e_to_afl)
# get aggregation expressions and their optional caller-provided names
aliases = names(rawFieldsList)
agg_exprs = as.character(rawFieldsList)
# Attach 'as alias' only for the expressions the caller actually named;
# unnamed ones keep scidb's auto-generated field names.
agg_exprs_pairs = if(!is.null(aliases)) {
ifelse(aliases == "", agg_exprs,
paste0(agg_exprs, ' as ', aliases))
} else agg_exprs
result =
if(.not_empty(group_by_fields)){
# Grouped aggregation via scidb's grouped_aggregate operator.
private$conn$afl_expr(afl(
self | grouped_aggregate(
paste(agg_exprs_pairs, collapse = ','),
group_by_fields
)))
} else {
# Ungrouped: expose dimensions as attributes first (apply), so they
# remain addressable inside the aggregate expressions.
private$conn$afl_expr(afl(
self |
apply(aflutils$join_fields(self$dims, self$dims)) |
aggregate(paste(agg_exprs_pairs, collapse = ','))
))
}
# result$.private$set_conn(private$conn)
result
}
,
#' @description
#' Create a new ArrayOp instance that has auto incremented fields and/or anti-collision fields according to a template arrayOp
#'
#' If the dimension count, attribute count and data types match between the source(self) and target,
#' then no redimension will be performed, otherwise redimension on the source first.
#'
#' Redimension mode requires all target fields exist on the source disregard of being attributes or dimensions.
#' Redimension mode does not check on whether source data types match the target because auto data conversion
#' occurs within scidb where necessary/applicable.
#' @param target A target ArrayOp the source data is written to.
#' @param source_auto_increment A single named integer, a single string, or NULL.
#' Eg. c(z=0) for field 'z' in the source (ie. self) starting from 0;
#' or a single string 'z' equivalent to c(z=0). If NULL, assume it to be the
#' only dimension in self, normally from an artificial dimension of a build literal or unpack operation.
#' @param target_auto_increment a named number vector or string vector or NULL.
#' where the name is a target field and value is the starting index.
#' E.g. c(aid=0, bid=1) means to set auto fields 'aid', 'bid' according to the target fields of the same name.
#' If 'target' doesn't have a cell, then default values start from 0 and 1 for aid and bid, respectively.
#' A string vector c("aid", "bid") is equivalent to c(aid=0, bid=0).
#' NULL means treat all missing fields (absent in self but present in target) as 0-based auto increment fields.
#' Here the `target_auto_increment` param only affects the initial load when the field is still null in the target array.
#' @param anti_collision_field a target dimension name which exists only to resolve cell collision
#' (ie. cells with the same dimension coordinate).
#' @param join_setting NULL or a named list. When not NULL, it is converted
#' to settings for scidb `equi_join` operator, only applicable when
#' `anti_collision_field` is not NULL.
#' @param source_anti_collision_dim_spec NULL or a string.
#' If NULL, the dimension spec for the anti-collision dimension in source
#' (self) is taken from self's schema.
#' In rare cases, we need to set the dimension spec to control the chunk size
#' in the 'redimension' operation, e.g. `source_anti_collision_dim_spec = "0:*:0:123456"`
#' @return A new arrayop instance
# Derive auto-increment and/or anti-collision fields on self from 'target'.
# Fixes: error messages contained the double negative "Cannot not infer";
# the string-shorthand branch now uses 0L for consistency with the rest.
set_auto_fields = function(target,
                           source_auto_increment = NULL,
                           target_auto_increment = NULL,
                           anti_collision_field = NULL,
                           join_setting = NULL,
                           source_anti_collision_dim_spec = NULL)
{
  assert_inherits(target, "ArrayOpBase")
  result = self
  assertf(length(source_auto_increment) == 1L || length(source_auto_increment) == 0L,
          "source_auto_increment should be a single value of string or integer")
  if (.is_empty(source_auto_increment)) {
    # Param not provided: assume the source array has exactly one 0-based
    # dimension and use it as the auto-increment source field.
    assertf(length(self$dims) == 1L,
            paste0("Cannot infer param source_auto_increment.",
                   "Please provide `c(z=1)` if `z` is a field in the source array starting from 1;",
                   "or `'z'` if z starts from 0."))
    source_auto_increment = structure(0L, names = self$dims)
  } else {
    if (is.character(source_auto_increment) && is.null(names(source_auto_increment))) {
      # A bare string is shorthand for a 0-based field, e.g. 'z' == c(z=0).
      source_auto_increment = structure(0L, names = source_auto_increment)
    } else {
      source_auto_increment = structure(as.integer(source_auto_increment),
                                        names = names(source_auto_increment))
    }
  }
  assertf(!any(is.na(source_auto_increment)),
          "Invalid param 'source_auto_increment': must be a named integer where name is the field name, and value is the field starting value")
  srcIncName = names(source_auto_increment)
  assertf(.not_empty(srcIncName) && srcIncName %in% self$dims_n_attrs,
          glue("source_auto_increment is not a valid field: {srcIncName}"))
  # Infer target_auto_increment as all target fields missing from the source
  # when neither it nor an anti-collision field is given.
  if (.is_empty(anti_collision_field) &&
      .is_empty(target_auto_increment)) {
    missingTargetFields = target$dims_n_attrs %-% self$dims_n_attrs
    assert_not_empty(missingTargetFields,
                     "Cannot infer param target_auto_increment because there are no fields in target array for the source array to add.")
    target_auto_increment = structure(rep(0L, length(missingTargetFields)),
                                      names = missingTargetFields)
  }
  # A bare string vector means 0-based start for each listed target field.
  # Unlike source_auto_increment, multiple fields are allowed here.
  if (is.character(target_auto_increment) &&
      is.null(names(target_auto_increment))) {
    target_auto_increment = structure(rep(0L, length(target_auto_increment)),
                                      names = target_auto_increment)
  }
  # Compute the auto-increment id(s). Evaluation is lazy: the id expressions
  # are only executed when the resulting AFL runs in scidb.
  if (.not_empty(target_auto_increment)) {
    result = result$.private$set_auto_increment_field(target,
      source_field = names(source_auto_increment), ref_field = names(target_auto_increment),
      source_start = source_auto_increment, ref_start = target_auto_increment)
  }
  # Resolve cell collisions (cells sharing the same dimension coordinates)
  # via a dedicated target dimension when requested.
  if (.not_empty(anti_collision_field)) {
    result = result$.private$set_anti_collision_field(target, anti_collision_field, join_setting = join_setting,
                                                      source_anti_collision_dim_spec = source_anti_collision_dim_spec)
  }
  result = private$conn$afl_expr(result$to_afl())
  return(result)
}
,
#' @description
#' Update the target array with self's content
#'
#' Similar behavior to scidb insert operator.
#' Require numbers of attributes and dimensions of self and target arrays match.
#' Field names are irrelevant.
#'
#' This function only returns an arrayOp with the update operation AFL
#' encapsulated. No real action is performed in scidb until
#' `source$update(target)$execute()` is called.
#'
#' @param target An arrayOp instance where self's content is updated. Must be
#' a persistent array, since it is meaningless to update an array operation.
#' @return A new arrayOp that encapsulates the update operation
update = function(target) {
  # Inserting only makes sense into a stored (named) scidb array.
  assert_inherits(target, "ArrayOpBase")
  assertf(
    target$is_persistent(),
    "update: param 'target' must be a persistent array"
  )
  # Delegate to the version-specific AFL 'insert' generator.
  private$afl_insert(target)
}
,
#' @description
#' Overwrite the target array with self's content
#'
#' Similar behavior to scidb store operator.
#' Require numbers of attributes and dimensions of self and target arrays match.
#' Field names are irrelevant.
#'
#' This function only returns an arrayOp with the overwrite operation AFL
#' encapsulated. No real action is performed in scidb until
#' `source$overwrite(target)$execute()` is called.
#'
#' Warning: Target's content will be erased and filled with self's content.
#'
#' @param target An arrayOp instance where self's content is written to.
#' Must be a persistent array that either preexists or does not exist in scidb.
#' @return A new arrayOp that encapsulates the overwrite operation
overwrite = function(target) {
  # Storing only makes sense into a stored (named) scidb array.
  assert_inherits(target, "ArrayOpBase")
  # Fix: the assertion message previously said "update:" (copy-paste from
  # the `update` method); it now names this method.
  assertf(target$is_persistent(), "overwrite: param 'target' must be a persistent array")
  # Delegate to the version-specific AFL 'store' generator.
  private$afl_store(target)
}
,
#' @description
#' Create a new ArrayOp instance that encapsulates a delete operation
#'
#' Implemented by scidb `delete` operator.
#'
#' Operators for any type of fields include `==`, `!=`,
#' `%in%`, `%not_in%`.
#' To test whether a field is null, use unary operators: `is_null`, `not_null`.
#'
#' Special binary operators for string fields include:
#' `%contains%`, `%starts_with%`, `%ends_with%`, `%like%`, where
#' only `%like%` takes a regular expression and other operators escape any special
#' characters in the right operand.
#'
#' Operators for numeric fields include: `>`, `<`, `>=`, `<=`
#'
#' @param ... Filter expression(s) in R syntax.
#' These expression(s) are not evaluated in R but first captured then converted to scidb expressions with appropriate syntax.
#' @param .expr A single R expression, or a list of R exprs, or NULL.
#' If provided, `...` is ignored. Mutiple exprs are joined by 'and'.
#' This param is useful when we want to pass an already captured R expression.
#' @param .regex_func A string of regex function implementation, default 'regex'.
#' Due to scidb compatibility issues with its dependencies, the regex function from the boost library may not be available.
#' Currently supported options include 'rsub', and 'regex'
#' @param .ignore_case A Boolean, default TRUE. If TRUE, ignore case in string match patterns.
#' Otherwise, perform case-sensitive regex matches.
#' @return A new arrayOp
delete_cells = function(..., .expr = NULL,
                        .regex_func = getOption('arrayop.regex_func', default = 'regex'),
                        .ignore_case = getOption('arrayop.ignore_case', default = TRUE)) {
  # Prefer an explicitly passed expression; otherwise capture and merge `...`.
  predicate = .expr %?% aflutils$e_merge(aflutils$e(...))
  if (.is_empty(predicate)) {
    # No predicate at all: emit a delete that matches nothing.
    return(self$spawn(afl(self | delete("false"))))
  }
  self$spawn(afl(self |
                   delete(aflutils$e_to_afl(predicate,
                                            regex_func = .regex_func,
                                            ignore_case = .ignore_case))))
}
,
#' @description
#' Create a new ArrayOp instance with selected fields
#'
#' NOTE: this does NOT change the to_afl output, but explicitly state which field(s) should be retained if used in
#' a parent operation that changes its schema, e.g. `inner_join`, `left_join`, `right_join` and `to_df`.
#'
#' The `select`ed fields are passed on to derived ArrayOp instances.
#'
#' In all join operations, if no field is explicitly `select`ed, then all fields are assumed be retained.
#' In `to_df`, if no field is explicitly `select`ed, only the attributes are retrieved as data frame columns.
#' In `to_df_all`, if no field is explicitly `select`ed, it is equivalent to select all dimensions and attributes.
#' @param ... Which field(s) to retain.
#' @return A new arrayOp
select = function(...) {
  chosen = c(...)
  if (!is.null(chosen)) assert_inherits(chosen, "character")
  assert_empty(chosen %-% self$dims_n_attrs,
               "Invalid field(s) in select: [{.value}]")
  # Record the selection in meta only; the AFL string is left untouched.
  # Downstream ops (joins, to_df) consult the 'selected' meta entry.
  meta = private$metaList
  meta[['selected']] <- chosen
  private$create_new(private$raw_afl, meta_list = meta)
}
# db functions ----
,
#' @description
#' Download query result of self's AFL string with all self's fields.
#'
#' @param arrow Boolean, default FALSE. Forwarded to the connection's
#' `query_all`; presumably toggles Apache Arrow transfer format —
#' TODO confirm against `ScidbConnection$query_all`.
#' @return An R data frame with columns from self's dimensions and
#' attributes if no fields are selected, or the selected fields.
to_df_all = function(arrow = FALSE) {
private$conn$query_all(private$to_df_afl(), arrow = arrow)
}
,
#' @description
#' Download query result of self's AFL string with all self's attributes.
#'
#' @param arrow Boolean, default FALSE. Forwarded to the connection's
#' `query`; presumably toggles Apache Arrow transfer format —
#' TODO confirm against `ScidbConnection$query`.
#' @return An R data frame with columns from self's attributes if no fields
#' are selected, or the selected fields only.
to_df = function(arrow = FALSE) {
private$conn$query(private$to_df_afl(drop_dims = TRUE), arrow = arrow)
}
,
#' @description
#' Execute the AFL string for pure side effect without result returned.
#'
#' @return NULL, invisibly. (Doc previously claimed `self` was returned,
#' which did not match the implementation.)
execute = function() {
private$conn$execute(self$to_afl())
invisible(NULL)
}
,
#' @description
#' Persist array operation as scidb array
#'
#' If `self` is a persistent array and no `save_array_name` provided, then
#' `self` is returned.
#'
#' Otherwise, save self's AFL as a new scidb array. This includes two cases:
#' 1. `self` is persistent and `save_array_name` is provided, i.e. explicit persistence
#' 2. `self` is array operation(s), then a new array is created regardless of `save_array_name`
#'
#' From users perspective,
#'
#' 1. When we need to ensure a handle to a persistent array and do not care
#' whether it is a new or existing array, we should leave out
#' `save_array_name` to avoid unnecessary array copying. E.g. `conn$array_from_df`
#' may return a build literal or uploaded persistent array, call
#' `conn$array_from_df(...)$persist()` to ensure a persistent array.
#' 2. When we need to backup an array, then provide a `save_array_name` explicitly.
#'
#' Parameters `.gc` and `.temp` are only applicable when a new array is created.
#'
#' @param save_array_name NULL or String. The new array name to save self's AFL as.
#' If NULL, the array name is randomly generated when a new array is created.
#' @param .temp Boolean, default FALSE. Whether to create a temporary scidb array.
#' @param .gc Boolean, default TRUE. Whether to remove the persisted scidb array
#' once the encapsulating arrayOp goes out of scope in R. Set to FALSE if we
#' need to keep the array indefinitely.
#' @return A new arrayOp instance or self; or throw an exception if
#' `save_array_name` is provided and the array already exists
persist = function(
save_array_name = NULL,
.temp = FALSE,
.gc = TRUE
){
# An explicit name, when given, must be a single string.
if(!is.null(save_array_name))
assert_single_str(save_array_name)
# No need to store an already persistent array unless an explicit
# destination name was requested (explicit backup/copy case).
if(self$is_persistent() && is.null(save_array_name)) return(self)
# Store the AFL result as a new scidb array; the name is randomly
# generated when not provided.
private$conn$private$array_from_stored_afl(
private$to_df_afl(),
save_array_name = save_array_name %?% dbutils$random_array_name(),
.temp = .temp,
.gc = .gc
)
}
# array schema ----
,
#' @description
#' Create a new ArrayOp instance whose schema is the same as the `template`.
#'
#' This operation throws away any fields that do not exist in `template` while keeping the `self`'s data of the
#' matching fields.
#'
#' Implemented by scidb `redimension` operator, but it allows for partial-fields match if `strict=F`.
#'
#' @param template an ArrayOp instance as the schema template.
#' @param strict If TRUE(default), requires `self` has all the `template` fields.
#' @param .setting a string vector, where each item will be appended to the redimension operand.
#' E.g. .setting = c('false', 'cells_per_chunk: 1234') ==> redimension(source, template, false, cells_per_chunk: 1234)
#' @return A new arrayOp instance
change_schema = function(template, strict = TRUE, .setting = NULL){
  # In strict mode the template is used as-is; otherwise narrow the template
  # down to the fields that actually exist in the source (self).
  realized_template = if (strict) {
    template
  } else {
    shared_dims = template$dims %n% self$dims_n_attrs
    assert_has_len(shared_dims,
                   "ERROR:ArrayOp$change_schema:None of the template dimension(s) '%s' found in the source.",
                   paste(template$dims, collapse = ','))
    shared_fields = template$dims_n_attrs %n% self$dims_n_attrs
    template$spawn(excluded = template$dims_n_attrs %-% shared_fields)
  }
  private$afl_redimension(realized_template, .setting)
}
,
# @param .chunk_size aka. cells_per_chunk in 'flatten' mode.
#' @description
#' Create a new arrayOp by dropping dimensions of 'self'.
#'
#' Use `mode = 'unpack'` to still keep an artificial dimension in result arrayOp.
#' The dimension is 0-based, auto-incremented up until `self$cell_count() - 1`.
#' 'unpack' mode is useful in taking advantage of this artificial dimension to
#' auto populate other fields, e.g. in `set_auto_fields`.
#'
#' Use `mode = 'flatten'` to return a scidb data frame which has no explicit dimensions.
#'
#' Result arrayOp in both modes has attributes of self's attributes and dimensions.
#' @param mode String 'unpack' (default) or 'flatten'.
#' @param .chunk_size NULL or an integer. Converted to the 'chunk_size' param
#' in 'unpack' mode; and 'cells_per_chunk' in 'flatten' mode.
#' @param .unpack_dim NULL (default) or string as the dimension if 'unpack'
#' mode is chosen. NULL defaults to a random field name.
#' @return A new arrayOp instance
drop_dims = function(mode = 'unpack', .chunk_size = NULL, .unpack_dim = dbutils$random_field_name()){
assert_single_str(mode)
if(!is.null(.chunk_size)) assert_single_num(.chunk_size)
VALID_MODES = c('unpack', 'flatten')
assertf(mode %in% VALID_MODES,
glue("unknown mode '{mode}'. Should be one of [{paste(VALID_MODES, collapse = ',')}]"))
conn = private$conn
# NOTE(review): .ifelse is a package helper; if it evaluates both branch
# arguments eagerly, both AFL expressions get constructed — confirm it is
# lazy (or at least side-effect free here).
if(mode == 'unpack'){
# 'unpack' keeps one artificial 0-based dimension named .unpack_dim;
# .chunk_size maps to unpack's chunk_size operand when provided.
.ifelse(
is.null(.chunk_size),
conn$afl_expr(afl(self | unpack(.unpack_dim))),
conn$afl_expr(afl(self | unpack(.unpack_dim, .chunk_size)))
)
} else {
# 'flatten' yields a scidb data frame with no explicit dimensions;
# .chunk_size maps to the cells_per_chunk setting when provided.
.ifelse(
is.null(.chunk_size),
conn$afl_expr(afl(self | flatten)),
conn$afl_expr(afl(self | flatten(glue("cells_per_chunk: {.chunk_size}"))))
)
}
}
,
#' @description
#' Create a new arrayOp with actual schema from SciDB or 'self' if
#' `self$is_schema_from_scidb == T`.
#'
#' Useful in confirming the schema of complex array operations.
#' If the array schema is already retrieved from SciDB, then just return self.
#' @return An arrayOp instance
sync_schema = function() {
  # Schema already confirmed against SciDB: nothing to do.
  if (self$is_schema_from_scidb) return(self)
  # Stored arrays are looked up directly; operation expressions have their
  # schema re-derived from the AFL.
  if (self$is_persistent()) {
    private$conn$array(self$to_afl())
  } else {
    private$conn$afl_expr(self$to_afl())
  }
}
,
#' @description
#' Create a new ArrayOp instance using 'self' as a template
#'
#' This function is mainly for array schema string generation when
#' we want to rename, add, and/or exclude certain fields of self, but still
#' keep other unspecified fields unchanged.
#'
#' Data types and dimension specs of existing fields are inherited from 'self' unless provided explicitly.
#' New field data types default to NAs unless provided explicitly.
#'
#' This function is normally used internally for arrayOp generation.
#'
#' @param afl_str An AFL expression. In case of using the spawned result as
#' a schema template only, the afl_str does not need to be provided. Otherwise,
#' it should conform with the actual resultant arrayOp instance, which is very rare.
#' @param renamed A list of renamed fields where names are old fields and values are new field names.
#' @param added New fields added to result arrayOp. String vector or NULL.
#' @param excluded Fields excluded from `self`. String vector or NULL.
#' @param dtypes NULL or a named list of data types for fields of the result arrayOp,
#' where names are field names, values (strings) are data types.
#' @param dim_specs NULL or a named list of array dimension specs,
#' where names are dimension names, values (strings) are dimension specs in scidb format.
#' @return A new arrayOp instance
# Build a new arrayOp from self's schema with fields renamed/added/excluded.
spawn = function(afl_str = "spawned array_op (as template only)",
renamed = NULL,
added = NULL,
excluded = NULL,
dtypes = NULL,
dim_specs = NULL) {
assert_inherits(afl_str, "character")
if(.not_empty(renamed)){
assert_named_list(renamed, "ERROR: ArrayOp$spawn: 'renamed' param must be a named list, but got: %s", renamed)
}
assert_inherits(added, c("NULL", "character"))
assert_inherits(excluded, c("NULL", "character"))
# Start from self's schema; all edits below operate on these copies.
attrs = self$attrs
dims = self$dims
oldDtypes = self$dtypes
oldDimSpecs = private$get_dim_specs()
# Rename the existing fields (and the names of their dtype/dim-spec entries)
if(.not_empty(renamed)){
renamedOldFields = names(renamed)
assert_empty(renamedOldFields %-% self$dims_n_attrs,
"Unknown field(s) in param `renamed`: [{.value}]")
attrs = as.character(replace(attrs, attrs %in% renamedOldFields, .remove_null_values(renamed[attrs])))
dims = as.character(replace(dims, dims %in% renamedOldFields, .remove_null_values(renamed[dims])))
namesOldDtypes = names(oldDtypes)
names(oldDtypes) <- replace(namesOldDtypes, namesOldDtypes %in% renamedOldFields, .remove_null_values(renamed[namesOldDtypes]))
namesOldDimSpecs = names(oldDimSpecs)
names(oldDimSpecs) <- replace(namesOldDimSpecs, namesOldDimSpecs %in% renamedOldFields, .remove_null_values(renamed[namesOldDimSpecs]))
}
# Add new fields: those with a dim_specs entry become dimensions,
# the rest become attributes.
if(.not_empty(added)){
addedDims = added %n% names(dim_specs)
addedAttrs = added %-% names(dim_specs)
attrs = attrs %u% addedAttrs
dims = dims %u% addedDims
}
# Exclude the excluded fields from both dims and attrs
if(.not_empty(excluded)){
attrs = attrs %-% excluded
dims = dims %-% excluded
}
# Explicit dtypes take precedence over inherited ones: name-based `[`
# lookup returns the first match in c(dtypes, oldDtypes).
dtypes = .remove_null_values(
c(dtypes, oldDtypes)[c(dims, attrs)]
)
# Same precedence rule for dimension specs.
dim_specs = .ifelse(.not_empty(dim_specs), c(dim_specs, oldDimSpecs), oldDimSpecs)
private$create_new(afl_str, dims = dims, attrs = attrs, dtypes = dtypes, dim_specs = dim_specs)
}
# AFL -------------------------------------------------------------------------------------------------------------
,
#' @description
#' AFL string encapsulated by of the self ArrayOp
#'
#' AFL can be either an scidb array name or array operation(s) on array(s).
#'
#' The ArrayOp instance may have 'selected' fields but they are not reflected in the result.
#' 'selected' fields are not reflected here, but determines which fields are retained in `to_df()` calls.
#'
#' @return an AFL expression string
to_afl = function() {
  # The raw AFL string is stored verbatim on construction; return it as-is.
  private$raw_afl
}
,
#' @description
#' Return a schema representation of the ArrayOp `<attr1:type1 [, attr2:type2 ...]> [dim1 [;dim2]]`
#'
#' Unless `sync_schema()` is called, the schema may be inferred locally in R to save round trips between R and SciDB server.
#' SciDB data frames have hidden dimensions that start with `$`
#'
#' @return SciDB schema string
to_schema_str = function() {
  # "<attr:type,...>" part, shared by both array and data-frame forms.
  attr_part = paste(self$attrs, private$get_field_types(self$attrs),
                    sep = ':', collapse = ',')
  if (length(self$dims) == 0) {
    # SciDB data frame format: no dimension section at all.
    return(sprintf("<%s>", attr_part))
  }
  # Each dimension renders as "name=spec" when a spec is known, else "name".
  dim_items = mapply(function(nm, spec) {
    if (.not_empty(spec) && nchar(spec) > 0) sprintf("%s=%s", nm, spec)
    else nm
  }, self$dims, private$get_dim_specs())
  sprintf("<%s> [%s]", attr_part, paste(dim_items, collapse = ';'))
}
# Misc array operators ------------------------------------------------------------------------------------------
,
#' @description
#' Create a new arrayOp that encapsulate AFL for the first `n` cells of 'self'
#'
#' We still need to append a `to_df` call to download the result as data frame.
#' @param n How many cells to take
#' @param skip How many rows to skip before taking
#' @return A new ArrayOp instance
limit = function(n = 5, skip = NULL) {
  assert_single_number(n)
  assert(is.null(skip) || is.numeric(skip))
  # scidb's limit takes an optional second operand for skipped cells.
  limited = if (is.null(skip)) {
    afl(self | limit(n))
  } else {
    afl(self | limit(n, skip))
  }
  self$spawn(limited)
}
,
#' @description
#' Return the number of cells of 'self'
#' @return A number of cells if the AFL that 'self' encapsulates is run.
cell_count = function(){
  # Stored arrays expose a cheap count via summarize; operation expressions
  # need an explicit count() query.
  if (self$is_persistent()) {
    self$summarize_array()$count
  } else {
    private$conn$query(afl(self | op_count))[["count"]]
  }
}
,
#' @description
#' Return a data frame of the summary of the 'self' array
#'
#' Implemented by scidb 'summarize' operator
#' @param by_attribute Summarize by array attributes
#' @param by_instance Summarize by array scidb instances
#' @return A data frame of the 'self' array summary
summarize_array = function(by_attribute = FALSE, by_instance = FALSE){
private$conn$query_all(afl(self | summarize(
glue("by_attribute:{by_attribute}, by_instance:{by_instance}")))
)
}
,
#' @description
#' Return a data frame of all self's versions
#' @return An R data frame with columns: version_id and timestamp
list_versions = function(){
  # Version history only exists for stored (named) arrays.
  assertf(self$is_persistent(),
          "list_versions only works on persistent arrays")
  private$conn$query(afl(self | versions))
}
,
#' @description
#' Get an arrayOp instance that encapsulates a version snapshot of a persistent
#' scidb array
#'
#' The function does not perform version check in scidb. It only construct
#' an arrayOp locally to represent a specific version. If a non-existent
#' version_id is later used in scidb related operations, an error will be
#' thrown by SciDB.
#'
#' @param version_id A number of the array version_id
#' @return An arrayOp instance with the same schema as self
version = function(version_id) {
  assertf(self$is_persistent(), "version only works on persistent arrays")
  assert_single_num(version_id)
  # SciDB addresses a version snapshot with the 'array@version' syntax.
  # No server-side validation happens here; a bad id fails later in scidb.
  versioned_afl = sprintf("%s@%s", self$to_afl(), version_id)
  self$spawn(versioned_afl)
}
,
#' @description
#' Returns whether the current arrayOp instance encapsulates a persistent scidb
#' array name that may or may not exist on the scidb server
#'
#' No checking with scidb server is performed. Only validate the arrayOp's AFL
#' with regex and see if it matches an array name. E.g. "myNamespace.myArray"
#' or "myArrayInPublicNamespace".
#'
#' @return TRUE or FALSE
is_persistent = function(){
  # Persistent means the AFL is a bare, optionally namespace-qualified
  # identifier with no '(' that would indicate an operator expression.
  afl_str = self$to_afl()
  !grepl("\\(", afl_str) &&
    grepl("^((\\w+)\\.)?(\\w+)$", afl_str)
}
,
#' @description
#' Returns whether the current arrayOp instance encapsulates a persistent scidb
#' array that exists on the scidb server
#'
#' If current arrayOp encapsulates an array operation, then it returns FALSE
#' without checking with scidb server.
#'
#' @return TRUE or FALSE
exists_persistent_array = function() {
  # Only consult the server when the AFL even looks like an array name.
  if (!self$is_persistent()) return(FALSE)
  nrow(self$array_meta_data()) == 1L
}
,
#' @description
#' Download the array meta as an R data frame
#'
#' The array metadata is retrieved from executing the scidb 'show' operator
#' in the array namespace and match for the current array name.
#' Array metadata include fields: "name", "uaid", "aid", "schema", "availability", "temporary", "namespace", "distribution", "etcomp"
#' @return An R data frame
array_meta_data = function(){
  assertf(self$is_persistent(),
          "Array meta data is only available for persistent scidb arrays.")
  full_array_name = self$to_afl()
  # Split 'namespace.array' into parts; ns is '' when unqualified.
  name_pattern = "^((\\w+)\\.)?(\\w+)$"
  ns = gsub(name_pattern, "\\2", full_array_name)
  bare_name = gsub(name_pattern, "\\3", full_array_name)
  # List arrays in the right namespace, then match on the bare name.
  listing_afl = if (ns == "") "list('arrays')"
    else sprintf("list('arrays', ns:%s)", ns)
  private$conn$query(sprintf("filter(%s, name = '%s')", listing_afl, bare_name))
}
,
#' @description
#' Remove array versions of self
#'
#' Only applicable to persistent arrays.
#' **Warning**: This function will be executed effectively in scidb without
#' extra 'execute()' and cannot be undone.
#' @param version_id NULL or a number. When set to NULL, all array versions
#' are removed except for the latest one. When set to an number, must be
#' a valid version_id of self, in which case all versions up to the 'version_id'
#' are removed.
#' @return NULL
remove_versions = function(version_id = NULL){
assertf(self$is_persistent(), "remove_versions only works on persistent arrays")
if(is.null(version_id)) {
# No id: scidb removes every version except the latest.
private$conn$execute(afl(self | remove_versions))
} else {
# With an id: remove all versions up to and including 'version_id'.
version_id = as.character(version_id)
assert_single_str(version_id, "ERROR: param 'version_id' cannot be converted to a single str: %s", deparse(version_id))
private$conn$execute(afl(self | remove_versions(version_id)))
}
invisible(NULL)
}
,
#' @description
#' Remove the persistent array that self encapsulates
#'
#' Only applicable to persistent arrays.
#' **Warning**: This function will be executed effectively in scidb without
#' extra 'execute()' and cannot be undone.
#' @return NULL
remove_array = function(){
assertf(self$is_persistent(), "remove_array only works on persistent arrays")
if (self$exists_persistent_array()) {
self$.private$conn$execute(afl(self | remove))
if (self$.private$is_gc()) {
# The array is already removed; disarm any held SciDBR objects so
# their own finalizers do not try to remove it a second time.
for (obj in self$.private$get_meta('.refs')){
if ("scidb" %in% class(obj)) {
# Since we've removed the array, we no longer need to do so when we GC
obj@meta$remove = FALSE
}
}
}
}
invisible(NULL)
}
,
#' @description
#' A finalize function executed when the 'self' instance is garbage collected in R
#'
#' If an arrayOp is marked as .gc = T, then it will be removed from scidb
#' when this function is executed.
#'
#' We don't normally call this function except in testing.
finalize = function() {
# Extra referenced objects whose gc is managed by the SciDBR package;
# detach them first so they are not touched below.
self$.private$remove_scidbr_from_refs()
# Drop remaining refs so arrays marked .gc=TRUE can be reclaimed.
private$set_meta('.refs', NULL)
}
)
)