R/expression.R

Defines functions is.na.Expression Ops.Expression op scalar field_ref get_nested_field `$.Expression` `[[.Expression` create

Documented in scalar

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Arrow expressions
#'
#' @description
#' `Expression`s are used to define filter logic for passing to a [Dataset]
#' [Scanner].
#'
#' `Expression$scalar(x)` constructs an `Expression` which always evaluates to
#' the provided scalar (length-1) R value.
#'
#' `Expression$field_ref(name)` is used to construct an `Expression` which
#' evaluates to the named column in the `Dataset` against which it is evaluated.
#'
#' `Expression$create(function_name, ..., options)` builds a function-call
#' `Expression` containing one or more `Expression`s. Anything in `...` that
#' is not already an expression will be wrapped in `Expression$scalar()`.
#'
#' `Expression$op(FUN, ...)` is for logical and arithmetic operators. Where
#' possible, scalar inputs in `...` are cast to the common type of the
#' `Expression`s in the call, so that the types of the columns in the `Dataset`
#' are preserved and not unnecessarily upcast, which may be expensive.
#' @name Expression
#' @rdname Expression
#' @include arrowExports.R
#' @export
Expression <- R6Class("Expression",
  inherit = ArrowObject,
  public = list(
    # String rendering of the expression, produced by the C++ binding.
    ToString = function() {
      compute___expr__ToString(self)
    },
    # TRUE only when `other` is itself an Expression and the C++ comparison
    # reports equality. `...` is accepted but not used here.
    Equals = function(other, ...) {
      inherits(other, "Expression") && compute___expr__equals(self, other)
    },
    # TODO: Implement type determination without storing
    # schemas in Expression objects (ARROW-13186)
    schema = NULL,
    # Type of the expression when resolved against `schema`. The schema
    # defaults to the one stored on this object and must not be NULL.
    type = function(schema = self$schema) {
      assert_that(!is.null(schema))
      compute___expr__type(self, schema)
    },
    # Same as `type()`, but returns the type id from the C++ binding.
    type_id = function(schema = self$schema) {
      assert_that(!is.null(schema))
      compute___expr__type_id(self, schema)
    },
    # Ask the C++ binding whether this expression is a field reference.
    is_field_ref = function() {
      compute___expr__is_field_ref(self)
    },
    # Build a "cast" call Expression around this one. `safe` and `...` are
    # handed to cast_options(); the target type is stored in the options
    # under `to_type` after being passed through as_type().
    cast = function(to_type, safe = TRUE, ...) {
      cast_opts <- cast_options(safe, ...)
      cast_opts$to_type <- as_type(to_type)
      Expression$create("cast", self, options = cast_opts)
    }
  ),
  active = list(
    # Name of the referenced field, as reported by the C++ binding.
    field_name = function() {
      compute___expr__get_field_ref_name(self)
    }
  )
)
# Build a function-call Expression.
#
# function_name: a string naming the compute function to call.
# ... / args:    the call's arguments; `args` defaults to `list(...)`.
#                Anything that is not already an Expression is wrapped with
#                Expression$scalar().
# options:       options forwarded to the C++ binding.
#
# Returns the new Expression, with its `schema` field set from the schemas of
# its arguments.
Expression$create <- function(function_name,
                              ...,
                              args = list(...),
                              options = empty_named_list()) {
  assert_that(is.string(function_name))

  # Coerce every input to an Expression
  as_expression <- function(arg) {
    if (inherits(arg, "Expression")) {
      arg
    } else {
      Expression$scalar(arg)
    }
  }
  args <- lapply(args, as_expression)

  call_expr <- compute___expr__call(function_name, args, options)

  # Schema bookkeeping: combine the argument schemas when there are arguments
  call_expr$schema <- if (length(args) > 0L) {
    unify_schemas(schemas = lapply(args, function(arg) arg$schema))
  } else {
    # TODO: this shouldn't be necessary
    schema()
  }
  call_expr
}


#' @export
`[[.Expression` <- function(x, i, ...) {
  # `expr[[i]]` is a nested-field lookup on the Expression; `...` is ignored.
  get_nested_field(x, i)
}

#' @export
`$.Expression` <- function(x, name, ...) {
  # `expr$name` first looks for a member called `name` among the names
  # listed by ls() on the object; any other name is treated as a
  # nested-field lookup on the Expression.
  assert_that(is.string(name))
  if (!(name %in% ls(x))) {
    return(get_nested_field(x, name))
  }
  get(name, x)
}

# Extract the field `name` from the Expression `expr`.
#
# * If `expr` is a field reference, a nested field reference is built.
# * Otherwise `expr` must resolve to a struct type containing `name`, and a
#   "struct_field" call Expression is built.
#
# Errors if `expr` is neither, or if the struct has no field called `name`.
# The result carries the same `schema` as `expr`.
get_nested_field <- function(expr, name) {
  if (expr$is_field_ref()) {
    # Make a nested field ref
    # TODO(#33756): integer (positional) field refs are supported in C++
    assert_that(is.string(name))
    nested <- compute___expr__nested_field_ref(expr, name)
  } else {
    # Not a field ref: only struct-typed expressions can be indexed, using
    # the struct_field kernel
    parent_type <- expr$type() # errors if no schema set
    if (!inherits(parent_type, "StructType")) {
      # TODO(#33757): if expr is list type and name is integer or Expression,
      # call list_element
      stop(
        "Cannot extract a field from an Expression of type ", parent_type$ToString(),
        call. = FALSE
      )
    }
    # Because we have the type, we can validate that the field exists
    if (!(name %in% names(parent_type))) {
      stop(
        "field '", name, "' not found in ",
        parent_type$ToString(),
        call. = FALSE
      )
    }
    nested <- Expression$create(
      "struct_field",
      expr,
      options = list(field_ref = Expression$field_ref(name))
    )
  }
  # Schema bookkeeping
  nested$schema <- expr$schema
  nested
}

# Construct an Expression that refers to a field by name.
# `name` must be a single string.
Expression$field_ref <- function(name) {
  # TODO(#33756): allow construction of field ref from integer
  assert_that(is.string(name))
  field_expr <- compute___expr__field_ref(name)
  field_expr
}
# Construct an Expression that wraps a scalar value.
# `x` may already be a Scalar; anything else goes through Scalar$create().
# The result's `schema` field is set to `schema()` called with no arguments.
Expression$scalar <- function(x) {
  value <- if (inherits(x, "Scalar")) x else Scalar$create(x)
  scalar_expr <- compute___expr__scalar(value)
  scalar_expr$schema <- schema()
  scalar_expr
}
# Build an Expression for an R operator. Compared with calling
# Expression$create directly, this:
# (1) translates the R operator name in `FUN` to an Arrow C++ compute function
#     name through `.operator_map`, with dedicated handling for unary minus
#     and for the division-like operators ("/", "%/%", "%%");
# (2) passes `args` through cast_scalars_to_common_type() (for every operator
#     except "%/%"), which is meant to let scalar inputs take on the type of
#     the columns/fields in the expression instead of upcasting all the data.
#
# FUN:        the R operator, as a string.
# ... / args: the operands; `args` defaults to `list(...)`.
Expression$op <- function(FUN,
                          ...,
                          args = list(...)) {
  # Unary minus: negate Arrow objects in Arrow, anything else in plain R
  if (FUN == "-" && length(args) == 1L) {
    operand <- args[[1]]
    if (!inherits(operand, c("ArrowObject", "Expression"))) {
      return(-operand)
    }
    return(Expression$create("negate_checked", operand))
  }

  if (FUN == "%/%") {
    # %/% behavior is switched on the actual input types, so the scalars are
    # deliberately not cast to match the columns before this point.
    # In R, integer division works like floor(float division)
    quotient <- Expression$create("floor", Expression$op("/", args = args))

    # ... but if inputs are integer, make sure we return an integer
    integer_type_ids <- Type[toupper(INTEGER_TYPES)]
    is_integer_input <- function(value) {
      if (is.integer(value)) {
        return(TRUE)
      }
      inherits(value, "ArrowObject") && value$type_id() %in% integer_type_ids
    }

    if (is_integer_input(args[[1]]) && is_integer_input(args[[2]])) {
      # An Arrow first operand supplies the result type; otherwise it is an
      # R integer, i.e. int32
      result_type <- if (inherits(args[[1]], "ArrowObject")) {
        args[[1]]$type()
      } else {
        int32()
      }
      # If args[[2]] == 0, float division returns Inf,
      # but for integer division R returns NA, so wrap in if_else
      quotient <- Expression$create(
        "if_else",
        Expression$op("==", args[[2]], 0L),
        Scalar$create(NA_integer_, result_type),
        cast(quotient, result_type, allow_float_truncate = TRUE)
      )
    }
    return(quotient)
  }

  # Every remaining operator: try to cast scalars to match the columns
  args <- cast_scalars_to_common_type(args)

  if (FUN == "%%") {
    # Modulo expressed through subtraction, multiplication and %/%
    return(args[[1]] - args[[2]] * (args[[1]] %/% args[[2]]))
  }

  if (FUN == "/") {
    # In Arrow, "divide" is one function, which does integer division on
    # integer inputs and floating-point division on floats, so force floats.
    # TODO: casting every input to float64 assumes a lot about the input types
    args <- lapply(args, cast, float64())
  }

  Expression$create(.operator_map[[FUN]], args = args)
}

#' @export
Ops.Expression <- function(e1, e2) {
  # S3 group generic for operators applied to Expressions; `.Generic` holds
  # the operator being dispatched.
  #
  # e1, e2: the operands. For a unary operator R supplies only `e1`.
  # Returns the Expression built by Expression$create() / Expression$op().
  if (.Generic == "!") {
    return(Expression$create("invert", e1))
  }
  if (missing(e2)) {
    # Unary `-` / `+`: forwarding the missing `e2` would raise
    # "argument "e2" is missing" as soon as Expression$op() collects its
    # arguments, so pass the single operand only. This is what lets
    # Expression$op() reach its unary-minus handling.
    return(Expression$op(.Generic, e1))
  }
  Expression$op(.Generic, e1, e2)
}

#' @export
is.na.Expression <- function(x) {
  # is.na() on an Expression becomes an "is_null" call Expression, with the
  # `nan_is_null` option switched on.
  null_check_options <- list(nan_is_null = TRUE)
  Expression$create("is_null", x, options = null_check_options)
}

Try the arrow package in your browser

Any scripts or data that you put into this service are public.

arrow documentation built on Nov. 25, 2023, 1:09 a.m.