R/array.R

Defines functions copy_buffers_recursive array_shallow_copy nanoarrow_array_modify nanoarrow_array_set_schema nanoarrow_array_init nanoarrow_array_proxy nanoarrow_array_proxy_safe `$<-.nanoarrow_array` `[[<-.nanoarrow_array` `$.nanoarrow_array` `[[.nanoarrow_array` names.nanoarrow_array length.nanoarrow_array format.nanoarrow_array print.nanoarrow_array str.nanoarrow_array infer_nanoarrow_schema.nanoarrow_array as_tibble.nanoarrow_array as.data.frame.nanoarrow_array as.vector.nanoarrow_array as_nanoarrow_array

Documented in as_nanoarrow_array nanoarrow_array_init nanoarrow_array_modify nanoarrow_array_set_schema

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Convert an object to a nanoarrow array
#'
#' In nanoarrow an 'array' refers to the `struct ArrowArray` definition
#' in the Arrow C data interface. At the R level, we attach a
#' [schema][as_nanoarrow_schema] such that functionally the nanoarrow_array
#' class can be used in a similar way as an `arrow::Array`. Note that in
#' nanoarrow an `arrow::RecordBatch` and a non-nullable `arrow::StructArray`
#' are represented identically.
#'
#' @param x An object to convert to a array
#' @param schema An optional schema used to enforce conversion to a particular
#'   type. Defaults to [infer_nanoarrow_schema()].
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_array'
#' @export
#'
#' @examples
#' (array <- as_nanoarrow_array(1:5))
#' as.vector(array)
#'
#' (array <- as_nanoarrow_array(data.frame(x = 1:5)))
#' as.data.frame(array)
#'
as_nanoarrow_array <- function(x, ..., schema = NULL) {
  UseMethod("as_nanoarrow_array")
}

# See as-array.R for S3 method implementations

#' @export
as.vector.nanoarrow_array <- function(x, mode = "any") {
  stopifnot(identical(mode, "any"))
  convert_array(x, to = infer_nanoarrow_ptype(x))
}

#' @export
as.data.frame.nanoarrow_array <- function(x, ...) {
  schema <- infer_nanoarrow_schema(x)
  if (schema$format != "+s") {
    stop(
      sprintf(
        "Can't convert array with type %s to data.frame()",
        nanoarrow_schema_formatted(schema)
      )
    )
  }

  .Call(nanoarrow_c_convert_array, x, NULL)
}

# exported in zzz.R
as_tibble.nanoarrow_array <- function(x, ...) {
  tibble::as_tibble(as.data.frame.nanoarrow_array(x), ...)
}

#' @export
infer_nanoarrow_schema.nanoarrow_array <- function(x, ...) {
  .Call(nanoarrow_c_infer_schema_array, x) %||%
    stop("nanoarrow_array() has no associated schema")
}

#' @importFrom utils str
#' @export
str.nanoarrow_array <- function(object, ...) {
  cat(sprintf("%s\n", format(object, .recursive = FALSE)))

  if (nanoarrow_pointer_is_valid(object)) {
    # Use the str() of the list version but remove the first
    # line of the output ("List of 6")
    info <- nanoarrow_array_proxy_safe(object)
    raw_str_output <- utils::capture.output(str(info, ...))
    cat(paste0(raw_str_output[-1], collapse = "\n"))
    cat("\n")
  }

  invisible(object)
}

#' @export
print.nanoarrow_array <- function(x, ...) {
  str(x, ...)
  invisible(x)
}

#' @export
format.nanoarrow_array <- function(x, ..., .recursive = TRUE) {
  if (nanoarrow_pointer_is_valid(x)) {
    schema <- .Call(nanoarrow_c_infer_schema_array, x)
    if (is.null(schema)) {
      sprintf("<nanoarrow_array <unknown schema>[%s]>", x$length)
    } else {
      sprintf(
        "<nanoarrow_array %s[%s]>",
        nanoarrow_schema_formatted(schema, .recursive),
        x$length
      )
    }
  } else {
    "<nanoarrow_array[invalid pointer]>"
  }
}


# This is the list()-like interface to nanoarrow_array that allows $ and [[
# to make nice auto-complete for the array fields


#' @export
length.nanoarrow_array <- function(x, ...) {
  6L
}

#' @export
names.nanoarrow_array <- function(x, ...) {
  c("length",  "null_count", "offset", "buffers", "children", "dictionary")
}

#' @export
`[[.nanoarrow_array` <- function(x, i, ...) {
  nanoarrow_array_proxy_safe(x)[[i]]
}

#' @export
`$.nanoarrow_array` <- function(x, i, ...) {
  nanoarrow_array_proxy_safe(x)[[i]]
}

#' @export
`[[<-.nanoarrow_array` <- function(x, i, value) {
  if (is.numeric(i) && isTRUE(i %in% 1:6)) {
    i <- names.nanoarrow_array()[[i]]
  }

  if (is.character(i) && (length(i) == 1L) && !is.na(i)) {
    new_values <- list(value)
    names(new_values) <- i
    return(nanoarrow_array_modify(x, new_values))
  }

  stop("`i` must be character(1) or integer(1) %in% 1:6")
}

#' @export
`$<-.nanoarrow_array` <- function(x, i, value) {
  new_values <- list(value)
  names(new_values) <- i
  nanoarrow_array_modify(x, new_values)
}

# A version of nanoarrow_array_proxy() that is less likely to error for invalid
# arrays and/or schemas
nanoarrow_array_proxy_safe <- function(array, recursive = FALSE) {
  schema <- .Call(nanoarrow_c_infer_schema_array, array)
  tryCatch(
    nanoarrow_array_proxy(array, schema = schema, recursive = recursive),
    error = function(...) nanoarrow_array_proxy(array, recursive = recursive)
  )
}

nanoarrow_array_proxy <- function(array, schema = NULL, recursive = FALSE) {
  if (!is.null(schema)) {
    array_view <- .Call(nanoarrow_c_array_view, array, schema)
    result <- .Call(nanoarrow_c_array_proxy, array, array_view, recursive)

    names(result$children) <- names(schema$children)

    if (!recursive) {
      # Pass on some information from the schema if we have it
      result$children <- Map(
        nanoarrow_array_set_schema,
        result$children,
        schema$children
      )

      if (!is.null(result$dictionary)) {
        nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
      }
    }
  } else {
    result <- .Call(nanoarrow_c_array_proxy, array, NULL, recursive)
  }

  result
}


#' Modify nanoarrow arrays
#'
#' Create a new array or from an existing array, modify one or more parameters.
#' When importing an array from elsewhere, `nanoarrow_array_set_schema()` is
#' useful to attach the data type information to the array (without this
#' information there is little that nanoarrow can do with the array since its
#' content cannot be otherwise interpreted). `nanoarrow_array_modify()` can
#' create a shallow copy and modify various parameters to create a new array,
#' including setting children and buffers recursively. These functions power the
#' `$<-` operator, which can modify one parameter at a time.
#'
#' @param array A [nanoarrow_array][as_nanoarrow_array].
#' @param schema A [nanoarrow_schema][as_nanoarrow_schema] to attach to this
#'   `array`.
#' @param new_values A named `list()` of values to replace.
#' @param validate Use `FALSE` to skip validation. Skipping validation may
#'   result in creating an array that will crash R.
#'
#' @return
#'   - `nanoarrow_array_init()` returns a possibly invalid but initialized
#'     array with a given `schema`.
#'   - `nanoarrow_array_set_schema()` returns `array`, invisibly. Note that
#'      `array` is modified in place by reference.
#'   - `nanoarrow_array_modify()` returns a shallow copy of `array` with the
#' modified parameters such that the original array remains valid.
#' @export
#'
#' @examples
#' nanoarrow_array_init(na_string())
#'
#' # Modify an array using $ and <-
#' array <- as_nanoarrow_array(1:5)
#' array$length <- 4
#' as.vector(array)
#'
#' # Modify potentially more than one component at a time
#' array <- as_nanoarrow_array(1:5)
#' as.vector(nanoarrow_array_modify(array, list(length = 4)))
#'
#' # Attach a schema to an array
#' array <- as_nanoarrow_array(-1L)
#' nanoarrow_array_set_schema(array, na_uint32())
#' as.vector(array)
#'
nanoarrow_array_init <- function(schema) {
  .Call(nanoarrow_c_array_init, schema)
}

#' @rdname nanoarrow_array_init
#' @export
nanoarrow_array_set_schema <- function(array, schema, validate = TRUE) {
  .Call(nanoarrow_c_array_set_schema, array, schema, as.logical(validate)[1])
  invisible(array)
}

#' @rdname nanoarrow_array_init
#' @export
nanoarrow_array_modify <- function(array, new_values, validate = TRUE) {
  array <- as_nanoarrow_array(array)

  if (length(new_values) == 0) {
    return(array)
  }

  # Make sure new_values has names to iterate over
  new_names <- names(new_values)
  if (is.null(new_names) || all(new_names == "", na.rm = TRUE)) {
    stop("`new_values` must be named")
  }

  # Make a copy and modify it. This is a deep copy in the sense that all
  # children are modifiable; however, it's a shallow copy in the sense that
  # none of the buffers are copied.
  schema <- .Call(nanoarrow_c_infer_schema_array, array)
  array_copy <- array_shallow_copy(array, schema, validate = validate)

  for (i in seq_along(new_values)) {
    nm <- new_names[i]
    value <- new_values[[i]]

    switch(
      nm,
      length = .Call(nanoarrow_c_array_set_length, array_copy, as.double(value)),
      null_count = .Call(nanoarrow_c_array_set_null_count, array_copy, as.double(value)),
      offset = .Call(nanoarrow_c_array_set_offset, array_copy, as.double(value)),
      buffers = {
        value <- lapply(value, as_nanoarrow_buffer)
        .Call(nanoarrow_c_array_set_buffers, array_copy, value)
      },
      children = {
        value <- lapply(value, as_nanoarrow_array)
        value_copy <- lapply(value, array_shallow_copy, validate = validate)
        .Call(nanoarrow_c_array_set_children, array_copy, value_copy)

        if (!is.null(schema)) {
          schema <- nanoarrow_schema_modify(
            schema,
            list(children = lapply(value, infer_nanoarrow_schema)),
            validate = validate
          )
        }
      },
      dictionary = {
        if (!is.null(value)) {
          value <- as_nanoarrow_array(value)
          value_copy <- array_shallow_copy(value, validate = validate)
        } else {
          value_copy <- NULL
        }

        .Call(nanoarrow_c_array_set_dictionary, array_copy, value_copy)

        if (!is.null(schema) && !is.null(value)) {
          schema <- nanoarrow_schema_modify(
            schema,
            list(dictionary = infer_nanoarrow_schema(value)),
            validate = validate
          )
        } else if (!is.null(schema)) {
          schema <- nanoarrow_schema_modify(
            schema,
            list(dictionary = NULL),
            validate = validate
          )
        }
      },
      stop(sprintf("Can't modify array[[%s]]: does not exist", deparse(nm)))
    )
  }

  if (!is.null(schema) && validate) {
    array_copy <- .Call(nanoarrow_c_array_validate_after_modify, array_copy, schema)
  }

  if (!is.null(schema)) {
    nanoarrow_array_set_schema(array_copy, schema, validate = validate)
  }

  array_copy
}

array_shallow_copy <- function(array, schema = NULL, validate = TRUE) {
  array_copy <- nanoarrow_allocate_array()
  nanoarrow_pointer_export(array, array_copy)
  schema <- schema %||% .Call(nanoarrow_c_infer_schema_array, array)

  # For validation, use some of the infrastructure we already have in place
  # to make sure array_copy knows how long each buffer is
  if (!is.null(schema) && validate) {
    copy_buffers_recursive(array, array_copy)
  }

  array_copy
}

copy_buffers_recursive <- function(array, array_copy) {
  proxy <- nanoarrow_array_proxy_safe(array)
  proxy_copy <- nanoarrow_array_proxy(array_copy)

  .Call(nanoarrow_c_array_set_buffers, array_copy, proxy$buffers)

  for (i in seq_along(proxy$children)) {
    copy_buffers_recursive(proxy$children[[i]], proxy_copy$children[[i]])
  }

  if (!is.null(proxy$dictionary)) {
    copy_buffers_recursive(proxy$dictionary, proxy_copy$dictionary)
  }
}

Try the nanoarrow package in your browser

Any scripts or data that you put into this service are public.

nanoarrow documentation built on June 22, 2024, 9:37 a.m.