R/extension-vctrs.R

Defines functions clean_ptype as_nanoarrow_array_extension.nanoarrow_extension_spec_vctrs convert_array_extension.nanoarrow_extension_spec_vctrs infer_nanoarrow_ptype_extension.nanoarrow_extension_spec_vctrs register_vctrs_extension na_vctrs

Documented in na_vctrs

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Vctrs extension type
#'
#' The Arrow format provides a rich type system that can handle most R
#' vector types; however, many R vector types do not roundtrip perfectly
#' through Arrow memory. The vctrs extension type uses [vctrs::vec_data()],
#' [vctrs::vec_restore()], and [vctrs::vec_ptype()] in calls to
#' [as_nanoarrow_array()] and [convert_array()] to ensure roundtrip fidelity.
#'
#' @param ptype A vctrs prototype as returned by [vctrs::vec_ptype()].
#'   The prototype can be of arbitrary size, but a zero-size vector
#'   is sufficient here.
#' @inheritParams na_type
#'
#' @return A [nanoarrow_schema][as_nanoarrow_schema].
#' @export
#'
#' @examplesIf requireNamespace("jsonlite", quietly = TRUE)
#' vctr <- as.POSIXlt("2000-01-02 03:45", tz = "UTC")
#' array <- as_nanoarrow_array(vctr, schema = na_vctrs(vctr))
#' infer_nanoarrow_ptype(array)
#' convert_array(array)
#'
na_vctrs <- function(ptype, storage_type = NULL) {
  ptype <- vctrs::vec_ptype(ptype)

  if (is.null(storage_type)) {
    storage_type <- infer_nanoarrow_schema(vctrs::vec_data(ptype))
  }

  # Note: a potential replacement for this is the JSON generated by the cereal
  # package; however, as of this writing that JSON doesn't handle arbitrary
  # ptypes (e.g., serializes POSIXlt and POSIXct as POSIXct).
  serialized <- jsonlite::serializeJSON(clean_ptype(ptype), digits = 16)
  na_extension(storage_type, "arrow.r.vctrs", serialized)
}

register_vctrs_extension <- function() {
  register_nanoarrow_extension(
    "arrow.r.vctrs",
    nanoarrow_extension_spec(subclass = "nanoarrow_extension_spec_vctrs")
  )
}

#' @export
infer_nanoarrow_ptype_extension.nanoarrow_extension_spec_vctrs <- function(extension_spec, x, ...) {
  parsed <- .Call(nanoarrow_c_schema_parse, x)
  clean_ptype(jsonlite::unserializeJSON(rawToChar(parsed$extension_metadata)))
}

#' @export
convert_array_extension.nanoarrow_extension_spec_vctrs <- function(extension_spec,
                                                                   array, to,
                                                                   ...) {
  # Restore the vector data to the ptype that is serialized in the type metadata
  to_r_data <- infer_nanoarrow_ptype(array)
  to_data <- vctrs::vec_data(to_r_data)
  data <- convert_array_extension(NULL, array, to_data, warn_unregistered = FALSE)
  vctr <- vctrs::vec_restore(data, to_r_data)

  # Cast to `to` if a different ptype was requested
  if (!is.null(to)) {
    vctrs::vec_cast(vctr, to)
  } else {
    vctr
  }
}

#' @export
as_nanoarrow_array_extension.nanoarrow_extension_spec_vctrs <- function(
    extension_spec, x, ...,
    schema = NULL) {
  storage_schema <- schema
  storage_schema$metadata[["ARROW:extension:name"]] <- NULL
  storage_schema$metadata[["ARROW:extension:metadata"]] <- NULL

  storage_array <- as_nanoarrow_array(
    vctrs::vec_data(x),
    schema = storage_schema
  )

  nanoarrow_extension_array(
    storage_array,
    "arrow.r.vctrs",
    schema$metadata[["ARROW:extension:metadata"]]
  )
}

# Applies some heuristics to minimize the chance that non-standard object types
# that don't roundtrip properly (e.g., external pointers, functions) aren't
# serialized/unserialized in the JSON ptype.
clean_ptype <- function(x) {
  whitelist <- c("character", "double", "integer", "logical", "complex", "list")

  if (!(typeof(x) %in% whitelist)) {
    stop(
      sprintf(
        "Can't serialize/unserialize ptype containing object of type %s",
        typeof(x)
      )
    )
  }

  attrs <- attributes(x)
  if (!is.null(x)) {
    attrs <- lapply(attrs, clean_ptype)
  }

  if (is.list(x)) {
    x <- lapply(unclass(x), clean_ptype)
  }

  attributes(x) <- attrs
  x
}

Try the nanoarrow package in your browser

Any scripts or data that you put into this service are public.

nanoarrow documentation built on June 22, 2024, 9:37 a.m.