R/as-array.R

Defines functions as_nanoarrow_array_from_c union_array_from_data_frame as_nanoarrow_array.vctrs_unspecified as_nanoarrow_array.factor as_nanoarrow_array.POSIXlt as_nanoarrow_array.Date as_nanoarrow_array.list as_nanoarrow_array.data.frame as_nanoarrow_array.blob as_nanoarrow_array.difftime as_nanoarrow_array.POSIXct as_nanoarrow_array.integer64 as_nanoarrow_array.nanoarrow_array as_nanoarrow_array.default

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' @export
as_nanoarrow_array.default <- function(x, ..., schema = NULL, .from_c = FALSE) {
  # If we're coming from C it's because we've tried all the internal conversions
  # and no suitable S3 method was found or the x--schema combination is not
  # implemented in nanoarrow. Try arrow::as_arrow_array().
  if (.from_c) {
    # Give extension types a chance to handle conversion
    parsed <- .Call(nanoarrow_c_schema_parse, schema)

    if (!is.null(parsed$extension_name)) {
      spec <- resolve_nanoarrow_extension(parsed$extension_name)
      return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
    }

    assert_arrow_installed(
      sprintf(
        "create %s array from object of type %s",
        nanoarrow_schema_formatted(schema),
        paste0(class(x), collapse = "/")
      )
    )

    result <- as_nanoarrow_array(
      arrow::as_arrow_array(
        x,
        type = arrow::as_data_type(schema)
      )
    )

    # Skip nanoarrow_pointer_export() for these arrays since we know there
    # are no external references to them
    class(result) <- c("nanoarrow_array_dont_export", class(result))

    return(result)
  }

  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  } else {
    schema <- as_nanoarrow_schema(schema)
  }

  .Call(nanoarrow_c_as_array_default, x, schema)
}

#' @export
as_nanoarrow_array.nanoarrow_array <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    return(x)
  }

  inferred_schema <- infer_nanoarrow_schema(x)
  if (nanoarrow_schema_identical(schema, inferred_schema)) {
    return(x)
  }

  NextMethod()
}

#' @export
as_nanoarrow_array.integer64 <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$type,
    int64 = ,
    uint64 = {
      if (anyNA(x)) {
        is_valid_lgl <- is.finite(x)
        is_valid <- as_nanoarrow_array(is_valid_lgl, schema = na_bool())$buffers[[2]]
        na_count <- length(x) - sum(is_valid_lgl)
      } else {
        is_valid <- NULL
        na_count <- 0
      }

      array <- nanoarrow_array_init(schema)
      nanoarrow_array_modify(
        array,
        list(
          length = length(x),
          null_count = na_count,
          buffers = list(is_valid, x)
        )
      )
    },
    as_nanoarrow_array(as.double(x), schema = schema)
  )
}

#' @export
as_nanoarrow_array.POSIXct <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$type,
    timestamp = ,
    duration = {
      multipliers <- c(s = 1.0, ms = 1e3, us = 1e6, ns = 1e9)
      multiplier <- unname(multipliers[parsed$time_unit])
      array <- as_nanoarrow_array(
        as.numeric(x) * multiplier,
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(array, schema)
      array
    },
    NextMethod()
  )
}

#' @export
as_nanoarrow_array.difftime <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  src_unit <- attr(x, "units")
  switch(
    parsed$type,
    time32 = ,
    time64 = ,
    duration = {
      multipliers <- c(s = 1.0, ms = 1e3, us = 1e6, ns = 1e9)
      src_multipliers <- c(
        secs = 1.0,
        mins = 60.0,
        hours = 3600.0,
        days = 86400.0,
        weeks = 604800.0
      )

      multiplier <- unname(multipliers[parsed$time_unit]) *
        unname(src_multipliers[src_unit])
      array <- as_nanoarrow_array(
        as.numeric(x) * multiplier,
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(array, schema)
      array
    },
    NextMethod()
  )
}

#' @export
as_nanoarrow_array.blob <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  as_nanoarrow_array(unclass(x), schema = schema)
}

#' @export
as_nanoarrow_array.data.frame <- function(x, ..., schema = NULL) {
  # We need to override this to prevent the list implementation from handling it
  as_nanoarrow_array.default(x, ..., schema = schema)
}

#' @export
as_nanoarrow_array.list <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name) || parsed$type != "list") {
    return(NextMethod())
  }

  # This R implementation can't handle complex nesting
  if (startsWith(schema$children[[1]]$format, "+")) {
    return(NextMethod())
  }

  array <- nanoarrow_array_init(schema)

  child <- unlist(x, recursive = FALSE, use.names = FALSE)
  if (is.null(child)) {
    child_array <- as_nanoarrow_array.vctrs_unspecified(logical(), schema = na_na())
  } else {
    child_array <- as_nanoarrow_array(child, schema = schema$children[[1]])
  }

  offsets <- c(0L, cumsum(lengths(x)))
  is_na <- vapply(x, is.null, logical(1))
  validity <- as_nanoarrow_array(!is_na)$buffers[[2]]

  nanoarrow_array_modify(
    array,
    list(
      length = length(x),
      null_count = sum(is_na),
      buffers = list(
        validity,
        offsets
      ),
      children = list(
        child_array
      )
    )
  )
}

#' @export
as_nanoarrow_array.Date <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$type,
    date32 = {
      storage <- as_nanoarrow_array(
        as.integer(x),
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(storage, schema)
      storage
    },
    date64 = {
      storage <- as_nanoarrow_array(
        as.numeric(x) * 86400000,
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(storage, schema)
      storage
    },
    NextMethod()
  )
}

#' @export
as_nanoarrow_array.POSIXlt <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  as_nanoarrow_array(new_data_frame(x, length(x)), schema = schema)
}

#' @export
as_nanoarrow_array.factor <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  if (is.null(schema$dictionary)) {
    return(as_nanoarrow_array(as.character(x), schema = schema))
  }

  storage <- schema
  storage$dictionary <- NULL

  array <- as_nanoarrow_array(unclass(x) - 1L, schema = storage)
  array$dictionary <- as_nanoarrow_array(levels(x), schema = schema$dictionary)
  array
}

#' @export
as_nanoarrow_array.vctrs_unspecified <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  } else {
    schema <- as_nanoarrow_schema(schema)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$storage_type,
    na = {
      array <- nanoarrow_array_init(schema)
      array$length <- length(x)
      array$null_count <- length(x)
      array
    },
    NextMethod()
  )
}

# Called from C to create a union array when requested.
# There are other types of objects that might make sense to
# convert to a union but we basically just need enough to
# for testing at this point.
union_array_from_data_frame <- function(x, schema) {
  if (length(x) == 0 || length(x) > 127) {
    stop(
      sprintf(
        "Can't convert data frame with %d columns to union array",
        length(x)
      )
    )
  }

  # Compute NAs
  x_is_na <- do.call("cbind", lapply(x, is.na))

  # Make sure we only have one non-NA value per row to make sure we don't drop
  # values
  stopifnot(all(rowSums(!x_is_na) <= 1))

  child_index <- rep_len(0L, nrow(x))
  seq_x <- seq_along(x)
  for (i in seq_along(child_index)) {
    for (j in seq_x) {
      if (!x_is_na[i, j]) {
        child_index[i] <- j - 1L
        break;
      }
    }
  }

  switch(
    nanoarrow_schema_parse(schema)$storage_type,
    "dense_union" = {
      is_child <- lapply(seq_x - 1L, "==", child_index)
      child_offset_each <- lapply(is_child, function(x) cumsum(x) - 1L)
      child_offset <- lapply(seq_along(child_index), function(i) {
        child_offset_each[[child_index[i] + 1]][i]
      })

      children <- Map("[", x, is_child, drop = FALSE)
      names(children) <- names(schema$children)
      array <- nanoarrow_array_init(schema)
      nanoarrow_array_modify(
        array,
        list(
          length = length(child_index),
          null_count = 0,
          buffers = list(as.raw(child_index), as.integer(child_offset)),
          children = children
        )
      )
    },
    "sparse_union" = {
      struct_schema <- na_struct(schema$children)
      array <- as_nanoarrow_array(x, array = struct_schema)
      array <- nanoarrow_array_modify(
        array,
        list(buffers = list(as.raw(child_index))),
        validate = FALSE
      )
      nanoarrow_array_set_schema(array, schema, validate = TRUE)
      array
    },
    stop("Attempt to create union from non-union array type")
  )
}

# This is defined because it's verbose to pass named arguments from C.
# When converting data frame columns, we try the internal C conversions
# first to save R evaluation overhead. When the internal conversions fail,
# we call as_nanoarrow_array() to dispatch to conversions defined via S3
# dispatch, making sure to let the default method know that we've already
# tried the internal C conversions.
as_nanoarrow_array_from_c <- function(x, schema) {
  result <- as_nanoarrow_array(x, schema = schema, .from_c = TRUE)

  # Anything we get from an S3 method we need to validate (even from the
  # arrow package, which occasionally does not honour the schema argument)
  nanoarrow_array_set_schema(result, schema, validate = TRUE)

  result
}

Try the nanoarrow package in your browser

Any scripts or data that you put into this service are public.

nanoarrow documentation built on June 22, 2024, 9:37 a.m.