R/infer-ptype.R
In nanoarrow: Interface to the 'nanoarrow' 'C' Library

Documented in infer_nanoarrow_ptype

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Infer an R vector prototype
#'
#' Resolves the default `to` value to use in [convert_array()] and
#' [convert_array_stream()]. The default conversions are:
#'
#' - null to [vctrs::unspecified()]
#' - boolean to [logical()]
#' - int8, uint8, int16, uint16, and int32 to [integer()]
#' - uint32, int64, uint64, float, and double to [double()]
#' - string and large string to [character()]
#' - struct to [data.frame()]
#' - binary and large binary to [blob::blob()]
#' - list, large_list, and fixed_size_list to [vctrs::list_of()]
#' - time32 and time64 to [hms::hms()]
#' - duration to [difftime()]
#' - date32 to [as.Date()]
#' - timestamp to [as.POSIXct()]
#'
#' Additional conversions are possible by specifying an explicit value for
#' `to`. For details of each conversion, see [convert_array()].
#'
#' @param x A [nanoarrow_schema][as_nanoarrow_schema],
#'   [nanoarrow_array][as_nanoarrow_array], or
#'   [nanoarrow_array_stream][as_nanoarrow_array_stream].
#'
#' @return An R vector of zero size describing the target into which
#'   the array should be materialized.
#' @export
#'
#' @examples
#' infer_nanoarrow_ptype(as_nanoarrow_array(1:10))
#'
infer_nanoarrow_ptype <- function(x) {
  # Reduce the input to a schema: arrays and streams are asked for theirs,
  # a schema is used as-is, and any other input is rejected. The class checks
  # are performed in this order on purpose (array, stream, then schema).
  schema <- if (inherits(x, "nanoarrow_array")) {
    .Call(nanoarrow_c_infer_schema_array, x)
  } else if (inherits(x, "nanoarrow_array_stream")) {
    .Call(nanoarrow_c_array_stream_get_schema, x)
  } else if (inherits(x, "nanoarrow_schema")) {
    x
  } else {
    stop("`x` must be a nanoarrow_schema(), nanoarrow_array(), or nanoarrow_array_stream()")
  }

  # The prototype itself is resolved from the schema at the C level
  .Call(nanoarrow_c_infer_ptype, schema)
}

# Fallback used by nanoarrow_c_infer_ptype (i.e., called from C) after all of
# the conversions implemented in C have been tried. Some of these inferences
# could be moved to C to be faster, but they are much less verbose to write
# in R.
#
# `schema` is the schema whose prototype is being inferred. The result is a
# zero-length R vector; types with no entry below signal an error via
# stop_cant_infer_ptype().
infer_ptype_other <- function(schema) {
  # Performance-sensitive: use the raw C parse result rather than the
  # user-friendly versions
  info <- .Call(nanoarrow_c_schema_parse, schema)

  # A registered extension type gets the first chance to resolve the ptype
  if (!is.null(info$extension_name)) {
    extension_spec <- resolve_nanoarrow_extension(info$extension_name)
    return(infer_nanoarrow_ptype_extension(extension_spec, schema))
  }

  # Empty-valued arms fall through to the next arm that has a value. The
  # unnamed final arm is the default and must be called directly from this
  # function because it counts call frames (n = -1).
  switch(
    info$type,
    "na" = vctrs::unspecified(),
    "date32" = structure(numeric(), class = "Date"),
    "duration" = structure(numeric(), class = "difftime", units = "secs"),
    "time32" = ,
    "time64" = hms::hms(),
    "binary" = ,
    "large_binary" = ,
    "binary_view" = new_blob_internal(),
    "date64" = ,
    "timestamp" = {
      tz <- info$timezone
      if (is.null(tz) || tz == "") {
        # Passing "" along would mean "the user's timezone", which we almost
        # never want to assume. Like readr, fall back to "UTC" (it is
        # DST-free) and let the user set the timezone explicitly later.
        tz <- getOption("nanoarrow.timezone_if_unspecified", "UTC")
      }

      structure(numeric(0), class = c("POSIXct", "POSIXt"), tzone = tz)
    },
    "list" = ,
    "large_list" = ,
    "fixed_size_list" = ,
    "map" = {
      # The element prototype comes from the first child schema; resolve it
      # eagerly before building the list_of()
      element_ptype <- infer_nanoarrow_ptype(schema$children[[1]])
      vctrs::list_of(.ptype = element_ptype)
    },
    "dictionary" = {
      # R's 'factor' can represent a dictionary of strings (perhaps the most
      # common case), but an array that arrives in chunks may carry a
      # different dictionary array per chunk. Expanding the dictionary is
      # therefore the best type-stable default available.
      infer_nanoarrow_ptype(schema$dictionary)
    },
    stop_cant_infer_ptype(schema, n = -1)
  )
}

# Signal an error stating that no R vector type can be inferred for `schema`.
#
# The message always includes the formatted schema and, when the schema has a
# non-empty name, that name as well. `n` selects the call attached to the
# condition: it is taken from sys.call(n - 1), so the default (n = 0) reports
# the caller of this function and more negative values report frames further
# up the stack.
stop_cant_infer_ptype <- function(schema, n = 0) {
  type_label <- nanoarrow_schema_formatted(schema)
  field_name <- schema$name

  msg <- if (is.null(field_name) || identical(field_name, "")) {
    sprintf("Can't infer R vector type for <%s>", type_label)
  } else {
    sprintf("Can't infer R vector type for `%s` <%s>", field_name, type_label)
  }

  # Looked up from this function's own frame so that `n` keeps its meaning
  reported_call <- sys.call(n - 1)

  stop(simpleError(msg, call = reported_call))
}

# Build an empty blob prototype by hand, after attempting to load the blob
# namespace. The same object is returned whether or not the load succeeds.
# This is not ideal, because the returned object may behave slightly
# differently when blob isn't installed; however, this conversion is used for
# printing buffers and is difficult to work around with the current system
# for conversion.
new_blob_internal <- function() {
  # Best effort only: the result is deliberately ignored
  requireNamespace("blob", quietly = TRUE)

  empty_blob <- list()
  attr(empty_blob, "ptype") <- raw(0)
  class(empty_blob) <- c("blob", "vctrs_list_of", "vctrs_vctr", "list")
  empty_blob
}