R/parquet-column-types.R

Defines functions add_r_type_to_schema parquet_column_types_file parquet_column_types

Documented in parquet_column_types

#' Map between R and Parquet data types
#'
#' Note that this function is now deprecated. Please use
#' [read_parquet_schema()] for files, and [infer_parquet_schema()] for
#' data frames.
#'
#' This function works two ways. It can map the R types of a data frame to
#' Parquet types, to see how [write_parquet()] would write out the data
#' frame. It can also map the types of a Parquet file to R types, to see
#' how [read_parquet()] would read the file into R.
#'
#' Calling this function always emits a deprecation warning. It throws an
#' error if `x` is neither a character vector nor a data frame.
#'
#' @param x Path to a Parquet file, or a data frame.
#' @param options Nanoparquet options, see [parquet_options()].
#' @return Data frame with columns:
#'   * `file_name`: file name.
#'   * `name`: column name.
#'   * `type`: (low level) Parquet data type.
#'   * `r_type`: the R type that corresponds to the Parquet type.
#'     Might be `NA` if [read_parquet()] cannot read this column. See
#'     [nanoparquet-types] for the type mapping rules.
#'   * `repetition_type`: whether the column is `REQUIRED` (cannot be
#'     `NA`) or `OPTIONAL` (may be `NA`). `REPEATED` columns are not
#'     currently supported by nanoparquet.
#'   * `logical_type`: Parquet logical type in a list column.
#'      An element has at least an entry called `type`, and potentially
#'      additional entries, e.g. `bit_width`, `is_signed`, etc.
#'
#' @seealso [read_parquet_metadata()] to read more metadata,
#'   [read_parquet_info()] for a very short summary,
#'   [read_parquet_schema()] for the complete Parquet schema.
#'   [read_parquet()], [write_parquet()], [nanoparquet-types].
#' @export
parquet_column_types <- function(x, options = parquet_options()) {
  # Name the two replacements that the documentation recommends and that
  # this function dispatches to below.
  warning(
    "`parquet_column_types()` is deprecated, please use ",
    "`read_parquet_schema()` or `infer_parquet_schema()` instead."
  )
  if (is.character(x)) {
    # a file name: report how the file's columns map to R types
    parquet_column_types_file(x, options)
  } else if (is.data.frame(x)) {
    # a data frame: report how its columns map to Parquet types
    infer_parquet_schema(x, options)
  } else {
    stop("`x` must be a file name or a data frame in `parquet_column_types()`")
  }
}

# Read the metadata of the Parquet file `file` and return its schema with
# the `r_type` column added by `add_r_type_to_schema()`. `options` is
# passed on to `add_r_type_to_schema()` unchanged.
parquet_column_types_file <- function(file, options) {
  metadata <- read_parquet_metadata(file)
  add_r_type_to_schema(metadata, metadata$schema, options)
}

# Add an `r_type` column to a Parquet schema data frame.
#
# * `mtd`: metadata list of the file. Only
#   `mtd$file_meta_data$key_value_metadata[[1]]` is read here, to look up
#   the "ARROW:schema" key-value entry.
# * `sch`: schema data frame. The `file_name`, `name`, `type`,
#   `converted_type` and `logical_type` columns are used.
# * `options`: must have a `use_arrow_metadata` flag.
# * `col_select`: accepted for compatibility with callers, not used in
#   this function.
#
# Returns `sch` with the new `r_type` column, with the columns reordered
# so that `file_name`, `name` and `r_type` come first.
add_r_type_to_schema <- function(mtd, sch, options, col_select = NULL) {
  # TRUE for the rows whose logical type is one of `types`. Entries that
  # are NULL, or that have no `type` element, never match.
  has_logical_type <- function(types) {
    vapply(
      sch$logical_type,
      function(lt) !is.null(lt$type) && lt$type %in% types,
      logical(1)
    )
  }
  # TRUE for the rows whose converted type is one of `types`. `%in%` never
  # returns NA, so rows without a converted type are simply FALSE.
  has_converted_type <- function(types) {
    sch$converted_type %in% types
  }

  # default mapping from the low level Parquet type
  type_map <- c(
    BOOLEAN = "logical",
    INT32 = "integer",
    INT64 = "double",
    DOUBLE = "double",
    FLOAT = "double",
    INT96 = "POSIXct",
    FIXED_LEN_BYTE_ARRAY = "raw",
    BYTE_ARRAY = "raw"
  )
  sch$r_type <- unname(type_map[sch$type])

  # The refinements below are applied in order, later ones override
  # earlier ones for the same row.
  sch$r_type[
    sch$type %in% "FIXED_LEN_BYTE_ARRAY" & has_converted_type("DECIMAL")
  ] <- "double"
  sch$r_type[
    has_logical_type(c("STRING", "ENUM", "UUID")) | has_converted_type("UTF8")
  ] <- "character"

  # detected from Arrow schema
  if (options[["use_arrow_metadata"]]) {
    kv <- mtd$file_meta_data$key_value_metadata[[1]]
    spec <- NULL
    if ("ARROW:schema" %in% kv$key) {
      # Pass the file path taken from the schema. There is no `file`
      # variable in this function, so a bare `file` would refer to
      # `base::file()`.
      # NOTE(review): assumes arrow_find_special() wants the path of the
      # file as its second argument -- confirm.
      spec <- arrow_find_special(
        kv$value[match("ARROW:schema", kv$key)],
        sch$file_name[1L]
      )
    }
    if (length(spec$factor) > 0) {
      sch$r_type[spec$factor] <- "factor"
    }
    if (length(spec$difftime) > 0) {
      sch$r_type[spec$difftime] <- "difftime"
    }
  }

  # TODO: this is duplicated in the C++ code
  # our own conversions
  dates <- has_logical_type("DATE") | has_converted_type("DATE")
  sch$r_type[dates] <- "Date"

  hmss <- has_logical_type("TIME") |
    has_converted_type(c("TIME_MILLIS", "TIME_MICROS"))
  sch$r_type[hmss] <- "hms"

  # NOTE(review): the TIMESTAMP_MILLIS converted type is not listed here,
  # confirm that this matches the C++ reader.
  poscts <- has_logical_type("TIMESTAMP") |
    has_converted_type("TIMESTAMP_MICROS")
  sch$r_type[poscts] <- "POSIXct"

  # put the identifying columns and the R type first
  front <- c("file_name", "name", "r_type")
  sch[, c(front, setdiff(colnames(sch), front))]
}

Try the nanoparquet package in your browser

Any scripts or data that you put into this service are public.

nanoparquet documentation built on Feb. 26, 2026, 1:09 a.m.