R/dplyr-glimpse.R

Defines functions glimpse.ArrowTabular

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' @importFrom utils getFromNamespace
glimpse.ArrowTabular <- function(x,
                                 width = getOption("pillar.width", getOption("width")),
                                 ...) {
  # This function is inspired by pillar:::glimpse.tbl(), with some adaptations

  # We use cli:: and pillar:: throughout this function. We don't need to check
  # to see if they're installed because dplyr depends on pillar, which depends
  # on cli, and we're only in this function though S3 dispatch on dplyr::glimpse
  if (!is.finite(width)) {
    abort("`width` must be finite.")
  }

  # We need a couple of internal functions in pillar for formatting
  pretty_int <- getFromNamespace("big_mark", "pillar")
  make_shorter <- getFromNamespace("str_trunc", "pillar")
  tickify <- getFromNamespace("tick_if_needed", "pillar")

  # Even though this is the ArrowTabular method, we use it for arrow_dplyr_query
  # so make some accommodations. (Others are handled by S3 method dispatch.)
  if (inherits(x, "arrow_dplyr_query")) {
    class_title <- paste(source_data(x)$class_title(), "(query)")
  } else {
    class_title <- x$class_title()
  }
  cli::cat_line(class_title)

  dims <- dim(x)
  cli::cat_line(sprintf(
    "%s rows x %s columns", pretty_int(dims[1]), pretty_int(dims[2])
  ))

  if (dims[2] == 0) {
    return(invisible(x))
  }

  nrows <- as.integer(width / 3)
  head_tab <- dplyr::compute(head(x, nrows))
  # Take the schema from this Table because if x is arrow_dplyr_query, some
  # output types could be a best guess (in implicit_schema()).
  schema <- head_tab$schema

  # Assemble the column names and types
  # We use the Arrow type names here. See type_sum.DataType() below.
  var_types <- map_chr(schema$fields, ~ format(pillar::new_pillar_type(.$type)))
  # glimpse.tbl() left-aligns the var names (pads with whitespace to the right)
  # and appends the types next to them. Because those type names are
  # aggressively truncated to all be roughly the same length, this means the
  # data glimpse that follows is also mostly aligned.
  # However, Arrow type names are longer and variable length, and we're only
  # truncating the nested type information inside of <...>. So, to keep the
  # data glimpses aligned, we "justify" align the name and type: add the padding
  # whitespace between them so that the total width is equal.
  var_headings <- paste("$", center_pad(tickify(names(x)), var_types))

  # Assemble the data glimpse
  df <- as.data.frame(head_tab)
  formatted_data <- map_chr(df, function(.) {
    tryCatch(
      paste(pillar::format_glimpse(.), collapse = ", "),
      # This could error e.g. if you have a VctrsExtensionType and the package
      # that defines methods for the data is not loaded
      error = function(e) conditionMessage(e)
    )
  })
  # Here and elsewhere in the glimpse code, you have to use pillar::get_extent()
  # instead of nchar() because get_extent knows how to deal with ANSI escapes
  # etc.--it counts how much space on the terminal will be taken when printed.
  data_width <- width - pillar::get_extent(var_headings)
  truncated_data <- make_shorter(formatted_data, data_width)

  # Print the table body (var name, type, data glimpse)
  cli::cat_line(var_headings, " ", truncated_data)

  # TODO: use crayon to style these footers?
  if (inherits(x, "arrow_dplyr_query")) {
    cli::cat_line("Call `print()` for query details")
  } else if (any(grepl("<...>", var_types, fixed = TRUE)) || schema$HasMetadata) {
    cli::cat_line("Call `print()` for full schema details")
  }
  invisible(x)
}

# Dataset has an efficient head() method via Scanner so this is fine
glimpse.Dataset <- glimpse.ArrowTabular

glimpse.arrow_dplyr_query <- function(x,
                                      width = getOption("pillar.width", getOption("width")),
                                      ...) {
  if (any(map_lgl(all_sources(x), ~ inherits(., "RecordBatchReader")))) {
    msg <- paste(
      "Cannot glimpse() data from a RecordBatchReader because it can only be",
      "read one time. Call `compute()` to evaluate the query first."
    )
    message(msg)
    print(x)
  } else if (query_on_dataset(x) && !query_can_stream(x)) {
    msg <- paste(
      "This query requires a full table scan, so glimpse() may be",
      "expensive. Call `compute()` to evaluate the query first."
    )
    message(msg)
    print(x)
  } else {
    # Go for it
    glimpse.ArrowTabular(x, width = width, ...)
  }
}

glimpse.RecordBatchReader <- function(x,
                                      width = getOption("pillar.width", getOption("width")),
                                      ...) {
  # TODO(ARROW-17038): to_arrow() on duckdb con should hold con not RBR so it
  # can be run more than once (like duckdb does on the other side)
  msg <- paste(
    "Cannot glimpse() data from a RecordBatchReader because it can only be",
    "read one time; call `as_arrow_table()` to consume it first."
  )
  message(msg)
  print(x)
}

glimpse.ArrowDatum <- function(x, width, ...) {
  cli::cat_line(gsub("[ \n]+", " ", x$ToString()))
  invisible(x)
}

type_sum.DataType <- function(x) {
  if (inherits(x, "VctrsExtensionType")) {
    # ptype() holds a vctrs type object, which pillar knows how to format
    paste0("ext<", pillar::type_sum(x$ptype()), ">")
  } else {
    # Trim long type names with <...>
    sub("<.*>", "<...>", x$ToString())
  }
}

center_pad <- function(left, right) {
  left_sizes <- pillar::get_extent(left)
  right_sizes <- pillar::get_extent(right)
  total_width <- max(left_sizes + right_sizes) + 1L
  paste0(left, strrep(" ", total_width - left_sizes - right_sizes), right)
}

Try the arrow package in your browser

Any scripts or data that you put into this service are public.

arrow documentation built on Nov. 25, 2023, 1:09 a.m.