R/dplyr-select.R

Defines functions column_select relocate.arrow_dplyr_query rename_with.arrow_dplyr_query rename.arrow_dplyr_query select.arrow_dplyr_query tbl_vars.arrow_dplyr_query

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


# The following S3 methods are registered on load if dplyr is present

tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)

select.arrow_dplyr_query <- function(.data, ...) {
  column_select(.data, enquos(...), op = "select")
}
select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query

rename.arrow_dplyr_query <- function(.data, ...) {
  column_select(.data, enquos(...), op = "rename")
}
rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query

rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) {
  .fn <- as_function(.fn)
  old_names <- names(dplyr::select(.data, {{ .cols }}))
  dplyr::rename(.data, !!set_names(old_names, .fn(old_names)))
}
rename_with.Dataset <- rename_with.ArrowTabular <- rename_with.RecordBatchReader <- rename_with.arrow_dplyr_query

relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
  # The code in this function is adapted from the code in dplyr::relocate.data.frame
  # at https://github.com/tidyverse/dplyr/blob/main/R/relocate.R
  # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829

  .data <- as_adq(.data)

  # Assign the schema to the expressions
  schema <- .data$.data$schema
  walk(.data$selected_columns, ~ (.$schema <- schema))

  # Create a mask for evaluating expressions in tidyselect helpers
  mask <- new_environment(.cache$functions, parent = caller_env())

  to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask)

  .before <- enquo(.before)
  .after <- enquo(.after)
  has_before <- !quo_is_null(.before)
  has_after <- !quo_is_null(.after)

  if (has_before && has_after) {
    abort("Must supply only one of `.before` and `.after`.")
  } else if (has_before) {
    where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask)))
    if (!where %in% to_move) {
      to_move <- c(to_move, where)
    }
  } else if (has_after) {
    where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask)))
    if (!where %in% to_move) {
      to_move <- c(where, to_move)
    }
  } else {
    where <- 1L
    if (!where %in% to_move) {
      to_move <- c(to_move, where)
    }
  }

  lhs <- setdiff(seq2(1, where - 1), to_move)
  rhs <- setdiff(seq2(where + 1, length(.data$selected_columns)), to_move)

  pos <- vec_unique(c(lhs, to_move, rhs))
  new_names <- names(pos)
  .data$selected_columns <- .data$selected_columns[pos]

  if (!is.null(new_names)) {
    names(.data$selected_columns)[new_names != ""] <- new_names[new_names != ""]
  }
  .data
}
relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query

column_select <- function(.data, select_expression, op = c("select", "rename")) {
  op <- match.arg(op)

  .data <- as_adq(.data)
  sim_df <- as.data.frame(implicit_schema(.data))
  old_names <- names(sim_df)

  if (op == "select") {
    out <- eval_select(expr(c(!!!select_expression)), sim_df)
    # select only columns from `out`
    subset <- out
  } else if (op == "rename") {
    out <- eval_rename(expr(c(!!!select_expression)), sim_df)
    # select all columns as only renaming
    subset <- set_names(seq_along(old_names), old_names)
    names(subset)[out] <- names(out)
  }

  .data$selected_columns <- set_names(.data$selected_columns[subset], names(subset))

  # check if names have updated
  new_names <- old_names
  new_names[out] <- names(out)
  names_compared <- set_names(old_names, new_names)
  renamed <- names_compared[old_names != new_names]

  # Update names in group_by if changed in select() or rename()
  if (length(renamed)) {
    gbv <- .data$group_by_vars
    renamed_groups <- gbv %in% renamed
    gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
    .data$group_by_vars <- gbv
  }

  .data
}

Try the arrow package in your browser

Any scripts or data that you put into this service are public.

arrow documentation built on Nov. 25, 2023, 1:09 a.m.