Nothing
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
library(dplyr, warn.conflicts = FALSE)
library(stringr)
skip_if_not_available("acero")
tbl <- example_data
# Add some better string data
tbl$verses <- verses[[1]]
# c(" a ", " b ", " c ", ...) increasing padding
# nchar = 3 5 7 9 11 13 15 17 19 21
tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
tbl$some_negative <- tbl$int * (-1)^(1:nrow(tbl)) # nolint
test_that("filter() on is.na()", {
compare_dplyr_binding(
.input %>%
filter(is.na(lgl)) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
})
test_that("filter() with NAs in selection", {
compare_dplyr_binding(
.input %>%
filter(lgl) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
})
test_that("Filter returning an empty Table should not segfault (ARROW-8354)", {
compare_dplyr_binding(
.input %>%
filter(false) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
})
test_that("filtering with expression", {
char_sym <- "b"
compare_dplyr_binding(
.input %>%
filter(chr == char_sym) %>%
select(string = chr, int) %>%
collect(),
tbl
)
})
test_that("filtering with arithmetic", {
compare_dplyr_binding(
.input %>%
filter(dbl + 1 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(dbl / 2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(dbl / 2L > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(int / 2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(int / 2L > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(dbl %/% 2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(dbl^2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
})
test_that("filtering with expression + autocasting", {
compare_dplyr_binding(
.input %>%
filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(int + 1 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(int^2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
})
test_that("More complex select/filter", {
compare_dplyr_binding(
.input %>%
filter(dbl > 2, chr == "d" | chr == "f") %>%
select(chr, int, lgl) %>%
filter(int < 5) %>%
select(int, chr) %>%
collect(),
tbl
)
})
test_that("filter() with %in%", {
compare_dplyr_binding(
.input %>%
filter(dbl > 2, chr %in% c("d", "f")) %>%
collect(),
tbl
)
})
test_that("Negative scalar values", {
compare_dplyr_binding(
.input %>%
filter(some_negative > -2) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(some_negative %in% -1) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(int == -some_negative) %>%
collect(),
tbl
)
})
test_that("filter() with between()", {
compare_dplyr_binding(
.input %>%
filter(between(dbl, 1, 2)) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(between(dbl, 0.5, 2)) %>%
collect(),
tbl
)
expect_identical(
tbl %>%
record_batch() %>%
filter(between(dbl, int, dbl2)) %>%
collect(),
tbl %>%
filter(dbl >= int, dbl <= dbl2)
)
compare_dplyr_binding(
.input %>%
filter(between(dbl, 1, NA)) %>%
collect(),
tbl
)
})
test_that("filter() with string ops", {
skip_if_not_available("utf8proc")
compare_dplyr_binding(
.input %>%
filter(dbl > 2, str_length(verses) > 25) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>%
collect(),
tbl
)
})
test_that("filter environment scope", {
# "object 'b_var' not found"
compare_dplyr_error(.input %>% filter(chr == b_var), tbl)
b_var <- "b"
compare_dplyr_binding(
.input %>%
filter(chr == b_var) %>%
collect(),
tbl
)
# Also for functions
# 'could not find function "isEqualTo"' because we haven't defined it yet
compare_dplyr_error(.input %>% filter(isEqualTo(int, 4)), tbl)
# This works but only because there are S3 methods for those operations
isEqualTo <- function(x, y) x == y & !is.na(x)
compare_dplyr_binding(
.input %>%
select(-fct) %>% # factor levels aren't identical
filter(isEqualTo(int, 4)) %>%
collect(),
tbl
)
# Try something that needs to call another nse_func
compare_dplyr_binding(
.input %>%
select(-fct) %>%
filter(nchar(padded_strings) < 10) %>%
collect(),
tbl
)
isShortString <- function(x) nchar(x) < 10
compare_dplyr_binding(
.input %>%
select(-fct) %>%
filter(isShortString(padded_strings)) %>%
collect(),
tbl
)
})
test_that("Filtering on a column that doesn't exist errors correctly", {
with_language("fr", {
# expect_warning(., NA) because the usual behavior when it hits a filter
# that it can't evaluate is to raise a warning, collect() to R, and retry
# the filter. But we want this to error the first time because it's
# a user error, not solvable by retrying in R
expect_warning(
expect_error(
tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
"objet 'not_a_col' introuvable"
),
NA
)
})
with_language("en", {
expect_warning(
expect_error(
tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
"object 'not_a_col' not found"
),
NA
)
})
})
test_that("Filtering with unsupported functions", {
compare_dplyr_binding(
.input %>%
filter(int > 2, pnorm(dbl) > .99) %>%
collect(),
tbl,
warning = paste(
"In pnorm\\(dbl\\) > 0.99: ",
"i Expression not supported in Arrow",
"> Pulling data into R",
sep = "\n"
)
)
compare_dplyr_binding(
.input %>%
filter(
nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg
int > 2, # good
pnorm(dbl) > .99 # bad, opaque, but we'll error on the first one before we get here
) %>%
collect(),
tbl,
warning = paste(
'In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1: ',
"i allowNA = TRUE not supported in Arrow",
"> Pulling data into R",
sep = "\n"
)
)
})
test_that("Calling Arrow compute functions 'directly'", {
expect_equal(
tbl %>%
record_batch() %>%
filter(arrow_add(dbl, 1) > 3L) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl %>%
filter(dbl + 1 > 3L) %>%
select(string = chr, int, dbl)
)
compare_dplyr_binding(
tbl %>%
record_batch() %>%
filter(arrow_greater(arrow_add(dbl, 1), 3L)) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl %>%
filter(dbl + 1 > 3L) %>%
select(string = chr, int, dbl)
)
})
test_that("filter() with .data pronoun", {
compare_dplyr_binding(
.input %>%
filter(.data$dbl > 4) %>%
# use "quoted" strings instead of .data pronoun where tidyselect is used
# .data pronoun deprecated in select in tidyselect 1.2
select("chr", "int", "lgl") %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(is.na(.data$lgl)) %>%
select("chr", "int", "lgl") %>%
collect(),
tbl
)
# and the .env pronoun too!
chr <- 4
compare_dplyr_binding(
.input %>%
filter(.data$dbl > .env$chr) %>%
select("chr", "int", "lgl") %>%
collect(),
tbl
)
})
test_that("filter() with namespaced functions", {
compare_dplyr_binding(
.input %>%
filter(dplyr::between(dbl, 1, 2)) %>%
collect(),
tbl
)
skip_if_not_available("utf8proc")
compare_dplyr_binding(
.input %>%
filter(dbl > 2, stringr::str_length(verses) > 25) %>%
collect(),
tbl
)
})
test_that("filter() with across()", {
compare_dplyr_binding(
.input %>%
filter(if_any(ends_with("l"), ~ is.na(.))) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(
false == FALSE,
if_all(everything(), ~ !is.na(.)),
int > 2
) %>%
collect(),
tbl
)
})
test_that(".by argument", {
compare_dplyr_binding(
.input %>%
filter(is.na(lgl), .by = chr) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(is.na(lgl), .by = starts_with("chr")) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(.by = chr) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(.by = c(int, chr)) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
filter(.by = c("int", "chr")) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
# filter should pulling not grouped data into R when using the .by argument
compare_dplyr_binding(
.input %>%
filter(int > 2, pnorm(dbl) > .99, .by = chr) %>%
collect(),
tbl,
warning = paste(
"In pnorm\\(dbl\\) > 0.99: ",
"i Expression not supported in Arrow",
"> Pulling data into R",
sep = "\n"
)
)
expect_error(
tbl %>%
arrow_table() %>%
group_by(chr) %>%
filter(is.na(lgl), .by = chr),
"Can't supply `\\.by` when `\\.data` is grouped data"
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.