Nothing
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
library(dplyr, warn.conflicts = FALSE)
skip_if_not_available("acero")
tbl <- example_data
test_that("slice_head/tail, ungrouped", {
# head/tail are not deterministic in Arrow because data is unordered
# so we can't assert identical to dplyr, just assert right number of rows
tab <- arrow_table(tbl)
expect_equal(
tab %>%
slice_head(n = 5) %>%
nrow(),
5
)
expect_equal(
tab %>%
slice_tail(n = 5) %>%
nrow(),
5
)
expect_equal(
tab %>%
slice_head(prop = .25) %>%
nrow(),
2
)
expect_equal(
tab %>%
slice_tail(prop = .25) %>%
nrow(),
2
)
})
test_that("slice_min/max, ungrouped", {
# with_ties must be FALSE
tab <- arrow_table(tbl)
expect_error(
tab %>% slice_max(int, n = 5),
"with_ties = TRUE"
)
expect_error(
tab %>% slice_min(int, n = 5),
"with_ties = TRUE"
)
compare_dplyr_binding(
.input %>%
slice_max(int, n = 4, with_ties = FALSE) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
slice_min(int, n = 4, with_ties = FALSE) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
slice_max(int, prop = .25, with_ties = FALSE) %>%
collect(),
tbl
)
compare_dplyr_binding(
.input %>%
slice_min(int, prop = .25, with_ties = FALSE) %>%
collect(),
tbl
)
})
test_that("slice_sample, ungrouped", {
skip_if_not(CanRunWithCapturedR())
tab <- arrow_table(tbl)
expect_error(
tab %>% slice_sample(replace = TRUE),
"Sampling with replacement"
)
expect_error(
tab %>% slice_sample(weight_by = dbl),
"weight_by"
)
# Let's not take any chances on random failures
skip_on_cran()
# Because this is random (and we only have 10 rows), try several times
for (i in 1:50) {
sampled_prop <- tab %>%
slice_sample(prop = .2) %>%
collect() %>%
nrow()
if (sampled_prop == 2) break
}
expect_equal(sampled_prop, 2)
# Test that slice_sample(n) returns n rows
# With a larger dataset, we would be more confident to get exactly n
# but with this dataset, we should at least not get >n rows
sampled_n <- tab %>%
slice_sample(n = 2) %>%
collect() %>%
nrow()
expect_lte(sampled_n, 2)
# Test with dataset, which matters for the UDF HACK
skip_if_not_available("dataset")
sampled_n <- tab %>%
InMemoryDataset$create() %>%
slice_sample(n = 2) %>%
collect() %>%
nrow()
expect_lte(sampled_n, 2)
})
test_that("slice_* not supported with groups", {
grouped <- tbl %>%
arrow_table() %>%
group_by(lgl)
expect_error(
slice_head(grouped, n = 5),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_tail(grouped, n = 5),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_min(grouped, int, n = 5),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_max(grouped, int, n = 5),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_sample(grouped, n = 5),
"Slicing grouped data not supported in Arrow"
)
# with the by argument
expect_error(
slice_head(arrow_table(tbl), n = 5, by = lgl),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_tail(arrow_table(tbl), n = 5, by = lgl),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_min(arrow_table(tbl), int, n = 5, by = lgl),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_max(arrow_table(tbl), int, n = 5, by = lgl),
"Slicing grouped data not supported in Arrow"
)
expect_error(
slice_sample(arrow_table(tbl), n = 5, by = lgl),
"Slicing grouped data not supported in Arrow"
)
})
test_that("input validation", {
tab <- arrow_table(tbl)
for (p in list("a", -1, 2, c(.01, .02), NA_real_)) {
expect_error(
slice_head(tab, prop = !!p),
"`prop` must be a single numeric value between 0 and 1",
fixed = TRUE
)
}
expect_error(
tab %>% slice_tail(n = 3, with_ties = FALSE),
"`...` must be empty"
)
})
test_that("n <-> prop conversion when nrow is not known", {
joined <- tbl %>%
arrow_table() %>%
full_join(tbl, by = "int")
expect_true(is.na(nrow(joined)))
expect_error(
joined %>%
slice_min(int, prop = .25, with_ties = FALSE),
"Slicing with `prop` when"
)
expect_error(
joined %>%
slice_sample(n = 5),
"slice_sample() with `n` when",
fixed = TRUE
)
})
# TODO: handle edge case where prop = 1, do nothing?
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.