# tests/testthat/test-dataset-dplyr.R

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

skip_if_not_available("dataset")
skip_if_not_available("parquet")

library(dplyr, warn.conflicts = FALSE)
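
# df1 and df2 (the example data frames used throughout) and make_temp_dir()
# are assumed to be defined in the testthat helper files that are sourced
# before this file runs.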

dataset_dir <- make_temp_dir()
hive_dir <- make_temp_dir()

test_that("Setup (putting data in the dir)", {
  dir.create(file.path(dataset_dir, 1))
  dir.create(file.path(dataset_dir, 2))
  write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
  write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
  expect_length(dir(dataset_dir, recursive = TRUE), 2)

  dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
  dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
  write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
  write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
  expect_length(dir(hive_dir, recursive = TRUE), 2)
})
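
# For reference, the layouts created above:
#
#   dataset_dir/1/file1.parquet        (bare directory partitioning; the
#   dataset_dir/2/file2.parquet         "part" field is named at open time)
#   hive_dir/subdir/group=1/other=xxx/file1.parquet   (Hive-style partitioning:
#   hive_dir/subdir/group=2/other=yyy/file2.parquet    "group" and "other")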

test_that("filter() with is.nan()", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
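  # df2$dbl is assumed (from the helper data) to contain NaN entries, so the
  # !is.nan() filter below actually removes rows.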
  expect_equal(
    ds %>%
      select(part, dbl) %>%
      filter(!is.nan(dbl), part == 2) %>%
      collect(),
    tibble(part = 2L, dbl = df2$dbl[!is.nan(df2$dbl)])
  )
})

test_that("filter() with %in%", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
  expect_equal(
    ds %>%
      select(int, part) %>%
      filter(int %in% c(6, 4, 3, 103, 107), part == 1) %>%
      collect(),
    tibble(int = df1$int[c(3, 4, 6)], part = 1L)
  )

  # ARROW-9606: bug in %in% filter on partition column with >1 partition columns
  ds <- open_dataset(hive_dir)
  expect_equal(
    ds %>%
      filter(group %in% 2) %>%
      select(names(df2)) %>%
      collect(),
    df2
  )
})

test_that("filter() on timestamp columns", {
  skip_if_not_available("re2")

  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
  expect_equal(
    ds %>%
      filter(ts >= lubridate::ymd_hms("2015-05-04 03:12:39")) %>%
      filter(part == 1) %>%
      select(ts) %>%
      collect(),
    df1[5:10, c("ts")]
  )

  # Now with Date
  expect_equal(
    ds %>%
      filter(ts >= as.Date("2015-05-04")) %>%
      filter(part == 1) %>%
      select(ts) %>%
      collect(),
    df1[5:10, c("ts")]
  )

  # Now with bare string date
  skip("Implement more aggressive implicit casting for scalars (ARROW-11402)")
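  # skip() halts this test_that() block here, so the expectation below is
  # recorded as skipped rather than run.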
  expect_equal(
    ds %>%
      filter(ts >= "2015-05-04") %>%
      filter(part == 1) %>%
      select(ts) %>%
      collect(),
    df1[5:10, c("ts")]
  )
})

test_that("filter() on date32 columns", {
  tmp <- tempfile()
  dir.create(tmp)
  df <- data.frame(date = as.Date(c("2020-02-02", "2020-02-03")))
  write_parquet(df, file.path(tmp, "file.parquet"))

  expect_equal(
    open_dataset(tmp) %>%
      filter(date > as.Date("2020-02-02")) %>%
      collect() %>%
      nrow(),
    1L
  )

  skip_if_not_available("re2")

  # Also with timestamp scalar
  expect_equal(
    open_dataset(tmp) %>%
      filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>%
      collect() %>%
      nrow(),
    1L
  )
})

test_that("mutate()", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
  mutated <- ds %>%
    select(chr, dbl, int) %>%
    filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
    mutate(twice = int * 2)
  expect_output(
    print(mutated),
    "FileSystemDataset (query)
chr: string
dbl: double
int: int32
twice: int32 (multiply_checked(int, 2))

* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3))
See $.data for the source Arrow object",
    fixed = TRUE
  )
  expect_equal(
    mutated %>%
      collect() %>%
      arrange(dbl),
    rbind(
      df1[8:10, c("chr", "dbl", "int")],
      df2[1:2, c("chr", "dbl", "int")]
    ) %>%
      mutate(
        twice = int * 2
      )
  )
})

test_that("mutate() features not yet implemented", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
  expect_error(
    ds %>%
      group_by(int) %>%
      mutate(avg = mean(int)),
    "window functions not currently supported in Arrow\nCall collect() first to pull data into R.",
    fixed = TRUE
  )
})
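
# A hedged sketch of the workaround the error message suggests: collect() the
# data into R first, then apply the window function with regular dplyr, e.g.
#
#   open_dataset(dataset_dir) %>%
#     select(int) %>%
#     collect() %>%
#     group_by(int) %>%
#     mutate(avg = mean(int))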

test_that("filter scalar validation doesn't crash (ARROW-7772)", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
  expect_error(
    ds %>%
      filter(int == Expression$scalar("fff"), part == 1) %>%
      collect(),
    "'equal' has no kernel matching input types .int32, string."
  )
})

test_that("collect() on Dataset works (if fits in memory)", {
  expect_equal(
    collect(open_dataset(dataset_dir)) %>% arrange(int),
    rbind(df1, df2)
  )
})

test_that("count()", {
  ds <- open_dataset(dataset_dir)
  df <- rbind(df1, df2)
  expect_equal(
    ds %>%
      filter(int > 6, int < 108) %>%
      count(chr) %>%
      arrange(chr) %>%
      collect(),
    df %>%
      filter(int > 6, int < 108) %>%
      count(chr)
  )
})

test_that("arrange()", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
  arranged <- ds %>%
    select(chr, dbl, int) %>%
    filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
    mutate(twice = int * 2) %>%
    arrange(chr, desc(twice), dbl + int)
  expect_output(
    print(arranged),
    "FileSystemDataset (query)
chr: string
dbl: double
int: int32
twice: int32 (multiply_checked(int, 2))

* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3))
* Sorted by chr [asc], multiply_checked(int, 2) [desc], add_checked(dbl, int) [asc]
See $.data for the source Arrow object",
    fixed = TRUE
  )
  expect_equal(
    arranged %>%
      collect(),
    rbind(
      df1[8, c("chr", "dbl", "int")],
      df2[2, c("chr", "dbl", "int")],
      df1[9, c("chr", "dbl", "int")],
      df2[1, c("chr", "dbl", "int")],
      df1[10, c("chr", "dbl", "int")]
    ) %>%
      mutate(
        twice = int * 2
      )
  )
})

test_that("compute()/collect(as_data_frame=FALSE)", {
  ds <- open_dataset(dataset_dir)

  tab1 <- ds %>% compute()
  expect_r6_class(tab1, "Table")

  tab2 <- ds %>% collect(as_data_frame = FALSE)
  expect_r6_class(tab2, "Table")

  tab3 <- ds %>%
    mutate(negint = -int) %>%
    filter(negint > -100) %>%
    arrange(chr) %>%
    select(negint) %>%
    compute()

  expect_r6_class(tab3, "Table")

  expect_equal(
    tab3 %>% collect(),
    tibble(negint = -1:-10)
  )

  tab4 <- ds %>%
    mutate(negint = -int) %>%
    filter(negint > -100) %>%
    arrange(chr) %>%
    select(negint) %>%
    collect(as_data_frame = FALSE)

  expect_r6_class(tab4, "Table")

  expect_equal(
    tab4 %>% collect(),
    tibble(negint = -1:-10)
  )

  tab5 <- ds %>%
    mutate(negint = -int) %>%
    group_by(fct) %>%
    compute()

  expect_r6_class(tab5, "Table")
  # mutate() was evaluated
  expect_true("negint" %in% names(tab5))
})

test_that("head/tail on query on dataset", {
  # head/tail on arrow_dplyr_query does not have deterministic order,
  # so without sorting we can only assert the correct number of rows
  ds <- open_dataset(dataset_dir)

  expect_identical(
    ds %>%
      filter(int > 6) %>%
      head(5) %>%
      compute() %>%
      nrow(),
    5L
  )

  expect_equal(
    ds %>%
      filter(int > 6) %>%
      arrange(int) %>%
      head() %>%
      collect(),
    rbind(df1[7:10, ], df2[1:2, ])
  )

  expect_equal(
    ds %>%
      filter(int < 105) %>%
      tail(4) %>%
      compute() %>%
      nrow(),
    4L
  )

  expect_equal(
    ds %>%
      filter(int < 105) %>%
      arrange(int) %>%
      tail() %>%
      collect(),
    rbind(df1[9:10, ], df2[1:4, ])
  )
})

test_that("dplyr method not implemented messages", {
  ds <- open_dataset(dataset_dir)
  # filter() can't evaluate an aggregate like max(dbl) row by row, so the
  # query errors and suggests pulling the data into R first
  expect_error(
    ds %>% filter(int > 6, dbl > max(dbl)),
    "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.",
    fixed = TRUE
  )
})

test_that("show_exec_plan(), show_query() and explain() with datasets", {
  # show_query() and explain() are wrappers around show_exec_plan() and are not
  # tested separately

  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))

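  # A hedged usage sketch (not asserted here): show_query() and explain()
  # should print the same plan as show_exec_plan() would for the same query:
  #   ds %>% show_query()
  #   ds %>% explain()
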
  # minimal test
  expect_output(
    ds %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "ProjectNode.*", # output columns
      "SourceNode" # entry point
    )
  )

  # filter and select
  expect_output(
    ds %>%
      select(string = chr, integer = int, part) %>%
      filter(integer > 6L & part == 1) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "ProjectNode.*", # output columns
      "FilterNode.*", # filter node
      "int > 6.*", # filtering expressions
      "SourceNode" # entry point
    )
  )

  # group_by and summarise
  expect_output(
    ds %>%
      group_by(part) %>%
      summarise(avg = mean(int)) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "GroupByNode.*", # group by node
      "keys=.*part.*", # key for aggregations
      "aggregates=.*hash_mean.*", # aggregations
      "ProjectNode.*", # input columns
      "SourceNode" # entry point
    )
  )

  # arrange and head
  expect_output(
    ds %>%
      filter(lgl) %>%
      arrange(chr) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "OrderByNode.*chr.*ASC.*", # arrange goes via the OrderBy node
      "ProjectNode.*", # output columns
      "FilterNode.*", # filter node
      "filter=lgl.*", # filtering expression
      "SourceNode" # entry point
    )
  )
})
