tests/testthat/test-write-parquet-row-groups.R

test_that("errors", {
  expect_snapshot(error = TRUE, {
    parquet_options(num_rows_per_row_group = "foobar")
  })

  df <- test_df()
  tmp <- tempfile(fileext = ".parquet")
  on.exit(unlink(tmp), add = TRUE)
  expect_snapshot(error = TRUE, {
    write_parquet(df, tmp, row_groups = "foobar")
    write_parquet(df, tmp, row_groups = c(100L, 1L))
    write_parquet(df, tmp, row_groups = c(1L, 100L))
  })
})

test_that("row groups", {
  tmp1 <- tempfile(fileext = ".parquet")
  tmp2 <- tempfile(fileext = ".parquet")
  on.exit(unlink(c(tmp1, tmp2)), add = TRUE)

  df <- test_df()
  write_parquet(df, tmp1, row_groups = 1L)
  write_parquet(df, tmp2, row_groups = c(1L, 16L))
  expect_equal(read_parquet(tmp1), read_parquet(tmp2))
  expect_equal(nrow(read_parquet_metadata(tmp2)[["row_groups"]]), 2L)

  unlink(tmp2)
  write_parquet(df, tmp2, row_groups = seq_len(nrow(df)))
  expect_equal(read_parquet(tmp1), read_parquet(tmp2))
  expect_equal(nrow(read_parquet_metadata(tmp2)[["row_groups"]]), nrow(df))

  unlink(tmp2)
  withr::local_options(nanoparquet.num_rows_per_row_group = 10L)
  write_parquet(df, tmp2)
  expect_equal(read_parquet(tmp1), read_parquet(tmp2))
  expect_equal(nrow(read_parquet_metadata(tmp2)[["row_groups"]]), 4L)
})

test_that("factors & factor levels", {
  tmp <- tempfile(fileext = ".parquet")
  on.exit(unlink(tmp), add = TRUE)

  df <- data.frame(
    f = factor(c(rep("a", 100), rep("b", 100), rep("c", 100)))
  )
  withr::local_options(nanoparquet.num_rows_per_row_group = 50L)
  write_parquet(df, tmp)
  expect_equal(as.data.frame(read_parquet(tmp)), df)
  # the same dict is written into every dicitonary page
  pgs <- read_parquet_pages(tmp)
  dict_ofs <- pgs[["page_header_offset"]][
    pgs[["page_type"]] == "DICTIONARY_PAGE"
  ]
  dict_data <- read_parquet_page(tmp, dict_ofs[1])[["data"]]
  for (do in dict_ofs) {
    expect_equal(dict_data, read_parquet_page(tmp, do)[["data"]])
  }
})

test_that("non-factors write local dictionary", {
  tmp <- tempfile(fileext = ".parquet")
  on.exit(unlink(tmp), add = TRUE)

  df <- data.frame(
    stringsAsFactors = FALSE,
    f = c(rep("a", 100), rep("b", 100), rep("c", 100))
  )
  withr::local_options(nanoparquet.num_rows_per_row_group = 40L)
  write_parquet(df, tmp)
  expect_equal(as.data.frame(read_parquet(tmp)), df)
  pgs <- read_parquet_pages(tmp)
  dict_ofs <- pgs[["page_header_offset"]][
    pgs[["page_type"]] == "DICTIONARY_PAGE"
  ]
  expect_snapshot({
    for (do in dict_ofs) {
      print(read_parquet_page(tmp, do)[["data"]])
    }
  })
})

test_that("strings in a dictionary", {
  tmp <- tempfile(fileext = ".parquet")
  on.exit(unlink(tmp), add = TRUE)

  df <- test_df()
  write_parquet(
    df, tmp,
    encoding = c(large = "RLE", "RLE_DICTIONARY"),
    options = parquet_options(num_rows_per_row_group=10)
  )
  expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp)))
})

Try the nanoparquet package in your browser

Any scripts or data that you put into this service are public.

nanoparquet documentation built on April 3, 2025, 11:26 p.m.