# tests/testthat/test-append-parquet.R

test_that("keep all row groups", {
  # Scratch parquet file, removed again when the test exits.
  path <- tempfile(fileext = ".parquet")
  on.exit(unlink(path), add = TRUE)

  # Write the initial data and capture the metadata before appending.
  original <- test_df(missing = TRUE)
  write_parquet(original, path)
  meta_before <- read_parquet_metadata(path)

  # Append a second data frame, asking for existing row groups to be kept.
  appended <- test_df()
  keep_opts <- parquet_options(keep_row_groups = TRUE)
  append_parquet(appended, path, options = keep_opts)
  combined <- read_parquet(path)
  meta_after <- read_parquet_metadata(path)

  # The leading rows must equal the original data, the trailing rows the
  # appended data. Row names of the trailing slice are reset before comparing.
  leading <- utils::head(combined, nrow(original))
  trailing <- utils::tail(combined, nrow(appended))
  row.names(trailing) <- NULL
  expect_equal(leading, original)
  expect_equal(trailing, appended)

  # File-level metadata is compared with its third column left out
  # (presumably the row count, which changes on append -- TODO confirm).
  expect_equal(
    meta_before$file_meta_data[-3],
    meta_after$file_meta_data[-3]
  )

  # There are two row groups now, and the column chunks that existed before
  # the append are still the first entries of the new metadata, unchanged.
  expect_equal(nrow(meta_after$row_groups), 2L)
  chunks_before <- meta_before$column_chunks
  expect_equal(
    chunks_before,
    utils::head(meta_after$column_chunks, nrow(chunks_before))
  )
})

test_that("overwrite last row group", {
  # Scratch parquet file, removed again when the test exits.
  path <- tempfile(fileext = ".parquet")
  on.exit(unlink(path), add = TRUE)

  # Initial write with an explicit row group layout
  # (presumably row groups starting at rows 1 and 31 -- TODO confirm).
  original <- test_df(missing = TRUE)
  write_parquet(original, path, row_groups = c(1L, 31L))
  meta_before <- read_parquet_metadata(path)

  # Append with a 30-row limit per row group.
  appended <- test_df()
  size_opts <- parquet_options(num_rows_per_row_group = 30)
  append_parquet(appended, path, options = size_opts)
  combined <- read_parquet(path)
  meta_after <- read_parquet_metadata(path)

  # The leading rows must equal the original data, the trailing rows the
  # appended data. Row names of the trailing slice are reset before comparing.
  leading <- utils::head(combined, nrow(original))
  trailing <- utils::tail(combined, nrow(appended))
  row.names(trailing) <- NULL
  expect_equal(leading, original)
  expect_equal(trailing, appended)

  # File-level metadata is compared with its third column left out
  # (presumably the row count, which changes on append -- TODO confirm).
  expect_equal(
    meta_before$file_meta_data[-3],
    meta_after$file_meta_data[-3]
  )

  # The first column chunk of row group 1 must sit at the same index in the
  # metadata and at the same file offset as it did before the append.
  chunks_before <- meta_before$column_chunks
  chunks_after <- meta_after$column_chunks
  pos_before <- which(chunks_before$row_group == 1)[1]
  pos_after <- which(chunks_after$row_group == 1)[1]
  expect_equal(pos_before, pos_after)
  expect_equal(
    chunks_before$file_offset[pos_before],
    chunks_after$file_offset[pos_after]
  )
})

test_that("overwrite a single row groups", {
  # Scratch parquet file, removed again when the test exits.
  path <- tempfile(fileext = ".parquet")
  on.exit(unlink(path), add = TRUE)

  # Initial write with default settings, then capture the metadata.
  original <- test_df(missing = TRUE)
  write_parquet(original, path)
  meta_before <- read_parquet_metadata(path)

  # Append with default options.
  appended <- test_df()
  append_parquet(appended, path)
  combined <- read_parquet(path)
  meta_after <- read_parquet_metadata(path)

  # The leading rows must equal the original data, the trailing rows the
  # appended data. Row names of the trailing slice are reset before comparing.
  leading <- utils::head(combined, nrow(original))
  trailing <- utils::tail(combined, nrow(appended))
  row.names(trailing) <- NULL
  expect_equal(leading, original)
  expect_equal(trailing, appended)

  # File-level metadata is compared with its third column left out
  # (presumably the row count, which changes on append -- TODO confirm).
  expect_equal(
    meta_before$file_meta_data[-3],
    meta_after$file_meta_data[-3]
  )
})

# Try the nanoparquet package in your browser
#
# Any scripts or data that you put into this service are public.
#
# nanoparquet documentation built on April 3, 2025, 11:26 p.m.