tests/testthat/test-csv-scan.R

test_that("tbl_csv reads basic CSV and returns correct data.frame", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  write.csv(mtcars[1:10, ], f, row.names = FALSE)

  result <- tbl_csv(f) |> collect()
  expect_equal(nrow(result), 10)
  expect_equal(ncol(result), ncol(mtcars))
  expect_equal(names(result), names(mtcars))
  expect_equal(result$mpg, mtcars$mpg[1:10])
})

test_that("tbl_csv infers integer columns correctly", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  df <- data.frame(x = 1:5, y = c(1.5, 2.5, 3.5, 4.5, 5.5))
  write.csv(df, f, row.names = FALSE)

  result <- tbl_csv(f) |> collect()
  # x should be int64 (returned as double by default)
  expect_type(result$x, "double")
  expect_equal(result$x, c(1, 2, 3, 4, 5))
  expect_equal(result$y, df$y)
})

test_that("tbl_csv handles string columns", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  df <- data.frame(name = c("Alice", "Bob", "Charlie"), score = c(90, 85, 95),
                   stringsAsFactors = FALSE)
  write.csv(df, f, row.names = FALSE)

  result <- tbl_csv(f) |> collect()
  expect_equal(result$name, c("Alice", "Bob", "Charlie"))
  expect_equal(result$score, c(90, 85, 95))
})

test_that("tbl_csv handles NA values", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  writeLines("x,y,z\n1,hello,TRUE\nNA,NA,NA\n3,world,FALSE", f)

  result <- tbl_csv(f) |> collect()
  expect_equal(result$x[1], 1)
  expect_true(is.na(result$x[2]))
  expect_equal(result$x[3], 3)
  expect_true(is.na(result$y[2]))
  expect_true(is.na(result$z[2]))
})

test_that("tbl_csv handles logical columns", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  writeLines("a,b\nTRUE,1\nFALSE,2\nTRUE,3", f)

  result <- tbl_csv(f) |> collect()
  expect_equal(result$a, c(TRUE, FALSE, TRUE))
})

test_that("tbl_csv handles quoted fields with commas", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  writeLines('name,city\n"Smith, John","New York"\n"Doe, Jane",Boston', f)

  result <- tbl_csv(f) |> collect()
  expect_equal(result$name, c("Smith, John", "Doe, Jane"))
  expect_equal(result$city, c("New York", "Boston"))
})

test_that("tbl_csv works with filter pipeline", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  write.csv(mtcars, f, row.names = FALSE)

  result <- tbl_csv(f) |>
    filter(cyl == 6) |>
    select(mpg, cyl, hp) |>
    collect()

  expected <- mtcars[mtcars$cyl == 6, c("mpg", "cyl", "hp")]
  expect_equal(nrow(result), nrow(expected))
  expect_equal(result$mpg, expected$mpg)
})

test_that("tbl_csv works with group_by + summarise", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  write.csv(mtcars, f, row.names = FALSE)

  result <- tbl_csv(f) |>
    group_by(cyl) |>
    summarise(mean_mpg = mean(mpg)) |>
    arrange(cyl) |>
    collect()

  expect_equal(nrow(result), 3)
  expect_equal(result$cyl, c(4, 6, 8))
})

test_that("tbl_csv respects batch_size parameter", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  write.csv(mtcars, f, row.names = FALSE)

  # Small batch size should still give correct results
  result <- tbl_csv(f, batch_size = 5L) |> collect()
  expect_equal(nrow(result), nrow(mtcars))
  expect_equal(result$mpg, mtcars$mpg)
})

test_that("tbl_csv handles empty fields as NA", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  writeLines("x,y\n1,hello\n2,\n3,world", f)

  result <- tbl_csv(f) |> collect()
  expect_true(is.na(result$y[2]))
  expect_equal(result$y[c(1, 3)], c("hello", "world"))
})

test_that("tbl_csv explain shows CsvScanNode", {
  f <- tempfile(fileext = ".csv")
  on.exit(unlink(f))
  write.csv(mtcars[1:5, ], f, row.names = FALSE)

  node <- tbl_csv(f)
  plan <- capture.output(explain(node))
  expect_true(any(grepl("CsvScanNode", plan)))
})

test_that("tbl_csv pipe to write_csv round-trips", {
  f_in <- tempfile(fileext = ".csv")
  f_out <- tempfile(fileext = ".csv")
  on.exit(unlink(c(f_in, f_out)))

  df <- data.frame(a = 1:3, b = c("x", "y", "z"), stringsAsFactors = FALSE)
  write.csv(df, f_in, row.names = FALSE)

  tbl_csv(f_in) |> write_csv(f_out)

  result <- tbl_csv(f_out) |> collect()
  expect_equal(result$a, c(1, 2, 3))
  expect_equal(result$b, c("x", "y", "z"))
})

test_that("tbl_csv reads gzip-compressed CSV via the miniz path", {
  f <- tempfile(fileext = ".csv.gz")
  on.exit(unlink(f))

  df <- data.frame(
    a = 1:5,
    b = c(1.5, 2.5, 3.5, 4.5, 5.5),
    c = c("foo", "bar", "baz", "qux", "quux"),
    stringsAsFactors = FALSE
  )
  con <- gzfile(f, "w")
  write.csv(df, con, row.names = FALSE)
  close(con)

  result <- tbl_csv(f) |> collect()
  expect_equal(nrow(result), 5)
  expect_equal(names(result), c("a", "b", "c"))
  expect_equal(result$a, c(1, 2, 3, 4, 5))
  expect_equal(result$b, df$b)
  expect_equal(result$c, df$c)
})

test_that("tbl_csv on a gz CSV exercises the type-inference rewind path", {
  # csv_scan infers types from the first 1000 rows, then seeks back to the
  # data start. For the gz path that means seeking inside the in-memory
  # decompressed buffer. Use >1000 rows so the rewind actually fires.
  f <- tempfile(fileext = ".csv.gz")
  on.exit(unlink(f))

  n <- 1500
  df <- data.frame(
    id = seq_len(n),
    val = as.double(seq_len(n)) / 2,
    tag = sprintf("row%04d", seq_len(n)),
    stringsAsFactors = FALSE
  )
  con <- gzfile(f, "w")
  write.csv(df, con, row.names = FALSE)
  close(con)

  result <- tbl_csv(f) |> collect()
  expect_equal(nrow(result), n)
  expect_equal(result$id[1], 1)
  expect_equal(result$id[n], n)
  expect_equal(result$val[n], n / 2)
  expect_equal(result$tag[1], "row0001")
  expect_equal(result$tag[n], sprintf("row%04d", n))
})

Try the vectra package in your browser

Any scripts or data that you put into this service are public.

vectra documentation built on May 8, 2026, 9:06 a.m.