# tests/testthat/test-parsing-character.R

# No `trim_ws` argument is supplied here; the padding around " a " and
# "    b" is expected to be absent from the parsed column.
test_that("ws dropped by default", {
  parsed <- read_csv(I("x\n a \n    b\n"))
  expect_equal(parsed[["x"]], c("a", "b"))
})

# With `trim_ws = FALSE` the leading space of " a" and the trailing space of
# "b " are expected to survive parsing.
test_that("trim_ws = FALSE keeps ws", {
  input <- I("x\n a\nb \n")
  untrimmed <- read_csv(input, trim_ws = FALSE)
  expect_equal(untrimmed[["x"]], c(" a", "b "))
})

# With `trim_ws = TRUE` both spaces and tab characters surrounding a field
# are expected to be removed.
test_that("trim_ws = TRUE trims spaces and tabs", {
  input <- I("x\n a\n\tb \t\n")
  trimmed <- read_csv(input, trim_ws = TRUE)
  expect_equal(trimmed[["x"]], c("a", "b"))
})



# Encoding ----------------------------------------------------------------

# Strings converted to latin1 bytes and then parsed with a locale that
# declares latin1 should yield the same bytes as the UTF-8 originals.
test_that("locale encoding affects parsing", {
  # Written with \u escapes so the source stays ASCII and the literals do not
  # depend on the encoding this file happens to be read with.
  x <- c("ao\u00FBt", "\u00E9l\u00E8ve", "\u00E7a va")
  # expect_equal(Encoding(x), rep("UTF-8", 3))

  y <- iconv(x, "UTF-8", "latin1")
  # expect_equal(Encoding(y), rep("latin1", 3))

  fr <- locale("fr", encoding = "latin1")
  z <- parse_character(y, locale = fr)
  # expect_equal(Encoding(z), rep("UTF-8", 3))
  # NOTE(review): the three Encoding() checks above were already disabled;
  # the reason is not visible from this file.

  # identical coerces encodings to match, so need to compare raw values
  as_raw <- function(x) lapply(x, charToRaw)
  expect_identical(as_raw(x), as_raw(z))
})

# Feeds raw byte vectors that start with various byte order marks to
# read_lines() and checks the bytes of the returned string.
test_that("Unicode Byte order marks are stripped from output", {
  # Shared payload placed after every BOM: "A" (0x41) then a newline (0x0A).
  # The same single-byte payload is used for every encoding below.
  a_newline <- c(0x41, 0x0A)

  # Convert the byte values to a raw vector, read it with read_lines(), and
  # return the bytes of the resulting string.
  read_bytes <- function(bytes) {
    charToRaw(read_lines(as.raw(bytes)))
  }

  # UTF-8
  expect_equal(read_bytes(c(0xef, 0xbb, 0xbf, a_newline)), as.raw(0x41))

  # UTF-16 Big Endian
  expect_equal(read_bytes(c(0xfe, 0xff, a_newline)), as.raw(0x41))

  # UTF-16 Little Endian
  expect_equal(read_bytes(c(0xff, 0xfe, a_newline)), as.raw(0x41))

  # UTF-32 Big Endian
  expect_equal(read_bytes(c(0x00, 0x00, 0xfe, 0xff, a_newline)), as.raw(0x41))

  # UTF-32 Little Endian
  expect_equal(read_bytes(c(0xff, 0xfe, 0x00, 0x00, a_newline)), as.raw(0x41))

  # Vectors shorter than the BOM are handled safely: the bytes are expected
  # to come back unchanged.
  expect_equal(read_bytes(c(0xef, 0xbb)), as.raw(c(0xef, 0xbb)))
  expect_equal(read_bytes(c(0xfe)), as.raw(c(0xfe)))
  expect_equal(read_bytes(c(0xff)), as.raw(c(0xff)))
})
# tidyverse/readr documentation built on Jan. 27, 2024, 11:59 p.m.