# Label the tests in this file: they check that encoded files are loaded correctly.
context("test ability to load encoded files correctly")
test_that("test readtext encoding parameter: ASCII encoded file, read as UTF-8: (should work)", {
  skip_on_cran()

  # Fixture pair: the text file and a table whose first column holds the
  # byte values the text is expected to have once loaded.
  fixture_dir <- "../data/encoding"
  bytes_path <- file.path(fixture_dir, "UTF-8__bytes.tsv")
  text_path <- file.path(fixture_dir, "UTF-8__characters.txt")

  expected_bytes <- data.table::fread(bytes_path)[[1]]
  loaded <- readtext(text_path, encoding = "utf-8")

  # Compare the loaded text byte-for-byte against the reference column.
  observed_bytes <- as.numeric(charToRaw(as.character(loaded)))
  expect_equal(observed_bytes, expected_bytes)
})
test_that("test readtext encoding parameter: UTF-8 encoded file, read as UTF-16 (should not work)", {
  skip_on_cran()
  FILEDIR <- "../data/encoding"

  # Reading the UTF-8 fixture with the wrong encoding is expected to warn.
  expect_warning(
    misread_texts <- as.character(
      readtext(file.path(FILEDIR, "UTF-8__characters.txt"), encoding = "utf-16")
    )
  )

  # Reference byte values: first column of the companion table.
  utf8_bytes <- data.table::fread(file.path(FILEDIR, "UTF-8__bytes.tsv"))[[1]]

  # charToRaw() yields a raw vector, and identical() is type-strict, so a raw
  # vector is never identical to the column returned by fread(); comparing
  # them directly would make this expectation pass no matter what was read.
  # Convert both sides to numeric so the check actually compares byte values.
  expect_false(
    identical(as.numeric(charToRaw(misread_texts)), as.numeric(utf8_bytes))
  )
})
test_that("test encoding handling (skipped on travis and CRAN", {
  skip_on_cran()
  skip_on_os("windows")

  # Currently, these encodings don't work for reasons that seem unrelated
  # to quanteda, and are either a problem in base R or on travis-ci
  broken_encodings <- c(
    "437", "850", "852", "855", "857", "860", "861", "862", "863", "865",
    "869", "BIG5-HKSCS", "CHINESE", "CP1251", "CP1255", "CP1256", "CP1361",
    "CP154", "CP737", "CP858", "CP864", "CP856", "CP932", "CP950",
    "EUC-JISX0213", "EUC-JP", "EUC-KR", "GB18030", "HEBREW", "HZ",
    "ISO-2022-JP-1", "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-8859-11",
    "ISO-IR-166", "KOI8-R",
    "UNICODE-1-1-UTF-7",
    "MACCENTRALEUROPE", "MACCYRILLIC", "MACGREEK", "MACICELAND", "MACTURKISH",
    "MS_KANJI", "SHIFT_JISX0213"
  )

  FILEDIR <- "../data/encoding"

  # list.files() takes a regular expression, not a glob: no leading "*" and
  # the dot must be escaped to match a literal ".".
  filenames <- list.files(FILEDIR, "__characters\\.txt$")

  # Fixture names look like "<encoding>__characters.txt"; keep the part
  # before "__" as the encoding name.
  parts <- strsplit(sub("\\.txt$", "", filenames), "__")
  fileencodings <- vapply(parts, "[", character(1), 1)
  fileencodings <- fileencodings[!(fileencodings %in% broken_encodings)]
  filenames <- file.path(FILEDIR, paste0(fileencodings, "__characters.txt"))

  # seq_along() so that an empty fixture list skips the loop instead of
  # iterating over c(1, 0) and failing on an out-of-range subscript.
  for (i in seq_along(fileencodings)) {
    filename <- filenames[[i]]
    encoding <- fileencodings[[i]]
    test_that(paste("test readtext encoding parameter, encoding", encoding), {
      # Bytes of the text as loaded with the encoding named in the file name.
      characters <- as.numeric(charToRaw(
        as.character(readtext(filename, encoding = encoding))
      ))
      # Reference bytes: first column of the sibling "__bytes.tsv" table.
      bytes <- data.table::fread(
        sub("__characters.txt", "__bytes.tsv", filename, fixed = TRUE)
      )[[1]]
      expect_equal(characters, bytes)
    })
  }

  # Test loading all these files at once with different encodings
  encodedreadtxts <- readtext(filenames, encoding = fileencodings)
  # The 39 is tied to the current fixture set minus broken_encodings;
  # it must be updated whenever either of them changes.
  expect_equal(dim(encodedreadtxts), c(39, 2))
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.