test-encodings.R
In hunspell: High-Performance Stemmer, Tokenizer, and Spell Checker

context("Test UTF8 dict")

# Dictionaries can be addressed by an explicit path (no file extension):
# the en_US dictionary from the installed package's 'dict' directory and
# the two Russian fixtures under ../testdict.
test_that("Dictionaries are by path", {
  installed_en_us <- file.path(system.file("dict", package = "hunspell"), "en_US")
  fixture_utf8 <- "../testdict/ru_RU"
  fixture_koi8 <- "../testdict/russian-aot"

  expect_equal(hunspell_info(installed_en_us)$encoding, "UTF-8")
  expect_equal(hunspell_info(fixture_utf8)$encoding, "UTF-8")
  expect_equal(hunspell_info(fixture_koi8)$encoding, "KOI8-R")
})

# Point DICPATH at the test dictionaries for the remainder of this file.
# DICPATH gets preference, even when a system 'ru_RU' dict is found.
# mustWork = TRUE makes a missing fixture directory fail here, up front.
Sys.setenv(DICPATH = normalizePath(file.path("..", "testdict"), mustWork = TRUE))

# Dictionaries looked up by bare name; the Russian names are expected to
# resolve through the DICPATH set above in this file.
test_that("Dictionaries are found", {
  # Small accessors so each expectation reads as "<field> of <dict>"
  encoding_of <- function(dict) hunspell_info(dict)$encoding
  wordchars_of <- function(dict) hunspell_info(dict)$wordchars

  # U+2019, the only extra word character expected for the English dicts
  right_quote <- "\u2019"

  expect_equal(encoding_of("en_US"), "UTF-8")
  expect_equal(encoding_of("ru_RU"), "UTF-8")
  expect_equal(wordchars_of("en_US"), right_quote)
  expect_equal(wordchars_of("en_GB"), right_quote)
  expect_equal(wordchars_of("ru_RU"), NA_character_)
  expect_equal(encoding_of("russian-aot"), "KOI8-R")
  expect_equal(nchar(wordchars_of("russian-aot")), 122)
})

# Declare a character vector to be UTF-8 encoded and return it.
# Needed because testthat doesn't parse UTF8 source code on windows, so the
# non-ASCII literals in this file are wrapped in this helper.
utf8 <- function(x) {
  # Call the replacement function directly; it returns the re-marked vector
  `Encoding<-`(x, value = "UTF-8")
}

test_that("UTF8 always works", {
  # Fixtures: a Russian sentence, one word from that sentence, a Korean
  # word, and the sentence followed by the Korean word.
  ru_sentence <- utf8("К сожалению, мне (нам) пора идти")
  ru_word <- utf8("сожалению")
  ko_word <- utf8("달력")
  mixed_text <- paste(ru_sentence, ko_word)

  # Without a dict argument all six Russian words are reported
  expect_length(hunspell_find(ru_sentence)[[1]], 6)
  expect_equal(hunspell_find(ru_word)[[1]], ru_word)

  # ru_RU accepts the sentence and reports only the Korean word, unless
  # that word is passed via 'ignore'
  expect_length(hunspell_find(ru_sentence, dict = "ru_RU")[[1]], 0)
  expect_equal(hunspell_find(mixed_text, dict = "ru_RU")[[1]], ko_word)
  expect_equal(
    hunspell_find(mixed_text, ignore = ko_word, dict = "ru_RU")[[1]],
    character(0)
  )

  # Same sentence against the russian-aot dictionary
  expect_length(hunspell_find(ru_sentence, dict = "russian-aot")[[1]], 0)

  # Korean word against en_US, with and without 'ignore'
  expect_equal(hunspell_find(ko_word, dict = "en_US")[[1]], ko_word)
  expect_equal(
    hunspell_find(ko_word, ignore = ko_word, dict = "en_US")[[1]],
    character(0)
  )

  # Tokens 2 and 7 of the mixed text are the Russian and the Korean word
  expect_equal(hunspell_parse(mixed_text)[[1]][2], ru_word)
  expect_equal(hunspell_parse(mixed_text)[[1]][7], ko_word)

  # First token must survive whether the input is native or UTF-8 encoded
  city <- utf8("Zürich")
  city_text <- paste(city, "test")
  expect_equal(hunspell_parse(enc2native(city_text))[[1]][1], city)
  expect_equal(hunspell_parse(enc2utf8(city_text))[[1]][1], city)

  # Test encoding roundtrip: the first result from each function, for both
  # Russian dictionaries, must match the expected stem pattern
  stem_pattern <- utf8("сожалени")
  expect_match(hunspell_stem(ru_word, dict = "russian-aot")[[1]][1], stem_pattern)
  expect_match(hunspell_stem(ru_word, dict = "ru_RU")[[1]][1], stem_pattern)
  expect_match(hunspell_suggest(ru_word, dict = "russian-aot")[[1]][1], stem_pattern)
  expect_match(hunspell_suggest(ru_word, dict = "ru_RU")[[1]][1], stem_pattern)
  expect_match(hunspell_analyze(ru_word, dict = "russian-aot")[[1]][1], stem_pattern)
  expect_match(hunspell_analyze(ru_word, dict = "ru_RU")[[1]][1], stem_pattern)

  # iconv on macos 14 seems broken, returns '?' instead of error
  skip_on_os("mac")
  # The Korean word checked against russian-aot must raise a warning
  # whose message matches "encoding"
  expect_warning(hunspell_find(ko_word, dict = "russian-aot"), "encoding")
})

test_that("custom words in dictionary", {
  # The same made-up word in native and in UTF-8 encoding
  native_word <- enc2native("üx")
  utf8_word <- enc2utf8(native_word)

  # Check 'word' against a dictionary extended with 'added'
  accepted_with <- function(word, added) {
    hunspell_check(word, dictionary(add_words = added))
  }

  # Without the addition the word is rejected in either encoding
  expect_false(hunspell_check(native_word))
  expect_false(hunspell_check(utf8_word))

  # Once added it is accepted for every combination of encodings
  expect_true(accepted_with(native_word, native_word))
  expect_true(accepted_with(utf8_word, utf8_word))
  expect_true(accepted_with(native_word, utf8_word))
  expect_true(accepted_with(utf8_word, native_word))

  # tests caching: the plain check must still reject the word afterwards
  # (presumably guards against added words leaking into a cached dictionary)
  expect_false(hunspell_check(native_word))
  expect_false(hunspell_check(utf8_word))
})

Any scripts or data that you put into this service are public.

hunspell documentation built on Oct. 6, 2023, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

hunspell
High-Performance Stemmer, Tokenizer, and Spell Checker

tests/testthat/test-encodings.R
In hunspell: High-Performance Stemmer, Tokenizer, and Spell Checker

Try the hunspell package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hunspell High-Performance Stemmer, Tokenizer, and Spell Checker

tests/testthat/test-encodings.R In hunspell: High-Performance Stemmer, Tokenizer, and Spell Checker

Try the hunspell package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hunspell
High-Performance Stemmer, Tokenizer, and Spell Checker

tests/testthat/test-encodings.R
In hunspell: High-Performance Stemmer, Tokenizer, and Spell Checker