uchardet | R Documentation |
R bindings for the uchardet library (<https://www.freedesktop.org/wiki/Software/uchardet/>), Mozilla's encoding detector library. It takes a sequence of bytes in an unknown character encoding, without any additional information, and attempts to determine the encoding of the text. The returned encoding names are iconv-compatible.
Maintainer: Artem Klevtsov a.a.klevtsov@gmail.com (ORCID)
Other contributors:
Philipp Upravitelev upravitelev@gmail.com [contributor]
uchardet page: https://www.freedesktop.org/wiki/Software/uchardet/
Useful links:
Report bugs at https://gitlab.com/artemklevtsov/uchardet/-/issues
# Example script for the uchardet encoding detectors. It exercises, in order,
# detect_str_enc() on character vectors, detect_file_enc() on file paths and
# detect_raw_enc() on raw vectors. Requires the uchardet package to be attached.

# ---- detect_str_enc(): character vectors ----

# detect character vector with ASCII strings
ascii <- "I can eat glass and it doesn't hurt me."
detect_str_enc(ascii)

# detect character vector with UTF-8 strings
utf8 <- "\u4e0b\u5348\u597d"
print(utf8)
detect_str_enc(utf8)

# function to read ASCII or UTF-8 files into a single string;
# the whole file is requested by passing its size as the character count
read_file <- function(x) readChar(x, file.size(x))

# path to examples
ex_path <- system.file("examples", package = "uchardet")

# Russian text, re-encoded from UTF-8 before detection
ru_utf8 <- read_file(file.path(ex_path, "ru.txt"))
print(ru_utf8)
detect_str_enc(iconv(ru_utf8, "utf8", "ibm866"))
detect_str_enc(iconv(ru_utf8, "utf8", "koi8-r"))
detect_str_enc(iconv(ru_utf8, "utf8", "cp1251"))

# Chinese text, re-encoded from UTF-8 before detection
zh_utf8 <- read_file(file.path(ex_path, "zh.txt"))
print(zh_utf8)
detect_str_enc(iconv(zh_utf8, "utf8", "big5"))
detect_str_enc(iconv(zh_utf8, "utf8", "gb18030"))

# Korean text, re-encoded from UTF-8 before detection
ko_utf8 <- read_file(file.path(ex_path, "ko.txt"))
print(ko_utf8)
detect_str_enc(iconv(ko_utf8, "utf8", "uhc"))
detect_str_enc(iconv(ko_utf8, "utf8", "iso-2022-kr"))

# ---- detect_file_enc(): files on disk ----

# detect ASCII file encoding
detect_file_enc(system.file("DESCRIPTION", package = "uchardet"))

# path to example files
ex_path <- system.file("examples", package = "uchardet")

# example files in various languages and encodings
ex_files <- Sys.glob(file.path(ex_path, "*", "*"))

# detect file encodings (first ten files only)
detect_file_enc(head(ex_files, 10))

# ---- detect_raw_enc(): raw vectors ----

# detect raw vector encoding with ASCII encoding
ascii <- "I can eat glass and it doesn't hurt me."
detect_raw_enc(charToRaw(ascii))

# detect raw vector with UTF-8 encoding
utf8 <- "\u4e0b\u5348\u597d"
detect_raw_enc(charToRaw(utf8))

# function to read file as raw bytes
read_bin <- function(x) readBin(x, raw(), file.size(x))

# detect encoding of files read as raw vector
ex_path <- system.file("examples", package = "uchardet")

# German text as binary data
de_bin <- read_bin(file.path(ex_path, "de", "windows-1252.txt"))
detect_raw_enc(de_bin)

# Russian text as binary data
ru_bin <- read_bin(file.path(ex_path, "ru", "windows-1251.txt"))
detect_raw_enc(ru_bin)

# Chinese text as binary data
zh_bin <- read_bin(file.path(ex_path, "zh", "utf-8.txt"))
detect_raw_enc(zh_bin)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.