# tests/unitizer/helper/UTF-8-critical.R

# Kept in its own file because Solaris issues prevent direct inclusion in
# unitizer
#
# Well-Formed UTF-8 Byte Sequences
#
# Code Points        | Byte 1 | Byte 2 | Byte 3 | Byte 4
# U+0000..U+007F     | 00..7F |
# U+0080..U+07FF     | C2..DF | 80..BF
# U+0800..U+0FFF     | E0     | A0..BF | 80..BF
# U+1000..U+CFFF     | E1..EC | 80..BF | 80..BF
# U+D000..U+D7FF     | ED     | 80..9F | 80..BF
# U+E000..U+FFFF     | EE..EF | 80..BF | 80..BF
# U+10000..U+3FFFF   | F0     | 90..BF | 80..BF | 80..BF
# U+40000..U+FFFFF   | F1..F3 | 80..BF | 80..BF | 80..BF
# U+100000..U+10FFFF | F4     | 80..8F | 80..BF | 80..BF

# Check all the critical cases where we transition from legal to illegal
# sequences

# One-byte boundary: 0x7F is the top of the table's single-byte range
# (00..7F); a lone 0x80 is the first byte value past it.
# NOTE(review): the "n."/"y." prefixes look reversed here relative to crit.2
# through crit.4, where "y." tags sequences inside the table's ranges and
# "n." those outside -- confirm intent before relying on the names.
crit.1 <- local({
  bytes <- c(n.0 = "\x7F", y.1 = "\x80")
  # Mark the strings as UTF-8 encoded
  Encoding(bytes) <- "UTF-8"
  bytes
})
# Two-byte boundaries (table row: lead C2..DF, second byte 80..BF). Names
# starting with "y." fall inside those ranges; names starting with "n." sit
# just outside them.
crit.2 <- local({
  bytes <- c(
    n.0 = "\xC1\x91",  # lead byte below C2
    n.1 = "\xC2\x79",  # second byte below 80
    y.2 = "\xC2\x80",  # lowest in-range pair
    y.3 = "\xDF\xBF",  # highest in-range pair
    n.4 = "\xDF\xC0"   # second byte above BF
  )
  # Mark the strings as UTF-8 encoded
  Encoding(bytes) <- "UTF-8"
  bytes
})
# Three-byte boundaries, grouped by the lead-byte rows of the table above.
# Names starting with "y." fall inside the table's ranges; names starting with
# "n." have at least one byte just outside them. The numeric suffixes are not
# contiguous (there is no .09 or .15).
crit.3 <- local({
  bytes <- c(
    # Lead E0: second byte A0..BF, third byte 80..BF
    n.00 = "\xE0\x9F\x91",
    n.01 = "\xE0\xA0\x79",
    y.02 = "\xE0\xA0\x80",
    y.03 = "\xE0\xBF\xBF",
    n.04 = "\xE0\xBF\xC0",
    # Leads E1..EC: second and third bytes 80..BF
    n.05 = "\xE1\x79\x91",
    y.06 = "\xE1\x80\x80",
    y.07 = "\xEC\xBF\xBF",
    n.08 = "\xEC\xBF\xC0",
    n.10 = "\xEC\xC0\xBF",
    # Lead ED: second byte 80..9F, third byte 80..BF
    n.11 = "\xED\x79\x80",
    n.12 = "\xED\x80\x79",
    y.13 = "\xED\x80\x80",
    y.14 = "\xED\x9F\xBF",
    n.16 = "\xED\x9F\xC0",
    n.17 = "\xED\xA0\xBF",
    # Leads EE..EF: second and third bytes 80..BF
    n.18 = "\xEE\x80\x79",
    n.19 = "\xEE\x79\x80",
    y.20 = "\xEE\x80\x80",
    y.21 = "\xEF\xBF\xBF",
    n.22 = "\xEF\xBF\xC0",
    n.23 = "\xEF\xC0\xBF"
  )
  # Mark the strings as UTF-8 encoded
  Encoding(bytes) <- "UTF-8"
  bytes
})
# Four-byte boundaries, grouped by the lead-byte rows of the table above.
# Names starting with "y." fall inside the table's ranges; names starting with
# "n." have at least one byte outside them. The numeric suffixes are not
# contiguous (there is no .05).
crit.4 <- local({
  bytes <- c(
    # Lead F0: second byte 90..BF, third and fourth bytes 80..BF
    n.00 = "\xF0\x89\x80\x80",
    n.01 = "\xF0\x90\x79\x80",
    n.02 = "\xF0\x90\x80\x79",
    y.03 = "\xF0\x90\x80\x80",
    y.04 = "\xF0\xBF\xBF\xBF",
    n.06 = "\xF0\xBF\xBF\xC0",
    n.07 = "\xF0\xBF\xC0\xBF",
    n.08 = "\xF0\xC0\xBF\xBF",
    # Leads F1..F3: second, third and fourth bytes 80..BF
    n.09 = "\xF1\x80\x80\x79",
    n.10 = "\xF1\x80\x79\x80",
    n.11 = "\xF1\x79\x80\x80",
    y.12 = "\xF1\x80\x80\x80",
    y.13 = "\xF3\xBF\xBF\xBF",
    n.14 = "\xF3\xBF\xBF\xC0",
    n.15 = "\xF3\xBF\xC0\xBF",
    n.16 = "\xF3\xC0\xBF\xBF",
    # Lead F4: second byte 80..8F, third and fourth bytes 80..BF
    n.17 = "\xF4\x80\x80\x79",
    n.18 = "\xF4\x80\x79\x80",
    n.19 = "\xF4\x79\x80\x80",
    y.20 = "\xF4\x80\x80\x80",
    y.21 = "\xF4\x8F\xBF\xBF",
    n.22 = "\xF4\x8F\xBF\xC0",
    n.23 = "\xF4\x8F\xC0\xBF",
    n.24 = "\xF4\x90\x01\x01",
    # Lead F5 does not appear in the table at all
    n.25 = "\xF5\x81\x81\x81"
  )
  # Mark the strings as UTF-8 encoded
  Encoding(bytes) <- "UTF-8"
  bytes
})
# brodieG/validate documentation built on Feb. 2, 2023, 10:43 a.m.