Nothing
# Test UTF-8, separate to avoid issues with platforms that don't support it
library(fansi)
unitizer_sect("substr", {
term.cap <- c('bright', '256', 'truecolor')
lorem.cn.pieces <-
substr(rep(lorem.cn, 5), c(1, 11, 21, 31), c(10, 15, 22, 45))
lorem.cn.col.1 <- paste0(
red, lorem.cn.pieces[1], inv, lorem.cn.pieces[2], grn.bg,
lorem.cn.pieces[3], rgb.und, lorem.cn.pieces[4], end
)
lor.cn.c.1.5 <- rep(lorem.cn.col.1, 5)
starts <- seq(1, 17, 4)
ends <- starts + 3
substr2_ctl(lor.cn.c.1.5, starts, ends, term.cap=term.cap)
# These are all six chars wide, but look different due to different width
# characters
lorem.cn.col.2 <- paste0(
red, lorem.cn.pieces[1], "hello", inv, lorem.cn.pieces[2], " there ",
grn.bg, lorem.cn.pieces[3], rgb.und, lorem.cn.pieces[4], end
)
lor.cn.c.2.5 <- rep(lorem.cn.col.2, 5)
starts <- seq(1, by=6, length.out=5)
ends <- starts + 5
substr2_ctl(lor.cn.c.2.5, starts, ends, term.cap=term.cap)
substr2_sgr(lor.cn.c.2.5, starts, ends, term.cap=term.cap)
starts <- seq(1, by=12, length.out=5)
ends <- starts + 11
substr2_ctl(lor.cn.c.2.5, starts, ends, type='width', term.cap=term.cap)
# with colors that actually work on an OSX terminal
lorem.cn.col.4 <- paste0(
red, lorem.cn.pieces[1], "hello", inv, lorem.cn.pieces[2], " there ",
grn.bg, lorem.cn.pieces[3], rgb.und.256, lorem.cn.pieces[4], end
)
lor.cn.c.4.5 <- rep(lorem.cn.col.4, 5)
substr2_ctl(lor.cn.c.4.5, starts, ends, type='width')
# All wide characters even number of chars apart
lorem.cn.col.3 <- paste0(
red, lorem.cn.pieces[1], "helloo", inv, lorem.cn.pieces[2], " world! ",
grn.bg, lorem.cn.pieces[3], rgb.und, lorem.cn.pieces[4], end
)
lor.cn.c.3.5 <- rep(lorem.cn.col.3, 5)
starts <- seq(1, by=12, length.out=5)
ends <- starts + 10
ends[2] <- 24
# This is a bit of an accidental one, but it should be the case that the
# second line has two extra single width characters because all the others are
# losing the last character b/c we're ending in the middle, width wise
substr2_ctl(lor.cn.c.3.5, starts, ends, type='width', term.cap=term.cap)
# and now we grab those missing chars by allowing the round to happen
substr2_ctl(
lor.cn.c.3.5, starts, ends, type='width', round='both', term.cap=term.cap
)
# jagged first one leads short, second long
starts <- seq(1, by=7, length.out=5)
ends <- starts + 8
substr2_ctl(lor.cn.c.1.5, starts, ends, type='width', term.cap=term.cap)
substr2_ctl(
lor.cn.c.1.5, starts, ends, type='width', round='stop', term.cap=term.cap
)
# don't support byte encoded strings
bytes <- "\xC0\xB1\xF0\xB1\xC0\xB1\xC0\xB1"
Encoding(bytes) <- "bytes"
# need trycatch due to instability from C level `error` call in getting the
# function call
tce(substr_ctl(bytes, 2, 3))
# Let's try a latin one string
latin <- "H\xE9llo W\xD6rld!"
Encoding(latin) <- "latin1"
latin.utf8 <- substr_ctl(latin, 1, 9)
latin.utf8
Encoding(latin.utf8)
# Start/Stop rounding - examples
rnd.1 <- "MnW"
Encoding(rnd.1) <- "UTF-8"
substr2_ctl(rnd.1, 2, 4, type='width', round='start')
substr2_ctl(rnd.1, 2, 4, type='width', round='stop')
substr2_ctl(rnd.1, 2, 4, type='width', round='neither')
substr2_ctl(rnd.1, 2, 4, type='width', round='both')
# Start/Stop rounding - end edge cases
rnd.2 <- "MW"
Encoding(rnd.2) <- "UTF-8"
substr2_ctl(rnd.2, 2, 3, type='width', round='start')
substr2_ctl(rnd.2, 2, 3, type='width', round='stop')
substr2_ctl(rnd.2, 1, 2, type='width', round='start')
substr2_ctl(rnd.2, 1, 2, type='width', round='stop')
substr2_ctl(rnd.2, 3, 4, type='width', round='start')
substr2_ctl(rnd.2, 3, 4, type='width', round='stop')
})
unitizer_sect("rounding", {
# handling of subsetting when we end up in middle of wide display characters
substr2_ctl(lorem.cn.col.2, 1, 2, type='width')
substr2_ctl(lorem.cn.col.2, 1, 3, type='width')
substr2_ctl(lorem.cn.col.2, 2, 3, type='width')
substr2_ctl(lorem.cn.col.2, 2, 4, type='width')
substr2_ctl(lorem.cn.col.2, 3, 4, type='width')
substr2_ctl(lorem.cn.col.2, 1, 2, type='width', round='stop')
substr2_ctl(lorem.cn.col.2, 1, 3, type='width', round='stop')
substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='stop')
substr2_ctl(lorem.cn.col.2, 2, 4, type='width', round='stop')
substr2_ctl(lorem.cn.col.2, 3, 4, type='width', round='stop')
substr2_ctl(lorem.cn.col.2, 1, 2, type='width', round='both')
substr2_ctl(lorem.cn.col.2, 1, 3, type='width', round='both')
substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='both')
substr2_ctl(lorem.cn.col.2, 2, 4, type='width', round='both')
substr2_ctl(lorem.cn.col.2, 3, 4, type='width', round='both')
substr2_ctl(lorem.cn.col.2, 1, 2, type='width', round='neither')
substr2_ctl(lorem.cn.col.2, 1, 3, type='width', round='neither')
substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='neither')
substr2_ctl(lorem.cn.col.2, 2, 4, type='width', round='neither')
substr2_ctl(lorem.cn.col.2, 3, 4, type='width', round='neither')
substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='neither', terminate=FALSE)
})
unitizer_sect("multi-elem", {
# Due to preservation of state issues, need to make sure works well with
# more than one value
lor.cn.2.2 <- rep(lorem.cn.col.2, 2)
substr2_ctl(lor.cn.2.2, c(1,3), c(2,4), type='width')
substr2_ctl(lor.cn.2.2, c(2,4), c(2,4), type='width')
})
unitizer_sect("zero width combining", {
combo <- "hello\u0300\u035c world"
Encoding(combo) <- "UTF-8"
substr2_ctl(combo, 1, 5, type='width')
substr2_ctl(combo, 5, 8, type='width')
substr2_ctl(rep(combo, 2), c(1, 5), c(5, 8), type='width')
nchar_ctl(combo, type='width')
nchar_ctl(combo, type='graphemes')
# zero width with double width
combo3 <- paste0(substr(lorem.cn.pieces[1], 1, 2), '\u0300')
Encoding(combo3) <- "UTF-8"
substr2_ctl(combo3, 3, 4, type='width')
substr2_ctl(combo3, 2, 4, type='width')
substr2_ctl(combo3, 4, 4, type='width')
substr2_ctl(combo3, 4, 5, type='width')
# start with diacritic
combo4 <- paste0('\u0300hello')
substr2_ctl(combo4, 1, 1, type='width') # no diacritic
substr2_ctl(combo4, 1, 1) # diacritic only
substr2_ctl(combo4, 0, 1, type='width') # with diacritic
substr2_ctl(combo4, 0, 0, type='width') # empty
})
unitizer_sect("Corner cases", {
utf8.bad <- "hello \xF0 world, goodnight moon"
Encoding(utf8.bad) <- 'UTF-8'
substr_ctl(utf8.bad, 1, 7)
substr_ctl(utf8.bad, 5, 10)
# Need to use `tryCatch` because the warnings vascillate for no rhyme or
# reason between showing the call and not. Seems to be triggered by
# re-installing package. now we're stuck with the try business to circumvent
# that variability.
tce(substr2_ctl(utf8.bad, 1, 7, type='width'))
# # need to remove for changes in R3.6.0
# substr2_ctl(utf8.bad, 1, 7, type='width', warn=FALSE)
tce(substr2_ctl(utf8.bad, 5, 10, type='width'))
# # need to remove for changes in R3.6.0
# substr2_ctl(utf8.bad, 5, 10, type='width', warn=FALSE)
# ends early
chrs.2 <- "hello\xee"
Encoding(chrs.2) <- "UTF-8"
tce(substr2_ctl(chrs.2, 1, 10, type='width'))
# # need to remove for changes in R3.6.0
# substr2_ctl(chrs.2, 1, 10, type='width', warn=FALSE)
# bad utf8 in SGR and CSI
bad.u <- c("A\033[31;\x80mB", "A\033[31;\x80pB")
Encoding(bad.u) <- "UTF-8"
substr_ctl(bad.u[1], 0, 3)
substr_ctl(bad.u[2], 0, 3)
# boundaries
b.test <- c(
"\uc0f6\ubed9",
"\u0301a\ubed9", # leading diacritic
"\ubed9\u0301a", # trailing diacritic
"\ubed9a\u0301" # really trailing diacritic
)
identical(substr_ctl(b.test, 0, 3), substr(b.test, 0, 3))
identical(substr_ctl(b.test, 0, 2), substr(b.test, 0, 2))
identical(substr_ctl(b.test, 1, 2), substr(b.test, 1, 2))
identical(substr_ctl(b.test, 0, 4), substr(b.test, 0, 4))
identical(substr_ctl(b.test, 4, 4), substr(b.test, 4, 4))
b.t.c <- sprintf("\033[43m%s\033[49m", b.test)
substr_ctl(b.t.c, 0, 0)
substr_ctl(b.t.c, 0, 2)
substr_ctl(b.t.c, 1, 2)
substr_ctl(b.t.c, 0, 4)
substr_ctl(b.t.c, 4, 4)
substr2_ctl(b.t.c, 0, 0, type='width')
substr2_ctl(b.t.c, 0, 2, type='width')
substr2_ctl(b.t.c, 1, 4, type='width')
substr2_ctl(b.t.c, 0, 5, type='width')
substr2_ctl(b.t.c, 5, 5, type='width')
substr_ctl(b.t.c, 0, 4, terminate=FALSE)
substr2_ctl(b.t.c, 1, 4, terminate=FALSE, type='width')
})
unitizer_sect("nchar", {
chr.dia <- 'A\u030A'
nchar_ctl(chr.dia)
nchar(chr.dia) # for reference, base gets it wrong too
nchar_ctl(chr.dia, type='width')
# Wide chars
w1 <- "\u4E00\u4E01\u4E03"
w2 <- "\u4E00\u4E01\u4E03"
nchar_ctl(w1)
nchar_ctl(w2, type='width')
nchar_ctl(w2, type='graphemes')
nchar_ctl(w2, type='bytes')
# Allow NA for illegal sequences
hello.illegal <- c("hello", "\xF0", "\xF0aaaa")
Encoding(hello.illegal) <- 'UTF-8'
nchar_ctl(hello.illegal)
nchar_ctl(hello.illegal, allowNA=TRUE)
# nzchar doesn't care about multi-byte illegal
nzchar_ctl(hello.illegal)
# escapes mixed in
esc.1 <- sprintf(
"hello \033[31mworld\033[m%s\033[48;5;123m blahs \033[m%s",
"\u76F4\u8349",
"\u56FA\u55F0\u5F8C"
)
Encoding(esc.1) <- 'UTF-8'
nchar_ctl(esc.1)
nchar_ctl(esc.1, type='width')
nchar_ctl(esc.1, type='bytes')
nzchar_ctl(esc.1)
esc.2 <- "\n\r\033P\033[31m\a"
nchar_ctl(c(esc.1, esc.2, 'hello'), warn=FALSE)
# _sgr
esc.4 <- c(sprintf("\033[31m%shello", w1), NA, hello.illegal)
nchar_sgr(esc.4, type='width', keepNA=FALSE, warn=FALSE, allowNA=TRUE)
nzchar_sgr(esc.4, keepNA=FALSE, warn=FALSE)
# _sgr does not strip C0; note R behavior on width of C0-C1
# fluctuating around R4.1 transition so can't test directly.
nchar_sgr("\033[31m\thello", type='width') >=
nchar_ctl("\033[31m\thello", type='width')
# nchar doesn't care about bad bits embedded in escapes
ncb <- c("123\033[31\x80m123", "123\033\x80123")
Encoding(ncb) <- "UTF-8"
nchar_ctl(ncb)
})
unitizer_sect("unhandled", {
# a bad utf8 string and other bad stuff
utf8.bad.0 <- "hello\033\033\033[45p \xF0how wor\ald"
Encoding(utf8.bad.0) <- "UTF-8"
unhandled_ctl(utf8.bad.0)
utf8.bad.1 <- "hello \xF0ho"
Encoding(utf8.bad.1) <- "UTF-8"
unhandled_ctl(utf8.bad.1)
})
unitizer_sect("utf8clen", {
# Can't test directly, but we can check what character lenght we get back from
# nchar and infer whether things have changed or not
#
# These tests are designed to start failing if behavior of utf8clen changes
#
# See src/main/valid_utf8.h
# U+0000..U+007F | 00..7F |
# U+0080..U+07FF | C2..DF | 80..BF
# U+0800..U+0FFF | E0 |*A0..BF*| 80..BF
# U+1000..U+CFFF | E1..EC | 80..BF | 80..BF
# U+D000..U+D7FF | ED |*80..9F*| 80..BF
# U+E000..U+FFFF | EE..EF | 80..BF | 80..BF
# U+10000..U+3FFFF | F0 |*90..BF*| 80..BF | 80..BF
# U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF
# U+100000..U+10FFFF | F4 |*80..8F*| 80..BF | 80..BF
chrs <- c(
"\xc2\x80", "\xDF\xBF",
"\xe0\xA0\x80", "\xE0\xBF\xBF",
"\xe1\x80\x80", "\xeC\xbf\xbf",
"\xed\x80\x80", "\xed\x9f\xbf",
"\xee\x90\x80", "\xef\xbf\xbf",
"\xf0\x90\x80\x80", "\xf4\x8f\xbf\xbf",
"\xf8\x80\x80\x80\x80", "\xfb\x80\x80\x80\x80",
"\xfc\x80\x80\x80\x80\x80", "\xff\x80\x80\x80\x80\x80"
)
Encoding(chrs) <- "UTF-8"
nchar(chrs, allowNA=TRUE)
nchar_ctl(chrs, allowNA=TRUE)
# Of the 10xxxxxx variety
utf8.bad.2 <- "\xBFaaaaaa"
Encoding(utf8.bad.2) <- "UTF-8"
nchar(utf8.bad.2, allowNA=TRUE)
nchar_ctl(utf8.bad.2, allowNA=TRUE)
## remove for changes in R3.6.0
substr_ctl(utf8.bad.2, 1, 1)
})
unitizer_sect("wrap corner cases", {
# With UTF8
pre.2 <- "\x1b[32m\xd0\x9f \x1b[0m"
ini.2 <- "\x1b[33m\xd1\x80 \x1b[0m"
hello.8c <- "hello Привет world"
Encoding(pre.2) <- "UTF-8"
Encoding(ini.2) <- "UTF-8"
Encoding(hello.8c) <- "UTF-8"
pre.3 <- "\xd0\x9f "
ini.3 <- "\xd1\x80 "
Encoding(pre.3) <- "UTF-8"
Encoding(ini.3) <- "UTF-8"
wrap.csi.4 <- strwrap_ctl(hello.8c, 15, prefix=pre.2, initial=ini.2)
wrap.csi.4
utf8.chr <- "\u76F4"
strwrap2_ctl(utf8.chr, 1, wrap.always=TRUE)
strwrap2_ctl(utf8.chr, 2, wrap.always=TRUE)
strwrap2_ctl(utf8.chr, 3, wrap.always=TRUE)
strwrap_ctl("lovelyday.", 10)
strwrap2_ctl("lovelyday.", 10, wrap.always=TRUE)
utf8.bad <- "hello \xF0 world, goodnight moon"
Encoding(utf8.bad) <- "UTF-8"
strwrap_ctl(utf8.bad, 10)
# bad prefix values
utf8.bad.2 <- "\xF0"
Encoding(utf8.bad.2) <- "UTF-8"
tcw(strwrap_ctl("hello world", 6, prefix=utf8.bad.2))
suppressWarnings(strwrap_ctl("hello world", 6, prefix=utf8.bad.2))
# Byte encoded strings not allowed
bytes <- "\xC0\xB1\xF0\xB1\xC0\xB1\xC0\xB1"
Encoding(bytes) <- "bytes"
tce(strwrap_ctl(bytes))
# Encoding captured correctly
encstrings <- c("hell\u00F8 world", "hello w\u00F8rld")
Encoding(strwrap_ctl(encstrings, 5))
# Caused an infinite loop in one case
str.inf <- "\U1F600 \U1F600"
strwrap2_ctl(str.inf, 2)
})
unitizer_sect("wrap with wide UTF8 and ESC", {
wrap.mix <- strwrap_ctl(lorem.mix, 25)
wrap.mix
# identical(
# strwrap(strip_ctl(lorem.mix, "sgr"), 25), strip_ctl(wrap.mix, "sgr")
# )
string <- "\033[37;48;5;32m國官方認定的民族現有56個\033[39;49m"
Encoding(string) <- "UTF-8"
strwrap2_ctl(string, 24, wrap.always=TRUE, pad.end=" ")
})
unitizer_sect("issue 54 ctd", {
# other issu54 tests are in tohtml.R, but had to move this one here due to the
# ellipsis utf-8 character.
string3 <- c(
"\033[38;5;246m# … with 5 more variables: total_time \033[3m\033[38;5;246m<bch:tm>\033[38;5;246m\033[23m, result \033[3m\033[38;5;246m<list>\033[38;5;246m\033[23m, memory \033[3m\033[38;5;246m<list>\033[38;5;246m\033[23m,",
"# time \033[3m\033[38;5;246m<list>\033[38;5;246m\033[23m, gc \033[3m\033[38;5;246m<list>\033[38;5;246m\033[23m\033[39m"
)
Encoding(string3) <- "UTF-8"
fansi::sgr_to_html(html_esc(string3))
# head <- "<html><head><meta charset='utf-8'/></head><pre>"
# f <- paste0(tempfile(), ".html")
# writeLines(c(head, fansi::sgr_to_html(string3), "</pre></html>"), f)
# browseURL(f)
# unlink(f)
# trigger warnings/errors
string4 <- c(
"wow \033[31m then", "hello\033[\x80;wow", "yo \033[m there",
"boom \033[41m"
)
Encoding(string4) <- "UTF-8"
sgr_to_html(string4)
})
unitizer_sect("html_esc", {
x <- "\U0001F600"
html_esc(c("h&e'l\"lo", "wor<ld>s", NA, ""), x)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.