# Benchmark quanteda's "word" and "word1" tokenizers against
# tokenizers::tokenize_words() on `data_corpus_sotu`.
library("quanteda")
# microbenchmark() is called below, so its package has to be attached before
# the first benchmark runs.
library("microbenchmark")

# Load the corpus from quanteda.corpora and convert it with as.corpus().
data(data_corpus_sotu, package = "quanteda.corpora")
data_corpus_sotu <- as.corpus(data_corpus_sotu)

# Each named argument is one timed expression; every expression is run 5 times.
microbenchmark(
  word = tokens(data_corpus_sotu, what = "word",
                split_hyphens = TRUE, split_tags = TRUE),
  word1 = tokens(data_corpus_sotu, what = "word1",
                 split_hyphens = TRUE, split_tags = TRUE),
  # Keep case and punctuation so the output is closer to what tokens() returns.
  tokenizers = as.tokens(
    tokenizers::tokenize_words(texts(data_corpus_sotu),
                               lowercase = FALSE,
                               strip_punct = FALSE)
  ),
  times = 5
)
library("microbenchmark")

# Short alias for the corpus used by the benchmarks below.
x <- data_corpus_sotu

# Named character vector of six short sample documents (URL, @-tag, hashtag,
# currency amount, Japanese text, line breaks, a lone "?", and an emoji).
# It is not referenced by the benchmarks visible in this section.
txt <- c(
  doc1 = "Tweet https://quanteda.io using @quantedainit and #rstats.",
  doc2 = "The £1,000,000 question.",
  doc3 = "毎日 #quanteda を使用してください!",
  doc4 = "Line 1.\nLine2\n\nLine3.",
  doc5 = "?",
  doc6 = "Self-aware machines! \U0001f600"
)

# Time four values of the `what` argument of tokens() on the same corpus;
# 3 runs each, results reported relative to one another.
microbenchmark(
  word    = tokens(x, what = "word"),
  word1   = tokens(x, what = "word1"),
  faster  = tokens(x, what = "fasterword"),
  fastest = tokens(x, what = "fastestword"),
  times = 3,
  unit = "relative"
)
#
#
# microbenchmark(regex = stri_detect_regex(x, "^((https{0,1}|s{0,1}ftp)://)|(\\w+@\\w+)"),
# fixed = stri_detect_fixed(x, "http"), times = 10, unit = "relative")
# Compare two ways of splitting tokens on "@" and "#": two successive
# fixed-pattern passes versus a single pass with a regex character class.
toks <- tokens(data_corpus_sotu)
microbenchmark(
  # Written as a nested call rather than with `%>%`, so the benchmark does not
  # depend on a pipe operator being attached. TRUE/FALSE are spelled out
  # because `F` is an ordinary variable that can be reassigned.
  fixed = tokens_split(
    tokens_split(toks, "@", valuetype = "fixed", remove_separator = FALSE),
    "#", valuetype = "fixed", remove_separator = FALSE
  ),
  regex = tokens_split(toks, "[@#]", valuetype = "regex",
                       remove_separator = FALSE),
  times = 2, unit = "relative"
)
# Head-to-head timing of the "word" and "word1" tokenizers on `x`,
# 10 runs per expression, reported in microbenchmark's default unit.
microbenchmark(
  word  = tokens(x, what = "word"),
  word1 = tokens(x, what = "word1"),
  times = 10
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.