tests/benchmarks/benchmark_tokens/tokenize.R

library(profvis)    # for profiling
library(tokenizers) # reference tokenizer for the comparison below
library(quanteda)   # provides tokens()

# Large news corpus saved locally (not shipped with the package)
corp <- readRDS("/home/kohei/Documents/Brexit/Data/data_corpus_guardian.RDS")
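# If the local Guardian corpus is unavailable, a built-in corpus such as
# quanteda's data_corpus_inaugural could stand in for rough timings
# (assumption: any corpus object works here; it is much smaller, so the
# absolute times will not be comparable).
# corp <- data_corpus_inaugural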

# Time the default word tokenizer on the full corpus
system.time(
    tokens(corp, what = 'word', verbose = TRUE)
)

# Time the minimal 'fastestword' tokenizer (splits on spaces only)
system.time(
    tokens(corp, what = 'fastestword', verbose = TRUE)
)

# Synthetic text: the alphabet as space-separated "words", repeated 10,000 times
txt <- rep(paste0(letters, collapse = ' '), 10000)

# Compare quanteda's tokenizer against tokenizers::tokenize_words on the synthetic text
microbenchmark::microbenchmark(
    tokenizers::tokenize_words(txt),
    tokens(txt, what = 'word'),
    unit = 'relative'
)
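# Quick sanity check (a sketch, assuming the two tokenizers' default options
# agree for this simple all-lower-case text): both should return the same
# word tokens, apart from list names.
stopifnot(identical(
    unname(as.list(tokens(txt[1], what = 'word'))),
    unname(tokenizers::tokenize_words(txt[1]))
))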

# Profile the 'fastestword' path to see where the time is spent
profvis(
    tokens(txt, what = 'fastestword')
)