In quanteda/quanteda: Quantitative Analysis of Textual Data

# Show the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Attach quanteda with library() so a missing package stops the run here;
# require() would only warn and return FALSE, deferring the failure to the
# first quanteda call further down.
library(quanteda)

R Markdown

bigram

# Tokenise the inaugural corpus and strip English stopwords in one step.
# padding = TRUE is forwarded to tokens_remove() unchanged.
toks2 <- tokens_remove(
  tokens(data_corpus_inaugural),
  stopwords("english"),
  padding = TRUE
)

# Time size-2 collocation scoring for path = 1 against path = 2.
# NOTE(review): the labels suggest path = 1 is the R implementation and
# path = 2 the compiled one -- confirm against textstat_collocations().
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 2, tolower = FALSE, method = "all", path = 1
  ),
  path_C = textstat_collocations(
    toks2, size = 2, tolower = FALSE, method = "all", path = 2
  ),
  times = 2,
  unit = "relative"
)

compare minimum counts

# Tokenise the State of the Union corpus and drop English stopwords;
# padding = TRUE is passed through to tokens_remove().
toks2 <- tokens(quantedaData::data_corpus_SOTU)
toks2 <- tokens_remove(toks2, stopwords("english"), padding = TRUE)

# Both expressions use path = 2; the only argument that differs is min_count.
# The labels therefore name the min_count value being timed (they previously
# read path_R / path_C, copied from the path comparison chunks, which
# mislabelled the rows of the benchmark output).
microbenchmark::microbenchmark(
  min_count_2 = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "lambda", min_count = 2, path = 2
  ),
  min_count_1 = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "lambda", min_count = 1, path = 2
  ),
  times = 2,
  unit = "relative"
)

trigram

# Rebuild the stopword-free token set from the inaugural corpus.
toks2 <- tokens_remove(
  tokens(data_corpus_inaugural),
  stopwords("english"),
  padding = TRUE
)

# Same comparison as the bigram chunk, but for size-3 collocations:
# path = 1 versus path = 2, two timed runs each.
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "all", path = 1
  ),
  path_C = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "all", path = 2
  ),
  times = 2,
  unit = "relative"
)

4-grams

# Inaugural corpus tokens with English stopwords removed
# (padding = TRUE handed to tokens_remove()).
toks2 <- tokens_remove(
  tokens(data_corpus_inaugural),
  stopwords("english"),
  padding = TRUE
)

# Size-4 collocations: benchmark path = 1 against path = 2.
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 4, tolower = FALSE, method = "all", path = 1
  ),
  path_C = textstat_collocations(
    toks2, size = 4, tolower = FALSE, method = "all", path = 2
  ),
  times = 2,
  unit = "relative"
)