context("Full text to sparse pipeline")
test_that("Full text to sparse pipeline", {
skip_on_cran()
library(quanteda)
data("inaugTexts")
set.seed(1)
M_plain <- textVectors(
inaugTexts,
normalize=FALSE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=FALSE,
ngrams=1,
skips=0,
stops=NULL,
pca=FALSE,
tsne=FALSE)
expect_equal(dim(M_plain$M), c(length(inaugTexts), length(M_plain$bagofwords)))
set.seed(1)
M_ngram <- textVectors(
inaugTexts,
normalize=FALSE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=FALSE,
ngrams=5,
skips=0,
stops=NULL,
pca=FALSE,
tsne=FALSE)
expect_equal(dim(M_ngram$M), c(length(inaugTexts), length(M_ngram$bagofwords)))
set.seed(1)
M_skip <- textVectors(
inaugTexts,
normalize=FALSE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=FALSE,
ngrams=1,
skips=5,
stops=NULL,
pca=FALSE,
tsne=FALSE)
expect_equal(dim(M_skip$M), c(length(inaugTexts), length(M_skip$bagofwords)))
set.seed(1)
M_norm <- textVectors(
inaugTexts,
normalize=TRUE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=FALSE,
ngrams=1,
skips=0,
stops=NULL,
pca=FALSE,
tsne=FALSE)
expect_equal(dim(M_norm$M), c(length(inaugTexts), length(M_norm$bagofwords)))
set.seed(1)
M_stem <- textVectors(
inaugTexts,
normalize=TRUE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=TRUE,
ngrams=1,
skips=0,
stops=NULL,
pca=FALSE,
tsne=FALSE)
set.seed(1)
expect_equal(dim(M_stem$M), c(length(inaugTexts), length(M_stem$bagofwords)))
M_pca <- textVectors(
inaugTexts,
normalize=FALSE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=FALSE,
ngrams=1,
skips=0,
stops=NULL,
pca=TRUE,
pca_comp=15,
pca_rotation=NULL,
tsne=FALSE)
expect_equal(dim(M_pca$M), c(length(inaugTexts), length(M_pca$bagofwords)))
set.seed(1)
M_tnse <- textVectors(
inaugTexts,
normalize=FALSE,
split_token=' ',
verbose=FALSE,
freqCutoff=0,
absCutoff=0,
tfidf=FALSE,
idf=NULL,
bagofwords=NULL,
spellcheck=FALSE,
remove_stopwords=FALSE,
stem=FALSE,
ngrams=1,
skips=0,
stops=NULL,
pca=TRUE,
pca_comp=15,
pca_rotation=NULL,
tsne=TRUE,
tsne_dims=2,
tsne_perplexity=3)
expect_equal(dim(M_tnse$M), c(length(inaugTexts), length(M_tnse$bagofwords)))
set.seed(1)
M_dictionary <- textVectors(inaugTexts, bagofwords=letters)
expect_equal(dim(M_dictionary$M), c(length(inaugTexts), length(letters)))
set.seed(1)
M_complicated <- textVectors(
inaugTexts,
normalize=TRUE,
split_token=' ',
verbose=FALSE,
freqCutoff=.05,
absCutoff=5,
tfidf=TRUE,
remove_stopwords=TRUE,
stem=TRUE,
ngrams=3,
skips=3,
pca=TRUE,
pca_comp=15,
tsne=TRUE,
tsne_dims=2,
tsne_perplexity=3)
expect_equal(dim(M_complicated$M), c(length(inaugTexts), length(M_complicated$bagofwords)))
expect_more_than(length(M_plain$bagofwords), length(M_norm$bagofwords))
expect_more_than(length(M_plain$bagofwords), length(M_stem$bagofwords))
expect_less_than(length(M_plain$bagofwords), length(M_ngram$bagofwords))
expect_less_than(length(M_plain$bagofwords), length(M_skip$bagofwords))
expect_more_than(length(M_plain$bagofwords), length(M_complicated$bagofwords))
expect_more_than(length(M_ngram$bagofwords), length(M_complicated$bagofwords))
expect_more_than(length(M_ngram$bagofwords), length(M_skip$bagofwords))
expect_more_than(length(M_skip$bagofwords), length(M_complicated$bagofwords))
expect_equal(M_plain$bagofwords, M_tnse$bagofwords)
expect_equal(M_plain$bagofwords, M_pca$bagofwords)
expect_equal(M_plain$M, M_tnse$M)
expect_equal(M_plain$M, M_pca$M)
expect_equal(M_plain$pca_rotation, NULL)
expect_equal(M_norm$pca_rotation, NULL)
expect_equal(M_stem$pca_rotation, NULL)
expect_equal(M_plain$tsne_proj, NULL)
expect_equal(M_norm$tsne_proj, NULL)
expect_equal(M_stem$tsne_proj, NULL)
expect_equal(M_tnse$pca_rotation, M_pca$pca_rotation)
expect_equal(M_tnse$pca_rotation, M_pca$pca_rotation)
#Check PCA dims
for(model in list(
M_pca,
M_tnse,
M_complicated
)){
expect_equal(dim(model$x), c(length(inaugTexts), 15))
expect_equal(ncol(model$pca_rotation), 15)
expect_equal(nrow(model$pca_rotation), length(model$bagofwords))
expect_equal(nrow(model$pca_rotation), ncol(model$M))
}
#Check PCA dims
for(model in list(
M_tnse,
M_complicated
)){
expect_equal(dim(model$tsne_proj), c(length(inaugTexts), 2))
}
#Predicition
plts <- tempfile()
png(plts)
for(model in list(
M_plain,
M_ngram,
M_skip,
M_norm,
M_stem,
M_pca
)){
plot(model)
print(model)
p <- predict(model, inaugTexts)
expect_equal(p$bagofwords, model$bagofwords)
expect_equal(p$M, model$M)
expect_equal(p$pca_rotation, model$pca_rotation)
expect_equal(p$tsne_proj, model$tsne_proj)
}
for(model in list(
M_tnse,
M_complicated
)){
plot(model)
print(model)
expect_warning(p <- predict(model, inaugTexts))
expect_equal(p$bagofwords, model$bagofwords)
expect_equal(p$M, model$M)
expect_equal(p$pca_rotation, model$pca_rotation)
}
dev.off()
unlink(plts)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.