test_that("sklearn works", {
# set seed on both the R and Python sides for reproducibility
seed <- 42L
set.seed(seed)
reticulate::py_set_seed(seed)
# preprocess
data(corpus, package = "gensimr")
docs <- prepare_documents(corpus)
dictionary <- corpora_dictionary(docs)
corpus_bow <- doc2bow(dictionary, docs)
corpus_mm <- serialize_mmcorpus(corpus_bow, auto_delete = FALSE)
tfidf <- model_tfidf(corpus_mm)
corpus_transformed <- wrap(tfidf, corpus_bow)
data("authors", package = "gensimr")
# author topic
auth2doc <- auth2doc(authors, name, document)
expect_type(auth2doc, "environment")
temp <- tempfile("serialized")
atmodel <- sklearn_at(
id2word = dictionary,
author2doc = auth2doc,
num_topics = 2L,
passes = 100L,
serialized = TRUE,
serialization_path = temp
)
unlink(temp, recursive = TRUE)
expect_type(atmodel, "environment")
fit <- atmodel$fit(corpus_bow)$transform("jack") %>%
reticulate::py_to_r()
expect_length(fit, 2)
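# Hedged sketch (not asserted in the original test): assuming the
# transform returns the author's probability distribution over the
# 2 topics, the values should sum to roughly 1.
expect_equal(sum(unlist(fit)), 1, tolerance = 1e-4)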
# doc2vec
d2v <- sklearn_doc2vec(min_count = 1, size = 5)
expect_type(d2v, "environment")
vectors <- d2v$fit_transform(docs) %>%
reticulate::py_to_r()
expect_length(vectors, 45) # 9 docs * size of 5 = 45
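# Hedged sketch (not asserted in the original test): assuming py_to_r
# returns a 9 x 5 numeric matrix, one row per document and one column
# per embedding dimension.
expect_equal(dim(vectors), c(9L, 5L))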
# hdp
hdp <- sklearn_hdp(id2word = dictionary)
expect_type(hdp, "environment")
vectors <- hdp$fit_transform(corpus_bow) %>%
reticulate::py_to_r()
expect_length(vectors, 81) # 9 docs * 9 columns
# lda
lda <- sklearn_lda(
num_topics = 2L,
id2word = dictionary,
iterations = 20L,
random_state = 1L
)
expect_type(lda, "environment")
trans <- lda$fit_transform(corpus_bow) %>%
reticulate::py_to_r()
expect_length(trans, 18) # 9 docs * 2 topics
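# Hedged sketch (not asserted in the original test): assuming trans is a
# 9 x 2 matrix of per-document topic probabilities, each row should sum
# to roughly 1.
expect_equal(rowSums(trans), rep(1, 9), tolerance = 1e-4)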
# lsi
lsi <- sklearn_lsi(id2word = dictionary, num_topics = 15L)
expect_type(lsi, "environment")
# L2 reg classifier
clf <- sklearn_logistic(penalty = "l2", C = 0.1, solver = "lbfgs")
# sklearn pipeline chaining the LSI transformer and the classifier
pipe <- sklearn_pipeline(lsi, clf)
# Create some random binary labels for our documents.
labels <- sample(c(0L, 1L), 9, replace = TRUE)
# How well does our pipeline perform on the training set?
fit <- pipe$fit(corpus_bow, labels)$score(corpus_bow, labels) %>%
reticulate::py_to_r()
expect_gt(fit, .7)
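# Hedged sketch (illustration, not part of the original test): a fitted
# sklearn pipeline ending in a classifier also exposes predict(),
# returning one label per document.
preds <- pipe$predict(corpus_bow) %>%
  reticulate::py_to_r()
expect_length(preds, 9)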
# random projections
rp_model <- sklearn_rp(id2word = dictionary)
expect_type(rp_model, "environment")
rp_fit <- rp_model$fit(corpus_bow)
# Use the trained model to transform a document.
result <- rp_fit$transform(corpus_bow) %>%
reticulate::py_to_r()
expect_length(result, 2700) # 9 docs * 300 dimensions (default num_topics)
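# Hedged sketch (not asserted in the original test): assuming py_to_r
# returns a dense 9 x 300 matrix, the shape matches the length check above.
expect_equal(dim(result), c(9L, 300L))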
# phrase detection
corpus_split <- corpus %>%
purrr::map(strsplit, " ") %>%
purrr::map(function(x){
sentence <- x[[1]]
tolower(sentence)
})
# Create the model: with min_count = 1 no term is ignored, and combinations seen 2+ times are captured.
pt_model <- sklearn_pt(min_count = 1, threshold = 2)
# Use sklearn fit_transform to see the transformation.
pt_trans <- pt_model$fit_transform(corpus_split)
# Since "graph" and "minors" were seen together 2+ times, they are considered a phrase.
expect_true("graph_minors" %in% reticulate::py_to_r(pt_trans)[[9]])
# word id mapping
skbow_model <- sklearn_doc2bow()
# fit
corpus_skbow <- skbow_model$fit_transform(corpus) %>%
reticulate::py_to_r()
expect_length(corpus_skbow, 9)
# tfidf
tfidf_model <- sklearn_tfidf(dictionary = dictionary)
tfidf_w_sklearn <- tfidf_model$fit_transform(corpus_bow)
# same as with gensim
expect_true(corpus_transformed[[1]] == tfidf_w_sklearn[[1]])
# cleanup
delete_mmcorpus(corpus_mm)
})