Nothing
w <- tibble(
document = rep(1:2, each = 5),
word = c(
"the", "quick", "brown", "fox", "jumped",
"over", "the", "lazy", "brown", "dog"
),
frequency = c(
1, 1, 1, 1, 2,
1, 1, 1, 1, 2
)
)
test_that("Can calculate TF-IDF", {
result <- w %>%
bind_tf_idf(word, document, frequency)
result2 <- w %>%
bind_tf_idf("word", "document", "frequency")
expect_equal(result, result2)
expect_equal(
select(w, document, word, frequency),
select(result, document, word, frequency)
)
expect_s3_class(result, "tbl_df")
expect_type(result$tf, "double")
expect_type(result$idf, "double")
expect_type(result$tf_idf, "double")
expect_equal(result$tf, rep(c(1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 3), 2))
expect_equal(result$idf[1:4], c(0, log(2), 0, log(2)))
expect_equal(result$tf_idf, result$tf * result$idf)
# preserves but ignores groups
result2 <- w %>%
group_by(document) %>%
bind_tf_idf(word, document, frequency)
expect_equal(length(groups(result2)), 1)
expect_equal(as.character(groups(result2)[[1]]), "document")
})
test_that("TF-IDF works when the document ID is a number", {
# example thanks to https://github.com/juliasilge/tidytext/issues/31
my_corpus <- dplyr::tibble(
id = rep(c(2, 3), each = 3),
word = c("an", "interesting", "text", "a", "boring", "text"),
n = c(1, 1, 3, 1, 2, 1)
)
tf_idf <- bind_tf_idf(my_corpus, word, id, n)
expect_false(any(is.na(tf_idf)))
expect_equal(tf_idf$tf_idf[c(3, 6)], c(0, 0))
})
test_that("tf-idf with tidyeval works", {
d <- tibble(txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
))
termvar <- quo("word")
documentvar <- quo("document")
countvar <- quo("frequency")
result <- w %>%
bind_tf_idf(!!termvar, !!documentvar, !!countvar)
expect_equal(
select(w, document, word, frequency),
select(result, document, word, frequency)
)
expect_s3_class(result, "tbl_df")
expect_type(result$tf, "double")
expect_type(result$idf, "double")
expect_type(result$tf_idf, "double")
expect_equal(result$tf, rep(c(1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 3), 2))
expect_equal(result$idf[1:4], c(0, log(2), 0, log(2)))
expect_equal(result$tf_idf, result$tf * result$idf)
result2 <- w %>%
group_by(document) %>%
bind_tf_idf(!!termvar, !!documentvar, !!countvar)
expect_equal(length(groups(result2)), 1)
expect_equal(as.character(groups(result2)[[1]]), "document")
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.