Nothing
# Raw example sentences that serve as the documents of the test corpus.
corpus_original <- c(
  "The rabbit munched the orange carrot.",
  "The snake hugged the green lizard.",
  "The hedgehog impaled the orange orange.",
  "The squirrel buried the brown nut."
)

# Words that are stripped from every document before indexing.
stopwords <- c("the", "a", "an", "and")

# Preprocess the corpus: lower-case, drop punctuation, drop stopwords, trim.
# This is a deliberately simple example and not the best approach for larger
# amounts of text. The steps run inside local() so that the intermediate
# values do not leak into the surrounding environment.
corpus <- local({
  # Matches any stopword as a whole word together with the spaces after it.
  stopword_pattern <- paste0(
    "\\b(", paste(stopwords, collapse = "|"), ") *\\b"
  )
  lowered <- tolower(corpus_original)
  without_punct <- gsub("[[:punct:]]", "", lowered)
  without_stopwords <- gsub(stopword_pattern, "", without_punct)
  trimws(without_stopwords)
})

# Metadata that accompanies the corpus: one row per document holding the
# untouched original text and a label for its source.
metadata <- data.frame(
  text_original = corpus_original,
  source = c("book1", "book2", "book3", "book4")
)
test_that("BM25 works", {
  # Build the search object from the preprocessed corpus and its metadata.
  index <- BM25$new(data = corpus, metadata = metadata)

  # The object is expected to carry exactly the BM25 / R6 class vector.
  expect_equal(class(index), c("BM25", "R6"))
  # No language is passed to the constructor above; "Detect" is expected back.
  expect_equal(index$get_lang(), "Detect")

  # get_data() is expected to return the indexed text followed by the
  # metadata columns, one row per document.
  want_data <- data.frame(
    text = corpus,
    text_original = corpus_original,
    source = c("book1", "book2", "book3", "book4")
  )
  expect_equal(index$get_data(), want_data)

  # Named vector of language code -> language name the object should report.
  want_languages <- c(
    ar = "arabic",
    da = "danish",
    nl = "dutch",
    en = "english",
    fr = "french",
    de = "german",
    el = "greek",
    hu = "hungarian",
    it = "italian",
    no = "norwegian",
    pt = "portuguese",
    ro = "romanian",
    ru = "russian",
    es = "spanish",
    sv = "swedish",
    ta = "tamil",
    tr = "turkish",
    auto = "detect"
  )
  expect_equal(index$available_languages(), want_languages)

  # Querying "orange" with max_n = 2: document 3 (contains "orange" twice) is
  # expected first, document 1 (contains it once) second.
  top_ids <- c(3, 1)
  hits <- index$query(query = "orange", max_n = 2)
  want_hits <- data.frame(
    id = top_ids,
    score = c(0.49042809, 0.35667497),
    rank = c(1, 2),
    text = corpus[top_ids],
    text_original = corpus_original[top_ids],
    source = c("book3", "book1")
  )
  expect_equal(hits, want_hits)
})
test_that("bm25_score works", {
  # One expected score per corpus document, in corpus order; the two documents
  # that do not contain "orange" (2 and 4) are expected to score zero.
  want_scores <- c(0.35667497, 0, 0.49042809, 0)
  expect_equal(bm25_score(data = corpus, query = "orange"), want_scores)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.