Nothing
## ----eval=FALSE---------------------------------------------------------------
# vignette("textreuse-pairwise", package = "textreuse")
# vignette("textreuse-minhash", package = "textreuse")
# vignette("textreuse-alignment", package = "textreuse")
## -----------------------------------------------------------------------------
library(textreuse)

# Locate the sample text bundled under the package's extdata/ats directory.
# mustWork = TRUE makes a missing file an immediate error; without it
# system.file() returns "" and that empty path would be handed on to
# TextReuseTextDocument().
file <- system.file("extdata/ats/remember00palm.txt",
                    package = "textreuse", mustWork = TRUE)

# Build one document from that file: attach a "publisher" metadata entry,
# tokenize with tokenize_ngrams using n = 5, and ask for the tokens to be kept
# (keep_tokens = TRUE).
doc <- TextReuseTextDocument(file = file, meta = list("publisher" = "ATS"),
                             tokenizer = tokenize_ngrams, n = 5,
                             keep_tokens = TRUE)
doc
## -----------------------------------------------------------------------------
# Show all metadata attached to `doc`.
meta(doc)
# Look up a single metadata field by name, here "id".
meta(doc, "id")
# Metadata can also be set by assignment: store 1865 under "date".
meta(doc, "date") <- 1865
# Peek at the first few tokens and the first few hashes of the document.
head(tokens(doc))
head(hashes(doc))
# Call wordcount() on the single document.
wordcount(doc)
## -----------------------------------------------------------------------------
# Directory of sample texts bundled with the package. mustWork = TRUE makes a
# missing directory an immediate error; without it system.file() returns ""
# and that empty path would be handed on to TextReuseCorpus().
dir <- system.file("extdata/ats", package = "textreuse", mustWork = TRUE)

# Build a corpus from every file in that directory, tokenizing with
# tokenize_ngrams using n = 5. progress = FALSE presumably turns off the
# progress indicator while loading.
corpus <- TextReuseCorpus(dir = dir, tokenizer = tokenize_ngrams, n = 5,
                          progress = FALSE)
corpus
## -----------------------------------------------------------------------------
# Show the names used to index the corpus.
names(corpus)
# Index with `[[` and a single name.
corpus[["remember00palm"]]
# Index with `[` and a character vector of names.
corpus[c("calltounconv00baxt", "lifeofrevrichard00baxt")]
## -----------------------------------------------------------------------------
# Call wordcount() on the whole corpus rather than on a single document.
wordcount(corpus)
## -----------------------------------------------------------------------------
# Sample two-line string (note the embedded "\n") fed to every tokenizer
# below; it is reused later when demonstrating hash_string().
text <- "How many roads must a man walk down\nBefore you'll call him a man?"
# Run each tokenizer on the same string so their outputs can be compared.
tokenize_words(text)
tokenize_sentences(text)
# n-gram tokenizer called with n = 3.
tokenize_ngrams(text, n = 3)
# Skip n-gram tokenizer called with n = 3 and k = 2.
tokenize_skip_ngrams(text, n = 3, k = 2)
## -----------------------------------------------------------------------------
# A four-line poem held in one string, lines separated by "\n"; used as input
# for the custom line tokenizer defined next.
poem <- "Roses are red\nViolets are blue\nI like using R\nAnd you should too"
# cat() writes the string out, so each "\n" shows up as a line break.
cat(poem)
# Custom tokenizer: split a single string into its lines.
#
# Args:
#   string: a character vector of length one.
#
# Returns:
#   A character vector with one element per non-empty line. Runs of line
#   breaks (LF, CR or CRLF) act as a single separator, and empty pieces
#   produced by leading or trailing line breaks are dropped, so no "" tokens
#   are returned. An empty input string yields character(0).
#
# Raises:
#   An error if `string` is not a length-one character vector, rather than
#   silently tokenizing only the first element.
tokenize_lines <- function(string) {
  stopifnot(is.character(string), length(string) == 1L)
  # Base strsplit() is sufficient here, so no string package is required.
  pieces <- strsplit(string, "[\r\n]+")[[1]]
  # A leading line break leaves an empty first piece; discard such blanks.
  pieces[nzchar(pieces)]
}
# Apply the custom line tokenizer defined above to the poem.
tokenize_lines(poem)
## -----------------------------------------------------------------------------
# Tokenize `text` into words, then pass the resulting tokens to hash_string().
hash_string(tokenize_words(text))
## -----------------------------------------------------------------------------
# Two lyric excerpts that share some lines. Each excerpt's three lines are
# joined with single spaces into one string, which is then split into words.
a <- tokenize_words(
  paste(
    c("How does it feel, how does it feel?",
      "To be without a home",
      "Like a complete unknown, like a rolling stone"),
    collapse = " "
  )
)
b <- tokenize_words(
  paste(
    c("How does it feel, how does it feel?",
      "To be on your own, with no direction home",
      "A complete unknown, like a rolling stone"),
    collapse = " "
  )
)

# Compare the two token vectors with each of the comparison functions.
jaccard_similarity(a, b)
jaccard_dissimilarity(a, b)
jaccard_bag_similarity(a, b)
ratio_of_matches(a, b)
## ----eval = FALSE-------------------------------------------------------------
# options("mc.cores" = 4L)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.