# Nothing
## ---- include = FALSE---------------------------------------------------------
# Chunk defaults for the rendered document: fold each chunk's source and output
# into a single block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
## ----default-vocabs-----------------------------------------------------------
library(wordpiece)

# With no `vocab` supplied, the default (uncased) vocabulary is used.
wordpiece_tokenize(text = "I like tacos!")

# A cased vocabulary also ships with the package; request it explicitly.
wordpiece_tokenize(
  text = "I like tacos!",
  vocab = wordpiece_vocab(cased = TRUE)
)
## ----example0-----------------------------------------------------------------
# Get path to sample vocabulary included with package. `mustWork = TRUE` makes
# a missing file an immediate, descriptive error here instead of silently
# passing an empty path ("") on to load_vocab().
vocab_path <- system.file(
  "extdata", "tiny_vocab.txt",
  package = "wordpiece",
  mustWork = TRUE
)
# Load the vocabulary.
vocab <- load_vocab(vocab_path)
# Take a peek at the vocabulary.
head(vocab)
## ----example1-----------------------------------------------------------------
# Tokenize a sentence against the sample vocabulary held in `vocab`.
wordpiece_tokenize(
  text = "I love tacos, apples, and tea!",
  vocab = vocab
)
## ----example2-----------------------------------------------------------------
# The above vocabulary was uncased.
attr(vocab, "is_cased")
# Here is the same vocabulary, but containing the capitalized token "Hi".
# `mustWork = TRUE` turns a missing file into an immediate error rather than
# an empty path ("") being handed to load_vocab().
vocab_path2 <- system.file(
  "extdata", "tiny_vocab_cased.txt",
  package = "wordpiece",
  mustWork = TRUE
)
vocab_cased <- load_vocab(vocab_path2)
head(vocab_cased)
# vocab_cased is inferred to be case-sensitive...
attr(vocab_cased, "is_cased")
# ... so the tokenization will *not* convert strings to lowercase, and so the
# words "I" and "And" are not found in the vocabulary (though "and" still is).
wordpiece_tokenize(text = "And I love tacos and salsa!", vocab = vocab_cased)
## ----example3-----------------------------------------------------------------
# Same cased vocabulary, now with a custom `unk_token` string. Presumably this
# is the placeholder emitted for words missing from the vocabulary -- see
# ?wordpiece_tokenize to confirm.
wordpiece_tokenize(
  text = "I love tacos!",
  vocab = vocab_cased,
  unk_token = "[missing]"
)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.