# Copyright 2021 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# wordpiece_tokenize ----------------------------------------------------
#' Tokenize Sequence with Word Pieces
#'
#' Given a sequence of text and a wordpiece vocabulary, tokenizes the text.
#'
#' @inheritParams .wp_tokenize_single_string
#' @param text Character; text to tokenize.
#'
#' @return A list of named integer vectors, giving the tokenization of the input
#'   sequences. The integer values are the token ids, and the names are the
#'   tokens.
#' @export
#'
#' @examples
#' tokens <- wordpiece_tokenize(
#'   text = c(
#'     "I love tacos!",
#'     "I also kinda like apples."
#'   )
#' )
wordpiece_tokenize <- function(text,
                               vocab = wordpiece_vocab(),
                               unk_token = "[UNK]",
                               max_chars = 100) {
  # Check casedness before the vocabulary is flattened to a bare character
  # vector; an uncased vocabulary means the input must be lowercased.
  if (!.get_casedness(vocab)) {
    text <- tolower(text)
  }
  vocab <- .process_vocab(vocab)
  # Split each input string into a vector of words, with punctuation
  # separated out, then wordpiece-tokenize each word vector.
  word_lists <- piecemaker::prepare_and_tokenize(
    text = text,
    prepare = TRUE,
    remove_terminal_hyphens = FALSE
  )
  lapply(
    word_lists,
    .wp_tokenize_single_string,
    vocab = vocab,
    unk_token = unk_token,
    max_chars = max_chars
  )
}
# .wp_tokenize_single_string -------------------------------------------------
#' Tokenize an Input Word-by-word
#'
#' @param words Character; a vector of words (generated by space-tokenizing a
#'   single input).
#' @inheritParams .wp_tokenize_word
#'
#' @return A named integer vector of tokenized words.
#' @keywords internal
.wp_tokenize_single_string <- function(words,
                                       vocab,
                                       unk_token,
                                       max_chars) {
  # Tokenize each word independently, then flatten into one token vector.
  piece_list <- lapply(
    words,
    .wp_tokenize_word,
    vocab = vocab,
    unk_token = unk_token,
    max_chars = max_chars
  )
  pieces <- unlist(piece_list)
  # Look up ids by vocabulary position (fastmatch for speed); ids are
  # 0-based for historical consistency with Python implementations.
  ids <- fastmatch::fmatch(pieces, vocab) - 1L
  names(ids) <- pieces
  ids
}
# .wp_tokenize_word -----------------------------------------------------------
#' Tokenize a Word
#'
#' Tokenize a single "word" (no whitespace). The word can technically contain
#' punctuation, but in BERT's tokenization, punctuation has been split out by
#' this point.
#'
#' @param word Word to tokenize.
#' @param vocab Character vector of vocabulary tokens. The tokens are assumed to
#'   be in order of index, with the first index taken as zero to be compatible
#'   with Python implementations.
#' @param unk_token Token to represent unknown words.
#' @param max_chars Maximum length of word recognized.
#'
#' @return Input word as a list of tokens.
#' @keywords internal
.wp_tokenize_word <- function(word,
                              vocab,
                              unk_token = "[UNK]",
                              max_chars = 100) {
  n_chars <- stringi::stri_length(word)
  # Overly long "words" map straight to the unknown token.
  if (n_chars > max_chars) {
    return(unk_token)
  }
  # Fast path: the whole word is already in the vocabulary.
  if (word %fin% vocab) {
    return(word)
  }
  # Greedy longest-match-first: repeatedly take the longest remaining
  # substring (with a '##' continuation marker when not word-initial)
  # that appears in the vocabulary.
  pieces <- character(0)
  from <- 1
  while (from <= n_chars) {
    matched <- NA_character_
    match_end <- 0
    for (to in seq(n_chars, from)) {
      candidate <- substr(word, from, to) # inclusive on both ends
      if (from > 1) { # this substring is a suffix piece, so add '##'
        candidate <- paste0("##", candidate)
      }
      if (candidate %fin% vocab) {
        matched <- candidate
        match_end <- to
        break
      }
    }
    if (is.na(matched)) {
      # No sub-piece of the remainder is in the vocabulary.
      return(unk_token) # nocov
    }
    pieces <- c(pieces, matched)
    from <- match_end + 1 # pick up where we left off
  }
  pieces
}
# .process_vocab -----------------------------------------------------------
#' Process a Vocabulary for Tokenization
#'
#' Generic; dispatches on the vocabulary's class.
#'
#' @param v An object of class `wordpiece_vocabulary` or a character vector.
#'
#' @return A character vector of tokens for tokenization.
#' @keywords internal
.process_vocab <- function(v) {
  # Dispatch on the class of the first argument.
  UseMethod(".process_vocab")
}
#' @rdname dot-process_vocab
#' @keywords internal
#' @export
.process_vocab.default <- function(v) {
  # Fail loudly for unsupported vocabulary classes. `call. = FALSE` keeps
  # the internal call out of the user-facing error. (Fixed punctuation:
  # the period belongs outside the backticks.)
  stop("Unsupported vocabulary type. ",
       "The vocabulary should be a character vector ",
       "or an object of type `wordpiece_vocabulary`. ",
       "To use the default wordpiece vocabulary, see `wordpiece_vocab()`.",
       call. = FALSE)
}
#' @rdname dot-process_vocab
#' @keywords internal
#' @export
.process_vocab.wordpiece_vocabulary <- function(v) {
  # Delegate to the wordpiece-specific processing generic.
  .process_wp_vocab(v)
}
#' @rdname dot-process_vocab
#' @keywords internal
#' @export
.process_vocab.character <- function(v) {
  # A plain character vector is already in the form the tokenizer needs.
  v
}
#' Process a Wordpiece Vocabulary for Tokenization
#'
#' Generic; dispatches on the vocabulary's class.
#'
#' @param v An object of class `wordpiece_vocabulary`.
#'
#' @return A character vector of tokens for tokenization.
#' @keywords internal
.process_wp_vocab <- function(v) {
  # Dispatch on the class of the first argument.
  UseMethod(".process_wp_vocab")
}
#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.default <- function(v) {
  # Fail loudly for unsupported vocabulary classes. `call. = FALSE` keeps
  # the internal call out of the user-facing error. (Fixed punctuation:
  # the period belongs outside the backticks.)
  stop("Unsupported vocabulary type. ",
       "The vocabulary should be an object of type `wordpiece_vocabulary`. ",
       "To use the default wordpiece vocabulary, see `wordpiece_vocab()`.",
       call. = FALSE)
}
#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.wordpiece_vocabulary <- function(v) {
  # Re-dispatch to the method for the underlying data type (the sibling
  # methods handle named-integer and character representations).
  NextMethod()
}
#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.integer <- function(v) {
  # The vocabulary is stored as a named integer vector (token -> id);
  # return the token names arranged in ascending id order, so that
  # position in the result implies index.
  names(sort(v))
}
#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.character <- function(v) {
  # Already a character vector of tokens; nothing to do.
  v
}
# .get_casedness ----------------------------------------------------------
#' Determine Casedness of Vocabulary
#'
#' Generic; dispatches on the vocabulary's class.
#'
#' @param v An object of class `wordpiece_vocabulary`, or a character vector.
#'
#' @return TRUE if the vocabulary is case-sensitive, FALSE otherwise.
#' @keywords internal
.get_casedness <- function(v) {
  # Dispatch on the class of the first argument.
  UseMethod(".get_casedness")
}
#' @rdname dot-get_casedness
#' @keywords internal
#' @export
.get_casedness.default <- function(v) {
  # Fail loudly for unsupported vocabulary classes. `call. = FALSE` keeps
  # the internal call out of the user-facing error. (Fixed punctuation:
  # the period belongs outside the backticks.)
  stop("Unsupported vocabulary type. ",
       "The vocabulary should be a character vector ",
       "or an object of type `wordpiece_vocabulary`. ",
       "To use the default wordpiece vocabulary, see `wordpiece_vocab()`.",
       call. = FALSE)
}
#' @rdname dot-get_casedness
#' @keywords internal
#' @export
.get_casedness.wordpiece_vocabulary <- function(v) {
  # Casedness is recorded as an attribute on the vocabulary object.
  attr(v, "is_cased")
}
#' @rdname dot-get_casedness
#' @keywords internal
#' @export
.get_casedness.character <- function(v) {
  # A bare character vector carries no casedness flag; infer it from the
  # tokens themselves.
  .infer_case_from_vocab(v)
}