# R/tokenization.R


# Copyright 2021 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# wordpiece_tokenize ----------------------------------------------------

#' Tokenize Sequence with Word Pieces
#'
#' Given a sequence of text and a wordpiece vocabulary, tokenizes the text.
#'
#' @inheritParams .wp_tokenize_single_string
#' @param text Character; text to tokenize.
#'
#' @return A list of named integer vectors, giving the tokenization of the input
#'   sequences. The integer values are the token ids, and the names are the
#'   tokens.
#' @export
#'
#' @examples
#' tokens <- wordpiece_tokenize(
#'   text = c(
#'     "I love tacos!",
#'     "I also kinda like apples."
#'   )
#' )
wordpiece_tokenize <- function(text,
                               vocab = wordpiece_vocab(),
                               unk_token = "[UNK]",
                               max_chars = 100) {
  is_cased <- .get_casedness(vocab)
  # Normalize the vocabulary to a plain character vector of tokens.
  vocab <- .process_vocab(vocab)

  if (!is_cased) {
    text <- tolower(text)
  }

  # Clean the text and split it into words, with punctuation separated out
  # (BERT-style preprocessing, applied before wordpiece tokenization).
  text <- piecemaker::prepare_and_tokenize(
    text = text,
    prepare = TRUE,
    remove_terminal_hyphens = FALSE
  )

  tokens <- lapply(
    X = text,
    FUN = .wp_tokenize_single_string,
    vocab = vocab,
    unk_token = unk_token,
    max_chars = max_chars
  )
  return(tokens)
}
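
# A minimal sketch of the output shape, using a toy vocabulary rather than the
# default wordpiece_vocab() (the toy vocab and its token ids are illustrative
# assumptions, not part of the package):
#
#   toy_vocab <- c("[UNK]", "i", "love", "taco", "##s", "!")
#   wordpiece_tokenize("I love tacos!", vocab = toy_vocab)
#   #> [[1]]
#   #>    i love taco  ##s    !
#   #>    1    2    3    4    5
#
# The ids are 0-based positions in the vocabulary; "tacos" is split into the
# in-vocabulary pieces "taco" + "##s".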


# .wp_tokenize_single_string -------------------------------------------------

#' Tokenize an Input Word by Word
#'
#' @param words Character; a vector of words (generated by space-tokenizing a
#'   single input).
#' @inheritParams .wp_tokenize_word
#'
#' @return A named integer vector of token ids; the names are the tokens.
#' @keywords internal
.wp_tokenize_single_string <- function(words,
                                       vocab,
                                       unk_token,
                                       max_chars) {
  token_vector <- unlist(
    lapply(
      X = words,
      FUN = .wp_tokenize_word,
      vocab = vocab,
      unk_token = unk_token,
      max_chars = max_chars
    )
  )
  # Get IDs by position.
  ids <- fastmatch::fmatch(token_vector, vocab)
  names(ids) <- token_vector
  return(ids - 1L) # default to 0-based index, for historical consistency
}
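
# For example (toy vocabulary, illustrative only): with
# vocab <- c("[UNK]", "hello", "##s"), the input c("hello", "hellos") yields
# c(hello = 1L, hello = 1L, `##s` = 2L). fmatch() returns the 1-based
# positions (2, 2, 3), and subtracting 1 gives the 0-based token ids.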


# .wp_tokenize_word -----------------------------------------------------------

#' Tokenize a Word
#'
#' Tokenize a single "word" (no whitespace). The word can technically contain
#' punctuation, but in BERT's tokenization, punctuation has been split out by
#' this point.
#'
#' @param word Word to tokenize.
#' @param vocab Character vector of vocabulary tokens. The tokens are assumed to
#'   be in order of index, with the first index taken as zero to be compatible
#'   with Python implementations.
#' @param unk_token Token to represent unknown words.
#' @param max_chars Maximum length of a word recognized by the tokenizer;
#'   longer words are mapped to `unk_token`.
#'
#' @return Input word as a character vector of tokens.
#' @keywords internal
#' @importFrom fastmatch %fin%
.wp_tokenize_word <- function(word,
                              vocab,
                              unk_token = "[UNK]",
                              max_chars = 100) {
  word_len <- stringi::stri_length(word)
  if (word_len > max_chars) {
    return(unk_token)
  }
  if (word %fin% vocab) {
    return(word)
  }

  is_bad <- FALSE
  start <- 1
  sub_tokens <- character(0)
  while (start <= word_len) {
    end <- word_len

    cur_substr <- NA_character_
    while (start <= end) {
      sub_str <- substr(word, start, end) # inclusive on both ends
      if (start > 1) { # means this substring is a suffix, so add '##'
        sub_str <- paste0("##", sub_str)
      }
      if (sub_str %fin% vocab) {
        cur_substr <- sub_str
        break
      }
      end <- end - 1
    }
    if (is.na(cur_substr)) {
      is_bad <- TRUE # nocov
      break # nocov
    }

    sub_tokens <- append(sub_tokens, cur_substr)
    start <- end + 1 # pick up where we left off
  }

  if (is_bad) {
    return(unk_token) # nocov
  }
  return(sub_tokens)
}
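
# A worked trace of the greedy longest-match-first loop above, with a toy
# vocabulary (an illustrative assumption):
#
#   .wp_tokenize_word("unaffable", vocab = c("un", "##aff", "##able"))
#   # pass 1: tries "unaffable", "unaffabl", ..., matches "un";    start -> 3
#   # pass 2: tries "##affable", "##affabl", ..., matches "##aff"; start -> 6
#   # pass 3: tries "##able", matches immediately;                 start -> 10
#   #> [1] "un"    "##aff" "##able"
#
# If any pass fails to match even a single character, the whole word is
# abandoned and mapped to `unk_token`.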


# .process_vocab -----------------------------------------------------------

#' Process a Vocabulary for Tokenization
#'
#' @param v An object of class `wordpiece_vocabulary` or a character vector.
#'
#' @return A character vector of tokens for tokenization.
#' @keywords internal
.process_vocab <- function(v) {
  UseMethod(".process_vocab", v)
}

#' @rdname dot-process_vocab
#' @keywords internal
#' @export
.process_vocab.default <- function(v) {
  stop("Unsupported vocabulary type. ",
       "The vocabulary should be a character vector ",
       "or an object of type `wordpiece_vocabulary.` ",
       "To use the default wordpiece vocabulary, see `wordpiece_vocab()`.")
}

#' @rdname dot-process_vocab
#' @keywords internal
#' @export
.process_vocab.wordpiece_vocabulary <- function(v) {
  return(.process_wp_vocab(v))
}

#' @rdname dot-process_vocab
#' @keywords internal
#' @export
.process_vocab.character <- function(v) {
  return(v)
}

#' Process a Wordpiece Vocabulary for Tokenization
#'
#' @param v An object of class `wordpiece_vocabulary`.
#'
#' @return A character vector of tokens for tokenization.
#' @keywords internal
.process_wp_vocab <- function(v) {
  UseMethod(".process_wp_vocab", v)
}

#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.default <- function(v) {
  stop("Unsupported vocabulary type. ",
       "The vocabulary should be an object of type `wordpiece_vocabulary.` ",
       "To use the default wordpiece vocabulary, see `wordpiece_vocab()`.")
}

#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.wordpiece_vocabulary <- function(v) {
  # The underlying data is a character vector or a named integer vector;
  # fall through to the corresponding method.
  NextMethod()
}

#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.integer <- function(v) {
  return(names(v)[order(v)])
}
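
# For example, a named integer vector with 0-based ids (presumably the
# storage format of older wordpiece_vocabulary objects):
#
#   v <- c("[PAD]" = 0L, "[UNK]" = 1L, "hello" = 2L)
#   names(v)[order(v)]
#   #> [1] "[PAD]" "[UNK]" "hello"
#
# Sorting by id puts each token at position id + 1, matching the 1-based
# character-vector convention used elsewhere in this file.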

#' @rdname dot-process_wp_vocab
#' @keywords internal
#' @export
.process_wp_vocab.character <- function(v) {
  return(v)
}


# .get_casedness ----------------------------------------------------------


#' Determine Casedness of Vocabulary
#'
#' @param v An object of class `wordpiece_vocabulary`, or a character vector.
#'
#' @return TRUE if the vocabulary is case-sensitive, FALSE otherwise.
#' @keywords internal
.get_casedness <- function(v) {
  UseMethod(".get_casedness", v)
}

#' @rdname dot-get_casedness
#' @keywords internal
#' @export
.get_casedness.default <- function(v) {
  stop("Unsupported vocabulary type. ",
       "The vocabulary should be a character vector ",
       "or an object of type `wordpiece_vocabulary.` ",
       "To use the default wordpiece vocabulary, see `wordpiece_vocab()`.")
}

#' @rdname dot-get_casedness
#' @keywords internal
#' @export
.get_casedness.wordpiece_vocabulary <- function(v) {
  return(attr(v, "is_cased"))
}

#' @rdname dot-get_casedness
#' @keywords internal
#' @export
.get_casedness.character <- function(v) {
  return(.infer_case_from_vocab(v))
}
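
# .infer_case_from_vocab() is defined elsewhere in this package. A plausible
# one-line sketch of its behavior (an assumption, not necessarily the
# package's exact implementation): a vocabulary is treated as cased if any
# token contains an uppercase letter.
#
#   .infer_case_from_vocab <- function(vocab) {
#     any(grepl("[A-Z]", vocab))
#   }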
