#' @title Construct a Sentencepiece model
#' @description Construct a Sentencepiece model on text.
#' @param x a character vector of path(s) to the text files containing training data
#' @param type either one of 'bpe', 'char', 'unigram' or 'word' for Byte Pair Encoding, Character level encoding,
#' Unigram encoding or pretokenised word encoding. Defaults to 'bpe' (Byte Pair Encoding).
#' @param vocab_size integer indicating the number of tokens in the final vocabulary. Defaults to 8000.
#' @param coverage fraction of characters covered by the model. Must be in the range [0, 1]. Defaults to 0.9999.
#' @param model_prefix character string with the name of the model. Defaults to 'sentencepiece'.
#' When executing the function, two files will be created in the directory specified by \code{model_dir}, namely
#' sentencepiece.model with the model and sentencepiece.vocab containing the vocabulary of the model.
#' You can change the name of the model by providing the \code{model_prefix} argument.
#' @param model_dir directory where the model will be saved. Defaults to the temporary directory (tempdir())
#' @param threads integer indicating the number of threads to use when building the model. Defaults to 1.
#' @param args character string with arguments passed on to sentencepiece::SentencePieceTrainer::Train (for expert use only)
#' @param verbose logical indicating to show progress of sentencepiece training. Defaults to \code{FALSE}.
#' @return an object of class \code{sentencepiece}, the structure of which is documented in \code{\link{sentencepiece_load_model}}
#' @seealso \code{\link{sentencepiece_load_model}}
#' @export
#' @examples
#' library(tokenizers.bpe)
#' data(belgium_parliament, package = "tokenizers.bpe")
#' path <- "traindata.txt"
#' folder <- getwd()
#' \dontshow{
#' path <- tempfile("traindata_", fileext = ".txt")
#' folder <- tempdir()
#' }
#' writeLines(belgium_parliament$text, con = path)
#' \dontshow{
#' model <- sentencepiece(path, type = "char", vocab_size = 30, model_dir = folder)
#' model <- sentencepiece(path, type = "unigram", vocab_size = 50, model_dir = folder)
#' model <- sentencepiece(path, type = "bpe", vocab_size = 200, model_dir = folder)
#' }
#' \donttest{
#' model <- sentencepiece(path, type = "char",
#' model_dir = folder, verbose = TRUE)
#' model <- sentencepiece(path, type = "unigram", vocab_size = 20000,
#' model_dir = folder, verbose = TRUE)
#' model <- sentencepiece(path, type = "bpe", vocab_size = 4000,
#' model_dir = folder, verbose = TRUE)
#'
#' txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.",
#' "On est d'accord sur le prix de la biere?")
#' sentencepiece_encode(model, x = txt, type = "subwords")
#' sentencepiece_encode(model, x = txt, type = "ids")
#'
#' model <- sentencepiece_load_model(file.path(folder, "sentencepiece.model"))
#' sentencepiece_encode(model, x = txt, type = "subwords")
#' sentencepiece_encode(model, x = txt, type = "ids")
#' }
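#'
#' ## Sketch of the expert-only `args` interface (not run): you build the raw
#' ## trainer argument string yourself; the flags below mirror the ones the
#' ## function constructs when `args` is missing. `x` is ignored in that case
#' ## and 'myspm' is just an illustrative model prefix.
#' \dontrun{
#' args <- sprintf("--input=%s --model_prefix=myspm --vocab_size=200 --model_type=bpe",
#'                 path)
#' model <- sentencepiece(args = args, model_dir = folder)
#' }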
#'
#' \dontshow{
#' # clean up for CRAN
#' file.remove(file.path(folder, "sentencepiece.model"))
#' file.remove(file.path(folder, "sentencepiece.vocab"))
#' file.remove(path)
#' }
sentencepiece <- function(x, type = c("bpe", "char", "unigram", "word"), vocab_size = 8000, coverage = 0.9999,
                          model_prefix = "sentencepiece",
                          model_dir = tempdir(), threads = 1L, args, verbose = FALSE){
  oldwd <- getwd()
  on.exit(setwd(oldwd))
  setwd(model_dir)
  if(grepl(pattern = " ", model_prefix)){
    stop("model_prefix should not contain spaces")
  }
  type <- match.arg(type)
  if(missing(args)){
    stopifnot(is.character(x))
    stopifnot(all(file.exists(x)))
    args <- sprintf("--input=%s --model_prefix=%s --vocab_size=%s --character_coverage=%s --model_type=%s --num_threads=%s",
                    paste(x, collapse = ","), model_prefix, vocab_size, coverage, type, threads)
  }else{
    args <- as.character(args)
    ## extract the model prefix from the user-provided argument string;
    ## the prefix can not contain spaces, so match up to the next space
    ## (matching up to the next flag would break when several flags follow)
    model_prefix <- regmatches(args, regexpr(args, pattern = "model_prefix=[^ ]+"))
    model_prefix <- gsub("^model_prefix=", "", model_prefix)
    if(length(model_prefix) == 0 || nchar(model_prefix) == 0){
      stop("Please provide at least model_prefix")
    }
  }
  if(verbose){
    result <- spc_train(args)
  }else{
    msg <- capture.output(result <- spc_train(args))
  }
  model <- file.path(getwd(), sprintf("%s.model", model_prefix))
  result <- sentencepiece_load_model(file = model)
  result
}
#' @export
print.sentencepiece <- function(x, ...){
  cat("Sentencepiece model", sep = "\n")
  cat(sprintf(" size of the vocabulary: %s", x$vocab_size), sep = "\n")
  cat(sprintf(" model stored at: %s", x$model_path), sep = "\n")
}
#' @title Load a Sentencepiece model
#' @description Load a Sentencepiece model which either was trained with \code{\link{sentencepiece}} or which you have found in the wild.
#' @param file path to the file containing the Sentencepiece model
#' @return an object of class \code{sentencepiece} which is a list with elements
#' \itemize{
#' \item{model: an Rcpp pointer to the model}
#' \item{model_path: the path to the model}
#' \item{vocab_size: the size of the Sentencepiece vocabulary}
#' \item{vocabulary: the Sentencepiece vocabulary which is a data.frame with columns id and subword}
#' }
#' @export
#' @examples
#' model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model")
#' model <- sentencepiece_load_model(file = model)
#'
#' txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.",
#' "On est d'accord sur le prix de la biere?")
#' sentencepiece_encode(model, x = txt, type = "subwords")
#' sentencepiece_encode(model, x = txt, type = "ids")
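#'
#' ## The returned object is a list; its vocabulary element is a
#' ## data.frame with columns id and subword
#' model$vocab_size
#' head(model$vocabulary)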
sentencepiece_load_model <- function(file = "sentencepiece.model"){
  stopifnot(file.exists(file))
  model <- spc_load_model(file)
  Encoding(model$vocabulary$subword) <- "UTF-8"
  class(model) <- "sentencepiece"
  model
}
#' @title Tokenise text with a Sentencepiece model
#' @description Tokenise text with a Sentencepiece model
#' @param model an object of class \code{sentencepiece} as returned by \code{\link{sentencepiece_load_model}} or \code{\link{sentencepiece}}
#' @param x a character vector of text (in UTF-8 Encoding)
#' @param type a character string, either 'subwords' or 'ids' to get the subwords or the corresponding ids of these subwords as defined in the vocabulary of the model.
#' Defaults to 'subwords'.
#' @param nbest integer indicating the number of segmentations to extract. See the details. If you do not provide a value, the argument is not used.
#' @param alpha smoothing parameter to perform subword regularisation. Typical values are 0.1, 0.2 or 0.5. See the details. The argument is only used if you provide a value for both \code{alpha} and \code{nbest}.
#' @return a list of tokenised texts, one for each element of \code{x},
#' unless you provide \code{nbest} without providing \code{alpha}, in which case the result is a list of lists of \code{nbest} tokenised texts
#' @details
#' If you specify \code{alpha} to perform subword regularisation, keep in mind the following. \cr
#' When alpha is 0.0, one segmentation is uniformly sampled from the \code{nbest} or lattice.
#' The best Viterbi segmentation is more likely sampled when setting larger \code{alpha} values like 0.1. \cr
#' \itemize{
#' \item If you provide a positive value for \code{nbest}, one segmentation is (approximately) sampled from the \code{nbest} candidates.
#' \item If you provide a negative value for \code{nbest}, one segmentation is sampled from all hypotheses (the lattice) according to the generation probabilities, using the forward-filtering and backward-sampling algorithm.
#' }
#' \code{nbest} and \code{alpha} correspond respectively to the parameters \code{l} and \code{alpha}
#' in the paper \url{https://arxiv.org/abs/1804.10959}, where \code{nbest} < 0 means l = infinity.\cr
#'
#' If the model is a BPE model, \code{alpha} is the merge probability \code{p} explained in \url{https://arxiv.org/abs/1910.13267}.
#' In a BPE model, nbest-based sampling is not supported, so the \code{nbest} parameter is ignored although
#' it still needs to be provided if you want to make use of \code{alpha}.
#' @export
#' @examples
#' model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model")
#' model <- sentencepiece_load_model(file = model)
#'
#' txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.",
#' "On est d'accord sur le prix de la biere?")
#' sentencepiece_encode(model, x = txt, type = "subwords")
#' sentencepiece_encode(model, x = txt, type = "ids")
#'
#' ## Examples using subword regularisation
#' model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer-unigram.model")
#' model <- sentencepiece_load_model(file = model)
#'
#' txt <- c("Goed zo",
#' "On est d'accord")
#' sentencepiece_encode(model, x = txt, type = "subwords", nbest = 4)
#' sentencepiece_encode(model, x = txt, type = "ids", nbest = 4)
#' sentencepiece_encode(model, x = txt, type = "subwords", nbest = 2)
#' sentencepiece_encode(model, x = txt, type = "ids", nbest = 2)
#' sentencepiece_encode(model, x = txt, type = "subwords", nbest = 1)
#' sentencepiece_encode(model, x = txt, type = "ids", nbest = 1)
#' sentencepiece_encode(model, x = txt, type = "subwords", nbest = 4, alpha = 0.1)
#' sentencepiece_encode(model, x = txt, type = "ids", nbest = 4, alpha = 0.1)
#' sentencepiece_encode(model, x = txt, type = "subwords", nbest = -1, alpha = 0.1)
#' sentencepiece_encode(model, x = txt, type = "ids", nbest = -1, alpha = 0.1)
#' sentencepiece_encode(model, x = txt, type = "subwords", nbest = -1, alpha = 0)
#' sentencepiece_encode(model, x = txt, type = "ids", nbest = -1, alpha = 0)
sentencepiece_encode <- function(model, x, type = c("subwords", "ids"), nbest = -1L, alpha = 0.1){
  type <- match.arg(type)
  if(type == "ids"){
    if(!missing(nbest) && !missing(alpha)){
      x <- spc_encode_as_ids_sample(model$model, x, as.integer(nbest), as.numeric(alpha))
    }else if(!missing(nbest)){
      x <- spc_encode_as_ids_nbest(model$model, x, as.integer(nbest))
    }else{
      x <- spc_encode_as_ids(model$model, x)
    }
  }else if(type == "subwords"){
    if(!missing(nbest) && !missing(alpha)){
      x <- spc_encode_as_subwords_sample(model$model, x, as.integer(nbest), as.numeric(alpha))
      x <- lapply(x, FUN = function(x){
        Encoding(x) <- "UTF-8"
        x
      })
    }else if(!missing(nbest)){
      x <- spc_encode_as_subwords_nbest(model$model, x, as.integer(nbest))
      x <- lapply(x, FUN = function(x){
        x <- lapply(x, FUN = function(x){
          Encoding(x) <- "UTF-8"
          x
        })
        x
      })
    }else{
      x <- spc_encode_as_subwords(model$model, x)
      x <- lapply(x, FUN = function(x){
        Encoding(x) <- "UTF-8"
        x
      })
    }
  }
  x
}
#' @title Decode encoded sequences back to text
#' @description Decode a sequence of Sentencepiece ids into text again
#' @param model an object of class \code{sentencepiece} as returned by \code{\link{sentencepiece_load_model}} or \code{\link{sentencepiece}}
#' @param x an integer vector of Sentencepiece ids or a list of these
#' @export
#' @return a character vector of detokenised text or, if you encoded with \code{nbest}, a list of these
#' @examples
#' model <- system.file(package = "sentencepiece", "models", "nl-fr-dekamer.model")
#' model <- sentencepiece_load_model(file = model)
#'
#' txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.",
#' "On est d'accord sur le prix de la biere?")
#'
#' x <- sentencepiece_encode(model, x = txt, type = "subwords")
#' sentencepiece_decode(model, x)
#' x <- sentencepiece_encode(model, x = txt, type = "ids")
#' sentencepiece_decode(model, x)
#'
#' model <- system.file(package = "sentencepiece", "models",
#'                      "nl-fr-dekamer-unigram.model")
#' model <- sentencepiece_load_model(file = model)
#' x <- sentencepiece_encode(model, x = txt, type = "subwords", nbest = 3)
#' sentencepiece_decode(model, x)
#' x <- sentencepiece_encode(model, x = txt, type = "subwords",
#'                           nbest = 3, alpha = 0.1)
#' sentencepiece_decode(model, x)
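#'
#' ## Decoding sampled ids works the same way; an extra illustration
#' ## combining the encoder's nbest/alpha arguments with type = "ids"
#' x <- sentencepiece_encode(model, x = txt, type = "ids",
#'                           nbest = 3, alpha = 0.1)
#' sentencepiece_decode(model, x)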
sentencepiece_decode <- function(model, x){
  if(is.integer(x) || (is.list(x) && is.integer(x[[1]]))){
    f <- spc_decode_ids
  }else{
    f <- spc_decode_subwords
  }
  if(is.list(x) && is.list(x[[1]])){
    x <- lapply(x, FUN = function(x){
      sentencepiece_decode(model, x)
    })
  }else{
    if(!is.list(x)){
      x <- f(model$model, x)
      Encoding(x) <- "UTF-8"
    }else{
      x <- lapply(x, FUN = function(x){
        x <- f(model$model, x)
        Encoding(x) <- "UTF-8"
        x
      })
    }
    x <- unlist(x)
  }
  x
}