# R/API.R

#' Load an existing fastText trained model
#'
#' Load and return a pointer to an existing model which will be used in other functions of this package.
#' @param path path to the existing model
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' @export
load_model <- function(path) {
  if (!grepl("\\.(bin|ftz)$", path)) {
    message("adding .bin extension to the path")
    path <- paste0(path, ".bin")
  }
  model <- new(fastrtext)
  model$load(path)
  model
}

#' Export hyperparameters
#'
#' Retrieve the hyperparameters used to train the model.
#' @param model trained `fastText` model
#' @return [list] containing each parameter
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' print(head(get_parameters(model), 5))
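#' # individual parameters can be accessed by name, e.g. the model type
#' print(get_parameters(model)$model_name)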
#'
#' @export
get_parameters <- function(model) {
  model$get_parameters()
}

#' Get list of known words
#'
#' Get a [character] containing each word seen during training.
#' @param model trained `fastText` model
#' @return [character] containing each word
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' print(head(get_dictionary(model), 5))
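#' # size of the vocabulary seen during training
#' print(length(get_dictionary(model)))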
#'
#' @export
get_dictionary <- function(model) {
  model$get_dictionary()
}

#' Get list of labels (supervised model)
#'
#' Get a [character] containing each label seen during training.
#' @param model trained `fastText` model
#' @return [character] containing each label
#' @importFrom assertthat assert_that
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' print(head(get_labels(model), 5))
#'
#' @export
get_labels <- function(model) {
  param <- model$get_parameters()
  assert_that(param$model_name == "supervised",
              msg = "This is not a supervised model.")
  model$get_labels()
}

#' Get predictions (for supervised model)
#'
#' Apply the trained model to new sentences.
#' Average word embeddings and search for the most similar `label` vector.
#' @param object trained `fastText` model
#' @param sentences [character] containing the sentences
#' @param k will return the `k` most probable labels (default = 1)
#' @param simplify when [TRUE] and `k` = 1, the function returns a (flat) [numeric] instead of a [list]
#' @param unlock_empty_predictions [logical] to avoid a crash when no prediction can be returned for some sentences because none of their words have been seen during training. This parameter should only be set to [TRUE] for debugging.
#' @param threshold minimum probability a label must reach to be part of the predictions (optional; 0.0 by default)
#' @param ... not used
#' @return [list] containing for each sentence the probability to be associated with `k` labels.
#' @examples
#'
#' library(fastrtext)
#' data("test_sentences")
#' model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' sentence <- test_sentences[1, "text"]
#' print(predict(model, sentence))
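#' # the k argument returns the k most probable labels per sentence
#' print(predict(model, sentence, k = 2))
#' # with k = 1, simplify = TRUE returns a flat named numeric
#' print(predict(model, sentence, simplify = TRUE))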
#'
#' @importFrom assertthat assert_that is.flag is.count
#' @export
predict.Rcpp_fastrtext <- function(object, sentences, k = 1, simplify = FALSE, unlock_empty_predictions = FALSE, threshold = 0.0, ...) {
  assert_that(is.flag(simplify),
              is.count(k))
  if (simplify) assert_that(k == 1, msg = "simplify can only be used with k == 1")

  predictions <- object$predict(sentences, k, threshold)

  # check empty predictions
  if (!unlock_empty_predictions) {
    assert_that(sum(lengths(predictions) == 0) == 0,
                msg = "Some sentences have no predictions. It may be because none of their words have been seen during training. You may want to use the -minn and -maxn hyperparameters related to subwords to increase your chances of having a prediction for each case.")
  }

  if (simplify) {
    unlist(predictions)
  } else {
    predictions
  }
}

#' Get word embeddings
#'
#' Return the vector representation of provided words (unsupervised training)
#' or provided labels (supervised training).
#'
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' get_word_vectors(model, c("introduction", "we"))
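#' # per the return value, embeddings come back as a matrix,
#' # one row per word, with the words as rownames
#' vecs <- get_word_vectors(model, c("introduction", "we"))
#' print(rownames(vecs))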
#'
#' @param model trained `fastText` model
#' @param words [character] of words. Default: return every word from the dictionary.
#' @return [matrix] containing each word embedding as a row and `rownames` are populated with word strings.
#' @importFrom assertthat assert_that
#' @export
get_word_vectors <- function(model, words = get_dictionary(model)) {
  assert_that(is.character(words))
  model$get_vectors(words)
}

#' Execute command on `fastText` model (including training)
#'
#' Use the same commands as the ones used on the command line.
#'
#' @param commands [character] of commands
#' @examples
#' \dontrun{
#' # Supervised learning example
#' library(fastrtext)
#'
#' data("train_sentences")
#' data("test_sentences")
#'
#' # prepare data
#' tmp_file_model <- tempfile()
#'
#' train_labels <- paste0("__label__", train_sentences[,"class.text"])
#' train_texts <- tolower(train_sentences[,"text"])
#' train_to_write <- paste(train_labels, train_texts)
#' train_tmp_file_txt <- tempfile()
#' writeLines(text = train_to_write, con = train_tmp_file_txt)
#'
#' test_labels <- paste0("__label__", test_sentences[,"class.text"])
#' test_texts <- tolower(test_sentences[,"text"])
#' test_to_write <- paste(test_labels, test_texts)
#'
#' # learn model
#' execute(commands = c("supervised", "-input", train_tmp_file_txt,
#'                      "-output", tmp_file_model, "-dim", 20, "-lr", 1,
#'                      "-epoch", 20, "-wordNgrams", 2, "-verbose", 1))
#'
#' model <- load_model(tmp_file_model)
#' predict(model, sentences = test_sentences[1, "text"])
#'
#' # Unsupervised learning example
#' library(fastrtext)
#'
#' data("train_sentences")
#' data("test_sentences")
#' texts <- tolower(train_sentences[,"text"])
#' tmp_file_txt <- tempfile()
#' tmp_file_model <- tempfile()
#' writeLines(text = texts, con = tmp_file_txt)
#' execute(commands = c("skipgram", "-input", tmp_file_txt, "-output", tmp_file_model, "-verbose", 1))
#'
#' model <- load_model(tmp_file_model)
#' dict <- get_dictionary(model)
#' get_word_vectors(model, head(dict, 5))
#' }
#' @export
execute <- function(commands) {
  model <- new(fastrtext)
  model$execute(c("fasttext", commands))
}

#' Distance between two words
#'
#' Distance is equal to `1 - cosine`
#' @param model trained `fastText` model
#' @param w1 first word to compare
#' @param w2 second word to compare
#' @return a `scalar` with the distance
#'
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' get_word_distance(model, "time", "timing")
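#' # the same value computed by hand as 1 - cosine similarity of the embeddings
#' e <- get_word_vectors(model, c("time", "timing"))
#' 1 - crossprod(e[1, ], e[2, ]) / sqrt(crossprod(e[1, ]) * crossprod(e[2, ]))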
#'
#' @importFrom assertthat assert_that is.string
#' @export
get_word_distance <- function(model, w1, w2) {
  assert_that(is.string(w1))
  assert_that(is.string(w2))
  embeddings <- get_word_vectors(model, c(w1, w2))
  1 - crossprod(embeddings[1, ], embeddings[2, ]) /
    sqrt(crossprod(embeddings[1, ]) * crossprod(embeddings[2, ]))
}

#' Hamming loss
#'
#' Compute the Hamming loss. When there is only one category, this measures the accuracy.
#' @param labels list of labels
#' @param predictions list returned by the predict command (including both the probability and the categories)
#' @return a `scalar` with the loss
#'
#' @examples
#'
#' library(fastrtext)
#' data("test_sentences")
#' model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' sentences <- test_sentences[, "text"]
#' test_labels <- test_sentences[, "class.text"]
#' predictions <- predict(model, sentences)
#' get_hamming_loss(as.list(test_labels), predictions)
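#' # with a single category per sentence, the loss equals the accuracy
#' mean(sapply(predictions, names) == test_labels)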
#'
#' @importFrom assertthat assert_that
#' @export
get_hamming_loss <- function(labels, predictions) {
  # per-pair Jaccard similarity: 1 - |symmetric difference| / |union|
  similarity <- function(a, b) {
    common <- union(a, b)
    difference <- setdiff(common, intersect(a, b))
    1 - length(difference) / length(common)
  }
  assert_that(is.list(labels))
  assert_that(is.list(predictions))
  assert_that(length(labels) == length(predictions))
  prediction_categories <- lapply(predictions, names)
  mean(mapply(similarity, prediction_categories, labels, USE.NAMES = FALSE))
}

#' Print help
#'
#' Print command information, mainly to be used with the [execute()] function.
#'
#' @examples
#' \dontrun{
#' print_help()
#' }
#'
#' @export
print_help <- function() {
  model <- new(fastrtext)
  model$print_help()
  rm(model)
}

#' Get nearest neighbour vectors
#'
#' Find the `k` words with the smallest distance.
#' First execution can be slow because of precomputation.
#' Search is done linearly; if your model is big, you may want to use an approximate nearest-neighbour algorithm from another R package (like RcppAnnoy).
#'
#' @param model trained `fastText` model
#' @param word reference word
#' @param k [integer] defining the number of results to return
#' @return [numeric] of distances, with [names] set to the corresponding words
#'
#' @examples
#'
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' get_nn(model, "time", 10)
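#' # the result is a named numeric: names hold the neighbour words
#' neighbours <- get_nn(model, "time", 10)
#' print(names(neighbours))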
#'
#' @importFrom assertthat assert_that is.string is.number
#' @export
get_nn <- function(model, word, k) {
  assert_that(is.string(word))
  assert_that(is.number(k))
  model$get_nn_by_word(word, k)
}

#' Add tags to documents
#'
#' Add tags in the `fastText` format.
#' This format is required for the training step. As fastText doesn't support newlines inside documents
#' (newlines delimit documents), this function also ensures that there are
#' no newlines left. By default, newlines are replaced by a single space.
#'
#' @param documents texts to learn
#' @param tags labels provided as a [list] or a [vector]. There can be 1 or more per document.
#' @param new_lines character used to replace newlines (`\\n`, `\\r`); default is a single space
#' @param prefix [character] to add in front of tag (`fastText` format)
#'
#' @return [character] ready to be written in a file
#'
#' @examples
#' library(fastrtext)
#' tags <- list(c(1, 5), 0)
#' documents <- c("this is a text", "this is another document")
#' add_tags(documents = documents, tags = tags)
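#' # newlines inside a document are replaced (by a single space by default)
#' add_tags(documents = "first line\\nsecond line", tags = "mylabel")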
#'
#' @importFrom assertthat assert_that is.string
#' @export
add_tags <- function(documents, tags, prefix = "__label__", new_lines = ' ') {
  assert_that(is.character(documents))
  assert_that(is.string(prefix))
  assert_that(length(documents) == length(tags))

  tags_to_include <- sapply(tags, FUN = function(t) paste0(prefix, t, collapse = " "))
  # replace newlines so that each document stays on a single line
  single_lined_documents <- gsub(x = documents, pattern = '[\\n\\r]+', replacement = new_lines, perl = TRUE)
  paste(tags_to_include, single_lined_documents)
}

#' Get sentence embedding
#'
#' Each sentence is split into words (using space characters), and word embeddings are averaged.
#'
#' @param model `fastText` model
#' @param sentences [character] containing the sentences
#' @examples
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' m <- get_sentence_representation(model, "this is a test")
#' print(m)
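#' # several sentences can be embedded in a single call
#' m2 <- get_sentence_representation(model, c("this is a test", "this is another test"))
#' print(m2)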
#' @importFrom assertthat assert_that
#' @export
get_sentence_representation <- function(model, sentences) {
  assert_that(is.character(sentences))
  model$get_sentence_embeddings(sentences)
}

#' Retrieve word IDs
#'
#' Get ID of words in the dictionary
#' @param model `fastText` model
#' @param words [character] containing words to retrieve IDs
#' @return [numeric] of IDs
#' @examples
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' ids <- get_word_ids(model, c("this", "is", "a", "test"))
#'
#' # print positions
#' print(ids)
#' # retrieve words in the dictionary using the positions retrieved
#' print(get_dictionary(model)[ids])
#' @importFrom assertthat assert_that
#' @export
get_word_ids <- function(model, words) {
  assert_that(is.character(words))
  model$get_word_ids(words)
}

#' Tokenize text
#'
#' Separate words in a text using space characters
#' @param model `fastText` model
#' @param texts a [character] containing the documents
#' @return a [list] of [character] containing words
#' @examples
#' library(fastrtext)
#' model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext")
#' model <- load_model(model_test_path)
#' tokens <- get_tokenized_text(model, "this is a test")
#' print(tokens)
#' tokens <- get_tokenized_text(model, c("this is a test 1", "this is a second test!"))
#' print(tokens)
#' @importFrom assertthat assert_that is.string
#' @export
get_tokenized_text <- function(model, texts) {
  assert_that(is.character(texts))
  lapply(texts, FUN = function(text) model$tokenize(text))
}

#' Build fasttext vectors
#'
#' @description
#' Trains a fasttext vector/unsupervised model following method described in
#' \href{https://arxiv.org/abs/1607.04606}{Enriching Word Vectors with Subword Information}
#' using the \href{https://fasttext.cc/}{fasttext} implementation.
#'
#' See \href{https://fasttext.cc/docs/en/unsupervised-tutorial.html}{FastText word representation tutorial} for more information on
#' training unsupervised models using fasttext.
#'
#' @param documents character vector of documents used for training
#' @param model_path Name of output file *without* file extension.
#' @param modeltype Should training be done using skipgram or cbow? Defaults to skipgram.
#' @param bucket number of buckets
#' @param dim size of word vectors
#' @param epoch number of epochs
#' @param label text string, labels prefix. Default is "__label__"
#' @param loss loss function {ns, hs, softmax, ova}
#' @param lr learning rate
#' @param lrUpdateRate change the rate of updates for the learning rate
#' @param maxn max length of char ngram
#' @param minCount minimal number of word occurrences
#' @param minn min length of char ngram
#' @param neg number of negatives sampled
#' @param t sampling threshold
#' @param thread number of threads
#' @param verbose verbosity level
#' @param wordNgrams max length of word ngram
#' @param ws size of the context window
#'
#' @return path to model file, as character
#' @export
#'
#' @examples
#' \dontrun{
#' library(fastrtext)
#' text <- train_sentences
#' model_file <- build_vectors(text[['text']], 'my_model')
#' model <- load_model(model_file)
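#'
#' # a variant sketch ('my_cbow_model' is a hypothetical output name):
#' # train a cbow model with smaller vectors instead
#' cbow_file <- build_vectors(text[['text']], 'my_cbow_model',
#'                            modeltype = 'cbow', dim = 50)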
#' }
build_vectors <- function(documents, model_path,
                          modeltype = c('skipgram', 'cbow'),
                          bucket = 2000000,
                          dim = 100,
                          epoch = 5,
                          label = "__label__",
                          loss = c('ns', 'hs', 'softmax', 'ova', 'one-vs-all'),
                          lr = 0.05,
                          lrUpdateRate = 100,
                          maxn = 6,
                          minCount = 5,
                          minn = 3,
                          neg = 5,
                          t = 1e-4,
                          thread = 12,
                          verbose = 2,
                          wordNgrams = 1,
                          ws = 5) {

  # ensure modeltype only takes valid values as defined in function definition. https://stackoverflow.com/a/4684604
  modeltype <- match.arg(modeltype)
  loss <- match.arg(loss)
  args <- as.list(environment())

  tmp_file_txt <- tempfile()

  message("writing tempfile at ... ", tmp_file_txt)
  writeLines(text = documents, con = tmp_file_txt)

  # build a character vector containing all fastText arguments
  c_args <- args[-1:-3] # first 3 values (documents, model_path, modeltype) are not named fastText arguments
  commands <- c(modeltype,
                "-input", tmp_file_txt,
                "-output", model_path,
                # build name/value pairs for each named argument
                rbind(
                  paste0('-', names(c_args)),
                  format(c_args, scientific = FALSE)
                )
  )

  message("Starting to train vectors with the following command: \n$ ", paste(commands, collapse = " "), "\n\n")
  fastrtext::execute(commands = commands)

  unlink(tmp_file_txt)
  return(paste0(model_path, '.bin'))
}


#' Build a supervised fasttext model
#'
#' @description
#' Trains a supervised model, following the method laid out in
#' \href{https://arxiv.org/abs/1607.01759}{Bag of Tricks for Efficient Text Classification}
#' using the \href{https://fasttext.cc/}{fasttext} implementation.
#'
#' See \href{https://fasttext.cc/docs/en/supervised-tutorial.html}{FastText text classification tutorial} for more information on
#' training supervised models using fasttext.
#'
#' @param documents character vector of documents used for training
#' @param targets vector of targets/categories for each document. Must have the same length as `documents` and be coercible to character
#' @param model_path Name of output file *without* file extension.
#' @param bucket number of buckets
#' @param wordNgrams max length of word ngram
#' @param minCount minimal number of word occurrences
#' @param minCountLabel minimal number of label occurrences
#' @param maxn max length of char ngram
#' @param minn min length of char ngram
#' @param t sampling threshold
#' @param lr learning rate
#' @param lrUpdateRate change the rate of updates for the learning rate
#' @param dim size of word vectors
#' @param ws size of the context window
#' @param epoch number of epochs
#' @param neg number of negatives sampled
#' @param loss loss function {softmax, ns, hs, ova}. The one-vs-all loss is useful for multi-label classification when you need to apply a threshold to each class score.
#' @param thread number of threads
#' @param pretrainedVectors path to pretrained word vectors for supervised learning. Leave empty for no pretrained vectors.
#' @param label text string, labels prefix. Default is "__label__"
#' @param verbose verbosity level
#'
#' @return path to new model file as a `character`
#' @export
#'
#' @examples
#'\dontrun{
#' library(fastrtext)
#' model_file <- build_supervised(documents = train_sentences[["text"]],
#'                                targets = train_sentences[["class.text"]],
#'                                model_path = 'my_model',
#'                                dim = 20, lr = 1, epoch = 20, wordNgrams = 2)
#'
#' model <- load_model(model_file)
#'
#' predictions <- predict(model, test_sentences[["text"]])
#' mean(sapply(predictions, names) == test_sentences[["class.text"]])
#' # ~0.8
#' }
build_supervised <- function(documents, targets, model_path,
                             # default arguments for fasttext, as seen by running `fasttext supervised` in bash
                             lr = 0.05,
                             dim = 100,
                             ws = 5,
                             epoch = 5,
                             minCount = 5,
                             minCountLabel = 0,
                             neg = 5,
                             wordNgrams = 1,
                             loss = c('softmax', 'ns', 'hs', 'ova', 'one-vs-all'),
                             bucket = 2000000,
                             minn = 3,
                             maxn = 6,
                             thread = 12,
                             lrUpdateRate = 100,
                             t = 1e-4,
                             label = "__label__",
                             verbose = 2,
                             pretrainedVectors = NULL) {

  # check that all arguments are correct and load them all into a list
  modeltype <- "supervised"
  loss <- match.arg(loss)
  # drop pretrainedVectors when not provided, so it is not passed to fastText
  if (!is.character(pretrainedVectors)) rm(pretrainedVectors)
  args <- as.list(environment())

  # get input / output file paths
  tmp_file_txt <- tempfile()


  message("Prepare and write tempfile at ... ", tmp_file_txt)
  # We need 1 document per line and to prepend each line with '__label__' + label name
  prepared_docs <- add_tags(documents, tags = targets, prefix = label)
  writeLines(text = prepared_docs,
             con = tmp_file_txt)

  # build a character vector containing all fastText arguments
  c_args <- args[-1:-4] # first 4 values (modeltype, documents, targets, model_path) are not named fastText arguments
  commands <- c(modeltype,
                "-input", tmp_file_txt,
                "-output", model_path,
                # build name/value pairs for each named argument
                rbind(
                  paste0('-', names(c_args)),
                  format(c_args, scientific = FALSE)
                )
  )

  message("Starting supervised training with the following command: \n$ ", paste(commands, collapse = " "), "\n\n")
  fastrtext::execute(commands = commands)

  unlink(tmp_file_txt)
  return(paste0(model_path, '.bin'))
}

globalVariables(c("new"))
