#' Preloads a causal language model
#'
#' Preloads a causal language model to speed up next runs.
#'
#' @section More details about causal models:
#' A causal language model (also called GPT-like, auto-regressive, or decoder
#' model) is a type of large language model, usually used for text
#' generation, that predicts the next word (or, more accurately, the next
#' token) based on the preceding context.
#'
#' If not specified, the causal model used will be the one set in the global
#' option `pangoling.causal.default`. This option can be accessed via
#' `getOption("pangoling.causal.default")` (by default
#' "`r getOption("pangoling.causal.default")`"). To change the default,
#' use `options(pangoling.causal.default = "newcausalmodel")`.
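#'
#' For instance, a quick sketch of switching the default model for the rest
#' of the session (any causal model name from Hugging Face works here;
#' "distilgpt2" is just an example):
#'
#' ```r
#' options(pangoling.causal.default = "distilgpt2")
#' getOption("pangoling.causal.default")
#' #> [1] "distilgpt2"
#' ```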
#'
#' A list of possible causal models can be found on the
#' [Hugging Face website](https://huggingface.co/models?pipeline_tag=text-generation).
#'
#' Using the `config_model` and `config_tokenizer` arguments, it's possible to
#' control how the model and tokenizer from Hugging Face are accessed; see the
#' Python method
#' [`from_pretrained`](https://huggingface.co/docs/transformers/v4.25.1/en/model_doc/auto#transformers.AutoProcessor.from_pretrained)
#' for details.
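#'
#' As the paragraph above suggests, these lists are passed on to
#' `from_pretrained`. For example, a sketch of pinning the model and tokenizer
#' to a specific revision (`"main"` is just the default branch):
#'
#' ```r
#' causal_preload(
#'   model = "gpt2",
#'   config_model = list(revision = "main"),
#'   config_tokenizer = list(revision = "main")
#' )
#' ```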
#'
#' In case of errors when a new model is run, check the status of Hugging
#' Face's servers at
#' [https://status.huggingface.co/](https://status.huggingface.co/).
#'
#' @param model Name of a pre-trained model or a path to a local folder. One
#' should be able to use models based on "gpt2". See the
#' [Hugging Face website](https://huggingface.co/models?other=gpt2).
#' @param checkpoint Folder of a checkpoint.
#' @param add_special_tokens Whether to include special tokens. It has the
#' same default as the
#' [AutoTokenizer](https://huggingface.co/docs/transformers/v4.25.1/en/model_doc/auto#transformers.AutoTokenizer)
#' method in Python.
#' @param config_model List with other arguments that control how the
#' model from Hugging Face is accessed.
#' @param config_tokenizer List with other arguments that control how the
#' tokenizer from Hugging Face is accessed.
#'
#' @return Nothing.
#'
#' @examplesIf installed_py_pangoling()
#' causal_preload(model = "gpt2")
#'
#' @family causal model helper functions
#' @export
#'
causal_preload <- function(model = getOption("pangoling.causal.default"),
checkpoint = NULL,
add_special_tokens = NULL,
config_model = NULL, config_tokenizer = NULL) {
message_verbose("Preloading causal model ", model, "...")
lang_model(model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model)
tokenizer(model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer)
invisible()
}
#' Returns the configuration of a causal model
#'
#' @inheritParams causal_preload
#' @inherit causal_preload details
#' @inheritSection causal_preload More details about causal models
#' @return A list with the configuration of the model.
#' @examplesIf installed_py_pangoling()
#' causal_config(model = "gpt2")
#'
#' @family causal model helper functions
#' @export
causal_config <- function(model = getOption("pangoling.causal.default"),
checkpoint = NULL, config_model = NULL) {
lang_model(
model = model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model
)$config$to_dict()
}
#' Generate next tokens after a context and their predictability using a causal
#' transformer model
#'
#' This function predicts the possible next tokens and their predictability
#' (log-probabilities by default). Tokens are sorted in descending order of
#' predictability.
#'
#' @details
#' The function uses a causal transformer model to compute the predictability
#' of all tokens in the model's vocabulary, given a single input context. It
#' returns a table where each row represents a token, along with its
#' predictability score. By default, the function returns log-probabilities in
#' natural logarithm (base *e*), but you can specify a different logarithm base
#' (e.g., `log.p = 1/2` for surprisal in bits).
#'
#' If `decode = TRUE`, the tokens are converted into human-readable strings,
#' handling special characters like accents and diacritics. This ensures that
#' tokens are more interpretable, especially for languages with complex
#' tokenization.
#'
#' @param context A single string representing the context for which the next
#' tokens and their predictabilities are predicted.
#' @param decode Logical. If `TRUE`, decodes the tokens into human-readable
#' strings, handling special characters and diacritics. Default is
#' `FALSE`.
#' @inheritParams causal_preload
#' @inheritParams causal_tokens_pred_lst
#' @inherit causal_preload details
#' @inheritSection causal_preload More details about causal models
#' @return A table with the possible next tokens and their predictability
#' (log-probabilities by default), sorted in descending order.
#' @examplesIf installed_py_pangoling()
#' causal_next_tokens_pred_tbl(
#' context = "The apple doesn't fall far from the",
#' model = "gpt2"
#' )
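#'
#' # Same context, but with surprisal in bits and human-readable tokens
#' causal_next_tokens_pred_tbl(
#'   context = "The apple doesn't fall far from the",
#'   model = "gpt2",
#'   log.p = 1/2,
#'   decode = TRUE
#' )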
#'
#' @family causal model functions
#' @export
causal_next_tokens_pred_tbl <-
function(context,
log.p = getOption("pangoling.log.p"),
decode = FALSE,
model = getOption("pangoling.causal.default"),
checkpoint = NULL,
add_special_tokens = NULL,
config_model = NULL,
config_tokenizer = NULL) {
if (length(unlist(context)) > 1) {
stop2("Only one context is allowed in this function.")
}
if(any(!is_really_string(context))) {
stop2("`context` needs to contain a string.")
}
message_verbose_model(model, checkpoint)
trf <- lang_model(model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model
)
tkzr <- tokenizer(model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer
)
# no batches allowed
context_tensor <- encode(list(unlist(context)),
tkzr,
add_special_tokens = add_special_tokens
)$input_ids
generated_outputs <- trf(context_tensor)
    n_tokens <- length(context_tensor$tolist()[[1]])
    # logits for the position right after the context (0-based indexing)
    logits_next_word <- generated_outputs$logits[0][n_tokens - 1]
l_softmax <- torch$log_softmax(logits_next_word, dim = -1L)$tolist()
lp <- reticulate::py_to_r(l_softmax) |>
unlist()
vocab <- get_vocab(tkzr, decode = decode)
diff_words <- length(vocab) - length(lp)
if(diff_words > 0) {
warning(paste0("Tokenizer's vocabulary is longer than the model's.",
" Some words will have NA predictability."))
lp <- c(lp, rep(NA, diff_words))
} else if(diff_words < 0) {
message_verbose("Tokenizer's vocabulary is smaller than the model's.",
" The model might reserve extra embeddings for padding, dynamic additions,",
" or GPU efficiency.")
vocab <- c(vocab, rep("", -diff_words))
}
tidytable::tidytable(token = vocab,
pred = lp |> ln_p_change(log.p = log.p)) |>
tidytable::arrange(-pred)
}
#' Compute predictability using a causal transformer model
#'
#' These functions calculate the predictability of words, phrases, or tokens
#' using a causal transformer model.
#'
#' @details
#' These functions calculate the predictability (by default the natural
#' logarithm of the word probability) of words, phrases or tokens using a
#' causal transformer model:
#'
#' - **`causal_targets_pred()`**: Evaluates specific target words or phrases
#' based on their given contexts. Use when you have explicit
#' context-target pairs to evaluate, with each target word or phrase paired
#' with a single preceding context.
#' - **`causal_words_pred()`**: Computes predictability for all elements of a
#' vector grouped by a specified variable. Use when working with words or
#' phrases split into groups, such as sentences or paragraphs, where
#' predictability is computed for every word or phrase in each group.
#' - **`causal_tokens_pred_lst()`**: Computes the predictability of each token
#' in a sentence (or group of sentences) and returns a list of results for
#' each sentence. Use when you want to calculate the predictability of
#' every token in one or more sentences.
#'
#' See the
#' [online article](https://docs.ropensci.org/pangoling/articles/intro-gpt2.html)
#' in pangoling website for more examples.
#'
#' @param x A character vector of words, phrases, or texts to evaluate.
#' @param word_n A vector indicating the position of each word within its
#' group; by default, the order of appearance in the vector `x`.
#' @param texts A vector or list of sentences or paragraphs.
#' @param targets A character vector of target words or phrases.
#' @param contexts A character vector of contexts corresponding to each target.
#' @param by A grouping variable indicating how texts are split into groups.
#' @param sep A string specifying how words are separated within contexts or
#' groups. Default is `" "`. For languages that don't have spaces
#' between words (e.g., Chinese), set `sep = ""`.
#' @param log.p Base of the logarithm used for the output predictability values.
#' If `TRUE` (default), the natural logarithm (base *e*) is used.
#' If `FALSE`, the raw probabilities are returned.
#' Alternatively, `log.p` can be set to a numeric value specifying
#' the base of the logarithm (e.g., `2` for base-2 logarithms).
#' To get surprisal in bits (rather than predictability), set
#' `log.p = 1/2`.
#' @param ignore_regex A regular expression specifying tokens to ignore when
#' calculating the log probabilities. For example,
#' `^[[:punct:]]$` will ignore all punctuation that stands
#' alone in a token.
#' @param batch_size Maximum number of sentences/texts processed in parallel.
#' Larger batches increase speed but use more memory. Since
#' all texts in a batch must have the same length, shorter
#' ones are padded with placeholder tokens.
#' @inheritParams causal_preload
#' @inheritSection causal_preload More details about causal models
#' @param ... Currently not in use.
#' @return For `causal_targets_pred()` and `causal_words_pred()`,
#' a named numeric vector of predictability scores. For
#' `causal_tokens_pred_lst()`, a list of named numeric vectors, one for
#' each sentence or group.
#'
#' @examplesIf installed_py_pangoling()
#' # Using causal_targets_pred
#' causal_targets_pred(
#' contexts = c("The apple doesn't fall far from the",
#' "Don't judge a book by its"),
#' targets = c("tree.", "cover."),
#' model = "gpt2"
#' )
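#'
#' # Same pairs, ignoring standalone punctuation tokens
#' causal_targets_pred(
#'   contexts = c("The apple doesn't fall far from the",
#'                "Don't judge a book by its"),
#'   targets = c("tree.", "cover."),
#'   ignore_regex = "^[[:punct:]]$",
#'   model = "gpt2"
#' )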
#'
#' # Using causal_words_pred
#' causal_words_pred(
#' x = df_sent$word,
#' by = df_sent$sent_n,
#' model = "gpt2"
#' )
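#'
#' # A sketch of `word_n`: words can be supplied out of order as long as
#' # `word_n` gives their positions; here the input is reversed and
#' # `word_n` restores the original order
#' causal_words_pred(
#'   x = rev(df_sent$word),
#'   word_n = rev(seq_along(df_sent$word)),
#'   by = rev(df_sent$sent_n),
#'   model = "gpt2"
#' )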
#'
#' # Using causal_tokens_pred_lst
#' preds <- causal_tokens_pred_lst(
#' texts = c("The apple doesn't fall far from the tree.",
#' "Don't judge a book by its cover."),
#' model = "gpt2"
#' )
#' preds
#'
#' # Convert the output to a tidy table
#' suppressPackageStartupMessages(library(tidytable))
#' map2_dfr(preds, seq_along(preds),
#' ~ data.frame(tokens = names(.x), pred = .x, id = .y))
#'
#' @family causal model functions
#' @export
#' @rdname causal_predictability
causal_words_pred <- function(x,
by = rep(1, length(x)),
word_n = NULL,
sep = " ",
log.p = getOption("pangoling.log.p"),
ignore_regex = "",
model = getOption("pangoling.causal.default"),
checkpoint = NULL,
add_special_tokens = NULL,
config_model = NULL,
config_tokenizer = NULL,
batch_size = 1,
...) {
if(any(!is_really_string(x))) {
stop2("`x` needs to be a vector of non-empty strings.")
}
dots <- list(...)
# Check for unknown arguments
if (length(dots) > 0) {
unknown_args <- setdiff(names(dots), ".by")
if (length(unknown_args) > 0) {
stop("Unknown arguments: ", paste(unknown_args, collapse = ", "), ".")
}
}
if (length(x) != length(by)) {
stop2("The argument `by` has an incorrect length.")
}
if(is.null(word_n)){
word_n <- stats::ave(seq_along(by), by, FUN = seq_along)
}
if (length(word_n) != length(by)) {
stop2("The argument `word_n` has an incorrect length.")
}
  if (any(x != trimws(x)) && sep == " ") {
    message_verbose(paste0("Notice that some words have white spaces;",
                           ' the argument `sep` should probably be set to "".'))
  }
stride <- 1 # fixed for now
message_verbose_model(model, checkpoint = checkpoint)
word_by_word_texts <- split(x, by, drop = TRUE)
word_n_by <- split(word_n, by, drop = TRUE)
word_by_word_texts <- tidytable::map2(word_by_word_texts, word_n_by, ~ .x[order(.y)])
pasted_texts <- conc_words(word_by_word_texts, sep = sep)
tkzr <- tokenizer(model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer)
trf <- lang_model(model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model)
tensors <- create_tensor_lst(
texts = unname(pasted_texts),
tkzr = tkzr,
add_special_tokens = add_special_tokens,
stride = stride,
batch_size = batch_size
)
lmats <- lapply(tensors, function(tensor) {
causal_mat(tensor,
trf = trf,
tkzr = tkzr,
add_special_tokens = add_special_tokens,
decode = FALSE,
stride = stride)
}) |>
unlist(recursive = FALSE)
out <- tidytable::pmap(
list(
word_by_word_texts,
names(word_by_word_texts),
lmats
),
function(words, item, mat) {
message_verbose(
"Text id: ", item, "\n`",
paste(words, collapse = sep),
"`"
)
word_lp(words,
sep = sep,
mat = mat,
ignore_regex = ignore_regex,
model = model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer)
}
)
message_verbose("***\n")
out_reordered <- tidytable::map2(out, word_n_by,
~ .x[as.integer(as.factor(.y))])
lps <- out_reordered |> unsplit(by, drop = TRUE)
names(lps) <- x
lps |> ln_p_change(log.p = log.p)
}
#' @rdname causal_predictability
#' @export
causal_tokens_pred_lst <-
function(texts,
log.p = getOption("pangoling.log.p"),
model = getOption("pangoling.causal.default"),
checkpoint = NULL,
add_special_tokens = NULL,
config_model = NULL,
config_tokenizer = NULL,
batch_size = 1) {
if(any(!is_really_string(texts))){
stop2("`texts` needs to be a vector of non-empty strings.")
}
stride <- 1
message_verbose_model(model, checkpoint)
ltexts <- as.list(unlist(texts, recursive = TRUE))
tkzr <- tokenizer(model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer)
trf <- lang_model(model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model)
tensors <- create_tensor_lst(ltexts,
tkzr,
add_special_tokens = add_special_tokens,
stride = stride,
batch_size = batch_size)
ls_mat <- tidytable::map(tensors, function(tensor) {
causal_mat(tensor,
trf,
tkzr,
add_special_tokens = add_special_tokens,
decode = FALSE,
stride = stride)
}) |>
unlist(recursive = FALSE)
lapply(ls_mat, function(mat) {
if (ncol(mat) == 1 && colnames(mat) == "") {
pred <- NA_real_
names(pred) <- ""
} else {
pred <- tidytable::map2_dbl(colnames(mat),
seq_len(ncol(mat)),
~ mat[.x, .y]) |>
ln_p_change(log.p = log.p)
names(pred) <- colnames(mat)
}
pred
})
}
#' @noRd
causal_mat <- function(tensor,
trf,
tkzr,
add_special_tokens = NULL,
decode,
stride = 1) {
message_verbose(
"Processing a batch of size ",
tensor$input_ids$shape[0],
" with ",
tensor$input_ids$shape[1], " tokens."
)
if (tensor$input_ids$shape[1] == 0) {
warning("No tokens found.", call. = FALSE)
vocab <- get_vocab(tkzr, decode = decode)
mat <- matrix(rep(NA, length(vocab)), ncol = 1)
rownames(mat) <- vocab
colnames(mat) <- ""
return(list(mat))
}
logits_b <- trf$forward(
input_ids = tensor$input_ids,
attention_mask = tensor$attention_mask
)$logits
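  # logits_b has shape (batch, tokens, vocabulary); iterate over the batch
  # items using 0-based indices, since the tensors live on the Python side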
lmat <- lapply(seq_len(logits_b$shape[0]) - 1, function(i) {
real_token_pos <- seq_len(sum(tensor$attention_mask[i]$tolist())) - 1
logits <- logits_b[i][real_token_pos]
# in case it's only one token, it needs to be unsqueezed
ids <- tensor$input_ids[i]$unsqueeze(1L)
tokens <- tkzr$convert_ids_to_tokens(ids[real_token_pos])
lp <- reticulate::py_to_r(torch$log_softmax(logits, dim = -1L))$tolist()
rm(logits)
gc(full = TRUE)
if (is.list(lp)) {
mat <- do.call("cbind", lp)
} else {
# In case it's only one token, lp won't be a list
mat <- matrix(lp, ncol = 1)
}
    # shift predictions: drop the last column (a prediction for a token beyond
    # the text) and prepend NA (the first token has no preceding context)
    mat <- cbind(rep(NA, nrow(mat)), mat[, -ncol(mat)])
    # the tokenizer's vocabulary can differ in size from the model's output
    # layer (e.g., when trailing vocabulary entries were unused in training)
    vocab <- get_vocab(tkzr, decode = decode)
    diff_words <- length(vocab) - nrow(mat)
if(diff_words > 0) {
warning("Tokenizer's vocabulary is larger than the model's.")
} else if(diff_words < 0) {
message_verbose("Tokenizer's vocabulary is smaller than the model's.",
" The model might reserve extra embeddings for padding, dynamic additions,",
" or GPU efficiency.")
}
rownames(mat) <- vocab[seq_len(nrow(mat))]
colnames(mat) <- unlist(tokens)
mat
})
rm(logits_b)
lmat
}
#' Generate a list of predictability matrices using a causal transformer model
#'
#' This function computes a list of matrices, where each matrix corresponds to a
#' unique group specified by the `by` argument. Each matrix represents the
#' predictability of every token in the input text (`x`) based on preceding
#' context, as evaluated by a causal transformer model.
#'
#'
#' @details
#' The function splits the input `x` into groups specified by the `by`
#' argument and processes each group independently. For each group, the model
#' computes, at every token position, the predictability of each token in its
#' vocabulary given the preceding context.
#'
#' Each matrix contains:
#' - Rows representing the model's vocabulary.
#' - Columns corresponding to tokens in the group (e.g., a sentence or
#' paragraph).
#' - By default, values in the matrices are the natural logarithm of word
#' probabilities.
#'
#' @inheritParams causal_words_pred
#' @inheritParams causal_preload
#' @inheritParams causal_next_tokens_pred_tbl
#' @param sorted If `FALSE` (the default), the list of matrices keeps the
#' original order of the groups in `by`. If `TRUE`, the
#' matrices are sorted according to `by`.
#' @inherit causal_preload details
#' @inheritSection causal_preload More details about causal models
#' @return A list of matrices, one per group, with the tokens of each group in
#' the columns and the vocabulary of the model in the rows.
#'
#' @examplesIf installed_py_pangoling()
#' data("df_sent")
#' df_sent
#' list_of_mats <- causal_pred_mats(
#' x = df_sent$word,
#' by = df_sent$sent_n,
#' model = "gpt2"
#' )
#'
#' # View the structure of the resulting list
#' list_of_mats |> str()
#'
#' # Inspect the last rows of the first matrix
#' list_of_mats[[1]] |> tail()
#'
#' # Inspect the last rows of the second matrix
#' list_of_mats[[2]] |> tail()
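#'
#' # Look up one token's predictability at each position; with the default
#' # `decode = FALSE`, GPT-2 rows are raw BPE tokens, so a word preceded by
#' # a space appears as, e.g., "Ġtree"
#' list_of_mats[[1]]["Ġtree", ]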
#' @family causal model functions
#' @export
#'
causal_pred_mats <- function(x,
by = rep(1, length(x)),
sep = " ",
log.p = getOption("pangoling.log.p"),
sorted = FALSE,
model = getOption("pangoling.causal.default"),
checkpoint = NULL,
add_special_tokens = NULL,
decode = FALSE,
config_model = NULL,
config_tokenizer = NULL,
batch_size = 1,
...) {
if(any(!is_really_string(x))) {
stop2("`x` needs to be a vector of non-empty strings.")
}
dots <- list(...)
  if (any(x != trimws(x)) && sep == " ") {
    message_verbose(paste0("Notice that some words have white spaces;",
                           ' the argument `sep` should probably be set to "".'))
  }
# Check for unknown arguments
if (length(dots) > 0) {
unknown_args <- setdiff(names(dots), ".by")
if (length(unknown_args) > 0) {
stop("Unknown arguments: ", paste(unknown_args, collapse = ", "), ".")
}
}
stride <- 1
message_verbose_model(model, checkpoint)
tkzr <- tokenizer(model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer
)
trf <- lang_model(model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model
)
word_by_word_texts <- split(x, by)
pasted_texts <- conc_words(word_by_word_texts, sep = sep)
tensors <- create_tensor_lst(unname(pasted_texts),
tkzr,
add_special_tokens = add_special_tokens,
stride = stride,
batch_size = batch_size
)
lmat <- tidytable::map(
tensors,
function(tensor) {
causal_mat(tensor,
trf = trf,
tkzr = tkzr,
add_special_tokens = add_special_tokens,
decode = decode,
stride = stride
)
}
)
names(lmat) <- levels(as.factor(by))
if(!sorted) lmat <- lmat[unique(as.factor(by))]
lmat |>
unlist(recursive = FALSE) |>
ln_p_change(log.p = log.p)
}
#' @export
#' @rdname causal_predictability
causal_targets_pred <- function(contexts,
targets,
sep = " ",
log.p = getOption("pangoling.log.p"),
ignore_regex = "",
model = getOption("pangoling.causal.default"),
checkpoint = NULL,
add_special_tokens = NULL,
config_model = NULL,
config_tokenizer = NULL,
batch_size = 1,
...) {
if(any(!is_really_string(targets))) {
stop2("`targets` needs to be a vector of non-empty strings.")
}
if(any(!is_really_string(contexts))) {
stop2("`contexts` needs to be a vector of non-empty strings.")
}
dots <- list(...)
# Check for unknown arguments
if (length(dots) > 0) {
unknown_args <- setdiff(names(dots), ".by")
if (length(unknown_args) > 0) {
stop("Unknown arguments: ", paste(unknown_args, collapse = ", "), ".")
}
}
  if ((any(targets != trimws(targets)) ||
       any(contexts != trimws(contexts))) && sep == " ") {
    message_verbose(
      paste0("Notice that some words have white spaces; if this is intended,",
             ' the argument `sep` should probably be set to "".'))
  }
stride <- 1 # fixed for now
message_verbose_model(model, checkpoint)
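  # Interleave contexts and targets so each group is c(context_i, target_i);
  # `by` then assigns every consecutive pair to one group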
x <- c(rbind(contexts, targets))
by <- rep(seq_len(length(x)/2), each = 2)
word_by_word_texts <- split(x, by, drop = TRUE)
pasted_texts <- conc_words(word_by_word_texts, sep = sep)
tkzr <- tokenizer(model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer
)
trf <- lang_model(model,
checkpoint = checkpoint,
task = "causal",
config_model = config_model
)
tensors <- create_tensor_lst(
texts = unname(pasted_texts),
tkzr = tkzr,
add_special_tokens = add_special_tokens,
stride = stride,
batch_size = batch_size
)
lmats <- lapply(tensors, function(tensor) {
causal_mat(tensor,
trf = trf,
tkzr = tkzr,
add_special_tokens = add_special_tokens,
decode = FALSE,
stride = stride
)
}) |>
unlist(recursive = FALSE)
out <- tidytable::pmap(
list(
word_by_word_texts,
names(word_by_word_texts),
lmats
),
function(words, item, mat) {
message_verbose(
"Text id: ", item, "\n`",
paste(words, collapse = sep),
"`"
)
word_lp(words,
sep = sep,
mat = mat,
ignore_regex = ignore_regex,
model = model,
add_special_tokens = add_special_tokens,
config_tokenizer = config_tokenizer
)
}
)
message_verbose("***\n")
keep <- c(FALSE, TRUE)
out <- out |> lapply(function(x) x[keep])
lps <- out |> unsplit(by[keep], drop = TRUE)
names(lps) <- out |> lapply(function(x) paste0(names(x),"")) |>
unsplit(by[keep], drop = TRUE)
lps |>
ln_p_change(log.p = log.p)
}