#' Find the encoding type of a character vector and then declare it as UTF-8.
#' @param x A character vector (e.g., a text column from a tibble).
#' @return The character vector with its encoding declared as UTF-8.
#' @noRd
get_encoding_change <- function(x) {
Encoding(x) <- Encoding(enc2utf8(x))
x
}
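# Illustration (not run), assuming the input is already valid UTF-8:
# x <- c("hello", "caf\u00e9")
# Encoding(get_encoding_change(x)) # typically "unknown" "UTF-8"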
#' Select all character variables and make them UTF-8 coded (BERT wants it in this format).
#' @param x A character variable or a tibble including both character and numeric variables.
#' @return a tibble containing all character variables in UTF-8 format.
#' @importFrom dplyr select_if
#' @importFrom tibble as_tibble
#' @importFrom purrr map
#' @noRd
select_character_v_utf8 <- function(x) {
# If a vector is submitted, make it a tibble column.
  if (is.vector(x) && !is.list(x)) {
# Select variable name to have as column name in the end
colname_x <- deparse(substitute(x))
# Remove everything before a "$"
colname_x <- gsub("^.*\\$", "", colname_x)
x <- tibble::as_tibble_col(x)
    colnames(x) <- colname_x
}
# Select all character variables
x_characters <- dplyr::select_if(x, is.character)
# This makes sure that all variables are UTF-8 coded
  x_characters <- tibble::as_tibble(purrr::map(x_characters, get_encoding_change))
  x_characters
}
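# Illustration (not run): numeric columns are dropped and character columns are kept as UTF-8.
# d <- tibble::tibble(text = c("caf\u00e9"), score = 1)
# select_character_v_utf8(d) # a one-column tibble containing only the "text" column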
#' Detect non-ASCII characters
#'
#' This function detects non-ASCII characters in a tibble with multiple columns.
#' @param data_tibble A character variable or a tibble including character variables.
#' @return a tibble containing variable names, row numbers and the text entries that include non-ASCII characters.
#' @importFrom tibble tibble
#' @importFrom dplyr bind_rows
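#' @examples
#' \dontrun{
#' # A minimal sketch; the tibble below is illustrative, not from the package data.
#' d <- tibble::tibble(text = c("hello", "caf\u00e9"))
#' textFindNonASCII(d)
#' # Expected: one row flagging column "text", row 2.
#' }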
#' @export
textFindNonASCII <- function(data_tibble) {
# Initialize an empty list to store results
results_list <- list()
# Iterate over each column in the tibble
for (col_name in colnames(data_tibble)) {
# Extract the actual vector from the tibble column, ensuring it's treated as a character vector
data_column <- as.character(data_tibble[[col_name]])
# Use sapply to identify any characters outside the ASCII range
matches <- base::sapply(seq_along(data_column), function(i) {
text <- data_column[i]
# Check if the element is a valid string, then convert to a character and check
if (length(text) == 1 && !base::is.na(text) && !base::is.null(text)) {
any(base::charToRaw(base::as.character(text)) > base::as.raw(0x7F))
} else {
FALSE
}
})
# Get the original row numbers
row_indices <- base::which(matches)
# Extract the text entries that have non-ASCII characters
matching_texts <- data_column[row_indices]
# If there are any matches, create a tibble and add to the results list
if (length(row_indices) > 0) {
results_list[[col_name]] <- tibble::tibble(
column_name = col_name,
row_number = base::as.integer(row_indices),
text = base::as.character(matching_texts)
)
}
}
# Combine all results into a single tibble
final_result <- dplyr::bind_rows(results_list)
return(final_result)
}
#' Function to clean non-ASCII characters from a single text entry
#' @param text A character string.
#' @return The string with non-ASCII characters removed.
#' @noRd
clean_text <- function(text) {
#iconv(text, from = "UTF-8", to = "UTF-8", sub = "")
iconv(text, from = "UTF-8", to = "ASCII", sub = "")
}
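# Illustration (not run): iconv() drops every byte outside the ASCII range, so
# clean_text("caf\u00e9") returns "caf".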
#' Clean non-ASCII characters
#'
#' textCleanNonASCIIinfo() warns about, and optionally removes, all text entries with
#' non-ASCII characters in a tibble.
#' @param data_tibble A tibble with character variables.
#' @param problematic_texts A tibble from textFindNonASCII(), listing the column names and
#' row numbers of entries containing non-ASCII characters.
#' @param remove_non_ascii (boolean) If TRUE, remove the non-ASCII characters.
#' @return a tibble with non-ASCII characters removed (or the original tibble if remove_non_ascii is FALSE).
#' @importFrom dplyr mutate across everything
#' @importFrom purrr map_chr
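#' @examples
#' \dontrun{
#' # A minimal sketch: locate problematic rows first, then warn about and clean them.
#' d <- tibble::tibble(text = c("hello", "caf\u00e9"))
#' problems <- textFindNonASCII(d)
#' cleaned <- textCleanNonASCIIinfo(d,
#'   problematic_texts = problems,
#'   remove_non_ascii = TRUE
#' )
#' # cleaned$text is c("hello", "caf")
#' }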
#' @export
textCleanNonASCIIinfo <- function(
data_tibble,
problematic_texts = NULL,
remove_non_ascii
) {
# Combine column_name and row_number for each row
combined_texts <- apply(problematic_texts[c("column_name", "row_number")], 1, function(x) {
paste(x, collapse = " ")
})
# Merge all combined texts into a single string, separated by ";"
final_string <- paste(combined_texts, collapse = "; ")
  warning_ascii <- paste("Warning: non-ASCII characters were found in:",
                         final_string, "Many large language models cannot handle them. \n")
  message(colourise(warning_ascii, "brown"))
  message(colourise("To examine these text cases use the textFindNonASCII() function. \n", "green"))
# remove non-ASCII characters
  if (remove_non_ascii) {
    # Apply `clean_text` to each element in the tibble
    cleaned_tibble <- data_tibble %>%
      dplyr::mutate(dplyr::across(dplyr::everything(), ~ purrr::map_chr(., clean_text)))
    message(colourise("Non-ASCII characters have been removed. \n", "green"))
  } else {
    # If not removing, return the data unchanged
    cleaned_tibble <- data_tibble
  }
  return(cleaned_tibble)
}
#' Clean non-ASCII characters
#'
#' textCleanNonASCII() cleans all text entries with a non-ASCII character in a tibble.
#' @param data_tibble A tibble with character variables.
#' @return a tibble with non-ASCII characters removed.
#' @importFrom dplyr mutate across everything
#' @importFrom purrr map_chr
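#' @examples
#' \dontrun{
#' # A minimal sketch: every character cell is passed through clean_text().
#' textCleanNonASCII(tibble::tibble(text = c("caf\u00e9")))
#' # returns a tibble in which "caf\u00e9" has become "caf"
#' }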
#' @export
textCleanNonASCII <- function(data_tibble) {
# Apply `clean_text` to each element in the tibble
cleaned_tibble <- data_tibble %>%
dplyr::mutate(dplyr::across(dplyr::everything(), ~ purrr::map_chr(., clean_text)))
return(cleaned_tibble)
}
#' Function to normalize the vector to one; to a unit vector.
#'
#' @param x a word embedding
#' @return normalized (unit) vector/word embedding.
#' @noRd
normalizeV <- function(x) {
  normalized <- x / sqrt(sum(x^2, na.rm = TRUE))
  return(normalized)
}
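# Illustration (not run): normalizeV(c(3, 4)) returns c(0.6, 0.8),
# since the magnitude is sqrt(3^2 + 4^2) = 5.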
#' Function to take min, max, mean or the CLS
#' (which comes from BERT models; not Static spaces) from list of vectors
#' @param x word embeddings to be aggregated
#' @param aggregation method to carry out the aggregation, including "min", "max" and "mean", which take the
#' minimum, maximum or mean across each column; "concatenate", which links together each word embedding layer
#' into one long row; or "normalize", which sums the columns and scales the result to a unit vector.
#' @param weights weights to be used when aggregating with "mean" (used when batching and aggregating word_types).
#' @return aggregated word embeddings.
#' @importFrom tibble as_tibble_row
#' @importFrom purrr map
#' @noRd
textEmbeddingAggregation <- function(x,
aggregation = "min",
weights = NULL) {
if (!is.null(weights) && aggregation == "mean") {
# Weighted mean calculation
weighted_mean_vector <- colSums(x * weights, na.rm = TRUE) / sum(weights, na.rm = TRUE)
return(weighted_mean_vector)
} else if (aggregation == "min") {
min_vector <- unlist(purrr::map(x, min, na.rm = TRUE))
return(min_vector)
} else if (aggregation == "max") {
max_vector <- unlist(purrr::map(x, max, na.rm = TRUE))
return(max_vector)
} else if (aggregation == "mean") {
mean_vector <- colMeans(x, na.rm = TRUE)
return(mean_vector)
} else if (aggregation == "concatenate") {
long_vector <- c(t(x)) %>% tibble::as_tibble_row(.name_repair = "minimal")
colnames(long_vector) <- paste0("Dim", seq_len(length(long_vector)))
variable_name <- names(x)[1]
# If original name is not just Dim1, then add back Dim1_variable.name
    if (variable_name != "Dim1") {
variable_name <- sub(".*Dim1_", "", variable_name)
colnames(long_vector) <- paste0(names(long_vector), "_", variable_name)
}
return(long_vector)
} else if (aggregation == "normalize") {
sum_vector <- unlist(purrr::map(x, sum, na.rm = TRUE))
normalized_vector <- normalizeV(sum_vector)
return(normalized_vector)
} else {
stop("Invalid aggregation method provided.")
}
}
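# Illustration (not run), using a toy 2 x 2 embedding tibble:
# x <- tibble::tibble(Dim1 = c(1, 3), Dim2 = c(2, 4))
# textEmbeddingAggregation(x, aggregation = "mean") # Dim1 = 2, Dim2 = 3
# textEmbeddingAggregation(x, aggregation = "min") # Dim1 = 1, Dim2 = 2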
#' getUniqueWordsAndFreq
#' Function that unites several text variables and rows into one,
#' where all text is transformed to lowercase and tokenized.
#' It also gives word frequencies.
#' @param x_characters A tibble with character variables (or a character vector when using hg_tokenizer).
#' @param hg_tokenizer (character) Name of a huggingface model whose tokenizer is used via textTokenize
#' (default NULL, in which case stringi word boundaries are used).
#' @return A tibble with a unique words column and a column with their respective frequency.
#' @importFrom tibble tibble
#' @importFrom stringi stri_c stri_trans_tolower
# @importFrom stringr str_c str_split stri_split_boundaries
# @importFrom tokenizers tokenize_words
#' @noRd
getUniqueWordsAndFreq <- function(x_characters,
hg_tokenizer = NULL,
...) {
if (is.null(hg_tokenizer)) {
# Unite all text variables into one
x_characters2 <- tidyr::unite(x_characters, "x_characters2", seq_len(ncol(x_characters)), sep = " ")
# unite all rows in the column into one cell
x_characters3 <- stringi::stri_c(x_characters2$x_characters2, collapse = " ")
# Tokenize into single words help(stri_split_boundaries)
x_characters4a <- stringi::stri_trans_tolower(x_characters3)
x_characters4b <- stringi::stri_split_boundaries(x_characters4a,
type = "word",
skip_word_none = TRUE,
skip_word_number = FALSE
)[[1]]
# Create dataframe with single words and frequency
x_characters5 <- data.frame(sort(table(unlist(strsplit(tolower(x_characters4b), " ")))))
}
if (!is.null(hg_tokenizer)) {
x_characters4b <- lapply(list(x_characters), textTokenize, model = hg_tokenizer, ...)
x_characters5 <- data.frame(sort(table(unlist(x_characters4b))))
}
if (length(x_characters5) == 1) {
colnames(x_characters5) <- c("Freq")
x_characters5 <- tibble::rownames_to_column(x_characters5, "Var1")
}
singlewords <- tibble::tibble(x_characters5$Var1, x_characters5$Freq)
colnames(singlewords) <- c("words", "n")
singlewords$words <- as.character(singlewords$words)
return(singlewords)
}
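# Illustration (not run), using the default stringi tokenization:
# getUniqueWordsAndFreq(tibble::tibble(text = c("Hello hello", "world")))
# returns a tibble with words "world" and "hello" and frequencies 1 and 2.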
#' This is a function that tidies the embeddings coming from the huggingface interface.
#' @param x list of layers.
#' @param layers the number of layers to get (setting comes from textEmbedRawLayers).
#' @param return_tokens boolean whether tokens have been returned (setting comes from textEmbedRawLayers).
#' @return Layers in tidy tibble format with each dimension column called Dim1, Dim2 etc.
#' @noRd
sortingLayers <- function(x,
layers = layers,
return_tokens = return_tokens) {
# If selecting "all" layers, find out number of layers to help indicate layer index later in code
if (is.character(layers)) {
layers <- 0:(length(x[[1]][[1]]) - 1)
}
  # Find number of dimensions (their location differs depending on whether return_tokens is TRUE or FALSE)
if (return_tokens) {
dimensions <- length(x[[1]][[1]][[1]][[1]][[1]])
participants <- length(x[[1]])
} else {
dimensions <- length(x[[1]][[1]][[1]][[1]])
participants <- length(x)
}
# Tidy-structure tokens and embeddings
# Replace outer loop over i_in_variable with map();
variable_x <- purrr::map(1:participants, function(i_in_variable) {
if (return_tokens) {
tokens <- x[[2]][[i_in_variable]]
token_id <- seq_len(length(tokens))
all_layers <- x[[1]][[i_in_variable]]
} else {
tokens <- NULL
all_layers <- x[[i_in_variable]]
# Count number of embeddings within one layer
token_id <- seq_len(length(all_layers[[1]][[1]]))
}
# Replace inner loop over i_layers with updated code
    totalTokensNum <- length(token_id) # works whether or not tokens were returned
tarTb <- numeric(length = totalTokensNum * length(layers) * dimensions)
tarTb <- reticulate::np_array(tarTb)
tarTb <- tibble::as_tibble(
reticulate::py_to_r(
reticulate::array_reshape(tarTb, c(totalTokensNum * length(layers), dimensions))
),
.name_repair = "minimal"
)
colnames(tarTb) <- paste0("Dim", seq_len(dimensions))
purrr::map(seq_len(totalTokensNum), function(i) {
purrr::map(seq_len(length(layers)), function(j) {
k <- j - 1
tarTb[i + totalTokensNum * k, ] <<- as.list(all_layers[[j]][[1]][[i]])
})
})
# Add tokens, token IDs, and layer numbers to output tibble
if (return_tokens) {
tarTb <- cbind(tokens, token_id, layer_number = rep(layers, each = totalTokensNum), tarTb) %>%
tibble::as_tibble()
} else {
tarTb <- cbind(token_id, layer_number = rep(layers, each = totalTokensNum), tarTb) %>%
tibble::as_tibble()
}
tarTb
})
variable_x
}
#' This is a function that uses textEmbeddingAggregation to aggregate the layers
#' @param x list of layers.
#' @param aggregation method to aggregate the layers.
#' @param return_tokens (boolean) returns the tokens as the first column.
#' @return Aggregated layers in tidy tibble format.
#' @noRd
layer_aggregation_helper <- function(x,
aggregation = aggregation,
return_tokens = FALSE) {
aggregated_layers_saved <- list()
# Get unique number of token ids in the variable starting with x$token_id ; i_token_id=1
number_of_ids <- unique(x[, grep("^token_id", names(x))][[1]])
# Loops over the number of tokens; i_token_id = 2; i_token_id = 3
for (i_token_id in number_of_ids) { # seq_len(length(number_of_ids))
# Selects all the layers for each token/token_id
x1 <- x[x[, grep("^token_id", names(x))][[1]] == i_token_id, ]
# Select only Dimensions
x2 <- dplyr::select(x1, dplyr::starts_with("Dim"))
# Aggregate the dimensions
x3 <- textEmbeddingAggregation(x2, aggregation = aggregation)
aggregated_layers_saved[[i_token_id]] <- x3
}
aggregated_layers_saved1 <- dplyr::bind_rows(aggregated_layers_saved)
if (return_tokens) {
    # Number of layers
number_of_layers <- unique(x[, grep("^layer_number", names(x))][[1]])
n_layers <- length(number_of_layers)
tokens <- x$tokens[1:(length(x$tokens) / n_layers)]
    tokens <- tibble::as_tibble_col(tokens, column_name = "tokens")
aggregated_layers_saved1 <- dplyr::bind_cols(tokens, aggregated_layers_saved1)
}
return(aggregated_layers_saved1)
}
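# Illustration (not run): two layers of a single token aggregated into one row.
# x <- tibble::tibble(
#   token_id = c(1, 1), layer_number = c(11, 12),
#   Dim1 = c(1, 3), Dim2 = c(2, 4)
# )
# layer_aggregation_helper(x, aggregation = "mean") # Dim1 = 2, Dim2 = 3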
#' grep_col_by_name_in_list
#' This function finds a column by name independently of where in the list structure it is.
#' @param l a list.
#' @param pattern what to find; such as the "layer_number" column.
#' @return elements in the column called pattern.
#' @noRd
grep_col_by_name_in_list <- function(l,
pattern) {
u <- unlist(l)
u[grep(pattern, names(u))]
}
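# Illustration (not run):
# l <- list(tibble::tibble(layer_number = c(11, 12), Dim1 = c(0.1, 0.2)))
# grep_col_by_name_in_list(l, "layer_number") # the layer_number values 11 and 12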
#' Tokenize text-variables
#'
#' textTokenize() tokenizes according to different huggingface transformers
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#' For full list of options see pretrained models at
#' \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#' For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base".
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence.
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number.
#' @param tokenizer_parallelism If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param hg_gated Set to TRUE if the accessed model is gated.
#' @param hg_token The token needed to access the gated model.
#' Create a token from the ['Settings' page](https://huggingface.co/settings/tokens) of
#' the Hugging Face website. An environment variable HUGGINGFACE_TOKEN can
#' be set to avoid the need to enter the token each time.
#' @param trust_remote_code use a model with custom code on the Huggingface Hub
#' @param logging_level Set the logging level. Default: "error".
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @return Returns tokens according to specified huggingface transformer.
#' @examples
#' \donttest{
#' # tokens <- textTokenize("hello are you?")
#' }
#' @seealso see \code{\link{textEmbed}}
#' @importFrom reticulate source_python
#' @importFrom tibble tibble as_tibble
#' @export
textTokenize <- function(texts,
model = "bert-base-uncased",
max_token_to_sentence = 4,
device = "cpu",
tokenizer_parallelism = FALSE,
model_max_length = NULL,
hg_gated = FALSE,
hg_token = Sys.getenv("HUGGINGFACE_TOKEN",
unset = ""),
trust_remote_code = FALSE,
logging_level = "error") {
  # Run python file with HuggingFace interface to state-of-the-art transformers
reticulate::source_python(system.file("python",
"huggingface_Interface3.py",
package = "text",
mustWork = TRUE
))
tokens <- hgTokenizerGetTokens(
text_strings = texts,
model = model,
max_token_to_sentence = max_token_to_sentence,
device = device,
tokenizer_parallelism = tokenizer_parallelism,
model_max_length = model_max_length,
hg_gated = reticulate::r_to_py(hg_gated),
hg_token = reticulate::r_to_py(hg_token),
trust_remote_code = trust_remote_code,
logging_level = logging_level
)
tokens1 <- lapply(tokens, tibble::as_tibble_col, column_name = "tokens")
return(tokens1)
}
#' Extract layers of hidden states
#'
#' textEmbedRawLayers extracts layers of hidden states (word embeddings) for all character variables
#' in a given dataframe.
#' @param texts A character variable or a tibble with at least one character variable.
#' @param model (character) Character string specifying pre-trained language model
#' (default = 'bert-base-uncased'). For full list of options see pretrained models at
#' \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#' For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024",
#' "distilbert-base-cased", "roberta-base", or "xlm-roberta-base". Only load models that
#' you trust from HuggingFace; loading a malicious model can execute arbitrary code on
#' your computer.
#' @param layers (character or numeric) Specify the layers that should be extracted
#' (default -2, which gives the second to last layer). It is more efficient to only extract the
#' layers that you need (e.g., 11). You can also extract several (e.g., 11:12),
#' or all by setting this parameter to "all". Layer 0 is the decontextualized input layer
#' (i.e., not comprising hidden states) and thus should normally not be used. These layers can then
#' be aggregated in the textEmbedLayerAggregation function.
#' @param return_tokens (boolean) If TRUE, provide the tokens used in the specified transformer
#' model. (default = TRUE)
#' @param word_type_embeddings (boolean) Whether to provide embeddings for each word/token type.
#' (default = FALSE)
#' @param decontextualize (boolean) Whether to decontextualize embeddings (i.e., embedding one word
#' at a time). (default = FALSE)
#' @param keep_token_embeddings (boolean) Whether to keep token level embeddings in the output
#' (when using word_types aggregation). (default= TRUE)
#' @param device (character) Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k'
#' for MacOS, where k is a specific device number. (default = "cpu")
#' @param tokenizer_parallelism (boolean) If TRUE this will turn on tokenizer parallelism.
#' (default = FALSE).
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the
#' transformer model (default the value stored for the associated model).
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence. (default= 4)
#' @param hg_gated Set to TRUE if the accessed model is gated.
#' @param hg_token The token needed to access the gated model.
#' Create a token from the ['Settings' page](https://huggingface.co/settings/tokens) of
#' the Hugging Face website. An environment variable HUGGINGFACE_TOKEN can
#' be set to avoid the need to enter the token each time.
#' @param trust_remote_code use a model with custom code on the Huggingface Hub
#' @param logging_level (character) Set the logging level. (default ="error")
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @param sort (boolean) If TRUE sort the output to tidy format. (default = TRUE)
#' @return textEmbedRawLayers() takes text as input, and returns the hidden states for
#' each token of the text, including the [CLS] and the [SEP].
#' Note that layer 0 is the input embedding to the transformer, and should normally not be used.
#' @examples
#' # Get hidden states of layer 11 and 12 for "I am fine".
#' \dontrun{
#' imf_embeddings_11_12 <- textEmbedRawLayers(
#' "I am fine",
#' layers = 11:12
#' )
#'
#' # Show hidden states of layer 11 and 12.
#' imf_embeddings_11_12
#' }
#' @seealso See \code{\link{textEmbedLayerAggregation}} and \code{\link{textEmbed}}.
#' @importFrom reticulate source_python
#' @importFrom dplyr %>% bind_rows group_split
#' @importFrom tibble tibble as_tibble
#' @importFrom magrittr set_colnames
#' @export
textEmbedRawLayers <- function(texts,
model = "bert-base-uncased",
layers = -2,
return_tokens = TRUE,
word_type_embeddings = FALSE,
decontextualize = FALSE,
keep_token_embeddings = TRUE,
device = "cpu",
tokenizer_parallelism = FALSE,
model_max_length = NULL,
max_token_to_sentence = 4,
hg_gated = FALSE,
hg_token = Sys.getenv("HUGGINGFACE_TOKEN",
unset = ""),
trust_remote_code = FALSE,
logging_level = "error",
sort = TRUE) {
if (decontextualize == TRUE && word_type_embeddings == FALSE) {
stop(message(
colourise("decontextualize = TRUE & word_type_embeddings = FALSE has not been
implemented in textEmbedRawLayers() at this stage.",
fg = "red"
),
      colourise("When using decontextualize = TRUE you need to create the word_type_embeddings.
      Creating text embeddings without them would take unnecessary time, as it would require
      sending the same decontextualised words to a transformer multiple times (whilst getting the same
      results over and over). Consider using textEmbed() to get token embeddings as well as text embeddings.",
fg = "green"
)
))
}
  # Run python file with HuggingFace interface to state-of-the-art transformers
reticulate::source_python(system.file("python",
"huggingface_Interface3.py",
package = "text",
mustWork = TRUE
))
if (is.numeric(layers)) {
if (max(layers) > textModelLayers(model, reticulate::r_to_py(hg_gated), reticulate::r_to_py(hg_token))) {
stop("You are trying to extract layers that do not exist in this model.")
}
}
  if (layers[1] < 0) {
    n <- textModelLayers(model, reticulate::r_to_py(hg_gated), reticulate::r_to_py(hg_token))
    layers <- 1 + n + layers
  }
# Select all character variables and make them UTF-8 coded (e.g., BERT wants it that way).
data_character_variables <- select_character_v_utf8(texts)
# Context (default): gives sorted word embeddings based on context
# (i.e., the entire text is sent to the transformer model)
if (!decontextualize) {
x <- data_character_variables
sorted_layers_ALL_variables <- list()
sorted_layers_ALL_variables$context_tokens <- list()
# Loop over all character variables; i_variables = 1
for (i_variables in seq_len(length(data_character_variables))) {
T1_variable <- Sys.time()
# Python file function to HuggingFace
hg_embeddings <- hgTransformerGetEmbedding(
text_strings = x[[i_variables]],
model = model,
layers = layers,
return_tokens = return_tokens,
device = reticulate::r_to_py(device),
tokenizer_parallelism = tokenizer_parallelism,
model_max_length = model_max_length,
max_token_to_sentence = max_token_to_sentence,
hg_gated = reticulate::r_to_py(hg_gated),
hg_token = reticulate::r_to_py(hg_token),
trust_remote_code = trust_remote_code,
logging_level = logging_level
)
if (sort) {
variable_x <- sortingLayers(
x = hg_embeddings,
layers = layers,
return_tokens = return_tokens
)
} else {
variable_x <- hg_embeddings
}
sorted_layers_ALL_variables$context_tokens[[i_variables]] <- variable_x
names(sorted_layers_ALL_variables$context_tokens)[[i_variables]] <- names(x)[[i_variables]]
# Adding informative comment
layers_string <- paste(as.character(layers), sep = " ", collapse = " ")
comment(sorted_layers_ALL_variables$context_tokens) <-
paste("Information about the embeddings. textEmbedRawLayers: ",
"model: ", model, " ; ",
"layers: ", layers_string, " ; ",
"word_type_embeddings: ", word_type_embeddings, " ; ",
"max_token_to_sentence: ", max_token_to_sentence, " ; ",
"text_version: ", packageVersion("text"), ".",
sep = "",
collapse = "\n"
)
## Timing
T2_variable <- Sys.time()
variable_time <- T2_variable - T1_variable
variable_time <- sprintf(
"duration: %f %s).",
variable_time,
units(variable_time)
)
version_seq <- paste(i_variables, "/", length(data_character_variables), sep = "")
loop_text <- paste("Completed layers output for ",
names(texts)[[i_variables]], " (variable: ",
version_seq, ", ",
variable_time,
"\n",
sep = ""
)
message(colourise(loop_text, "green"))
}
}
# Word type embeddings based on Context embeddings
if (word_type_embeddings && !decontextualize) {
# see stop in the beginning of this function.
individual_tokens <- list()
individual_tokens$context_word_type <- list()
individual_tokens$tokens <- list()
# 1. Group individual tokens help(bind_rows)
i_we <- suppressWarnings(dplyr::bind_rows(sorted_layers_ALL_variables$context_tokens))
i_we2 <- dplyr::group_split(i_we, i_we[, grep("^tokens", names(i_we))][[1]])
names(i_we2) <- paste(rep("word_type", length(i_we2)), seq_len(length(i_we2)), sep = "")
individual_tokens$context_word_type <- i_we2
# Specify which token layers go together and ensure that the token_id starts with 1
# (for textLayersAggregation to know which layers are linked);
num_layers <- length(layers)
# Look over all token list objects to adjust the token_id. i_context = 1
for (i_context in seq_len(length(individual_tokens$context_word_type))) { # $word_type
token_id_df <- individual_tokens$context_word_type[[i_context]] # $word_type
token_id_variable <- token_id_df[, grep("^token_id", names(token_id_df))][[1]]
num_token <- length(token_id_variable) / num_layers
token_id <- sort(rep(1:num_token, num_layers))
individual_tokens$context_word_type[[i_context]][, grep("^token_id", names(token_id_df))][[1]] <- token_id
}
# Get first element from each list.
single_words <- sapply(individual_tokens$context_word_type, function(x) x[[1]][1]) # $word_type
single_words <- tibble::as_tibble_col(single_words, column_name = "words")
# n
n <- sapply(individual_tokens$context_word_type, function(x) length(x[[1]]) / num_layers) # $word_type
n <- tibble::as_tibble_col(n, column_name = "n")
single_words_n <- dplyr::bind_cols(single_words, n)
individual_tokens$tokens <- single_words_n
sing_text <- c("Completed layers aggregation for word_type_embeddings. \n")
message(colourise(sing_text, "green"))
}
# Decontextualized embeddings for aggregated embeddings and word type embeddings
if (decontextualize) {
individual_tokens <- list()
individual_tokens$decontext <- list()
    # Get word embeddings for all individual tokens/words (which are, e.g., used for the word plot). help(bind_cols)
    data_character_variables1 <- suppressMessages(apply(data_character_variables, 1, dplyr::bind_cols)) %>%
      dplyr::bind_rows()
singlewords <- getUniqueWordsAndFreq(data_character_variables1[[1]],
hg_tokenizer = model
)
list_words <- sapply(singlewords$words, list)
names(list_words) <- NULL
hg_decontexts_embeddings <- hgTransformerGetEmbedding(
text_strings = list_words,
model = model,
layers = layers,
return_tokens = return_tokens,
device = reticulate::r_to_py(device),
tokenizer_parallelism = tokenizer_parallelism,
model_max_length = model_max_length,
max_token_to_sentence = max_token_to_sentence,
      hg_gated = reticulate::r_to_py(hg_gated),
      hg_token = reticulate::r_to_py(hg_token),
      trust_remote_code = trust_remote_code,
      logging_level = logging_level
)
# Sort out layers as above
if (sort) {
individual_tokens$decontext$word_type <- sortingLayers(
x = hg_decontexts_embeddings,
layers = layers,
return_tokens = return_tokens
)
} else {
individual_tokens$decontext$word_type <- hg_decontexts_embeddings
}
names(individual_tokens$decontext$word_type) <- NULL
individual_tokens$decontext$single_words <- singlewords
# Adding informative data
layers_string <- paste(as.character(layers), sep = " ", collapse = " ")
comment(individual_tokens$decontext$word_type) <- c(paste("Information about the embeddings.
textEmbedRawLayers: ",
"model:", model,
"layers:", layers_string, ".",
collapse = " ; "
))
comment(individual_tokens$decontext$single_words) <- c(paste("Information about the embeddings.
textEmbedRawLayers: ",
"model:", model,
"layers:", layers_string, ".",
collapse = " ; "
))
    de_text <- c("Completed layers aggregation for decontextualized embeddings. \n")
message(colourise(de_text, "green"))
individual_tokens
}
# Combine previous list and word list
if (decontextualize == FALSE && word_type_embeddings == FALSE) {
word_embeddings_with_layers <- c(sorted_layers_ALL_variables)
rm(sorted_layers_ALL_variables)
} else if (decontextualize == FALSE && word_type_embeddings == TRUE && keep_token_embeddings == TRUE) {
word_embeddings_with_layers <- c(sorted_layers_ALL_variables, individual_tokens)
rm(sorted_layers_ALL_variables)
rm(individual_tokens)
} else if (decontextualize == FALSE && word_type_embeddings == TRUE && keep_token_embeddings == FALSE) {
word_embeddings_with_layers <- c(individual_tokens)
rm(sorted_layers_ALL_variables)
rm(individual_tokens)
} else if (decontextualize == TRUE && word_type_embeddings == TRUE) {
word_embeddings_with_layers <- c(individual_tokens)
rm(individual_tokens)
}
return(word_embeddings_with_layers)
}
#' Aggregate layers
#'
#' textEmbedLayerAggregation selects and aggregates layers of hidden states to form a word embedding.
#' @param word_embeddings_layers Layers returned by the textEmbedRawLayers function.
#' @param layers (character or numeric) The numbers of the layers to be aggregated
#' (e.g., c(11:12) to aggregate the eleventh and twelfth).
#' Note that layer 0 is the input embedding to the transformer, and should normally not be used.
#' Selecting 'all' thus removes layer 0 (default = "all")
#' @param aggregation_from_layers_to_tokens (character) Method to carry out the aggregation among
#' the layers for each word/token, including "min", "max" and "mean" which takes the minimum,
#' maximum or mean across each column; or "concatenate", which links together each layer of the
#' word embedding to one long row (default = "concatenate").
#' @param aggregation_from_tokens_to_texts (character) Method to carry out the aggregation among the word embeddings
#' for the words/tokens, including "min", "max" and "mean" which takes the minimum, maximum or mean across each column;
#' or "concatenate", which links together each layer of the word embedding to one long row (default = "mean").
#' @param return_tokens (boolean) If TRUE, provide the tokens used in the specified transformer model (default = FALSE).
#' @param tokens_select (character) Option to only select embeddings linked to specific tokens
#' in the textEmbedLayerAggregation() phase such as "[CLS]" and "[SEP]" (default NULL).
#' @param tokens_deselect (character) Option to deselect embeddings linked to specific tokens in
#' the textEmbedLayerAggregation() phase such as "[CLS]" and "[SEP]" (default NULL).
#' @return A tibble with word embeddings. Note that layer 0 is the input embedding to
#' the transformer, which is normally not used.
#' @examples
#' # Aggregate the hidden states from textEmbedRawLayers
#' # to create a word embedding representing the entire text.
#' # This is achieved by concatenating layer 11 and 12.
#' \dontrun{
#' word_embedding <- textEmbedLayerAggregation(
#' imf_embeddings_11_12$context_tokens,
#' layers = 11:12,
#' aggregation_from_layers_to_tokens = "concatenate",
#' aggregation_from_tokens_to_texts = "mean"
#' )
#'
#' # Examine word_embedding
#' word_embedding
#' }
#' @seealso See \code{\link{textEmbedRawLayers}} and \code{\link{textEmbed}}.
#' @importFrom dplyr %>% bind_rows
#' @export
textEmbedLayerAggregation <- function(word_embeddings_layers,
layers = "all",
aggregation_from_layers_to_tokens = "concatenate",
aggregation_from_tokens_to_texts = "mean",
return_tokens = FALSE,
tokens_select = NULL,
tokens_deselect = NULL) {
if (return_tokens == TRUE && !is.null(aggregation_from_tokens_to_texts)) {
stop(message(
      colourise("return_tokens = TRUE does not work when aggregation_from_tokens_to_texts is not NULL.", fg = "red"),
      colourise("When aggregating tokens to texts, it is not possible to have return_tokens = TRUE.
      To get both token_embeddings and text_embeddings use textEmbed().", fg = "green")
))
}
# If selecting 'all' layers, find out number of layers to help indicate layer index later in code
if (is.character(layers)) {
# Get the first embeddings
x_layer_unique <- unique(grep_col_by_name_in_list(word_embeddings_layers[[1]], "layer_number"))
# Get which layers
layers <- as.numeric(x_layer_unique)
# Remove layer 0 because it is the input layer for the word embeddings.
if (layers[1] == 0) {
layers <- layers[2:length(layers)]
}
}
# Loop over the list of variables; variable_list_i = 1; variable_list_i = 2; remove(variable_list_i)
selected_layers_aggregated_tibble <- list()
for (variable_list_i in seq_len(length(word_embeddings_layers))) {
T1_variable <- Sys.time()
x <- word_embeddings_layers[[variable_list_i]]
    # This is to ensure x is in a list (so it also works for single word embeddings that are contextualised)
if (tibble::is_tibble(x)) {
x <- list(x)
}
# Go over the lists and select the layers; [[1]] ok to add below x=
# get number of unique layers in the variable starting with "layer_number"
number_of_layers <- unique(x[[1]][, grep("^layer_number", names(x[[1]]))][[1]])
# Check that the right number of levels are selected
    if (length(setdiff(layers, number_of_layers)) > 0) {
stop(message(
colourise("You are trying to aggregate layers that were not extracted.", fg = "red"),
colourise("For example, in textEmbed the layers option needs to include all the
layers used in context_layers.", fg = "green")
))
}
# Select layers in layers-argument selected from the variable starting with layer_number
selected_layers <- lapply(x, function(x) {
x[x[, grep("^layer_number", names(x))][[1]]
%in% layers, ]
})
# Go over the lists and select the tokens (e.g., CLS) (tokens_select = NULL tokens_select = "[CLS]")
if (!is.null(tokens_select)) {
selected_layers <- lapply(selected_layers, function(x) {
x[x[, grep("^tokens", names(x))][[1]]
%in% tokens_select, ]
})
}
# Go over the lists and DEselect the token (e.g., CLS) (tokens_deselect = NULL tokens_deselect = "[CLS]")
if (!is.null(tokens_deselect)) {
selected_layers <- lapply(selected_layers, function(x) {
x[!x[, grep("^tokens", names(x))][[1]]
%in% tokens_deselect, ]
})
      # If any of the removed tokens was "[CLS]", subtract one from token_id so that it starts
      # at 1 and works with the layer_aggregation_helper
if (length(tokens_deselect) == 1 && tokens_deselect == "[CLS]") {
# Subtract
selected_layers <- purrr::map(selected_layers, function(x) {
x[, grep("^token_id", names(x))][[1]] <- x[, grep("^token_id", names(x))][[1]] - 1
x
})
} else if (length(tokens_deselect) > 1) {
        if (any(tokens_deselect == "[CLS]")) {
# Subtract
selected_layers <- purrr::map(selected_layers, function(x) {
          # select variable starting with "token_id" and subtract 1
x[, grep("^token_id", names(x))][[1]] <- x[, grep("^token_id", names(x))][[1]] - 1
x
})
}
}
}
## Aggregate across layers; i_token_id=1 aggregation_from_layers_to_tokens="min"
selected_layers_aggregated <- lapply(selected_layers,
layer_aggregation_helper,
aggregation = aggregation_from_layers_to_tokens,
return_tokens = return_tokens
)
if (is.null(aggregation_from_tokens_to_texts)) {
# Sort output
selected_layers_aggregated_tibble[[variable_list_i]] <- selected_layers_aggregated
}
# Aggregate across tokens
if (!is.null(aggregation_from_tokens_to_texts)) {
selected_layers_tokens_aggregated <- lapply(selected_layers_aggregated,
textEmbeddingAggregation,
aggregation = aggregation_from_tokens_to_texts
)
# Sort output
selected_layers_aggregated_tibble[[variable_list_i]] <- dplyr::bind_rows(selected_layers_tokens_aggregated)
}
# Add informative comments
original_comment <- comment(word_embeddings_layers)
layers_string <- paste(as.character(layers), sep = " ", collapse = " ")
comment(selected_layers_aggregated_tibble[[variable_list_i]]) <- paste(original_comment,
"textEmbedLayerAggregation: layers = ",
layers_string,
"aggregation_from_layers_to_tokens = ",
aggregation_from_layers_to_tokens,
"aggregation_from_tokens_to_texts = ",
aggregation_from_tokens_to_texts,
"tokens_select = ",
tokens_select,
"tokens_deselect = ",
tokens_deselect,
collapse = " ; "
)
## Timing
T2_variable <- Sys.time()
variable_time <- T2_variable - T1_variable
variable_time <- sprintf(
"duration: %f %s).",
variable_time,
units(variable_time)
)
version_seq <- paste(variable_list_i, "/", length(word_embeddings_layers), sep = "")
loop_text <- paste("Completed layers aggregation", " (variable ",
version_seq, ", ",
variable_time,
"\n",
sep = ""
)
message(colourise(loop_text, "blue"))
}
names(selected_layers_aggregated_tibble) <- names(word_embeddings_layers)
selected_layers_aggregated_tibble
}
#' generate_placement_vector takes input from textEmbedRawLayers and inserts NA-placeholder vectors for NA values.
#' @param raw_layers Layers returned by the textEmbedRawLayers function with NA values.
#' @param texts The original texts, used to retrieve the column name.
#' @return Layers returned by textEmbedRawLayers with inserted NA-placeholder vectors.
#' @noRd
generate_placement_vector <- function(raw_layers,
texts) {
# Extract column name, if there is one.
column_name <- colnames(texts)
context_tokens <- NULL
if (!is.null(raw_layers$context_tokens$value)) {
context_tokens <- raw_layers$context_tokens$value
}
# If raw_layers$context_tokens$value is NULL, check if raw_layers$context_tokens$texts is not NULL and use it
if (!is.null(raw_layers$context_tokens$texts)) {
context_tokens <- raw_layers$context_tokens$texts
}
# Try with column name
if (!is.null(column_name)) {
context_tokens <- raw_layers$context_tokens[[column_name]]
}
# Check if raw_layers$context_tokens$value is not NULL, and use it
if (is.null(context_tokens)) {
stop("Neither raw_layers$context_tokens$value nor raw_layers$context_tokens$texts found or both are NULL.")
}
# Loop through the hidden states
  for (i in seq_along(context_tokens)) {
token_embedding <- context_tokens[[i]]
# Find the corresponding token of each hidden state
elements <- context_tokens[[i]][1]
# Check if "na" or "NA" is represented as a token
if (any(sapply(elements, function(element) "na" %in% element)) ||
any(sapply(elements, function(element) "NA" %in% element))) {
# If so, then check for "NA" or "na" in the token-embedding
if (any(grepl("na", token_embedding$tokens, ignore.case = TRUE)) ||
any(grepl("NA", token_embedding$tokens, ignore.case = TRUE))) {
# Store the dimensions of the token-embedding with NA:s
dimensions <- dim(context_tokens[[i]])
}
}
}
# Create a placeholder tibble with NA values of the same shape as the original token embedding
  template_na <- tibble::as_tibble(matrix(NA, nrow = dimensions[1], ncol = dimensions[2] - 2))
colnames(template_na) <- c("tokens", paste0("Dim", 1:(dimensions[2] - 3)))
# Create a list to store the modified embeddings
modified_embeddings <- list()
# Iterate over each context token in the original embedding list
  for (i in seq_along(context_tokens)) {
token_embedding <- context_tokens[[i]]
elements <- context_tokens[[i]][1]
    # Check whether "na"/"NA" appears as a token, or the embedding only has two rows
    if (((any(sapply(elements, function(element) "na" %in% element)) ||
      any(sapply(elements, function(element) "NA" %in% element))) &&
      nrow(token_embedding) == 3) ||
      nrow(token_embedding) == 2) {
# If so, then check for "na" (or "") in the token-embedding
if (any(grepl("na", token_embedding$tokens, ignore.case = TRUE)) ||
any(grepl("NA", token_embedding$tokens, ignore.case = TRUE)) ||
length(token_embedding$tokens) == 2) {
# Replace only the numerical columns with NA values while keeping the first three columns
token_embedding[, -(1:3)] <- NA # Exclude the first three columns
}
}
modified_embeddings[[i]] <- token_embedding
}
# Replace the original layers with the modified
if (!is.null(raw_layers$context_tokens$value)) {
raw_layers$context_tokens$value <- modified_embeddings
}
if (!is.null(raw_layers$context_tokens$texts)) {
raw_layers$context_tokens$texts <- modified_embeddings
}
if (!is.null(raw_layers$context_tokens[[column_name]])) {
raw_layers$context_tokens[[column_name]] <- modified_embeddings
}
return(raw_layers)
}
#' Resolve which layer numbers to retrieve
#' @param layers The layers to retrieve (negative values count from the last layer).
#' @param hg_gated Whether the accessed model is gated.
#' @param hg_token The token needed to access a gated model.
#' @param model The model whose total number of layers is looked up.
#' @return The resolved layer numbers (e.g., -2 resolves to the second-to-last layer).
#' @noRd
find_layer_number <- function(
    layers,
    hg_gated,
    hg_token,
    model) {
if (layers[1] < 0) {
n <- textModelLayers(model, reticulate::r_to_py(hg_gated), reticulate::r_to_py(hg_token))
layers <- 1 + n + layers
}
return(layers)
}
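# Illustration: for a 12-layer model such as bert-base-uncased, layers = -2
# resolves to 1 + 12 + (-2) = 11, i.e., the second-to-last hidden layer.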
#' Helper function for textEmbed
#'
#' textEmbed() extracts layers and aggregate them to word embeddings, for all character variables in a given dataframe.
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#' For full list of options see pretrained models at
#' \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#' For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base". Only load models that you trust from HuggingFace; loading a
#' malicious model can execute arbitrary code on your computer.
#' @param layers (string or numeric) Specify the layers that should be extracted
#' (default -2 which give the second to last layer). It is more efficient to only extract the layers
#' that you need (e.g., 11). You can also extract several (e.g., 11:12), or all by setting this parameter
#' to "all". Layer 0 is the decontextualized input layer (i.e., not comprising hidden states) and
#' thus should normally not be used. These layers can then be aggregated in the textEmbedLayerAggregation
#' function.
#' @param dim_name (boolean) If TRUE append the variable name after all variable-names in the output.
#' (This differentiates between word embedding dimension names; e.g., Dim1_text_variable_name).
#' see \code{\link{textDimName}} to change names back and forth.
#' @param aggregation_from_layers_to_tokens (string) Aggregated layers of each token. Method to aggregate the
#' contextualized layers (e.g., "mean", "min" or "max, which takes the minimum, maximum or mean, respectively,
#' across each column; or "concatenate", which links together each word embedding layer to one long row.
#' @param aggregation_from_tokens_to_texts (string) Method to carry out the aggregation among the word embeddings
#' for the words/tokens, including "min", "max" and "mean" which takes the minimum, maximum or mean across each column;
#' or "concatenate", which links together each layer of the word embedding to one long row (default = "mean"). If set to NULL, embeddings are not
#' aggregated.
#' @param aggregation_from_tokens_to_word_types (string) Aggregates to the word type (i.e., the individual words)
#' rather than texts. If set to "individually", then duplicate words are not aggregated (i.e., the context of individual
#' words is preserved). (default = NULL).
#' @param keep_token_embeddings (boolean) Whether to also keep token embeddings when using texts or word
#' types aggregation.
#' @param remove_non_ascii (boolean) If TRUE, warn about and remove non-ASCII characters (using textFindNonASCII()).
#' @param tokens_select Option to select word embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param tokens_deselect Option to deselect embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param decontextualize (boolean) Provide word embeddings of single words as input to the model
#' (these embeddings are, e.g., used for plotting; default = FALSE). If using this, then set
#' single_context_embeddings to FALSE.
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence.
#' @param tokenizer_parallelism (boolean) If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number such as 'mps:1'.
#' @param hg_gated Set to TRUE if the accessed model is gated.
#' @param hg_token The token needed to access the gated model.
#' Create a token from the ['Settings' page](https://huggingface.co/settings/tokens) of
#' the Hugging Face website. An environment variable HUGGINGFACE_TOKEN can
#' be set to avoid the need to enter the token each time.
#' @param logging_level Set the logging level. Default: "error".
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @param ... settings from textEmbedRawLayers().
#' @return A list with text embeddings for each character variable.
#' @importFrom reticulate source_python r_to_py
#' @importFrom tidyr unnest_wider
#' @noRd
text_embed_dlatk <- function(
texts,
model,
layers,
dim_name,
# aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
# aggregation_from_tokens_to_word_types = aggregation_from_tokens_to_word_types,
# keep_token_embeddings = keep_token_embeddings,
remove_non_ascii = remove_non_ascii,
# tokens_select = tokens_select,
# tokens_deselect = tokens_deselect,
# decontextualize = decontextualize,
model_max_length = model_max_length,
# max_token_to_sentence = max_token_to_sentence,
tokenizer_parallelism = tokenizer_parallelism,
device = device,
hg_gated = hg_gated,
hg_token = hg_token,
trust_remote_code = trust_remote_code,
logging_level = logging_level,
batch_size = batch_size
){
  if (any(is.na(texts))) {
warning("texts contain NA-values.")
}
T1_textEmbed <- Sys.time()
reticulate::source_python(system.file("python",
"huggingface_Interface3.py",
package = "text",
mustWork = TRUE
))
# Number of layers to retrieve (if -2 is given; i.e., getting the second to last layer)
  layers <- find_layer_number(layers, hg_gated, hg_token, model)
layers <- reticulate::r_to_py(as.integer(layers))
# Select all character variables and make them UTF-8 coded (e.g., BERT wants it that way).
data_character_variables <- select_character_v_utf8(texts)
# Check for ASCII characters
problematic_texts <- textFindNonASCII(data_character_variables)
#### Clean ASCII ####
  if (nrow(problematic_texts) > 0) {
data_character_variables <- textCleanNonASCIIinfo(
data_tibble = data_character_variables,
problematic_texts = problematic_texts,
remove_non_ascii = remove_non_ascii
)
}
#### Get Layers & Aggregate layers ####
outcome_list <- list()
# text_i = 1
  for (text_i in seq_len(ncol(data_character_variables))) {
texts <- data_character_variables[[text_i]]
dlatk_emb <- hgDLATKTransformerGetEmbedding(
text_strings = texts, # texts,
#text_ids = NULL,
#group_ids = NULL,
model = model,
layers = layers,
# return_tokens = True,
# max_token_to_sentence = 4,
device = device,
tokenizer_parallelism = tokenizer_parallelism,
model_max_length = model_max_length,
hg_gated = hg_gated,
hg_token = hg_token,
trust_remote_code = trust_remote_code,
logging_level = logging_level,
# sentence_tokenize = True
batch_size = 1L, #as.numeric(batch_size),
aggregations = aggregation_from_tokens_to_texts
)
    dlatk_emb_message <- dlatk_emb # [[1]] is only needed if the python function returns "return msg_embeddings, cf_embeddings"
# Extract first embedding from each list item
dlatk_emb_message <- lapply(dlatk_emb_message, function(x) unlist(x[[1]]))
# Convert to tibble: 1 row per embedding, 1024 columns
    dlatk_emb_message <- tibble::tibble(values = dlatk_emb_message) %>%
      tidyr::unnest_wider(values, names_sep = "_", names_repair = "unique")
# Rename columns to Dim1, Dim2, ...
colnames(dlatk_emb_message) <- paste0("Dim", seq_along(dlatk_emb_message))
T2_textEmbed <- Sys.time()
Time_textEmbed <- T2_textEmbed - T1_textEmbed
Time_textEmbed <- sprintf("Duration to embed text: %f %s", Time_textEmbed, units(Time_textEmbed))
Date_textEmbed <- Sys.time()
comment(dlatk_emb_message) <- paste(Time_textEmbed,
"; Date created: ", Date_textEmbed,
"; text_version: ", packageVersion("text"), ".",
sep = "",
collapse = " ")
outcome_list$texts[[text_i]] <- dlatk_emb_message
names(outcome_list$texts)[[text_i]] <- names(data_character_variables)[[text_i]]
if (dim_name == TRUE) {
outcome_list$texts[text_i] <- textDimName(outcome_list$texts[text_i])
}
}
return(outcome_list)
}
#' Helper function for textEmbed
#'
#' textEmbed() extracts layers and aggregate them to word embeddings, for all character variables in a given dataframe.
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#' For full list of options see pretrained models at
#' \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#' For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base". Only load models that you trust from HuggingFace; loading a
#' malicious model can execute arbitrary code on your computer.
#' @param layers (string or numeric) Specify the layers that should be extracted
#' (default -2 which give the second to last layer). It is more efficient to only extract the layers
#' that you need (e.g., 11). You can also extract several (e.g., 11:12), or all by setting this parameter
#' to "all". Layer 0 is the decontextualized input layer (i.e., not comprising hidden states) and
#' thus should normally not be used. These layers can then be aggregated in the textEmbedLayerAggregation
#' function.
#' @param dim_name (boolean) If TRUE append the variable name after all variable-names in the output.
#' (This differentiates between word embedding dimension names; e.g., Dim1_text_variable_name).
#' see \code{\link{textDimName}} to change names back and forth.
#' @param aggregation_from_layers_to_tokens (string) Aggregated layers of each token. Method to aggregate the
#' contextualized layers (e.g., "mean", "min" or "max, which takes the minimum, maximum or mean, respectively,
#' across each column; or "concatenate", which links together each word embedding layer to one long row.
#' @param aggregation_from_tokens_to_texts (string) Method to carry out the aggregation among the word embeddings
#' for the words/tokens, including "min", "max" and "mean" which takes the minimum, maximum or mean across each column;
#' or "concatenate", which links together each layer of the word embedding to one long row (default = "mean"). If set to NULL, embeddings are not
#' aggregated.
#' @param aggregation_from_tokens_to_word_types (string) Aggregates to the word type (i.e., the individual words)
#' rather than texts. If set to "individually", then duplicate words are not aggregated (i.e., the context of individual
#' words is preserved). (default = NULL).
#' @param keep_token_embeddings (boolean) Whether to also keep token embeddings when using texts or word
#' types aggregation.
#' @param remove_non_ascii (boolean) If TRUE, warn about and remove non-ASCII characters (using textFindNonASCII()).
#' @param tokens_select Option to select word embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param tokens_deselect Option to deselect embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param decontextualize (boolean) Provide word embeddings of single words as input to the model
#' (these embeddings are, e.g., used for plotting; default = FALSE). If using this, then set
#' single_context_embeddings to FALSE.
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence.
#' @param tokenizer_parallelism (boolean) If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number such as 'mps:1'.
#' @param hg_gated Set to TRUE if the accessed model is gated.
#' @param hg_token The token needed to access the gated model.
#' Create a token from the ['Settings' page](https://huggingface.co/settings/tokens) of
#' the Hugging Face website. An environment variable HUGGINGFACE_TOKEN can
#' be set to avoid the need to enter the token each time.
#' @param logging_level Set the logging level. Default: "error".
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @param ... settings from textEmbedRawLayers().
#' @return A tibble with tokens, a column for layer identifier and word embeddings.
#' Note that layer 0 is the input embedding to the transformer.
#' @examples
#' # Automatically transforms the characters in the example dataset:
#' # Language_based_assessment_data_8 (included in text-package), to embeddings.
#' \dontrun{
#' word_embeddings <- textEmbed(Language_based_assessment_data_8[1:2, 1:2],
#' layers = 10:11,
#' aggregation_from_layers_to_tokens = "concatenate",
#' aggregation_from_tokens_to_texts = "mean",
#' aggregation_from_tokens_to_word_types = "mean"
#' )
#'
#' # Show information about how the embeddings were constructed.
#' comment(word_embeddings$texts$satisfactiontexts)
#' comment(word_embeddings$word_types)
#' comment(word_embeddings$tokens$satisfactiontexts)
#'
#' # See how the word embeddings are structured.
#' word_embeddings
#'
#' # Save the word embeddings to avoid having to embed the text again.
#' saveRDS(word_embeddings, "word_embeddings.rds")
#'
#' # Retrieve the saved word embeddings.
#' word_embeddings <- readRDS("word_embeddings.rds")
#' }
#'
#' @seealso See \code{\link{textEmbedLayerAggregation}}, \code{\link{textEmbedRawLayers}} and
#' \code{\link{textDimName}}.
#' @importFrom reticulate source_python
#' @importFrom utils modifyList
#' @noRd
text_embed <- function(
texts,
model = "bert-base-uncased",
layers = -2,
dim_name = TRUE,
aggregation_from_layers_to_tokens = "concatenate",
aggregation_from_tokens_to_texts = "mean",
aggregation_from_tokens_to_word_types = NULL,
keep_token_embeddings = TRUE,
remove_non_ascii = TRUE,
tokens_select = NULL,
tokens_deselect = NULL,
decontextualize = FALSE,
model_max_length = NULL,
max_token_to_sentence = 4,
tokenizer_parallelism = FALSE,
device = "cpu",
hg_gated = FALSE,
hg_token = Sys.getenv("HUGGINGFACE_TOKEN",
unset = ""),
logging_level = "error",
...) {
  if (any(is.na(texts))) {
warning("texts contain NA-values.")
}
T1_textEmbed <- Sys.time()
reticulate::source_python(system.file("python",
"huggingface_Interface3.py",
package = "text",
mustWork = TRUE
))
if (
(decontextualize == TRUE && is.null(aggregation_from_tokens_to_texts)) ||
(decontextualize == TRUE && is.null(aggregation_from_tokens_to_word_types)) ||
(decontextualize == TRUE && is.null(aggregation_from_layers_to_tokens))) {
stop(message(
      colourise("When using decontextualize = TRUE, it is required to set aggregation_from_layers_to_tokens,
      aggregation_from_tokens_to_texts, as well as aggregation_from_tokens_to_word_types",
        fg = "red"
      ),
      colourise("This is because both the token embeddings and text embeddings are
      constructed from the word type embeddings.",
fg = "green"
)
))
}
output <- list()
# Number of layers to retrieve (if -2 is given; i.e., getting the second to last layer)
  layers <- find_layer_number(layers, hg_gated, hg_token, model)
# Select all character variables and make them UTF-8 coded (e.g., BERT wants it that way).
data_character_variables <- select_character_v_utf8(texts)
# Check for ASCII characters
problematic_texts <- textFindNonASCII(data_character_variables)
#### Clean ASCII ####
  if (nrow(problematic_texts) > 0) {
data_character_variables <- textCleanNonASCIIinfo(
data_tibble = data_character_variables,
problematic_texts = problematic_texts,
remove_non_ascii = remove_non_ascii
)
}
#### Get Layers & Aggregate layers ####
outcome_list <- list()
#text_i = 1
  for (text_i in seq_len(ncol(data_character_variables))) {
texts <- data_character_variables[text_i]
# Get hidden states/layers for output 1 and/or output 2 or decontextualized;
if (!is.null(aggregation_from_layers_to_tokens) ||
!is.null(aggregation_from_tokens_to_texts) ||
decontextualize) {
all_wanted_layers <- textEmbedRawLayers(
texts = texts,
model = model,
layers = layers,
return_tokens = TRUE,
word_type_embeddings = TRUE,
decontextualize = decontextualize,
device = device,
tokenizer_parallelism = tokenizer_parallelism,
model_max_length = model_max_length,
max_token_to_sentence = max_token_to_sentence,
hg_gated = hg_gated,
hg_token = hg_token,
logging_level = logging_level
, ...
)
}
# Generate placement vectors if there are NA:s in texts.
    if (any(is.na(texts))) {
all_wanted_layers <- generate_placement_vector(
raw_layers = all_wanted_layers,
texts = texts
)
}
if (!decontextualize) {
# 1. Get token-level embeddings with aggregated levels
if (!is.null(aggregation_from_layers_to_tokens) && keep_token_embeddings) {
token_embeddings <- textEmbedLayerAggregation(
word_embeddings_layers = all_wanted_layers$context_tokens,
layers = layers,
aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
aggregation_from_tokens_to_texts = NULL,
          return_tokens = TRUE, # needs to be TRUE
tokens_select = tokens_select,
tokens_deselect = tokens_deselect
)
output$tokens <- token_embeddings
}
# 2. Get aggregated token layers; aggregation_from_tokens_to_texts = "mean"
if (!is.null(aggregation_from_tokens_to_texts)) {
aggregated_token_embeddings <- textEmbedLayerAggregation(
word_embeddings_layers = all_wanted_layers$context_tokens,
layers = layers,
aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
          return_tokens = FALSE, # needs to be FALSE
tokens_select = tokens_select,
tokens_deselect = tokens_deselect
)
output$texts <- aggregated_token_embeddings
}
}
# 3. Aggregate Word Type (both decontextualised or not)
if (!is.null(aggregation_from_tokens_to_word_types) || decontextualize) {
if (!decontextualize) {
single_context_text <- paste("Embedding single context embeddings.",
"\n",
sep = ""
)
message(colourise(single_context_text, "purple"))
##############################################################################
# These are the word_type embeddings with duplicates #########################
##############################################################################
individual_word_embeddings_layers <- all_wanted_layers$context_word_type
individual_words <- all_wanted_layers$tokens
}
if (decontextualize) {
single_context_text <- paste("Embedding decontextualised embeddings.",
"\n",
sep = ""
)
message(colourise(single_context_text, "purple"))
individual_word_embeddings_layers <- all_wanted_layers$decontext$word_type
individual_words <- all_wanted_layers$decontext$single_words
}
      # Temporarily switch aggregation_from_tokens_to_texts to NULL so that
      # word types are kept individually (not aggregated across duplicates)
      if (identical(aggregation_from_tokens_to_word_types, "individually")) {
        original_aggregation_from_tokens_to_texts <- aggregation_from_tokens_to_texts
        aggregation_from_tokens_to_texts <- NULL
      }
individual_word_embeddings <- textEmbedLayerAggregation(
word_embeddings_layers = individual_word_embeddings_layers,
layers = layers,
aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
return_tokens = FALSE,
tokens_select = tokens_select,
tokens_deselect = tokens_deselect
)
      # Switch aggregation_from_tokens_to_texts back
      if (identical(aggregation_from_tokens_to_word_types, "individually")) {
        aggregation_from_tokens_to_texts <- original_aggregation_from_tokens_to_texts
      }
individual_word_embeddings <- dplyr::bind_rows(individual_word_embeddings)
      # Instead of aggregating word_type embeddings, keep them individually.
      # To do so, "individual_words" must be expanded to match the number of
      # rows in "individual_word_embeddings".
      if (identical(aggregation_from_tokens_to_word_types, "individually")) {
        individual_words <- tibble::tibble(individual_words)
        # Repeat each row index n times: a word occurring n times is inserted
        # n times, so the rows align with "individual_word_embeddings".
        row_indices <- rep(seq_along(individual_words$n), individual_words$n)
        individual_words <- individual_words[row_indices, ] %>%
          mutate(
            id = seq_along(n), # id column
            n = 1
          )
}
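      # Worked example (hypothetical counts): if individual_words$n is c(2, 1),
      # then row_indices = rep(1:2, c(2, 1)) = c(1, 1, 2), duplicating the
      # first word's row so rows align one-to-one with its context embeddings.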
# Combine the words for each decontextualized embedding
individual_word_embeddings_words <- dplyr::bind_cols(
individual_words, # all_wanted_layers$decontext$single_words,
individual_word_embeddings
)
comment(individual_word_embeddings_words) <- paste(
comment(all_wanted_layers$context_tokens),
comment(individual_word_embeddings),
" ; aggregation_from_tokens_to_word_types = ", aggregation_from_tokens_to_word_types,
" ; decontextualize = ", decontextualize
)
individual_word_embeddings_words <- list(individual_word_embeddings_words)
names(individual_word_embeddings_words) <- colnames(texts)
output$word_types <- individual_word_embeddings_words
message(colourise("Done! \n", "purple"))
}
#### Decontextualised tokens and text embeddings (using output from 3 above) ####
if (decontextualize) {
decontext_space <- individual_word_embeddings_words
### 1. Get token-level embeddings with aggregated levels
if (!is.null(aggregation_from_layers_to_tokens)) { # & keep_token_embeddings
# Tokenize texts
output <- list()
token_embeddings_list <- list()
token_embeddings_list$tokens <- list()
if (!tibble::is_tibble(texts)) {
texts <- tibble::as_tibble(texts)
}
for (i_variables in seq_len(ncol(texts))) {
text_tokens <- lapply(texts[[i_variables]], textTokenize,
model = model, max_token_to_sentence = max_token_to_sentence
) # , ...
t_embeddings <- lapply(text_tokens, applysemrep_over_words, decontext_space[[1]], tolower = FALSE)
token_embeddings <- mapply(dplyr::bind_cols, text_tokens, t_embeddings, SIMPLIFY = FALSE)
token_embeddings_list$tokens[[i_variables]] <- token_embeddings
names(token_embeddings_list$tokens[[i_variables]]) <- paste(names(texts)[[i_variables]],
seq_len(length(token_embeddings)),
sep = "_"
)
}
output$tokens <- token_embeddings_list$tokens
}
# 2. Get aggregated token layers; aggregation_from_tokens_to_texts = "mean"
if (!is.null(aggregation_from_tokens_to_texts)) {
      # Function to add dummy token_id and layer_number columns because
      # textEmbedLayerAggregation() expects them (they are missing here
      # because the decontextualize method is used)
add_columns <- function(df) {
df$token_id <- seq_len(nrow(df))
df$layer_number <- rep(1, nrow(df))
return(df)
}
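      # For example (toy values): a three-row token tibble gains
      # token_id = 1:3 and layer_number = c(1, 1, 1).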
token_embeddings1 <- lapply(token_embeddings, add_columns)
text_embeddings <- textEmbedLayerAggregation(token_embeddings1,
aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
return_tokens = FALSE,
...
)
output$texts <- text_embeddings
}
}
T2_textEmbed <- Sys.time()
    # Report the duration in seconds so that combine_textEmbed_results() can parse it.
    Time_textEmbed <- difftime(T2_textEmbed, T1_textEmbed, units = "secs")
    Time_textEmbed <- sprintf("Duration to embed text: %f %s", Time_textEmbed, units(Time_textEmbed))
Date_textEmbed <- Sys.time()
if (dim_name == TRUE && !is.null(aggregation_from_tokens_to_texts)) {
output$texts <- textDimName(output$texts)
}
comment(output) <- paste(Time_textEmbed,
"; Date created: ", Date_textEmbed,
"; text_version: ", packageVersion("text"), ".",
sep = "",
collapse = " "
)
outcome_list[[text_i]] <- output
}
if (ncol(data_character_variables) > 1) {
output1 <- Reduce(utils::modifyList, outcome_list)
} else {
output1 <- outcome_list[[1]]
}
return(output1)
}
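#' Minimal sketch (not used by the package) of how text_embed() merges its
#' per-variable outputs above: Reduce(utils::modifyList, ...) folds the list of
#' results into one list, recursively by name, so later elements extend earlier
#' ones. Toy, hypothetical values for illustration only.
#' @return A merged list with tokens and texts entries for both toy variables.
#' @noRd
demo_merge_text_embed_outputs <- function() {
  out_a <- list(tokens = list(var_a = "tokens_a"), texts = list(var_a = "texts_a"))
  out_b <- list(tokens = list(var_b = "tokens_b"), texts = list(var_b = "texts_b"))
  # modifyList() merges nested lists by name, keeping entries from both sides.
  Reduce(utils::modifyList, list(out_a, out_b))
}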
#' Combine results from batches, accounting for tokens, texts, and word_types.
#' @param batch_results The results from each batch.
#' @param aggregation The aggregation method.
#' @return A list combining the tokens, texts, and word_types results across batches.
#' @importFrom stringi stri_match stri_trim
#' @importFrom purrr map2 map_chr
#' @importFrom dplyr group_by summarize across
#' @noRd
combine_textEmbed_results <- function(
batch_results,
aggregation = "mean") {
# Initialize combined structure
combined_results <- list(tokens = list(), texts = list(), word_types = list())
total_duration <- 0
creation_dates <- c()
text_versions <- c()
# Combine results for each component
for (result in batch_results) {
# Combine tokens while preserving names and comments
if (!is.null(result$tokens)) {
for (name in names(result$tokens)) {
if (is.null(combined_results$tokens[[name]])) {
combined_results$tokens[[name]] <- result$tokens[[name]]
comment(combined_results$tokens[[name]]) <- comment(result$tokens[[name]])
} else {
combined_results$tokens[[name]] <- c(
combined_results$tokens[[name]],
result$tokens[[name]]
)
# Concatenate unique comments
current_comment <- comment(combined_results$tokens[[name]])
new_comment <- comment(result$tokens[[name]])
comment(combined_results$tokens[[name]]) <- unique(c(
current_comment,
new_comment
)) %>% paste(collapse = " | ")
}
}
}
# Combine texts while preserving names and comments
if (!is.null(result$texts)) {
for (name in names(result$texts)) {
if (is.null(combined_results$texts[[name]])) {
combined_results$texts[[name]] <- result$texts[[name]]
comment(combined_results$texts[[name]]) <- comment(result$texts[[name]])
} else {
combined_results$texts[[name]] <- dplyr::bind_rows(
combined_results$texts[[name]],
result$texts[[name]]
)
# Concatenate unique comments
current_comment <- comment(combined_results$texts[[name]])
new_comment <- comment(result$texts[[name]])
comment(combined_results$texts[[name]]) <- unique(c(
current_comment,
new_comment
)) %>% paste(collapse = " | ")
}
}
}
# Combine word_types while preserving names and comments
if (!is.null(result$word_types)) {
for (name in names(result$word_types)) {
if (is.null(combined_results$word_types[[name]])) {
combined_results$word_types[[name]] <- result$word_types[[name]]
comment(combined_results$word_types[[name]]) <- comment(result$word_types[[name]])
} else {
combined_results$word_types[[name]] <- dplyr::bind_rows(
combined_results$word_types[[name]],
result$word_types[[name]]
)
# Concatenate unique comments
current_comment <- comment(combined_results$word_types[[name]])
new_comment <- comment(result$word_types[[name]])
comment(combined_results$word_types[[name]]) <- unique(c(
current_comment,
new_comment
)) %>% paste(collapse = " | ")
}
}
}
# Collect top-level comment details
if (!is.null(comment(result))) {
parsed_comment <- stringi::stri_match(
comment(result),
regex = "Duration to embed text: ([0-9.]+) secs; Date created: ([^;]+); text_version: ([^;]+)\\."
)
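      # Example (hypothetical comment string): "Duration to embed text: 12.5 secs;
      # Date created: 2024-05-01 10:00:00; text_version: 1.0.0." gives
      # parsed_comment[2] = "12.5", [3] = "2024-05-01 10:00:00", [4] = "1.0.0".
      # Note that the regex only matches durations reported in secs.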
if (!is.na(parsed_comment[2])) total_duration <- total_duration + as.numeric(parsed_comment[2])
if (!is.na(parsed_comment[3])) creation_dates <- c(creation_dates, parsed_comment[3])
if (!is.na(parsed_comment[4])) text_versions <- c(text_versions, parsed_comment[4])
}
}
# Aggregate word_types to remove duplicates and assign comments
combined_results$word_types <- purrr::map2(
combined_results$word_types,
names(combined_results$word_types),
function(word_type_tibble, word_type_name) {
aggregated_tibble <- word_type_tibble %>%
dplyr::group_by(words) %>%
dplyr::summarize(
n = sum(n, na.rm = TRUE),
dplyr::across(starts_with("Dim"), ~ {
weights <- word_type_tibble$n[dplyr::cur_group_rows()]
if (aggregation == "mean" && !is.null(weights)) {
sum(.x * weights, na.rm = TRUE) / sum(weights, na.rm = TRUE)
} else {
textEmbeddingAggregation(as.matrix(.x), aggregation = aggregation)
}
}, .names = "{.col}")
)
# Assign comment to the aggregated tibble
original_comments <- purrr::map_chr(batch_results, ~ comment(.x$word_types[[word_type_name]])) %>%
unique() %>%
stats::na.omit() %>%
paste(collapse = " | ")
comment(aggregated_tibble) <- original_comments
return(aggregated_tibble)
}
)
# Deduplicate and summarize top-level comments
unique_dates <- unique(creation_dates)
unique_versions <- unique(text_versions)
date_summary <- if (length(unique_dates) == 1) {
sprintf("Date created: %s", unique_dates)
} else if (length(unique_dates) > 1) {
sprintf("Date range: %s to %s", min(unique_dates), max(unique_dates))
} else {
""
}
version_summary <- if (length(unique_versions) == 1) {
sprintf("text_version: %s", unique_versions)
} else {
""
}
# Set the summarized top-level comment
comment(combined_results) <- paste(
sprintf("Total duration to embed text: %.2f secs", total_duration),
date_summary,
version_summary,
sep = "; "
) %>% stringi::stri_trim()
return(combined_results)
}
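#' Minimal sketch (not used by the package) of the weighted word-type
#' aggregation in combine_textEmbed_results() above: duplicate words from
#' different batches collapse to one row, with the Dim columns averaged
#' weighted by the counts n. Toy, hypothetical values for illustration only.
#' @return A tibble with one row per word.
#' @noRd
demo_weighted_word_type_aggregation <- function() {
  word_types <- tibble::tibble(
    words = c("happy", "happy", "sad"),
    n = c(2, 3, 1),
    Dim1 = c(0.2, 0.4, -0.1)
  )
  word_types %>%
    dplyr::group_by(words) %>%
    dplyr::summarize(
      # Weighted mean: for "happy", (0.2 * 2 + 0.4 * 3) / (2 + 3) = 0.32.
      dplyr::across(dplyr::starts_with("Dim"), ~ sum(.x * n) / sum(n)),
      n = sum(n)
    )
}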
#' textEmbed() extracts layers and aggregates them to word embeddings, for all character variables in a given dataframe.
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#' For full list of options see pretrained models at
#' \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#' For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base". Only load models that you trust from HuggingFace; loading a
#' malicious model can execute arbitrary code on your computer.
#' @param layers (string or numeric) Specify the layers that should be extracted
#' (default -2, which gives the second-to-last layer). It is more efficient to only extract the layers
#' that you need (e.g., 11). You can also extract several layers (e.g., 11:12), or all layers by setting this
#' parameter to "all". Layer 0 is the decontextualized input layer (i.e., not comprising hidden states) and
#' thus should normally not be used. The extracted layers can then be aggregated with the
#' textEmbedLayerAggregation function.
#' @param dim_name (boolean) If TRUE append the variable name after all variable-names in the output.
#' (This differentiates between word embedding dimension names; e.g., Dim1_text_variable_name).
#' see \code{\link{textDimName}} to change names back and forth.
#' @param aggregation_from_layers_to_tokens (string) Method to aggregate the contextualized layers
#' of each token: "min", "max" or "mean", which take the minimum, maximum or mean across each column;
#' or "concatenate", which links together each word embedding layer into one long row.
#' @param aggregation_from_tokens_to_texts (string) Method to aggregate the word embeddings
#' of the words/tokens: "min", "max" or "mean", which take the minimum, maximum or mean across each column;
#' or "concatenate", which links together each layer of the word embedding into one long row (default = "mean").
#' If set to NULL, embeddings are not aggregated.
#' @param aggregation_from_tokens_to_word_types (string) Aggregates to the word type (i.e., the individual words)
#' rather than to the texts. If set to "individually", duplicate words are not aggregated (i.e., the context of each
#' individual word is preserved) (default = NULL).
#' @param keep_token_embeddings (boolean) Whether to also keep token embeddings when using texts or word
#' types aggregation.
#' @param batch_size Number of rows in each batch.
#' @param remove_non_ascii (boolean) If TRUE, warns about and removes non-ASCII characters (using textFindNonASCII()).
#' @param tokens_select Option to select word embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param tokens_deselect Option to deselect embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param decontextualize (boolean) Provide word embeddings of single words as input to the model
#' (these embeddings are, e.g., used for plotting; default = FALSE).
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence.
#' @param tokenizer_parallelism (boolean) If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number such as 'mps:1'.
#' @param hg_gated Set to TRUE if the accessed model is gated.
#' @param hg_token The token needed to access the gated model.
#' Create a token from the ['Settings' page](https://huggingface.co/settings/tokens) of
#' the Hugging Face website. The environment variable HUGGINGFACE_TOKEN can
#' be set to avoid the need to enter the token each time.
#' @param logging_level Set the logging level (default = "error").
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @param dlatk_method (boolean; experimental) If TRUE, the text is split using the DLATK method; this method appears
#' better for longer texts (but it does not return token-level word embeddings, nor word_types embeddings, at this stage).
#' @param trust_remote_code (boolean) Whether to use a model with custom code on the Hugging Face Hub.
#' @param ... Settings from textEmbedRawLayers().
#' @return A list with, depending on the settings, token-level embeddings (tokens),
#' text-level embeddings (texts) and word-type embeddings (word_types); comments
#' attached to the output describe how the embeddings were created.
#' @examples
#' # Automatically transforms the characters in the example dataset:
#' # Language_based_assessment_data_8 (included in text-package), to embeddings.
#' \dontrun{
#' word_embeddings <- textEmbed(Language_based_assessment_data_8[1:2, 1:2],
#' layers = 10:11,
#' aggregation_from_layers_to_tokens = "concatenate",
#' aggregation_from_tokens_to_texts = "mean",
#' aggregation_from_tokens_to_word_types = "mean"
#' )
#'
#' # Show information about how the embeddings were constructed.
#' comment(word_embeddings$texts$satisfactiontexts)
#' comment(word_embeddings$word_types)
#' comment(word_embeddings$tokens$satisfactiontexts)
#'
#' # See how the word embeddings are structured.
#' word_embeddings
#'
#' # Save the word embeddings to avoid having to embed the text again.
#' saveRDS(word_embeddings, "word_embeddings.rds")
#'
#' # Retrieve the saved word embeddings.
#' word_embeddings <- readRDS("word_embeddings.rds")
#' }
#'
#' @seealso See \code{\link{textEmbedLayerAggregation}}, \code{\link{textEmbedRawLayers}} and
#' \code{\link{textDimName}}.
#' @importFrom reticulate source_python
#' @importFrom utils modifyList
#' @export
textEmbed <- function(
texts,
model = "bert-base-uncased",
layers = -2,
dim_name = TRUE,
aggregation_from_layers_to_tokens = "concatenate",
aggregation_from_tokens_to_texts = "mean",
aggregation_from_tokens_to_word_types = NULL,
keep_token_embeddings = TRUE,
batch_size = 100,
remove_non_ascii = TRUE,
tokens_select = NULL,
tokens_deselect = NULL,
decontextualize = FALSE,
model_max_length = NULL,
max_token_to_sentence = 4,
tokenizer_parallelism = FALSE,
device = "cpu",
hg_gated = FALSE,
hg_token = Sys.getenv("HUGGINGFACE_TOKEN",
unset = ""),
logging_level = "error",
dlatk_method = FALSE,
trust_remote_code = FALSE,
...) {
T1 <- Sys.time()
  if (!tibble::is_tibble(texts)) {
    texts <- tibble::tibble(texts = texts)
  }
  # Split texts into consecutive batches of batch_size rows
  split_into_batches <- function(data, batch_size) {
    split(data, ceiling(seq_len(nrow(data)) / batch_size))
  }
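  # For example, 5 rows with batch_size = 2 give ceiling(1:5 / 2) = c(1, 1, 2, 2, 3),
  # i.e., three batches holding rows 1-2, 3-4, and 5.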
batches <- split_into_batches(texts, batch_size)
  # Process each batch and store the results
batch_results <- list()
for (i in seq_along(batches)) {
batch_message <- sprintf("Processing batch %d/%d\n", i, length(batches))
message(colourise(batch_message, "blue"))
batch_texts <- batches[[i]]
# Process batch with error handling
if(dlatk_method == FALSE){
batch_result <- tryCatch(
text_embed(
texts = batch_texts,
model = model,
layers = layers,
dim_name = dim_name,
aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
aggregation_from_tokens_to_word_types = aggregation_from_tokens_to_word_types,
keep_token_embeddings = keep_token_embeddings,
remove_non_ascii = remove_non_ascii,
tokens_select = tokens_select,
tokens_deselect = tokens_deselect,
decontextualize = decontextualize,
model_max_length = model_max_length,
max_token_to_sentence = max_token_to_sentence,
tokenizer_parallelism = tokenizer_parallelism,
device = device,
hg_gated = hg_gated,
hg_token = hg_token,
logging_level = logging_level
, ...
      ),
error = function(e) {
message(sprintf("Error in batch %d: %s", i, e$message))
return(NULL)
}
)
}
if(dlatk_method == TRUE){
# Process batch with error handling
batch_result <- tryCatch(
text_embed_dlatk(
texts = batch_texts,
model = model,
layers = layers,
dim_name = dim_name,
# aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
# aggregation_from_tokens_to_word_types = aggregation_from_tokens_to_word_types,
# keep_token_embeddings = keep_token_embeddings,
remove_non_ascii = remove_non_ascii,
# tokens_select = tokens_select,
# tokens_deselect = tokens_deselect,
# decontextualize = decontextualize,
model_max_length = model_max_length,
# max_token_to_sentence = max_token_to_sentence,
tokenizer_parallelism = tokenizer_parallelism,
device = device,
hg_gated = hg_gated,
hg_token = hg_token,
trust_remote_code = trust_remote_code,
logging_level = logging_level,
          batch_size = as.integer(batch_size)
, ...
),
error = function(e) {
message(sprintf("Error in batch %d: %s", i, e$message))
return(NULL)
}
)
}
batch_results[[i]] <- batch_result
T2 <- Sys.time()
time_from_starts <- round(as.numeric(difftime(T2, T1, units = "mins")), 3)
time_from_message <- paste("Minutes from start: ", time_from_starts)
message(colourise(time_from_message, "green"))
batches_left <- length(batches) - i
    mean_time_per_batch <- time_from_starts / i
    estimated_time_left <- round(mean_time_per_batch * batches_left, 2)
    estimation_message <- paste0("Estimated embedding time left = ", estimated_time_left, " minutes")
message(colourise(estimation_message, "black"))
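    # For example, after 2 of 5 batches in 4 minutes: 4 / 2 = 2 minutes per
    # batch and 3 batches left, so about 6 minutes remain.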
}
final_result <- combine_textEmbed_results(
batch_results,
aggregation = aggregation_from_tokens_to_word_types)
return(final_result)
}
#' Change dimension names
#'
#' textDimName() changes the names of the dimensions in the word embeddings.
#' @param word_embeddings List of word embeddings
#' @param dim_names (boolean) If TRUE, the word embedding name will be attached to the name of each dimension;
#' if FALSE, the attached part of the name will be removed.
#' @return Word embeddings with changed names.
#' @examples
#' \donttest{
#' # Note that dimensions are called Dim1_harmonytexts etc.
#' word_embeddings_4$texts$harmonytexts
#' # Here they are changed to just Dim
#' w_e_T <- textDimName(word_embeddings_4$texts["harmonytexts"],
#' dim_names = FALSE
#' )
#' # Here they are changed back
#' w_e_F <- textDimName(w_e_T, dim_names = TRUE)
#' }
#' @seealso see \code{\link{textEmbed}}
#' @export
textDimName <- function(word_embeddings,
dim_names = TRUE) {
tokens <- NULL
word_type <- NULL
x_is_tibble <- tibble::is_tibble(word_embeddings)
if (x_is_tibble) word_embeddings <- list(word_embeddings)
  # Set aside word_type embeddings if they exist (reattached at the end)
if (!is.null(word_embeddings$word_type)) {
word_type <- word_embeddings$word_type
word_embeddings$word_type <- NULL
}
if (!is.null(word_embeddings$tokens)) {
tokens <- word_embeddings$tokens
word_embeddings$tokens <- NULL
}
if (dim_names) {
for (i_row in seq_len(length(word_embeddings))) {
colnames(word_embeddings[[i_row]]) <- paste0(
names(word_embeddings[[i_row]]),
"_",
names(word_embeddings)[[i_row]]
)
}
}
if (!dim_names) {
for (i_row in seq_len(length(word_embeddings))) {
target_variables_names <- colnames(word_embeddings[[i_row]])
# Select everything BEFORE the first _ (i.e., the Dim1, etc.)
variable_names <- sub("\\_.*", "", target_variables_names)
colnames(word_embeddings[[i_row]]) <- variable_names
}
}
# Attach word embeddings again
if (!is.null(word_type)) {
word_embeddings$word_type <- word_type
}
  # Attach token embeddings again
if (!is.null(tokens)) {
word_embeddings$tokens <- tokens
}
# Return tibble if x is a tibble (and not a list)
if (x_is_tibble) word_embeddings <- word_embeddings[[1]]
return(word_embeddings)
}
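#' Minimal sketch (not used by the package) of the renaming logic in
#' textDimName(): variable names are appended after each dimension name with
#' paste0(), and sub("\\_.*", "", x) strips everything from the first
#' underscore to restore plain dimension names. Toy names for illustration only.
#' @return A character vector: "Dim1" "Dim2".
#' @noRd
demo_dim_name_roundtrip <- function() {
  appended <- paste0(c("Dim1", "Dim2"), "_", "harmonytexts") # attach the names
  sub("\\_.*", "", appended) # remove them again -> "Dim1" "Dim2"
}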