R/1_1_textEmbed.R


#' Find the encoding of a character variable and then set it to UTF-8.
#' @param x A character vector (e.g., a text column from a tibble).
#' @return The character vector with all elements encoded as UTF-8.
#' @noRd
get_encoding_change <- function(x) {
  Encoding(x) <- Encoding(enc2utf8(x))
  x
}


#' Select all character variables and make them UTF-8 encoded (transformer models such as BERT expect UTF-8).
#' @param x A character variable or a tibble including both character and numeric variables.
#' @return A tibble containing all character variables in UTF-8 format.
#' @importFrom dplyr select_if
#' @importFrom tibble as_tibble
#' @importFrom purrr map
#' @noRd
select_character_v_utf8 <- function(x) {
  # If a vector is submitted, make it a tibble column.
  if (is.vector(x) && !is.list(x)) {
    # Select variable name to have as column name in the end
    colname_x <- deparse(substitute(x))
    # Remove everything before a "$"
    colname_x <- gsub("^.*\\$", "", colname_x)

    x <- tibble::as_tibble_col(x)
    colnames(x) <- colname_x
  }
  # Select all character variables
  x_characters <- dplyr::select_if(x, is.character)
  # This makes sure that all variables are UTF-8 coded
  x_characters <- tibble::as_tibble(purrr::map(x_characters, get_encoding_change))

  return(x_characters)
}
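
# Illustrative sketch (kept as a comment so it is not run at package load): select_character_v_utf8()
# accepts either a bare character vector or a tibble mixing character and numeric columns; only the
# character columns are kept and re-encoded as UTF-8. The column names and values below are made up.
#   mixed_tb <- tibble::tibble(
#     harmony_text = c("I feel calm", "Life is balanced"),
#     age = c(25, 30)
#   )
#   select_character_v_utf8(mixed_tb)                 # one-column tibble: harmony_text
#   select_character_v_utf8(mixed_tb$harmony_text)    # a bare character vector also works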


#' Normalize a vector to unit length (i.e., a unit vector).
#'
#' @param x a word embedding
#' @return normalized (unit) vector/word embedding.
#' @noRd
normalizeV <- function(x) {
  x / sqrt(sum(x^2, na.rm = TRUE))
}
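
# Illustrative sketch (kept as a comment so it is not run at package load): normalizeV() divides a
# numeric vector by its Euclidean norm, so the result has length one. The values are made up.
#   v <- c(3, 4)
#   normalizeV(v)                  # 0.6 0.8
#   sqrt(sum(normalizeV(v)^2))     # 1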

#' Function to take the min, max, or mean across a set of vectors, to concatenate them,
#' or to sum and normalize them (e.g., aggregating layers or tokens from transformer models).
#' @param x word embeddings to be aggregated (a tibble with Dim columns).
#' @param aggregation method to carry out the aggregation: "min", "max" and "mean" take the
#' minimum, maximum or mean across each column; "concatenate" links together each word embedding layer
#' into one long row; and "normalize" sums the columns and scales the result to a unit vector.
#' @return aggregated word embeddings.
#' @importFrom tibble as_tibble_row
#' @importFrom purrr map
#' @noRd
textEmbeddingAggregation <- function(x, aggregation = "min") {
  if (aggregation == "min") {
    min_vector <- unlist(purrr::map(x, min, na.rm = TRUE))
    min_vector
  } else if (aggregation == "max") {
    max_vector <- unlist(purrr::map(x, max, na.rm = TRUE))
    max_vector
  } else if (aggregation == "mean") {
    mean_vector <- colMeans(x, na.rm = TRUE)
    mean_vector
  } else if (aggregation == "concatenate") {
    long_vector <- c(t(x)) %>% tibble::as_tibble_row(.name_repair = "minimal")
    colnames(long_vector) <- paste0("Dim", seq_len(length(long_vector)))

    variable_name <- names(x)[1]

    # If original name is not just Dim1, then add back Dim1_variable.name
    if (!variable_name == "Dim1") {
      variable_name <- sub(".*Dim1_", "", variable_name)

      colnames(long_vector) <- paste0(
        names(long_vector),
        "_",
        variable_name
      )
    }
    long_vector
  } else if (aggregation == "normalize") {
    sum_vector <- unlist(purrr::map(x, sum, na.rm = TRUE))
    normalized_vector <- normalizeV(sum_vector)
    normalized_vector
  }
}
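
# Illustrative sketch (kept as a comment so it is not run at package load): aggregating a small
# "embedding" tibble in which rows are layers (or tokens) and columns are dimensions; values are made up.
#   emb <- tibble::tibble(Dim1 = c(1, 3), Dim2 = c(2, 4))
#   textEmbeddingAggregation(emb, aggregation = "mean")           # Dim1 = 2, Dim2 = 3
#   textEmbeddingAggregation(emb, aggregation = "concatenate")    # Dim1 = 1, Dim2 = 2, Dim3 = 3, Dim4 = 4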


#' getUniqueWordsAndFreq
#' Function that unites several text variables and rows into one,
#' where all text is transformed to lowercase and tokenized.
#' It also gives word frequencies.
#' @param x_characters A character column in a tibble.
#' @param hg_tokenizer (string) Optional Hugging Face model name; if provided, tokenization is carried out
#' with textTokenize using that model (default NULL uses stringi word boundaries).
#' @return A tibble with a unique words column and a column with their respective frequency.
#' @importFrom tibble tibble
#' @importFrom stringi stri_c stri_trans_tolower
# @importFrom stringr str_c str_split stri_split_boundaries
# @importFrom tokenizers tokenize_words
#' @noRd
getUniqueWordsAndFreq <- function(x_characters, hg_tokenizer = NULL, ...) {
  if (is.null(hg_tokenizer)) {
    # Unite all text variables into one
    x_characters2 <- tidyr::unite(x_characters, "x_characters2", seq_len(ncol(x_characters)), sep = " ")

    # unite all rows in the column into one cell
    x_characters3 <- stringi::stri_c(x_characters2$x_characters2, collapse = " ")
    # Tokenize into single words
    x_characters4a <- stringi::stri_trans_tolower(x_characters3)
    x_characters4b <- stringi::stri_split_boundaries(x_characters4a,
      type = "word",
      skip_word_none = TRUE,
      skip_word_number = FALSE
    )[[1]]
    # Create dataframe with single words and frequency
    x_characters5 <- data.frame(sort(table(unlist(strsplit(tolower(x_characters4b), " ")))))
  }

  if (!is.null(hg_tokenizer)) {
    x_characters4b <- lapply(list(x_characters), textTokenize, model = hg_tokenizer, ...)
    x_characters5 <- data.frame(sort(table(unlist(x_characters4b))))
  }

  if (length(x_characters5) == 1) {
    colnames(x_characters5) <- c("Freq")
    x_characters5 <- tibble::rownames_to_column(x_characters5, "Var1")
  }

  singlewords <- tibble::tibble(x_characters5$Var1, x_characters5$Freq)
  colnames(singlewords) <- c("words", "n")
  singlewords$words <- as.character(singlewords$words)

  return(singlewords)
}
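
# Illustrative sketch (kept as a comment so it is not run at package load): word frequencies from a
# small, made-up text column. Without hg_tokenizer the text is lower-cased and split on word boundaries.
#   texts_tb <- tibble::tibble(text = c("I am happy", "I am I am"))
#   getUniqueWordsAndFreq(texts_tb)
#   # a tibble with columns `words` and `n`, e.g., happy = 1, am = 3, i = 3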


#' This is a function that sorts out (i.e., tidies) the embeddings from the huggingface interface.
#' @param x list of layers.
#' @param layers the layers to get (setting comes from textEmbedRawLayers).
#' @param return_tokens boolean indicating whether tokens were returned (setting comes from textEmbedRawLayers).
#' @return Layers in tidy tibble format with each dimension column called Dim1, Dim2, etc.
#' @noRd
sortingLayers <- function(x,
                           layers = layers,
                           return_tokens = return_tokens) {
  # If selecting "all" layers, find out number of layers to help indicate layer index later in code
  if (is.character(layers)) {
    layers <- 0:(length(x[[1]][[1]]) - 1)
  }

  # Find the number of dimensions (the location differs depending on whether return_tokens is TRUE or FALSE)
  if (return_tokens) {
    dimensions <- length(x[[1]][[1]][[1]][[1]][[1]])
    participants <- length(x[[1]])
  } else {
    dimensions <- length(x[[1]][[1]][[1]][[1]])
    participants <- length(x)
  }

  # Tidy-structure the tokens and embeddings, looping over the texts with purrr::map()
  variable_x <- purrr::map(1:participants, function(i_in_variable) {
    if (return_tokens) {
      tokens <- x[[2]][[i_in_variable]]
      token_id <- seq_len(length(tokens))
      all_layers <- x[[1]][[i_in_variable]]
    } else {
      tokens <- NULL
      all_layers <- x[[i_in_variable]]
      # Count number of embeddings within one layer
      token_id <- seq_len(length(all_layers[[1]][[1]]))
    }

    # Number of tokens (valid also when return_tokens is FALSE and tokens is NULL)
    totalTokensNum <- length(token_id)

    tarTb <- numeric(length = totalTokensNum * length(layers) * dimensions)

    tarTb <- reticulate::np_array(tarTb)

    tarTb <- tibble::as_tibble(
      reticulate::py_to_r(
        reticulate::array_reshape(tarTb, c(totalTokensNum * length(layers), dimensions))
      ),
      .name_repair = "minimal"
    )

    colnames(tarTb) <- paste0("Dim", seq_len(dimensions))

    purrr::map(seq_len(totalTokensNum), function(i) {
      purrr::map(seq_len(length(layers)), function(j) {
        k <- j - 1
        tarTb[i + totalTokensNum * k,] <<- as.list(all_layers[[j]][[1]][[i]])
      })
    })

    # Add tokens, token IDs, and layer numbers to output tibble
    if (return_tokens) {
      tarTb <- cbind(tokens, token_id, layer_number = rep(layers, each = totalTokensNum), tarTb) %>%
        tibble::as_tibble()
    } else {
      tarTb <- cbind(token_id, layer_number = rep(layers, each = totalTokensNum), tarTb) %>%
        tibble::as_tibble()
    }

    tarTb
  })

  variable_x
}

#' This is a function that uses textEmbeddingAggregation to aggregate the layers
#' @param x list of layers.
#' @param aggregation method to aggregate the layers.
#' @param return_tokens (boolean) returns the tokens as the first column.
#' @return Aggregated layers in tidy tibble format.
#' @noRd
layer_aggregation_helper <- function(x,
                                     aggregation = aggregation,
                                     return_tokens = FALSE) {
  aggregated_layers_saved <- list()

  # Get unique number of token ids in the variable starting with x$token_id ; i_token_id=1
  number_of_ids <- unique(x[, grep("^token_id", names(x))][[1]])

  # Loops over the number of tokens; i_token_id = 1
  for (i_token_id in seq_len(length(number_of_ids))) {
    # Selects all the layers for each token/token_id
    x1 <- x[x[, grep("^token_id", names(x))][[1]] == i_token_id, ]
    # Select only Dimensions
    x2 <- dplyr::select(x1, dplyr::starts_with("Dim"))
    # Aggregate the dimensions
    x3 <- textEmbeddingAggregation(x2, aggregation = aggregation)

    aggregated_layers_saved[[i_token_id]] <- x3
  }
  aggregated_layers_saved1 <- dplyr::bind_rows(aggregated_layers_saved)

  if (return_tokens) {
    # Number of unique layers
    number_of_layers <- unique(x[, grep("^layer_number", names(x))][[1]])
    n_layers <- length(number_of_layers)
    tokens <- x$tokens[1:(length(x$tokens) / n_layers)]
    tokens <- tibble::as_tibble_col(tokens, column_name = "tokens")
    aggregated_layers_saved1 <- dplyr::bind_cols(tokens, aggregated_layers_saved1)
  }

  return(aggregated_layers_saved1)
}
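
# Illustrative sketch (kept as a comment so it is not run at package load): aggregating two layers per
# token down to one row per token. The tibble mimics the tidy output of sortingLayers(); values are made up.
#   x <- tibble::tibble(
#     tokens = c("hello", "world", "hello", "world"),
#     token_id = c(1, 2, 1, 2),
#     layer_number = c(11, 11, 12, 12),
#     Dim1 = c(1, 2, 3, 4),
#     Dim2 = c(5, 6, 7, 8)
#   )
#   layer_aggregation_helper(x, aggregation = "mean", return_tokens = TRUE)
#   # one row per token: hello (Dim1 = 2, Dim2 = 6) and world (Dim1 = 3, Dim2 = 7)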


#' grep_col_by_name_in_list
#' This function finds a column by name regardless of where in the list structure it is.
#' @param l a list.
#' @param pattern what to find, such as the "layer_number" column.
#' @return elements in the column called pattern.
#' @noRd
grep_col_by_name_in_list <- function(l, pattern) {
  u <- unlist(l)
  u[grep(pattern, names(u))]
}
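
# Illustrative sketch (kept as a comment so it is not run at package load): pulling out all
# "layer_number" entries regardless of how deeply they are nested; the list below is made up.
#   nested <- list(a = list(layer_number = c(11, 12)), b = list(layer_number = c(11, 12)))
#   grep_col_by_name_in_list(nested, "layer_number")
#   # named vector: a.layer_number1 = 11, a.layer_number2 = 12, b.layer_number1 = 11, b.layer_number2 = 12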


#' Tokenize according to different huggingface transformers
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#'  For full list of options see pretrained models at
#'  \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#'  For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base".
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence.
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number.
#' @param tokenizer_parallelism If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param logging_level Set the logging level. Default: "error".
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @return Returns tokens according to specified huggingface transformer, as a list of tibbles
#' (one tibble with a "tokens" column per text).
#' @examples
#' \donttest{
#' # tokens <- textTokenize("hello are you?")
#' }
#' @seealso see \code{\link{textEmbed}}
#' @importFrom reticulate source_python
#' @importFrom tibble tibble as_tibble
#' @export
textTokenize <- function(texts,
                         model = "bert-base-uncased",
                         max_token_to_sentence = 4,
                         device = "cpu",
                         tokenizer_parallelism = FALSE,
                         model_max_length = NULL,
                         logging_level = "error") {


  # Run python file with HuggingFace interface to state-of-the-art transformers
  reticulate::source_python(system.file("python",
    "huggingface_Interface3.py",
    # envir = NULL,
    package = "text",
    mustWork = TRUE
  ))

  tokens <- hgTokenizerGetTokens(
    text_strings = texts,
    model = model,
    max_token_to_sentence = max_token_to_sentence,
    device = device,
    tokenizer_parallelism = tokenizer_parallelism,
    model_max_length = model_max_length,
    logging_level = logging_level
  )
  tokens1 <- lapply(tokens, tibble::as_tibble_col, column_name = "tokens")

  return(tokens1)
}
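
# Illustrative usage sketch (kept as a comment; running it requires the Python environment,
# e.g., set up via textrpp_install() and textrpp_initialize(), and downloads a tokenizer).
#   tokens <- textTokenize(c("hello are you?", "I am fine"), model = "bert-base-uncased")
#   tokens[[1]]   # a tibble with one "tokens" column: [CLS], hello, are, you, ?, [SEP]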

#' Extract layers of hidden states (word embeddings) for all character variables in a given dataframe.
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#'  For full list of options see pretrained models at
#'  \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#'  For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base". Only load models that you trust from HuggingFace; loading a
#'  malicious model can execute arbitrary code on your computer).
#' @param layers (string or numeric) Specify the layers that should be extracted
#' (default -2, which gives the second-to-last layer). It is more efficient to only extract the
#' layers that you need (e.g., 11). You can also extract several (e.g., 11:12),
#' or all by setting this parameter to "all". Layer 0 is the decontextualized input layer
#' (i.e., not comprising hidden states) and thus should normally not be used. These layers can then
#'  be aggregated in the textEmbedLayerAggregation function.
#' @param return_tokens If TRUE, provide the tokens used in the specified transformer model.
#' @param word_type_embeddings (boolean) Whether to provide embeddings for each word/token type.
#' @param decontextualize (boolean) Whether to decontextualize embeddings (i.e., embedding one word at a time).
#' @param keep_token_embeddings (boolean) Whether to keep token level embeddings in the output
#' (when using word_types aggregation)
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number.
#' @param tokenizer_parallelism If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before switching to embedding
#' text sentence by sentence.
#' @param logging_level Set the logging level. Default: "error".
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @param sort (boolean) If TRUE, sort the output into tidy format.
#' @return The hidden states (word embeddings) of the specified layers: a tibble with tokens,
#' a column specifying the layer, and the word-embedding dimensions. Note that layer 0 is the input
#' embedding to the transformer, and should normally not be used.
#' @examples
#' \donttest{
#' # texts <- Language_based_assessment_data_8[1:2, 1:2]
#' # word_embeddings_with_layers <- textEmbedRawLayers(texts, layers = 11:12)
#' }
#' @seealso see \code{\link{textEmbedLayerAggregation}} and \code{\link{textEmbed}}
#' @importFrom reticulate source_python
#' @importFrom dplyr %>% bind_rows group_split
#' @importFrom tibble tibble as_tibble
#' @importFrom magrittr set_colnames
#' @export
textEmbedRawLayers <- function(texts,
                               model = "bert-base-uncased",
                               layers = -2,
                               return_tokens = TRUE,
                               word_type_embeddings = FALSE,
                               decontextualize = FALSE,
                               keep_token_embeddings = TRUE,
                               device = "cpu",
                               tokenizer_parallelism = FALSE,
                               model_max_length = NULL,
                               max_token_to_sentence = 4,
                               logging_level = "error",
                               sort = TRUE) {

  if (decontextualize == TRUE & word_type_embeddings == FALSE) {
    stop(cat(
      colourise("decontextualize = TRUE & word_type_embeddings = FALSE has not been implemented in textEmbedRawLayers() at this stage.",
                fg = "red"),
      colourise("When using decontextualize = TRUE you need to create the word_type_embeddings. Creating text embeddings without them would take
                unnecessary time, as it would require sending the same decontextualized words to the transformer multiple times
                (while getting the same results over and over). Consider using textEmbed() to get token embeddings as well as text embeddings.",
                fg = "green")
    ))
  }


  # Run python file with HuggingFace interface to state-of-the-art transformers
  reticulate::source_python(system.file("python",
                                        "huggingface_Interface3.py",
                                        # envir = NULL,
                                        package = "text",
                                        mustWork = TRUE
  ))

  if (is.numeric(layers)) {
    if (max(layers) > textModelLayers(model)) {
      stop("You are trying to extract layers that do not exist in this model.")
    }
  }

  if (layers[1] < 0) {
    n <- textModelLayers(model)
    layers <- 1 + n + layers
    layers
  }


  # Select all character variables and make them UTF-8 coded (e.g., BERT wants it that way).
  data_character_variables <- select_character_v_utf8(texts)


  # Context (default): gives sorted word embeddings based on context
  # (i.e., the entire text is sent to the transformer model)
  if (!decontextualize) {
    x <- data_character_variables
    sorted_layers_ALL_variables <- list()
    sorted_layers_ALL_variables$context_tokens <- list()
    # Loop over all character variables; i_variables = 1
    T_test1 <- Sys.time()
    for (i_variables in seq_len(length(data_character_variables))) {
      T1_variable <- Sys.time()
      # Python file function to HuggingFace
      hg_embeddings <- hgTransformerGetEmbedding(
        text_strings = x[[i_variables]],
        model = model,
        layers = layers,
        return_tokens = return_tokens,
        device = device,
        tokenizer_parallelism = tokenizer_parallelism,
        model_max_length = model_max_length,
        max_token_to_sentence = max_token_to_sentence,
        logging_level = logging_level
      )
      T_test2 <- Sys.time()


      if (sort) {
        variable_x <- sortingLayers(
          x = hg_embeddings,
          layers = layers,
          return_tokens = return_tokens
        )
      } else {
        variable_x <- hg_embeddings
      }

      sorted_layers_ALL_variables$context_tokens[[i_variables]] <- variable_x
      names(sorted_layers_ALL_variables$context_tokens)[[i_variables]] <- names(x)[[i_variables]]


      # Adding informative comment
      layers_string <- paste(as.character(layers), sep = " ", collapse = " ")

      comment(sorted_layers_ALL_variables$context_tokens) <-
        paste("Information about the embeddings. textEmbedRawLayers: ",
        "model: ", model, " ; ",
        "layers: ", layers_string, " ; ",
        "word_type_embeddings: ", word_type_embeddings, " ; ",
        "max_token_to_sentence: ", max_token_to_sentence, " ; ",
        "text_version: ", packageVersion("text"), ".",
        sep = "",
        collapse = "\n"
      )

      ## Timing
      T2_variable <- Sys.time()
      variable_time <- T2_variable - T1_variable
      variable_time <- sprintf(
        "duration: %f %s).",
        variable_time,
        units(variable_time)
      )

      version_seq <- paste(i_variables, "/", length(data_character_variables), sep = "")

      loop_text <- paste("Completed layers output for ",
        names(texts)[[i_variables]], " (variable: ",
        version_seq, ", ",
        variable_time,
        "\n",
        sep = ""
      )

      cat(colourise(loop_text, "green"))
    }
  }

  # Word type embeddings based on Context embeddings
  if (word_type_embeddings & !decontextualize) {

    # see stop in the beginning of this function.

    individual_tokens <- list()
    individual_tokens$context_word_type <- list()
    individual_tokens$tokens <- list()

    # 1. Group individual tokens
    i_we <- suppressWarnings(dplyr::bind_rows(sorted_layers_ALL_variables$context_tokens))
    i_we2 <- dplyr::group_split(i_we, i_we[, grep("^tokens", names(i_we))][[1]])
    names(i_we2) <- paste(rep("word_type", length(i_we2)), seq_len(length(i_we2)), sep = "")
    individual_tokens$context_word_type <- i_we2

    # Specify which token layers go together and ensure that the token_id starts with 1
    # (for textLayersAggregation to know which layers are linked);
    num_layers <- length(layers)

    # Loop over all token list objects to adjust the token_id. i_context = 1
    for (i_context in seq_len(length(individual_tokens$context_word_type))) { # $word_type
      token_id_df <- individual_tokens$context_word_type[[i_context]] # $word_type

      token_id_variable <- token_id_df[, grep("^token_id", names(token_id_df))][[1]]

      num_token <- length(token_id_variable) / num_layers
      token_id <- sort(rep(1:num_token, num_layers))
      individual_tokens$context_word_type[[i_context]][, grep("^token_id", names(token_id_df))][[1]] <- token_id
    }

    # Get first element from each list.
    single_words <- sapply(individual_tokens$context_word_type, function(x) x[[1]][1]) # $word_type
    single_words <- tibble::as_tibble_col(single_words, column_name = "words")

    # n
    n <- sapply(individual_tokens$context_word_type, function(x) length(x[[1]]) / num_layers) # $word_type
    n <- tibble::as_tibble_col(n, column_name = "n")
    single_words_n <- dplyr::bind_cols(single_words, n)
    individual_tokens$tokens <- single_words_n

    sing_text <- c("Completed layers aggregation for word_type_embeddings. \n")
    cat(colourise(sing_text, "green"))
  }

  # Decontextualized embeddings for aggregated embeddings and word type embeddings
  if (decontextualize) {
    individual_tokens <- list()
    individual_tokens$decontext <- list()
    # Get word embeddings for all individual tokens/words (used, e.g., for the word plot)
    data_character_variables1 <- suppressMessages(apply(data_character_variables, 1, bind_cols)) %>%
      bind_rows()

    singlewords <- getUniqueWordsAndFreq(data_character_variables1[[1]],
      hg_tokenizer = model
    )
    list_words <- sapply(singlewords$words, list)
    names(list_words) <- NULL


    hg_decontexts_embeddings <- hgTransformerGetEmbedding(
      text_strings = list_words,
      model = model,
      layers = layers,
      return_tokens = return_tokens,
      device = device,
      tokenizer_parallelism = tokenizer_parallelism,
      model_max_length = model_max_length,
      max_token_to_sentence = max_token_to_sentence,
      logging_level = logging_level
    )

    # Sort out layers as above
    if(sort){
      individual_tokens$decontext$word_type <- sortingLayers(
        x = hg_decontexts_embeddings,
        layers = layers,
        return_tokens = return_tokens)
    } else {
      individual_tokens$decontext$word_type <- hg_decontexts_embeddings
    }

    names(individual_tokens$decontext$word_type) <- NULL
    individual_tokens$decontext$single_words <- singlewords


    # Adding informative data
    layers_string <- paste(as.character(layers), sep = " ", collapse = " ")
    comment(individual_tokens$decontext$word_type) <- paste(
      "Information about the embeddings. textEmbedRawLayers: ",
      "model:", model,
      "layers:", layers_string, ".",
      collapse = " ; "
    )

    comment(individual_tokens$decontext$single_words) <- paste(
      "Information about the embeddings. textEmbedRawLayers: ",
      "model:", model,
      "layers:", layers_string, ".",
      collapse = " ; "
    )

    de_text <- c("Completed layers aggregation for decontextualized embeddings. \n")
    cat(colourise(de_text, "green"))

    individual_tokens
  }

  # Combine previous list and word list
  if (decontextualize == FALSE & word_type_embeddings == FALSE) {
    word_embeddings_with_layers <- c(sorted_layers_ALL_variables)
    rm(sorted_layers_ALL_variables)
  } else if (decontextualize == FALSE & word_type_embeddings == TRUE & keep_token_embeddings == TRUE) {
    word_embeddings_with_layers <- c(sorted_layers_ALL_variables, individual_tokens)
    rm(sorted_layers_ALL_variables)
    rm(individual_tokens)
  } else if (decontextualize == FALSE & word_type_embeddings == TRUE & keep_token_embeddings == FALSE) {
    word_embeddings_with_layers <- c(individual_tokens)
    rm(sorted_layers_ALL_variables)
    rm(individual_tokens)
  } else if (decontextualize == TRUE & word_type_embeddings == TRUE) {
    word_embeddings_with_layers <- c(individual_tokens)
    rm(individual_tokens)
  }

  return(word_embeddings_with_layers)
}
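
# Illustrative usage sketch (kept as a comment; running it requires the Python environment and
# downloads a model). Data and layer choices follow the commented roxygen example above.
#   texts <- Language_based_assessment_data_8[1:2, 1:2]
#   raw_layers <- textEmbedRawLayers(texts, model = "bert-base-uncased", layers = 11:12)
#   names(raw_layers)                # "context_tokens" (plus word-type elements if requested)
#   raw_layers$context_tokens[[1]]   # list of tidy tibbles: tokens, token_id, layer_number, Dim1, ...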


#' Select and aggregate layers of hidden states to form word embeddings.
#' @param word_embeddings_layers Layers outputted from textEmbedRawLayers.
#' @param layers The numbers of the layers to be aggregated
#' (e.g., c(11:12) to aggregate the eleventh and twelfth).
#' Note that layer 0 is the input embedding to the transformer, and should normally not be used.
#' Selecting 'all' thus removes layer 0.
#' @param aggregation_from_layers_to_tokens Method to carry out the aggregation among the layers for each word/token,
#' including "min", "max" and "mean" which takes the minimum, maximum or mean across each column;
#' or "concatenate", which links together each layer of the word embedding to one long row. Default is "concatenate"
#' @param aggregation_from_tokens_to_texts Method to carry out the aggregation among the word embeddings
#' for the words/tokens, including "min", "max" and "mean" which takes the minimum, maximum or mean across each column;
#' or "concatenate", which links together each layer of the word embedding to one long row.
#' @param return_tokens If TRUE, provide the tokens used in the specified transformer model.
#' @param tokens_select Option to only select embeddings linked to specific tokens
#' such as "[CLS]" and "[SEP]" (default NULL).
#' @param tokens_deselect Option to deselect embeddings linked to specific tokens
#' such as "[CLS]" and "[SEP]" (default NULL).
#' @return A tibble with word embeddings. Note that layer 0 is the input embedding to
#' the transformer, which is normally not used.
#' @examples
#' \donttest{
#' # word_embeddings_layers <- textEmbedRawLayers(Language_based_assessment_data_8$harmonywords[1],
#' # layers = 11:12)
#' # word_embeddings <- textEmbedLayerAggregation(word_embeddings_layers$context_tokens, layers = 11)
#' }
#' @seealso see \code{\link{textEmbedRawLayers}} and \code{\link{textEmbed}}
#' @importFrom dplyr %>% bind_rows
#' @export
textEmbedLayerAggregation <- function(word_embeddings_layers,
                                      layers = "all",
                                      aggregation_from_layers_to_tokens = "concatenate",
                                      aggregation_from_tokens_to_texts = "mean",
                                      return_tokens = FALSE,
                                      tokens_select = NULL,
                                      tokens_deselect = NULL) {
  if (return_tokens == TRUE & !is.null(aggregation_from_tokens_to_texts)) {
    stop(cat(
      colourise("return_tokens = TRUE does not work when aggregation_from_tokens_to_texts is not NULL. ", fg = "red"),
      colourise("When aggregating tokens to texts, it is not possible to have return_tokens = TRUE.
                To get both token embeddings and text embeddings, use textEmbed().", fg = "green")
    ))
  }

  # If selecting 'all' layers, find out number of layers to help indicate layer index later in code
  if (is.character(layers)) {
    # Get the first embeddings
    x_layer_unique <- unique(grep_col_by_name_in_list(word_embeddings_layers[[1]], "layer_number"))
    # Get which layers
    layers <- as.numeric(x_layer_unique)
    # Remove layer 0 because it is the input layer for the word embeddings.
    if (layers[1] == 0) {
      layers <- layers[2:length(layers)]
    }
  }

  # Loop over the list of variables; variable_list_i = 1; variable_list_i = 2; remove(variable_list_i)
  selected_layers_aggregated_tibble <- list()
  for (variable_list_i in seq_len(length(word_embeddings_layers))) {
    T1_variable <- Sys.time()

    x <- word_embeddings_layers[[variable_list_i]]

    # This is to ensure x is in a list (to make it work for single word embeddings that are contextualized)
    if (tibble::is_tibble(x)) {
      x <- list(x)
    }
    # Go over the lists and select the layers
    # get number of unique layers in the variable starting with "layer_number"
    number_of_layers <- unique(x[[1]][, grep("^layer_number", names(x[[1]]))][[1]])

    # Check that the selected layers were actually extracted
    if (length(setdiff(layers, number_of_layers)) > 0) {
      stop(cat(
        colourise("You are trying to aggregate layers that were not extracted.", fg = "red"),
        colourise("For example, in textEmbed the layers option needs to include all the
                  layers used in context_layers.", fg = "green")
      ))
    }

    # Select layers in layers-argument selected from the variable starting with layer_number
    selected_layers <- lapply(x, function(x) {
      x[x[, grep("^layer_number", names(x))][[1]]
      %in% layers, ]
    })

    # Go over the lists and select the tokens (e.g., CLS) (tokens_select = NULL tokens_select = "[CLS]")
    if (!is.null(tokens_select)) {
      selected_layers <- lapply(selected_layers, function(x) {
        x[x[, grep("^tokens", names(x))][[1]]
        %in% tokens_select, ]
      })
    }

    # Go over the lists and DEselect the token (e.g., CLS) (tokens_deselect = NULL tokens_deselect = "[CLS]")
    if (!is.null(tokens_deselect)) {
      selected_layers <- lapply(selected_layers, function(x) {
        x[!x[, grep("^tokens", names(x))][[1]]
        %in% tokens_deselect, ]
      })

      # If one of the removed tokens was "[CLS]", subtract one from token_id so it starts at
      # 1 and works with the layer_aggregation_helper
      if (length(tokens_deselect) == 1 && tokens_deselect == "[CLS]") {
        # Subtract
        selected_layers <- purrr::map(selected_layers, function(x) {
          x[, grep("^token_id", names(x))][[1]] <- x[, grep("^token_id", names(x))][[1]] - 1
          x
        })
      } else if (length(tokens_deselect) > 1) {
        if (any(tokens_deselect %in% "[CLS]")) {
          # Subtract
          selected_layers <- purrr::map(selected_layers, function(x) {
            # select variable starting with "token_id" and substract 1
            x[, grep("^token_id", names(x))][[1]] <- x[, grep("^token_id", names(x))][[1]] - 1
            x
          })
        }
      }
    }

    ## Aggregate across layers; i_token_id=1 aggregation_from_layers_to_tokens="min"
    selected_layers_aggregated <- lapply(selected_layers,
      layer_aggregation_helper,
      aggregation = aggregation_from_layers_to_tokens,
      return_tokens = return_tokens
    )

    if (is.null(aggregation_from_tokens_to_texts)) {
      # Sort output
      selected_layers_aggregated_tibble[[variable_list_i]] <- selected_layers_aggregated
    }

    # Aggregate across tokens
    if (!is.null(aggregation_from_tokens_to_texts)) {
      selected_layers_tokens_aggregated <- lapply(selected_layers_aggregated,
        textEmbeddingAggregation,
        aggregation = aggregation_from_tokens_to_texts
      )
      # Sort output
      selected_layers_aggregated_tibble[[variable_list_i]] <- dplyr::bind_rows(selected_layers_tokens_aggregated)
    }


    # Add informative comments
    original_comment <- comment(word_embeddings_layers)
    layers_string <- paste(as.character(layers), sep = " ", collapse = " ")
    comment(selected_layers_aggregated_tibble[[variable_list_i]]) <- paste(original_comment,
      "textEmbedLayerAggregation: layers = ",
      layers_string,
      "aggregation_from_layers_to_tokens = ",
      aggregation_from_layers_to_tokens,
      "aggregation_from_tokens_to_texts = ",
      aggregation_from_tokens_to_texts,
      "tokens_select = ",
      tokens_select,
      "tokens_deselect = ",
      tokens_deselect,
      collapse = " ; "
    )

    ## Timing
    T2_variable <- Sys.time()
    variable_time <- T2_variable - T1_variable
    variable_time <- sprintf(
      "duration: %f %s).",
      variable_time,
      units(variable_time)
    )

    version_seq <- paste(variable_list_i, "/", length(word_embeddings_layers), sep = "")

    loop_text <- paste("Completed layers aggregation", " (variable ",
      version_seq, ", ",
      variable_time,
      "\n",
      sep = ""
    )

    cat(colourise(loop_text, "blue"))
  }

  names(selected_layers_aggregated_tibble) <- names(word_embeddings_layers)
  selected_layers_aggregated_tibble
}
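
# Illustrative usage sketch (kept as a comment; running it requires the Python environment and
# downloads a model): chaining textEmbedRawLayers() and textEmbedLayerAggregation().
#   raw_layers <- textEmbedRawLayers(
#     Language_based_assessment_data_8[1:2, "harmonywords"],
#     layers = 11:12
#   )
#   text_embeddings <- textEmbedLayerAggregation(
#     raw_layers$context_tokens,
#     layers = 11:12,
#     aggregation_from_layers_to_tokens = "concatenate",
#     aggregation_from_tokens_to_texts = "mean"
#   )
#   text_embeddings$harmonywords   # one row per text, with columns Dim1, Dim2, ...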




#' Extract layers and aggregate them to word embeddings, for all character variables in a given dataframe.
#' @param texts A character variable or a tibble/dataframe with at least one character variable.
#' @param model Character string specifying pre-trained language model (default 'bert-base-uncased').
#'  For full list of options see pretrained models at
#'  \href{https://huggingface.co/transformers/pretrained_models.html}{HuggingFace}.
#'  For example use "bert-base-multilingual-cased", "openai-gpt",
#' "gpt2", "ctrl", "transfo-xl-wt103", "xlnet-base-cased", "xlm-mlm-enfr-1024", "distilbert-base-cased",
#' "roberta-base", or "xlm-roberta-base". Only load models that you trust from HuggingFace; loading a
#'  malicious model can execute arbitrary code on your computer).
#' @param layers (string or numeric) Specify the layers that should be extracted
#' (default -2, which gives the second-to-last layer). It is more efficient to only extract the layers
#' that you need (e.g., 11). You can also extract several (e.g., 11:12), or all by setting this parameter
#' to "all". Layer 0 is the decontextualized input layer (i.e., not comprising hidden states) and
#'  thus should normally not be used. These layers can then be aggregated in the textEmbedLayerAggregation
#'  function.
#' @param dim_name Boolean, if TRUE the text variable name is appended to each dimension name in the output
#' (this differentiates word embedding dimension names; e.g., Dim1_text_variable_name).
#' See \code{\link{textDimName}} to change names back and forth.
#' @param aggregation_from_layers_to_tokens (string) Aggregated layers of each token. Method to aggregate the
#' contextualized layers (e.g., "mean", "min" or "max", which take the mean, minimum or maximum, respectively,
#' across each column; or "concatenate", which links together each word embedding layer to one long row).
#' @param aggregation_from_tokens_to_texts (string)  Aggregates to the individual text (i.e., the aggregation of
#' all tokens/words given to the transformer).
#' @param aggregation_from_tokens_to_word_types (string) Aggregates to the word type (i.e., the individual words)
#'  rather than texts.
#' @param keep_token_embeddings (boolean) Whether to also keep token embeddings when using texts or word
#' types aggregation.
#' @param tokens_select Option to select word embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param tokens_deselect Option to deselect embeddings linked to specific tokens
#' such as [CLS] and [SEP] for the context embeddings.
#' @param decontextualize (boolean) Provide word embeddings of single words as input to the model
#' (these embeddings are, e.g., used for plotting; default is FALSE).
#' @param model_max_length The maximum length (in number of tokens) for the inputs to the transformer model
#' (default the value stored for the associated model).
#' @param max_token_to_sentence (numeric) Maximum number of tokens in a string to handle before
#' switching to embedding text sentence by sentence.
#' @param tokenizer_parallelism (boolean) If TRUE this will turn on tokenizer parallelism. Default FALSE.
#' @param device Name of device to use: 'cpu', 'gpu', 'gpu:k' or 'mps'/'mps:k' for MacOS, where k is a
#' specific device number such as 'mps:1'.
#' @param logging_level Set the logging level. Default: "error".
#' Options (ordered from less logging to more logging): critical, error, warning, info, debug
#' @param ... settings from textEmbedRawLayers().
#' @return A list with, depending on the settings, token-level embeddings ($tokens), text-level
#' embeddings ($texts), and word-type embeddings ($word_types).
#' Note that layer 0 is the input embedding to the transformer.
#' @examples
#' \donttest{
#' # word_embeddings <- textEmbed(Language_based_assessment_data_8[1:2, 1:2],
#' #                             layers = 10:11,
#' #                             aggregation_from_layers_to_tokens = "concatenate",
#' #                             aggregation_from_tokens_to_texts = "mean",
#' #                             aggregation_from_tokens_to_word_types = "mean")
#' ## Show information about how the embeddings were constructed
#' # comment(word_embeddings$texts$satisfactiontexts)
#' # comment(word_embeddings$word_types)
#' # comment(word_embeddings$tokens$satisfactiontexts)
#' }
#' @seealso see \code{\link{textEmbedLayerAggregation}}, \code{\link{textEmbedRawLayers}} and
#' \code{\link{textDimName}}
#' @importFrom reticulate source_python
#' @export
textEmbed <- function(texts,
                      model = "bert-base-uncased",
                      layers = -2,
                      dim_name = TRUE,
                      aggregation_from_layers_to_tokens = "concatenate",
                      aggregation_from_tokens_to_texts = "mean",
                      aggregation_from_tokens_to_word_types = NULL,
                      keep_token_embeddings = TRUE,
                      tokens_select = NULL,
                      tokens_deselect = NULL,
                      decontextualize = FALSE,
                      model_max_length = NULL,
                      max_token_to_sentence = 4,
                      tokenizer_parallelism = FALSE,
                      device = "cpu",
                      logging_level = "error",
                      ...) {


  T1_textEmbed <- Sys.time()

  reticulate::source_python(system.file("python",
                                        "huggingface_Interface3.py",
                                        package = "text",
                                        mustWork = TRUE
  ))

  if (
    (decontextualize == TRUE & is.null(aggregation_from_tokens_to_texts)) |
      (decontextualize == TRUE & is.null(aggregation_from_tokens_to_word_types)) |
      (decontextualize == TRUE & is.null(aggregation_from_layers_to_tokens))) {
    stop(cat(
      colourise("When using decontextualize = TRUE, it is required to set aggregation_from_layers_to_tokens,
                aggregation_from_tokens_to_texts, as well as aggregation_from_tokens_to_word_types.",
                fg = "red"),
      colourise("This is because both the token embeddings and the text embeddings are constructed from the word type embeddings.",
                fg = "green")
    ))
  }
  output <- list()

  if (layers[1] < 0) {
    n <- textModelLayers(model)
    layers <- 1 + n + layers
    layers
  }

  # Get hidden states/layers for output 1 and/or output 2, or decontextualized;
  if (!is.null(aggregation_from_layers_to_tokens) |
    !is.null(aggregation_from_tokens_to_texts) |
    decontextualize) {
    all_wanted_layers <- textEmbedRawLayers(
      texts = texts,
      model = model,
      layers = layers,
      return_tokens = TRUE,
      word_type_embeddings = TRUE,
      decontextualize = decontextualize,
      device = device,
      tokenizer_parallelism = tokenizer_parallelism,
      model_max_length = model_max_length,
      max_token_to_sentence = max_token_to_sentence,
      logging_level = logging_level,
      ...
    )
  }

  if (!decontextualize) {
    # 1. Get token-level embeddings with aggregated levels
    if (!is.null(aggregation_from_layers_to_tokens) & keep_token_embeddings) {
      token_embeddings <- textEmbedLayerAggregation(
        word_embeddings_layers = all_wanted_layers$context_tokens,
        layers = layers,
        aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
        aggregation_from_tokens_to_texts = NULL,
        return_tokens = TRUE, # need to be TRUE
        tokens_deselect = NULL
      )
      output$tokens <- token_embeddings
    }

    # 2. Get aggregated token layers; aggregation_from_tokens_to_texts = "mean"
    if (!is.null(aggregation_from_tokens_to_texts)) {
      aggregated_token_embeddings <- textEmbedLayerAggregation(
        word_embeddings_layers = all_wanted_layers$context_tokens,
        layers = layers,
        aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
        aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
        return_tokens = FALSE, # need to be FALSE
        tokens_deselect = NULL
      )
      output$texts <- aggregated_token_embeddings
    }
  }


  # 3. Aggregate Word Type (both decontextualised or not)
  if (!is.null(aggregation_from_tokens_to_word_types) | decontextualize) {
    if (!decontextualize) {
      single_context_text <- paste("Embedding single context embeddings.",
        "\n",
        sep = ""
      )
      cat(colourise(single_context_text, "purple"))

      individual_word_embeddings_layers <- all_wanted_layers$context_word_type
      individual_words <- all_wanted_layers$tokens
    }

    if (decontextualize) {
      single_context_text <- paste("Embedding decontextualised embeddings.",
        "\n",
        sep = ""
      )
      cat(colourise(single_context_text, "purple"))

      individual_word_embeddings_layers <- all_wanted_layers$decontext$word_type
      individual_words <- all_wanted_layers$decontext$single_words
    }

    individual_word_embeddings <- textEmbedLayerAggregation(
      word_embeddings_layers = individual_word_embeddings_layers,
      layers = layers,
      aggregation_from_layers_to_tokens = aggregation_from_layers_to_tokens,
      aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
      return_tokens = FALSE,
      tokens_select = tokens_select,
      tokens_deselect = tokens_deselect
    )
    individual_word_embeddings <- dplyr::bind_rows(individual_word_embeddings)
    # Combine the words for each decontextualized embedding
    individual_word_embeddings_words <- dplyr::bind_cols(
      individual_words, # all_wanted_layers$decontext$single_words,
      individual_word_embeddings
    )

    comment(individual_word_embeddings_words) <- paste(
      comment(all_wanted_layers$context_tokens),
      comment(individual_word_embeddings),
      " ; aggregation_from_tokens_to_word_types = ", aggregation_from_tokens_to_word_types,
      " ; decontextualize = ", decontextualize
    )
    output$word_types <- individual_word_embeddings_words
    cat(colourise("Done! \n", "purple"))
  }


  #### Decontextualised tokens and text embeddings (using output from 3 above)

  if (decontextualize) {
    decontext_space <- individual_word_embeddings_words

    ### 1. Get token-level embeddings with aggregated levels
    if (!is.null(aggregation_from_layers_to_tokens)) { #  & keep_token_embeddings

      # Tokenize texts
      output <- list()
      token_embeddings_list <- list()
      token_embeddings_list$tokens <- list()
      if (!tibble::is_tibble(texts)){
        texts <- tibble::as_tibble(texts)
      }

      for (i_variables in seq_len(ncol(texts))) {
        text_tokens <- lapply(texts[[i_variables]], textTokenize,
                              model = model, max_token_to_sentence = max_token_to_sentence) # , ...

        t_embeddings <- lapply(text_tokens, applysemrep_over_words, decontext_space, tolower = FALSE)

        token_embeddings <- mapply(dplyr::bind_cols, text_tokens, t_embeddings, SIMPLIFY = FALSE)

        token_embeddings_list$tokens[[i_variables]] <- token_embeddings
        names(token_embeddings_list$tokens[[i_variables]]) <- paste(names(texts)[[i_variables]],
          seq_len(length(token_embeddings)),
          sep = "_"
        )
      }
      output$tokens <- token_embeddings_list$tokens
    }

    # 2. Get aggregated token layers; aggregation_from_tokens_to_texts = "mean"
    if (!is.null(aggregation_from_tokens_to_texts)) {

      # Function to add dummy token_id and layer_number columns because textEmbedLayerAggregation
      # expects them (they are missing here because the decontextualize method is used)
      add_columns <- function(df) {
        df$token_id <- seq_len(nrow(df))
        df$layer_number <- rep(1, nrow(df))
        return(df)
      }
      token_embeddings1 <- lapply(token_embeddings, add_columns)

      text_embeddings <- textEmbedLayerAggregation(token_embeddings1,
        aggregation_from_tokens_to_texts = aggregation_from_tokens_to_texts,
        return_tokens = FALSE
      )

      output$texts <- text_embeddings
    }
  }

  T2_textEmbed <- Sys.time()
  Time_textEmbed <- T2_textEmbed - T1_textEmbed
  Time_textEmbed <- sprintf("Duration to embed text: %f %s", Time_textEmbed, units(Time_textEmbed))
  Date_textEmbed <- Sys.time()

  if (dim_name == TRUE & !is.null(aggregation_from_tokens_to_texts)) {
    output$texts <- textDimName(output$texts)
  }

  comment(output) <- paste(Time_textEmbed,
    "; Date created: ", Date_textEmbed,
    "; text_version: ", packageVersion("text"), ".",
    sep = "",
    collapse = " "
  )
  return(output)
}
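
# Illustrative usage sketch (kept as a comment; running it requires the Python environment and
# downloads a model). Mirrors the commented roxygen example above.
#   word_embeddings <- textEmbed(
#     Language_based_assessment_data_8[1:2, 1:2],
#     model = "bert-base-uncased",
#     layers = 10:11,
#     aggregation_from_tokens_to_texts = "mean",
#     aggregation_from_tokens_to_word_types = "mean"
#   )
#   names(word_embeddings)       # "tokens", "texts", "word_types"
#   word_embeddings$texts[[1]]   # one row per text; columns Dim1_<variable name>, Dim2_<variable name>, ...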


#' Change the names of the dimensions in the word embeddings.
#' @param word_embeddings List of word embeddings
#' @param dim_names (boolean) If TRUE, the word embedding name will be attached to the name of each dimension;
#' if FALSE, the attached part of the name will be removed.
#' @return Word embeddings with changed names.
#' @examples
#' \donttest{
#' # Note that dimensions are called Dim1_harmonytexts etc.
#' word_embeddings_4$texts$harmonytexts
#' # Here they are changed to just Dim
#' w_e_T <- textDimName(word_embeddings_4$texts["harmonytexts"],
#'   dim_names = FALSE
#' )
#' # Here they are changed back
#' w_e_F <- textDimName(w_e_T, dim_names = TRUE)
#' }
#' @seealso see \code{\link{textEmbed}}
#' @export
textDimName <- function(word_embeddings,
                        dim_names = TRUE) {
  tokens <- NULL
  word_type <- NULL

  x_is_tibble <- tibble::is_tibble(word_embeddings)
  if (x_is_tibble) word_embeddings <- list(word_embeddings)

  # Set aside word_type if it exists (so its dimension names are not changed)
  if (!is.null(word_embeddings$word_type)) {
    word_type <- word_embeddings$word_type
    word_embeddings$word_type <- NULL
  }

  if (!is.null(word_embeddings$tokens)) {
    tokens <- word_embeddings$tokens
    word_embeddings$tokens <- NULL
  }

  # i_row = 1 dim_name=TRUE
  if (dim_names) {
    for (i_row in seq_len(length(word_embeddings))) {
      colnames(word_embeddings[[i_row]]) <- paste0(
        names(word_embeddings[[i_row]]),
        "_",
        names(word_embeddings)[[i_row]]
      )
    }
  }

  if (!dim_names) {
    for (i_row in seq_len(length(word_embeddings))) {
      target_variables_names <- colnames(word_embeddings[[i_row]])

      # Select everything BEFORE the first _ (i.e., the Dim1, etc.)
      variable_names <- sub("\\_.*", "", target_variables_names)

      colnames(word_embeddings[[i_row]]) <- variable_names
    }
  }

  # Attach word embeddings again
  if (!is.null(word_type)) {
    word_embeddings$word_type <- word_type
  }
  # Attach word embeddings again
  if (!is.null(tokens)) {
    word_embeddings$tokens <- tokens
  }

  # Return tibble if x is a tibble (and not a list)
  if (x_is_tibble) word_embeddings <- word_embeddings[[1]]

  return(word_embeddings)
}
