inst/doc/huggingface_in_r.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
evaluate <- FALSE

## ----HuggingFace_table_long, echo=FALSE, results='asis'-----------------------
library(magrittr)
Models <- c("'bert-base-uncased'",
            "'roberta-base'",
            "'distilbert-base-cased'",
            "'bert-base-multilingual-cased'",
            "'xlm-roberta-large'"
            )

References <- c("[Devlin et al., 2019](https://aclanthology.org/N19-1423/)",
                "[Liu et al., 2019](https://arxiv.org/abs/1907.11692)",
                "[Sanh et al., 2019](https://arxiv.org/abs/1910.01108)",
                "[Devlin et al., 2019](https://aclanthology.org/N19-1423/)",
                "[Liu et al., 2019](https://arxiv.org/pdf/1907.11692.pdf)"
                )

Layers <- c("12",
            "12", 
            "6?",
            "12",
            "24")

Language <- c("English",
              "English", 
              "English",
              "[104 top languages at Wikipedia](https://meta.wikimedia.org/wiki/List_of_Wikipedias)",
              "[100 language](https://huggingface.co/bert-base-multilingual-cased)")

Dimensions <- c("768", 
                "768", 
                "768?", 
                "768", 
                "1024")

Tables_short <- tibble::tibble(Models, References, Layers, Dimensions, Language)

knitr::kable(Tables_short, caption = "") %>%
  kableExtra::kable_styling(bootstrap_options = "hover", full_width = TRUE)

## ----word_embedding_tutorial_1, eval = evaluate, warning=FALSE, message=FALSE----
#  library(text)
#  # Example text
#  texts <- c("I feel great")
#  
#  # Transform the text to BERT word embeddings
#  wordembeddings <- textEmbed(texts = texts,
#                              model = 'bert-base-uncased',
#                              layers = 11:12,
#                              aggregation_from_layers_to_tokens = "concatenate",
#                              aggregation_from_tokens_to_texts = "mean"
#                              )
#  
#  wordembeddings
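
## ----word_embedding_multilingual_sketch, eval = FALSE, warning=FALSE, message=FALSE----
#  # A minimal sketch of using a multilingual model from the table above: any of
#  # the listed model names can be passed to the model argument of textEmbed().
#  # The model choice 'bert-base-multilingual-cased' and the Swedish example text
#  # are illustrative assumptions.
#  library(text)
#  
#  # Example text in Swedish ("I feel good")
#  texts_multilingual <- c("Jag mår bra")
#  
#  # Transform the text to multilingual BERT word embeddings
#  wordembeddings_multilingual <- textEmbed(texts = texts_multilingual,
#                                           model = 'bert-base-multilingual-cased',
#                                           layers = 11:12,
#                                           aggregation_from_layers_to_tokens = "concatenate",
#                                           aggregation_from_tokens_to_texts = "mean"
#                                           )
#  
#  wordembeddings_multilingual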

## ----word_embedding_tutorial_2, eval = FALSE, warning=FALSE, message=FALSE----
#  library(text)
#  
#  # Transform the text data to BERT word embeddings
#  wordembeddings <- textEmbed(texts = Language_based_assessment_data_8[1:2],
#                              aggregation_from_tokens_to_word_types = "mean",
#                              keep_token_embeddings = FALSE)
#  
#  # See how word embeddings are structured
#  wordembeddings
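
## ----word_embedding_structure_sketch, eval = FALSE, warning=FALSE, message=FALSE----
#  # A minimal sketch for inspecting the returned object, assuming textEmbed()
#  # returns a named list (e.g., text-level embeddings and aggregated word-type
#  # embeddings): names() shows which elements are available, and each text
#  # variable in the input gets its own embedding tibble.
#  names(wordembeddings)
#  names(wordembeddings$texts)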

## ----word_embedding_tutorial_3, eval = evaluate, warning=FALSE, message=FALSE----
#  library(text)
#  
#  # Example text
#  texts <- c("I feel great")
#  
#  # Retrieve token-level word embeddings from layers 10 to 12
#  wordembeddings_tokens_layers <- textEmbedRawLayers(
#    texts = texts,
#    layers = 10:12)
#  wordembeddings_tokens_layers

## ----word_embedding_tutorial_4, eval = evaluate, warning=FALSE, message=FALSE----
#  library(text)
#  
#  # Concatenate layers 11 and 12, then take the mean of each dimension across tokens.
#  we_11_12_mean <- textEmbedLayerAggregation(
#    word_embeddings_layers = wordembeddings_tokens_layers$context_tokens$texts,
#    layers = 11:12,
#    aggregation_from_layers_to_tokens = "concatenate",
#    aggregation_from_tokens_to_texts = "mean")
#  we_11_12_mean
#  # Concatenate layers 10 and 11, then take the minimum of each dimension across tokens.
#  we_10_11_min <- textEmbedLayerAggregation(
#    word_embeddings_layers = wordembeddings_tokens_layers$context_tokens$texts,
#    layers = 10:11,
#    aggregation_from_layers_to_tokens = "concatenate",
#    aggregation_from_tokens_to_texts = "min")
#  we_10_11_min
#  # Select layer 11 only, and take the max of each dimension across tokens.
#  we_11_max <- textEmbedLayerAggregation(
#    word_embeddings_layers = wordembeddings_tokens_layers$context_tokens$texts,
#    layers = 11,
#    aggregation_from_tokens_to_texts = "max")
#  we_11_max
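
## ----layer_aggregation_dimension_sketch, eval = FALSE, warning=FALSE, message=FALSE----
#  # A minimal sketch checking the dimensionality of the aggregated embeddings,
#  # assuming each element of the returned list is a tibble whose columns are the
#  # embedding dimensions: concatenating two 768-dimensional layers gives
#  # 2 * 768 = 1536 dimensions, while selecting a single layer keeps 768.
#  ncol(we_11_12_mean[[1]])  # expected 1536 (layers 11 and 12 concatenated)
#  ncol(we_11_max[[1]])      # expected 768 (layer 11 only)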
