tests/testthat/test_PM_tutorial.R

library(text)
library(tibble)
library(testthat)

context("Testing PM tutorial")

test_that("PM tutorial", {
  skip_on_cran()

  # Example text
  texts <- c("I am feeling relatedness with others", "That's great!")

  # Defaults
  embeddings <- textEmbed(texts)

  # Output
  expect_equal(embeddings$tokens$texts[[1]][1][[1]][2], "i")
  expect_equal(embeddings$tokens$texts[[1]][2][[1]][2], 0.46768242, tolerance = 0.00001)
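
  # A hedged look at the returned structure (element names may vary slightly
  # across versions of the text package):
  # names(embeddings)        # e.g., "tokens" and "texts"
  # names(embeddings$texts)  # one embedding tibble per text variable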

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Transform the text/word data to word embeddings (see help(textEmbed) for the default settings).
  word_embeddings <- textEmbed(Language_based_assessment_data_8,
                               model = "bert-base-uncased",
                               aggregation_from_layers_to_tokens = "concatenate",
                               aggregation_from_tokens_to_texts = "mean",
                               keep_token_embeddings = FALSE)

  # See how the word embeddings are structured
  word_embeddings
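
  # With keep_token_embeddings = FALSE, the list should hold text-level
  # embeddings (and word-type embeddings); a hedged inspection:
  # names(word_embeddings)        # e.g., "texts" and "word_types"
  # names(word_embeddings$texts)  # one embedding tibble per text column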

  # Save the word embeddings to avoid having to embed the text again. It is good practice to save output from analyses that take a lot of time to compute, which is often the case when analyzing text data.
  # saveRDS(word_embeddings, "word_embeddings.rds")

  # Get the saved word embeddings (again)
  # word_embeddings <- readRDS("word_embeddings.rds")

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Get hidden states for "I am fine"
  imf_embeddings_11_12 <- textEmbedRawLayers("I am fine",
                                             layers = 11:12)
  imf_embeddings_11_12

  # OUTPUT
  expect_equal(imf_embeddings_11_12$context_tokens$texts[[1]][[1]][1][[1]], "[CLS]")
  expect_equal(imf_embeddings_11_12$context_tokens$texts[[1]][[4]][1], 0.2663715, tolerance = 0.00001)
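
  # A hedged sanity check on the raw-layer output (assuming a bert-base model,
  # where each token row carries 768 hidden dimensions plus a layer_number column):
  # unique(imf_embeddings_11_12$context_tokens$texts[[1]]$layer_number) # 11 and 12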


  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


  # 1. Concatenate layers (results in 1,536 dimensions).
  agg1 <- textEmbedLayerAggregation(imf_embeddings_11_12$context_tokens,
                            layers = 11:12,
                            aggregation_from_layers_to_tokens = "concatenate",
                            aggregation_from_tokens_to_texts = "mean")
  # OUTPUT
  expect_equal(agg1$texts[[1]], 0.3110354, tolerance = 0.00001)
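
  # Hedged dimensionality check: concatenating two 768-dimensional bert-base
  # layers should yield 1,536 embedding dimensions.
  # ncol(agg1$texts) # expected: 1536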

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # 2. Select layer 11 and aggregate its token embeddings using the mean (results in 768 dimensions).
  agg2 <- textEmbedLayerAggregation(imf_embeddings_11_12$context_tokens,
                            layers = 11,
                            aggregation_from_tokens_to_texts = "mean")

  # OUTPUT
  expect_equal(agg2$texts[[1]], 0.3110354, tolerance = 0.00001)
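
  # Hedged check: keeping a single 768-dimensional layer and averaging over
  # tokens should leave 768 embedding dimensions.
  # ncol(agg2$texts) # expected: 768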

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


  # Examine the relationship between the satisfaction text responses and the corresponding rating scale
#  model_satisfactiontext_swls <- textTrain(
#    x = word_embeddings$texts$satisfactiontexts, # the predictor variables (i.e., the word embeddings)
#    y = Language_based_assessment_data_8$swlstotal, # the criterion variable (i.e., the rating scale scores)
#    model_description = "author(s): Kjell, Giorgi, & Schwartz; data: N=40, population =  Online, Mechanical Turk; publication: title = Example for demo; description: swls = the satisfaction with life scale")
#
#  # Examine the correlation between predicted and observed Satisfaction with Life Scale scores
#  model_satisfactiontext_swls$results
#  expect_equal(model_satisfactiontext_swls$results[[4]][[1]], 0.5385082, tolerance = 0.00001)

  # OUTPUT:

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Save the model
  #saveRDS(model_satisfactiontext_swls, "model_satisfactiontext_swls.rds")
  # Read the model
  #model_satisfactiontext_swls <- readRDS("model_satisfactiontext_swls.rds")

  # Examine the names in the object returned from training
  #names(model_satisfactiontext_swls)

  # OUTPUT:

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


#  # Predicting several outcomes from several word embeddings
#  models_words_ratings <- textTrainLists(word_embeddings$texts[1:2],
#                                         Language_based_assessment_data_8[5:6])
#
#  # See results
#  models_words_ratings$results
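#  # Hedged: $results should contain one row per embedding/outcome pair
#  # (here 2 word embeddings x 2 rating scales = 4 models).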

  # OUTPUT


  # Save model
  ## saveRDS(models_words_ratings, "models_words_ratings.rds")
  # Read model
  ## models_words_ratings <- readRDS("models_words_ratings.rds")


  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Read a valence-trained prediction model (download it from https://osf.io/dgczt/)
  ## valence_Warriner_L11 <- readRDS("valence_Warriner_L11.rds")

  # Examine the model
  # valence_Warriner_L11

  # PART OF THE OUTPUT

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Apply the model to the satisfaction text
#   satisfaction_text_valence <- textPredict(valence_Warriner_L11,
#                                           word_embeddings$texts$satisfactiontexts,
#                                           dim_names = FALSE)
#
#  # Examine the correlation between the predicted valence and the Satisfaction with life scale score
#  psych::corr.test(satisfaction_text_valence$word_embeddings__ypred, Language_based_assessment_data_8$swlstotal)


  # OUTPUT

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #



  # Compute semantic similarity scores between two text columns, using the previously created word_embeddings.
  semantic_similarity_scores <- textSimilarity(word_embeddings$texts$harmonytexts,
                                               word_embeddings$texts$satisfactiontexts)
  # Look at the first scores
  # head(semantic_similarity_scores)
  expect_equal(semantic_similarity_scores[[1]], 0.9281080, tolerance = 0.00001)
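
  # textSimilarity() scores are cosine similarities by default, so they should
  # be bounded in [-1, 1]; a hedged sanity check:
  # expect_true(all(semantic_similarity_scores >= -1 & semantic_similarity_scores <= 1))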

  # OUTPUT
  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


  # Read word norms text (later we will use these for the semantic centrality plot)
  # word_norms <- read.csv("Word_Norms_Mental_Health_Kjell2018_text.csv")

  # Read the word embeddings for the word norms
  # word_norms_embeddings <- readRDS("Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds")

  # Examine which word norms are available.
  # names(word_norms_embeddings$texts)

  # OUTPUT
  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


  # Compute semantic similarity score between the harmony answers and the harmony norm
  # Note that the descriptive word answers are used instead of text answers to correspond with how the word norm was created.
  # norm_similarity_scores_harmony <- textSimilarityNorm(word_embeddings$texts$harmonywords,
  #                                                     word_norms_embeddings$texts$harmonynorm)

  # Correlating the semantic measure with the corresponding rating scale
  # psych::corr.test(norm_similarity_scores_harmony, Language_based_assessment_data_8$hilstotal)

  # OUTPUT
  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


  # Extract word type embeddings and text embeddings for harmony words
  harmony_words_embeddings <- textEmbed(texts = Language_based_assessment_data_8["harmonywords"],
                                        aggregation_from_layers_to_tokens = "concatenate",
                                        aggregation_from_tokens_to_texts = "mean",
                                        aggregation_from_tokens_to_word_types = "mean",
                                        keep_token_embeddings = FALSE)


  # Pre-processing data for plotting
  projection_results <- textProjection(words = Language_based_assessment_data_8$harmonywords,
                                       word_embeddings = harmony_words_embeddings$texts,
                                       word_types_embeddings = harmony_words_embeddings$word_types,
                                       x = Language_based_assessment_data_8$hilstotal,
                                       y = Language_based_assessment_data_8$age)

  projection_results$word_data
  expect_equal(projection_results$word_data[[1]][[1]], "Group1*")
  expect_equal(projection_results$word_data$dot.x[[1]], -14.87054, tolerance = 0.00001)
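
  # word_data holds one row per word, with word frequencies, the projected
  # dot-product values (e.g., dot.x tested above) and p-values; a hedged peek:
  # head(projection_results$word_data)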
  # To avoid warnings, and to prevent words from being dropped from the plot, first increase max.overlaps for the entire session:
  #options(ggrepel.max.overlaps = 1000)

  # Plot
#  plot_projection <- textPlot(projection_results,
#                              min_freq_words_plot = 1,
#                              plot_n_word_extreme = 10,
#                              plot_n_word_frequency = 5,
#                              plot_n_words_middle = 5,
#                              y_axes = FALSE,
#                              p_alpha = 0.05,
#                              p_adjust_method = "fdr",
#                              title_top = "Harmony Words Responses (Supervised Dimension Projection)",
#                              x_axes_label = "Low to High Harmony in Life Scale Score",
#                              y_axes_label = "",
#                              bivariate_color_codes = c("#FFFFFF", "#FFFFFF", "#FFFFFF",
#                                                        "#E07f6a", "#EAEAEA", "#85DB8E",
#                                                        "#FFFFFF", "#FFFFFF", "#FFFFFF"
#                              )
#  )
#  # View plot
#
#  plot_projection$final_plot
#

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Plot
#  plot_projection_2D <- textPlot(projection_results,
#                                 min_freq_words_plot = 1,
#                                 plot_n_word_extreme = 10,
#                                 plot_n_word_frequency = 5,
#                                 plot_n_words_middle = 5,
#                                 y_axes = TRUE, # Change to TRUE
#                                 p_alpha = 0.05,
#                                 p_adjust_method = "fdr",
#                                 title_top = "Harmony Words Responses (Supervised Dimension Projection)",
#                                 x_axes_label = "Low vs. High Harmony in Life Scale Score",
#                                 y_axes_label = "Low vs. High Age", # Added
#                                 bivariate_color_codes = c("#E07f6b", "#60A1F7", "#85DB8D",
#                                                           "#FF0000", "#EAEAEA", "#5dc688",
#                                                           "#E07f6a", "#60A1F7", "#85DB8E"
#                                 )
#  )
#  # View plot
#  plot_projection_2D$final_plot
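
#  # Hedged note on bivariate_color_codes: the nine codes appear to form a
#  # 3 x 3 grid of word categories (significantly negative / not significant /
#  # significantly positive on the y dimension, crossed with the same on x);
#  # with y_axes = FALSE in the previous plot, only the middle row was used.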

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


#  # Computing words' centrality (semantic similarity) score to the aggregated embedding of all words
#  centrality_results <- textCentrality(words = word_norms$satisfactionnorm,
#                                       word_embeddings = word_norms_embeddings$texts$satisfactionnorm,
#                                       word_types_embeddings = word_norms_embeddings$word_types)
# # options(ggrepel.max.overlaps = 1000)
#  centrality_plot <- textCentralityPlot(word_data = centrality_results,
#                                        min_freq_words_test = 2,
#                                        plot_n_word_extreme = 10,
#                                        plot_n_word_frequency = 5,
#                                        plot_n_words_middle = 5,
#                                        title_top = "Satisfaction with life word norm: Semantic Centrality Plot",
#                                        x_axes_label = "Satisfaction with Life Semantic Centrality")
#
#  centrality_plot$final_plot
#
  # OUTPUT

  # # # # # # # # # # # # # # # # # # # # # # # # # # # #


  # Supplementary

  # PCA results to be plotted; see help(textPCA)
#  textPCA_results <- textPCA(words = Language_based_assessment_data_8$satisfactionwords,
#                             word_types_embeddings = harmony_words_embeddings$word_types)
#
#
#  # Plotting the PCA results
#  plot_PCA <- textPCAPlot(
#    word_data = textPCA_results,
#    min_freq_words_test = 2,
#    plot_n_word_extreme = 5,
#    plot_n_word_frequency = 5,
#    plot_n_words_middle = 5
#  )
#  plot_PCA$final_plot


})
