Nothing
knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) evaluate = FALSE
# For text-version => 0.9.99 # Install text from CRAN install.packages("text") library(text) # Set-up en environment with text-required python packages textrpp_install() # Initialize the environment – and save the settings for next time textrpp_initialize(save_profile = TRUE) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Example text texts <- c("I am feeling relatedness with others", "That's great!") # Defaults embeddings <- textEmbed(texts) # Output embeddings$tokens # Output embeddings$texts # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Look at example data included in the text- package comprising both text and numerical variables (note that there are only 40 participants in this example). Language_based_assessment_data_8 # Transform the text/word data to word embeddings (see help(textEmbed) to see the default settings). word_embeddings <- textEmbed( Language_based_assessment_data_8, model = "bert-base-uncased", aggregation_from_layers_to_tokens = "concatenate", aggregation_from_tokens_to_texts = "mean", keep_token_embeddings = FALSE ) # See how the word embeddings are structured word_embeddings # Save the word embeddings to avoid having to embed the text again. It is good practice to save output from analyses that take a lot of time to compute, which is often the case when analyzing text data. saveRDS(word_embeddings, "word_embeddings.rds") # Get the saved word embeddings (again) word_embeddings <- readRDS("word_embeddings.rds") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Get hidden states for "I am fine" imf_embeddings_11_12 <- textEmbedRawLayers( "I am fine", layers = 11:12 ) imf_embeddings_11_12 #OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 1. Concatenate layers(results in 1,536 dimensions). textEmbedLayerAggregation( imf_embeddings_11_12$context_tokens, layers = 11:12, aggregation_from_layers_to_tokens = "concatenate", aggregation_from_tokens_to_texts = "mean" ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 2. Aggregate layers using mean (results in 768). textEmbedLayerAggregation( imf_embeddings_11_12$context_tokens, layers = 11, aggregation_from_tokens_to_texts = "mean" ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Examine the relationship between satisfactiontext and the corresponding rating scale model_satisfactiontext_swls <- textTrain( x = word_embeddings$texts$satisfactiontexts, # the predictor variables (i.e., the word embeddings) y = Language_based_assessment_data_8$swlstotal, # the criterion variable (i.e.,the rating scale score. model_description = "author(s): Kjell, Giorgi, & Schwartz; data: N=40, population = Online, Mechanical Turk; publication: title = Example for demo; description: swls = the satisfaction with life scale" ) # Examine the correlation between predicted and observed Harmony in life scale scores model_satisfactiontext_swls$results # OUTPUT: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Save the mode saveRDS( model_satisfactiontext_swls, "model_satisfactiontext_swls.rds" ) # Read the model model_satisfactiontext_swls <- readRDS( "model_satisfactiontext_swls.rds" ) # Examine the names in the object returned from training names(model_satisfactiontext_swls) #OUTPUT: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Predicting several outcomes from several word embeddings models_words_ratings <- textTrainLists( word_embeddings$texts[1:2], Language_based_assessment_data_8[5:6] ) # See results models_words_ratings$results # OUTPUT # Save model saveRDS(models_words_ratings, "models_words_ratings.rds") # Read model models_words_ratings <- readRDS( "models_words_ratings.rds" ) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Read a valence trained prediction model (download it from https://osf.io/dgczt/) valence_Warriner_L11 <- readRDS( "valence_Warriner_L11.rds" ) # Examine the model valence_Warriner_L11 # PART OF THE OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Apply the model to the satisfaction text satisfaction_text_valence <- textPredict( valence_Warriner_L11, word_embeddings$texts$satisfactiontexts, dim_names = FALSE ) # Examine the correlation between the predicted valence and the Satisfaction with life scale score psych::corr.test( satisfaction_text_valence$word_embeddings__ypred, Language_based_assessment_data_8$swlstotal ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Compute semantic similarity scores between two text columns, using the previously created word_embeddings. semantic_similarity_scores <- textSimilarity( word_embeddings$texts$harmonytexts, word_embeddings$texts$satisfactiontexts ) # Look at the first scores head(semantic_similarity_scores) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Read word norms text (later we will use these for the semantic centrality plot) word_norms <- read.csv( "Word_Norms_Mental_Health_Kjell2018_text.csv" ) # Read the word embeddings for the word norms word_norms_embeddings <- readRDS( "Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds" ) # Examine which word norms there are. names(word_norms_embeddings$texts) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Compute semantic similarity score between the harmony answers and the harmony norm # Note that the descriptive word answers are used instead of text answers to correspond with how the word norm was created. norm_similarity_scores_harmony <- textSimilarityNorm( word_embeddings$texts$harmonywords, word_norms_embeddings$texts$harmonynorm ) # Correlating the semantic measure with the corresponding rating scale psych::corr.test( norm_similarity_scores_harmony, Language_based_assessment_data_8$hilstotal ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Extract word type embeddings and text embeddings for harmony words harmony_words_embeddings <- textEmbed( texts = Language_based_assessment_data_8["harmonywords"], aggregation_from_layers_to_tokens = "concatenate", aggregation_from_tokens_to_texts = "mean", aggregation_from_tokens_to_word_types = "mean", keep_token_embeddings = FALSE ) # Pre-processing data for plotting projection_results <- textProjection( words = Language_based_assessment_data_8$harmonywords, word_embeddings = harmony_words_embeddings$texts, word_types_embeddings = harmony_words_embeddings$word_types, x = Language_based_assessment_data_8$hilstotal, y = Language_based_assessment_data_8$age ) projection_results$word_data # To avoid warnings -- and that words do not get plotted, first increase the max.overlaps for the entire session: options(ggrepel.max.overlaps = 1000) # Plot plot_projection <- textPlot( projection_results, min_freq_words_plot = 1, plot_n_word_extreme = 10, plot_n_word_frequency = 5, plot_n_words_middle = 5, y_axes = FALSE, p_alpha = 0.05, p_adjust_method = "fdr", title_top = "Harmony Words Responses (Supervised Dimension Projection)", x_axes_label = "Low to High Harmony in Life Scale Score", y_axes_label = "", bivariate_color_codes = c("#FFFFFF", "#FFFFFF", "#FFFFFF", "#E07f6a", "#EAEAEA", "#85DB8E", "#FFFFFF", "#FFFFFF", "#FFFFFF" ) ) # View plot plot_projection$final_plot # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Plot plot_projection_2D <- textPlot( projection_results, min_freq_words_plot = 1, plot_n_word_extreme = 10, plot_n_word_frequency = 5, plot_n_words_middle = 5, y_axes = TRUE, # Change to TRUE/FALSE p_alpha = 0.05, p_adjust_method = "fdr", title_top = "Harmony Words Responses (Supervised Dimension Projection)", x_axes_label = "Low vs. High Harmony in Life Scale Score", y_axes_label = "Low vs.High Age", bivariate_color_codes = c("#E07f6b", "#60A1F7", "#85DB8D", "#FF0000", "#EAEAEA", "#5dc688", "#E07f6a", "#60A1F7", "#85DB8E" ) ) # View plot plot_projection_2D$final_plot # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Computing words' centrality (semantic similarity) score to the aggregated embedding of all words centrality_results <- textCentrality( words = word_norms$satisfactionnorm, word_embeddings = word_norms_embeddings$texts$satisfactionnorm, word_types_embeddings = word_norms_embeddings$word_types ) options(ggrepel.max.overlaps = 1000) centrality_plot <- textCentralityPlot( word_data = centrality_results, min_freq_words_test = 2, plot_n_word_extreme = 10, plot_n_word_frequency = 5, plot_n_words_middle = 5, title_top = "Satisfaction with life word norm: Semantic Centrality Plot", x_axes_label = "Satisfaction with Life Semantic Centrality" ) centrality_plot$final_plot # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Supplementary # PCA results to be plotted help(textPCA) textPCA_results <- textPCA( words = Language_based_assessment_data_8$satisfactionwords, word_types_embeddings = harmony_words_embeddings$word_types ) # Plotting the PCA results plot_PCA <- textPCAPlot( word_data = textPCA_results, min_freq_words_test = 2, plot_n_word_extreme = 5, plot_n_word_frequency = 5, plot_n_words_middle = 5 ) plot_PCA$final_plot
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.