#### Semantic Centrality Plot SC ####
#' Semantic similarity score between single words' and an aggregated word embeddings
#'
#' textCentrality() computes semantic similarity score between single words' word embeddings
#' and the aggregated word embedding of all words.
#' @param words (character) Word or text variable to be plotted.
#' @param word_embeddings Word embeddings from textEmbed for the words to be plotted
#' (i.e., the aggregated word embeddings for the "words" variable).
#' @param word_types_embeddings Word embeddings from textEmbed for individual words
#' (i.e., the decontextualized word embeddings).
#' @inheritParams textSimilarity
#' @param aggregation (character) Method to aggregate the word embeddings
#' (default = "mean"; see also "min", "max" or "[CLS]").
#' @param min_freq_words_test (numeric) Option to select words that have at least occurred a specified
#' number of times (default = 0); when creating the semantic similarity
#' scores.
#' @return A dataframe with variables (e.g., including semantic similarity, frequencies)
#' for the individual words that are used as input for the plotting in the textCentralityPlot function.
#' @examples
#' # Computes the semantic similarity between the individual word embeddings (Iwe)
#' # in the "harmonywords" column of the pre-installed dataset: Language_based_assessment_data_8,
#' # and the aggregated word embedding (Awe).
#' # The Awe can be interpreted the latent meaning of the text.
#'
#' \dontrun{
#' df_for_plotting <- textCentrality(
#' words = Language_based_assessment_data_8["harmonywords"],
#' word_embeddings = word_embeddings_4$texts$harmonywords,
#' word_types_embeddings = word_embeddings_4$word_types
#' )
#'
#' # df_for_plotting contain variables (e.g., semantic similarity, frequencies) for
#' # the individual words that are used for plotting by the textCentralityPlot function.
#' }
#' @seealso See \code{\link{textCentralityPlot}} and \code{\link{textProjection}}.
#' @importFrom dplyr bind_rows
#' @importFrom tibble tibble
#' @export
textCentrality <- function(words,
word_embeddings,
word_types_embeddings = word_types_embeddings_df,
method = "cosine",
aggregation = "mean",
min_freq_words_test = 0) {
textCentrality_description <- paste("words =", substitute(words),
"word_embeddings =", comment(word_embeddings),
"word_types_embeddings =", comment(word_types_embeddings),
"method =", method,
"aggregation =", aggregation,
"min_freq_words_test =", min_freq_words_test,
collapse = " "
)
# Create Central Point by aggregating all word embeddings
Central_Point <- textEmbeddingAggregation(word_embeddings, aggregation = "mean")
# Select embeddings for unique words
# Group 1: getting unique words and their frequency min_freq_words_test=3
all_unique_freq_words <- unique_freq_words(words)
all_unique_freq_words_min_freq <- all_unique_freq_words[all_unique_freq_words$n >= min_freq_words_test, ]
# Get word embeddings for each word (applysemrep function is created in 1_1_textEmbedd).
all_single_wordembedding_a <- lapply(all_unique_freq_words_min_freq$words, applysemrep, word_types_embeddings)
all_single_wordembedding_a1 <- dplyr::bind_rows(all_single_wordembedding_a)
# Compute similarity to Central Point
Central_Point_df <- tibble::as_tibble(t(replicate(nrow(all_single_wordembedding_a1), Central_Point)))
central_semantic_similarity <- textSimilarity(all_single_wordembedding_a1, Central_Point_df, method = method)
cenrtal_sss_df <- tibble::tibble(all_unique_freq_words_min_freq[, 1:2], central_semantic_similarity)
cenrtal_sss_df$n_percent <- cenrtal_sss_df$n / sum(cenrtal_sss_df$n)
comment(cenrtal_sss_df) <- textCentrality_description
return(cenrtal_sss_df)
}
#' Plots words from textCentrality()
#'
#' textCentralityPlot() plots words according to semantic similarity to the aggregated word embedding.
#' @param word_data Tibble from the textPlot function.
#' @param min_freq_words_test Select words to significance test that have occurred
#' at least min_freq_words_test (default = 1).
#' @param plot_n_word_extreme Number of words per dimension to plot with extreme
#' Supervised Dimension Projection value (default = 10).
#' (i.e., even if not significant; duplicates are removed).
#' @param plot_n_word_frequency Number of words to plot according to their frequency (default = 10).
#' (i.e., even if not significant).
#' @param plot_n_words_middle Number of words to plot that are in the middle in Supervised Dimension
#' Projection score (default = 10). (i.e., even if not significant; duplicates are removed).
#' @param title_top Title (default: " ").
#' @param titles_color Color for all the titles (default: "#61605e").
#' @param x_axes Variable to be plotted on the x-axes (default: "central_semantic_similarity",
#' could also select "n", "n_percent").
#' @param x_axes_label Label on the x-axes (default: "Semantic Centrality").
#' @param scale_x_axes_lim Length of the x-axes (default: NULL, which uses
#' c(min(word_data$central_semantic_similarity)-0.05, max(word_data$central_semantic_similarity)+0.05);
#' change this by e.g., try c(-5, 5)).
#' @param scale_y_axes_lim Length of the y-axes (default: NULL, which uses c(-1, 1);
#' change e.g., by trying c(-5, 5)).
#' @param word_font Type of font (default: NULL).
#' @param centrality_color_codes (HTML color codes. type = character) Colors of the words selected as
#' plot_n_word_extreme (minimum values), plot_n_words_middle, plot_n_word_extreme (maximum values) and
#' plot_n_word_frequency; the default is c("#EAEAEA", "#85DB8E", "#398CF9", "#9e9d9d", respectively.
#' @param word_size_range Vector with minimum and maximum font size (default: c(3, 8)).
#' @param position_jitter_hight Jitter height (default: .0).
#' @param position_jitter_width Jitter width (default: .03).
#' @param point_size Size of the points indicating the words' position (default: 0.5).
#' @param arrow_transparency Transparency of the lines between each word and point (default: 0.1).
#' @param points_without_words_size Size of the points not linked to a word
#' (default is to not show the point; , i.e., 0).
#' @param points_without_words_alpha Transparency of the points that are not linked to a word
#' (default is to not show it; i.e., 0).
#' @param legend_title Title of the color legend (default: "SCP").
#' @param legend_x_axes_label Label on the color legend (default: "x").
#' @param legend_x_position Position on the x coordinates of the color legend (default = 0.02).
#' @param legend_y_position Position on the y coordinates of the color legend (default = 0.05).
#' @param legend_h_size Height of the color legend (default = 0.15).
#' @param legend_w_size Width of the color legend (default = 0.15).
#' @param legend_title_size Font size of the title (default = 7).
#' @param legend_number_size Font size of the values in the legend (default = 2).
#' @param seed Set different seed (default = 1007).
#' @return A 1-dimensional word plot based on similarity to the aggregated word embedding,
#' as well as tibble with processed data used to plot.
#' @seealso See \code{\link{textCentrality}} and \code{\link{textProjection}}.
#' @examples
#' # Plot a centrality plot from the dataframe df_for_plotting
#' # that is returned by the textCentrality function.
#' \dontrun{
#' textCentralityPlot(
#' df_for_plotting,
#' min_freq_words_test = 1,
#' plot_n_word_extreme = 10,
#' plot_n_word_frequency = 10,
#' plot_n_words_middle = 10,
#' titles_color = "#61605e",
#' x_axes = "central_semantic_similarity",
#' title_top = "Semantic Centrality Plot",
#' x_axes_label = "Semantic Centrality",
#' scale_x_axes_lim = NULL,
#' scale_y_axes_lim = NULL,
#' word_font = NULL,
#' centrality_color_codes = c("#EAEAEA", "#85DB8E", "#398CF9", "#9e9d9d"),
#' word_size_range = c(3, 8),
#' position_jitter_hight = 0,
#' position_jitter_width = 0.03,
#' point_size = 0.5,
#' arrow_transparency = 0.1,
#' points_without_words_size = 0.5,
#' points_without_words_alpha = 0.5,
#' legend_title = "SC",
#' legend_x_axes_label = "x",
#' legend_x_position = 0.02,
#' legend_y_position = 0.02,
#' legend_h_size = 0.2,
#' legend_w_size = 0.2,
#' legend_title_size = 7,
#' legend_number_size = 2,
#' seed = 1007
#' )
#' }
#'
#' @importFrom dplyr arrange slice filter between left_join transmute mutate case_when
#' @importFrom ggplot2 position_jitter element_text element_blank coord_fixed theme
#' theme_void theme_minimal aes labs scale_color_identity
#' @importFrom rlang sym .data
#' @export
textCentralityPlot <- function(word_data,
min_freq_words_test = 1,
plot_n_word_extreme = 10,
plot_n_word_frequency = 10,
plot_n_words_middle = 10,
titles_color = "#61605e",
x_axes = "central_semantic_similarity",
title_top = "Semantic Centrality Plot",
x_axes_label = "Semantic Centrality",
scale_x_axes_lim = NULL,
scale_y_axes_lim = NULL,
word_font = NULL,
centrality_color_codes = c("#EAEAEA", "#85DB8E", "#398CF9", "#9e9d9d"),
word_size_range = c(3, 8),
position_jitter_hight = .0,
position_jitter_width = .03,
point_size = 0.5,
arrow_transparency = 0.1,
points_without_words_size = 0.5,
points_without_words_alpha = 0.5,
legend_title = "SC",
legend_x_axes_label = "x",
legend_x_position = 0.02,
legend_y_position = 0.02,
legend_h_size = 0.2,
legend_w_size = 0.2,
legend_title_size = 7,
legend_number_size = 2,
seed = 1007) {
textCentralityPlot_comment <- paste(
"INFORMATION ABOUT THE PROJECTION",
comment(word_data),
"INFORMATION ABOUT THE PLOT",
"word_data =", substitute(word_data),
"min_freq_words_test =", min_freq_words_test,
"plot_n_word_extreme =", plot_n_word_extreme,
"plot_n_word_frequency =", plot_n_word_frequency,
"plot_n_words_middle =", plot_n_words_middle,
"centrality_color_codes =", paste(centrality_color_codes, collapse = " "),
"word_size_range =", paste(word_size_range, sep = "-", collapse = " - "),
"position_jitter_hight =", position_jitter_hight,
"position_jitter_width =", position_jitter_width,
"point_size =", point_size,
"arrow_transparency =", point_size,
"points_without_words_size =", points_without_words_size,
"points_without_words_alpha =", points_without_words_alpha,
"legend_x_position =", legend_x_position,
"legend_y_position =", legend_y_position,
"legend_h_size =", legend_h_size,
"legend_w_size =", legend_w_size,
"legend_title_size =", legend_title_size,
"legend_number_size =", legend_number_size
)
set.seed(seed)
y_axes_label <- NULL
y_axes_values <- ggplot2::element_blank()
# Selected min_freq_words_test
word_data1 <- word_data[word_data$n >= min_freq_words_test, ]
# Select plot_n_word_extreme and Select plot_n_word_frequency
word_data1_extrem_max_x <- word_data1 %>%
dplyr::arrange(-.data[[x_axes]]) %>%
dplyr::slice(0:plot_n_word_extreme)
word_data1_extrem_min_x <- word_data1 %>%
dplyr::arrange(.data[[x_axes]]) %>%
dplyr::slice(0:plot_n_word_extreme)
word_data1_frequency_x <- word_data1 %>%
dplyr::arrange(-n) %>%
dplyr::slice(0:plot_n_word_frequency)
# Select the middle range, order according to frequency and then select the plot_n_words_middle = 5
mean_m_sd_x <- mean(word_data1[[eval(x_axes)]], na.rm = TRUE) - (sd(word_data1[[eval(x_axes)]], na.rm = TRUE) / 1)
mean_p_sd_x <- mean(word_data1[[eval(x_axes)]], na.rm = TRUE) + (sd(word_data1[[eval(x_axes)]], na.rm = TRUE) / 1)
word_data1_middle_x <- word_data1 %>%
dplyr::filter(dplyr::between(
word_data1[[eval(x_axes)]],
mean_m_sd_x, mean_p_sd_x
)) %>%
dplyr::arrange(-n) %>%
dplyr::slice(0:plot_n_words_middle)
word_data1_all <- word_data1 %>%
dplyr::left_join(word_data1_extrem_max_x %>%
dplyr::transmute(words, check_extreme_max_x = 1), by = "words") %>%
dplyr::left_join(word_data1_extrem_min_x %>%
dplyr::transmute(words, check_extreme_min_x = 1), by = "words") %>%
dplyr::left_join(word_data1_frequency_x %>%
dplyr::transmute(words, check_extreme_frequency_x = 1), by = "words") %>%
dplyr::left_join(word_data1_middle_x %>%
dplyr::transmute(words, check_middle_x = 1), by = "words") %>%
dplyr::mutate(extremes_all_x = rowSums(cbind(
check_extreme_max_x, check_extreme_min_x,
check_extreme_frequency_x, check_middle_x
), na.rm = TRUE))
# Categorise words to apply specific color View(word_data1_all)
word_data1_all <- word_data1_all %>%
dplyr::mutate(colour_categories = dplyr::case_when(
check_extreme_min_x == 1 ~ centrality_color_codes[1],
check_middle_x == 1 ~ centrality_color_codes[2],
check_extreme_max_x == 1 ~ centrality_color_codes[3],
check_extreme_frequency_x == 1 ~ centrality_color_codes[4]
))
if (is.null(scale_x_axes_lim)) {
scale_x_axes_lim <- c(min(word_data1[[eval(x_axes)]], na.rm = TRUE) -
0.05, max(word_data1[[eval(x_axes)]], na.rm = TRUE) + 0.05)
}
if (is.null(scale_y_axes_lim)) {
scale_y_axes_lim <- c(-1, 1)
}
# This solution is because it is not possible to send "0" as a parameter
only_x_dimension <- 0
y_axes <- "only_x_dimension"
# Plot
plot <-
# construct ggplot; the !!sym( ) is to turn the strings into symbols.
ggplot2::ggplot(data = word_data1_all,
ggplot2::aes(!!rlang::sym(x_axes),
!!rlang::sym(y_axes),
label = words
)) +
ggplot2::geom_point(
data = word_data1_all,
size = points_without_words_size,
alpha = points_without_words_alpha,
ggplot2::aes(color = "#EAEAEA")
) +
# ggrepel geom, make arrows transparent, color by rank, size by n
ggrepel::geom_text_repel(
data = word_data1_all[word_data1_all$extremes_all_x >= 1, ],
segment.alpha = arrow_transparency,
position = ggplot2::position_jitter(h = position_jitter_hight, w = position_jitter_width),
ggplot2::aes(color = colour_categories, size = n, family = word_font),
) +
ggplot2::scale_color_identity() +
# Decide size and color of the points
ggplot2::geom_point(
data = word_data1_all[word_data1_all$extremes_all_x >= 1, ],
size = point_size,
ggplot2::aes(color = colour_categories)
) +
# set word size range and the guide
ggplot2::scale_size_continuous(
range = word_size_range,
guide = ggplot2::guide_legend(
title = "Frequency",
title.position = "top",
direction = "horizontal",
label.position = "bottom",
title.theme = ggplot2::element_text(color = titles_color)
)
) +
# Title
ggplot2::ggtitle(paste0(title_top)) +
ggplot2::labs(y = y_axes_label, x = x_axes_label) +
# Help create possibility to remove y-axes numbers
ggplot2::scale_x_continuous(limits = scale_x_axes_lim) +
ggplot2::scale_y_continuous(limits = scale_y_axes_lim) +
# Minimal theme, and turning off legends
ggplot2::theme_minimal() +
ggplot2::theme(
legend.position = c("bottom"),
plot.title = ggplot2::element_text(hjust = 0.5),
legend.justification = c("right", "top"),
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
axis.text.y = y_axes_values,
title = ggplot2::element_text(color = titles_color),
axis.title.x = ggplot2::element_text(color = titles_color),
axis.title.y = ggplot2::element_text(color = titles_color)
)
plot
final_plot <- plot
output_plot_data <- list(final_plot, textCentralityPlot_comment, word_data1_all)
names(output_plot_data) <- c("final_plot", "description", "processed_word_data")
output_plot_data
}
###### End textCentralityPlot
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.