R/4_0_textPlot.R
In text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

Documented in textPlot

#### Supervised Dimension Projection #####

#' Creates the plot object (except for the legend).
#' @return A plot object.
#' @noRd
textPlotting <- function(
    word_data_all = word_data_all,
    word_data_all_yadjusted = word_data_all_yadjusted,
    only_x_dimension = only_x_dimension,
    x_axes_1 = x_axes_1,
    y_axes_1 = y_axes_1,
    group_embeddings1 = group_embeddings1,
    group_embeddings2 = group_embeddings2,
    projection_embedding = projection_embedding,
    label = words,
    points_without_words_size = points_without_words_size,
    points_without_words_alpha = points_without_words_alpha,
    colour_categories = colour_categories,
    arrow_transparency = arrow_transparency,
    scale_x_axes_lim = scale_x_axes_lim,
    scale_y_axes_lim = scale_y_axes_lim,
    position_jitter_hight = position_jitter_hight,
    position_jitter_width = position_jitter_width,
    word_font = word_font,
    point_size = point_size,
    aggregated_embeddings_data = aggregated_embeddings_data,
    aggregated_point_size = aggregated_point_size,
    aggregated_shape = aggregated_shape,
    aggregated_color_G1 = aggregated_color_G1,
    aggregated_color_G2 = aggregated_color_G2,
    projection_color = projection_color,
    word_size_range = word_size_range,
    # titles
    title_top = title_top,
    titles_color = titles_color,
    x_axes_label = x_axes_label,
    y_axes_label = y_axes_label,
    y_axes_values = y_axes_values) {
  plot <-
    # construct ggplot; the !!sym( ) is to  turn the strings into symbols.
    ggplot2::ggplot(data = word_data_all, ggplot2::aes(!!rlang::sym(x_axes_1), !!rlang::sym(y_axes_1), label = words)) +
    ggplot2::geom_point(
      data = word_data_all,
      size = points_without_words_size,
      alpha = points_without_words_alpha,
      ggplot2::aes(color = colour_categories)
    ) +

    # ggrepel geom, make arrows transparent, color by rank, size by n
    ggrepel::geom_text_repel(
      data = word_data_all_yadjusted,
      segment.alpha = arrow_transparency,
      position = ggplot2::position_jitter(h = position_jitter_hight, w = position_jitter_width),
      ggplot2::aes(color = colour_categories, size = n, family = word_font),
    ) +
    ggplot2::scale_color_identity() +

    # Decide size and color of the points
    ggplot2::geom_point(
      data = word_data_all_yadjusted,
      size = point_size,
      ggplot2::aes(color = colour_categories)
    ) +
    {
      if (group_embeddings1 == TRUE) {
        # Aggregated point help(geom_point)
        ggplot2::geom_point(
          data = aggregated_embeddings_data[1, ],
          size = aggregated_point_size,
          shape = aggregated_shape,
          ggplot2::aes(color = aggregated_color_G1)
        )
      }
    } +

    # Aggregated point 2
    {
      if (group_embeddings2 == TRUE) {
        ggplot2::geom_point(
          data = aggregated_embeddings_data[2, ],
          size = aggregated_point_size,
          shape = aggregated_shape,
          ggplot2::aes(color = aggregated_color_G2)
        )
      }
    } +

    # Projection embedding
    {
      if (projection_embedding == TRUE) {
        ggplot2::geom_point(
          data = aggregated_embeddings_data[3, ],
          size = aggregated_point_size,
          shape = aggregated_shape,
          ggplot2::aes(color = projection_color)
        )
      }
    } +

    # set word size range and the guide
    ggplot2::scale_size_continuous(
      range = word_size_range,
      guide = ggplot2::guide_legend(
        title = "Frequency",
        title.position = "top",
        direction = "horizontal",
        label.position = "bottom",
        title.theme = ggplot2::element_text(color = titles_color)
      )
    ) +

    # Title
    ggplot2::ggtitle(paste0(title_top)) +
    ggplot2::labs(y = y_axes_label, x = x_axes_label) +

    # Help create possibility to remove y-axes numbers
    ggplot2::scale_x_continuous(limits = scale_x_axes_lim) +
    ggplot2::scale_y_continuous(limits = scale_y_axes_lim) +

    # Minimal theme, and turning off legends
    ggplot2::theme_minimal() +
    ggplot2::theme(
      legend.position = c("bottom"),
      plot.title = ggplot2::element_text(hjust = 0.5),
      legend.justification = c("right", "top"),
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      axis.text.y = y_axes_values,
      title = ggplot2::element_text(color = titles_color),
      axis.title.x = ggplot2::element_text(color = titles_color),
      axis.title.y = ggplot2::element_text(color = titles_color)
    )
  return(plot)
}


#' Creates the legend for the plot.
#' @return A legend object that can be combined with the plot object.
#' @noRd
textLegend <- function(
    bivariate_color_codes = bivariate_color_codes,
    y_axes_1 = y_axes_1,
    fill = fill,
    legend_title = legend_title,
    legend_title_size = legend_title_size,
    legend_x_axes_label = legend_x_axes_label,
    legend_y_axes_label = legend_y_axes_label,
    word_data_all = word_data_all,
    legend_number_size = legend_number_size,
    legend_number_colour = legend_number_colour,
    titles_color = titles_color) {

  bivariate_color_data <- tibble::tibble(
    "1 - 3" = "#XXXXXX", "2 - 3" = "#XXXXXX", "3 - 3" = "#XXXXXX",
    "1 - 2" = "#XXXXXX", "2 - 2" = "#XXXXXX", "3 - 2" = "#XXXXXX",
    "1 - 1" = "#XXXXXX", "2 - 1" = "#XXXXXX", "3 - 1" = "#XXXXXX"
  )
  bivariate_color_data <- rbind(bivariate_color_data, bivariate_color_codes)
  bivariate_color_data <- bivariate_color_data[-1, ]

  if (y_axes_1 == "only_x_dimension") {
    # Only select 3 colors
    bivariate_color_data <- bivariate_color_data[, c(4, 5, 6)]
    colnames(bivariate_color_data) <- c("1 - 2", "2 - 2", "3 - 2")
    bivariate_color_data
    # Remove the y axes title on the legend
    legend_y_axes_label <- " "
  }

  legend <- bivariate_color_data %>%
    tidyr::gather("group", "fill") %>%
    tidyr::separate(group, into = c("x", "y"), sep = " - ") %>%
    dplyr::mutate(
      x = as.integer(x),
      y = as.integer(y)
    ) %>%
    ggplot2::ggplot(ggplot2::aes(x, y)) +
    ggplot2::geom_tile(ggplot2::aes(fill = fill)) +
    ggplot2::ggtitle(paste0(legend_title)) +
    ggplot2::scale_fill_identity() +
    ggplot2::labs(
      x = legend_x_axes_label,
      y = legend_y_axes_label
    ) +
    ggplot2::theme_void() +
    #    ggplot2::annotate(geom="text", x=2, y=2, label="ns",
    #               color = titles_color, size=legend_number_size)+
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 1, y = 3, label = sum(word_data_all$colour_categories == bivariate_color_codes[1],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 2, y = 3, label = sum(word_data_all$colour_categories == bivariate_color_codes[2],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 3, y = 3, label = sum(word_data_all$colour_categories == bivariate_color_codes[3],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    ggplot2::annotate(
      geom = "text", x = 1, y = 2, label = sum(word_data_all$colour_categories == bivariate_color_codes[4],
        na.rm = TRUE
      ),
      color = legend_number_colour, size = legend_number_size
    ) +
    ggplot2::annotate(
      geom = "text", x = 2, y = 2, label = sum(word_data_all$colour_categories == bivariate_color_codes[5],
        na.rm = TRUE
      ),
      color = legend_number_colour, size = legend_number_size
    ) +
    ggplot2::annotate(
      geom = "text", x = 3, y = 2, label = sum(word_data_all$colour_categories == bivariate_color_codes[6],
        na.rm = TRUE
      ),
      color = legend_number_colour, size = legend_number_size
    ) +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 1, y = 1, label = sum(word_data_all$colour_categories == bivariate_color_codes[7],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 2, y = 1, label = sum(word_data_all$colour_categories == bivariate_color_codes[8],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 3, y = 1, label = sum(word_data_all$colour_categories == bivariate_color_codes[9],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    ggplot2::theme(
      plot.title = ggplot2::element_text(hjust = 0.5, size = legend_title_size + 1),
      title = ggplot2::element_text(color = titles_color),
      axis.title.x = ggplot2::element_text(color = titles_color),
      axis.title = ggplot2::element_text(size = legend_title_size),
      axis.title.y = ggplot2::element_text(angle = 90, color = titles_color)
    ) +
    ggplot2::coord_fixed()
  legend
}



#' Computes the dot product projection for added data.
#' @return Word_data_all_yadjusted with added information for the added words.
#' @noRd
textOwnWordsProjection <- function(
    word_data = word_data,
    word_data_all = word_data_all,
    word_data_all_yadjusted = word_data_all_yadjusted,
    y_axes = y_axes,
    explore_words = explore_words,
    explore_words_color = explore_words_color,
    explore_words_point = explore_words_point,
    explore_words_aggregation = explore_words_aggregation,
    space = space,
    text_plot_comment = text_plot_comment,
    scaling = scaling,
    ...
    ) {
  # For loop for different batches of added words; i_add_w=1 explore_words = "happy harmony love"
  forloops_add_w <- length(explore_words)
  added_words_information <- list()

  for (i_add_w in 1:forloops_add_w) {
    # If using a contextualized language model
    if (is.null(space) == TRUE) {
      # Creating word embeddings for the words.
      model_text <- sub(".*model: ", "", text_plot_comment)
      model_name <- sub(" ;.*", "", model_text)
      layers_text <- sub(".*layers: ", "", text_plot_comment)
      layers_number <- sub(" ;.*", "", layers_text)
      layers_number_split <- stringi::stri_split_boundaries(layers_number,
        type = "word",
        skip_word_none = TRUE,
        skip_word_number = FALSE
      )

      #
      aggregate_layers_text <- sub(".*aggregation_from_layers_to_tokens =  ", "", text_plot_comment)
      aggregate_layers_type <- sub(" aggregation_from_tokens_to_texts.*", "", aggregate_layers_text)

      aggregation_tokens_text <- sub(".*aggregation_from_tokens_to_texts =  ", "", text_plot_comment)
      aggregation_tokens_type <- sub(" tokens_select.*", "", aggregation_tokens_text)

      aggregation_word_text <- sub(".*aggregation_from_tokens_to_word_types =  ", "", text_plot_comment)
      aggregation_word_type <- sub("  ; decontextualize.*", "", aggregation_word_text)

      explore_words_embeddings <- text_embed(
        explore_words[i_add_w],
        model = model_name,
        layers = as.numeric(layers_number_split[[1]]),
        dim_name = F,
        aggregation_from_layers_to_tokens = aggregate_layers_type,
        aggregation_from_tokens_to_texts = aggregation_tokens_type,
        aggregation_from_tokens_to_word_types = aggregation_word_type
        #, ...
      )
    }
    # If using a static/decontextualized language model
    if (!is.null(space) == TRUE) {
      explore_words_embeddings <- textEmbedStatic(data.frame(explore_words[i_add_w]),
        space = space,
        aggregation_from_tokens_to_texts = explore_words_aggregation
      )
    }

    words <- tibble::as_tibble_col(explore_words_point[i_add_w])
    colnames(words) <- "words"
    n_words <- tibble::as_tibble_col(1)
    colnames(n_words) <- "n"

    # Scaling embeddings before aggregation
    if (scaling == TRUE) {
      singlewords_we_x <- dplyr::select(explore_words_embeddings$word_types, dplyr::starts_with("Dim"))

      # Applying scaling parameters to all the unique word's embeddings
      scale_center_weights <- word_data$background[[1]]$scale_centre.x %>%
        dplyr::slice(rep(1:dplyr::n(), each = nrow(singlewords_we_x)))

      scale_scale_weights <- word_data$background[[1]]$scale_scale.x %>%
        dplyr::slice(rep(1:dplyr::n(), each = nrow(singlewords_we_x)))

      singlewords_we_x_scaled <- tibble::as_tibble((singlewords_we_x - scale_center_weights) / scale_scale_weights)

      singlewords_we_x_scaled_w_n <- bind_cols(explore_words_embeddings$word_types[1:2], singlewords_we_x_scaled)

      #### Create token and aggregated word embeddings ####
      # Aggregate the words
      Aggregated_embedding_added_words <- tibble::as_tibble_row(textEmbeddingAggregation(singlewords_we_x_scaled,
        aggregation = explore_words_aggregation
      ))

      Mean1 <- dplyr::bind_cols(words, n_words, Aggregated_embedding_added_words)
      manual_words_mean1 <- bind_rows(singlewords_we_x_scaled_w_n, Mean1)
    } else {
      # Aggregate the words
      Aggregated_embedding_added_words <- tibble::as_tibble_row(textEmbeddingAggregation(
        dplyr::select(
          explore_words_embeddings$word_types[[1]],
          dplyr::starts_with("Dim")
        ),
        aggregation = explore_words_aggregation
      ))
      Mean1 <- dplyr::bind_cols(words, n_words, Aggregated_embedding_added_words)
      manual_words_mean1 <- bind_rows(explore_words_embeddings$word_types[[1]], Mean1)
    }

    #### Project embedding on the x axes ######
    projected_embedding.x <- as.vector(word_data$background[[1]]$Aggregated_word_embedding_group2.x -
                                         word_data$background[[1]]$Aggregated_word_embedding_group1.x)

    # Position words in relation to Aggregated word embedding
    # Position the embedding; i.e., taking the word embedding subtracted with aggregated word embedding
    embedding_to_anchour_with.x <- tibble::as_tibble(
      (word_data$background[[1]]$Aggregated_word_embedding_group2.x +
         word_data$background[[1]]$Aggregated_word_embedding_group1.x) / 2)
    manual_words_mean1_1.x <- dplyr::select(manual_words_mean1, dplyr::starts_with("Dim"))

    embedding_to_anchour_with.x_df <- embedding_to_anchour_with.x %>%
      dplyr::slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1_1.x)))

    words_positioned_embeddings <- tibble::as_tibble(manual_words_mean1_1.x - embedding_to_anchour_with.x_df)

    projected_embedding.x_df <- tibble::as_tibble(projected_embedding.x) %>%
      slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1)))

    # Project the embeddings using dot product.
    dot_products_observed.x <- rowSums(words_positioned_embeddings * projected_embedding.x_df)

    ### Compare observed dot-product with null
    p_values_dot_prod.x <- purrr::map(as.list(purrr::as_vector(dot_products_observed.x)), p_value_comparing_with_Null,
      word_data$background[[1]]$dot_null_distribution[[1]],
      alternative = "two_sided"
    )

    p_values_dot_prod.x <- unlist(p_values_dot_prod.x)

    #### Project embedding on the Y axes ####

    if (y_axes == TRUE) {
      projected_embedding.y <- as.vector(word_data$background[[2]]$Aggregated_word_embedding_group2.y -
                                           word_data$background[[2]]$Aggregated_word_embedding_group1.y)
      # Position words in relation to Aggregated word embedding
      # Position the embedding; i.e., taking the word embedding subtracted with aggregated word embedding
      embedding_to_anchour_with.y <- tibble::as_tibble(
        (word_data$background[[2]]$Aggregated_word_embedding_group2.y +
           word_data$background[[2]]$Aggregated_word_embedding_group1.y) / 2)
      manual_words_mean1_1.y <- dplyr::select(manual_words_mean1, dplyr::starts_with("Dim"))

      embedding_to_anchour_with.y_df <- embedding_to_anchour_with.y %>%
        dplyr::slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1_1.y)))

      words_positioned_embeddings <- tibble::as_tibble(manual_words_mean1_1.y - embedding_to_anchour_with.y_df)

      projected_embedding.y_df <- tibble::as_tibble(projected_embedding.y) %>%
        slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1)))

      # Project the embeddings using dot product.
      dot_products_observed.y <- rowSums(words_positioned_embeddings * projected_embedding.y_df)

      ### Compare observed dot-product with null
      p_values_dot_prod.y <- purrr::map(as.list(purrr::as_vector(dot_products_observed.y)), p_value_comparing_with_Null,
        word_data$background[[2]]$dot_null_distribution[[1]],
        alternative = "two_sided"
      )

      p_values_dot_prod.y <- unlist(p_values_dot_prod.y)
    }

    #### Sort out dataframe for textOwnWordsProjection ####
    explore_words_results <- manual_words_mean1[1:2]
    explore_words_results$x_plotted <- dot_products_observed.x
    explore_words_results$p_values_x <- p_values_dot_prod.x
    explore_words_results$adjusted_p_values.x <- p_values_dot_prod.x

    if (y_axes == TRUE) {
      explore_words_results$y_plotted <- dot_products_observed.y
      explore_words_results$p_values_y <- p_values_dot_prod.y
      explore_words_results$adjusted_p_values.y <- p_values_dot_prod.y
    }

    explore_words_results$colour_categories <- explore_words_color[i_add_w]
    # TODO; should not have to print extreme?
    explore_words_results$extremes_all_x <- rep(NA, nrow(explore_words_results))
    explore_words_results$n <- rep(mean(word_data_all$n), nrow(explore_words_results))
    explore_words_results$n.percent <- rep(0.5, nrow(explore_words_results))
    explore_words_results$n_g2.x <- rep(5, nrow(explore_words_results))
    explore_words_results$N_participant_responses <- rep(
      max(word_data_all$N_participant_responses),
      nrow(explore_words_results)
    )

    added_words_information[[i_add_w]] <- explore_words_results
  }
  added_words_information_unlist <- dplyr::bind_rows(added_words_information)
  word_data_all_yadjusted <- dplyr::bind_rows(word_data_all_yadjusted, added_words_information_unlist)

  return(word_data_all_yadjusted)
}


#' Computes the dot product projection for added data.
#' @return Word_data_all_yadjusted with added infomration for the added words.
#' @noRd
textOwnWordPrediction <- function(word_data = word_data,
                                  word_data_all = word_data_all,
                                  word_data_all_yadjusted = word_data_all_yadjusted,
                                  y_axes = y_axes,
                                  explore_words = explore_words,
                                  explore_words_color = explore_words_color,
                                  explore_words_point = explore_words_point,
                                  explore_words_aggregation = explore_words_aggregation,
                                  space = space,
                                  text_plot_comment = text_plot_comment,
                                  scaling = scaling) {
  # For loop for different batches of added words; i_add_w=1 explore_words = "happy harmony love"
  forloops_add_w <- length(explore_words)
  added_words_information <- list()


  for (i_add_w in 1:forloops_add_w) {
    # If using a contextualized language model
    if (is.null(space) == TRUE) {
      # Creating word embeddings for the words.
      model_text <- sub(".*model: ", "", text_plot_comment)
      model_name <- sub(" ; layer.*", "", model_text)
      layers_text <- sub(".*layers: ", "", text_plot_comment)
      layers_number <- sub(" ; word_type_embeddings.*", "", layers_text)
      layers_number_split <- stringi::stri_split_boundaries(layers_number,
        type = "word",
        skip_word_none = TRUE,
        skip_word_number = FALSE
      )

      #
      aggregate_layers_text <- sub(".*aggregation_from_layers_to_tokens =  ", "", text_plot_comment)
      aggregate_layers_type <- sub(" aggregation_from_tokens_to_texts.*", "", aggregate_layers_text)

      aggregation_tokens_text <- sub(".*aggregation_from_tokens_to_texts =  ", "", text_plot_comment)
      aggregation_tokens_type <- sub(" tokens_select.*", "", aggregation_tokens_text)

      aggregation_word_text <- sub(".*aggregation_from_tokens_to_word_types =  ", "", text_plot_comment)
      aggregation_word_type <- sub("  ; decontextualize.*", "", aggregation_word_text)

      explore_words_embeddings <- text_embed(
        explore_words[i_add_w],
        model = model_name,
        layers = as.numeric(layers_number_split[[1]]),
        aggregation_from_layers_to_tokens = aggregate_layers_type,
        aggregation_from_tokens_to_texts = aggregation_tokens_type,
        aggregation_from_tokens_to_word_types = aggregation_word_type,
        dim_name = F
      )
    }
    # If using a static/decontextualized language model
    if (!is.null(space) == TRUE) {
      explore_words_embeddings <- textEmbedStatic(data.frame(explore_words[i_add_w]),
        space = space,
        aggregation_from_tokens_to_texts = explore_words_aggregation
      )
    }

    #### Create token and aggregated word embeddings ####
    words <- tibble::as_tibble_col(explore_words_point[i_add_w])
    colnames(words) <- "words"
    n_words <- tibble::as_tibble_col(1)
    colnames(n_words) <- "n"

    # Aggregate the words
    Aggregated_embedding_added_words <- tibble::as_tibble_row(textEmbeddingAggregation(
      dplyr::select(
        explore_words_embeddings$word_types,
        dplyr::starts_with("Dim")
      ),
      aggregation = explore_words_aggregation
    ))
    Mean1 <- dplyr::bind_cols(words, n_words, Aggregated_embedding_added_words)
    manual_words_mean1 <- bind_rows(explore_words_embeddings$word_types, Mean1)


    prediction_x <- textPredict(word_data$model_x, manual_words_mean1)$.pred # , ...

    # Creating p-value column  TO DO: implement a real p-value
    p_value_x <- rep(1, length(prediction_x))

    if (y_axes == TRUE) {
      prediction_y <- textPredict(word_data$model_y, manual_words_mean1)$.pred # , ...

      # Creating p-value column  TO DO: implement a real p-value
      p_value_y <- rep(1, length(prediction_y))
    }

    #### Sort out df for textOwnWordPrediction ####
    explore_words_results <- manual_words_mean1[1:2]
    explore_words_results$x_plotted <- prediction_x
    explore_words_results$p_values_x <- p_value_x
    explore_words_results$adjusted_p_values.x <- p_value_x

    if (y_axes == TRUE) {
      explore_words_results$y_plotted <- prediction_y
      explore_words_results$p_values_y <- p_value_y
      explore_words_results$adjusted_p_values.y <- p_value_y
    }

    explore_words_results$colour_categories <- explore_words_color[i_add_w]
    explore_words_results$extremes_all_x <- rep(NA, nrow(explore_words_results))
    explore_words_results$n <- rep(mean(word_data_all$n), nrow(explore_words_results))
    explore_words_results$n.percent <- rep(0.5, nrow(explore_words_results))
    explore_words_results$n_g2.x <- rep(5, nrow(explore_words_results))

    added_words_information[[i_add_w]] <- explore_words_results
  }
  added_words_information_unlist <- dplyr::bind_rows(added_words_information)
  word_data_all_yadjusted <- dplyr::bind_rows(word_data_all_yadjusted, added_words_information_unlist)

  return(word_data_all_yadjusted)
}


#' Find out plot type to be plotted and adjust word_data columns accordingly.
#' @return word_data with generically called columns that can run in textPlot.
#' @noRd
adjust_for_plot_type <- function(
    word_data,
    y_axes,
    projection_metric = NULL
    ) {
  type_text <- sub(".*type = ", "", comment(word_data))
  type_name <- sub(" .*", "", type_text)

  # Making column names generic across different input

  if (type_name == "textWordPrediction") {
    colnames(word_data$word_data)[which(names(word_data$word_data) == "p_value_w_pred_x")] <- "p_values_x"
    colnames(word_data$word_data)[which(names(word_data$word_data) == "embedding_based_prediction_x")] <- "x_plotted"
    if (y_axes) {
      colnames(word_data$word_data)[which(names(word_data$word_data) == "p_value_w_pred_y")] <- "p_values_y"
      colnames(word_data$word_data)[which(names(word_data$word_data) == "embedding_based_prediction_y")] <- "y_plotted"
    }
  }

  if (type_name == "textProjection") {
    colnames(word_data$word_data)[which(names(word_data$word_data) == "p_values_dot.x")] <- "p_values_x"

    if (projection_metric == "dot_product"){
      colnames(word_data$word_data)[which(names(word_data$word_data) == "dot.x")] <- "x_plotted"
    } else if (projection_metric == "cohens_d"){
      colnames(word_data$word_data)[which(names(word_data$word_data) == "cohens_d.x")] <- "x_plotted"
    }

    if (y_axes) {
      colnames(word_data$word_data)[which(names(word_data$word_data) == "p_values_dot.y")] <- "p_values_y"

      if (projection_metric == "dot_product"){
        colnames(word_data$word_data)[which(names(word_data$word_data) == "dot.y")] <- "y_plotted"
      } else if (projection_metric == "cohens_d"){
        colnames(word_data$word_data)[which(names(word_data$word_data) == "cohens_d.y")] <- "y_plotted"
      }

    }
  }
  return(word_data)
}


#' Plot words
#'
#' textPlot() plots words from textProjection() or textWordPrediction().
#' @param word_data Dataframe from textProjection.
#' @param k_n_words_to_test Select the k most frequent words to significance
#' test (k = sqrt(100*N); N = number of participant responses) (default = TRUE).
#' @param min_freq_words_test Select words to significance test that have occurred
#' at least min_freq_words_test (default = 1).
#' @param min_freq_words_plot Select words to plot that has occurred at
#' least min_freq_words_plot times (default = 1).
#' @param plot_n_words_square Select number of significant words in each square
#' of the figure to plot. The significant words, in each square is selected
#' according to most frequent words (default = 3).
#' @param plot_n_words_p Number of significant words to plot on each (positive
#' and negative) side of the x-axes and y-axes, (where duplicates are removed);
#' selects first according to lowest p-value and then according to frequency (default = 5). Hence, on a two
#' dimensional plot it is possible that plot_n_words_p = 1 yield 4 words.
#' @param plot_n_word_extreme Number of words that are extreme on Supervised Dimension
#' Projection per dimension. (i.e., even if not significant; per dimension,
#' where duplicates are removed).
#' @param plot_n_word_extreme_xy Number of words that are extreme in both x and y dimensions,
#' considering overall distance from the origin in the Supervised Dimension Projection space.
#' This selects words based on their combined extremity score, calculated as
#' the Euclidean distance from (0,0). Ensures balance across all nine squares by selecting
#' at least one extreme word per square if available.
#' @param plot_n_word_frequency Number of words based on being most frequent (default = 5).
#' (i.e., even if not significant).
#' @param plot_n_words_middle Number of words plotted that are in the middle in Supervised
#' Dimension Projection score (default = 5). (i.e., even if not significant;  per dimensions,
#' where duplicates are removed).
#' @param plot_n_word_random (numeric) select random words to plot.
#' @param title_top Title (default "  ").
#' @param titles_color Color for all the titles (default: "#61605e").
# @param x_axes If TRUE, plotting on the x_axes.
#' @param y_axes (boolean) If TRUE, also plotting on the y-axes (default = FALSE, i.e, a 1-dimensional
#' plot is generated). Also plotting on y-axes produces a two dimension 2-dimensional plot, but the
#' textProjection function has to have had a variable on the y-axes.
#' @param p_alpha Alpha (default = .05).
#' @param overlapping (boolean) Allow overlapping (TRUE) or disallow (FALSE) (default = TRUE).
#' @param p_adjust_method (character) Method to adjust/correct p-values for multiple comparisons
#' (default = "none"; see also "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",  "fdr").
#' @param projection_metric (character) Metric to plot according to; "dot_product" or "cohens_d".
#' @param x_axes_label (character) Label on the x-axes (default = "Supervised Dimension Projection (SDP)").
#' @param y_axes_label (character) Label on the y-axes (default = "Supervised Dimension Projection (SDP)").
#' @param scale_x_axes_lim Manually set the length of the x-axes (default = NULL, which uses
#' ggplot2::scale_x_continuous(limits = scale_x_axes_lim); change e.g., by trying c(-5, 5)).
#' @param scale_y_axes_lim Manually set the length of the y-axes (default = NULL; which uses
#' ggplot2::scale_y_continuous(limits = scale_y_axes_lim); change e.g., by trying c(-5, 5)).
#' @param word_font Font type (default = NULL).
#' @param bivariate_color_codes (HTML color codes. Type = character) The different colors of the words.
#'  Note that, at the moment, two squares should not have the exact same colour-code because the numbers
#'  within the squares of the legend will then be aggregated (and show the same, incorrect  value).
#' (default: c("#398CF9", "#60A1F7", "#5dc688",
#' "#e07f6a", "#EAEAEA", "#40DD52",
#' "#FF0000", "#EA7467", "#85DB8E")).
#' @param word_size_range Vector with minimum and maximum font size (default: c(3, 8)).
#' @param position_jitter_hight Jitter height (default: .0).
#' @param position_jitter_width Jitter width (default: .03).
#' @param point_size Size of the points indicating the words' position (default: 0.5).
#' @param arrow_transparency Transparency of the lines between each word and point (default: 0.1).
#' @param points_without_words_size Size of the points not linked with a words
#' (default is to not show it, i.e., 0).
#' @param points_without_words_alpha Transparency of the points not linked with a words
#' (default is to not show it, i.e., 0).
#' @param legend_title Title on the color legend (default: "SDP").
#' @param legend_x_axes_label Label on the color legend (default: "x").
#' @param legend_y_axes_label Label on the color legend (default: "y").
#' @param legend_x_position Position on the x coordinates of the color legend (default: 0.02).
#' @param legend_y_position Position on the y coordinates of the color legend (default: 0.05).
#' @param legend_h_size Height of the color legend (default 0.15).
#' @param legend_w_size Width of the color legend (default 0.15).
#' @param legend_title_size Font size (default: 7).
#' @param legend_number_size Font size of the values in the legend (default: 2).
#' @param legend_number_colour (string) Colour of the numbers in the box legend.
#' @param group_embeddings1 (boolean) Shows a point representing the aggregated word embedding
#' for group 1 (default = FALSE).
#' @param group_embeddings2 (boolean) Shows a point representing the aggregated word embedding
#' for group 2 (default = FALSE).
#' @param projection_embedding (boolean) Shows a point representing the aggregated direction
#' embedding (default = FALSE).
#' @param aggregated_point_size Size of the points representing the group_embeddings1,
#' group_embeddings2 and projection_embedding (default = 0.8).
#' @param aggregated_shape Shape type of the points representing the group_embeddings1,
#' group_embeddings2 and projection_embedding (default = 8).
#' @param aggregated_color_G1 Color (default = "black").
#' @param aggregated_color_G2 Color (default = "black").
#' @param projection_color Color (default = "blue").
#' @param seed (numeric) Set different seed (default = 1005)..
#' @param explore_words Explore where specific words are positioned in the embedding space.
#' For example, c("happy content", "sad down") (default = NULL).
#' @param explore_words_color Specify the color(s) of the words being explored.
#' For example c("#ad42f5", "green") (default = "#ad42f5").
#' @param explore_words_point Specify the names of the point for the aggregated word embeddings
#' of all the explored words (default = "ALL_1").
#' @param explore_words_aggregation Specify how to aggregate the word embeddings of
#' the explored words (default = "mean").
#' @param remove_words Manually remove words from the plot (which is done just before the
#' words are plotted so that the remove_words are part of previous counts/analyses) (default = NULL).
#' @param space Provide a semantic space if using static embeddings and wanting to explore words (default = NULL).
#' @param n_contrast_group_color Set color to words that have higher frequency (N)
#' on the other opposite side of its dot product projection (default = NULL).
#' @param n_contrast_group_remove Remove words that have higher frequency (N) on the other
#' opposite side of its dot product projection (default = FALSE).
#' @param scaling Scaling word embeddings before aggregation (default = FALSE).
#' @param ... Settings for textOwnWordsProjection().
#' @return A 1- or 2-dimensional word plot, as well as tibble with processed data used
#' to plot.
#' @examples
#' # The test-data included in the package is called: DP_projections_HILS_SWLS_100
#'
#' # Supervised Dimension Projection Plot
#' plot_projection <- textPlot(
#'   word_data = DP_projections_HILS_SWLS_100,
#'   k_n_words_to_test = FALSE,
#'   min_freq_words_test = 1,
#'   plot_n_words_square = 3,
#'   plot_n_words_p = 3,
#'   plot_n_word_extreme = 1,
#'   plot_n_word_frequency = 1,
#'   plot_n_words_middle = 1,
#'   y_axes = FALSE,
#'   p_alpha = 0.05,
#'   title_top = "Supervised Dimension Projection (SDP)",
#'   x_axes_label = "Low vs. High HILS score",
#'   y_axes_label = "Low vs. High SWLS score",
#'   p_adjust_method = "bonferroni",
#'   scale_y_axes_lim = NULL
#' )
#' plot_projection
#'
#' names(DP_projections_HILS_SWLS_100)
#' @seealso See \code{\link{textProjection}}.
#' @importFrom tibble as_tibble tibble
#' @importFrom dplyr row_number slice mutate mutate_if bind_rows group_by summarize left_join %>% n
#' @importFrom tidyr gather separate
#' @importFrom ggplot2 position_jitter element_text element_blank coord_fixed theme
#' theme_void theme_minimal aes labs scale_color_identity
#' @importFrom rlang sym
#' @importFrom cowplot ggdraw draw_plot
#' @importFrom purrr as_vector
#' @importFrom stringi stri_split_boundaries
#' @export
textPlot <- function(
    word_data,
    k_n_words_to_test = FALSE,
    min_freq_words_test = 1,
    min_freq_words_plot = 1,
    plot_n_words_square = 3,
    plot_n_words_p = 5,
    plot_n_word_extreme = 5,
    plot_n_word_extreme_xy = 0,
    plot_n_word_frequency = 5,
    plot_n_words_middle = 5,
    plot_n_word_random = 0,
    titles_color = "#61605e",
    y_axes = FALSE,
    p_alpha = 0.05,
    overlapping = TRUE,
    p_adjust_method = "none",
    projection_metric = "dot_product",
    title_top = "Supervised Dimension Projection",
    x_axes_label = "Supervised Dimension Projection (SDP)",
    y_axes_label = "Supervised Dimension Projection (SDP)",
    scale_x_axes_lim = NULL,
    scale_y_axes_lim = NULL,
    word_font = NULL,
    bivariate_color_codes = c(
      "#398CF9", "#60A1F7", "#5dc688",
      "#e07f6a", "#EAEAEA", "#40DD52",
      "#FF0000", "#EA7467", "#85DB8E"
    ),
    word_size_range = c(3, 8),
    position_jitter_hight = .0,
    position_jitter_width = .03,
    point_size = 0.5,
    arrow_transparency = 0.1,
    points_without_words_size = 0.2,
    points_without_words_alpha = 0.2,
    legend_title = "SDP",
    legend_x_axes_label = "x",
    legend_y_axes_label = "y",
    legend_x_position = 0.02,
    legend_y_position = 0.02,
    legend_h_size = 0.2,
    legend_w_size = 0.2,
    legend_title_size = 7,
    legend_number_size = 2,
    legend_number_colour = "white",
    group_embeddings1 = FALSE,
    group_embeddings2 = FALSE,
    projection_embedding = FALSE,
    aggregated_point_size = 0.8,
    aggregated_shape = 8,
    aggregated_color_G1 = "black",
    aggregated_color_G2 = "black",
    projection_color = "blue",
    seed = 1005,
    explore_words = NULL,
    explore_words_color = "#ad42f5",
    explore_words_point = "ALL_1",
    explore_words_aggregation = "mean",
    remove_words = NULL,
    n_contrast_group_color = NULL,
    n_contrast_group_remove = FALSE,
    space = NULL,
    scaling = FALSE,
    ...) {

  ##### Comment to be saved ####
  text_plot_comment <- paste(
    "INFORMATION ABOUT THE PROJECTION",
    comment(word_data),
    "INFORMATION ABOUT THE PLOT",
    "word_data =", substitute(word_data),
    "k_n_words_to_test =", k_n_words_to_test,
    "min_freq_words_test =", min_freq_words_test,
    "min_freq_words_plot =", min_freq_words_plot,
    "plot_n_words_square =", plot_n_words_square,
    "plot_n_words_p =", plot_n_words_p,
    "plot_n_word_extreme =", plot_n_word_extreme,
    "plot_n_word_frequency =", plot_n_word_frequency,
    "plot_n_words_middle =", plot_n_words_middle,
    "y_axes =", y_axes,
    "p_alpha =", p_alpha,
    "overlapping", overlapping,
    "p_adjust_method = ", p_adjust_method,
    "projection_metric = ", projection_metric,
    "bivariate_color_codes =", paste(bivariate_color_codes, collapse = " "),
    "word_size_range =", paste(word_size_range, sep = "-", collapse = " - "),
    "position_jitter_hight =", position_jitter_hight,
    "position_jitter_width =", position_jitter_width,
    "point_size =", point_size,
    "arrow_transparency =", point_size,
    "points_without_words_size =", points_without_words_size,
    "points_without_words_alpha =", points_without_words_alpha,
    "legend_x_position =", legend_x_position,
    "legend_y_position =", legend_y_position,
    "legend_h_size =", legend_h_size,
    "legend_w_size =", legend_w_size,
    "legend_title_size =", legend_title_size,
    "legend_number_size =", legend_number_size,
    "legend_number_colour =", legend_number_colour
  )

  set.seed(seed)
  plot_type_text <- sub(".*type = ", "", comment(word_data))
  plot_type_name <- sub(" .*", "", plot_type_text)

  #### Sorting out axes ####
  # Renaming column names from different types of word_data
  word_data <- adjust_for_plot_type(
    word_data = word_data,
    y_axes = y_axes,
    projection_metric = projection_metric
  )

  # Set Metric to use for plotting
  x_axes_1 <- "x_plotted"
#  if(projection_metric == "cohens_d"){
#    x_axes_1 <- "cohens_d.x"
#  }

  p_values_x <- "p_values_x"

  if (y_axes == TRUE) {

    y_axes_1 <- "y_plotted"
#    if(projection_metric == "cohens_d"){
#      y_axes_1 <- "cohens_d.y"
#    }

    p_values_y <- "p_values_y"
    y_axes_values_hide <- FALSE

  } else if (y_axes == FALSE) {
    y_axes_1 <- NULL
    p_values_y <- NULL
    y_axes_values_hide <- TRUE
  }

  #### Allow overlapping ####
  if (overlapping == TRUE) {
    options(ggrepel.max.overlaps = 1000)
  }

  #### Removing words MANUALY #######

  if (!is.null(remove_words)) {
    word_data$word_data <- word_data$word_data %>% dplyr::filter(!words %in% remove_words)
  }

  #### Selecting words to plot ####
  # Computing adjusted p-values with those words selected by min_freq_words_test
  word_data_padjusted <- word_data$word_data[word_data$word_data$n >= min_freq_words_test, ]

  # Selected Aggregated points
  aggregated_embeddings_data <- word_data$word_data[word_data$word_data$n == 0, ]

  # View(word_data_padjusted) Computing adjusted p-values with those words selected by: k = sqrt(100*N)
  if (k_n_words_to_test == TRUE) {
    words_k <- sqrt(100 * word_data$word_data$N_participant_responses[1])
    word_data_padjusted <- word_data_padjusted %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:words_k)
  }
  # word_data_padjusted$p_values_x
  word_data_padjusted$adjusted_p_values.x <- stats::p.adjust(purrr::as_vector(word_data_padjusted[, "p_values_x"]),
    method = p_adjust_method
  )
  word_data1 <- dplyr::left_join(word_data$word_data, word_data_padjusted[, c("words", "adjusted_p_values.x")],
    by = "words"
  )
  # word_data$adjusted_p_values.x

  if (is.null(y_axes_1) == FALSE) {
    # Computing adjusted p-values
    word_data1_padjusted_y <- word_data1[word_data1$n >= min_freq_words_test, ]
    word_data1_padjusted_y$adjusted_p_values.y <- stats::p.adjust(
      purrr::as_vector(word_data1_padjusted_y[, "p_values_y"]),
      method = p_adjust_method
    )
    word_data1 <- dplyr::left_join(word_data1,
                                   word_data1_padjusted_y[, c("words", "adjusted_p_values.y")], by = "words")
  }

  # Select only min_freq_words_plot to plot (i.e., after correction of multiple comparison for sig. test)
  word_data1 <- word_data1[word_data1$n >= min_freq_words_plot, ]

  #  Select only words based on square-position; and then top frequency in each "square"
  # (see legend) plot_n_words_square
  if (is.null(y_axes_1) == TRUE) {
    word_data1 <- word_data1 %>%
      dplyr::mutate(square_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha ~ 1,
        x_plotted < 0 & adjusted_p_values.x > p_alpha ~ 2,
        x_plotted > 0 & adjusted_p_values.x < p_alpha ~ 3
      ))

    data_p_sq1 <- word_data1[word_data1$square_categories == 1, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)

    data_p_sq3 <- word_data1[word_data1$square_categories == 3, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)

    data_p_sq_all <- rbind(data_p_sq1, data_p_sq3) # data_p_sq2,
  }

  if (is.null(y_axes_1) == FALSE) {
    # Categorize words to apply specific color plot_n_words_square=1
    word_data1 <- word_data1 %>%
      dplyr::mutate(square_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ 1,
        adjusted_p_values.x > p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ 2,
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ 3,
        x_plotted < 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ 4,
        adjusted_p_values.x > p_alpha & adjusted_p_values.y > p_alpha ~ 5,
        x_plotted > 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ 6,
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ 7,
        adjusted_p_values.x > p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ 8,
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ 9
      ))

    data_p_sq1 <- word_data1[word_data1$square_categories == 1, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq2 <- word_data1[word_data1$square_categories == 2, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq3 <- word_data1[word_data1$square_categories == 3, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq4 <- word_data1[word_data1$square_categories == 4, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    #  data_p_sq5
    data_p_sq6 <- word_data1[word_data1$square_categories == 6, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq7 <- word_data1[word_data1$square_categories == 7, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq8 <- word_data1[word_data1$square_categories == 8, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq9 <- word_data1[word_data1$square_categories == 9, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)

    data_p_sq_all <- rbind(
      data_p_sq1, data_p_sq2, data_p_sq3,
      data_p_sq4, data_p_sq6, # data_p_sq5,
      data_p_sq7, data_p_sq8, data_p_sq9
    )
  }


  # Select only words below alpha; and then top x_plotted
  data_p_x_neg <- word_data1 %>%
    dplyr::filter(adjusted_p_values.x < p_alpha) %>%
    dplyr::arrange(x_plotted) %>%
    dplyr::slice(0:plot_n_words_p)

  data_p_x_pos <- word_data1 %>%
    dplyr::filter(adjusted_p_values.x < p_alpha) %>%
    dplyr::arrange(-x_plotted) %>%
    dplyr::slice(0:plot_n_words_p)

  # Select plot_n_word_extreme and Select plot_n_word_frequency
  word_data1_extrem_max_x <- word_data1 %>%
    dplyr::arrange(-x_plotted) %>%
    dplyr::slice(0:plot_n_word_extreme)

  word_data1_extrem_min_x <- word_data1 %>%
    dplyr::arrange(x_plotted) %>%
    dplyr::slice(0:plot_n_word_extreme)

  word_data1_frequency_x <- word_data1 %>%
    dplyr::arrange(-n) %>%
    dplyr::slice(0:plot_n_word_frequency)

  # 2025 7 Mars Add random set of words plot_n_word_random = 10
  # Identify all selected words
  selected_words_x <- dplyr::bind_rows(
    data_p_x_neg, data_p_x_pos,
    word_data1_extrem_max_x, word_data1_extrem_min_x,
    word_data1_frequency_x
  ) %>%
    dplyr::distinct(words)

  # Exclude selected words and sample random words from the remaining
  word_data1_random_x <- word_data1 %>%
    dplyr::anti_join(selected_words_x, by = "words") %>%  # Exclude words already selected
    dplyr::sample_n(min(plot_n_word_random, n()))  # Ensure it does not exceed available words

  # Select the middle range, order according to frequency and then select the plot_n_words_middle = 5
  mean_m_sd_x <- mean(word_data1$x_plotted, na.rm = TRUE) - (sd(word_data1$x_plotted, na.rm = TRUE) / 10)
  mean_p_sd_x <- mean(word_data1$x_plotted, na.rm = TRUE) + (sd(word_data1$x_plotted, na.rm = TRUE) / 10)
  word_data1_middle_x <- word_data1 %>%
    dplyr::filter(dplyr::between(word_data1$x_plotted, mean_m_sd_x, mean_p_sd_x)) %>%
    dplyr::arrange(-n) %>%
    dplyr::slice(0:plot_n_words_middle)

  word_data1_x <- word_data1 %>%
    dplyr::left_join(data_p_sq_all %>%
                       dplyr::transmute(words, check_p_square = 1), by = "words") %>%
    dplyr::left_join(data_p_x_neg %>%
                       dplyr::transmute(words, check_p_x_neg = 1), by = "words") %>%
    dplyr::left_join(data_p_x_pos %>%
                       dplyr::transmute(words, check_p_x_pos = 1), by = "words") %>%
    dplyr::left_join(word_data1_extrem_max_x %>%
                       dplyr::transmute(words, check_extreme_max_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_extrem_min_x %>%
                       dplyr::transmute(words, check_extreme_min_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_frequency_x %>%
                       dplyr::transmute(words, check_extreme_frequency_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_middle_x %>%
                       dplyr::transmute(words, check_middle_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_random_x %>%
                       dplyr::transmute(words, check_random_x = 1), by = "words") %>%  # Adding the random selection
    dplyr::mutate(extremes_all_x = rowSums(cbind(
      check_p_square, check_p_x_neg, check_p_x_pos, check_extreme_max_x, check_extreme_min_x,
      check_extreme_frequency_x, check_middle_x, check_random_x
    ), na.rm = TRUE))


  ###### Sort words for y-axes.
  if (is.null(y_axes_1) == FALSE) {
    # Computing adjusted p-values
    # Select only words below alpha; and then top x_plotted
    data_p_y_neg <- word_data1 %>%
      dplyr::filter(adjusted_p_values.y < p_alpha) %>%
      dplyr::arrange(y_plotted) %>%
      dplyr::slice(0:plot_n_words_p)

    data_p_y_pos <- word_data1 %>%
      dplyr::filter(adjusted_p_values.y < p_alpha) %>%
      dplyr::arrange(-y_plotted) %>%
      dplyr::slice(0:plot_n_words_p)

    # Select plot_n_word_extreme and Select plot_n_word_frequency
    word_data1_extrem_max_y <- word_data1 %>%
      dplyr::arrange(-y_plotted) %>%
      dplyr::slice(0:plot_n_word_extreme)

    word_data1_extrem_min_y <- word_data1 %>%
      dplyr::arrange(y_plotted) %>%
      dplyr::slice(0:plot_n_word_extreme)



    ##### Extreme x and y #######
#    if (plot_n_word_extreme_xy > 0) {
      # Compute combined extremity score
      word_data1 <- word_data1 %>%
        dplyr::mutate(extremity_score = sqrt(x_plotted^2 + y_plotted^2))


    # Ensure selection from all nine squares (if available)
    word_data1_extreme_xy <- word_data1 %>%
      dplyr::group_by(square_categories) %>%
      dplyr::arrange(dplyr::if_else(square_categories == 5, extremity_score, -extremity_score)) %>%  # Flip sorting for category 5
      dplyr::slice_head(n = plot_n_word_extreme_xy) %>%  # Ensure spread across squares
      dplyr::ungroup()

      # Combine extreme selections
      word_data1_extreme_xy <- dplyr::bind_rows(word_data1_extreme_xy) %>%
        dplyr::distinct(words, .keep_all = TRUE)  # Remove duplicates
#    }

    ################
    word_data1_frequency_y <- word_data1 %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_word_frequency)

    # 2025 7 Mars Add random set of words plot_n_word_random = 10
    # Identify all selected words
    selected_words <- dplyr::bind_rows(
      word_data1_x,
      data_p_y_neg, data_p_y_pos,
      word_data1_extrem_max_y, word_data1_extrem_min_y,
      word_data1_frequency_y
    ) %>%
      dplyr::distinct(words)

    # Exclude selected words and sample random words from the remaining
    word_data1_random_y <- word_data1 %>%
      dplyr::anti_join(selected_words, by = "words") %>%  # Exclude words already selected
      dplyr::sample_n(min(plot_n_word_random, n()))  # Ensure it does not exceed available words

    # Select the middle range, order according to frequency and then select the plot_n_words_middle =5
    mean_m_sd_y <- mean(word_data1$y_plotted, na.rm = TRUE) - (sd(word_data1$y_plotted, na.rm = TRUE) / 10)
    mean_p_sd_y <- mean(word_data1$y_plotted, na.rm = TRUE) + (sd(word_data1$y_plotted, na.rm = TRUE) / 10)
    word_data1_middle_y <- word_data1 %>%
      dplyr::filter(dplyr::between(word_data1$y_plotted, mean_m_sd_y, mean_p_sd_y)) %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_middle) # TODO selecting on frequency again. perhaps point to have exact middle?

    word_data_all <- word_data1_x %>%
      dplyr::left_join(data_p_y_pos %>%
                         dplyr::transmute(words, check_p_y_pos = 1), by = "words") %>%
      dplyr::left_join(data_p_y_neg %>%
                         dplyr::transmute(words, check_p_y_neg = 1), by = "words") %>%
      dplyr::left_join(word_data1_extrem_max_y %>%
                         dplyr::transmute(words, check_extreme_max_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_extrem_min_y %>%
                         dplyr::transmute(words, check_extreme_min_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_extreme_xy %>%
                         dplyr::transmute(words, check_extreme_xy = 1), by = "words") %>%
      dplyr::left_join(word_data1_frequency_y %>%
                         dplyr::transmute(words, check_extreme_frequency_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_middle_y %>%
                         dplyr::transmute(words, check_middle_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_random_y %>%
                         dplyr::transmute(words, check_random_y = 1), by = "words") %>%  # Adding the random selection
      dplyr::mutate(extremes_all_y = rowSums(cbind(
        check_p_y_neg, check_p_y_pos, check_extreme_max_y, check_extreme_min_y, check_extreme_xy,
        check_extreme_frequency_y, check_middle_y, check_random_y
      ), na.rm = TRUE)) %>%
      dplyr::mutate(extremes_all = rowSums(cbind(extremes_all_x, extremes_all_y), na.rm = TRUE))



    # Categorize words to apply specific color
    word_data_all <- word_data_all %>%
      dplyr::mutate(colour_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[1],
        adjusted_p_values.x > p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[2],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[3],
        x_plotted < 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ bivariate_color_codes[4],
        adjusted_p_values.x > p_alpha & adjusted_p_values.y > p_alpha ~ bivariate_color_codes[5],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ bivariate_color_codes[6],
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[7],
        adjusted_p_values.x > p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[8],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[9]
      ))
  }


  if (is.null(y_axes_1) == TRUE) {
    word_data_all <- word_data1_x %>%
      dplyr::mutate(colour_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha ~ bivariate_color_codes[4],
        # x_plotted < 0 & adjusted_p_values.x > p_alpha ~ bivariate_color_codes[5],
        # Some adjusted_p_values.x has NA becasue they where not tested as multiple input
        # (this is because min_frequency selects out before)
        adjusted_p_values.x > p_alpha | is.na(adjusted_p_values.x) ~ bivariate_color_codes[5],
        x_plotted > 0 & adjusted_p_values.x < p_alpha ~ bivariate_color_codes[6]
      ))
  }

  #### Colorize words that are more frequent on the opposite side of the dot product projection ####
  if (is.character(n_contrast_group_color) == TRUE) {
    # Select words with MORE words in G1 and POSITIVE dot product (i.e., remove words that are
    # more represented in the opposite group of its dot product projection)
    word_data_all$colour_categories[(abs(word_data_all$n_g1.x) > abs(word_data_all$n_g2.x) &
                                       word_data_all$x_plotted > 0)] <- n_contrast_group_color

    # Select words with MORE words in G2 and POSITIVE dot product (i.e., remove words that are more
    # represented in the opposite group of its dot product projection)
    word_data_all$colour_categories[(abs(word_data_all$n_g1.x) < abs(word_data_all$n_g2.x) &
                                       word_data_all$x_plotted < 0)] <- n_contrast_group_color
  }

  #### Remove more frequent words on the opposite side of the dot product projection ####
  if (n_contrast_group_remove == TRUE) {
    word_data_all1 <- word_data_all %>%
      # Select words with MORE words in G1 and NEGATIVE dot product (i.e., do not select words
      # that are more represented in the opposite group of its dot product projection)
      filter((abs(n_g1.x) > abs(n_g2.x) &
                x_plotted < 0))

    word_data_all2 <- word_data_all %>%
      # Select words with MORE words in G2 and POSITIVE dot product (i.e., do not select words that
      # are more represented in the opposite group of its dot product projection)
      filter((abs(n_g1.x) < abs(n_g2.x) &
                x_plotted > 0))

    word_data_all <- bind_rows(word_data_all1, word_data_all2)
  }

  #### Preparing for the plot function ####

  # This solution is because it is not possible to send "0" as a parameter
  only_x_dimension <- NULL
  if (is.null(y_axes_1) == TRUE) {
    only_x_dimension <- 0
    y_axes_1 <- "only_x_dimension"
  }

  # Add or Remove values on y-axes
  if (y_axes_values_hide) {
    y_axes_values <- ggplot2::element_blank()
  } else {
    y_axes_values <- ggplot2::element_text()
  }

  # Word data adjusted for if y_axes exists
  if (y_axes == TRUE) {
    word_data_all_yadjusted <- word_data_all[word_data_all$extremes_all_x >= 1 | word_data_all$extremes_all_y >= 1, ]
  } else if (y_axes == FALSE) {
    word_data_all_yadjusted <- word_data_all[word_data_all$extremes_all_x >= 1, ]
  }


  ##### Adding/exploring words MANUALY ###### explore_words = "happy"

  if (!is.null(explore_words) == TRUE) {
    if (plot_type_name == "textProjection") {
      word_data_all_yadjusted <- textOwnWordsProjection(
        word_data = word_data,
        word_data_all = word_data_all,
        word_data_all_yadjusted = word_data_all_yadjusted,
        y_axes = y_axes,
        explore_words = explore_words,
        explore_words_color = explore_words_color,
        explore_words_point = explore_words_point,
        explore_words_aggregation = explore_words_aggregation,
        space = space,
        text_plot_comment = text_plot_comment,
        scaling = scaling,
        ...
      )
    }

    if (plot_type_name == "textWordPrediction") {
      word_data_all_yadjusted <- textOwnWordPrediction(
        word_data = word_data,
        word_data_all = word_data_all,
        word_data_all_yadjusted = word_data_all_yadjusted,
        y_axes = y_axes,
        explore_words = explore_words,
        explore_words_color = explore_words_color,
        explore_words_point = explore_words_point,
        explore_words_aggregation = explore_words_aggregation,
        space = space,
        text_plot_comment = text_plot_comment,
        scaling = scaling
      )
    }
  }

  #### Plotting  ####
  plot <- textPlotting(
    word_data_all = word_data_all,
    word_data_all_yadjusted = word_data_all_yadjusted,
    only_x_dimension = only_x_dimension,
    x_axes_1 = x_axes_1,
    y_axes_1 = y_axes_1,
    group_embeddings1 = group_embeddings1,
    group_embeddings2 = group_embeddings2,
    projection_embedding = projection_embedding,
    label = words,
    points_without_words_size = points_without_words_size,
    points_without_words_alpha = points_without_words_alpha,
    colour_categories = colour_categories,
    arrow_transparency = arrow_transparency,
    scale_x_axes_lim = scale_x_axes_lim,
    scale_y_axes_lim = scale_y_axes_lim,
    position_jitter_hight = position_jitter_hight,
    position_jitter_width = position_jitter_width,
    word_font = word_font,
    point_size = point_size,
    aggregated_embeddings_data = aggregated_embeddings_data,
    aggregated_point_size = aggregated_point_size,
    aggregated_shape = aggregated_shape,
    aggregated_color_G1 = aggregated_color_G1,
    aggregated_color_G2 = aggregated_color_G2,
    projection_color = projection_color,
    word_size_range = word_size_range,
    # titles
    title_top = title_top,
    titles_color = titles_color,
    x_axes_label = x_axes_label,
    y_axes_label = y_axes_label,
    y_axes_values = y_axes_values
  )

  #### Reset overlapping ####
  options(ggrepel.max.overlaps = 10)

  # plot

  #### Creating the legend ####

  legend <- textLegend(
    bivariate_color_codes = bivariate_color_codes,
    y_axes_1 = y_axes_1,
    fill = fill,
    legend_title = legend_title,
    legend_title_size = legend_title_size,
    legend_x_axes_label = legend_x_axes_label,
    legend_y_axes_label = legend_y_axes_label,
    word_data_all = word_data_all,
    legend_number_size = legend_number_size,
    legend_number_colour = legend_number_colour,
    titles_color = titles_color
  )
  # legend

  #### Plot both figure and legend help(null_dev_env) ####
  final_plot <- suppressWarnings(cowplot::ggdraw() +
                                   cowplot::draw_plot(plot, 0, 0, 1, 1) +
                                   cowplot::draw_plot(legend,
                                                      legend_x_position,
                                                      legend_y_position,
                                                      legend_h_size,
                                                      legend_w_size))

  output_plot_data <- list(final_plot, text_plot_comment, word_data_all)
  names(output_plot_data) <- c("final_plot", "description", "processed_word_data")
  output_plot_data
}
Any scripts or data that you put into this service are public.
text documentation built on June 8, 2025, 1:32 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
text
Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

R/4_0_textPlot.R
In text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

Defines functions textPlot adjust_for_plot_type textOwnWordPrediction textOwnWordsProjection textLegend textPlotting

Documented in textPlot

Try the text package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

text Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

R/4_0_textPlot.R In text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

Defines functions textPlot adjust_for_plot_type textOwnWordPrediction textOwnWordsProjection textLegend textPlotting

Documented in textPlot

Try the text package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

text
Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning

R/4_0_textPlot.R
In text: Analyses of Text using Transformers Models from HuggingFace, Natural Language Processing and Machine Learning