R/4_0_textPlot.R

#### Supervised Dimension Projection #####

#' Creates the plot object (except for the legend).
#' @return A plot object.
#' @noRd
textPlotting <- function(
    word_data_all = word_data_all,
    word_data_all_yadjusted = word_data_all_yadjusted,
    only_x_dimension = only_x_dimension,
    x_axes_1 = x_axes_1,
    y_axes_1 = y_axes_1,
    group_embeddings1 = group_embeddings1,
    group_embeddings2 = group_embeddings2,
    projection_embedding = projection_embedding,
    label = words,
    points_without_words_size = points_without_words_size,
    points_without_words_alpha = points_without_words_alpha,
    colour_categories = colour_categories,
    arrow_transparency = arrow_transparency,
    scale_x_axes_lim = scale_x_axes_lim,
    scale_y_axes_lim = scale_y_axes_lim,
    position_jitter_hight = position_jitter_hight,
    position_jitter_width = position_jitter_width,
    word_font = word_font,
    point_size = point_size,
    aggregated_embeddings_data = aggregated_embeddings_data,
    aggregated_point_size = aggregated_point_size,
    aggregated_shape = aggregated_shape,
    aggregated_color_G1 = aggregated_color_G1,
    aggregated_color_G2 = aggregated_color_G2,
    projection_color = projection_color,
    word_size_range = word_size_range,
    # titles
    title_top = title_top,
    titles_color = titles_color,
    x_axes_label = x_axes_label,
    y_axes_label = y_axes_label,
    y_axes_values = y_axes_values) {
  plot <-
    # Construct the ggplot; !!rlang::sym() turns the axis-name strings into symbols.
    ggplot2::ggplot(data = word_data_all, ggplot2::aes(!!rlang::sym(x_axes_1), !!rlang::sym(y_axes_1), label = words)) +
    ggplot2::geom_point(
      data = word_data_all,
      size = points_without_words_size,
      alpha = points_without_words_alpha,
      ggplot2::aes(color = colour_categories)
    ) +

    # ggrepel geom, make arrows transparent, color by rank, size by n
    ggrepel::geom_text_repel(
      data = word_data_all_yadjusted,
      segment.alpha = arrow_transparency,
      position = ggplot2::position_jitter(h = position_jitter_hight, w = position_jitter_width),
      ggplot2::aes(color = colour_categories, size = n, family = word_font),
    ) +
    ggplot2::scale_color_identity() +

    # Decide size and color of the points
    ggplot2::geom_point(
      data = word_data_all_yadjusted,
      size = point_size,
      ggplot2::aes(color = colour_categories)
    ) +
    {
      if (group_embeddings1 == TRUE) {
        # Aggregated point for group 1
        ggplot2::geom_point(
          data = aggregated_embeddings_data[1, ],
          size = aggregated_point_size,
          shape = aggregated_shape,
          ggplot2::aes(color = aggregated_color_G1)
        )
      }
    } +

    # Aggregated point 2
    {
      if (group_embeddings2 == TRUE) {
        ggplot2::geom_point(
          data = aggregated_embeddings_data[2, ],
          size = aggregated_point_size,
          shape = aggregated_shape,
          ggplot2::aes(color = aggregated_color_G2)
        )
      }
    } +

    # Projection embedding
    {
      if (projection_embedding == TRUE) {
        ggplot2::geom_point(
          data = aggregated_embeddings_data[3, ],
          size = aggregated_point_size,
          shape = aggregated_shape,
          ggplot2::aes(color = projection_color)
        )
      }
    } +

    # set word size range and the guide
    ggplot2::scale_size_continuous(
      range = word_size_range,
      guide = ggplot2::guide_legend(
        title = "Frequency",
        title.position = "top",
        direction = "horizontal",
        label.position = "bottom",
        title.theme = ggplot2::element_text(color = titles_color)
      )
    ) +

    # Title
    ggplot2::ggtitle(paste0(title_top)) +
    ggplot2::labs(y = y_axes_label, x = x_axes_label) +

    # Set axis limits (this also enables hiding the y-axis values)
    ggplot2::scale_x_continuous(limits = scale_x_axes_lim) +
    ggplot2::scale_y_continuous(limits = scale_y_axes_lim) +

    # Minimal theme, and turning off legends
    ggplot2::theme_minimal() +
    ggplot2::theme(
      legend.position = c("bottom"),
      plot.title = ggplot2::element_text(hjust = 0.5),
      legend.justification = c("right", "top"),
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      axis.text.y = y_axes_values,
      title = ggplot2::element_text(color = titles_color),
      axis.title.x = ggplot2::element_text(color = titles_color),
      axis.title.y = ggplot2::element_text(color = titles_color)
    )
  return(plot)
}


#' Creates the legend for the plot.
#' @return A legend object that can be combined with the plot object.
#' @noRd
textLegend <- function(
    bivariate_color_codes = bivariate_color_codes,
    y_axes_1 = y_axes_1,
    fill = fill,
    legend_title = legend_title,
    legend_title_size = legend_title_size,
    legend_x_axes_label = legend_x_axes_label,
    legend_y_axes_label = legend_y_axes_label,
    word_data_all = word_data_all,
    legend_number_size = legend_number_size,
    legend_number_colour = legend_number_colour,
    titles_color = titles_color) {

  bivariate_color_data <- tibble::tibble(
    "1 - 3" = "#XXXXXX", "2 - 3" = "#XXXXXX", "3 - 3" = "#XXXXXX",
    "1 - 2" = "#XXXXXX", "2 - 2" = "#XXXXXX", "3 - 2" = "#XXXXXX",
    "1 - 1" = "#XXXXXX", "2 - 1" = "#XXXXXX", "3 - 1" = "#XXXXXX"
  )
  bivariate_color_data <- rbind(bivariate_color_data, bivariate_color_codes)
  bivariate_color_data <- bivariate_color_data[-1, ]

  if (y_axes_1 == "only_x_dimension") {
    # Only select 3 colors
    bivariate_color_data <- bivariate_color_data[, c(4, 5, 6)]
    colnames(bivariate_color_data) <- c("1 - 2", "2 - 2", "3 - 2")
    bivariate_color_data
    # Remove the y axes title on the legend
    legend_y_axes_label <- " "
  }

  legend <- bivariate_color_data %>%
    tidyr::gather("group", "fill") %>%
    tidyr::separate(group, into = c("x", "y"), sep = " - ") %>%
    dplyr::mutate(
      x = as.integer(x),
      y = as.integer(y)
    ) %>%
    ggplot2::ggplot(ggplot2::aes(x, y)) +
    ggplot2::geom_tile(ggplot2::aes(fill = fill)) +
    ggplot2::ggtitle(paste0(legend_title)) +
    ggplot2::scale_fill_identity() +
    ggplot2::labs(
      x = legend_x_axes_label,
      y = legend_y_axes_label
    ) +
    ggplot2::theme_void() +
    #    ggplot2::annotate(geom="text", x=2, y=2, label="ns",
    #               color = titles_color, size=legend_number_size)+
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 1, y = 3, label = sum(word_data_all$colour_categories == bivariate_color_codes[1],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 2, y = 3, label = sum(word_data_all$colour_categories == bivariate_color_codes[2],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 3, y = 3, label = sum(word_data_all$colour_categories == bivariate_color_codes[3],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    ggplot2::annotate(
      geom = "text", x = 1, y = 2, label = sum(word_data_all$colour_categories == bivariate_color_codes[4],
        na.rm = TRUE
      ),
      color = legend_number_colour, size = legend_number_size
    ) +
    ggplot2::annotate(
      geom = "text", x = 2, y = 2, label = sum(word_data_all$colour_categories == bivariate_color_codes[5],
        na.rm = TRUE
      ),
      color = legend_number_colour, size = legend_number_size
    ) +
    ggplot2::annotate(
      geom = "text", x = 3, y = 2, label = sum(word_data_all$colour_categories == bivariate_color_codes[6],
        na.rm = TRUE
      ),
      color = legend_number_colour, size = legend_number_size
    ) +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 1, y = 1, label = sum(word_data_all$colour_categories == bivariate_color_codes[7],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 2, y = 1, label = sum(word_data_all$colour_categories == bivariate_color_codes[8],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    {
      if (y_axes_1 != "only_x_dimension") {
        ggplot2::annotate(
          geom = "text", x = 3, y = 1, label = sum(word_data_all$colour_categories == bivariate_color_codes[9],
            na.rm = TRUE
          ),
          color = legend_number_colour, size = legend_number_size
        )
      }
    } +
    ggplot2::theme(
      plot.title = ggplot2::element_text(hjust = 0.5, size = legend_title_size + 1),
      title = ggplot2::element_text(color = titles_color),
      axis.title.x = ggplot2::element_text(color = titles_color),
      axis.title = ggplot2::element_text(size = legend_title_size),
      axis.title.y = ggplot2::element_text(angle = 90, color = titles_color)
    ) +
    ggplot2::coord_fixed()
  legend
}



#' Computes the dot product projection for added data.
#' @return Word_data_all_yadjusted with added information for the added words.
#' @noRd
textOwnWordsProjection <- function(word_data = word_data,
                                   word_data_all = word_data_all,
                                   word_data_all_yadjusted = word_data_all_yadjusted,
                                   y_axes = y_axes,
                                   explore_words = explore_words,
                                   explore_words_color = explore_words_color,
                                   explore_words_point = explore_words_point,
                                   explore_words_aggregation = explore_words_aggregation,
                                   space = space,
                                   text_plot_comment = text_plot_comment,
                                   scaling = scaling) {
  # Loop over the different batches of added words (e.g., explore_words = c("happy harmony love"))
  forloops_add_w <- length(explore_words)
  added_words_information <- list()

  for (i_add_w in 1:forloops_add_w) {
    # If using a contextualized language model
    if (is.null(space) == TRUE) {
      # Creating word embeddings for the words.
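      # Note: the sub() calls below assume that text_plot_comment contains fields of
      # roughly this form (illustrative only; the exact string is produced by the
      # embedding/projection functions):
      #   "... model: bert-base-uncased ; layers: 11 12 ; ...
      #    aggregation_from_layers_to_tokens =  concatenate
      #    aggregation_from_tokens_to_texts =  mean ..."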
      model_text <- sub(".*model: ", "", text_plot_comment)
      model_name <- sub(" ;.*", "", model_text)
      layers_text <- sub(".*layers: ", "", text_plot_comment)
      layers_number <- sub(" ;.*", "", layers_text)
      layers_number_split <- stringi::stri_split_boundaries(layers_number,
        type = "word",
        skip_word_none = TRUE,
        skip_word_number = FALSE
      )

      #
      aggregate_layers_text <- sub(".*aggregation_from_layers_to_tokens =  ", "", text_plot_comment)
      aggregate_layers_type <- sub(" aggregation_from_tokens_to_texts.*", "", aggregate_layers_text)

      aggregation_tokens_text <- sub(".*aggregation_from_tokens_to_texts =  ", "", text_plot_comment)
      aggregation_tokens_type <- sub(" tokens_select.*", "", aggregation_tokens_text)

      aggregation_word_text <- sub(".*aggregation_from_tokens_to_word_types =  ", "", text_plot_comment)
      aggregation_word_type <- sub("  ; decontextualize.*", "", aggregation_word_text)

      explore_words_embeddings <- text_embed(
        explore_words[i_add_w],
        model = model_name,
        layers = as.numeric(layers_number_split[[1]]),
        aggregation_from_layers_to_tokens = aggregate_layers_type,
        aggregation_from_tokens_to_texts = aggregation_tokens_type,
        aggregation_from_tokens_to_word_types = aggregation_word_type
      )
    }
    # If using a static/decontextualized language model
    if (!is.null(space) == TRUE) {
      explore_words_embeddings <- textEmbedStatic(data.frame(explore_words[i_add_w]),
        space = space,
        aggregation_from_tokens_to_texts = explore_words_aggregation
      )
    }

    words <- tibble::as_tibble_col(explore_words_point[i_add_w])
    colnames(words) <- "words"
    n_words <- tibble::as_tibble_col(1)
    colnames(n_words) <- "n"

    # Scaling embeddings before aggregation
    if (scaling == TRUE) {
      singlewords_we_x <- dplyr::select(explore_words_embeddings$word_types, dplyr::starts_with("Dim"))

      # Applying scaling parameters to all the unique word's embeddings
      scale_center_weights <- word_data$background[[1]]$scale_centre.x %>%
        dplyr::slice(rep(1:dplyr::n(), each = nrow(singlewords_we_x)))

      scale_scale_weights <- word_data$background[[1]]$scale_scale.x %>%
        dplyr::slice(rep(1:dplyr::n(), each = nrow(singlewords_we_x)))

      singlewords_we_x_scaled <- tibble::as_tibble((singlewords_we_x - scale_center_weights) / scale_scale_weights)

      singlewords_we_x_scaled_w_n <- bind_cols(explore_words_embeddings$word_types[1:2], singlewords_we_x_scaled)

      #### Create token and aggregated word embeddings ####
      # Aggregate the words
      Aggregated_embedding_added_words <- tibble::as_tibble_row(textEmbeddingAggregation(singlewords_we_x_scaled,
        aggregation = explore_words_aggregation
      ))

      Mean1 <- dplyr::bind_cols(words, n_words, Aggregated_embedding_added_words)
      manual_words_mean1 <- bind_rows(singlewords_we_x_scaled_w_n, Mean1)
    } else {
      # Aggregate the words
      Aggregated_embedding_added_words <- tibble::as_tibble_row(textEmbeddingAggregation(
        dplyr::select(
          explore_words_embeddings$word_types[[1]],
          dplyr::starts_with("Dim")
        ),
        aggregation = explore_words_aggregation
      ))
      Mean1 <- dplyr::bind_cols(words, n_words, Aggregated_embedding_added_words)
      manual_words_mean1 <- bind_rows(explore_words_embeddings$word_types[[1]], Mean1)
    }

    #### Project embedding on the x axes ######
    projected_embedding.x <- as.vector(word_data$background[[1]]$Aggregated_word_embedding_group2.x -
                                         word_data$background[[1]]$Aggregated_word_embedding_group1.x)

    # Position words in relation to the aggregated word embeddings
    # i.e., subtract the midpoint of the two aggregated group embeddings from each word embedding
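    # In effect, each word's x-score below is the dot product
    #   (w - (G1 + G2) / 2) . (G2 - G1),
    # where w is the word embedding and G1/G2 are the aggregated group embeddings.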
    embedding_to_anchour_with.x <- tibble::as_tibble(
      (word_data$background[[1]]$Aggregated_word_embedding_group2.x +
         word_data$background[[1]]$Aggregated_word_embedding_group1.x) / 2)
    manual_words_mean1_1.x <- dplyr::select(manual_words_mean1, dplyr::starts_with("Dim"))

    embedding_to_anchour_with.x_df <- embedding_to_anchour_with.x %>%
      dplyr::slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1_1.x)))

    words_positioned_embeddings <- tibble::as_tibble(manual_words_mean1_1.x - embedding_to_anchour_with.x_df)

    projected_embedding.x_df <- tibble::as_tibble(projected_embedding.x) %>%
      slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1)))

    # Project the embeddings using dot product.
    dot_products_observed.x <- rowSums(words_positioned_embeddings * projected_embedding.x_df)

    ### Compare observed dot-product with null
    p_values_dot_prod.x <- purrr::map(as.list(purrr::as_vector(dot_products_observed.x)), p_value_comparing_with_Null,
      word_data$background[[1]]$dot_null_distribution[[1]],
      alternative = "two_sided"
    )

    p_values_dot_prod.x <- unlist(p_values_dot_prod.x)

    #### Project embedding on the Y axes ####

    if (y_axes == TRUE) {
      projected_embedding.y <- as.vector(word_data$background[[2]]$Aggregated_word_embedding_group2.y -
                                           word_data$background[[2]]$Aggregated_word_embedding_group1.y)
      # Position words in relation to the aggregated word embeddings
      # (same procedure as for the x-axis, using the y-axis group embeddings)
      embedding_to_anchour_with.y <- tibble::as_tibble(
        (word_data$background[[2]]$Aggregated_word_embedding_group2.y +
           word_data$background[[2]]$Aggregated_word_embedding_group1.y) / 2)
      manual_words_mean1_1.y <- dplyr::select(manual_words_mean1, dplyr::starts_with("Dim"))

      embedding_to_anchour_with.y_df <- embedding_to_anchour_with.y %>%
        dplyr::slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1_1.y)))

      words_positioned_embeddings <- tibble::as_tibble(manual_words_mean1_1.y - embedding_to_anchour_with.y_df)

      projected_embedding.y_df <- tibble::as_tibble(projected_embedding.y) %>%
        slice(rep(1:dplyr::n(), each = nrow(manual_words_mean1)))

      # Project the embeddings using dot product.
      dot_products_observed.y <- rowSums(words_positioned_embeddings * projected_embedding.y_df)

      ### Compare observed dot-product with null
      p_values_dot_prod.y <- purrr::map(as.list(purrr::as_vector(dot_products_observed.y)), p_value_comparing_with_Null,
        word_data$background[[2]]$dot_null_distribution[[1]],
        alternative = "two_sided"
      )

      p_values_dot_prod.y <- unlist(p_values_dot_prod.y)
    }

    #### Sort out dataframe for textOwnWordsProjection ####
    explore_words_results <- manual_words_mean1[1:2]
    explore_words_results$x_plotted <- dot_products_observed.x
    explore_words_results$p_values_x <- p_values_dot_prod.x
    explore_words_results$adjusted_p_values.x <- p_values_dot_prod.x

    if (y_axes == TRUE) {
      explore_words_results$y_plotted <- dot_products_observed.y
      explore_words_results$p_values_y <- p_values_dot_prod.y
      explore_words_results$adjusted_p_values.y <- p_values_dot_prod.y
    }

    explore_words_results$colour_categories <- explore_words_color[i_add_w]
    # TODO; should not have to print extreme?
    explore_words_results$extremes_all_x <- rep(NA, nrow(explore_words_results))
    explore_words_results$n <- rep(mean(word_data_all$n), nrow(explore_words_results))
    explore_words_results$n.percent <- rep(0.5, nrow(explore_words_results))
    explore_words_results$n_g2.x <- rep(5, nrow(explore_words_results))
    explore_words_results$N_participant_responses <- rep(
      max(word_data_all$N_participant_responses),
      nrow(explore_words_results)
    )

    added_words_information[[i_add_w]] <- explore_words_results
  }
  added_words_information_unlist <- dplyr::bind_rows(added_words_information)
  word_data_all_yadjusted <- dplyr::bind_rows(word_data_all_yadjusted, added_words_information_unlist)

  return(word_data_all_yadjusted)
}


#' Computes the supervised model prediction for added words.
#' @return Word_data_all_yadjusted with added information for the added words.
#' @noRd
textOwnWordPrediction <- function(word_data = word_data,
                                  word_data_all = word_data_all,
                                  word_data_all_yadjusted = word_data_all_yadjusted,
                                  y_axes = y_axes,
                                  explore_words = explore_words,
                                  explore_words_color = explore_words_color,
                                  explore_words_point = explore_words_point,
                                  explore_words_aggregation = explore_words_aggregation,
                                  space = space,
                                  text_plot_comment = text_plot_comment,
                                  scaling = scaling) {
  # Loop over the different batches of added words (e.g., explore_words = c("happy harmony love"))
  forloops_add_w <- length(explore_words)
  added_words_information <- list()


  for (i_add_w in 1:forloops_add_w) {
    # If using a contextualized language model
    if (is.null(space) == TRUE) {
      # Creating word embeddings for the words.
      model_text <- sub(".*model: ", "", text_plot_comment)
      model_name <- sub(" ; layer.*", "", model_text)
      layers_text <- sub(".*layers: ", "", text_plot_comment)
      layers_number <- sub(" ; word_type_embeddings.*", "", layers_text)
      layers_number_split <- stringi::stri_split_boundaries(layers_number,
        type = "word",
        skip_word_none = TRUE,
        skip_word_number = FALSE
      )

      #
      aggregate_layers_text <- sub(".*aggregation_from_layers_to_tokens =  ", "", text_plot_comment)
      aggregate_layers_type <- sub(" aggregation_from_tokens_to_texts.*", "", aggregate_layers_text)

      aggregation_tokens_text <- sub(".*aggregation_from_tokens_to_texts =  ", "", text_plot_comment)
      aggregation_tokens_type <- sub(" tokens_select.*", "", aggregation_tokens_text)

      aggregation_word_text <- sub(".*aggregation_from_tokens_to_word_types =  ", "", text_plot_comment)
      aggregation_word_type <- sub("  ; decontextualize.*", "", aggregation_word_text)

      explore_words_embeddings <- text_embed(
        explore_words[i_add_w],
        model = model_name,
        layers = as.numeric(layers_number_split[[1]]),
        aggregation_from_layers_to_tokens = aggregate_layers_type,
        aggregation_from_tokens_to_texts = aggregation_tokens_type,
        aggregation_from_tokens_to_word_types = aggregation_word_type
      )
    }
    # If using a static/decontextualized language model
    if (!is.null(space) == TRUE) {
      explore_words_embeddings <- textEmbedStatic(data.frame(explore_words[i_add_w]),
        space = space,
        aggregation_from_tokens_to_texts = explore_words_aggregation
      )
    }

    #### Create token and aggregated word embeddings ####
    words <- tibble::as_tibble_col(explore_words_point[i_add_w])
    colnames(words) <- "words"
    n_words <- tibble::as_tibble_col(1)
    colnames(n_words) <- "n"

    # Aggregate the words
    Aggregated_embedding_added_words <- tibble::as_tibble_row(textEmbeddingAggregation(
      dplyr::select(
        explore_words_embeddings$word_types,
        dplyr::starts_with("Dim")
      ),
      aggregation = explore_words_aggregation
    ))
    Mean1 <- dplyr::bind_cols(words, n_words, Aggregated_embedding_added_words)
    manual_words_mean1 <- bind_rows(explore_words_embeddings$word_types, Mean1)


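    # Predict the x-position of each added word (and of their aggregate) with the
    # supervised model stored in word_data$model_x.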
    prediction_x <- textPredict(word_data$model_x, manual_words_mean1)$.pred # , ...

    # Creating p-value column. TODO: implement a real p-value.
    p_value_x <- rep(1, length(prediction_x))

    if (y_axes == TRUE) {
      prediction_y <- textPredict(word_data$model_y, manual_words_mean1)$.pred # , ...

      # Creating p-value column. TODO: implement a real p-value.
      p_value_y <- rep(1, length(prediction_y))
    }

    #### Sort out df for textOwnWordPrediction ####
    explore_words_results <- manual_words_mean1[1:2]
    explore_words_results$x_plotted <- prediction_x
    explore_words_results$p_values_x <- p_value_x
    explore_words_results$adjusted_p_values.x <- p_value_x

    if (y_axes == TRUE) {
      explore_words_results$y_plotted <- prediction_y
      explore_words_results$p_values_y <- p_value_y
      explore_words_results$adjusted_p_values.y <- p_value_y
    }

    explore_words_results$colour_categories <- explore_words_color[i_add_w]
    explore_words_results$extremes_all_x <- rep(NA, nrow(explore_words_results))
    explore_words_results$n <- rep(mean(word_data_all$n), nrow(explore_words_results))
    explore_words_results$n.percent <- rep(0.5, nrow(explore_words_results))
    explore_words_results$n_g2.x <- rep(5, nrow(explore_words_results))

    added_words_information[[i_add_w]] <- explore_words_results
  }
  added_words_information_unlist <- dplyr::bind_rows(added_words_information)
  word_data_all_yadjusted <- dplyr::bind_rows(word_data_all_yadjusted, added_words_information_unlist)

  return(word_data_all_yadjusted)
}


#' Determine the plot type to be plotted and adjust word_data columns accordingly.
#' @return word_data with generically called columns that can run in textPlot.
#' @noRd
adjust_for_plot_type <- function(
    word_data,
    y_axes,
    projection_metric = NULL
    ) {
  type_text <- sub(".*type = ", "", comment(word_data))
  type_name <- sub(" .*", "", type_text)
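  # comment(word_data) is assumed to contain a field such as "type = textProjection"
  # or "type = textWordPrediction" (illustrative; set by the function that created word_data).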

  # Making column names generic across the different input types

  if (type_name == "textWordPrediction") {
    colnames(word_data$word_data)[which(names(word_data$word_data) == "p_value_w_pred_x")] <- "p_values_x"
    colnames(word_data$word_data)[which(names(word_data$word_data) == "embedding_based_prediction_x")] <- "x_plotted"
    if (y_axes) {
      colnames(word_data$word_data)[which(names(word_data$word_data) == "p_value_w_pred_y")] <- "p_values_y"
      colnames(word_data$word_data)[which(names(word_data$word_data) == "embedding_based_prediction_y")] <- "y_plotted"
    }
  }

  if (type_name == "textProjection") {
    colnames(word_data$word_data)[which(names(word_data$word_data) == "p_values_dot.x")] <- "p_values_x"

    if (projection_metric == "dot_product"){
      colnames(word_data$word_data)[which(names(word_data$word_data) == "dot.x")] <- "x_plotted"
    } else if (projection_metric == "cohens_d"){
      colnames(word_data$word_data)[which(names(word_data$word_data) == "cohens_d.x")] <- "x_plotted"
    }

    if (y_axes) {
      colnames(word_data$word_data)[which(names(word_data$word_data) == "p_values_dot.y")] <- "p_values_y"

      if (projection_metric == "dot_product"){
        colnames(word_data$word_data)[which(names(word_data$word_data) == "dot.y")] <- "y_plotted"
      } else if (projection_metric == "cohens_d"){
        colnames(word_data$word_data)[which(names(word_data$word_data) == "cohens_d.y")] <- "y_plotted"
      }

    }
  }
  return(word_data)
}


#' Plot words
#'
#' textPlot() plots words from textProjection() or textWordPrediction().
#' @param word_data Dataframe from textProjection.
#' @param k_n_words_to_test Select the k most frequent words to significance
#' test (k = sqrt(100*N); N = number of participant responses) (default = TRUE).
#' @param min_freq_words_test Select words to significance test that have occurred
#' at least min_freq_words_test times (default = 1).
#' @param min_freq_words_plot Select words to plot that have occurred at
#' least min_freq_words_plot times (default = 1).
#' @param plot_n_words_square Select the number of significant words to plot in each square
#' of the figure. The significant words in each square are selected
#' according to frequency (default = 3).
#' @param plot_n_words_p Number of significant words to plot on each (positive
#' and negative) side of the x-axes and y-axes (where duplicates are removed);
#' words are selected first according to lowest p-value and then according to frequency (default = 5).
#' Hence, on a two-dimensional plot it is possible that plot_n_words_p = 1 yields 4 words.
#' @param plot_n_word_extreme Number of words that are most extreme on the Supervised Dimension
#' Projection per dimension (i.e., even if not significant; per dimension,
#' where duplicates are removed).
#' @param plot_n_word_extreme_xy Number of words that are extreme in both x and y dimensions,
#' considering overall distance from the origin in the Supervised Dimension Projection space.
#' This selects words based on their combined extremity score, calculated as
#' the Euclidean distance from (0,0). Ensures balance across all nine squares by selecting
#' at least one extreme word per square if available.
#' @param plot_n_word_frequency Number of words to plot based on being most frequent,
#' even if not significant (default = 5).
#' @param plot_n_words_middle Number of words to plot that are in the middle of the Supervised
#' Dimension Projection score, even if not significant (per dimension,
#' where duplicates are removed) (default = 5).
#' @param plot_n_word_random (numeric) Number of randomly selected words to plot.
#' @param title_top Title (default = "Supervised Dimension Projection").
#' @param titles_color Color for all the titles (default: "#61605e").
# @param x_axes If TRUE, plotting on the x_axes.
#' @param y_axes (boolean) If TRUE, also plot on the y-axes (default = FALSE, i.e., a 1-dimensional
#' plot is generated). Plotting on the y-axes produces a 2-dimensional plot, but
#' the textProjection function must have been run with a variable for the y-axes.
#' @param p_alpha Alpha (default = .05).
#' @param overlapping (boolean) Allow overlapping (TRUE) or disallow (FALSE) (default = TRUE).
#' @param p_adjust_method (character) Method to adjust/correct p-values for multiple comparisons
#' (default = "none"; see also "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",  "fdr").
#' @param projection_metric (character) Metric to plot according to; "dot_product" or "cohens_d".
#' @param x_axes_label (character) Label on the x-axes (default = "Supervised Dimension Projection (SDP)").
#' @param y_axes_label (character) Label on the y-axes (default = "Supervised Dimension Projection (SDP)").
#' @param scale_x_axes_lim Manually set the length of the x-axes (default = NULL, which uses
#' ggplot2::scale_x_continuous(limits = scale_x_axes_lim); change e.g., by trying c(-5, 5)).
#' @param scale_y_axes_lim Manually set the length of the y-axes (default = NULL; which uses
#' ggplot2::scale_y_continuous(limits = scale_y_axes_lim); change e.g., by trying c(-5, 5)).
#' @param word_font Font type (default = NULL).
#' @param bivariate_color_codes (HTML color codes. Type = character) The different colors of the words.
#'  Note that, at the moment, two squares should not have the exact same colour code because the numbers
#'  within the squares of the legend will then be aggregated (and show the same, incorrect value).
#' (default: c("#398CF9", "#60A1F7", "#5dc688",
#' "#e07f6a", "#EAEAEA", "#40DD52",
#' "#FF0000", "#EA7467", "#85DB8E")).
#' @param word_size_range Vector with minimum and maximum font size (default: c(3, 8)).
#' @param position_jitter_hight Jitter height (default: .0).
#' @param position_jitter_width Jitter width (default: .03).
#' @param point_size Size of the points indicating the words' position (default: 0.5).
#' @param arrow_transparency Transparency of the lines between each word and point (default: 0.1).
#' @param points_without_words_size Size of the points not linked to a word
#' (default = 0.2; set to 0 to hide them).
#' @param points_without_words_alpha Transparency of the points not linked to a word
#' (default = 0.2; set to 0 to hide them).
#' @param legend_title Title on the color legend (default: "SDP").
#' @param legend_x_axes_label Label on the color legend (default: "x").
#' @param legend_y_axes_label Label on the color legend (default: "y").
#' @param legend_x_position Position on the x coordinates of the color legend (default: 0.02).
#' @param legend_y_position Position on the y coordinates of the color legend (default: 0.02).
#' @param legend_h_size Height of the color legend (default: 0.2).
#' @param legend_w_size Width of the color legend (default: 0.2).
#' @param legend_title_size Font size (default: 7).
#' @param legend_number_size Font size of the values in the legend (default: 2).
#' @param legend_number_colour (string) Colour of the numbers in the box legend.
#' @param group_embeddings1 (boolean) Shows a point representing the aggregated word embedding
#' for group 1 (default = FALSE).
#' @param group_embeddings2 (boolean) Shows a point representing the aggregated word embedding
#' for group 2 (default = FALSE).
#' @param projection_embedding (boolean) Shows a point representing the aggregated direction
#' embedding (default = FALSE).
#' @param aggregated_point_size Size of the points representing the group_embeddings1,
#' group_embeddings2 and projection_embedding (default = 0.8).
#' @param aggregated_shape Shape type of the points representing the group_embeddings1,
#' group_embeddings2 and projection_embedding (default = 8).
#' @param aggregated_color_G1 Color (default = "black").
#' @param aggregated_color_G2 Color (default = "black").
#' @param projection_color Color (default = "blue").
#' @param seed (numeric) Set a different seed (default = 1005).
#' @param explore_words Explore where specific words are positioned in the embedding space.
#' For example, c("happy content", "sad down") (default = NULL).
#' @param explore_words_color Specify the color(s) of the words being explored.
#' For example c("#ad42f5", "green") (default = "#ad42f5").
#' @param explore_words_point Specify the names of the point for the aggregated word embeddings
#' of all the explored words (default = "ALL_1").
#' @param explore_words_aggregation Specify how to aggregate the word embeddings of
#' the explored words (default = "mean").
#' @param remove_words Manually remove words from the plot (which is done just before the
#' words are plotted so that the remove_words are part of previous counts/analyses) (default = NULL).
#' @param space Provide a semantic space if using static embeddings and wanting to explore words (default = NULL).
#' @param n_contrast_group_color Set the color of words that have a higher frequency (N)
#' on the opposite side of their dot product projection (default = NULL).
#' @param n_contrast_group_remove Remove words that have a higher frequency (N) on the
#' opposite side of their dot product projection (default = FALSE).
#' @param scaling Scaling word embeddings before aggregation (default = FALSE).
#' @return A 1- or 2-dimensional word plot, as well as a tibble with the processed data used
#' to create the plot.
#' @examples
#' # The test-data included in the package is called: DP_projections_HILS_SWLS_100
#'
#' # Supervised Dimension Projection Plot
#' plot_projection <- textPlot(
#'   word_data = DP_projections_HILS_SWLS_100,
#'   k_n_words_to_test = FALSE,
#'   min_freq_words_test = 1,
#'   plot_n_words_square = 3,
#'   plot_n_words_p = 3,
#'   plot_n_word_extreme = 1,
#'   plot_n_word_frequency = 1,
#'   plot_n_words_middle = 1,
#'   y_axes = FALSE,
#'   p_alpha = 0.05,
#'   title_top = "Supervised Dimension Projection (SDP)",
#'   x_axes_label = "Low vs. High HILS score",
#'   y_axes_label = "Low vs. High SWLS score",
#'   p_adjust_method = "bonferroni",
#'   scale_y_axes_lim = NULL
#' )
#' plot_projection
#'
#' names(DP_projections_HILS_SWLS_100)
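#'
#' # A 2-dimensional plot can be requested with y_axes = TRUE (a sketch, assuming the
#' # projection data includes a y-dimension):
#' # plot_projection_2d <- textPlot(
#' #   word_data = DP_projections_HILS_SWLS_100,
#' #   y_axes = TRUE
#' # )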
#' @seealso See \code{\link{textProjection}}.
#' @importFrom tibble as_tibble tibble
#' @importFrom dplyr row_number slice mutate mutate_if bind_rows group_by summarize left_join %>% n
#' @importFrom tidyr gather separate
#' @importFrom ggplot2 position_jitter element_text element_blank coord_fixed theme
#' theme_void theme_minimal aes labs scale_color_identity
#' @importFrom rlang sym
#' @importFrom cowplot ggdraw draw_plot
#' @importFrom purrr as_vector
#' @importFrom stringi stri_split_boundaries
#' @export
textPlot <- function(
    word_data,
    k_n_words_to_test = FALSE,
    min_freq_words_test = 1,
    min_freq_words_plot = 1,
    plot_n_words_square = 3,
    plot_n_words_p = 5,
    plot_n_word_extreme = 5,
    plot_n_word_extreme_xy = 0,
    plot_n_word_frequency = 5,
    plot_n_words_middle = 5,
    plot_n_word_random = 0,
    titles_color = "#61605e",
    y_axes = FALSE,
    p_alpha = 0.05,
    overlapping = TRUE,
    p_adjust_method = "none",
    projection_metric = "dot_product",
    title_top = "Supervised Dimension Projection",
    x_axes_label = "Supervised Dimension Projection (SDP)",
    y_axes_label = "Supervised Dimension Projection (SDP)",
    scale_x_axes_lim = NULL,
    scale_y_axes_lim = NULL,
    word_font = NULL,
    bivariate_color_codes = c(
      "#398CF9", "#60A1F7", "#5dc688",
      "#e07f6a", "#EAEAEA", "#40DD52",
      "#FF0000", "#EA7467", "#85DB8E"
    ),
    word_size_range = c(3, 8),
    position_jitter_hight = .0,
    position_jitter_width = .03,
    point_size = 0.5,
    arrow_transparency = 0.1,
    points_without_words_size = 0.2,
    points_without_words_alpha = 0.2,
    legend_title = "SDP",
    legend_x_axes_label = "x",
    legend_y_axes_label = "y",
    legend_x_position = 0.02,
    legend_y_position = 0.02,
    legend_h_size = 0.2,
    legend_w_size = 0.2,
    legend_title_size = 7,
    legend_number_size = 2,
    legend_number_colour = "white",
    group_embeddings1 = FALSE,
    group_embeddings2 = FALSE,
    projection_embedding = FALSE,
    aggregated_point_size = 0.8,
    aggregated_shape = 8,
    aggregated_color_G1 = "black",
    aggregated_color_G2 = "black",
    projection_color = "blue",
    seed = 1005,
    explore_words = NULL,
    explore_words_color = "#ad42f5",
    explore_words_point = "ALL_1",
    explore_words_aggregation = "mean",
    remove_words = NULL,
    n_contrast_group_color = NULL,
    n_contrast_group_remove = FALSE,
    space = NULL,
    scaling = FALSE) {

  ##### Comment to be saved ####
  text_plot_comment <- paste(
    "INFORMATION ABOUT THE PROJECTION",
    comment(word_data),
    "INFORMATION ABOUT THE PLOT",
    "word_data =", substitute(word_data),
    "k_n_words_to_test =", k_n_words_to_test,
    "min_freq_words_test =", min_freq_words_test,
    "min_freq_words_plot =", min_freq_words_plot,
    "plot_n_words_square =", plot_n_words_square,
    "plot_n_words_p =", plot_n_words_p,
    "plot_n_word_extreme =", plot_n_word_extreme,
    "plot_n_word_frequency =", plot_n_word_frequency,
    "plot_n_words_middle =", plot_n_words_middle,
    "y_axes =", y_axes,
    "p_alpha =", p_alpha,
    "overlapping", overlapping,
    "p_adjust_method = ", p_adjust_method,
    "projection_metric = ", projection_metric,
    "bivariate_color_codes =", paste(bivariate_color_codes, collapse = " "),
    "word_size_range =", paste(word_size_range, sep = "-", collapse = " - "),
    "position_jitter_hight =", position_jitter_hight,
    "position_jitter_width =", position_jitter_width,
    "point_size =", point_size,
    "arrow_transparency =", point_size,
    "points_without_words_size =", points_without_words_size,
    "points_without_words_alpha =", points_without_words_alpha,
    "legend_x_position =", legend_x_position,
    "legend_y_position =", legend_y_position,
    "legend_h_size =", legend_h_size,
    "legend_w_size =", legend_w_size,
    "legend_title_size =", legend_title_size,
    "legend_number_size =", legend_number_size,
    "legend_number_colour =", legend_number_colour
  )

  set.seed(seed)
  plot_type_text <- sub(".*type = ", "", comment(word_data))
  plot_type_name <- sub(" .*", "", plot_type_text)

  #### Sorting out axes ####
  # Renaming column names from different types of word_data
  word_data <- adjust_for_plot_type(
    word_data = word_data,
    y_axes = y_axes,
    projection_metric = projection_metric
  )

  # Set Metric to use for plotting
  x_axes_1 <- "x_plotted"
#  if(projection_metric == "cohens_d"){
#    x_axes_1 <- "cohens_d.x"
#  }

  p_values_x <- "p_values_x"

  if (y_axes == TRUE) {

    y_axes_1 <- "y_plotted"
#    if(projection_metric == "cohens_d"){
#      y_axes_1 <- "cohens_d.y"
#    }

    p_values_y <- "p_values_y"
    y_axes_values_hide <- FALSE

  } else if (y_axes == FALSE) {
    y_axes_1 <- NULL
    p_values_y <- NULL
    y_axes_values_hide <- TRUE
  }

  #### Allow overlapping ####
  if (overlapping == TRUE) {
    options(ggrepel.max.overlaps = 1000)
  }

  #### Removing words MANUALLY #######

  if (!is.null(remove_words)) {
    word_data$word_data <- word_data$word_data %>% dplyr::filter(!words %in% remove_words)
  }

  #### Selecting words to plot ####
  # Computing adjusted p-values with those words selected by min_freq_words_test
  word_data_padjusted <- word_data$word_data[word_data$word_data$n >= min_freq_words_test, ]

  # Select the aggregated points (rows with n == 0 hold the aggregated embeddings)
  aggregated_embeddings_data <- word_data$word_data[word_data$word_data$n == 0, ]

  # Computing adjusted p-values with those words selected by: k = sqrt(100*N)
  if (k_n_words_to_test == TRUE) {
    words_k <- sqrt(100 * word_data$word_data$N_participant_responses[1])
    word_data_padjusted <- word_data_padjusted %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:words_k)
  }
  # word_data_padjusted$p_values_x
  word_data_padjusted$adjusted_p_values.x <- stats::p.adjust(purrr::as_vector(word_data_padjusted[, "p_values_x"]),
    method = p_adjust_method
  )
  word_data1 <- dplyr::left_join(word_data$word_data, word_data_padjusted[, c("words", "adjusted_p_values.x")],
    by = "words"
  )
  # word_data$adjusted_p_values.x

  if (is.null(y_axes_1) == FALSE) {
    # Computing adjusted p-values
    word_data1_padjusted_y <- word_data1[word_data1$n >= min_freq_words_test, ]
    word_data1_padjusted_y$adjusted_p_values.y <- stats::p.adjust(
      purrr::as_vector(word_data1_padjusted_y[, "p_values_y"]),
      method = p_adjust_method
    )
    word_data1 <- dplyr::left_join(word_data1,
                                   word_data1_padjusted_y[, c("words", "adjusted_p_values.y")], by = "words")
  }

  # Select only words occurring at least min_freq_words_plot times to plot (i.e., after correction for multiple comparisons in the significance test)
  word_data1 <- word_data1[word_data1$n >= min_freq_words_plot, ]

  # Select words based on square position, and then the most frequent words in each "square"
  # (see the legend and plot_n_words_square)
  if (is.null(y_axes_1) == TRUE) {
    word_data1 <- word_data1 %>%
      dplyr::mutate(square_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha ~ 1,
        x_plotted < 0 & adjusted_p_values.x > p_alpha ~ 2,
        x_plotted > 0 & adjusted_p_values.x < p_alpha ~ 3
      ))

    data_p_sq1 <- word_data1[word_data1$square_categories == 1, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)

    data_p_sq3 <- word_data1[word_data1$square_categories == 3, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)

    data_p_sq_all <- rbind(data_p_sq1, data_p_sq3) # data_p_sq2,
  }

  if (is.null(y_axes_1) == FALSE) {
    # Categorize words to apply a specific color (see plot_n_words_square)
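    # The nine square_categories follow this layout
    # (columns: x negative-significant / not significant / positive-significant;
    #  rows: y positive-significant / not significant / negative-significant):
    #   1 2 3
    #   4 5 6
    #   7 8 9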
    word_data1 <- word_data1 %>%
      dplyr::mutate(square_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ 1,
        adjusted_p_values.x > p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ 2,
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ 3,
        x_plotted < 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ 4,
        adjusted_p_values.x > p_alpha & adjusted_p_values.y > p_alpha ~ 5,
        x_plotted > 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ 6,
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ 7,
        adjusted_p_values.x > p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ 8,
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ 9
      ))

    data_p_sq1 <- word_data1[word_data1$square_categories == 1, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq2 <- word_data1[word_data1$square_categories == 2, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq3 <- word_data1[word_data1$square_categories == 3, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq4 <- word_data1[word_data1$square_categories == 4, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    #  data_p_sq5
    data_p_sq6 <- word_data1[word_data1$square_categories == 6, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq7 <- word_data1[word_data1$square_categories == 7, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq8 <- word_data1[word_data1$square_categories == 8, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)
    data_p_sq9 <- word_data1[word_data1$square_categories == 9, ] %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_square)

    data_p_sq_all <- rbind(
      data_p_sq1, data_p_sq2, data_p_sq3,
      data_p_sq4, data_p_sq6, # data_p_sq5,
      data_p_sq7, data_p_sq8, data_p_sq9
    )
  }


  # Select only words below alpha; and then top x_plotted
  data_p_x_neg <- word_data1 %>%
    dplyr::filter(adjusted_p_values.x < p_alpha) %>%
    dplyr::arrange(x_plotted) %>%
    dplyr::slice(0:plot_n_words_p)

  data_p_x_pos <- word_data1 %>%
    dplyr::filter(adjusted_p_values.x < p_alpha) %>%
    dplyr::arrange(-x_plotted) %>%
    dplyr::slice(0:plot_n_words_p)

  # Select plot_n_word_extreme and Select plot_n_word_frequency
  word_data1_extrem_max_x <- word_data1 %>%
    dplyr::arrange(-x_plotted) %>%
    dplyr::slice(0:plot_n_word_extreme)

  word_data1_extrem_min_x <- word_data1 %>%
    dplyr::arrange(x_plotted) %>%
    dplyr::slice(0:plot_n_word_extreme)

  word_data1_frequency_x <- word_data1 %>%
    dplyr::arrange(-n) %>%
    dplyr::slice(0:plot_n_word_frequency)

  # 7 March 2025: Add a random set of words (e.g., plot_n_word_random = 10)
  # Identify all selected words
  selected_words_x <- dplyr::bind_rows(
    data_p_x_neg, data_p_x_pos,
    word_data1_extrem_max_x, word_data1_extrem_min_x,
    word_data1_frequency_x
  ) %>%
    dplyr::distinct(words)

  # Exclude selected words and sample random words from the remaining
  word_data1_random_x <- word_data1 %>%
    dplyr::anti_join(selected_words_x, by = "words") %>%  # Exclude words already selected
    dplyr::sample_n(min(plot_n_word_random, n()))  # Ensure it does not exceed available words

  # Select the middle range, order according to frequency and then select the plot_n_words_middle = 5
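  # (the "middle" is defined as within +/- 0.1 SD of the mean x_plotted value)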
  mean_m_sd_x <- mean(word_data1$x_plotted, na.rm = TRUE) - (sd(word_data1$x_plotted, na.rm = TRUE) / 10)
  mean_p_sd_x <- mean(word_data1$x_plotted, na.rm = TRUE) + (sd(word_data1$x_plotted, na.rm = TRUE) / 10)
  word_data1_middle_x <- word_data1 %>%
    dplyr::filter(dplyr::between(word_data1$x_plotted, mean_m_sd_x, mean_p_sd_x)) %>%
    dplyr::arrange(-n) %>%
    dplyr::slice(0:plot_n_words_middle)

  word_data1_x <- word_data1 %>%
    dplyr::left_join(data_p_sq_all %>%
                       dplyr::transmute(words, check_p_square = 1), by = "words") %>%
    dplyr::left_join(data_p_x_neg %>%
                       dplyr::transmute(words, check_p_x_neg = 1), by = "words") %>%
    dplyr::left_join(data_p_x_pos %>%
                       dplyr::transmute(words, check_p_x_pos = 1), by = "words") %>%
    dplyr::left_join(word_data1_extrem_max_x %>%
                       dplyr::transmute(words, check_extreme_max_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_extrem_min_x %>%
                       dplyr::transmute(words, check_extreme_min_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_frequency_x %>%
                       dplyr::transmute(words, check_extreme_frequency_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_middle_x %>%
                       dplyr::transmute(words, check_middle_x = 1), by = "words") %>%
    dplyr::left_join(word_data1_random_x %>%
                       dplyr::transmute(words, check_random_x = 1), by = "words") %>%  # Adding the random selection
    dplyr::mutate(extremes_all_x = rowSums(cbind(
      check_p_square, check_p_x_neg, check_p_x_pos, check_extreme_max_x, check_extreme_min_x,
      check_extreme_frequency_x, check_middle_x, check_random_x
    ), na.rm = TRUE))


  ###### Sort words for y-axes.
  if (is.null(y_axes_1) == FALSE) {
    # Select only words below alpha; and then top y_plotted
    data_p_y_neg <- word_data1 %>%
      dplyr::filter(adjusted_p_values.y < p_alpha) %>%
      dplyr::arrange(y_plotted) %>%
      dplyr::slice(0:plot_n_words_p)

    data_p_y_pos <- word_data1 %>%
      dplyr::filter(adjusted_p_values.y < p_alpha) %>%
      dplyr::arrange(-y_plotted) %>%
      dplyr::slice(0:plot_n_words_p)

    # Select plot_n_word_extreme and Select plot_n_word_frequency
    word_data1_extrem_max_y <- word_data1 %>%
      dplyr::arrange(-y_plotted) %>%
      dplyr::slice(0:plot_n_word_extreme)

    word_data1_extrem_min_y <- word_data1 %>%
      dplyr::arrange(y_plotted) %>%
      dplyr::slice(0:plot_n_word_extreme)



    ##### Extreme x and y #######
#    if (plot_n_word_extreme_xy > 0) {
      # Compute combined extremity score
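      # (extremity_score = Euclidean distance from the origin, i.e., sqrt(x_plotted^2 + y_plotted^2))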
      word_data1 <- word_data1 %>%
        dplyr::mutate(extremity_score = sqrt(x_plotted^2 + y_plotted^2))


      # Ensure selection from all nine squares (if available)
      word_data1_extreme_xy <- word_data1 %>%
        dplyr::group_by(square_categories) %>%
        dplyr::arrange(dplyr::if_else(square_categories == 5, extremity_score, -extremity_score)) %>% # Flip sorting for category 5
        dplyr::slice_head(n = plot_n_word_extreme_xy) %>% # Ensure a spread across squares
        dplyr::ungroup()

      # Combine extreme selections
      word_data1_extreme_xy <- dplyr::bind_rows(word_data1_extreme_xy) %>%
        dplyr::distinct(words, .keep_all = TRUE)  # Remove duplicates
#    }

    ################
    word_data1_frequency_y <- word_data1 %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_word_frequency)

    # 7 March 2025: Add a random set of words (e.g., plot_n_word_random = 10)
    # Identify all selected words
    selected_words <- dplyr::bind_rows(
      word_data1_x,
      data_p_y_neg, data_p_y_pos,
      word_data1_extrem_max_y, word_data1_extrem_min_y,
      word_data1_frequency_y
    ) %>%
      dplyr::distinct(words)

    # Exclude selected words and sample random words from the remaining
    word_data1_random_y <- word_data1 %>%
      dplyr::anti_join(selected_words, by = "words") %>%  # Exclude words already selected
      dplyr::sample_n(min(plot_n_word_random, n()))  # Ensure it does not exceed available words

    # Select the middle range, order according to frequency and then select the plot_n_words_middle =5
    mean_m_sd_y <- mean(word_data1$y_plotted, na.rm = TRUE) - (sd(word_data1$y_plotted, na.rm = TRUE) / 10)
    mean_p_sd_y <- mean(word_data1$y_plotted, na.rm = TRUE) + (sd(word_data1$y_plotted, na.rm = TRUE) / 10)
    word_data1_middle_y <- word_data1 %>%
      dplyr::filter(dplyr::between(word_data1$y_plotted, mean_m_sd_y, mean_p_sd_y)) %>%
      dplyr::arrange(-n) %>%
      dplyr::slice(0:plot_n_words_middle) # TODO: selecting on frequency again; perhaps select the exact middle instead?

    word_data_all <- word_data1_x %>%
      dplyr::left_join(data_p_y_pos %>%
                         dplyr::transmute(words, check_p_y_pos = 1), by = "words") %>%
      dplyr::left_join(data_p_y_neg %>%
                         dplyr::transmute(words, check_p_y_neg = 1), by = "words") %>%
      dplyr::left_join(word_data1_extrem_max_y %>%
                         dplyr::transmute(words, check_extreme_max_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_extrem_min_y %>%
                         dplyr::transmute(words, check_extreme_min_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_extreme_xy %>%
                         dplyr::transmute(words, check_extreme_xy = 1), by = "words") %>%
      dplyr::left_join(word_data1_frequency_y %>%
                         dplyr::transmute(words, check_extreme_frequency_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_middle_y %>%
                         dplyr::transmute(words, check_middle_y = 1), by = "words") %>%
      dplyr::left_join(word_data1_random_y %>%
                         dplyr::transmute(words, check_random_y = 1), by = "words") %>%  # Adding the random selection
      dplyr::mutate(extremes_all_y = rowSums(cbind(
        check_p_y_neg, check_p_y_pos, check_extreme_max_y, check_extreme_min_y, check_extreme_xy,
        check_extreme_frequency_y, check_middle_y, check_random_y
      ), na.rm = TRUE)) %>%
      dplyr::mutate(extremes_all = rowSums(cbind(extremes_all_x, extremes_all_y), na.rm = TRUE))



    # Categorize words to apply specific color
    word_data_all <- word_data_all %>%
      dplyr::mutate(colour_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[1],
        adjusted_p_values.x > p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[2],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[3],
        x_plotted < 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ bivariate_color_codes[4],
        adjusted_p_values.x > p_alpha & adjusted_p_values.y > p_alpha ~ bivariate_color_codes[5],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ bivariate_color_codes[6],
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[7],
        adjusted_p_values.x > p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[8],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ bivariate_color_codes[9]
      ))
  }


  if (is.null(y_axes_1) == TRUE) {
    word_data_all <- word_data1_x %>%
      dplyr::mutate(colour_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha ~ bivariate_color_codes[4],
        # x_plotted < 0 & adjusted_p_values.x > p_alpha ~ bivariate_color_codes[5],
        # Some adjusted_p_values.x are NA because they were not tested for multiple comparisons
        # (min_freq_words_test excludes them beforehand)
        adjusted_p_values.x > p_alpha | is.na(adjusted_p_values.x) ~ bivariate_color_codes[5],
        x_plotted > 0 & adjusted_p_values.x < p_alpha ~ bivariate_color_codes[6]
      ))
  }

  #### Colorize words that are more frequent on the opposite side of the dot product projection ####
  if (is.character(n_contrast_group_color) == TRUE) {
    # Select words with MORE words in G1 and POSITIVE dot product (i.e., remove words that are
    # more represented in the opposite group of its dot product projection)
    word_data_all$colour_categories[(abs(word_data_all$n_g1.x) > abs(word_data_all$n_g2.x) &
                                       word_data_all$x_plotted > 0)] <- n_contrast_group_color

    # Select words with MORE words in G2 and POSITIVE dot product (i.e., remove words that are more
    # represented in the opposite group of its dot product projection)
    word_data_all$colour_categories[(abs(word_data_all$n_g1.x) < abs(word_data_all$n_g2.x) &
                                       word_data_all$x_plotted < 0)] <- n_contrast_group_color
  }

  #### Remove more frequent words on the opposite side of the dot product projection ####
  if (n_contrast_group_remove == TRUE) {
    word_data_all1 <- word_data_all %>%
      # Select words with MORE words in G1 and NEGATIVE dot product (i.e., do not select words
      # that are more represented in the opposite group of its dot product projection)
      filter((abs(n_g1.x) > abs(n_g2.x) &
                x_plotted < 0))

    word_data_all2 <- word_data_all %>%
      # Select words with MORE words in G2 and POSITIVE dot product (i.e., do not select words that
      # are more represented in the opposite group of its dot product projection)
      filter((abs(n_g1.x) < abs(n_g2.x) &
                x_plotted > 0))

    word_data_all <- bind_rows(word_data_all1, word_data_all2)
  }

  #### Preparing for the plot function ####

  # This workaround is needed because it is not possible to pass "0" as a parameter
  only_x_dimension <- NULL
  if (is.null(y_axes_1) == TRUE) {
    only_x_dimension <- 0
    y_axes_1 <- "only_x_dimension"
  }

  # Add or Remove values on y-axes
  if (y_axes_values_hide) {
    y_axes_values <- ggplot2::element_blank()
  } else {
    y_axes_values <- ggplot2::element_text()
  }

  # Word data adjusted for if y_axes exists
  if (y_axes == TRUE) {
    word_data_all_yadjusted <- word_data_all[word_data_all$extremes_all_x >= 1 | word_data_all$extremes_all_y >= 1, ]
  } else if (y_axes == FALSE) {
    word_data_all_yadjusted <- word_data_all[word_data_all$extremes_all_x >= 1, ]
  }


  ##### Adding/exploring words MANUALLY (e.g., explore_words = "happy") ######

  if (!is.null(explore_words) == TRUE) {
    if (plot_type_name == "textProjection") {
      word_data_all_yadjusted <- textOwnWordsProjection(
        word_data = word_data,
        word_data_all = word_data_all,
        word_data_all_yadjusted = word_data_all_yadjusted,
        y_axes = y_axes,
        explore_words = explore_words,
        explore_words_color = explore_words_color,
        explore_words_point = explore_words_point,
        explore_words_aggregation = explore_words_aggregation,
        space = space,
        text_plot_comment = text_plot_comment,
        scaling = scaling
      )
    }

    if (plot_type_name == "textWordPrediction") {
      word_data_all_yadjusted <- textOwnWordPrediction(
        word_data = word_data,
        word_data_all = word_data_all,
        word_data_all_yadjusted = word_data_all_yadjusted,
        y_axes = y_axes,
        explore_words = explore_words,
        explore_words_color = explore_words_color,
        explore_words_point = explore_words_point,
        explore_words_aggregation = explore_words_aggregation,
        space = space,
        text_plot_comment = text_plot_comment,
        scaling = scaling
      )
    }
  }

  #### Plotting  ####
  plot <- textPlotting(
    word_data_all = word_data_all,
    word_data_all_yadjusted = word_data_all_yadjusted,
    only_x_dimension = only_x_dimension,
    x_axes_1 = x_axes_1,
    y_axes_1 = y_axes_1,
    group_embeddings1 = group_embeddings1,
    group_embeddings2 = group_embeddings2,
    projection_embedding = projection_embedding,
    label = words,
    points_without_words_size = points_without_words_size,
    points_without_words_alpha = points_without_words_alpha,
    colour_categories = colour_categories,
    arrow_transparency = arrow_transparency,
    scale_x_axes_lim = scale_x_axes_lim,
    scale_y_axes_lim = scale_y_axes_lim,
    position_jitter_hight = position_jitter_hight,
    position_jitter_width = position_jitter_width,
    word_font = word_font,
    point_size = point_size,
    aggregated_embeddings_data = aggregated_embeddings_data,
    aggregated_point_size = aggregated_point_size,
    aggregated_shape = aggregated_shape,
    aggregated_color_G1 = aggregated_color_G1,
    aggregated_color_G2 = aggregated_color_G2,
    projection_color = projection_color,
    word_size_range = word_size_range,
    # titles
    title_top = title_top,
    titles_color = titles_color,
    x_axes_label = x_axes_label,
    y_axes_label = y_axes_label,
    y_axes_values = y_axes_values
  )

  #### Reset overlapping ####
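  # (restore ggrepel's default of 10 max overlaps)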
  options(ggrepel.max.overlaps = 10)

  # plot

  #### Creating the legend ####

  legend <- textLegend(
    bivariate_color_codes = bivariate_color_codes,
    y_axes_1 = y_axes_1,
    fill = fill,
    legend_title = legend_title,
    legend_title_size = legend_title_size,
    legend_x_axes_label = legend_x_axes_label,
    legend_y_axes_label = legend_y_axes_label,
    word_data_all = word_data_all,
    legend_number_size = legend_number_size,
    legend_number_colour = legend_number_colour,
    titles_color = titles_color
  )
  # legend

  #### Plot both figure and legend ####
  final_plot <- suppressWarnings(cowplot::ggdraw() +
                                   cowplot::draw_plot(plot, 0, 0, 1, 1) +
                                   cowplot::draw_plot(legend,
                                                      legend_x_position,
                                                      legend_y_position,
                                                      legend_h_size,
                                                      legend_w_size))

  output_plot_data <- list(final_plot, text_plot_comment, word_data_all)
  names(output_plot_data) <- c("final_plot", "description", "processed_word_data")
  output_plot_data
}