R/4_2_wordclouds.R
In topics: Creating and Significance Testing Language Features for Visualisation

#' Rename the columns of a model with the vocabulary
#' @param model (list) the model to be modified
#' @param col (string) the column to be modified
#' @param vocabulary (list) the vocabulary to be used
#' @return (list) the modified model
#' @noRd
name_cols_with_vocab <- function(
    model, 
    col, 
    vocabulary){
  
  colnames(model[[col]]) <- as.character(unlist(vocabulary))
  return(model)
}

#' This is a private function
#' @param df_list (list) a list of data frames with each topic
#' @param phi (data.frame) data frame with topic word scores
#' @param model_type (string) "mallet"
#' @return list of data.frames with assigned phi
#' @noRd
assign_phi_to_words <- function(
    df_list, 
    phi, 
    model_type){
  
  
  for (i in 1:length(df_list)){
    df <- data.frame(df_list[[i]]) 
    colnames(df)[1] <- "Word"
    phi_vector <- c()
    for (j in 1:nrow(df)){
      word <- df[j,]
      if (model_type=="mallet"){
        phi_vector <- c(phi_vector,phi[i,][word])
      } else {
        phi_vector <- c(phi_vector,phi[paste0("t_",i),][word])
      }
    }
    df$phi <- phi_vector
    df_list[[i]] <- df
  }
  return(df_list)
}

#
#' Create list of data.frames with topics most frequent words and topic term scores
#' @param df (string) 
#' @return list of data.frames
#' @noRd
create_df_list_bert_topics <- function(
    df
    ) {
  
  n <- nrow(df$summary)
  df_list <- vector("list", n)

  for (i in 1:n) {
    top_terms_bert <- df$summary$top_terms[i]
    df_list[[i]] <- top_terms_bert
  }
  
  # Convert each topic's word list to a dataframe with Word and phi
  df_list <- lapply(df_list, function(words_string) {
    # Split into individual words
    words <- unlist(strsplit(words_string, ",\\s*"))
    # If removing/changing below line then update message in text-package
    phi <- rev(seq_along(words)) / sum(seq_along(words))
    
    # Return as data.frame
    tibble::tibble(Word = words, phi = phi)
  })
  
  # Assign pseudo-phi values as descending importance scores (normalized from 1 to N)
  # This is moved to the text-package message(colourise("Note: BERTopic does not provide true 'phi' values (i.e., P(word | topic)) as in probabilistic models like LDA. Instead, we assign descending normalized values to approximate word importance.", "blue"))
  
  return(df_list)
}



#' This is a private function 
#' @param summary (data.frame) the models summary
#' @return a list of dataframes for each topic filled with top terms
#' @importFrom stats complete.cases
#' @noRd
create_topic_words_dfs <- function(
    summary){
  
  n <- nrow(summary)
  df_list <- vector("list", n)
  
  # Create and name the dataframes in a loop
  for (i in 1:n) {
    word_vector <- unlist(strsplit(summary[paste0("t_",i),]$top_terms, ", "))
    df <- data.frame(Word = word_vector) # Create an empty dataframe
    df <- df_cleaned <- df[stats::complete.cases(df), ]
    df_list[[i]] <- df  # Add the dataframe to the list
    
    name <- paste("t", i, sep = "_")  # Create the name for the dataframe
    assign(name, df_list[[i]])  # Assign the dataframe to a variable with the specified name
  }
  return(df_list)
}



#' Function to select non-overlapping texts and count excluded topics
#'
#' @param df A tibble or data frame containing the text data.
#' @param text_column A character string specifying the name of the text column.
#' @param n_texts A numeric value indicating the number of texts to select (default is 3).
#' @param allowed_word_overlap A numeric value indicating the maximum number of words allowed to overlap (default is 5).
#' @return A list containing:
#'   - `selected`: A tibble or data frame containing the selected non-overlapping texts.
#'   - `excluded_count`: The number of topics that were not selected due to overlap.
#' @importFrom stringr str_split
#' @importFrom dplyr filter
#' @noRd
select_non_overlapping_texts <- function(
    df, 
    text_column, 
    n_texts = 3, 
    allowed_word_overlap = 5) {
  
  # Function to count word overlap between two texts
  count_word_overlap <- function(text1, text2) {
    words1 <- unlist(stringr::str_split(text1, "\\s+"))
    words2 <- unlist(stringr::str_split(text2, "\\s+"))
    length(intersect(words1, words2))
  }
  
  # Extract the text data
  texts <- df[[text_column]]
  
  # Initialize with the first text
  selected_texts <- texts[1]
  excluded_count <- 0  # Counter for excluded topics
  
  # Loop through each subsequent text
  for (i in 2:length(texts)) {
    # Check overlap with all currently selected texts
    overlap_counts <- sapply(selected_texts, function(selected) {
      count_word_overlap(selected, texts[i])
    })
    
    # Include the text only if the overlap with all selected texts is less than allowed_word_overlap
    if (all(overlap_counts < allowed_word_overlap)) {
      selected_texts <- c(selected_texts, texts[i])
    } else {
      # Increment the excluded count if the text is not selected
      excluded_count <- excluded_count + 1
    }
    
    # Stop if we have reached the desired number of texts
    if (length(selected_texts) >= n_texts) {
      break
    }
  }
  
  # Filter the data frame to keep only the selected texts
  #filtered_df <- filter(df, !!rlang::sym(text_column) %in% selected_texts)
  filtered_df <- df[df[[text_column]] %in% selected_texts, ]
  
  if(!is.null(filtered_df$color_categories1[[1]])){
    
    colour_category_number <- unique(filtered_df$color_categories1)
    # Return a list with the filtered data frame and excluded count
    message(colourise(paste0(
      excluded_count," topic(s) were excluded in color_category ", 
      colour_category_number, " due to the `allowed_word_overlap` filter. "
    ), "blue"))
    
  } else {
    
    message(colourise(paste0(
      excluded_count," topics were excluded due to the `allowed_word_overlap` filter. ",
      "blue")))
    
  }
  
  
  return(filtered_df)
  #return(list(
  #  selected = filtered_df,
  #  excluded_count = excluded_count
  #))
}

#' Assgning numeric categories for further topic visualization colors.
#' @param topic_loadings_all (tibble) The tibble from topicsTest1
#' @param p_alpha (numeric) Threshold of p value set by the user for visualising significant topics 
#' @param dimNo (numeric) 1 dimension or 2 dimensions
#' @return A new tibble with the assigned numbers
#' @importFrom dplyr mutate
#' @noRd
topicsNumAssign_dim2 <- function(
    topic_loadings_all,
    p_alpha = 0.05, 
    dimNo = 2){
  
  if (dimNo == 1){
    num1 <- 1:3
    topic_loadings_all <- topic_loadings_all %>%
      dplyr::mutate(color_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha ~ num1[1],
        adjusted_p_values.x > p_alpha ~ num1[2],
        x_plotted > 0 & adjusted_p_values.x < p_alpha ~ num1[3]
      ))
  }else{
    num1 <- 1:9
    topic_loadings_all <- topic_loadings_all %>%
      dplyr::mutate(color_categories = dplyr::case_when(
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ num1[1],
        adjusted_p_values.x > p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ num1[2],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted > 0 & adjusted_p_values.y < p_alpha ~ num1[3],
        x_plotted < 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ num1[4],
        adjusted_p_values.x > p_alpha & adjusted_p_values.y > p_alpha ~ num1[5],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & adjusted_p_values.y > p_alpha ~ num1[6],
        x_plotted < 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ num1[7],
        adjusted_p_values.x > p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ num1[8],
        x_plotted > 0 & adjusted_p_values.x < p_alpha & y_plotted < 0 & adjusted_p_values.y < p_alpha ~ num1[9]
      ))
  }
  
  return (topic_loadings_all)
}

#' A Private function to check the validity of color hex codes for parameter highlight_topic_words.
#' highlight_topic_words (named vector) the named vector with negative words as elements and color hex codes as names for each word
#' @noRd
all_hex <- function(highlight_topic_words) {
  all(grepl("^#[0-9A-Fa-f]{6}$", highlight_topic_words))
}

#' Separate words for negative words as a fixed-colour tibble and other words as a gradient-color tibble.
#' @noRd
separate_neg_words <- function(
    df_list_element, 
    highlight_topic_words) {
  
  pattern <- paste0("(^|[ _])(", paste(highlight_topic_words, collapse = "|"), ")(?=[ _])")
  
  # Create df_list_ele_fixed with words that match or contain the dictionary keys
  df_list_ele_fixed <- df_list_element %>%
    dplyr::filter(stringr::str_detect(Word, pattern)) %>%
    dplyr::mutate(FixedColor = highlight_topic_words[stringr::str_extract(
      Word, 
      pattern)])
  
  # Create df_list_ele_gradient with the remaining words
  df_list_ele_gradient <- df_list_element %>%
    dplyr::filter(!stringr::str_detect(
      Word,
      pattern))
  
  return (list(
    df_list_ele_gradient, 
    df_list_ele_fixed))
}


# Helper functions for create_plots 
#' @noRd
ensure_dir <- function(path){ if(!dir.exists(path)) dir.create(path, recursive = TRUE) }

#' @noRd
save_plot <- function(plot, stub, header='', save_dir, figure_format, width, height, seed){
  if(is.null(save_dir)) return(invisible(NULL))
  root <- file.path(save_dir, sprintf("seed_%s", seed), "wordclouds")
  ensure_dir(root)
  ggplot2::ggsave(
    file.path(root, sprintf("%s%s.%s", header, stub, figure_format)),
    plot   = plot,
    width  = width,
    height = height,
    units  = "in"
  )
}

#choose_scheme <- function(est){ if(est[1] < 0) color_negative_cor else color_positive_cor }

#' @noRd
highlight_neg <- function(topic_df){
  if(is.null(highlight_topic_words) || !is.character(highlight_topic_words))
    return(topic_df)
  separated <- separate_neg_words(topic_df, highlight_topic_words)
  separated[[2]] <- dplyr::mutate(separated[[2]], source = "negDic",
                                  label_content = sprintf("<b><i>%s</i></b>", Word))
  separated[[1]] <- dplyr::mutate(separated[[1]], source = "notNegDic",
                                  label_content = Word)
  dplyr::bind_rows(separated[[1]], separated[[2]]) %>%
    dplyr::mutate(phi = phi^1.1 / sum(phi^1.1))
}

#' @noRd
build_cloud <- function(data, scheme, max_size_local){
  lbl <- if("label_content" %in% colnames(data)) "label_content" else "Word"
  # Here we set what the size and colour of the words in topics represents
  ggplot2::ggplot(data, ggplot2::aes_string(label = lbl, size = "phi", color = "phi")) +
    ggwordcloud::geom_text_wordcloud() +
    ggplot2::scale_size_area(max_size = max_size_local) +
    ggplot2::theme_minimal() +
    scheme
}

#' @noRd
min_p <- function(p_vec) suppressWarnings(min(p_vec, na.rm = TRUE))

#' @noRd
format_p <- function(p_vec) format(min_p(p_vec), scientific = TRUE, digits = 2)

#' Generate topic and n-gram wordcloud plots
#'
#' @param df_list list of data.frames with topic word distributions
#' @param summary data.frame with topic prevalences (row‑names t_1, t_2, …)
#' @param ngrams data.frame with two columns: ngrams, prevalence
#' @param test data.frame returned by textTopicTest()
#' @param test_type "linear_regression" | "binary_regression"
#' @param cor_var variable name used in the regression columns (e.g. "age")
#' @param popout tibble of topic indices to always plot regardless of p‑value
#' @param color_negative_cor, color_positive_cor ggplot2 scale functions
#' @param grid_pos numeric grid position used when emphasising topics
#' @param scale_size logical, scale cloud size by topic prevalence?
#' @param plot_topics_idx vector of topic indices to plot (NULL => all)
#' @param p_alpha max adjusted p‑value for a topic/term to be plotted
#' @param highlight_topic_words character vector of words to emphasise
#' @param save_dir directory where plots are saved (mandatory)
#' @param figure_format image file extension (svg | png | …)
#' @param width,height,max_size basic ggsave options
#' @param seed integer used in the file‑path so different runs do not collide
#' @importFrom ggwordcloud geom_text_wordcloud
#' @importFrom ggplot2 ggsave labs scale_size_area theme_minimal ggplot aes
#' @importFrom ggplot2 scale_colour_identity scale_color_gradient
#' @importFrom dplyr rename left_join filter bind_rows mutate
#' @noRd
create_plots <- function(
    df_list = NULL,
    summary = NULL,
    ngrams = NULL,
    test = NULL,
    test_type = NULL,
    cor_var = NULL,
    popout = NULL,
    color_negative_cor = NULL,
    color_positive_cor = NULL,
    grid_pos = "",
    scale_size = FALSE,
    plot_topics_idx = NULL,
    p_alpha = NULL,
    highlight_topic_words = c("not", "never"),
    save_dir,
    figure_format = "svg",
    width = 10,
    height = 8,
    max_size = 10,
    seed = 42){
  
  # Dependencies
  for(pkg in c("ggplot2", "ggwordcloud", "dplyr", "tibble", "rlang")){
    if(!requireNamespace(pkg, quietly = TRUE))
      stop(sprintf("Package '%s' is required but not installed.", pkg), call. = FALSE)
  }
  
  # Pre‑processing
  if(is.null(plot_topics_idx)){
    plot_topics_idx <- seq(1, length(df_list))
    #plot_topics_idx <- test$topic# seq_along(test)
  }else{plot_topics_idx <- plot_topics_idx}
  
  plots <- list()
  
  # CASE 1: Topic models + test
  if(!is.null(df_list) && !is.null(summary) && !is.null(test)){
    # Column extractors for test‑results 
    split_vars <- strsplit(cor_var, "__", fixed = TRUE)[[1]]
    get_stats <- function(topic_id){
      if(is.null(test)) return(list(est = NA, p = NA))
      row <- dplyr::filter(tibble::as_tibble(test, .name_repair = "minimal"), 
                           topic == paste0("t_", topic_id))
      if(length(split_vars) == 1){
        list(
          est = row[[grep(paste0(cor_var, ".estimate"),names(row))]],
          p   = row[[grep(paste0(cor_var, ".p_adjusted"),names(row))]]
        )
      } else {
        est_x <- row[[grep(paste0(split_vars[1],".estimate"),names(row))]]
        est_y <- row[[grep(paste0(split_vars[2],".estimate"),names(row))]]
        p_x   <- row[[grep(paste0(split_vars[1],".p_adjusted"),names(row))]]
        p_y   <- row[[grep(paste0(split_vars[2],".p_adjusted"),names(row))]]
        list(est = c(x = est_x, y = est_y), p = c(x = p_x, y = p_y), na.rm = TRUE)
      }
    }
    
    for(topic in plot_topics_idx){
      stats <- get_stats(topic)
      est   <- stats$est
      p_vec <- stats$p
      p_val_min <- min_p(p_vec)
      cur_alpha <- if(is.null(p_alpha)) p_val_min + 1 else p_alpha
      is_pop <- !is.null(popout) && paste0("t_",topic) %in% popout$topic
      if(!is.nan(p_val_min) && (p_val_min < cur_alpha || is_pop)){
        idx   <- as.numeric(sub(".*_", "", topic))
        data  <- df_list[[idx]]
        if(!is.null(highlight_topic_words))
          data <- highlight_neg(data)
        # size scaling
        max_size_topic <- if(scale_size){
          prev <- summary[topic, "prevalence"][[1]]
          max_size * log(prev)
        } else max_size
        #scheme <- choose_scheme(est)
        scheme <- if (est[1] < 0) color_negative_cor else color_positive_cor
        #choose_scheme <- function(est){ if(est[1] < 0) color_negative_cor else color_positive_cor }
        plot   <- build_cloud(data, scheme, max_size_topic)
        # axis annotations
        x_lab  <- if(length(est) == 1){
          sprintf("r = %.4f", est)
        } else {
          sprintf("r_x = %.4f\nr_y = %.4f", est["x"], est["y"])
        }
        plot   <- plot + ggplot2::labs(x = x_lab)
        # file name stub
        stub <- if(length(est) == 1){
          sprintf("%scorvar_%s_%s_r_%.4f_p_%s",
                  if(grid_pos != "") sprintf("grid_pos_%s_", grid_pos) else "",
                  cor_var, topic, est, format_p(p_vec))
        } else {
          sprintf("%scorvar_%s_%s_rx_%.4f_ry_%.4f_px_%s_py_%s",
                  if(grid_pos != "") sprintf("grid_pos_%s_", grid_pos) else "",
                  cor_var, topic, est["x"], est["y"],
                  format(p_vec["x"], scientific = TRUE, digits = 2),
                  format(p_vec["y"], scientific = TRUE, digits = 2))
        }
        save_plot(plot, stub, 't_', save_dir, figure_format, width, height, seed)
        plots[[paste0("t_",topic)]] <- plot
      }
    }
  }
  
  # CASE 2: Topic models alone
  if(!is.null(df_list) && !is.null(summary) && is.null(test)){
    for(topic in plot_topics_idx){
      idx      <- as.numeric(sub(".*_", "", topic))
      data     <- df_list[[idx]]
      max_size_topic <- if(scale_size){
        prev <- summary[topic, "prevalence"][[1]]
        max_size * log(prev)
      } else max_size
      plot <- build_cloud(data, color_negative_cor, max_size_topic)
      save_plot(plot, topic, 't_', save_dir, figure_format, width, height, seed)
      plots[[paste0('t_',as.character(topic))]] <- plot
    }
  }
  
  # CASE 3: N‑grams (no topic model) 
  if(is.null(df_list) && !is.null(ngrams)){
    if(is.null(test)){
      plot <- ggplot2::ggplot(ngrams, ggplot2::aes(label = ngrams, size = prevalence, color = prevalence)) +
        ggwordcloud::geom_text_wordcloud(show.legend = TRUE) +
        ggplot2::theme_minimal() +
        color_positive_cor
      save_plot(plot, "ngrams", '', save_dir, figure_format, width, height, seed)
      plots <- plot
    } else {
      ng  <- ngrams %>% dplyr::rename(top_terms = ngrams)
      tst <- test   %>% dplyr::rename(estimate   = dplyr::contains("estimate"),
                                      p_adjusted = dplyr::contains("p_adjusted")) %>%
        dplyr::left_join(ng, by = "top_terms")
      if(!is.null(p_alpha))
        tst <- dplyr::filter(tst, p_adjusted < p_alpha)
      if(nrow(tst) == 0){
        message("No significant terms. No wordclouds generated.")
      } else {
        make_ngram_plot <- function(df, scheme){
          # Identify the first column name that starts with "prevalence"
          prevalence_col <- grep("^prevalence", names(df), value = TRUE)[1]
          
          # Custom legend titles
          size_legend_title <- "prevalence"
          color_legend_title <- "estimate"
          
          ggplot2::ggplot(df, 
                          ggplot2::aes(label = top_terms, 
                                       size = .data[[prevalence_col]], 
                                       color = abs(estimate))) +
            ggwordcloud::geom_text_wordcloud(show.legend = TRUE) +
            ggplot2::scale_size_area(max_size = max_size, name = size_legend_title)+
            ggplot2::labs(color = color_legend_title) +
            ggplot2::theme_minimal() +
            scheme
        }
        pos  <- make_ngram_plot(df = dplyr::filter(tst, estimate > 0), scheme = color_positive_cor)
        neg  <- make_ngram_plot(dplyr::filter(tst, estimate < 0), color_negative_cor)
        save_plot(pos, "ngrams_positive", '', save_dir, figure_format, width, height, seed)
        save_plot(neg, "ngrams_negative", '', save_dir, figure_format, width, height, seed)
        plots <- list(positive = pos, negative = neg)
      }
    }
  }
  return(plots)
}



#' This is a private function
#' @param df_list (list) list of data.frames with topics most frequent words and assigned topic term scores
#' @param test (data.frame) the test returned from textTopicTest()
#' @param test_type (string) "linear_regression", or "binary_regression"
#' @param cor_var (string) Variable for t-test, linear, or binary/logistic regression
#' @param popout (tibble) The tibble containing topic idx to popout
#' @param color_negative_cor (function) color of topic cloud with negative correlation
#' @param color_positive_cor (function) color of topic cloud with positive correlation
#' @param grid_pos (numeric) position of grid plots
#' @param scale_size (bool) if True, then the size of the topic cloud is scaled by the prevalence of the topic
#' @param plot_topics_idx (list) if specified, then only the specified topics are plotted
#' @param p_alpha (float) set threshold which determines which topics are plotted
#' @param highlight_topic_words (str vector) The dictionary to popout negative words to an individual plot for easier reading. 
#'  Default words are "not", "never". 
#'  The values of the vector determine the color code to popout. The color values can be different for different words.
#' @param save_dir (string) save plots in specified directory, if left blank, plots is not saved,
#' thus save_dir is necessary.
#' @param figure_format (string) Set the figure format, e.g., .svg, or .png.
#' @param seed (int) seed is needed for saving the plots in the correct directory
#' @importFrom ggwordcloud geom_text_wordcloud
#' @importFrom ggplot2 ggsave labs scale_size_area theme_minimal ggplot aes scale_color_gradient scale_colour_identity
#' @importFrom dplyr rename
#' @noRd
create_plots_old <- function(
    df_list = NULL,
    summary = NULL,
    ngrams = NULL,
    test = NULL,
    test_type = NULL,
    cor_var = NULL,
    popout = NULL,
    color_negative_cor = NULL,
    color_positive_cor = NULL,
    grid_pos = "",
    scale_size = FALSE,
    plot_topics_idx = NULL,
    p_alpha = NULL,
    highlight_topic_words = c("not", "never"),
    save_dir,
    figure_format = "svg",
    width = 10,
    height = 8,
    max_size = 10,
    seed = 42){
  
  plot_list <- list()
  # this code plots the wordclouds in respect to the statistical test
  # For test, topics list and summary
  if(!is.null(test) & !is.null(df_list) & !is.null(summary)){
    
    if (is.null(plot_topics_idx)){
      grid1 <- ""
      plot_topics_idx <- seq(1, length(df_list))
    } else {
      grid1 <- paste0("grid_pos_", grid_pos, "_")
      pred_var_x <- strsplit(cor_var, "__")[[1]][1]
      if (length(strsplit(cor_var, "__")[[1]]) > 1){
        pred_var_y <- strsplit(cor_var, "__")[[1]][2]
      }
    }
    
    for (i in paste0('t_', plot_topics_idx)){ 
      #for (i in 1:length(df_list)){
      #view(df_list[[i]])
      if (test_type == "linear_regression" | test_type == "logistic_regression"){
        if (grid1 == ""){
          colNo.estimate <- grep(paste0(cor_var, '.estimate'),colnames(test))
          colNo.p_adjusted <- grep(paste0(cor_var, '.p_adjusted'),colnames(test))
          estimate_col <- colnames(test)[colNo.estimate] 
          p_adjusted_col <- colnames(test)[colNo.p_adjusted]
          #estimate_col <- paste0(cor_var,".estimate") # grep(partial_name, data_frame_names, value = TRUE)
          #p_adjusted_col <- paste0(cor_var,".p_adjusted")
        } else {
          colNo.estimate.x <- grep(paste0(pred_var_x, '.estimate'),colnames(test))
          colNo.p_adjusted.x <- grep(paste0(pred_var_x, '.p_adjusted'),colnames(test))
          estimate_col_x <- colnames(test)[colNo.estimate.x] 
          p_adjusted_col_x <- colnames(test)[colNo.p_adjusted.x]
          
          # Why is it doing this? Its not reliable to split it up based on "_"
          if (length(strsplit(cor_var, "__")[[1]]) > 1){
            colNo.estimate.y <- grep(paste0(pred_var_y, '.estimate'),colnames(test))
            colNo.p_adjusted.y <- grep(paste0(pred_var_y, '.p_adjusted'),colnames(test))
            estimate_col_y <- colnames(test)[colNo.estimate.y] 
            p_adjusted_col_y <- colnames(test)[colNo.p_adjusted.y]
          }}
      } else if (test_type == "t-test"){
        estimate_col <- "cohens d" # probably doesn't work yet
        
      } #else if (test_type == "logistic_regression"){
      #estimate_col <- "estimate"
      #p_adjusted_col <- "p_adjusted"
      #}
      if (grid1 == ""){
        estimate <- dplyr::filter(tibble::as_tibble(test,.name_repair='minimal'), topic==i)[[estimate_col]] # $PHQtot.estimate
        p_adjusted <- dplyr::filter(tibble::as_tibble(test,.name_repair='minimal'), topic==i)[[p_adjusted_col]] # $PHQtot.p_adjustedfdr
      }else{
        estimate_x <- dplyr::filter(tibble::as_tibble(test,.name_repair='minimal'), topic==i)[[estimate_col_x]]
        p_adjusted_x <- dplyr::filter(tibble::as_tibble(test,.name_repair='minimal'), topic==i)[[p_adjusted_col_x]]
        estimate <- estimate_x
        if (length(strsplit(cor_var, "__")[[1]]) > 1){
          estimate_y <- dplyr::filter(tibble::as_tibble(test,.name_repair='minimal'), topic==i)[[estimate_col_y]]
          p_adjusted_y <- dplyr::filter(tibble::as_tibble(test,.name_repair='minimal'), topic==i)[[p_adjusted_col_y]]
          p_adjusted <- min(p_adjusted_x, p_adjusted_y)
        }else{
          estimate <- estimate_x
          p_adjusted <- p_adjusted_x
        }
      }
      
      
      if (scale_size==TRUE){
        prevalence <- summary[paste0("t_",i),]$prevalence
      }
      
      
      # this will ensure that all topics are plotted
      if (is.null(p_alpha) ){
        if (grid1 == ""){
          p_alpha <- p_adjusted +1 
        }
      }
      
      
      if ((!is.nan(p_adjusted) & p_adjusted < p_alpha) || i %in% popout$topic){
        
        #estimate <- test[i,][[grep(estimate_col, colnames(test), value=TRUE)]]# $PHQtot.estimate
        #p_adjusted <- test[i,][[grep("p_adjusted", colnames(test), value=TRUE)]] # $PHQtot.p_adjustedfdr
        if (estimate < 0){
          color_scheme <- color_negative_cor # scale_color_gradient(low = "darkgreen", high = "green")
        } else {
          color_scheme <- color_positive_cor # scale_color_gradient(low = "darkred", high = "red")
        }
        if (scale_size == TRUE){
          max_size <- max_size*log(prevalence)
          y <- paste0("P = ", prevalence)
        } else {
          max_size <- max_size
          y <- ""
        }
        
        # For constant bold and itatlic format of negative words. 
        if (!is.null(highlight_topic_words) && is.vector(highlight_topic_words)){
          if(is.character(highlight_topic_words) && is.vector(highlight_topic_words)){
            # Assign color to neg_words in the dictionary object "highlight_topic_words"
            df_list_separated <- separate_neg_words(df_list[[as.numeric(sub(".*_", "", i))]], highlight_topic_words)
            df_list_separated[[2]] <- dplyr::mutate(df_list_separated[[2]], source = "negDic") 
            #df_list_separated[[1]]$color <- df_list_separated[[1]]$phi^3 / sum(df_list_separated[[1]]$phi^3) 
            #df_list_separated[[1]]$color <- color_scheme$palette(df_list_separated[[1]]$color)
            colnames(df_list_separated[[2]])[3] <- c('color') 
            df_list_separated[[1]] <- dplyr::mutate(df_list_separated[[1]], source = "notNegDic")
            target_topic <- dplyr::bind_rows(df_list_separated[[1]],df_list_separated[[2]]) 
            target_topic <- target_topic %>%
              dplyr::mutate(label_content = if_else(source == "negDic",
                                                    sprintf("<b><i>%s</i></b>", Word),
                                                    Word))
            target_topic$phi_ori <- target_topic$phi
            target_topic$phi <- target_topic$phi^1.1 / sum(target_topic$phi^1.1) # Make the differences of size and color more obvious.
          }else{stop('Invalid settings for the parameter "highlight_topic_words".\nConsider use the default option!\n')}
        }else{target_topic <- df_list[[as.numeric(sub(".*_", "", i))]]}
        
        if (grid1 == ""){ .
          plot <- ggplot2::ggplot(target_topic, 
                                  ggplot2::aes(label = Word, 
                                               size = phi, 
                                               color = phi)) + #,x=estimate)) +
            ggwordcloud::geom_text_wordcloud() +
            ggplot2::scale_size_area(max_size = max_size) +
            ggplot2::theme_minimal() +
            #theme(plot.margin = margin(0,0,0,0, "cm")) +
            color_scheme + 
            ggplot2::labs(x = paste0("r = ", estimate),
                          y= y)
        } else {
          
          if (length(strsplit(cor_var, "__")[[1]]) > 1){
            x_message = paste0("r_x = ", round(estimate_x,4), "\n",
                               "r_y = ", round(estimate_y,4))
          } else {
            
            x_message = paste0("r_x = ", round(estimate_x,4))                    
          }
          if (!is.null(highlight_topic_words) && is.vector(highlight_topic_words)){
            plot <- ggplot2::ggplot(target_topic, 
                                    ggplot2::aes(label = Word, 
                                                 size = phi, 
                                                 color = color_scheme$palette(phi),
                                                 label_content = label_content
                                    ))  # +,x=estimate)) +
          }else{
            plot <- ggplot2::ggplot(target_topic, 
                                    ggplot2::aes(label = Word, 
                                                 size = phi, 
                                                 color = phi))
          }
          
          plot <- plot + ggwordcloud::geom_text_wordcloud() + 
            ggplot2::scale_size_area(max_size = max_size)
          
          if (!is.null(highlight_topic_words) && is.vector(highlight_topic_words)){
            plot <- plot + ggplot2::scale_colour_identity() + 
              ggplot2::theme_minimal() 
          }else{
            plot <- plot + ggplot2::theme_minimal() +
              #theme(plot.margin = margin(0,0,0,0, "cm")) +
              color_scheme
          }
          plot <- plot + ggplot2::labs(x = x_message, y = y)
        }
        
        if (!is.null(save_dir)){
          if (!dir.exists(save_dir)) {
            
            # Create the directory
            dir.create(save_dir)
            msg <- "Directory created successfully.\n"
            message(colourise(msg, "green"))
          } 
          if(!dir.exists(paste0(save_dir, "/seed_", seed, "/wordclouds"))){
            dir.create(paste0(save_dir, "/seed_", seed, "/wordclouds"))
          }
          
          p_adjusted <- sprintf("%.2e", p_adjusted)
          
          if (grid1 == ""){
            ggplot2::ggsave(paste0(save_dir,"/seed_", seed, 
                                   "/wordclouds/",
                                   grid1,
                                   "corvar_", cor_var,"_",
                                   i, "_r_", 
                                   estimate, "_p_", 
                                   p_adjusted,
                                   ".",
                                   figure_format),
                            plot = plot, 
                            width = width, 
                            height = height, 
                            units = "in", 
                            create.dir = TRUE)
          } else {
            
            if (length(strsplit(cor_var, "__")[[1]]) > 1){
              p_adjusted_x <- sprintf("%.2e", p_adjusted_x)
              p_adjusted_y <- sprintf("%.2e", p_adjusted_y)
              fileMsg <- paste0(
                "_rx_", estimate_x,
                "_ry_", estimate_y, 
                "_px_", p_adjusted_x,
                "_py_", p_adjusted_y, ".",
                figure_format
              )
            } else {
              
              p_adjusted_x <- sprintf("%.2e", p_adjusted_x)
              fileMsg <- paste0(
                "_rx_", estimate_x, 
                "_px_", p_adjusted_x, ".",
                figure_format
              )
            }
            
            if (is.null(popout)){fileHead <- ''} else {
              if (i %in% popout$topic){fileHead <- '0_scatter_emphasised_'}else{fileHead <- ''}
            }
            
            ggplot2::ggsave(paste0(save_dir,"/seed_", seed, 
                                   "/wordclouds/", 
                                   fileHead,
                                   grid1,
                                   "corvar_", cor_var,"_",
                                   i, fileMsg),
                            plot = plot, 
                            width = width, 
                            height = height, 
                            units = "in", 
                            create.dir = TRUE)
          }
        }
      }
      plot_list[[i]] <- plot
    }
  }
  
  
  # For topic df_list, summary but no test
  if (!is.null(df_list) & !is.null(summary) & is.null(test)){
    if (is.null(plot_topics_idx)){
      plot_topics_idx <- seq(1, length(df_list))
    }
    if (scale_size==TRUE){
      prevalence <- summary[paste0("t_",i),]$prevalence
      max_size <- max_size*log(prevalence)
    } else {
      max_size <- max_size
    }
    
    
    if (!is.null(save_dir)){
      if (!dir.exists(save_dir)) {
        # Create the directory
        dir.create(save_dir)
        msg <- "Directory created successfully.\n"
        message(colourise(msg, "green"))
      } 
      if(!dir.exists(paste0(save_dir, "/seed_", seed, "/wordclouds"))){
        dir.create(paste0(save_dir, "/seed_", seed, "/wordclouds"))
      }
    }
    
    for (i in paste0('t_', plot_topics_idx)){
      
      plot <- ggplot2::ggplot(
        df_list[[as.numeric(sub(".*_", "", i))]],
        ggplot2::aes(label = Word,
                     size = phi,
                     #label_content = label_content,
                     color = phi)) +
        ggwordcloud::geom_text_wordcloud() +
        ggplot2::scale_size_area(max_size = max_size) +
        ggplot2::theme_minimal() +
        color_negative_cor
      
      if (!is.null(save_dir)){
        ggplot2::ggsave(paste0(save_dir,"/seed_", seed, 
                               "/wordclouds/",
                               i, 
                               ".",
                               figure_format),
                        plot = plot, 
                        width = width, 
                        height = height, 
                        units = "in", 
                        create.dir = TRUE)
      }
      
      plot_list[[i]] <- plot
    }
  }
  
  # For N-grams with NO test
  if (is.null(df_list) & is.null(test) & !is.null(ngrams)){
    
    if (!is.null(save_dir)){
      if (!dir.exists(save_dir)) {
        # Create the directory
        dir.create(save_dir)
        msg <- "Directory created successfully.\n"
        message(colourise(msg, "green"))
      } 
      if(!dir.exists(paste0(save_dir, "/seed_", seed, "/wordclouds"))){
        dir.create(paste0(save_dir, "/seed_", seed, "/wordclouds"))
      }
    }
    
    plot <- ggplot2::ggplot(ngrams, 
                            ggplot2::aes(label = ngrams, 
                                         size = prevalence, 
                                         color = prevalence)) +
      ggwordcloud::geom_text_wordcloud(show.legend = TRUE) +
      ggplot2::theme_minimal() +
      color_positive_cor # ggplot2::scale_color_gradient(low = "yellow", high = "red")
    
    if (!is.null(save_dir)){
      ggplot2::ggsave(paste0(save_dir,"/seed_", seed, 
                             "/wordclouds/ngrams", 
                             ".",
                             figure_format),
                      plot = plot, 
                      width = width, 
                      height = height, 
                      units = "in", 
                      create.dir = TRUE)
    }
    
    plot_list <- plot
  } 
  
  # For N-gram with test
  if (is.null(df_list) & !is.null(test) & !is.null(ngrams)){
    
    ngrams <- ngrams %>% dplyr::rename(top_terms = ngrams)
    test <- test %>% dplyr::rename(estimate = contains("estimate"))
    test <- test %>% dplyr::rename(p_adjusted = contains("p_adjusted"))
    test <- test %>% dplyr::left_join(ngrams, by = "top_terms")
    if (!is.null(p_alpha)){
      test <- test %>% dplyr::filter(p_adjusted < p_alpha)
    }
    # test for the fact that all words could be insignificant
    if (nrow(test) == 0){
      msg <- "No significant terms.\n No wordclouds generated."
      message(colourise(msg, "blue"))
    } else {
      test_positive <- test%>% dplyr::filter(estimate > 0)
      test_negative <- test%>% dplyr::filter(estimate < 0)
      
      if (!is.null(save_dir)){
        if (!dir.exists(save_dir)) {
          # Create the directory
          dir.create(save_dir)
          msg <- "Directory created successfully.\n"
          message(colourise(msg, "green"))
        } 
        if(!dir.exists(paste0(save_dir, "/seed_", seed, "/wordclouds"))){
          dir.create(paste0(save_dir, "/seed_", seed, "/wordclouds"))
        }
      }
      
      # Word cloud with correlation strength mapped to color gradient
      plot1 <- ggplot2::ggplot(test_positive, 
                               ggplot2::aes(label = top_terms,
                                            size = prevalence,
                                            color = estimate)) +
        ggwordcloud::geom_text_wordcloud(show.legend = TRUE) +
        ggplot2::scale_size_area(max_size = max_size) +  # Adjust max size
        #scale_color_gradient(low = "grey", high = "red") +  # Blue for low, red for high correlation strength
        ggplot2::theme_minimal() +
        color_positive_cor
      
      if (!is.null(save_dir)){
        ggplot2::ggsave(paste0(save_dir,"/seed_", seed, 
                               "/wordclouds/ngrams_positive", 
                               ".",
                               figure_format),
                        plot = plot1, 
                        width = width, 
                        height = height, 
                        units = "in", 
                        create.dir = TRUE) 
      }
      
      # Word cloud with correlation strength mapped to color gradient
      plot2 <- ggplot2::ggplot(test_negative, 
                               ggplot2::aes(label = top_terms,
                                            size = prevalence,
                                            color = abs(estimate))) +
        ggwordcloud::geom_text_wordcloud(show.legend = TRUE) +
        ggplot2::scale_size_area(max_size = max_size) +  # Adjust max size
        #scale_color_gradient(low = "grey", high = "red") +  # Blue for low, red for high correlation strength
        ggplot2::theme_minimal() +
        color_negative_cor
      
      if (!is.null(save_dir)){
        ggplot2::ggsave(paste0(save_dir,"/seed_", seed, 
                               "/wordclouds/ngrams_negative", 
                               ".",
                               figure_format),
                        plot = plot2, 
                        width = width, 
                        height = height, 
                        units = "in", 
                        create.dir = TRUE)
      }
      
      plot_list[[1]] <- plot1
      plot_list[[2]] <- plot2
      names(plot_list) <- c("positive", "negative")
    }
  }
  return(plot_list)
  
}
Any scripts or data that you put into this service are public.
topics documentation built on Aug. 29, 2025, 5:15 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
topics
Creating and Significance Testing Language Features for Visualisation

R/4_2_wordclouds.R
In topics: Creating and Significance Testing Language Features for Visualisation

Defines functions create_plots_old create_plots format_p min_p build_cloud highlight_neg save_plot ensure_dir separate_neg_words all_hex topicsNumAssign_dim2 select_non_overlapping_texts create_topic_words_dfs create_df_list_bert_topics assign_phi_to_words name_cols_with_vocab

Try the topics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

topics Creating and Significance Testing Language Features for Visualisation

R/4_2_wordclouds.R In topics: Creating and Significance Testing Language Features for Visualisation

Defines functions create_plots_old create_plots format_p min_p build_cloud highlight_neg save_plot ensure_dir separate_neg_words all_hex topicsNumAssign_dim2 select_non_overlapping_texts create_topic_words_dfs create_df_list_bert_topics assign_phi_to_words name_cols_with_vocab

Try the topics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

topics
Creating and Significance Testing Language Features for Visualisation

R/4_2_wordclouds.R
In topics: Creating and Significance Testing Language Features for Visualisation