R/fct_reddit.R

Defines functions get_top_2grams get_top_words make_subreddit_plot make_subreddit_table plot_reddit_posts_over_time

# set up custom stop words
custom_stop_words <- dplyr::bind_rows(tidytext::stop_words,
                                      dplyr::tibble(word = c(1:10, "it's", "it’s", "http", "https", "i’m",
                                                             "à", "c'est", "une", "gt", "de", "pas", "du",
                                                             "le", "une", "dans", "en", "les", "la", "des")))


#' Heatmaps over time 
#'
#' @description A fct function
#'
#' @return The return value, if any, from executing the function.
#'
#' @noRd
plot_reddit_heatmap <- function (input_data){
  created_datetime <- plot_hour <- n <- NULL
  
  num_comments <- nrow(input_data)
  if (num_comments > 0) {
    
    input_data %>%
      dplyr::select(created_datetime) %>%
      dplyr::mutate(plot_hour = lubridate::hour(created_datetime)) %>%
      dplyr::group_by(plot_hour) %>%
      dplyr::count() %>%
      ggplot2::ggplot(ggplot2::aes(x=plot_hour, y= " ", fill = n)) +
      ggplot2::geom_tile() +
      ggplot2::theme_minimal() +
      ggplot2::scale_fill_viridis_c(option = "C") +
      ggplot2::scale_x_continuous(limits = c(-1, 24), 
                                  expand = c(0,0),
                                  breaks = c(0,4,8,12,16,20)) +
      ggplot2::scale_y_discrete(expand = c(0,0)) +
      ggplot2::labs(title = "Heat map of search term usage by time (EST)",
                    subtitle = sprintf("White zones indicate no data found. (n=%s)",num_comments),
                    x = "Hour",
                    y = NULL,
                    fill = "Posts"
      ) 
  } else {NULL}
}


#' Barchart of posts over time
#'
#' @description A fct function
#'
#' @return The return value, if any, from executing the function.
#'
#' @noRd
plot_reddit_posts_over_time <- function(input_data) {
  created_datetime <- n <- plot_date <- NULL
  
  if (nrow(input_data) > 0) {
    
    timespan <- difftime(max(input_data$created_datetime),
                         min(input_data$created_datetime),
                         units = "hours")
    plot_timespan_unit <- dplyr::case_when(
      timespan < 6 ~ "minutes",
      timespan < 24 * 3 ~ "hours",
      timespan < 24 * 14 ~ "days",
      timespan < 24 * 7 * 8 ~ "weeks",
      timespan < 24 * 7 * 4 * 24 ~ "months",
      TRUE ~ "years"
    )
    
    #message(sprintf("timespan: %s", timespan))
    #message(sprintf("plot_timespan_unit: %s", plot_timespan_unit))
    
    # for the subtitle: avg. posts/day
    timespan_days <- difftime(max(input_data$created_datetime),
                              min(input_data$created_datetime),
                              units = "days") %>%
      as.numeric()
    
    posts_day = nrow(input_data)/timespan_days
    start_day <- sprintf("%s %s, %s", 
                         lubridate::month(label = TRUE, min(input_data$created_datetime)),
                         lubridate::day(min(input_data$created_datetime)),
                         lubridate::year(min(input_data$created_datetime)))
    
    end_day <- sprintf("%s %s, %s", 
                       lubridate::month(label = TRUE, max(input_data$created_datetime)),
                       lubridate::day(max(input_data$created_datetime)),
                       lubridate::year(max(input_data$created_datetime)))
    
    if (start_day != end_day){
      subtitle_timerange <- sprintf("based on data from %s to %s.", start_day, end_day)
    } else { subtitle_timerange <- sprintf("based on data from %s.", start_day)}
    
    
    input_data %>% 
      dplyr::mutate(plot_date = lubridate::floor_date(created_datetime, unit = plot_timespan_unit)) %>%
      dplyr::group_by(plot_date) %>%
      dplyr::count() %>%
      ggplot2::ggplot(ggplot2::aes(x=plot_date, y = n)) +
      ggplot2::geom_col(fill = "lightblue") +
      # geom_smooth(method = "loess", formula = "y ~ x" )+
      ggplot2::theme_minimal() +
      ggplot2::labs(title = "Search term results over time",
                    subtitle = sprintf("Average %.1f posts/day, %s", 
                                       posts_day, 
                                       subtitle_timerange),
                    x = "Time")
    
  } else {NULL}
}



make_subreddit_table <- function(input_data){
  subreddit <- n <- NULL
  
  if (nrow(input_data) > 0) {
    
    input_data %>%
      dplyr::group_by(subreddit) %>%
      dplyr::count(sort = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(percent = sprintf("%.1f%%", (n / sum(n))*100)) %>%
      dplyr::slice_head(n=20) %>%
      dplyr::left_join(socialastronomy.pbs4dash::subreddits, by = c("subreddit" = "display_name")) %>%
      dplyr::mutate(title_description = stringr::str_trunc(paste0(title, ": ", public_description), width = 100 )) %>%
      dplyr::select(subreddit, n, percent, title_description)
    
  } else {dplyr::tibble()}
  
}


make_subreddit_plot <- function(input_data) {
  n <- pct <- plot_adjust <- plot_colour <- subreddit <- NULL
  
  if (nrow(input_data) > 0) {
    
    input_data %>%
      dplyr::group_by(subreddit) %>%
      dplyr::count(sort = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(pct = sprintf("%.1f%%", 100 * n/sum(n))) %>%
      #mutate(plot_adjust = if_else(n > .5 * max(n), n - .1 * (max(n) - min(n)), n + .1 * (max(n) - min(n)))) %>%
      dplyr::mutate(plot_adjust =  n + .1 * (max(n) - min(n))) %>%
      # mutate(plot_colour = if_else(n > .5 * max(n), "white", "black")) %>%
      dplyr::slice_head(n=10) %>%
      ggplot2::ggplot() +
      ggplot2::geom_col(ggplot2::aes(x=stats::reorder(subreddit,n), y = n), fill = "lightblue") +
      ggplot2::geom_text(ggplot2::aes(x=stats::reorder(subreddit,n), y = plot_adjust, label = pct, fontface = "bold", colour = plot_colour), colour = "black") +
      ggplot2::coord_flip() +
      ggplot2::theme_minimal() +
      ggplot2::labs(y="Count",
                    x = "Subreddit",
                    title = "Top 10 Subreddits by Search Term Prevalence"
                    #,subtitle = paste0("Search terms: q=",q,"; subreddit=",subreddit,"; author=",author)
                                      ) +
      ggplot2::scale_y_continuous(breaks = function(x) unique(floor(pretty(seq(0, (max(x) + 1) * 1.1)))))
    
  } else {NULL}
  
}


get_top_words <- function(input_data){
  id <- word <- NULL
  
  num_inputs <- nrow(input_data)
  
  if (num_inputs > 0) {
    
    words <- input_data %>%
      dplyr::select(id, body) %>%
      tidytext::unnest_tokens(word, body) %>%
      dplyr::anti_join(custom_stop_words, by = "word") 
    
    word_counts <- words %>%
      dplyr::group_by(word) %>%
      dplyr::count(name = "num_uses", sort = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::slice_head(n=50)
  
    usage_counts <- words %>% 
       dplyr::filter(word %in% word_counts$word) %>%
      tidyr::nest(data = id) %>%
      dplyr::mutate(num_comments = purrr::map_dbl(data, function(x) unlist(x) %>% unique() %>% length()),
                    pct_comments = round(num_comments / num_inputs, digits = 3)) %>%
       dplyr::select(-data)
    
    result <- dplyr::left_join(word_counts, usage_counts, by = "word")

    result
    
  } else {dplyr::tibble()}
  
}



get_top_2grams <- function(input_data){
  
  data <- id <- ngram <- num_comments <- percent <- public_description <- title <- title_description <- word1 <- word2 <- NULL
  
  if (nrow(input_data) > 0) {
    
    input_data %>%
      dplyr::select(id, body) %>%
      tidytext::unnest_tokens(ngram, body, token = "ngrams", n = 2) %>%
      tidyr::separate(ngram, into = c("word1", "word2"), sep = " ", remove = FALSE) %>%
      dplyr::filter(!word1 %in% custom_stop_words$word,
                    !word2 %in% custom_stop_words$word) %>%
      dplyr::select(-word1, -word2) %>%
      dplyr::group_by(ngram) %>%
      dplyr::count(sort = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::slice_head(n = 50)
    
  } else {dplyr::tibble()}
  
}
BelangerAnalytics/socialastronomy.pbs4dash documentation built on Feb. 15, 2022, 8:06 a.m.