# set up custom stop words
custom_stop_words <- dplyr::bind_rows(tidytext::stop_words,
dplyr::tibble(word = c(1:10, "it's", "it’s", "http", "https", "i’m",
"à", "c'est", "une", "gt", "de", "pas", "du",
"le", "une", "dans", "en", "les", "la", "des")))
#' Heatmaps over time
#'
#' @description A fct function
#'
#' @return The return value, if any, from executing the function.
#'
#' @noRd
plot_reddit_heatmap <- function (input_data){
created_datetime <- plot_hour <- n <- NULL
num_comments <- nrow(input_data)
if (num_comments > 0) {
input_data %>%
dplyr::select(created_datetime) %>%
dplyr::mutate(plot_hour = lubridate::hour(created_datetime)) %>%
dplyr::group_by(plot_hour) %>%
dplyr::count() %>%
ggplot2::ggplot(ggplot2::aes(x=plot_hour, y= " ", fill = n)) +
ggplot2::geom_tile() +
ggplot2::theme_minimal() +
ggplot2::scale_fill_viridis_c(option = "C") +
ggplot2::scale_x_continuous(limits = c(-1, 24),
expand = c(0,0),
breaks = c(0,4,8,12,16,20)) +
ggplot2::scale_y_discrete(expand = c(0,0)) +
ggplot2::labs(title = "Heat map of search term usage by time (EST)",
subtitle = sprintf("White zones indicate no data found. (n=%s)",num_comments),
x = "Hour",
y = NULL,
fill = "Posts"
)
} else {NULL}
}
#' Barchart of posts over time
#'
#' @description A fct function
#'
#' @return The return value, if any, from executing the function.
#'
#' @noRd
plot_reddit_posts_over_time <- function(input_data) {
created_datetime <- n <- plot_date <- NULL
if (nrow(input_data) > 0) {
timespan <- difftime(max(input_data$created_datetime),
min(input_data$created_datetime),
units = "hours")
plot_timespan_unit <- dplyr::case_when(
timespan < 6 ~ "minutes",
timespan < 24 * 3 ~ "hours",
timespan < 24 * 14 ~ "days",
timespan < 24 * 7 * 8 ~ "weeks",
timespan < 24 * 7 * 4 * 24 ~ "months",
TRUE ~ "years"
)
#message(sprintf("timespan: %s", timespan))
#message(sprintf("plot_timespan_unit: %s", plot_timespan_unit))
# for the subtitle: avg. posts/day
timespan_days <- difftime(max(input_data$created_datetime),
min(input_data$created_datetime),
units = "days") %>%
as.numeric()
posts_day = nrow(input_data)/timespan_days
start_day <- sprintf("%s %s, %s",
lubridate::month(label = TRUE, min(input_data$created_datetime)),
lubridate::day(min(input_data$created_datetime)),
lubridate::year(min(input_data$created_datetime)))
end_day <- sprintf("%s %s, %s",
lubridate::month(label = TRUE, max(input_data$created_datetime)),
lubridate::day(max(input_data$created_datetime)),
lubridate::year(max(input_data$created_datetime)))
if (start_day != end_day){
subtitle_timerange <- sprintf("based on data from %s to %s.", start_day, end_day)
} else { subtitle_timerange <- sprintf("based on data from %s.", start_day)}
input_data %>%
dplyr::mutate(plot_date = lubridate::floor_date(created_datetime, unit = plot_timespan_unit)) %>%
dplyr::group_by(plot_date) %>%
dplyr::count() %>%
ggplot2::ggplot(ggplot2::aes(x=plot_date, y = n)) +
ggplot2::geom_col(fill = "lightblue") +
# geom_smooth(method = "loess", formula = "y ~ x" )+
ggplot2::theme_minimal() +
ggplot2::labs(title = "Search term results over time",
subtitle = sprintf("Average %.1f posts/day, %s",
posts_day,
subtitle_timerange),
x = "Time")
} else {NULL}
}
make_subreddit_table <- function(input_data){
subreddit <- n <- NULL
if (nrow(input_data) > 0) {
input_data %>%
dplyr::group_by(subreddit) %>%
dplyr::count(sort = TRUE) %>%
dplyr::ungroup() %>%
dplyr::mutate(percent = sprintf("%.1f%%", (n / sum(n))*100)) %>%
dplyr::slice_head(n=20) %>%
dplyr::left_join(socialastronomy.pbs4dash::subreddits, by = c("subreddit" = "display_name")) %>%
dplyr::mutate(title_description = stringr::str_trunc(paste0(title, ": ", public_description), width = 100 )) %>%
dplyr::select(subreddit, n, percent, title_description)
} else {dplyr::tibble()}
}
make_subreddit_plot <- function(input_data) {
n <- pct <- plot_adjust <- plot_colour <- subreddit <- NULL
if (nrow(input_data) > 0) {
input_data %>%
dplyr::group_by(subreddit) %>%
dplyr::count(sort = TRUE) %>%
dplyr::ungroup() %>%
dplyr::mutate(pct = sprintf("%.1f%%", 100 * n/sum(n))) %>%
#mutate(plot_adjust = if_else(n > .5 * max(n), n - .1 * (max(n) - min(n)), n + .1 * (max(n) - min(n)))) %>%
dplyr::mutate(plot_adjust = n + .1 * (max(n) - min(n))) %>%
# mutate(plot_colour = if_else(n > .5 * max(n), "white", "black")) %>%
dplyr::slice_head(n=10) %>%
ggplot2::ggplot() +
ggplot2::geom_col(ggplot2::aes(x=stats::reorder(subreddit,n), y = n), fill = "lightblue") +
ggplot2::geom_text(ggplot2::aes(x=stats::reorder(subreddit,n), y = plot_adjust, label = pct, fontface = "bold", colour = plot_colour), colour = "black") +
ggplot2::coord_flip() +
ggplot2::theme_minimal() +
ggplot2::labs(y="Count",
x = "Subreddit",
title = "Top 10 Subreddits by Search Term Prevalence"
#,subtitle = paste0("Search terms: q=",q,"; subreddit=",subreddit,"; author=",author)
) +
ggplot2::scale_y_continuous(breaks = function(x) unique(floor(pretty(seq(0, (max(x) + 1) * 1.1)))))
} else {NULL}
}
get_top_words <- function(input_data){
id <- word <- NULL
num_inputs <- nrow(input_data)
if (num_inputs > 0) {
words <- input_data %>%
dplyr::select(id, body) %>%
tidytext::unnest_tokens(word, body) %>%
dplyr::anti_join(custom_stop_words, by = "word")
word_counts <- words %>%
dplyr::group_by(word) %>%
dplyr::count(name = "num_uses", sort = TRUE) %>%
dplyr::ungroup() %>%
dplyr::slice_head(n=50)
usage_counts <- words %>%
dplyr::filter(word %in% word_counts$word) %>%
tidyr::nest(data = id) %>%
dplyr::mutate(num_comments = purrr::map_dbl(data, function(x) unlist(x) %>% unique() %>% length()),
pct_comments = round(num_comments / num_inputs, digits = 3)) %>%
dplyr::select(-data)
result <- dplyr::left_join(word_counts, usage_counts, by = "word")
result
} else {dplyr::tibble()}
}
get_top_2grams <- function(input_data){
data <- id <- ngram <- num_comments <- percent <- public_description <- title <- title_description <- word1 <- word2 <- NULL
if (nrow(input_data) > 0) {
input_data %>%
dplyr::select(id, body) %>%
tidytext::unnest_tokens(ngram, body, token = "ngrams", n = 2) %>%
tidyr::separate(ngram, into = c("word1", "word2"), sep = " ", remove = FALSE) %>%
dplyr::filter(!word1 %in% custom_stop_words$word,
!word2 %in% custom_stop_words$word) %>%
dplyr::select(-word1, -word2) %>%
dplyr::group_by(ngram) %>%
dplyr::count(sort = TRUE) %>%
dplyr::ungroup() %>%
dplyr::slice_head(n = 50)
} else {dplyr::tibble()}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.