library(readr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(forcats)
library(tibble)
library(scales)
# for UI
library(shinyWidgets)
library(bsplus)
library(shinycssloaders)
# for table tab
library(DT)
# for counts and keyword pairs
library(ggiraph)
library(tidytext)
# for timeline tab
library(lubridate)
library(tsbox)
library(dygraphs)
library(gt)
# for map tab
library(leaflet)
library(leaflet.extras)
# read data sources -------------------------------------------------------
full_nested_df <- read_rds("full_nested_df.rds") # nested_sample.rds
full_unnested_df <- full_nested_df %>% unnest(keywords)
govt <- read_csv("govt.csv")
# static UI choices -------------------------------------------------------
app_title <- "India in The New York Times"
desks <- full_nested_df %>%
count(news_desk, sort = TRUE) %>%
pull(news_desk)
sections <- full_nested_df %>%
count(section, sort = TRUE) %>%
pull(section)
materials <- full_nested_df %>%
count(material, sort = TRUE) %>%
pull(material)
bylines <- full_nested_df %>%
count(byline, sort = TRUE) %>%
pull(byline)
values <- full_unnested_df %>%
count(value, sort = TRUE) %>%
pull(value)
governments <- govt %>%
pull(govt_name)
# need to break long options over multiple lines
# https://stackoverflow.com/questions/51355878/how-to-text-wrap-choices-from-a-pickerinput-if-the-length-of-the-choices-are-lo/51406191#51406191
bylines_broken <- bylines %>%
str_wrap(width = 40) %>%
str_replace_all("\\n", "<br>")
values_broken <- values %>%
str_wrap(width = 50) %>%
str_replace_all("\\n", "<br>")
# define input functions --------------------------------------------------
input_table <- tribble(
~id, ~label, ~widget, ~choices, ~broken_values, ~df_var, ~helper,
"single_date_range", "Single Continuous Date Range", "switch", NA_character_, NA_character_, NA_character_, NA_character_,
"dates", "Date Range (yyyy-mm-dd)", "dateRange", NA_character_, NA_character_, NA_character_, "Instead of a single date range, you can also select specific governments.",
"government", "Date Range by Governments", "picker", governments, NA_character_, NA_character_, "Government data pulled from <a href='https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India' target='_blank'>Wikipedia list</a> of Indian Prime Ministers.",
"desk", "News Desk", "picker", desks, NA_character_, "news_desk", "All 'News Desk' values before 1981 are categorized as 'None'",
"section_name", "Section", "picker", sections, NA_character_, "section", "All 'Section' values before 1981 are categorized as 'Archives'",
"material_type", "Material", "picker", materials, NA_character_, "material", "All 'Material' values before '1963-11-23' are categorized as 'Archives'",
"by_line", "Byline", "picker", bylines, bylines_broken, "byline", "In searching for a particular author, the search bar is helpful to find spelling variations and co-authored pieces.",
"filter_keyword", "Filter by Keywords", "switch", NA_character_, NA_character_, NA_character_, NA_character_,
"keyword_condition_and", "Logical Operator", "switch", NA_character_, NA_character_, NA_character_, NA_character_,
"keywords", "Keyword(s)", "picker", values, values_broken, "value", "The 'AND' condition requires all selected keywords to be included in every article returned.",
"ind_rnk", "India Keyword Rank", "slider", NA_character_, NA_character_, "india_rank", "Every article includes 'India' as a location keyword. Most articles have multiple keywords, and the ranking of those keywords in order of importance is also reported.",
"filter_text", "Filter by Specific Words", "switch", NA_character_, NA_character_, NA_character_, NA_character_,
"text", "Search in Headline/Abstract/Lead (Case Insensitive)", "text", NA_character_, NA_character_, "word", "The API does not return the full text of articles. For all time periods, it returns a headline; after 2004 it returns an abstract...",
"page1", "Include Page 1 Articles Only", "switch", NA_character_, NA_character_, "front_page", NA_character_, # default is FALSE...
"is_printed", "Exclude Non-Printed Results (e.g. blogs)", "switch", NA_character_, NA_character_, "is_printed", NA_character_
)
mySwitchInput <- function(id, label, ...) {
if (id == "keyword_condition_and") {
switchInput(id, label,
value = FALSE, # FALSE is OR (default)
labelWidth = "200px",
onLabel = "AND", offLabel = "OR",
onStatus = "warning", offStatus = "warning"
)
} else {
switchInput(id, label, ..., labelWidth = "300px")
}
}
myPickerInput <- function(id, label, ...) {
# not sure why label needed here again...
label <- input_table$label[input_table$id == id]
choices <- input_table$choices[input_table$id == id][[1]]
broken_values <- input_table$broken_values[input_table$id == id][[1]]
pickerInput(id, label,
choices = choices,
selected = choices,
multiple = TRUE,
options = list(
`actions-box` = TRUE,
`live-search` = TRUE,
`liveSearchNormalize` = TRUE,
`selected-text-format` = "count > 3"
),
choicesOpt = list(content = broken_values)
)
}
myInput <- function(id, ...) {
label <- input_table$label[input_table$id == id]
widget <- input_table$widget[input_table$id == id]
helper <- input_table$helper[input_table$id == id]
if (widget == "switch") {
the_input <- mySwitchInput(id, label, ...)
} else if (widget == "picker") {
the_input <- myPickerInput(id, label, ...)
} else if (widget == "dateRange") {
the_input <- dateRangeInput(id, label,
start = min(full_nested_df$pub_date),
end = max(full_nested_df$pub_date),
min = min(full_nested_df$pub_date),
max = max(full_nested_df$pub_date),
startview = "decade"
)
} else if (widget == "slider") {
the_input <- sliderInput(id, label,
min = 1, max = max(full_nested_df$india_rank),
value = c(1, max(full_nested_df$india_rank)),
step = 1
)
} else if (widget == "text") {
the_input <- textInput(id, label)
}
if (!is.na(helper)) {
the_input %>%
shinyInput_label_embed(
icon("info") %>%
bs_embed_tooltip(title = helper)
)
} else {
the_input
}
}
withMySpinner <- function(ui_element) {
ui_element %>%
withSpinner(color = "#337ab7")
}
# filtering functions -----------------------------------------------------
filter_unnested <- function(single_date_range, dates, government,
desk, section_name, material_type,
by_line, filter_keyword,
keyword_condition_and, keywords,
ind_rnk, filter_text, text, page1,
is_printed) {
# keyword filtering is complicated; need to get desired urls first
if (filter_keyword == TRUE) {
urls <- full_unnested_df %>%
filter(value %in% keywords) %>%
{
if (keyword_condition_and == TRUE) {
group_by(., url) %>%
mutate(n = n()) %>%
filter(n == length(keywords)) %>%
ungroup()
} else {
.
}
} %>%
distinct(url) %>%
pull(url)
}
# if non-inclusive date filtering, process is different
from <- as.Date(str_sub(government, 1, 10))
to <- as.Date(str_sub(government, 14, 23))
temp_tbl <- tibble()
if (single_date_range != TRUE) {
# prevent failure if 0 govts selected
if (is.null(government)) {
temp_tbl <- full_unnested_df[0, ]
} else {
for (i in seq_along(government)) {
temp_tbl <- rbind(temp_tbl, full_unnested_df %>%
filter(pub_date >= from[i] &
pub_date <= to[i]))
}
}
} else {
temp_tbl <- full_unnested_df %>%
filter(pub_date >= dates[1] & pub_date <= dates[2])
}
temp_tbl %>%
arrange(desc(pub_date)) %>%
{
if (filter_keyword == TRUE) {
filter(., url %in% urls)
} else {
.
}
} %>%
{
if (page1 == TRUE) {
filter(., front_page == TRUE)
} else {
.
}
} %>%
{
if (is_printed == TRUE) {
filter(., printed == TRUE)
} else {
.
}
} %>%
{
if (filter_text == TRUE) {
mutate(.,
text_cols = str_c(headline, " ", abstract, " ", lead_paragraph),
text_cols = str_replace_all(
str_to_lower(text_cols),
"[^[:alnum:] ]", ""
)
) %>%
filter(
.,
str_detect(text_cols, text)
)
} else {
.
}
} %>%
filter(
news_desk %in% desk,
section %in% section_name,
material %in% material_type,
byline %in% by_line,
india_rank >= ind_rnk[1] & india_rank <= ind_rnk[2]
)
}
nest_df <- function(unnested_df) {
# instead of actually nesting/unnesting,
# much easier to get distinct url and then join
unnested_df %>%
distinct(url) %>%
left_join(full_nested_df, by = "url")
}
count_filters_on <- function(dates, government, desk,
section_name, material_type,
by_line, filter_keyword,
keywords, ind_rnk, filter_text, text,
page1, is_printed) {
n_filters <- 0
if (dates[1] != min(full_nested_df$pub_date) ||
dates[2] != max(full_nested_df$pub_date)) {
n_filters <- n_filters + 1
} else if (length(government) != length(governments)) {
n_filters <- n_filters + 1
}
if (length(desk) != length(desks)) {
n_filters <- n_filters + 1
}
if (length(section_name) != length(sections)) {
n_filters <- n_filters + 1
}
if (length(material_type) != length(materials)) {
n_filters <- n_filters + 1
}
if (length(by_line) != length(bylines)) {
n_filters <- n_filters + 1
}
if (filter_keyword == TRUE && length(keywords) != length(values)) {
n_filters <- n_filters + 1
}
if (ind_rnk[1] != 1 || ind_rnk[2] != max(full_nested_df$india_rank)) {
n_filters <- n_filters + 1
}
if (filter_text == TRUE && text != "") {
n_filters <- n_filters + 1
}
if (page1 != FALSE) {
n_filters <- n_filters + 1
}
if (is_printed != FALSE) {
n_filters <- n_filters + 1
}
n_filters
}
# table tab ---------------------------------------------------------------
prep_dt <- function(df) {
# prepare the nested df for the DataTable
df %>%
mutate(
Headline = str_glue('<a href="{url}" target="_blank">{headline}</a>')
) %>%
select(
Date = pub_date,
Headline,
`News Desk` = news_desk,
Section = section,
Material = material,
Byline = byline,
`India Keyword` = in_of_n_kword,
Abstract = abstract,
Lead = lead_paragraph
)
}
draw_dt <- function(df) {
# once df is correct format, use DT package to format it
datatable(df,
caption = tags$caption(
em(h5('Click on a cell in the News Desk, Section, Material, or Byline columns to filter for that value; click the "India Keyword" cell for keyword table.'))
),
class = "cell-border stripe order-column compact",
selection = list(mode = "single", target = "cell"),
extensions = "Responsive",
options = list(
# Cursor icon changes to hand (pointer) on Hover for keyword col
rowCallback = JS(
"function(nRow, aData, iDisplayIndex, iDisplayIndexFull) {",
"$('td:eq(6)', nRow).css('cursor', 'pointer');",
"}"
),
columnDefs = list(
list(className = "dt-center", targets = 6), # center kw col
list(
# ellipsis for long text cols
targets = c(5, 7, 8),
render = JS(
"function(data, type, row, meta) {",
"return type === 'display' && data.length > 20 ?",
"'<span title=\"' + data + '\">' + data.substr(0, 20) + '...</span>' : data;",
"}"
)
)
)
),
rownames = FALSE,
filter = "top",
escape = FALSE
)
}
prep_keyword_df <- function(cell_clicked, nested_df) {
if (is.null(cell_clicked$col) || cell_clicked$col != 6) {
return()
} else { # if (cell_clicked()$col == 6)
nested_df[cell_clicked$row, ]$keywords[[1]] %>%
select(
Name = name,
Value = value,
Rank = rank
)
}
}
draw_keyword_dt <- function(keyword_df) {
datatable(
keyword_df,
options = list(dom = "tipr"),
rownames = FALSE,
selection = "multiple"
)
}
myDTModal <- function(df, cell) {
col_name <- names(df)[cell$col + 1]
modalDialog(
title = str_glue('Do you want to filter for only articles that include "{cell$value}" as the choice for {col_name}?'),
str_glue("Doing so will remove all other {col_name} filters."),
actionButton("myDTActionButton", "Yes"),
easyClose = TRUE
)
}
myDownloadHandler <- function(df, ext) {
if (ext == ".rds") {
filename <- function() {
paste0("nyt-india-nested-", Sys.Date(), ".rds")
}
content <- function(file) {
write_rds(df, file)
}
} else if (ext == ".csv") {
filename <- function() {
paste0("nyt-india-unnested-", Sys.Date(), ".csv")
}
content <- function(file) {
write_csv(df, file)
}
}
downloadHandler(filename, content)
}
# counts tab --------------------------------------------------------------
# manually defined relationship between var in df and name in dropdown
count_choices <- list(
Categorical = list(
"News Desk" = "news_desk",
"Section" = "section",
"Material" = "material",
"Byline" = "byline"
),
Keywords = list(
"All Keywords" = "all",
"Subject Keywords" = "subject",
"Location Keywords" = "glocations",
"Person Keywords" = "persons",
"Organization Keywords" = "organizations",
"Creative Work Keywords" = "creative_works"
),
Text = list(
"Headline/Abstract/Lead" = "word",
"+/- Sentiment Words" = "sentiment"
),
Miscellaneous = list(
"India Keyword Rank" = "india_rank",
"Keywords per Article" = "max_kword",
"Front Page" = "front_page"
)
)
count_choices_df <- tibble(count_choices) %>%
unnest_longer(count_choices,
values_to = "var_name",
indices_to = "plot_name"
)
# words like india should be considered stop words
custom_stop_words <- bind_rows(
tibble(
word = c("india", "indian", "india's", "indians", "india’s", "iindian", "indias"),
lexicon = c("custom")
),
stop_words
)
prep_count_data <- function(nested_df, unnested_df, var) {
if (var == "word" || var == "sentiment") {
# need to combine 3 text columns into one to calculate sum counts
df <- nested_df %>%
mutate(
text = str_c(headline, " ", abstract, " ", lead_paragraph),
text = str_replace_all(str_to_lower(text), "[^[:alnum:] ]", "")
) %>%
select(text) %>%
unnest_tokens(word, text) %>% # from {tidytext} [already does some normalizing]
anti_join(custom_stop_words, by = "word") %>%
{
if (var == "sentiment") {
inner_join(., get_sentiments("bing"), by = "word")
} else {
.
}
} %>%
{
if (var == "sentiment") {
count(., word, sentiment, sort = TRUE)
} else {
count(., word, sort = TRUE)
}
}
} else if (var %in% c("india_rank", "max_kword", "news_desk", "section", "material", "byline", "front_page")) {
# need a simple count for categorical vars
df <- nested_df %>%
count(!!rlang::sym(var), sort = TRUE)
} else {
# last case is keyword categories: find distinct urls and join back
df <- unnested_df %>%
distinct(url) %>%
left_join(unnested_df, by = "url") %>%
filter(value != "india") %>%
{
if (var != "all") {
filter(., name == var)
} else {
.
}
} %>%
count(value, name, sort = TRUE) %>%
rename(!!var := value)
}
}
trim_count_df <- function(count_df, n_obs) {
var <- names(count_df)[1]
if (var %in% c("india_rank", "max_kword", "front_page")) {
df <- count_df
} else if (var %in% c("news_desk", "section", "material", "byline")) {
# categorical vars get lumped into an Other category
top_cats <- count_df %>% pull(!!rlang::sym(var))
df <- count_df %>%
mutate(!!var := ifelse(!!rlang::sym(var) %in% top_cats[1:n_obs - 1],
!!rlang::sym(var), "Other"
)) %>%
count(!!rlang::sym(var), wt = n)
} else {
# words, sentiment and keywords just get cut off
df <- count_df %>%
slice_head(n = n_obs)
}
df %>%
mutate(tip = str_c(.[[1]], ": ",
number(n,
accuracy = 1,
big.mark = ",")))
}
get_subtitle_str <- function(count_df, n_obs) {
if (nrow(count_df) <= n_obs) {
NULL
} else {
var <- names(count_df)[1]
left_out <- format(nrow(count_df) - n_obs, big.mark = ",")
if (var %in% c("india_rank", "max_kword", "front_page")) {
NULL
} else if (var %in% c("news_desk", "section", "material", "byline")) {
if (left_out == 1) {
str_glue('{left_out} category in "Other"')
} else {
str_glue('{left_out} categories in "Other"')
}
} else {
if (left_out == 1) {
str_glue("{left_out} other not shown")
} else {
str_glue("{left_out} others not shown")
}
}
}
}
keyword_pal <- c( # RColorBrewer::brewer.pal(n = 8, name = "Dark2")
"subject" = "#1B9E77",
"glocations" = "#D95F02",
"persons" = "#7570B3",
"organizations" = "#E7298A",
"creative_works" = "#66A61E"
)
sentiment_pal <- c( # RColorBrewer::brewer.pal(n = 3, name = "RdBu")
"positive" = "#67A9CF",
"negative" = "#EF8A62"
)
build_gg <- function(trimmed_count_df, sub_title) {
var <- names(trimmed_count_df)[1]
# fix to account for sentiment
title_var <- count_choices_df %>%
{
if (names(trimmed_count_df)[2] == "sentiment") {
filter(., var_name == "sentiment")
} else {
filter(., var_name == var)
}
} %>%
pull(plot_name)
if (var %in% c("india_rank", "max_kword", "front_page")) {
# no re-ordering
gg <- trimmed_count_df %>%
ggplot(aes(
x = !!rlang::sym(var),
y = n
))
} else if ( names(trimmed_count_df)[2] == "sentiment") {
gg <- trimmed_count_df %>%
ggplot(aes(
x = fct_reorder(!!rlang::sym(var), n),
y = n,
fill = sentiment
))
} else {
gg <- trimmed_count_df %>%
ggplot(aes(
x = fct_reorder(!!rlang::sym(var), n),
y = n
))
}
gg +
guides(fill = FALSE) +
{
if (var %in% c("all", "subject", "glocations", "persons", "organizations", "creative_works")) {
list(
geom_col_interactive(aes(
tooltip = tip,
data_id = !!rlang::sym(var),
fill = name
)),
scale_fill_manual(
values = keyword_pal,
limits = names(keyword_pal)
)
)
} else {
list(
geom_col_interactive(aes(
tooltip = tip,
data_id = !!rlang::sym(var)
)),
scale_fill_manual(
values = sentiment_pal,
limits = names(sentiment_pal)
)
)
}
} +
{
if (var != "word") {
scale_y_continuous("Number of Articles",
labels = scales::comma
)
} else {
scale_y_continuous("Number of Appearances",
labels = scales::comma
)
}
} +
{
if (!var %in% c("india_rank", "max_kword", "front_page")) {
scale_x_discrete(labels = function(x) str_wrap(x, width = 30))
}
} +
{
if (!var %in% c("india_rank", "max_kword", "front_page")) {
coord_flip()
}
} +
labs(
x = NULL,
title = str_glue("Frequency of {title_var} from Current Selection"),
subtitle = sub_title
) +
{
if (var == "india_rank") {
xlab("Rank of 'India' keyword for an article")
}
} +
{
if (var == "max_kword") {
xlab("Total number of keywords for an article")
}
} +
theme_classic(base_size = 24) + #16
theme(
# fix weird alignment when deployed
axis.text.y = element_text(hjust = 0.89)
)
}
draw_girafe <- function(gg, var) {
hover_fill <- if_else(var == "sentiment", "#fee090", "#a6cee3")
select_fill <- if_else(var == "sentiment", "#fec44f", "#1f78b4")
girafe(
ggobj = gg,
width_svg = 15, #11
height_svg = 10, #9
options = list(
opts_hover(
css = str_glue("fill:{hover_fill};stroke:gray;stroke-width:2;")
),
opts_selection(
type = "multiple",
only_shiny = FALSE,
css = str_glue("fill:{select_fill};stroke:gray;stroke-width:4;")
)
)
)
}
get_current_n <- function(nested_df) {
total_n <- format(nrow(full_nested_df),
big.mark = ","
)
n <- format(nrow(nested_df),
big.mark = ","
)
percentage_included <- sprintf(
"%.1f%%",
(nrow(nested_df) / nrow(full_nested_df)) * 100
)
if (n == total_n) {
str_glue("All {total_n} articles included (100%)")
} else {
str_glue("{n} of {total_n} articles included ({percentage_included})")
}
}
format_count_selection <- function(count_selected) {
paste(shQuote(count_selected), collapse = ", ")
}
counts_keyword_legend <- tribble(
~`Keyword Categories`,
"Subjects",
"Locations",
"Persons",
"Organizations",
"Creative Works"
) %>%
gt() %>%
data_color(
columns = vars(`Keyword Categories`),
colors = scales::col_factor( # RColorBrewer::brewer.pal(n = 8, name = "Dark2")
palette = c(
"Subjects" = "#1B9E77",
"Locations" = "#D95F02",
"Persons" = "#7570B3",
"Organizations" = "#E7298A",
"Creative Works" = "#66A61E"
),
domain = NULL,
ordered = TRUE
)
) %>%
tab_options(
table.align = "left",
table.font.size = "medium",
data_row.padding = px(2)
)
counts_sentiment_legend <- tribble(
~`Sentiment`,
"Positive",
"Negative"
) %>%
gt() %>%
data_color(
columns = vars(`Sentiment`),
colors = scales::col_factor( # RColorBrewer::brewer.pal(n = 3, name = "BrBG")
palette = c(
"Positive" = "#67A9CF",
"Negative" = "#EF8A62"
),
domain = NULL,
ordered = TRUE
)
) %>%
tab_options(
table.align = "left",
table.font.size = "medium",
data_row.padding = px(2)
)
# timeline tab ------------------------------------------------------------
get_xts_by_span <- function(df, time_span, count_or_sentiment) {
# prepare nested df in correct dygraph format
df %>%
mutate(time = floor_date(pub_date, time_span)) %>%
{
if (count_or_sentiment == "Sentiment Ratio") {
calculate_sentiment_ratio(.)
} else {
count(., time)
}
} %>%
ts_xts()
}
calculate_sentiment_ratio <- function(df) {
df %>%
mutate(text = str_c(headline, " ", abstract, " ", lead_paragraph),
text = str_replace_all(str_to_lower(text),
"[^[:alnum:] ]", "")
) %>%
select(time, text) %>%
unnest_tokens(word, text) %>% # from {tidytext} [already does some normalizing]
anti_join(custom_stop_words, by = "word") %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(time, sentiment) %>%
spread(sentiment, n) %>%
mutate(n = positive / negative) %>%
select(-c(negative, positive))
}
dyUnzoom <- function(dygraph) {
# unzoom plugin: https://rstudio.github.io/dygraphs/gallery-plugins.html
dyPlugin(
dygraph = dygraph,
name = "Unzoom",
path = system.file("plugins/unzoom.js", package = "dygraphs")
)
}
# for shading the timeline background
govt_list <- split(govt, seq(nrow(govt)))
add_shades <- function(x, periods, ...) {
# https://stackoverflow.com/questions/30805017/dyshading-r-dygraph
for (period in periods) {
x <- dyShading(x,
from = period$from,
to = period$to,
color = period$color, ...
)
}
x
}
draw_dygraph <- function(xts, nested_df, count_or_sentiment) {
if (count_or_sentiment == "Sentiment Ratio") {
main <- "Sentiment Ratio over Time"
label <- "+/- Sentiment"
strokeWidth <- 0
} else {
main <- "Number of Articles over Time"
label <- "Article Count"
strokeWidth <- 1
}
dygraph(xts, main = main) %>%
dyOptions(
drawPoints = TRUE,
pointSize = 2,
axisLineWidth = 2.5,
colors = "black",
strokeWidth = strokeWidth
) %>%
dySeries(label = label) %>%
dyHighlight(
highlightCircleSize = 5,
highlightSeriesBackgroundAlpha = 0.2
) %>%
dyRangeSelector(
dateWindow = c(
min(nested_df$pub_date),
max(nested_df$pub_date)
),
retainDateWindow = FALSE
) %>%
add_shades(govt_list) %>%
dyUnzoom() %>%
{
if (count_or_sentiment == "Sentiment Ratio") {
dyLimit(., limit = 1, color = "white")
} else {
.
}
}
}
shading_table <- tribble(
~`Timeline Shading`,
"British Raj",
"Indian National Congress / UPA",
"Bharatiya Janata Party / NDA",
"Janata Party",
"Janata Party (Secular) with INC",
"Janata Dal (National / United Front)",
"Samajwadi Janata Party with INC"
) %>%
gt() %>%
data_color(
columns = vars(`Timeline Shading`),
colors = scales::col_factor(
palette = c(
"British Raj" = "#fb9a99",
"Indian National Congress / UPA" = "#a6cee3",
"Bharatiya Janata Party / NDA" = "#FF9933",
"Janata Party" = "#1f78b4",
"Janata Party (Secular) with INC" = "#CAB2D6",
"Janata Dal (National / United Front)" = "#33a02c",
"Samajwadi Janata Party with INC" = "#b2df8a"
),
domain = NULL,
ordered = TRUE
)
) %>%
tab_options(
table.align = "left",
table.font.size = "medium",
data_row.padding = px(2)
)
# the map -----------------------------------------------------------------
clat <- 22.5
clon <- 82.5
my_zoom <- 5
prep_map_data <- function(unnested_df) {
# from unnested df, get counts of locations and coords
unnested_df %>%
filter(name == "glocations", value != "india") %>%
count(value, country, lat, lon) %>%
mutate(
value = str_to_title(value),
label = str_glue("{value}: {n}")
) %>%
filter(!is.na(lat)) # can remove this when fix the geocoding error Puducherry
}
draw_base_map <- function() {
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
setView(clon, clat, my_zoom) %>%
addResetMapButton() %>%
addMiniMap(toggleDisplay = TRUE, minimized = TRUE)
}
update_map <- function(m, df, base_radius) {
lpal <- colorFactor(
palette = c("#1b9e77", "#d95f02"), # Dark2
domain = df$country
)
leafletProxy(m, data = df) %>%
clearShapes() %>%
# getting an error from label argument w/ 0 rows: https://stackoverflow.com/questions/62000375/r-leaflet-error-when-plotting-labels-with-a-0-row-dataframe
{
if (nrow(df) != 0) {
addCircles(.,
lng = ~lon, lat = ~lat,
radius = ~ sqrt(n) * base_radius * 1000,#5000,
label = ~label, color = ~ lpal(country)
)
} else {
.
}
}
}
get_location_clicked <- function(unnested_df, latitude, longitude) {
unnested_df %>%
filter(
lat == latitude,
lon == longitude
) %>%
head(1) %>%
pull(value)
}
myMapModal <- function(location) {
modalDialog(
title = str_glue('Do you want to filter for only articles that include "{str_to_title(location)}" as a keyword?'),
"Doing so will remove all other keyword filters.",
actionButton("filter_location", "Yes"),
easyClose = TRUE
)
}
# keyword pairs tab -------------------------------------------------------
get_top_kword_counts <- function(unnested_df, n_keywords) {
unnested_df %>%
filter(value != "india") %>% # india would be in every pair
group_by(value) %>%
mutate(n = n()) %>% # keyword value counts
ungroup() %>%
arrange(desc(n)) %>%
distinct(value, n) %>%
top_n(n_keywords) %>%
mutate(Var2 = factor(value, levels = value)) %>%
select(-value, Var2_n = n)
}
percent <- function(x, digits = 1, format = "f", ...) {
paste0(formatC(x * 100, format = format, digits = digits, ...), "%")
}
get_keyword_pairs <- function(unnested_df, top_kword_counts) {
unnested_df %>%
filter(value %in% top_kword_counts$Var2) %>% # only looking at associations among top keywords
select(url, Var1 = value) %>%
mutate(n = 1) %>%
spread(Var1, n, fill = 0) %>%
select(-url) %>%
{
crossprod(as.matrix(.))
} %>% # var1 and var2 as factor
as.data.frame.table(responseName = "weight") %>%
left_join(top_kword_counts, by = c("Var2")) %>%
mutate(
weight_over_var2n = weight / Var2_n,
weight_over_var2n = if_else(weight_over_var2n == 1.0,
NA_real_,
weight_over_var2n
),
pct = if_else(is.na(weight_over_var2n),
NA_character_,
percent(weight_over_var2n)
),
tip = if_else(is.na(pct),
NA_character_,
as.character(str_glue("{pct} of articles with the keyword '{Var2}' also have the keyword '{Var1}'"))
)
) %>%
arrange(desc(Var2_n), desc(weight)) %>%
mutate(
Var1 = factor(Var1, levels = unique(Var2)),
Var2 = factor(Var2, levels = unique(Var2))
) %>%
rowid_to_column()
}
draw_keyword_heatmap <- function(keyword_pairs) {
gg <- ggplot(
keyword_pairs,
aes(x = Var1, y = Var2)
) +
geom_tile_interactive(aes(
fill = weight_over_var2n,
data_id = rowid,
tooltip = tip
)) +
scale_fill_viridis_c() +
scale_x_discrete(label = function(x) str_trunc(x, width = 20)) +
labs(x = NULL, y = NULL) +
theme_classic(base_size = 24) +
theme(
legend.title = element_blank(),
legend.key.size = unit(1.5, "cm"),
# weird alignment issue not found locally
axis.text.y = element_text(hjust = 0.89)
) +
guides(x = guide_axis(angle = 45))
girafe(
ggobj = gg,
width_svg = 15, #12
height_svg = 10, #9
options = list(
opts_selection(
type = "single",
css = "stroke:black;stroke-width:8"
),
opts_toolbar(saveaspng = FALSE),
opts_hover(css = "stroke:white;stroke-width:4")
)
)
}
format_heatmap_selection <- function(keyword_pairs, heatmap_cell) {
if (is.null(heatmap_cell)) {
return(c(""))
} else {
heatmap_row <- keyword_pairs %>%
filter(rowid == heatmap_cell)
str_glue("Of {heatmap_row$Var2_n} articles with the keyword
'{heatmap_row$Var2}', {heatmap_row$weight} also have
the keyword '{heatmap_row$Var1}'.")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.