Set-up

# Load packages
library(tidyverse, quietly = TRUE)
library(devtools, quietly = TRUE)
library(readxl, quietly = TRUE)
library(lubridate, quietly = TRUE)
library(stringr, quietly = TRUE)
library(janitor, quietly = TRUE)
library(zoo, quietly = TRUE)
library(RcppRoll, quietly = TRUE)
library(rgdal, quietly = TRUE) # for shapefiles
library(sf, quietly = TRUE) # manipulating spatial data
library(lme4, quietly = TRUE) # for glmer, etc. modelling
library(httr, quietly = TRUE)
library(ggrepel, quietly = TRUE)
library(reticulate, quietly = TRUE) # using python code in R
library(vroom, quietly = TRUE) # loading json faster
library(ndjson, quietly = TRUE)
library(jsonlite, quietly = TRUE)
library(rvest, quietly = TRUE) # reading html websites
library(maps, quietly = TRUE)
library(ggmap, quietly = TRUE)
library(stars, quietly = TRUE) # rasters
library(maptools, quietly = TRUE)
library(spatstat, quietly = TRUE)
library(koRpus, quietly = TRUE)
library(quanteda, quietly = TRUE)
library(textdata, quietly = TRUE)
library(tidytext, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(furrr, quietly = TRUE)
library(data.table, quietly = TRUE)
library(scales, quietly = TRUE)
library(patchwork, quietly = TRUE)
library(GGally, quietly = TRUE)
library(broom.mixed, quietly = TRUE)
library(modelr, quietly = TRUE)
library(sjstats, quietly = TRUE)

# Load package functions
#devtools::load_all()
# Source R scripts
list.files("R", full.names = TRUE) %>%
  purrr::map(source)

Load prepared data

# Load formatted data
# Formatted with R scripts in data-raw
load("data/formatted_gadm.RData")

load("data/formatted_supplementary.RData")

load("data/tweets_sf.RData")

senti_tokens <- fread("data/senti_tokens.csv", nThread = 20000)

tweets_sub <- fread("data/tweets_sub.csv", nThread = 20000)

Sentiment analysis

# Calculate sentiment per Tweet
# Mean sentiment per Tweet
senti_tweet <- calculate_sentiment_per_tweet(senti_tokens)
# Add regions to sentiment
senti_tweet_w_region_sf <- add_regions(senti_tweet, boundaries_subnational)
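# calculate_sentiment_per_tweet() and add_regions() are sourced from R/ and
# not shown here. A minimal sketch of what they plausibly do (an assumption,
# not the package's actual code): average token-level AFINN scores per Tweet,
# then spatially join Tweet points onto the subnational boundaries.
sketch_sentiment_per_tweet <- function(senti_tokens) {
  senti_tokens %>%
    group_by(id) %>%
    summarise(afinn_mean = mean(afinn_value, na.rm = TRUE), .groups = "drop")
}

sketch_add_regions <- function(senti_tweet, boundaries_subnational) {
  # assumes Tweet coordinates live in longitude/latitude columns and share
  # the boundaries' CRS
  senti_tweet %>%
    st_as_sf(coords = c("longitude", "latitude"),
             crs = st_crs(boundaries_subnational)) %>%
    st_join(boundaries_subnational) # attaches country, region_1, region_2
}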

# Convert to data.table()
senti_tweet_w_region <- as.data.table(senti_tweet_w_region_sf) %>%
  select(-geometry)

Sentiment raw

# Non-adjusted ("raw") sentiment per day
senti_raw <- calculate_sentiment_per_day(senti_tweet_w_region)
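# calculate_sentiment_per_day() is sourced from R/. A rough sketch of the
# aggregation it likely performs (the grouping and the n_retweets column are
# assumptions): daily mean sentiment, mean retweets, and Tweet counts per
# leader and region.
sketch_sentiment_per_day <- function(senti_tweet_w_region) {
  senti_tweet_w_region %>%
    group_by(leader, country, region_1, region_2, date) %>%
    summarise(afinn_mean = mean(afinn_mean, na.rm = TRUE),
              n_retweets_mean = mean(n_retweets, na.rm = TRUE),
              n_tweets = dplyr::n(),
              .groups = "drop")
}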

senti_raw

# Regions included in data
senti_raw$region_1 %>% unique %>% paste(., collapse = ", ")

Sentiment adjusted over time

# Sentiment adjusted by region:
# average sentiment per region,
# using non-geotagged tweets to show the trend over time
senti_adj <- create_region_adjusted_sentiment(senti_raw)
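# create_region_adjusted_sentiment() is sourced from R/. One plausible
# reading of the comments above (an assumption about the helper, not its
# actual code): keep a long-run average per region plus a daily series
# driven by all Tweets, so that sparse regions borrow the temporal trend.
sketch_region_adjusted_sentiment <- function(senti_raw) {
  senti_raw %>%
    group_by(leader, country, region_1) %>%
    mutate(afinn_mean_region = mean(afinn_mean, na.rm = TRUE)) %>%
    group_by(leader, country, date) %>%
    mutate(afinn_mean_daily = mean(afinn_mean, na.rm = TRUE)) %>%
    ungroup()
}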

senti_adj %>% arrange(region_1) %>% distinct(region_1) %>% paste

# Linear interpolation to make up for missing days
# interpolate_missing_days()

Baseline estimates

# Create for/against cut-offs based on mean sentiment
senti_cut_offs <- create_cut_offs(senti_tweet_w_region)

#senti_cut_offs <- senti_cut_offs[1:10000,]

# Create raw number of pros and cons per day
baseline_n_day <- calculate_baseline_n_per_day(senti_cut_offs)
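# create_cut_offs() and calculate_baseline_n_per_day() are sourced from R/
# and not shown. A hedged sketch of the idea behind the baseline: classify
# each Tweet as "pro" a leader if its mean AFINN score exceeds a cut-off,
# repeat for several cut-off values, and count the daily pro share
# (the cut-off values and column names below are illustrative assumptions).
sketch_baseline_n_per_day <- function(senti_tweet_w_region,
                                      cut_offs = c(-1, -0.5, 0, 0.5, 1)) {
  purrr::map_dfr(cut_offs, function(co) {
    senti_tweet_w_region %>%
      mutate(cut_off = co, pro = afinn_mean > co) %>%
      group_by(leader, country, region_1, date, cut_off) %>%
      summarise(n_pro = sum(pro, na.rm = TRUE),
                n_total = dplyr::n(),
                pro_share = n_pro / n_total,
                .groups = "drop")
  })
}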

# Calculate raw estimate per day
baseline_day <- calculate_baseline_estimate_per_day(baseline_n_day, n_roll = 7)

baseline_day %>% filter(is.na(country))

# Add national level
baseline_day <- add_national_level(baseline_day)
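# add_national_level() is sourced from R/ and reused throughout. A hedged
# sketch of one plausible behaviour, assuming rows without a matched
# subnational region (e.g. non-geotagged Tweets) represent the national
# level; the real helper may instead append aggregated national rows.
sketch_add_national_level <- function(x) {
  x %>%
    mutate(region_1 = tidyr::replace_na(region_1, "National"),
           region_2 = tidyr::replace_na(region_2, "National"))
}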

# Plot pro-share by cut-offs
baseline_day %>%
  filter(leader == "Ghani") %>%
  ggplot(., aes(x = date, y = pro_share)) +
  geom_point(colour = "red") +
  geom_line(aes(y = pro_share_roll), colour = "black") +
  facet_wrap(~ cut_off) +
  labs(title = "Ghani's raw favourability rate by baseline estimates and different cut offs",
       subtitle = "Estimated favourability of Afghan leader Ghani by baseline sentiment and different cut offs") +
  theme_devpublicopinion
# Add targets (election results and polls)
# Join sentiment with target vector, election and polling results
baseline_day <- baseline_day %>%
  mutate(target_id = row_number())

baseline_day_targets_all <- join_targets(baseline_day, targets_master)

# Check which senti regions are not in targets object
baseline_day %>% filter(!region_1 %in% targets_master$region_1) %>% distinct(country, region_1)

# Select nearest target (election or poll) for each day
baseline_day_targets <- select_nearest_target(baseline_day_targets_all)
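# select_nearest_target() is sourced from R/. A hedged sketch: for each
# sentiment row, keep only the election or poll closest in time; ties are
# kept, which is why the joined data can gain rows (see the row comparison
# below). days_diff_abs and target_id mirror their later usage.
sketch_select_nearest_target <- function(joined) {
  joined %>%
    mutate(days_diff_abs = abs(as.numeric(date - date_target))) %>%
    group_by(target_id) %>%
    slice_min(days_diff_abs, with_ties = TRUE) %>%
    ungroup()
}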

# Compare rows
nrow(baseline_day)
nrow(baseline_day_targets)

# Create year and combined country-and-region_1 variables
baseline_day_targets <- baseline_day_targets %>%
  mutate(year = year(date),
         country_and_region_1 = paste0(country, "_", region_1))

# Add GDL covariates
baseline_day_targets_covars <- add_gdl_covariates(baseline_day_targets, gdl_interpo)
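# add_gdl_covariates() is sourced from R/. A minimal sketch, assuming the
# interpolated Global Data Lab indicators (popshare, phone, ...) are keyed
# by country, subnational region, and year:
sketch_add_gdl_covariates <- function(x, gdl_interpo) {
  dplyr::left_join(x, gdl_interpo, by = c("country", "region_1", "year"))
}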

# Check if regions are missing GDL statistics
baseline_day_targets_covars %>% filter(is.na(popshare)) %>% distinct(region_1)
# Validate baseline

# Summary statistics of each cut-off
baseline_mae <- mae_by_cut_offs(baseline_day_targets_covars, type = "election", days_diff_less = 10)
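# mae_by_cut_offs() is sourced from R/. The gist, as a hedged sketch:
# restrict to targets of the requested type within the day window, then
# take the mean absolute error of pro_share against the target vote share
# for every country, election date, and cut-off.
sketch_mae_by_cut_offs <- function(x, type = "election", days_diff_less = 10) {
  x %>%
    filter(type == .env$type, days_diff_abs < days_diff_less) %>%
    group_by(country, date_target, cut_off) %>%
    summarise(mean = mean(abs(pro_share - votes_share), na.rm = TRUE),
              .groups = "drop")
}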

# Plot raw estimate stats of each cut-off
baseline_mae %>%
  mutate(country_elex = paste0(country, ", ", format(date_target, "%B %Y"))) %>%
  ggplot(.,
         aes(x = cut_off, y = mean*100)) +
  geom_col() +
  #geom_smooth(method = lm) +
  facet_wrap(~ country_elex) +
  labs(title = "Mean absolute error of baseline' electoral estimates at different AFINN cut offs",
       subtitle = "Mean absolute error (MAE) of election estimates based on baseline AFINN sentiment, percentage points",
       x = "Cut off values",
       y = "MAE"
  ) +
  theme_devpublicopinion

Add targets and GDL for modelling

# Add targets (election results and polls)
# Join sentiment with target vector, election and polling results
senti_raw <- senti_raw %>%
  mutate(target_id = row_number())

# Add national level to join targets
senti_raw <- add_national_level(senti_raw)

senti_raw_targets_all <- join_targets(senti_raw, targets_master)

# Check which senti regions are not in targets object
senti_raw %>% filter(!region_1 %in% targets_master$region_1) %>% distinct(country, region_1)

# Select nearest target (election or poll) for each day
senti_raw_targets <- select_nearest_target(senti_raw_targets_all)

# Compare rows
nrow(senti_raw)
nrow(senti_raw_targets)

# Create year and combined country-and-region_1 variables
senti_raw_targets <- senti_raw_targets %>%
  mutate(year = year(date),
         country_and_region_1 = paste0(country, "_", region_1))

# Add GDL covariates
senti_raw_targets_covars <- add_gdl_covariates(senti_raw_targets, gdl_interpo)

# Check if regions are missing GDL statistics
senti_raw_targets_covars %>% filter(is.na(popshare)) %>% distinct(region_1)
# Add row_number to join targets
senti_adj <- senti_adj %>%
  mutate(target_id = row_number())

# Fill in missing national and region columns
senti_adj <- add_national_level(senti_adj)

# Add targets (election results and polls)
senti_adj_targets_all <- join_targets(senti_adj, targets_master, by_region = FALSE)

# Check which senti regions are not in targets object
senti_adj %>% filter(!region_1 %in% targets_master$region_1) %>% distinct(country, region_1)

# Select nearest target (election or poll) for each day
senti_adj_targets <- select_nearest_target(senti_adj_targets_all)

# Compare rows
# Extra rows indicate targets tied at the same day difference from a sentiment day
nrow(senti_adj)
nrow(senti_adj_targets)

# Create year and combined country-and-region_1 variables
senti_adj_targets <- senti_adj_targets %>%
  mutate(year = year(date),
         country_and_region_1 = paste0(country, "_", region_1))

# Add GDL covariates
senti_adj_targets_covars <- add_gdl_covariates(senti_adj_targets, gdl_interpo)

# Check if regions are missing GDL statistics
senti_adj_targets_covars %>% filter(is.na(popshare)) %>% distinct(region_1)
# Elections for each country
senti_adj_targets_covars %>%
  filter(type == "election") %>%
  distinct(country, date_target) %>%
  arrange(country)

Fit models

# Training data
days_for_training <- 100

# Sentiment raw
train_data_elex_first_raw <- create_training_data(senti_raw_targets_covars,
                                                  type = "elex", less_than_days_diff = days_for_training)
train_data_polls_raw <- create_training_data(senti_raw_targets_covars,
                                             type = "polls", less_than_days_diff = days_for_training)

# Sentiment adjusted
train_data_elex_first_adj <- create_training_data(senti_adj_targets_covars,
                                                  type = "elex", less_than_days_diff = days_for_training)
train_data_polls_adj <- create_training_data(senti_adj_targets_covars,
                                             type = "polls", less_than_days_diff = days_for_training)
# Average number of tweets per day in training data

train_data_elex_first_adj %>%
  #group_by(leader, country, region_1, region_2, date) %>%
  summarise(n_tweets_daily = mean(n_tweets_daily))
# Linear models
fit_lm_1_raw <- function(train_data){ lm(votes_share ~ afinn_mean, data = train_data) }
fit_lm_2_raw <- function(train_data){ lm(votes_share ~ n_retweets_mean, data = train_data) }
fit_lm_3_raw <- function(train_data){ lm(votes_share ~ afinn_mean*n_retweets_mean, data = train_data) }

fit_lm_1_adj <- function(train_data){ lm(votes_share ~ afinn_mean_daily, data = train_data) }
fit_lm_2_adj <- function(train_data){ lm(votes_share ~ n_retweets_mean, data = train_data) }
fit_lm_3_adj <- function(train_data){ lm(votes_share ~  afinn_mean_daily*n_retweets_mean + afinn_mean_region,
                                         data = train_data) }

# Fit models on elections
# Raw
lm_r_e <- train_data_elex_first_raw %>%
  group_by(country) %>% 
  nest() %>%
  mutate(lm_1_r_e = purrr::map(data, ~fit_lm_1_raw(.x)),
         lm_2_r_e = purrr::map(data, ~fit_lm_2_raw(.x)),
         lm_3_r_e = purrr::map(data, ~fit_lm_3_raw(.x))
  ) %>%
  ungroup() %>%
  pivot_longer(cols = starts_with("lm"), names_to = "model_name", values_to = "model") %>%
  mutate(model_type = "raw") %>%
  rename(model_country = country)

lm_r_e

# Adjusted
lm_a_e <- train_data_elex_first_adj %>%
  group_by(country) %>% 
  nest() %>%
  mutate(lm_1_a_e = purrr::map(data, ~fit_lm_1_adj(.x)),
         lm_2_a_e = purrr::map(data, ~fit_lm_2_adj(.x)),
         lm_3_a_e = purrr::map(data, ~fit_lm_3_adj(.x))
  ) %>%
  ungroup() %>%
  pivot_longer(cols = starts_with("lm"), names_to = "model_name", values_to = "model") %>%
  mutate(model_type = "adjusted") %>%
  rename(model_country = country)

lm_a_e
# Multilevel models

# Functions for multilevel regression fitting
# Raw
fit_mr_1_raw <- function(train_data){

  lmer(votes_share ~ afinn_mean + (1 + afinn_mean | country), data = train_data)

}

fit_mr_2_raw <- function(train_data){

  lmer(votes_share ~ n_retweets_mean + (1 + n_retweets_mean | country), data = train_data)

}

fit_mr_3_raw <- function(train_data){

  lmer(votes_share ~ afinn_mean*n_retweets_mean + (1 + afinn_mean | country), data = train_data)

}

fit_mr_4_raw <- function(train_data){

  # Note: identical to fit_mr_3_raw; no additional covariates are added in
  # the raw specification (cf. fit_mr_4_adj below)
  lmer(votes_share ~ afinn_mean*n_retweets_mean + (1 + afinn_mean | country), data = train_data)

}


# Adjusted
fit_mr_1_adj <- function(train_data){

  lmer(votes_share ~ afinn_mean_daily + (1 + afinn_mean_daily | country), data = train_data)

}

fit_mr_2_adj <- function(train_data){

  lmer(votes_share ~ n_retweets_mean + (1 + n_retweets_mean | country), data = train_data)

}

fit_mr_3_adj <- function(train_data){

  lmer(votes_share ~ afinn_mean_daily*n_retweets_mean + (1 + afinn_mean_daily | country), data = train_data)

}

fit_mr_4_adj <- function(train_data){

  lmer(votes_share ~ afinn_mean_daily*n_retweets_mean + afinn_mean_region + phone + (1 + afinn_mean_daily | country), data = train_data)

}
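
# In lme4 notation, (1 + afinn_mean_daily | country) gives every country
# its own intercept and its own sentiment slope, drawn from a common
# (correlated) normal distribution around the fixed effects; countries with
# sparse data are thereby pulled towards the pooled estimates.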

# Check training data
names(train_data_elex_first_raw)

# Fit multilevel models on elections
# Raw
mr_r_e <- train_data_elex_first_raw %>%
  nest(data = everything()) %>%
  mutate(mr_1_r_e = purrr::map(data, ~fit_mr_1_raw(.x)),
         mr_2_r_e = purrr::map(data, ~fit_mr_2_raw(.x)),
         mr_3_r_e = purrr::map(data, ~fit_mr_3_raw(.x)),
         mr_4_r_e = purrr::map(data, ~fit_mr_4_raw(.x))
  ) %>%
  pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>%
  mutate(model_type = "raw")

# Adjusted
mr_a_e <- train_data_elex_first_adj %>%
  nest(data = everything()) %>%
  mutate(mr_1_a_e = purrr::map(data, ~fit_mr_1_adj(.x)),
         mr_2_a_e = purrr::map(data, ~fit_mr_2_adj(.x)),
         mr_3_a_e = purrr::map(data, ~fit_mr_3_adj(.x)),
         mr_4_a_e = purrr::map(data, ~fit_mr_4_adj(.x))
  ) %>%
  pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>%
  mutate(model_type = "adjusted")


# Fit multilevel models on polls
# Raw
mr_r_p <- train_data_polls_raw %>%
  nest(data = everything()) %>%
  mutate(mr_1_r_p = purrr::map(data, ~fit_mr_1_raw(.x)),
         mr_2_r_p = purrr::map(data, ~fit_mr_2_raw(.x)),
         mr_3_r_p = purrr::map(data, ~fit_mr_3_raw(.x)),
         mr_4_r_p = purrr::map(data, ~fit_mr_4_raw(.x))
  ) %>%
  pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>%
  mutate(model_type = "raw")


# Adjusted
mr_a_p <- train_data_polls_adj %>%
  nest(data = everything()) %>%
  mutate(mr_1_a_p = purrr::map(data, ~fit_mr_1_adj(.x)),
         mr_2_a_p = purrr::map(data, ~fit_mr_2_adj(.x)),
         mr_3_a_p = purrr::map(data, ~fit_mr_3_adj(.x)),
         mr_4_a_p = purrr::map(data, ~fit_mr_4_adj(.x))

  ) %>%
  pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>%
  mutate(model_type = "adjusted")

Predictions

# Bind MR models
models_mr <- bind_rows(mr_r_e,
                       mr_r_p,
                       mr_a_e,
                       mr_a_p) %>%
  mutate(join_id = 1)

# Bind all models together
# Multilevel and linear
# Adjusted and raw
models_master <- bind_all_models(models_mr, lm_a_e, lm_r_e)

models_master %>% filter(is.na(model_country))

models_master %>% distinct(model_name_long)
# Use tidy to get model statistics
# broom and broom.mixed packages
models_stats <- models_master %>%
  mutate(model_tidy = purrr::map(model, broom.mixed::tidy)) %>%
  unnest(model_tidy)

# List all models
# Including each linear model per country
models_stats %>%
  filter(!is.na(model_country)) %>%
  distinct(model_name_long, model_country)

# Tidy names, levels, and statistics
models_stats <- models_stats %>%
  select(model_name_long, model_country, obs, effect, group, term, estimate, std.error, statistic, p.value) %>%
  mutate(effect = recode(effect,
                         "fixed" = "Fixed",
                         "ran_pars" = "Random"),
         group = recode(group,
                        "country" = "Country"
         ),
         term = gsub("\\(|\\)", "", term),
         term = gsub("__", " ", term),
         term = gsub("\\.", " and ", term),
         term = gsub("\\:", " and ", term),
         term = gsub("afinn_mean", "Sentiment", term),
         term = gsub("cor", "Corr.,", term),
         term = gsub("n_retweets_mean", "Retweets", term),
         term = gsub("Sentiment_region", "Regional sentiment", term),
         term = gsub("Sentiment_daily", "Daily sentiment", term),
         term = gsub("sd", "SD,", term),
         term = gsub("phone", "Phone share", term)
  )

sapply(models_stats %>% select(effect, group, term), unique)

# Sort names and table column
all_tables <- models_stats %>%
  # Split tibbles
  dplyr::group_split(model_name_long, model_country) %>%
  purrr::map(~mutate(., across(where(is.double), ~round(., 2) %>% as.character))) %>%
  purrr::map(~add_row(.,
                      effect = "Effect",
                      group = "Group",
                      term = "Term",
                      estimate = "Estimate",
                      std.error = "Standard error",
                      statistic = "Statistic",
                      p.value = "P-value",
                      .before = 1)) %>%
  purrr::map(~mutate(.,
                     model_name_long = na.locf(model_name_long, fromLast = TRUE),
                     model_country = na.locf(model_country, fromLast = TRUE, na.rm = FALSE),
                     obs = na.locf(obs, fromLast = TRUE) %>% paste0("Obs.: ", .)
  )) %>%
  purrr::map(~add_row(.)) %>%
  purrr::map(~mutate(.,
                     model_name_long = if_else(row_number() %in% 2:nrow(.), NA_character_, model_name_long),
                     model_country = if_else(row_number() %in% 2:nrow(.), NA_character_, model_country),
                     obs = if_else(row_number() %in% 2:nrow(.), NA_character_, obs)
  )) %>%
  # Bind tables
  bind_rows() %>%
  # Replace NAs with empty cells
  mutate(across(everything(), ~if_else(is.na(.), "", .)))

all_tables

# Save file which can then be pasted in document
write_csv(all_tables, file = "output/tables/appendix_models_tables.csv")


# Check individual models
best_model <- models_master %>%
  filter(model_name_long == "MR 3, election fitted, adjusted")

best_model$model

tidy(best_model)
# Create test data
test_raw <- create_test_data(senti_raw_targets_covars %>% rename(n_tweets_daily = n_tweets),
                             choose_type = "election") %>%
  mutate(test_type = "raw")

test_adjusted <- create_test_data(senti_adj_targets_covars, choose_type = "election") %>%
  mutate(test_type = "adjusted")

test_master <- bind_rows(test_raw,
                         test_adjusted) %>%
  mutate(test_id = row_number())
# Add predictions for all models
predictions <- add_all_model_predictions(models_master, test_master)
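# add_all_model_predictions() is sourced from R/. A minimal sketch, assuming
# a models tibble with a list-column `model` and a flat test set: predict
# from every model on every test row, allowing new grouping levels for the
# lmer fits, and unnest back to one row per model and test case.
sketch_add_all_model_predictions <- function(models, test_data) {
  models %>%
    mutate(preds = purrr::map(model, function(m) {
      dplyr::mutate(test_data,
                    prediction = predict(m, newdata = test_data,
                                         allow.new.levels = TRUE))
    })) %>%
    select(-any_of(c("data", "model"))) %>%
    tidyr::unnest(preds)
}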
# Add post-stratified estimates with GDL's popshare variable
prediction_master <- predictions %>%
  mutate(prediction_post = prediction*popshare)

# Summarise weighted prediction of each region by country
(predictions_post_national <- prediction_master %>%
    filter(region_2 != "National") %>%
    group_by(country, year, leader, model_name, model_name_long, days_diff_abs) %>%
    summarise(prediction = weighted.mean(prediction, popshare, na.rm = TRUE),
              n_tweets_daily = mean(n_tweets_daily, na.rm = TRUE),
              mean_obs = mean(obs)) %>%
    ungroup() %>%
    mutate(model_name_long = paste0(model_name_long, ", post"),
           level = "National",
           region_1 = "National",
           region_2 = "National") %>%
    arrange(country)
)
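
# Post-stratification in miniature: regional predictions are combined into a
# national estimate by weighting each region with its population share
# (toy numbers, for illustration only)
weighted.mean(c(0.55, 0.40, 0.62), w = c(0.5, 0.3, 0.2))
# = 0.55*0.5 + 0.40*0.3 + 0.62*0.2 = 0.519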

# Find actual national vote shares
(actual_shares_national <- find_actual_vote_shares_national(prediction_master))

# Join
predictions_post_national <- left_join(predictions_post_national, actual_shares_national)

# Bind post-predictions back to master prediction dataset
prediction_master <- bind_rows(prediction_master, predictions_post_national)

# Check rows just added
prediction_master %>%
  filter(grepl("post", model_name_long)) %>% select(country, region_1, region_2, leader, model_name, days_diff_abs, votes_share, prediction)
# Find MAE of post-stratification

# # Find MAE for post by model
# (mae_post_by_model <- predictions_post_national %>%
#     filter(!grepl("Linear", model_name_long)) %>%
#     filter(!is.na(model_name)) %>%
#     group_by(model_name, model_name_long, mean_obs) %>%
#     summarise(mae = mean(error_abs, na.rm = TRUE),
#               n_tweets_daily_mean = mean(n_tweets_daily_mean, na.rm = TRUE)
#     ) %>%
#     ungroup() %>%
#     arrange(mae)
# )
# 
# # Find error for post by model and country
# (mae_post_by_model_country <- predictions_post_national %>%
#     filter(!grepl("Linear", model_name_long)) %>%
#     filter(!is.na(model_name)) %>%
#     group_by(country, model_name_long,mean_obs) %>%
#     summarise(mae = mean(error_abs, na.rm = TRUE),
#               n_tweets_daily_mean = mean(n_tweets_daily_mean, na.rm = TRUE)
#     ) %>%
#     ungroup() %>%
#     mutate(model_name_long = paste0(model_name_long, ", post"),
#            level = "National") %>%
#     arrange(mae)
# )
# 
# # Predictions versus actual for post-stratificatied models
# pred_vs_actual_post <- predictions_post_national %>%
#   select(-c(error, error_abs, model_name)) %>%
#   rename(prediction = prediction_post) %>%
#   mutate(model_name_long = paste0(model_name_long, ", post"),
#          level = "National")

Validation

# Calculate error
prediction_master <- prediction_master %>%
  mutate(error = prediction - votes_share,
         error_abs = abs(error))
# MAE by model

# Subset days within 10 days
prediction_master_sub <- prediction_master %>%
  filter(days_diff_abs < 10)

# Check that post predictions are included
prediction_master %>% filter(grepl("post", model_name_long))

# MAE by model for national and region level predictions
mae_by_model <- bind_rows(
  # National
  # This also includes post-stratified estimates
  prediction_master_sub %>%
    filter(region_2 == "National") %>%
    group_by(model_name_long) %>%
    mutate(mae = mean(error_abs, na.rm = TRUE),
           n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE)
    ) %>%
    slice(1) %>%
    transmute(model_name_long, mae, n_tweets_daily_mean, level = "National"),
  # Region
  prediction_master_sub %>%
    filter(region_2 != "National") %>%
    group_by(model_name_long) %>%
    mutate(mae = mean(error_abs, na.rm = TRUE),
           n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE)
    ) %>%
    slice(1) %>%
    transmute(model_name_long, mae, n_tweets_daily_mean, level = "Region")) %>%
  ungroup() #%>%
# Add post-stratified national level predictions
#bind_rows(mae_post_by_model)

# Print MAE
mae_by_model %>%
  arrange(mae)

# Print post-predictions to check they have been included
mae_by_model %>%
  filter(grepl("post", model_name_long))

# Best model is MR 3, election fitted, adjusted
best_model_name <- "MR 3, election fitted, adjusted"

# Average national MAE
mae_by_model %>%
  filter(level == "National") %>%
  summarise(mae = mean(mae))

# Average MAE for multilevel models with or without post-stratification
mae_by_model %>%
  #filter(level == "National") %>%
  filter(grepl("MR", model_name_long)) %>%
  mutate(post_or_not = grepl("post", model_name_long)) %>%
  group_by(post_or_not) %>%
  summarise(mae = mean(mae))

# Plot figure 1 bar
mae_by_model %>%
  mutate(level = as.factor(level),
         model_name_long = reorder_within(model_name_long, mae, level)) %>%
  ggplot(.,
         aes(x = model_name_long,
             y = mae)) +
  geom_col(aes(fill = level), show.legend = FALSE) +
  facet_wrap(~level, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  theme_devpublicopinion +
  labs(title = "National- and region-level MAE for models",
       subtitle = "Mean absolute error for national- and region-level predictions by all models",
       x = NULL,
       y = "Mean absolute error (MAE)") +
  ggsave("output/figures/fig1_mae_by_model.png", height = 10, width = 14)
# Plots for MAE by country or region

# Calculate MAE by country
mae_by_country <- bind_rows(
  # National
  # Includes post-stratified estimates
  prediction_master_sub %>%
    filter(region_2 == "National") %>%
    group_by(year, country, model_name_long) %>%
    summarise(mae = mean(error_abs, na.rm = TRUE),
              mean_obs = mean(obs),
              mean_phone = mean(phone, na.rm = TRUE),
              n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE)
    ) %>%
    mutate(level = "National"),
  # Region
  prediction_master_sub %>%
    filter(region_2 != "National") %>%
    group_by(year, country, model_name_long) %>%
    summarise(mae = mean(error_abs, na.rm = TRUE),
              mean_obs = mean(obs),
              mean_phone = mean(phone, na.rm = TRUE),
              n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE)
    ) %>%
    mutate(level = "Region")) %>%
  ungroup() %>%
  #bind_rows(mae_post_by_model_country) %>%
  mutate(model_type = if_else(grepl("Linear", model_name_long), "Linear", "Multilevel"),
         post_or_not = if_else(grepl("post", model_name_long), "Post-stratification", "No post-stratification"))

# Add supplementary data, including English-speaking share
mae_by_country_with_supp <- left_join(mae_by_country,
                                      supp %>%
                                        group_by(country) %>%
                                        slice_max(year, n = 1) %>%
                                        select(-year),
                                      by = "country")

# By country, simple bar plot

# Add separate category for best MR model, MR 3, plus post-stratification extension
mae_by_country_plus <- bind_rows(mae_by_country,
                                 mae_by_country %>%
                                   filter(model_name_long == best_model_name) %>%
                                   mutate(model_type = best_model_name),
                                 mae_by_country %>%
                                   filter(model_name_long == "MR 3, election fitted, adjusted, post") %>%
                                   mutate(model_type = best_model_name)
) %>%
  ungroup() %>%
  mutate(model_type = fct_relevel(model_type,
                                  "Linear", "Multilevel"),
         model_type = recode(model_type,
                             "Linear" = "Linear models average",
                             "Multilevel" = "Multilevel models average")) %>%
  group_by(country, level, model_type) %>%
  summarise(mae = mean(mae))

mae_by_country_plus %>%
  mutate(level = as.factor(level),
         country = reorder_within(country, mae, level)) %>%
  ggplot(.,
         aes(x = country,
             y = mae)) +
  geom_col(aes(fill = model_type),
           position = "dodge") +
  facet_wrap(~level, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_fill_discrete(name = "") +
  theme_devpublicopinion +
  theme(strip.text = element_text(size = text_size+2)) +
  labs(title = "Multilevel regression outperforms linear regression at most levels",
       subtitle = "Average MAE of linear models, multilevel models, and the well performing MR 3 model,\nacross Zimbabwe, Georgia, Afghanistan, Mexico, and Nigeria.\nNote that MR 3 predictions at the national level used post-stratification.",
       x = NULL,
       y = "MAE, percentage points"
  ) +
  ggsave("output/figures/fig4_mae_by_country_and_model_type.png", height = 10, width = 14)

# Check numbers
# By model type
mae_by_country_plus %>%
  filter(country %in% c("Zimbabwe", "Mexico")) %>%
  #filter(level == "Region") %>%
  mutate(mae = mae*100) %>%
  arrange(country, level, mae)

# By specific model
mae_by_country %>%
  select(country, level, model_name_long, model_type, mae) %>%
  filter(country %in% c("Afghanistan", "Mexico", "Zimbabwe")) %>%
  #filter(level == "Region") %>%
  mutate(mae = mae*100) %>%
  group_by(country, level) %>%
  slice_min(mae, n = 2) %>%
  arrange(country, level, mae)

# Plot MAE by candidate, country, and region
# By sample size
mae_by_country %>%
  filter(grepl("MR", model_name_long)) %>%
  ggplot(.,
         aes(x = n_tweets_daily_mean,
             y = mae)) +
  geom_point(aes(colour = country), size = 5) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  scale_colour_discrete(name = "") +
  #facet_wrap(~country) +
  theme_devpublicopinion +
  labs(title = "Relationship between multilevel model error and sample size (number of Tweets per day)",
       subtitle = "Mean absolute error (MAE) by sample size, measured as the average number of Tweets per day, for multilevel models",
       x = "Mean sample size (mean number of Tweets per day)",
       y = "MAE") +
  ggsave("output/figures/fig5_country_mae_sample.png", height = 10, width = 14)
# Plot predictions against actual vote shares
# National and region-level

# Find actual votes share and prediction for each model for each region
pred_vs_actual <- bind_rows(
  # National
  # This includes post-stratified predictions as well
  prediction_master_sub %>%
    filter(region_2 == "National") %>%
    group_by(leader, year, country, region_1, region_2, model_name_long, votes_share) %>%
    summarise(prediction = mean(prediction, na.rm = TRUE),
              mean_obs = mean(obs)),
  # Region
  prediction_master_sub %>%
    filter(region_2 != "National") %>%
    group_by(leader, year, country, region_1, region_2, model_name_long, votes_share) %>%
    summarise(prediction = mean(prediction, na.rm = TRUE),
              mean_obs = mean(obs))
) %>%
  ungroup() %>%
  #bind_rows(pred_vs_actual_post) %>%
  mutate(level = if_else(region_2 == "National", "National predictions", "Regional predictions"),
         model_type = if_else(grepl("Linear", model_name_long), "Linear", "Multilevel"),
         post_or_not = if_else(grepl("post", model_name_long), "Post-stratification", "No post-stratification"),
         error = prediction - votes_share,
         error_abs = abs(error))


# Plot points

# Subset multilevel regression
pred_vs_actual_mr <- pred_vs_actual %>%
  filter(model_type == "Multilevel") %>%
  mutate(zwe_or_not = if_else(country == "Zimbabwe", "Zimbabwe", "Other country"))

# By post-stratification or not
# Also show Zimbabwe's 2013 election
ggplot(pred_vs_actual_mr,
       aes(x = votes_share,
           y = prediction)) +
  geom_point(aes(colour = zwe_or_not), size = 1) +
  geom_abline(intercept = 0, slope = 1) +
  facet_wrap(~post_or_not) +
  scale_colour_discrete(name = "") +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1)) +
  theme_devpublicopinion +
  theme(strip.text = element_text(size = text_size+2)) +
  labs(title = "Predicted against actual vote shares by multilevel models",
       subtitle = "Predicted against actual votes share; each point is a predicted vote share for a candidate by a multilevel model",
       x = "Actual vote share",
       y = "Prediction") +
  ggsave("output/figures/fig2_predicted_against_actual.png", height = 10, width = 14)

# Word-count bookkeeping: 27 words per figure, 27 * 6 = 162
# So the Word document count needs to be less than 7,338 words

# Footnotes plus
162+330
# Plot random effects of best model

# Subset MR model
model_to_plot <- models_master %>%
  filter(model_name_long == "MR 3, election fitted, adjusted") %>%
  .$model

model_to_plot <- model_to_plot[[1]]

# Fixed effects of coefficients
fixef(model_to_plot)

# Random effects of coefficients
ran_ef_to_plot <- ranef(model_to_plot)

ran_ef_to_plot <- ran_ef_to_plot %>%
  as.data.frame() %>%
  select(term, country = grp, value = condval, sd = condsd)

#Labels for facet wrap
ranef_labs <- c("Intercept", "AFINN sentiment")
names(ranef_labs) <- c("(Intercept)", "afinn_mean_daily")

# Plot random effects
ggplot(ran_ef_to_plot,
       aes(x = value,
           y = country)) +
  geom_col(aes(fill = term), show.legend = FALSE) +
  geom_vline(xintercept = 0) +
  facet_wrap(~ term,
             labeller = labeller(term = ranef_labs),
             scales = "free_y") +
  theme_devpublicopinion +
  theme(strip.text = element_text(size = text_size+2)) +
  labs(title = "Random effects of multilevel regression model 3 fitted on election results",
       subtitle = "Random effects associated with varying intercepts and varying slopes of AFINN sentiment",
       x = "Coefficient difference, percentage points",
       y = "") +
  ggsave("output/figures/fig3_random_effects.png", height = 10, width = 14)
# Show intra-class correlation of best model
best_model$model

# Find intra-class correlation
icc(best_model$model[[1]])
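
# For reference, the intercept-only ICC can also be computed by hand from
# the variance components (a sketch that ignores the random-slope variance;
# icc() above may use an adjusted definition):
vc <- as.data.frame(VarCorr(best_model$model[[1]]))
var_intercept <- vc$vcov[vc$grp == "country" & vc$var1 == "(Intercept)" & is.na(vc$var2)]
var_residual <- vc$vcov[vc$grp == "Residual"]
var_intercept / (var_intercept + var_residual)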

Predictions over time

# Use models to estimate votes share over time

# Subset models for time estimates
models_over_time <- models_master %>%
  filter(model_name_long == "MR 3, election fitted, adjusted") %>%
  filter(grepl("MR 3", model_name_long)) %>%
  filter(model_type == "adjusted")

# Data of daily sentiment over time
data_over_time <- senti_adj %>%
  mutate(afinn_mean = afinn_mean_daily,
         test_id = row_number(),
         test_type = "adjusted",
         year = year(date))

#data_over_time <- bind_rows(data_over_time,
#                           data_over_time %>% mutate(test_type = "adjusted"))

# Add covariates
data_over_time <- add_gdl_covariates(data_over_time, gdl_interpo)

# Get predictions
prediction_over_time <- add_all_model_predictions(models_over_time, data_over_time) %>%
  filter(country == leader_country)

prediction_over_time %>% distinct(leader, country, leader_country) %>% arrange(leader)

# Calculate post-stratified national estimates
post_over_time <- prediction_over_time %>%
  filter(model_name_long == "MR 3, election fitted, adjusted") %>%
  filter(region_2 != "National") %>%
  mutate(week = floor_date(date, "week")) %>%
  group_by(date, country, leader, model_name, model_name_long) %>%
  summarise(prediction = weighted.mean(prediction, popshare, na.rm = TRUE),
            n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE),
            mean_obs = mean(obs)) %>%
  ungroup() %>%
  arrange(country, leader, date)

# Add linear interpolation
post_over_time <- post_over_time %>%
  filter(prediction < 1) %>%
  group_by(leader, country) %>%
  mutate(prediction = zoo::na.approx(prediction, na.rm = FALSE, rule = 2),
         prediction_roll = roll_mean(prediction, n = 14, fill = NA)) %>%
  ungroup()

# Print over time estimates
head(post_over_time)

# Plot all leaders over time
post_over_time %>%
  ggplot(.,
         aes(x = date)) + 
  geom_point(aes(colour = leader,
                 y = prediction),
             show.legend = FALSE) +
  geom_line(aes(y = prediction_roll)) +
  facet_wrap(country~leader, scales = "free") +
  scale_colour_discrete(name = "") +
  theme_devpublicopinion +
  theme(strip.text = element_text(size = text_size+1)) +
  labs(title = "MRP estimates over time for developing country leaders and candidates",
       subtitle = "Estimated national favourability of leaders by multilevel regression with post-stratification (MRP);\nblack lines show 14-day rolling averages of model predictions",
       x = NULL,
       y = "Estimated vote share") +
  ggsave("output/figures/fig6_estimates_over_time.png", height = 10, width = 14)


# Plot estimates over time per country
plot_estimates_over_time_per_country <- function(post_over_time, country_to_plot = "Zimbabwe"){

  estimates_title <- paste0("MRP estimates over time for ", country_to_plot)

  estimates_file_name <- paste0("output/figures/appendix_estimates_over_time_", country_to_plot, ".png")


  # Subset and plot
  post_over_time %>%
    filter(country == country_to_plot) %>%
    ggplot(.,
           aes(x = date)) + 
    geom_point(aes(colour = leader,
                   y = prediction),
               show.legend = FALSE) +
    geom_line(aes(y = prediction_roll)) +
    facet_wrap(~ leader, scales = "free") +
    scale_colour_discrete(name = "") +
    #scale_x_date(date_labels = "%b %Y") +
    theme_devpublicopinion +
    labs(title = estimates_title,
         subtitle = "Estimated national favourability of leaders by multilevel regression with post-stratification (MRP);\nblack lines show 14-day rolling averages of model predictions",
         x = NULL,
         y = "Estimated vote share") +
    ggsave(estimates_file_name, height = 10, width = 14)


}

# Plot and save countries
plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Zimbabwe")
plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Georgia")
plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Afghanistan")
plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Mexico")
plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Nigeria")

Tables

# MAE by model groups
# Linear, multilevel, and multilevel with post-stratification

# Create by model type
mae_sample_table <- mae_by_country %>%
  filter(grepl("election", model_name_long)) %>%
  filter(!(model_type == "Linear" & post_or_not == "Post-stratification")) %>%
  mutate(model_group = case_when(model_type == "Linear" ~ "Linear regression",
                                 model_type == "Multilevel" & post_or_not == "Post-stratification" ~ "Multilevel regression with post-stratification (MRP)",
                                 model_type == "Multilevel" & post_or_not == "No post-stratification" ~ "Multilevel regression without post-stratification"),
         model_group = fct_relevel(model_group,
                                   "Linear regression",
                                   "Multilevel regression without post-stratification",
                                   "Multilevel regression with post-stratification (MRP)")
  ) %>%
  arrange(model_group, country)

mae_sample_table %>%
  group_by(model_group, raw = grepl("raw", model_name_long)) %>%
  summarise(mean = mean(n_tweets_daily_mean),
            n = n())

mae_sample_table <- mae_sample_table %>%
  group_by(model_group) %>%
  summarise(mae = mean(mae),
            n_tweets_daily_mean = mean(n_tweets_daily_mean, na.rm = TRUE)) %>%
  mutate(across(c(mae, n_tweets_daily_mean), ~round(., 4))) %>%
  rename("Model group" = model_group, "MAE" = mae, "Average number of Tweets per day" = n_tweets_daily_mean)

mae_sample_table

# Save as csv
write_csv(mae_sample_table, "output/tables/table3_mae_sample_by_model_group.csv")

# Best three models by MAE at each level
mae_by_model %>% group_by(level) %>% slice_min(mae, n = 3)
# List elections in training data
training_elections <- train_data_elex_first_raw %>%
  distinct(country, date_target) %>%
  arrange(country, date_target) %>%
  transmute(Data = "Training", Elections = paste0(country, " (", year(date_target), ")"))

train_data_elex_first_adj %>%
  distinct(country, date_target)

# List elections in test data
validation_elections <- prediction_master %>%
  filter(type == "election") %>%
  distinct(country, date_target) %>%
  transmute(Data = "Validation", Elections = paste0(country, " (", year(date_target), ")"))


# Create table
elex_table <- bind_rows(training_elections,
                        validation_elections) %>%
  arrange(Data, Elections) %>%
  group_by(Data) %>%
  summarise(Elections = toString(Elections))

elex_table

# Save as csv
write_csv(elex_table, "output/tables/table2_elex_for_train_or_vali.csv")
# Table of country statistics
tweets_stats_country <- tweets_sub %>%
  transmute(leader_country,
            week = floor_date(date, "week"),
            year = year(date),
            week_country = paste0(leader_country, week),
            year_country = paste0(leader_country, year)
  )

add_total <- function(x) { mutate(x, Total = Afghanistan + Georgia + Mexico + Nigeria + Zimbabwe) }

# tweets_total <- tweets_sub %>%
#   mutate(leader_country = "Total")
# 
# tweets_stats_country <- rbind(tweets_stats_country, tweets_total)

# Tweet count per country
(n_tweets_per_country <- tweets_stats_country %>%
    group_by(leader_country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Total tweets") %>%
    pivot_wider(names_from = leader_country, values_from = n) %>%
    add_total()
)

# Average weekly number of tweets per country
(n_weekly_tweets_per_country <- rbind(tweets_stats_country,
                                      tweets_stats_country %>% mutate(leader_country = "Total")) %>%
    group_by(leader_country, week_country) %>%
    summarise(n = n()) %>%
    group_by(leader_country) %>%
    summarise(n = mean(n) %>% round(2)) %>%
    mutate(variable = "Average weekly number of tweets") %>%
    pivot_wider(names_from = leader_country, values_from = n)
)

# Number of unique weeks
(n_weeks_per_country <- tweets_stats_country %>%
    group_by(leader_country, week) %>%
    summarise(n = n()) %>%
    group_by(leader_country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Number of weeks") %>%
    pivot_wider(names_from = leader_country, values_from = n) %>%
    add_total()
)

# Years per country
(n_years_per_country <- tweets_stats_country %>%
    group_by(leader_country, year) %>%
    summarise(n = n()) %>%
    group_by(leader_country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Number of years") %>%
    pivot_wider(names_from = leader_country, values_from = n) %>%
    add_total()
)

# Subnational regions per country in data
(n_regions_1_per_country <- tweets_sf %>%
    as.data.frame() %>%
    select(-geometry) %>%
    group_by(leader_country, region_1) %>%
    summarise(n = n()) %>%
    group_by(leader_country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Number of subnational regions/levels") %>%
    pivot_wider(names_from = leader_country, values_from = n) %>%
    add_total()
)

# Number of elections per country
(n_elections_per_country <- elex_master %>%
    group_by(country, elex_date) %>%
    summarise(n = n()) %>%
    group_by(country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Number of elections") %>%
    pivot_wider(names_from = country, values_from = n) %>%
    add_total()
)

# Number of unique candidacies and leaders
(n_candidates_per_country <- candidates %>%
    group_by(country, name, elex_date) %>%
    summarise(n = n()) %>%
    group_by(country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Number of unique candidacies or leaders") %>%
    pivot_wider(names_from = country, values_from = n) %>%
    add_total()
)

# Number of polls per country
(n_polls_per_country <- polling_master %>%
    group_by(country, date) %>%
    summarise(n = n()) %>%
    group_by(country) %>%
    summarise(n = n()) %>%
    mutate(variable = "Number of polls") %>%
    pivot_wider(names_from = country, values_from = n) %>%
    add_total()
)

# Combine into table
(by_country_table <- bind_rows(n_tweets_per_country,
                               n_weekly_tweets_per_country,
                               n_weeks_per_country,
                               n_years_per_country,
                               n_regions_1_per_country,
                               n_candidates_per_country,
                               n_elections_per_country,
                               n_polls_per_country) %>%
    rename(Name = variable)
)

# Save as csv
write_csv(by_country_table, file = "output/tables/table1_per_country.csv")

Plots

# Plot traditional polling and election results

# Subset targets to national-level rows, polls only
poll_plot_data <- targets_master %>%
  filter(name %in% candidates$name) %>%
  filter(region_1 == "National") %>%
  filter(type == "poll") %>%
  mutate(votes_share = votes_share*100)

# Add US polls for comparison
poll_plot_data <- bind_rows(poll_plot_data,
                            polling_us %>%
                              mutate(votes_share = votes_share*100) %>%
                              filter(year(date_target) > 2006))

# Plot
ggplot(poll_plot_data,
       aes(x = date_target,
           y = votes_share)) +
  geom_point(aes(colour = name), show.legend = FALSE,
             size = 3, shape = 20) +
  geom_label_repel(data = poll_plot_data %>% group_by(name) %>% slice_sample(n = 1),
                   min.segment.length = 0.5,
                   size = 5,
                   aes(x = date_target, y = votes_share, label = name)) +
  facet_wrap(~country# , scales = "free"
  ) +
  theme_devpublicopinion +
  theme(strip.text = element_text(size = text_size+2)) +
  labs(title = "Traditional polling for developing countries is sparce and clustered over time",
       subtitle = "% vote share, each point is a traditional poll conducted in the given country; numbers of polls included in this figure are indicative but not exhaustive",
       x = NULL,
       y = "%, vote share") +
  ggsave("output/figures/appendix_polling_is_scarce.png", height = 10, width = 14)



# Check numbers

# Polls per country
poll_plot_data %>%
  # Add global level
  mutate(country = "Global") %>%
  bind_rows(poll_plot_data) %>%
  filter(type == "poll") %>%
  distinct(country, date_target) %>%
  group_by(country) %>%
  summarise(n = n(),
  )
# Plot model estimates versus polling and election results
# Summarise tweets per week
tweets_n_per_week <- tweets_sub %>%
  group_by(leader_country, leader, week = floor_date(date, "week")) %>%
  summarise(n = n()) %>%
  ungroup()

# Mean number of tweets per week
tweets_n_per_week %>%
  group_by(week) %>%
  summarise(mean = mean(n))

# Plot
ggplot(tweets_n_per_week, aes(x = week, y = n)) +
  geom_col(aes(colour = leader_country), show.legend = FALSE) +
  facet_wrap(~ leader_country,
             scales = "free_y",
             labeller = label_wrap_gen(width = 20),
             #space = "free"
  ) +
  labs(title = "Weekly number of Tweets per country",
       subtitle = "Number of Tweets per week by country",
       x = NULL,
       y = "Number of Tweets") +
  theme_devpublicopinion +
  theme(strip.text = element_text(size = text_size+2)) +
  ggsave("output/figures/tweets_n_per_week.png")
# Number of tweets over time
tweets_sf %>%
  as.data.frame() %>%
  group_by(has_point, year = year(date)) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(has_point = case_when(has_point ~ "Geotagged",
                               !has_point ~ "Non-geotagged")) %>%
  ggplot(., aes(x = year, y = n)) +
  geom_col(aes(fill = has_point), show.legend = FALSE) +
  facet_wrap(~has_point, scales = "free_y"
  ) +
  scale_y_continuous(labels = comma) +
  labs(title = "Number of tweets with or without coordinates",
       subtitle = "Number of Tweets per year that were geotagged or not by the Twitter users"
  ) +
  theme_devpublicopinion +
  ggsave("output/figures/number_of_tweets_has_point.png")
# Share of tweets with no lexicon match for any of their stemmed words
(share_tweets_no_senti <- senti_tokens %>%
   group_by(id, username, language) %>%
   summarise(afinn_mean = mean(afinn_value, na.rm = TRUE)) %>%
   mutate(has_afinn = !is.na(afinn_mean),
          in_english = language == "en") %>%
   group_by(has_afinn, in_english) %>%
   summarise(n = n())
)
# Simplify GADM boundaries for easier plotting
boundaries_subnational_simp <- boundaries_subnational %>%
  st_simplify(., dTolerance = 0.05)

# Plot geotagged Tweets on map
tweets_geotagged <- tweets_sf %>%
  filter(has_point)

plot_tweets_in_country <- function(plot_country = "Afghanistan"){

  ggplot() +
    geom_sf(data = boundaries_subnational_simp %>% filter(country == plot_country),
            fill = NA) +
    geom_sf(data = tweets_geotagged %>% filter(country == plot_country),
            size = 3, shape = 21, fill = blue_colour) +
    labs(title = paste0(plot_country)) +
    theme_devpublicopinion +
    theme(panel.grid.major = element_blank())

}

# Plot and combine maps by country with patchwork
(map_afg <- plot_tweets_in_country("Afghanistan"))
map_zwe <- plot_tweets_in_country("Zimbabwe")
map_geo <- plot_tweets_in_country("Georgia")
map_mex <- plot_tweets_in_country("Mexico")
map_nga <- plot_tweets_in_country("Nigeria")


(map_afg | map_mex | map_zwe)/(map_geo | map_nga) +
  plot_annotation(title = "Geotagged Tweets by country and subnational regions",
                  subtitle = "The spatial location of geotagged Tweets that mentions a country leader or election candidate",
                  theme = theme_devpublicopinion
  ) +
  ggsave("output/figures/maps.png")
# Plot GDL indicators by country
gdl_interpo %>%
  #pivot_longer(cols = c(eye, popshare, phone, cellphone)) %>%
  #filter(gadm_region == "National") %>%
  ggplot(.,
         aes(x = phone,
             fill = country)) +
  geom_boxplot() +
  facet_wrap(~year)
# Plot geotagged share of Tweets by actual population
tweets_sf_df <- tweets_sf %>%
  as.data.frame() %>%
  select(-geometry) %>%
  mutate(year = year(date))

# Fix national level
tweets_sf_df <- add_national_level(tweets_sf_df)

# Calculate geotagged Tweets frequency and share
geotagged_volume <- tweets_sf_df %>%
  group_by(year, country, region_1, region_2, has_point) %>%
  summarise(tweets_region_2_n = n()) %>%
  filter(has_point & region_1 != "National" | !has_point) %>% # remove points outside of country
  filter(has_point) %>%
  group_by(year, country) %>%
  mutate(tweets_national_share = tweets_region_2_n/sum(tweets_region_2_n)) %>%
  ungroup()

# Add GDL statistics (targets)
geotagged_volume <- add_gdl_covariates(geotagged_volume, gdl_interpo)

geotagged_volume

geotagged_volume %>%
  group_by(year, country) %>%
  summarise(popshare = sum(popshare, na.rm = TRUE)) %>%
  arrange(-popshare)

# Plot by country
ggplot(geotagged_volume,
       aes(x = tweets_national_share,
           y = popshare)) +
  geom_point(aes(colour = country), show.legend = FALSE) +
  geom_label_repel(data = geotagged_volume %>%
                     filter(tweets_national_share/popshare > 3) %>%
                     group_by(country) %>%
                     slice_sample(n = 1),
                   aes(x = tweets_national_share,
                       y = popshare,
                       label = region_1),
                   min.segment.length = 0.5,
                   size = 5) +
  facet_wrap(~country) +
  labs(title = "Share of geotagged Tweets in subnational region by actual population share") +
  theme_devpublicopinion


# Plot over time
# If the correlation moves closer to one over time, estimates may become less biased
ggplot(geotagged_volume,
       aes(x = tweets_national_share,
           y = popshare)) +
  geom_point(aes(colour = country), show.legend = FALSE) +
  geom_smooth(method = "lm") +
  # geom_label_repel(data = geotagged_volume %>%
  #                    filter(tweets_national_share/popshare > 3) %>%
  #                    group_by(country) %>%
  #                    slice_sample(n = 1),
  #                  aes(x = tweets_national_share,
  #                      y = popshare,
  #                      label = region_1),
  #                  min.segment.length = 0.5,
  #                  size = 5) +
  facet_wrap(~year) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(title = "Share of geotagged Tweets in subnational region by actual population share") +
  theme_devpublicopinion

