# Load packages library(tidyverse, quietly = TRUE) library(devtools, quietly = TRUE) library(readxl, quietly = TRUE) library(lubridate, quietly = TRUE) library(stringr, quietly = TRUE) library(janitor, quietly = TRUE) library(zoo, quietly = TRUE) library(RcppRoll, quietly = TRUE) library(rgdal, quietly = TRUE) # for shapefiles library(sf, quietly = TRUE) # manipulating spatial data library(lme4, quietly = TRUE) # for glmer, etc. modelling library(httr, quietly = TRUE) library(ggrepel, quietly = TRUE) library(reticulate, quietly = TRUE) # using python code in R library(vroom, quietly = TRUE) # loading json faster library(ndjson, quietly = TRUE) library(jsonlite, quietly = TRUE) library(rvest, quietly = TRUE) # reading html websites library(maps, quietly = TRUE) library(ggmap, quietly = TRUE) library(stars, quietly = TRUE) # rasters library(maptools, quietly = TRUE) library(spatstat, quietly = TRUE) library(koRpus, quietly = TRUE) library(quanteda, quietly = TRUE) library(textdata, quietly = TRUE) library(tidytext, quietly = TRUE) library(SnowballC, quietly = TRUE) library(furrr, quietly = TRUE) library(data.table, quietly = TRUE) library(scales, quietly = TRUE) library(patchwork, quietly = TRUE) library(GGally, quietly = TRUE) library(broom.mixed, quietly = TRUE) library(modelr, quietly = TRUE) library(sjstats, quietly = TRUE) # Load package functions #devtools::load_all()
# Source R scripts list.files("R", full.names = TRUE) %>% purrr::map(source)
# Load formatted data # Formatted with R scripts in data-raw load("data/formatted_gadm.RData") load("data/formatted_supplementary.RData") load("data/tweets_sf.RData") senti_tokens <- fread("data/senti_tokens.csv", nThread = 20000) tweets_sub <- fread("data/tweets_sub.csv", nThread = 20000)
# Calculate sentiment per Tweet # Mean sentiment per Tweet senti_tweet <- calculate_sentiment_per_tweet(senti_tokens)
# Add regions to sentiment senti_tweet_w_region_sf <- add_regions(senti_tweet, boundaries_subnational) # Convert to data.table() senti_tweet_w_region <- as.data.table(senti_tweet_w_region_sf) %>% select(-geometry)
# Sentiment non-adjusted # raw is non-adjusted senti_raw <- calculate_sentiment_per_day(senti_tweet_w_region) senti_raw # Regions included in data senti_raw$region_1 %>% unique %>% paste(., collapse = ", ")
# Sentiment adjusted # Based on regions # average sentiment per region # using non-point tweets to show trend over time senti_adj <- create_region_adjusted_sentiment(senti_raw) senti_adj %>% arrange(region_1) %>% distinct(region_1) %>% paste # Linear interpolation to make up for missing days # interpolate_missing_days()
# Find for-against based on mean sentiment senti_cut_offs <- create_cut_offs(senti_tweet_w_region) #senti_cut_offs <- senti_cut_offs[1:10000,] # Create raw number of pros and cons per day baseline_n_day <- calculate_baseline_n_per_day(senti_cut_offs) # Calculate raw estimate per day baseline_day <- calculate_baseline_estimate_per_day(raw_n_day, n_roll = 7) baseline_day %>% filter(is.na(country)) # Add national level baseline_day <- add_national_level(baseline_day) # Plot pro-share by cut-offs baseline_day %>% filter(leader == "Ghani") %>% ggplot(., aes(x = date, y = pro_share)) + geom_point(colour = "red") + geom_line(aes(y = pro_share_roll), colour = "black") + facet_wrap(~ cut_off) + labs(title = "Ghani's raw favourability rate by baseline estimates and different cut offs", subtitle = "Estimated favourability of Afghan leader Ghani by baseline sentiment and different cut offs") + theme_devpublicopinion
# Add targets (election results and polls) # Join sentiment with target vector, election and polling results baseline_day <- baseline_day %>% mutate(target_id = row_number()) baseline_day_targets_all <- join_targets(baseline_day, targets_master) # Check which senti regions are not in targets object baseline_day %>% filter(!region_1 %in% targets_master$region_1) %>% distinct(country, region_1) # Select nearest target (election or poll) for each day baseline_day_targets <- select_nearest_target(baseline_day_targets_all) # Compare rows nrow(baseline_day) nrow(baseline_day_targets) # Create variable with country and region_1 baseline_day_targets <- baseline_day_targets %>% mutate(year = year(date), country_and_region_1 = paste0(country, "_", region_1)) # Add GDL covariates baseline_day_targets_covars <- add_gdl_covariates(baseline_day_targets, gdl_interpo) # Check if regions are missing GDL statistics baseline_day_targets_covars %>% filter(is.na(popshare)) %>% distinct(region_1)
# Validate baseline # Summary statistics of each cut-off baseline_mae <- mae_by_cut_offs(baseline_day_targets_covars, type = "election", days_diff_less = 10) # Plot raw estimate stats of each cut-off baseline_mae %>% mutate(country_elex = paste0(country, ", ", format(date_target, "%B %Y"))) %>% ggplot(., aes(x = cut_off, y = mean*100)) + geom_col() + #geom_smooth(method = lm) + facet_wrap(~ country_elex) + labs(title = "Mean absolute error of baseline' electoral estimates at different AFINN cut offs", subtitle = "Mean absolute error (MAE) of election estimates based on baseline AFINN sentiment, percentage points", x = "Cut off values", y = "MAE" ) + theme_devpublicopinion
# Add targets (election results and polls) # Join sentiment with target vector, election and polling results senti_raw <- senti_raw %>% mutate(target_id = row_number()) # Add national level to join targets senti_raw <- add_national_level(senti_raw) senti_raw_targets_all <- join_targets(senti_raw, targets_master) # Check which senti regions are not in targets object senti_raw %>% filter(!region_1 %in% targets_master$region_1) %>% distinct(country, region_1) # Select nearest target (election or poll) for each day senti_raw_targets <- select_nearest_target(senti_raw_targets_all) # Compare rows nrow(senti_raw) nrow(senti_raw_targets) # Create variable with country and region_1 senti_raw_targets <- senti_raw_targets %>% mutate(year = year(date), country_and_region_1 = paste0(country, "_", region_1)) # Add GDL covariates senti_raw_targets_covars <- add_gdl_covariates(senti_raw_targets, gdl_interpo) # Check if regions are missing GDL statistics senti_raw_targets_covars %>% filter(is.na(popshare)) %>% distinct(region_1)
# Add row_number to join targets senti_adj <- senti_adj %>% mutate(target_id = row_number()) # Fix missing national and region column senti_adj <- add_national_level(senti_adj) # Add targets (election results and polls) senti_adj_targets_all <- join_targets(senti_adj, targets_master, by_region = FALSE) # Check which senti regions are not in targets object senti_adj %>% filter(!region_1 %in% targets_master$region_1) %>% distinct(country, region_1) # Select nearest target (election or poll) for each day senti_adj_targets <- select_nearest_target(senti_adj_targets_all) # Compare rows # New rows indicate targets being of equal diff days to a sentiment day nrow(senti_adj) nrow(senti_adj_targets) # Create variable with country and region_1 senti_adj_targets <- senti_adj_targets %>% mutate(year = year(date), country_and_region_1 = paste0(country, "_", region_1)) # Add GDL covariates senti_adj_targets_covars <- add_gdl_covariates(senti_adj_targets, gdl_interpo) # Check if regions are missing GDL statistics senti_adj_targets_covars %>% filter(is.na(popshare)) %>% distinct(region_1)
# Elections for each country senti_adj_targets_covars %>% filter(type == "election") %>% distinct(country, date_target) %>% arrange(country)
# Training data days_for_training <- 100 # Sentiment raw train_data_elex_first_raw <- create_training_data(senti_raw_targets_covars, type = "elex", less_than_days_diff = days_for_training) train_data_polls_raw <- create_training_data(senti_raw_targets_covars, type = "polls", less_than_days_diff = days_for_training) # Sentiment adjusted train_data_elex_first_adj <- create_training_data(senti_adj_targets_covars, type = "elex", less_than_days_diff = days_for_training) train_data_polls_adj <- create_training_data(senti_adj_targets_covars, type = "polls", less_than_days_diff = days_for_training)
# Average number of tweets per day in training data train_data_elex_first_adj %>% #group_by(leader, country, region_1, region_2, date) %>% summarise(n_tweets_daily = mean(n_tweets_daily))
# Linear models fit_lm_1_raw <- function(train_data){ lm(votes_share ~ afinn_mean, data = train_data) } fit_lm_2_raw <- function(train_data){ lm(votes_share ~ n_retweets_mean, data = train_data) } fit_lm_3_raw <- function(train_data){ lm(votes_share ~ afinn_mean*n_retweets_mean, data = train_data) } fit_lm_1_adj <- function(train_data){ lm(votes_share ~ afinn_mean_daily, data = train_data) } fit_lm_2_adj <- function(train_data){ lm(votes_share ~ n_retweets_mean, data = train_data) } fit_lm_3_adj <- function(train_data){ lm(votes_share ~ afinn_mean_daily*n_retweets_mean + afinn_mean_region, data = train_data) } # Fit models on elections # Raw lm_r_e <- train_data_elex_first_raw %>% group_by(country) %>% nest() %>% mutate(lm_1_r_e = purrr::map(data, ~fit_lm_1_raw(.x)), lm_2_r_e = purrr::map(data, ~fit_lm_2_raw(.x)), lm_3_r_e = purrr::map(data, ~fit_lm_3_raw(.x)) ) %>% ungroup() %>% pivot_longer(cols = starts_with("lm"), names_to = "model_name", values_to = "model") %>% mutate(model_type = "raw") %>% rename(model_country = country) lm_r_e # Adjusted lm_a_e <- train_data_elex_first_adj %>% group_by(country) %>% nest() %>% mutate(lm_1_a_e = purrr::map(data, ~fit_lm_1_adj(.x)), lm_2_a_e = purrr::map(data, ~fit_lm_2_adj(.x)), lm_3_a_e = purrr::map(data, ~fit_lm_3_adj(.x)) ) %>% ungroup() %>% pivot_longer(cols = starts_with("lm"), names_to = "model_name", values_to = "model") %>% mutate(model_type = "adjusted") %>% rename(model_country = country) lm_a_e
# Multilevel models # Functions for multilevel regression fitting # Raw fit_mr_1_raw <- function(train_data){ lmer(votes_share ~ afinn_mean + (1 + afinn_mean | country), data = train_data) } fit_mr_2_raw <- function(train_data){ lmer(votes_share ~ n_retweets_mean + (1 + n_retweets_mean | country), data = train_data) } fit_mr_3_raw <- function(train_data){ lmer(votes_share ~ afinn_mean*n_retweets_mean + (1 + afinn_mean | country), data = train_data) } fit_mr_4_raw <- function(train_data){ lmer(votes_share ~ afinn_mean*n_retweets_mean + (1 + afinn_mean | country), data = train_data) } # Adjusted fit_mr_1_adj <- function(train_data){ lmer(votes_share ~ afinn_mean_daily + (1 + afinn_mean_daily | country), data = train_data) } fit_mr_2_adj <- function(train_data){ lmer(votes_share ~ n_retweets_mean + (1 + n_retweets_mean | country), data = train_data) } fit_mr_3_adj <- function(train_data){ lmer(votes_share ~ afinn_mean_daily*n_retweets_mean + (1 + afinn_mean_daily | country), data = train_data) } fit_mr_4_adj <- function(train_data){ lmer(votes_share ~ afinn_mean_daily*n_retweets_mean + afinn_mean_region + phone + (1 + afinn_mean_daily | country), data = train_data) } # Check training data names(train_data_elex_first_raw) # Fit multilevel models on elections # Raw mr_r_e <- train_data_elex_first_raw %>% nest(data = everything()) %>% mutate(mr_1_r_e = purrr::map(data, ~fit_mr_1_raw(.x)), mr_2_r_e = purrr::map(data, ~fit_mr_2_raw(.x)), mr_3_r_e = purrr::map(data, ~fit_mr_3_raw(.x)), mr_4_r_e = purrr::map(data, ~fit_mr_4_raw(.x)) ) %>% pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>% mutate(model_type = "raw") # Adjusted mr_a_e <- train_data_elex_first_adj %>% nest(data = everything()) %>% mutate(mr_1_a_e = purrr::map(data, ~fit_mr_1_adj(.x)), mr_2_a_e = purrr::map(data, ~fit_mr_2_adj(.x)), mr_3_a_e = purrr::map(data, ~fit_mr_3_adj(.x)), mr_4_a_e = purrr::map(data, ~fit_mr_4_adj(.x)) ) %>% pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>% mutate(model_type = "adjusted") # Fit multilevel models on polls # Raw mr_r_p <- train_data_polls_raw %>% nest(data = everything()) %>% mutate(mr_1_r_p = purrr::map(data, ~fit_mr_1_raw(.x)), mr_2_r_p = purrr::map(data, ~fit_mr_2_raw(.x)), mr_3_r_p = purrr::map(data, ~fit_mr_3_raw(.x)), mr_4_r_p = purrr::map(data, ~fit_mr_4_raw(.x)) ) %>% pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>% mutate(model_type = "raw") # Adjusted mr_a_p <- train_data_polls_adj %>% nest(data = everything()) %>% mutate(mr_1_a_p = purrr::map(data, ~fit_mr_1_adj(.x)), mr_2_a_p = purrr::map(data, ~fit_mr_2_adj(.x)), mr_3_a_p = purrr::map(data, ~fit_mr_3_adj(.x)), mr_4_a_p = purrr::map(data, ~fit_mr_4_adj(.x)) ) %>% pivot_longer(cols = starts_with("mr"), names_to = "model_name", values_to = "model") %>% mutate(model_type = "adjusted")
# Bind MR models models_mr <- bind_rows(mr_r_e, mr_r_p, mr_a_e, mr_a_p) %>% mutate(join_id = 1) # Bind all models together # Multilevel and linear # Adjusted and raw models_master <- bind_all_models(models_mr, lm_a_e, lm_r_e) models_master %>% filter(is.na(model_country)) models_master %>% distinct(model_name_long)
# Use tidy to get model statistics # broom and broom.mixed packages models_stats <- models_master %>% mutate(model_tidy = purrr::map(model, broom.mixed::tidy)) %>% unnest(model_tidy) # List all models # Including each linear model per country models_stats %>% filter(!is.na(model_country)) %>% distinct(model_name_long, model_country) # Tidy names, levels, and statistics models_stats <- models_stats %>% select(model_name_long, model_country, obs, effect, group, term, estimate, std.error, statistic, p.value) %>% mutate(effect = recode(effect, "fixed" = "Fixed", "ran_pars" = "Random"), group = recode(group, "country" = "Country" ), term = gsub("\\(|\\)", "", term), term = gsub("__", " ", term), term = gsub("\\.", " and ", term), term = gsub("\\:", " and ", term), term = gsub("afinn_mean", "Sentiment", term), term = gsub("cor", "Corr.,", term), term = gsub("n_retweets_mean", "Retweets", term), term = gsub("Sentiment_region", "Regional sentiment", term), term = gsub("Sentiment_daily", "Daily sentiment", term), term = gsub("sd", "SD,", term), term = gsub("phone", "Phone share", term) ) sapply(models_stats %>% select(effect, group, term), unique) # Sort names and table column all_tables <- models_stats %>% # Split tibbles dplyr::group_split(model_name_long, model_country) %>% purrr::map(~mutate(., across(where(is.double), ~round(., 2) %>% as.character))) %>% purrr::map(~add_row(., effect = "Effect", group = "Group", term = "Term", estimate = "Estimate", std.error = "Standard error", statistic = "Statistic", p.value = "P-value", .before = 1)) %>% purrr::map(~mutate(., model_name_long = na.locf(model_name_long, fromLast = TRUE), model_country = na.locf(model_country, fromLast = TRUE, na.rm = FALSE), obs = na.locf(obs, fromLast = TRUE) %>% paste0("Obs.: ", .) )) %>% purrr::map(~add_row(.)) %>% purrr::map(~mutate(., model_name_long = if_else(row_number() %in% 2:nrow(.), NA_character_, model_name_long), model_country = if_else(row_number() %in% 2:nrow(.), NA_character_, model_country), obs = if_else(row_number() %in% 2:nrow(.), NA_character_, obs), )) %>% # Bind tables bind_rows() %>% # Replace NAs with empty cells mutate(across(everything(), ~if_else(is.na(.), "", .))) all_tables # Save file which can then be pasted in document write_csv(all_tables, file = "output/tables/appendix_models_tables.csv") # Check individual models best_model <- models_master %>% filter(model_name_long == "MR 3, election fitted, adjusted") best_model$model tidy(best_model)
# Create test data test_raw <- create_test_data(senti_raw_targets_covars %>% rename(n_tweets_daily = n_tweets), choose_type = "election") %>% mutate(test_type = "raw") test_adjusted <- create_test_data(senti_adj_targets_covars, choose_type = "election") %>% mutate(test_type = "adjusted") test_master <- bind_rows(test_raw, test_adjusted) %>% mutate(test_id = row_number())
# Add predictions for all models predictions <- add_all_model_predictions(models_master, test_master)
# Add post-stratified estimates with GDL's popshare variable prediction_master <- predictions %>% mutate(prediction_post = prediction*popshare) # Summarise weighted prediction of each region by country (predictions_post_national <- prediction_master %>% filter(region_2 != "National") %>% group_by(country, year, leader, model_name, model_name_long, days_diff_abs) %>% summarise(prediction = weighted.mean(prediction, popshare, na.rm = TRUE), n_tweets_daily = mean(n_tweets_daily, na.rm = TRUE), mean_obs = mean(obs)) %>% ungroup() %>% mutate(model_name_long = paste0(model_name_long, ", post"), level = "National", region_1 = "National", region_2 = "National") %>% arrange(country) ) # Add actual national vote shares # Find actual national vote shares (actual_shares_national <- find_actual_vote_shares_national(prediction_master)) # Join predictions_post_national <- left_join(predictions_post_national, actual_shares_national) # Bind post-predictions back to master prediction dataset prediction_master <- bind_rows(prediction_master, predictions_post_national) # Check rows just added prediction_master %>% filter(grepl("post", model_name_long)) %>% select(country, region_1, region_2, leader, model_name, days_diff_abs, votes_share, prediction)
# Find MAE of post-stratification # # Find MAE for post by model # (mae_post_by_model <- predictions_post_national %>% # filter(!grepl("Linear", model_name_long)) %>% # filter(!is.na(model_name)) %>% # group_by(model_name, model_name_long, mean_obs) %>% # summarise(mae = mean(error_abs, na.rm = TRUE), # n_tweets_daily_mean = mean(n_tweets_daily_mean, na.rm = TRUE) # ) %>% # ungroup() %>% # arrange(mae) # ) # # # Find error for post by model and country # (mae_post_by_model_country <- predictions_post_national %>% # filter(!grepl("Linear", model_name_long)) %>% # filter(!is.na(model_name)) %>% # group_by(country, model_name_long,mean_obs) %>% # summarise(mae = mean(error_abs, na.rm = TRUE), # n_tweets_daily_mean = mean(n_tweets_daily_mean, na.rm = TRUE) # ) %>% # ungroup() %>% # mutate(model_name_long = paste0(model_name_long, ", post"), # level = "National") %>% # arrange(mae) # ) # # # Predictions versus actual for post-stratificatied models # pred_vs_actual_post <- predictions_post_national %>% # select(-c(error, error_abs, model_name)) %>% # rename(prediction = prediction_post) %>% # mutate(model_name_long = paste0(model_name_long, ", post"), # level = "National")
# Calculate error prediction_master <- prediction_master %>% mutate(error = prediction - votes_share, error_abs = abs(error))
# MAE by model # Subset days within 10 days prediction_master_sub <- prediction_master %>% filter(days_diff_abs < 10) # Check that post predictions are included prediction_master %>% filter(grepl("post", model_name_long)) # MAE by model for national and region level predictions mae_by_model <- bind_rows( # National # This also includes post-stratified estimates prediction_master_sub %>% filter(region_2 == "National") %>% group_by(model_name_long) %>% mutate(mae = mean(error_abs, na.rm = TRUE), n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE) ) %>% slice(1) %>% transmute(model_name_long, mae, n_tweets_daily_mean, level = "National"), # Region prediction_master_sub %>% filter(region_2 != "National") %>% group_by(model_name_long) %>% mutate(mae = mean(error_abs, na.rm = TRUE), n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE) ) %>% slice(1) %>% transmute(model_name_long, mae, n_tweets_daily_mean, level = "Region")) %>% ungroup() #%>% # Add post-stratified national level predictions #bind_rows(mae_post_by_model) # Print MAE mae_by_model %>% arrange(mae) # Print post-predictions to check they have been included mae_by_model %>% filter(grepl("post", model_name_long)) # Best model is MR 3, election fitted, adjusted best_model_name <- "MR 3, election fitted, adjusted" # Average national MAE mae_by_model %>% filter(level == "National") %>% summarise(mae = mean(mae)) # Average MAE for multilevel models with or without post-stratification mae_by_model %>% #filter(level == "National") %>% filter(grepl("MR", model_name_long)) %>% mutate(post_or_not = grepl("post", model_name_long)) %>% group_by(post_or_not) %>% summarise(mae = mean(mae)) # Plot figure 1 bar mae_by_model %>% mutate(level = as.factor(level), model_name_long = reorder_within(model_name_long, mae, level)) %>% ggplot(., aes(x = model_name_long, y = mae)) + geom_col(aes(fill = level), show.legend = FALSE) + facet_wrap(~level, scales = "free_y") + coord_flip() + scale_x_reordered() + theme_devpublicopinion + labs(title = "National- and region-level MAE for models", subtitle = "Mean absolute error for national- and region-level predictions by all models", x = NULL, y = "Mean absolute error (MAE)") + ggsave("output/figures/fig1_mae_by_model.png", height = 10, width = 14)
# Plots for MAE by country or region # Calculate MAE by country mae_by_country <- bind_rows( # National # Includes post-stratified estimates prediction_master_sub %>% filter(region_2 == "National") %>% group_by(year, country, model_name_long) %>% summarise(mae = mean(error_abs, na.rm = TRUE), mean_obs = mean(obs), mean_phone = mean(phone, na.rm = TRUE), n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE) ) %>% mutate(level = "National"), # Region prediction_master_sub %>% filter(region_2 != "National") %>% group_by(year, country, model_name_long) %>% summarise(mae = mean(error_abs, na.rm = TRUE), mean_obs = mean(obs), mean_phone = mean(phone, na.rm = TRUE), n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE) ) %>% mutate(level = "Region")) %>% ungroup() %>% #bind_rows(mae_post_by_model_country) %>% mutate(model_type = if_else(grepl("Linear", model_name_long), "Linear", "Multilevel"), post_or_not = if_else(grepl("post", model_name_long), "Post-stratification", "No post-stratification")) # Add supplemenetary, including English speaking share mae_by_country_with_supp <- left_join(mae_by_country, supp %>% group_by(country) %>% slice_max(year, n = 1) %>% select(-year), by = "country") # By country, simple bar plot # Add separate category for best MR model, MR 3, plus post-stratification extension mae_by_country_plus <- bind_rows(mae_by_country, mae_by_country %>% filter(model_name_long == best_model_name) %>% mutate(model_type = best_model_name), mae_by_country %>% filter(model_name_long == "MR 3, election fitted, adjusted, post") %>% mutate(model_type = best_model_name) ) %>% ungroup() %>% mutate(model_type = fct_relevel(model_type, "Linear", "Multilevel"), model_type = recode(model_type, "Linear" = "Linear models average", "Multilevel" = "Multilevel models average")) %>% group_by(country, level, model_type) %>% summarise(mae = mean(mae)) mae_by_country_plus %>% mutate(level = as.factor(level), country = reorder_within(country, mae, level)) %>% ggplot(., aes(x = country, y = mae)) + geom_col(aes(fill = model_type), position = "dodge") + facet_wrap(~level, scales = "free_y") + coord_flip() + scale_x_reordered() + scale_fill_discrete(name = "") + theme_devpublicopinion + theme(strip.text = element_text(size = text_size+2)) + labs(title = "Multilevel regression outperforms linear regression at most levels", subtitle = "Average MAE of linear models, multilevel models, and the well performing MR 3 model,\nacross Zimbabwe, Georgia, Afghanistan, Mexico, and Nigeria.\nNote that MR 3 predictions at the national level used post-stratification.", x = NULL, y = "MAE, percentage points" ) + ggsave("output/figures/fig4_mae_by_country_and_model_type.png", height = 10, width = 14) # Check numbers # By model type mae_by_country_plus %>% filter(country %in% c("Zimbabwe", "Mexico")) %>% #filter(level == "Region") %>% mutate(mae = mae*100) %>% arrange(country, level, mae) # By specific model mae_by_country %>% select(country, level, model_name_long, model_type, mae) %>% filter(country %in% c("Afghanistan", "Mexico", "Zimbabwe")) %>% #filter(level == "Region") %>% mutate(mae = mae*100) %>% group_by(country, level) %>% slice_min(mae, n = 2) %>% arrange(country, level, mae) # Plot MAE by candidate, country, and region # By sample size mae_by_country %>% filter(grepl("MR", model_name_long)) %>% ggplot(., aes(x = n_tweets_daily_mean, y = mae)) + geom_point(aes(colour = country), size = 5) + geom_smooth(method = "lm", se = FALSE, colour = "black") + scale_colour_discrete(name = "") + #facet_wrap(~country) + theme_devpublicopinion + labs(title = "Relationship between multilevel model error and sample size (number of Tweets per day)", subtitle = "Mean absolute error (MAE) by sample size, measured as the average number of Tweets per day, for multilevel models", x = "Mean sample size (mean number of Tweets per day)", y = "MAE") + ggsave("output/figures/fig5_country_mae_sample.png", height = 10, width = 14)
# Plot predictions against actual votes shares # National and region-level # Find actual votes share and prediction for each model for each region pred_vs_actual <- bind_rows( # National # This includes post-stratified predictions as well prediction_master_sub %>% filter(region_2 == "National") %>% group_by(leader, year, country, region_1, region_2, model_name_long, votes_share) %>% summarise(prediction = mean(prediction, na.rm = TRUE), mean_obs = mean(obs)), # Region prediction_master_sub %>% filter(region_2 != "National") %>% group_by(leader, year, country, region_1, region_2, model_name_long, votes_share) %>% summarise(prediction = mean(prediction, na.rm = TRUE), mean_obs = mean(obs)) ) %>% ungroup() %>% #bind_rows(pred_vs_actual_post) %>% mutate(level = if_else(region_2 == "National", "National predictions", "Regional predictions"), model_type = if_else(grepl("Linear", model_name_long), "Linear", "Multilevel"), post_or_not = if_else(grepl("post", model_name_long), "Post-stratification", "No post-stratification"), error = prediction - votes_share, error_abs = abs(error)) # Plot points # Subset multilevel regression pred_vs_actual_mr <- pred_vs_actual %>% filter(model_type == "Multilevel") %>% mutate(zwe_or_not = if_else(country == "Zimbabwe", "Zimbabwe", "Other country")) # By post-stratification or not # Also show Zimbabwe's 2013 election ggplot(pred_vs_actual_mr, aes(x = votes_share, y = prediction)) + geom_point(aes(colour = zwe_or_not), size = 1) + geom_abline(intercept = 0, slope = 1) + facet_wrap(~post_or_not) + scale_colour_discrete(name = "") + scale_x_continuous(limits = c(0, 1)) + scale_y_continuous(limits = c(0, 1)) + theme_devpublicopinion + theme(strip.text = element_text(size = text_size+2)) + labs(title = "Predicted against actual vote shares by multilevel models", subtitle = "Predicted against actual votes share; each point is a predicted vote share for a candidate by a multilevel model", x = "Actual vote share", y = "Prediction") + ggsave("output/figures/fig2_predicted_against_actual.png", height = 10, width = 14) # 27 words per figure # 27 times 6 is 162 # Thus need word docx count to be less than 7338 words # Footnotes plus 162+330
# Plot random effects of best model # Subset MR model model_to_plot <- models_master %>% filter(model_name_long == "MR 3, election fitted, adjusted") %>% .$model model_to_plot <- model_to_plot[[1]] # Fixed effects of coefficients fixef(model_to_plot) # Random effects of coefficients ran_ef_to_plot <- ranef(model_to_plot) ran_ef_to_plot <- ran_ef_to_plot %>% as.data.frame() %>% select(term, country = grp, value = condval, sd = condsd) #Labels for facet wrap ranef_labs <- c("Intercept", "AFINN sentiment") names(ranef_labs) <- c("(Intercept)", "afinn_mean_daily") # Plot random effects ggplot(ran_ef_to_plot, aes(x = value, y = country)) + geom_col(aes(fill = term), show.legend = FALSE) + geom_vline(xintercept = 0) + facet_wrap(~ term, labeller = labeller(term = ranef_labs), scales = "free_y") + theme_devpublicopinion + theme(strip.text = element_text(size = text_size+2)) + labs(title = "Random effects of multilevel regression model 3 fitted on election results", subtitle = "Random effects associated with varying intercepts and varying slopes of AFINN sentiment", x = "Coefficient difference, percentage points", y = "") + ggsave("output/figures/fig3_random_effects.png", height = 10, width = 14)
# Show intra-class correlation of best model best_model$model # Find intra-class correlation icc(best_model$model[[1]])
# Use models to estimate votes share over time # Subset models for time estimates models_over_time <- models_master %>% filter(model_name_long == "MR 3, election fitted, adjusted") %>% filter(grepl("MR 3", model_name_long)) %>% filter(model_type == "adjusted") # Data of daily sentiment over time data_over_time <- senti_adj %>% mutate(afinn_mean = afinn_mean_daily, test_id = row_number(), test_type = "adjusted", year = year(date)) #data_over_time <- bind_rows(data_over_time, # data_over_time %>% mutate(test_type = "adjusted")) # Add covariates data_over_time <- add_gdl_covariates(data_over_time, gdl_interpo) # Get predictions prediction_over_time <- add_all_model_predictions(models_over_time, data_over_time) %>% filter(country == leader_country) prediction_over_time %>% distinct(leader, country, leader_country) %>% arrange(leader) # Calculate post-stratified national estimates post_over_time <- prediction_over_time %>% filter(model_name_long == "MR 3, election fitted, adjusted") %>% filter(region_2 != "National") %>% mutate(week = floor_date(date, "week")) %>% group_by(date, country, leader, model_name, model_name_long) %>% summarise(prediction = weighted.mean(prediction, popshare, na.rm = TRUE), n_tweets_daily_mean = mean(n_tweets_daily, na.rm = TRUE), mean_obs = mean(obs)) %>% ungroup() %>% arrange(country, leader, date) # Add linear interpolation post_over_time <- post_over_time %>% filter(prediction < 1) %>% group_by(leader, country) %>% mutate(prediction = zoo::na.approx(prediction, na.rm = FALSE, rule = 2), prediction_roll = roll_mean(prediction, n = 14, fill = NA)) %>% ungroup() # Print over time estimates head(post_over_time) # Plot all leaders over time post_over_time %>% ggplot(., aes(x = date)) + geom_point(aes(colour = leader, y = prediction), show.legend = FALSE) + geom_line(aes(y = prediction_roll)) + facet_wrap(country~leader, scales = "free") + scale_colour_discrete(name = "") + theme_devpublicopinion + theme(strip.text = element_text(size = text_size+1)) + labs(title = "MRP estimates over time for developing country leaders and candidates", subtitle = "Estimated national favourability of leaders by multilevel regression with post-stratification (MRP);\nblack lines show 14-day rolling averages of model predictions", x = NULL, y = "Estimated vote share") + ggsave("output/figures/fig6_estimates_over_time.png", height = 10, width = 14) # Plot estimates over time per country plot_estimates_over_time_per_country <- function(post_over_time, country_to_plot = "Zimbabwe"){ estimates_title <- paste0("MRP estimates over time for ", country_to_plot) estimates_file_name <- paste0("output/figures/appendix_estimates_over_time_", country_to_plot, ".png") # Subset and plot post_over_time %>% filter(country == country_to_plot) %>% ggplot(., aes(x = date)) + geom_point(aes(colour = leader, y = prediction), show.legend = FALSE) + geom_line(aes(y = prediction_roll)) + facet_wrap(leader, scales = "free") + scale_colour_discrete(name = "") + #scale_x_date(date_labels = "%b %Y") + theme_devpublicopinion + labs(title = estimates_title, subtitle = "Estimated national favourability of leaders by multilevel regression with post-stratification (MRP);\nblack lines show 14-day rolling averages of model predictions", x = NULL, y = "Estimated vote share") + ggsave(estimates_file_name, height = 10, width = 14) } # Plot and save countries plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Zimbabwe") plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Georgia") plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Afghanistan") plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Mexico") plot_estimates_over_time_per_country(post_over_time, country_to_plot = "Nigeria")
# MAE by model groups # Linear, multilevel, and multilevel with post-stratification # Create by model type mae_sample_table <- mae_by_country %>% filter(grepl("election", model_name_long)) %>% filter(!(model_type == "Linear" & post_or_not == "Post-stratification")) %>% mutate(model_group = case_when(model_type == "Linear" ~ "Linear regression", model_type == "Multilevel" & post_or_not == "Post-stratification" ~ "Multilevel regression with post-stratification (MRP)", model_type == "Multilevel" & post_or_not == "No post-stratification" ~ "Multilevel regression without post-stratification"), model_group = fct_relevel(model_group, "Linear regression", "Multilevel regression without post-stratification", "Multilevel regression with post-stratification (MRP)") ) %>% arrange(model_group, country) mae_sample_table %>% group_by(model_group, raw = grepl("raw", model_name_long)) %>% summarise(mean = mean(n_tweets_daily_mean), n = n()) mae_sample_table <- mae_sample_table %>% group_by(model_group) %>% summarise(mae = mean(mae), n_tweets_daily_mean = mean(n_tweets_daily_mean, na.rm = TRUE)) %>% mutate(across(c(mae, n_tweets_daily_mean), ~round(., 4))) %>% rename("Model group" = model_group, "MAE" = mae, "Average number of Tweets per day" = n_tweets_daily_mean) mae_sample_table # Save as csv write_csv(mae_sample_table, "output/tables/table3_mae_sample_by_model_group.csv") # Mean daily tweets by model mae_by_model %>% group_by(level) %>% slice_min(mae, n = 3)
# List elections in training data training_elections <- train_data_elex_first_raw %>% distinct(country, date_target) %>% arrange(country, date_target) %>% transmute(Data = "Training", Elections = paste0(country, " (", year(date_target), ")")) train_data_elex_first_adj %>% distinct(country, date_target) # List elections in test data validation_elections <- prediction_master %>% filter(type == "election") %>% distinct(country, date_target) %>% transmute(Data = "Validation", Elections = paste0(country, " (", year(date_target), ")")) # Create table elex_table <- bind_rows(training_elections, validation_elections) %>% arrange(Data, Elections) %>% group_by(Data) %>% summarise(Elections = toString(Elections)) elex_table # Save as csv write_csv(elex_table, "output/tables/table2_elex_for_train_or_vali.csv")
# Table of country statistics tweets_stats_country <- tweets_sub %>% transmute(leader_country, week = floor_date(date, "week"), year = year(date), week_country = paste0(leader_country, week), year_country = paste0(leader_country, year) ) add_total <- function(x) { mutate(x, Total = Afghanistan + Georgia + Mexico + Nigeria + Zimbabwe) } # tweets_total <- tweets_sub %>% # mutate(leader_country = "Total") # # tweets_stats_country <- rbind(tweets_stats_country, tweets_total) # Tweet count per country (n_tweets_per_country <- tweets_stats_country %>% group_by(leader_country) %>% summarise(n = n()) %>% mutate(variable = "Total tweets") %>% pivot_wider(names_from = leader_country, values_from = n) %>% add_total() ) # Average number of tweets per country (n_weekly_tweets_per_country <- rbind(tweets_stats_country, tweets_stats_country %>% mutate(leader_country = "Total")) %>% group_by(leader_country, week_country) %>% summarise(n = n()) %>% group_by(leader_country) %>% summarise(n = mean(n) %>% round(2)) %>% mutate(variable = "Average weekly number of tweets") %>% pivot_wider(names_from = leader_country, values_from = n) ) # Number of unique weeks (n_weeks_per_country <- tweets_stats_country %>% group_by(leader_country, week) %>% summarise(n = n()) %>% group_by(leader_country) %>% summarise(n = n()) %>% mutate(variable = "Number of weeks") %>% pivot_wider(names_from = leader_country, values_from = n) %>% add_total() ) # Years per country (n_years_per_country <- tweets_stats_country %>% group_by(leader_country, year) %>% summarise(n = n()) %>% group_by(leader_country) %>% summarise(n = n()) %>% mutate(variable = "Number of years") %>% pivot_wider(names_from = leader_country, values_from = n) %>% add_total() ) # Subnational regions per country in data (n_regions_1_per_country <- tweets_sf %>% as.data.frame() %>% select(-geometry) %>% group_by(leader_country, region_1) %>% summarise(n = n()) %>% group_by(leader_country) %>% summarise(n = n()) %>% mutate(variable = "Number of subnational regions/levels") %>% pivot_wider(names_from = leader_country, values_from = n) %>% add_total() ) # Number of elections per country (n_elections_per_country <- elex_master %>% group_by(country, elex_date) %>% summarise(n = n()) %>% group_by(country) %>% summarise(n = n()) %>% mutate(variable = "Number of elections") %>% pivot_wider(names_from = country, values_from = n) %>% add_total() ) # Number of unique candidacies and leaders (n_candidates_per_country <- candidates %>% group_by(country, name, elex_date) %>% summarise(n = n()) %>% group_by(country) %>% summarise(n = n()) %>% mutate(variable = "Number of unique candidacies or leaders") %>% pivot_wider(names_from = country, values_from = n) %>% add_total() ) # Number of polls per country (n_polls_per_country <- polling_master %>% group_by(country, date) %>% summarise(n = n()) %>% group_by(country) %>% summarise(n = n()) %>% mutate(variable = "Number of polls") %>% pivot_wider(names_from = country, values_from = n) %>% add_total() ) # Combine into table (by_country_table <- bind_rows(n_tweets_per_country, n_weekly_tweets_per_country, n_weeks_per_country, n_years_per_country, n_regions_1_per_country, n_candidates_per_country, n_elections_per_country, n_polls_per_country) %>% rename(Name = variable) ) # Save as csv write_csv(by_country_table, file = "output/tables/table1_per_country.csv")
# Plot traditional polling and election results # Subset database to include national level rows, only polls poll_plot_data <- targets_master %>% filter(name %in% candidates$name) %>% filter(region_1 == "National") %>% filter(type == "poll") %>% mutate(votes_share = votes_share*100) # Add US polls for comparison poll_plot_data <- bind_rows(poll_plot_data, polling_us %>% mutate(votes_share = votes_share*100) %>% filter(year(date_target) > 2006)) # Plot ggplot(poll_plot_data, aes(x = date_target, y = votes_share)) + geom_point(aes(colour = name), show.legend = FALSE, size = 3, shape = 20) + geom_label_repel(data = poll_plot_data %>% group_by(name) %>% slice_sample(n = 1), min.segment.length = 0.5, size = 5, aes(x = date_target, y = votes_share, label = name)) + facet_wrap(~country# , scales = "free" ) + theme_devpublicopinion + theme(strip.text = element_text(size = text_size+2)) + labs(title = "Traditional polling for developing countries is sparce and clustered over time", subtitle = "% vote share, each point is a traditional poll conducted in the given country; numbers of polls included in this figure are indicative but not exhaustive", x = NULL, y = "%, vote share") + ggsave("output/figures/appendix_polling_is_scarce.png", height = 10, width = 14) # Check numbers # Polls per country poll_plot_data %>% # Add global level mutate(country = "Global") %>% bind_rows(poll_plot_data) %>% filter(type == "poll") %>% distinct(country, date_target) %>% group_by(country) %>% summarise(n = n(), )
# Plot model estimates versus polling and election results
# Summarise tweets per week tweets_n_per_week <- tweets_sub %>% group_by(leader_country, leader, week = floor_date(date, "week")) %>% summarise(n = n()) %>% ungroup() # Mean number of tweets per week tweets_n_per_week %>% group_by(week) %>% summarise(mean = mean(n)) # Plot ggplot(tweets_n_per_week, aes(x = week, y = n)) + geom_col(aes(colour = leader_country), show.legend = FALSE) + facet_wrap(~ leader_country, scales = "free_y", labeller = label_wrap_gen(width = 20), #space = "free" ) + labs(title = "Weekly number of Tweets per country", subtitle = "Number of Tweets per week by country", x = NULL, y = "Number of Tweets") + theme_devpublicopinion + theme(strip.text = element_text(size = text_size+2)) + ggsave("output/figures/tweets_n_per_week.png")
# Number of tweets over time tweets_sf %>% as.data.frame() %>% group_by(has_point, year = year(date)) %>% summarise(n = n()) %>% ungroup() %>% mutate(has_point = case_when(has_point ~ "Geotagged", !has_point ~ "Non-geotagged")) %>% ggplot(., aes(x = year, y = n)) + geom_col(aes(fill = has_point), show.legend = FALSE) + facet_wrap(~has_point, scales = "free_y" ) + scale_y_continuous(labels = comma) + labs(title = "Number of tweets with or without coordinates", subtitle = "Number of Tweets per year that were geotagged or not by the Twitter users" ) + theme_devpublicopinion + ggsave("output/figures/number_of_tweets_has_point.png")
# Share of tweets with no lexicon match for any of their stemmed words (share_tweets_no_senti <- senti_tweets %>% group_by(id, username, language) %>% summarise(afinn_mean = mean(afinn_value, na.rm = TRUE)) %>% mutate(has_afinn = !is.na(afinn_mean), in_english = language == "en") %>% group_by(has_afinn, in_english) %>% summarise(n = n()) )
# Simply GADM for plotting ease boundaries_subnational_simp <- boundaries_subnational %>% st_simplify(., dTolerance = 0.05) # Plot geotagged Tweets on map tweets_geotagged <- tweets_sf %>% filter(has_point) plot_tweets_in_country <- function(plot_country = "Afghanistan"){ ggplot() + geom_sf(data = boundaries_subnational_simp %>% filter(country == plot_country), fill = NA) + geom_sf(data = tweets_geotagged %>% filter(country == plot_country), size = 3, shape = 21, fill = blue_colour) + labs(title = paste0(plot_country)) + theme_devpublicopinion + theme(panel.grid.major = element_blank()) } # Plot and combine maps by country with patchwork (map_afg <- plot_tweets_in_country("Afghanistan")) map_zwe <- plot_tweets_in_country("Zimbabwe") map_geo <- plot_tweets_in_country("Georgia") map_mex <- plot_tweets_in_country("Mexico") map_nga <- plot_tweets_in_country("Nigeria") (map_afg | map_mex | map_zwe)/(map_geo | map_nga) + plot_annotation(title = "Geotagged Tweets by country and subnational regions", subtitle = "The spatial location of geotagged Tweets that mentions a country leader or election candidate", theme = theme_devpublicopinion ) + ggsave("output/figures/maps.png")
# Plot GDL indicators by country gdl_interpo %>% #pivot_longer(cols = c(eye, popshare, phone, cellphone)) %>% #filter(gadm_region == "National") %>% ggplot(., aes(x = phone, fill = country)) + geom_boxplot() + facet_wrap(~year)
# Plot geotagged share of Tweets by actual population tweets_sf_df <- tweets_sf %>% as.data.frame() %>% select(-geometry) %>% mutate(year = year(date)) # Fix national level tweets_sf_df <- add_national_level(tweets_sf_df) # Calculate geotagged Tweets frequency and share geotagged_volume <- tweets_sf_df %>% group_by(year, country, region_1, region_2, has_point) %>% summarise(tweets_region_2_n = n()) %>% filter(has_point & region_1 != "National" | !has_point) %>% # remove points outside of country filter(has_point) %>% group_by(year, country) %>% mutate(tweets_national_share = tweets_region_2_n/sum(tweets_region_2_n)) %>% ungroup() # Add GDL statistics (targets) geotagged_volume <- add_gdl_covariates(geotagged_volume, gdl_interpo) geotagged_volume geotagged_volume %>% group_by(year, country) %>% summarise(popshare = sum(popshare, na.rm = TRUE)) %>% arrange(-popshare) # Plot by country ggplot(geotagged_volume, aes(x = tweets_national_share, y = popshare)) + geom_point(aes(colour = country), show.legend = FALSE) + geom_label_repel(data = geotagged_volume %>% filter(tweets_national_share/popshare > 3) %>% group_by(country) %>% slice_sample(n = 1), aes(x = tweets_national_share, y = popshare, label = region_1), min.segment.length = 0.5, size = 5) + facet_wrap(~country) + labs(title = "Share of geotagged Tweets in subnational region by actual population share") + theme_devpublicopinion # Plot over time # if the correlation becomes closer to one, estimates may get less biased? ggplot(geotagged_volume, aes(x = tweets_national_share, y = popshare)) + geom_point(aes(colour = country), show.legend = FALSE) + geom_smooth(method = "lm") + # geom_label_repel(data = geotagged_volume %>% # filter(tweets_national_share/popshare > 3) %>% # group_by(country) %>% # slice_sample(n = 1), # aes(x = tweets_national_share, # y = popshare, # label = region_1), # min.segment.length = 0.5, # size = 5) + facet_wrap(~year) + scale_x_continuous(limits = c(0, 1)) + scale_y_continuous(limits = c(0, 1)) + labs(title = "Share of geotagged Tweets in subnational region by actual population share") + theme_devpublicopinion
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.