# Format data
# This will serve to decide and describe the countries in focus
# Clean supplementary -----
# UN language population
## Appears to only show population by primary language
## May use rural, urban distinctions for weighting
un_lang_raw %>%
clean_names() %>%
filter(language %in% c("Total", "English")) %>%
mutate(value = round(value)) %>%
pivot_wider(names_from = language, values_from = value) %>%
clean_names() %>%
mutate(english_prop = english/total %>% round) %>%
rename(country = country_or_area) %>%
group_by(country, area, sex) %>%
filter(year == max(year))
# Ethnologue language data, English speakers
ethno <- ethno_raw %>%
mutate(country = if_else(grepl("Hide Details", English), gsub("Hide Details", "", English), NA_character_),
country = if_else(English == "English", "United Kingdom", country),
name = lag(English),
) %>%
fill(country, name) %>%
filter(lag(English) %in% c("User Population", "Location", "Language Status", "Other Comments")) %>%
pivot_wider(names_from = name, values_from = English) %>%
clean_names() %>%
mutate(eng_total = gsub(" .+", "", user_population) %>% gsub(",", "", .) %>% as.integer,
l1 = if_else(grepl("L1", user_population),
gsub(".+ L1 users: ", "", user_population),
NA_character_),
l1_src = str_extract(l1, "(?<=\\().+?(?=\\))"),
l1 = sub(" .+", "", l1) %>% gsub(",", "", .) %>% as.integer,
l1_yr = str_extract(l1_src, "\\d+") %>% as.integer,
l2 = if_else(grepl("L2", user_population),
gsub(".+ L2 users: ", "", user_population),
NA_character_),
l2_src = str_extract(l2, "(?<=\\().+?(?=\\))"),
l2 = sub(" .+", "", l2) %>% gsub(",", "", .) %>% as.integer,
l2_yr = str_extract(l2_src, "\\d+") %>% as.integer,
total_yr = if_else(l2 > l1, l2_yr, l1_yr),
total_yr = if_else(is.na(total_yr), l2_yr, total_yr) %>% if_else(is.na(.), l1_yr, .),
total_src = if_else(is.na(l1_src) & is.na(l2_src),
str_extract(user_population, "(?<=\\().+?(?=\\))"),
NA_character_),
total_yr = if_else(is.na(total_yr),
str_extract(total_src, "\\d+") %>% as.integer,
total_yr)
)
## Subset key variables
ethno_sub <- ethno %>%
select(country, eng_total, total_yr)
# English speakers Wikipedia
wiki_eng <- wiki_raw %>%
slice(-1) %>%
clean_names() %>%
rename(eligible_pop = eligible_population,
total_eng_speak = total_english_speakers,
as_1st_lang = as_first_language,
as_additional = as_an_additional_language) %>%
mutate(across(c(2:5), as.integer),
eng_prop_wiki = total_eng_speak/eligible_pop,
country = str_trim(country),
) %>%
select(country, eng_prop_wiki)
# Corruption
cpi <- cpi_raw %>%
clean_names() %>%
rename_with(~if_else(str_count(., "_") == 2, sub("_", "", .), .)) %>% # remove first _ for those with two
pivot_longer(c(4:ncol(.)),
names_to = c(".value", "year"),
names_pattern = "(.+)_(.+)") %>%
transmute(country, year = as.double(year), cpiscore)
# WGI, Voice and accountability
## paste the first row with the column name and assigning them as column names
wgi <- wgi_raw
colnames(wgi) <- paste0(colnames(wgi_raw), wgi_raw[1, ])
wgi <- wgi %>%
rename_with(~gsub("\\.\\.\\.\\d*", "_", .)) %>%
slice(-1) %>%
pivot_longer(cols = c(3:ncol(.)),
names_to = c("year", "name"),
names_pattern = "(.+)_(.+)",
values_to = "wgi_est"
) %>%
filter(name == "Estimate") %>%
transmute(country = `_Country/Territory`,
year = as.integer(year),
wgi_est = if_else(wgi_est == "#NA", as.double(NA), as.double(wgi_est))
)
# UN population estimates
pop <- pop_raw %>%
clean_names() %>%
rename(country = location, year = time) %>%
mutate(across(c(7:10), ~.*1000)) %>%
filter(variant == "Medium") %>%
filter(year %in% c(1989:2021)) %>%
select(country, year, pop_total)
# GDP PPP data
gdp_ppp <- gdp_ppp_raw %>%
pivot_longer(c(5:ncol(.)), names_to = "year", values_to = "gdp_ppp") %>%
clean_names() %>%
filter(gdp_ppp != "X66") %>%
transmute(country = country_name, year = as.integer(year), gdp_ppp)
# Twitter users, January 2020
hootsuite <- hootsuite_raw %>%
transmute(country,
twitter_users = as.numeric(users)*1000) # thousands
# Bind supplementary ------
supp <- cpi %>%
full_join(ethno_sub, by = c("country", "year" = "total_yr")) %>%
left_join(wiki_eng, by = "country") %>%
left_join(pop, by = c("country", "year")) %>%
left_join(gdp_ppp, by = c("country", "year")) %>%
left_join(wgi, by = c("country", "year")) %>%
left_join(hootsuite, by = c("country")) %>%
mutate(eng_prop = eng_total/pop_total,
gdp_ppp_pc = gdp_ppp/pop_total,
twitter_users_pc = twitter_users/pop_total
) %>%
filter(!is.na(eng_prop))
# Format sentiment lexicons ----
## Prepare afinn by stemming and finding mean value of words with same stem
afinn_stem <- afinn %>%
mutate(stem = SnowballC::wordStem(word)) %>%
group_by(stem) %>%
summarise(afinn_value = max(value))
senti_lexicons <- afinn %>%
full_join(bing %>% rename(bing_sentiment = sentiment)) %>%
full_join(nrc %>% rename(nrc_sentiment = sentiment))
# Format GADM boundary data ------
## National boundaries ----
boundaries_national <- gadm_nat_raw %>%
clean_names() %>%
transmute(country = as.character(name_0),
geometry
)
## Sub-national boundaries ----
boundaries_subnational <- gadm_sub_raw %>%
clean_names() %>%
transmute(country = as.character(name_0),
region_1 = as.character(name_1),
engtype_1,
region_2 = as.character(name_2),
engtype_2,
geometry
) %>%
# correct typos or adapt before joining with other objects, e.g. election data
mutate(region_1 = case_when(region_1 == "Nassarawa" ~ "Nasarawa",
TRUE ~ region_1))
# Vector of subnational boundary names
subnational_names <- boundaries_subnational %>%
as.data.frame() %>%
select(-geometry)
# Format REIGN ----
## Leadership and term variables
reign <- reign_raw %>%
clean_names() %>%
select(country, name = leader, year, month) %>%
# filter countries
filter(country %in% c("Nigeria", "Zimbabwe", "Georgia", "Mexico", "Afghanistan")) %>%
# get start and end of term for each leader, need to use row_number to get last term of last country
filter(lag(name) != name | lead(name) != name | row_number() == n()) %>%
# note term number, several sequentially count as one
mutate(date_type = if_else(lead(name) != name | row_number() == n(), "term_end", "term_start"),
date = paste0(year, "-", str_pad(month, 2, "left", pad = "0"), "-01"),
date = as.Date(date)
) %>%
# drop leaders with terms that ended before 2006 (Twitter's founding)
filter(!(year < 2006 & date_type == "term_end" | date_type == "term_start" & lead(year) < 2006)) %>%
select(country, name, date_type, date) %>%
group_by(country, name, date_type) %>%
# count terms per leader - NEED TO ADJUST FOR NAMES, e.g. Løkke, father-son?
mutate(term_n = paste0("term ", 1:n())) %>%
ungroup() %>%
pivot_wider(names_from = date_type, values_from = date) %>%
mutate(term_start = if_else(is.na(term_start), term_end, term_start),
) %>%
# match between reign and candidates objects
left_join(., name_lookup, by = c("name" = "from")) %>%
mutate(common = if_else(is.na(common), name, common)) %>%
select(-name) %>%
rename(start = term_start, end = term_end, name = common) %>%
# drop NA names, i.e. countries or leaders not included in look-up
filter(!is.na(name))
# Format election results -------
## NGA Nigeria presidential election ----
# from inspecting Stears website
## Initial formatting
nga_p_19 <- stears_19_raw %>%
as_tibble() %>%
select(president) %>%
unnest(cols = c(president)) %>%
unnest(cols = c(president)) %>%
filter(!is.na(candidate))
nga_p_15 <- stears_15_raw[1] %>%
as_tibble() %>%
unnest(cols = c(stateData))
# missing 2011 election, have attempted to contact INEC (Independent National Electoral Commission, Nigeria)
# Bind and save Nigerian presidential election data
nga_pres <- bind_rows(nga_p_15, nga_p_19) %>%
rename(name = candidate) %>%
mutate(across(c(total_votes, votes), ~gsub(",", "", .) %>% as.integer),
year = as.integer(year),
# election date, from Wikipedia, if held over several days, take first day
elex_date = case_when(year == 2019 ~ as.Date("2019-02-23"),
year == 2015 ~ as.Date("2015-03-28")
),
country = "Nigeria"
)
# Add state names from abbreviations by joining look-up table
nga_pres <- nga_pres %>%
full_join(., nga_state_abbrev %>% select(-official_abbrev),
by = c("state" = "stears_abbrev")) %>%
select(elex_date, country, region_1 = state_name, name, votes)
# Add national count rows to Nigeria (to match with Tweets without points)
nga_pres <- nga_pres %>%
mutate(region_1 = "National") %>%
group_by(elex_date, country, region_1, name) %>%
summarise(votes = sum(votes)) %>%
ungroup() %>%
bind_rows(nga_pres)
## AFG Afghanistan president data ----
afg_19 <- afg_19_raw %>%
rename(province = name,
total = votes
) %>%
pivot_longer(cols = c(4:ncol(.)), values_to = "votes") %>%
mutate(year = 2019)
afg_14 <- afg_14_raw %>%
rename(province = name,
total = votes,
total_population = totalPopulation
) %>%
pivot_longer(cols = c(5:ncol(.)), values_to = "votes") %>%
mutate(year = 2014)
afg_09 <- afg_09_raw %>%
rename(province = name
) %>%
pivot_longer(c(8:ncol(.)), values_to = "votes") %>%
clean_names() %>%
mutate(year = 2009)
# Bind Afghanistani elections
afg_pres <- bind_rows(afg_09, afg_14) %>%
bind_rows(afg_19) %>%
filter(!name %in% c("votes")) %>%
# add election date, first day if several (but second round)
mutate(elex_date = case_when(year == 2009 ~ as.Date("2009-08-20"),
year == 2014 ~ as.Date("2014-06-14"), # second round
year == 2019 ~ as.Date("2019-09-28")
),
country = "Afghanistan"
) %>%
transmute(elex_date, country, region_1 = province, name, votes)
# Add national level
afg_pres <- afg_pres %>%
mutate(region_1 = "National") %>%
group_by(elex_date, country, region_1, name) %>%
summarise(votes = sum(votes, na.rm = TRUE)) %>%
ungroup() %>%
bind_rows(afg_pres)
## GEO Georgia presidential elections ----
geo_08 <- geo_08_raw %>%
pivot_longer(cols = c(6:12), names_to = "name", values_to = "votes_share") %>%
clean_names() %>%
filter(map_level %in% c("Country", "District")) %>%
select(map_level, country_name, name, votes_share, votes_total = total_voter_turnout_number) %>%
mutate(elex_date = as.Date("2008-01-05"),
votes_share = as.double(votes_share))
geo_13 <- geo_13_raw %>%
pivot_longer(cols = c(6:28), names_to = "name", values_to = "votes_share") %>%
clean_names() %>%
filter(map_level %in% c("Country", "District")) %>%
select(map_level, country_name, name, votes_share, votes_total = total_voter_turnout_number) %>%
mutate(elex_date = as.Date("2013-10-27"))
geo_pres <- bind_rows(geo_08, geo_13) %>%
transmute(elex_date,
country = "Georgia",
region_2 = case_when(map_level == "Country" ~ "National",
map_level == "District" ~ country_name),
name,
votes_share = votes_share/100,
votes_total)
## MEX Mexico presidential ----
# 2012
mex_12 <- mex_12_raw %>%
pivot_longer(cols = 5:10, names_to = "movement", values_to = "votes") %>%
clean_names() %>%
mutate(region_1 = name_state %>% str_to_title,
name = case_when(
# Peña Nieto
movement == "COMMITMENT TO MEXICO" ~ "Pena Nieto",
# Obrador
movement == "PROGRESSIVE MOVEMENT" ~ "Obrador",
# Quadria
movement == "NEW ALLIANCE" ~ "Quadri",
# Vazquez mota
movement == "BREAD" ~ "Vázquez Mota",
TRUE ~ movement)
) %>%
select(region_1, name, votes, total_votes)
mex_12 <- mex_12 %>%
group_by(region_1, name) %>%
summarise(votes = sum(votes, na.rm = TRUE)) %>%
mutate(elex_date = as.Date("2012-07-01"))
# 2018
mex_18 <- mex_18_raw %>%
clean_names() %>%
pivot_longer(cols = 13:35, names_to = "party", values_to = "votes") %>%
transmute(region_1 = str_to_title(nombre_estado),
party, votes,
total_votes = total_votos_calculados) %>%
mutate(name = case_when(
# Anaya cortes
party %in% c("pan", "prd", "mc",
"pan_prd_mc", "pan_prd",
"pan_mc",
"prd_mc",
"movimiento_ciudadano") ~ "Anaya",
# Lopez obrador
party %in% c("pt", "morena", "pes",
"pt_morena_pes", "pt_morena",
"pt_pes", "encuentro_social",
"morena_pes") ~ "Obrador",
# Meade kuribre
party %in% c("pri", "pvem", "nueva_alianza",
"pri_pvem_na", "pri_pvem",
"pri_na",
"pvem_na") ~ "Meade",
TRUE ~ "Other")
)
mex_18 <- mex_18 %>%
group_by(region_1, name) %>%
summarise(votes = sum(votes, na.rm = TRUE)) %>%
mutate(elex_date = as.Date("2018-07-01"))
# Bind Mexican years
mex_pres <- bind_rows(mex_12, mex_18) %>%
# fix region names to match GDL, old to new with recode()
mutate(region_1 = recode(region_1,
"Nuevo Le”N" = "Nuevo León",
"San Luis Potosi" = "San Luis Potosí",
"San Luis Potosõ" = "San Luis Potosí",
"Federal District" = "Distrito Federal")) %>%
mutate(country = "Mexico") %>%
ungroup()
# Add national level
mex_pres <- mex_pres %>%
mutate(region_1 = "National") %>%
group_by(elex_date, country, region_1, name) %>%
summarise(votes = sum(votes, na.rm = TRUE)) %>%
ungroup() %>%
bind_rows(mex_pres)
## ZWE Zimbabwe presidential ----
zwe_13 <- zwe_13_raw %>%
pivot_longer(2:ncol(.), names_to = "name", values_to = "votes") %>%
rename(region_1 = Province) %>%
mutate(elex_date = as.Date("2013-07-31")) %>%
filter(!name %in% c("Votes Rejected",
"Total Votes Cast"))
zwe_18 <- zwe_18_raw %>%
pivot_longer(2:8, names_to = "name", values_to = "votes") %>%
rename(region_1 = Province) %>%
select(-`Valid votes`) %>%
mutate(elex_date = as.Date("2018-07-3"))
# Bind Zimbabwe
zwe_pres <- bind_rows(zwe_13, zwe_18) %>%
filter(!region_1 %in% c("Total", "Percent")) %>%
mutate(country = "Zimbabwe",
name = case_when(name == "Mugabe Robert Gabriel (ZANU PF)" ~ "Mugabe",
name == "Tsvangirai Morgan (MDC-T)" ~ "Tsvangirai",
name == "Dabengwa Dumiso (ZAPU)" ~ "Dabengwa",
name == "Ncube Welshman (MDC)" ~ "Ncube",
name == "Mukwazhe Munodei Kisinoti (ZDP)" ~ "Mukwazhe",
TRUE ~ name
)
)
zwe_pres$name %>% unique
# Add national level
zwe_pres <- zwe_pres %>%
mutate(region_1 = "National") %>%
group_by(elex_date, country, region_1, name) %>%
summarise(votes = sum(votes, na.rm = TRUE)) %>%
ungroup() %>%
bind_rows(zwe_pres)
## Combine different countries' elections ----
# Bind countries
elex_combined <- bind_rows(afg_pres, nga_pres) %>%
bind_rows(., geo_pres) %>% # Georgia
bind_rows(mex_pres) %>% # Mexico
bind_rows(zwe_pres) %>% # Zimbabwe
left_join(., name_lookup, by = c("name" = "from")) %>%
mutate(name = if_else(is.na(common), name, common)) %>%
select(-common) %>%
mutate(region_2 = if_else(is.na(region_2), region_1, region_2))
# Format master election object
elex_master <- elex_combined %>%
group_by(elex_date, country, region_1, region_2) %>%
mutate(votes_total = sum(votes)) %>%
ungroup() %>%
# add votes share if not already there
# votes shares before for Georgia
mutate(votes_share = if_else(is.na(votes_share), votes/votes_total, votes_share)) %>%
mutate(region_1 = if_else(is.na(region_1), region_2, region_1),
region_2 = if_else(is.na(region_2), region_1, region_2)
)
# Subset two candidates per election with most votes for Tweet collection
candidates <- elex_master %>%
filter(region_1 == "National" | region_2 == "National") %>%
filter(!name %in% c("Other", "other")) %>%
group_by(elex_date, country, region_1) %>%
#mutate(winner = if_else(votes_share = max(votes_share) == ))
slice_max(votes_share, n = 2) %>%
ungroup() %>%
arrange(country, elex_date)
## Unique candidates
candidates %>% distinct(country, elex_date, name) #%>% filter(country == "Georgia")
# Format polling data -----
## Ad hoc polling ----
polling_adhoc <- polling_adhoc_raw %>%
mutate(date = as.Date(date)) %>%
select(-source)
## Country sheets ----
polling_mex <- polling_mex_raw %>%
mutate(date = as.Date(date, "%d-%m-%y")) %>%
select(-company, -remarks) %>%
pivot_longer(cols = 2:6, names_to = "leader", values_to = "votes_share") %>%
mutate(country = "Mexico",
region_1 = "National",
region_2 = "National",
# old to new leader names
leader = recode(leader,
"obrador" = "Obrador",
"anaya" = "Anaya",
"nieto" = "Pena Nieto",
"quadri" = "Quadri",
"vazquez" = "Vázquez Mota")
) %>%
filter(!is.na(votes_share))
# US,
# Note this is approval rating
polling_us_elliott <- polling_us_elliott_raw %>%
clean_names() %>%
transmute(country = "United States",
region_1 = "National",
region_2 = "National",
name = president,
date_target = as.Date(start_date, format = "%m/%d/%Y"),
votes_share = approving/100)
polling_us_538 <- polling_us_538_raw %>%
clean_names() %>%
transmute(name = politician,
date_target = as.Date(start_date, format = "%m/%d/%y"),
votes_share = yes/100)
polling_us_around_elex <- bind_rows(polling_us_2008_raw %>%
clean_names() %>%
transmute(date_target = as.Date(start_date, format = "%m/%d/%y"),
obama,
mccain),
polling_us_2012_raw %>%
clean_names() %>%
transmute(date_target = as.Date(start_date, format = "%m/%d/%y"),
obama,
romney),
polling_us_2016_raw %>%
clean_names() %>%
filter(population == "Likely Voters") %>%
transmute(date_target = as.Date(start_date, format = "%m/%d/%y"),
trump,
clinton)
) %>%
pivot_longer(cols = c(obama, romney, mccain, trump, clinton), values_to = "votes_share", values_drop_na = TRUE) %>%
mutate(votes_share = votes_share/100)
# Bind together
polling_us <- bind_rows(polling_us_elliott, polling_us_538, polling_us_around_elex) %>%
mutate(country = "United States",
region_1 = "National",
region_2 = "National",
name = recode(name,
"Bush 2" = "Bush Jr",
"mccain" = "McCain",
"obama" = "Obama",
"clinton" = "Clinton",
"trump" = "Trump",
"Donald Trump" = "Trump",
"romney" = "Romney")
)
## Combine country sheets and ad hoc ----
polling_master <- bind_rows(polling_adhoc, polling_mex)
polling_master %>% distinct(leader)
# ## Afrobarometer rounds ----
# # q99 Vote for which party in r7
# # q40 is How much fear political intimidation or violence
# afro_r7 <- afro_r7_raw %>%
# mutate(across(where(is.labelled),
# ~as_factor(., levels = "labels", ordered = TRUE) %>% trimws #%>% str_squish %>% tolower
# )
# )
#
# afro_r7 <- afro_r7 %>%
# clean_names() %>%
# select(country, region, q99, intimidation = q40, dateintr, withinwt)
#
# # q99 is Vote for which party in r6
# # q49 is How much fear political intimidation or violence
# afro_r6 <- afro_r6_raw %>%
# mutate(across(where(is.labelled),
# ~as_factor(., levels = "labels", ordered = TRUE) %>% trimws #%>% str_squish %>% tolower
# )
# )
#
# afro_r6 <- afro_r6 %>%
# clean_names() %>%
# select(country, region, q99, intimidation = q49, dateintr, withinwt)
#
#
# # q54 is How much fear political intimidation or violence
# # q99 is Vote for which party in round 5
# afro_r5 <- afro_r5_raw %>%
# mutate(across(where(is.labelled),
# ~as_factor(., levels = "labels", ordered = TRUE) %>% trimws #%>% str_squish %>% tolower
# )
# )
#
# afro_r5 <- afro_r5 %>%
# clean_names() %>%
# select(country, region, q99, intimidation = q54, dateintr, withinwt)
#
# # Bind Afrobaro rounds
# afro_all <- bind_rows(afro_r7, afro_r6) %>%
# bind_rows(afro_r5) %>%
# filter(country %in% elex_master$country) %>%
# rename(date = dateintr,
# vote = q99,
# weight = withinwt) %>%
# # pivot longer
# pivot_longer(cols = c(vote, intimidation)) %>%
# mutate(value = str_remove_all(value, "'|’")) %>%
# # old to new, party to leader
# mutate(value = case_when(
# # Goodluck Jonathan
# value %in% c("Peoples Democratic Party (PDP)"
# ) & year(date) < 2016 ~ "Goodluck Jonathan",
# # Atiku
# value %in% c("Peoples Democratic Party (PDP)"
# ) & year(date) > 2015 ~ "Atiku",
# # Buhari
# # Parties that were merged into APC as well
# value %in% c("All Progressive Congress (APC)",
# "All Progressive Congres (APC)",
# "Action Congress of Nigeria (ACN)",
# "All Nigeria Peoples Party (ANPP)",
# "Conscience Peoples Congress (CPC)",
# "All Peoples Party (APP)",
# "Advanced Congress of Democrats (ACD)",
# "Alliance for Democracy (AD)"
# ) ~ "Buhari",
# TRUE ~ value)
# )
#
# afro_all %>% distinct(country, year(date))
#
# afro_share <- afro_all %>%
# mutate(month = floor_date(date, "month")) %>%
# # calculate number of respondents
# group_by(country, region, month, name, value, weight) %>%
# summarise(n = n()) %>%
# # find share
# group_by(country, region, month, name, weight) %>%
# mutate(share = n/sum(n)) %>%
# ungroup()
#
# afro_share %>% distinct(country, month)
#
# # Find weighted mean
# afro_weighted <- afro_share %>%
# group_by(country, region, month, name, value) %>%
# summarise(share = weighted.mean(share, weight = weight),
# n = n()) %>%
# ungroup() %>%
# arrange(country, region, month)
#
# afro_weighted %>%
# filter(name == "vote") %>%
# group_by(country, value) %>%
# summarise(n = sum(n)) %>%
# arrange(-n) %>% view
# slice_max(share, n = 3) %>%
# ungroup() %>%
# distinct(country, value)
# Format GDL ----
names(gdl_raw)
gdl <- gdl_raw %>%
filter(level %in% c("Subnat", "National")) %>%
mutate(region = if_else(region == "Total", "National", region)) %>%
# subset countries included in election dataset
filter(country %in% unique(elex_master$country)) %>%
select(country, region, year,
eye, popshare, phone, cellphone) %>%
arrange(country, region, year)
## Create GDL to GADM region look-up ------
# Below we test ad hoc which are missing - I then update recode() function
gdl_to_gadm_regions <- gdl %>%
distinct(country, region) %>%
mutate(gdl_region = region) %>%
mutate(new = str_replace_all(gdl_region, "\\(|\\)", "")) %>%
separate(new, into = letters[seq(1, 10)], sep = "([, ? ])") %>%
pivot_longer(cols = c(region, letters[seq(1, 10)]), values_to = "gadm_region") %>%
select(-name) %>%
filter(!is.na(gadm_region)) %>%
# gdl (old) to gadm (new) with recode
mutate(gadm_region = recode(gadm_region,
"Abuja FCT" = "Federal Capital Territory",
"Helmand" = "Hilmand",
"Daikundi" = "Daykundi",
"Herat" = "Hirat",
"Nooristan" = "Nuristan",
"Panjsher" = "Panjshir",
"Sar-e-Pul" = "Sari Pul",
"Nassarawa" = "Nasarawa",
"Zamfora" = "Zamfara",
"Mexico" = "México",
"Michoacan" = "Michoacán",
"Nuevo Leon" = "Nuevo León",
"Queretaro" = "Querétaro",
"Potosi" = "San Luis Potosí", # was split up by separate()
"Yucatan" = "Yucatán",
"Matebeleland North" = "Matabeleland North",
"Matebeleland South" = "Matabeleland South",
"Racha-Lochkhumi" = "Racha-Lechkhumi-Kvemo Svaneti", # split up because of space
"Samegrelo-Zemo Svateni" = "Samegrelo-Zemo Svaneti"
)) %>%
distinct(gdl_region, gadm_region) %>%
# remove gadm_names with parantheses
filter(!str_detect(gadm_region, "\\(|\\)")) %>%
# remove if name doesn't exist in GADM
filter(gadm_region %in% subnational_names$region_1 | gadm_region %in% subnational_names$region_2)
# Check if the look-up misses any GADM names
subnational_names %>%
pivot_longer(cols = c(region_1, region_2)) %>%
filter(!is.na(value)) %>%
#filter(!country == "Georgia" & !name == "region_2") %>%
filter(name == "region_1") %>%
filter(!value %in% gdl_to_gadm_regions$gadm_region)
## Add GADM names to GDL----
gdl_w_gadm <- gdl %>%
left_join(.,
gdl_to_gadm_regions,
by = c("region" = "gdl_region")) %>%
# Fix that GDL national observations weren't matched
mutate(gadm_region = if_else(region == "National", "National", gadm_region)) %>%
select(-region) %>%
select(country, gadm_region, year, everything()) %>%
arrange(country, gadm_region, year) %>%
distinct()
# Choose one observation when a region appears more than once in a year
# Choose that which is smallest/lowest pop share, implying more precise data
gdl_w_gadm <- gdl_w_gadm %>%
group_by(country, gadm_region, year) %>%
slice_min(popshare, n = 1) %>%
ungroup()
## Fill missing years with linear interpolation -----
gdl_interpo <- gdl_w_gadm %>%
group_by(country, gadm_region) %>%
complete(year = 2006:2021) %>% # Twitter founded in 2006
arrange(country, gadm_region, year) %>%
# linear interpolation for missing years
# extrapolate with rule = 2: the value at the closest data extreme is used
mutate(
across(c(eye, popshare, cellphone, phone), ~zoo::na.locf(., na.rm = FALSE) %>% zoo::na.locf(., fromLast = TRUE)),
#across(c(eye, popshare, cellphone, phone), ~zoo::na.approx(., na.rm = FALSE, rule = 2)),
across(c(eye, popshare, cellphone, phone), ~./100),
#popshare = if_else(gadm_region == "National", as.double(NA_integer_), popshare) # if national, popshare is NA
) %>%
ungroup()
# CHeck popshare summarise
gdl_interpo %>% filter(gadm_region != "National") %>%
group_by(country, year) %>% summarise(popshare = sum(popshare, na.rm = TRUE))
# Create targets master -----
# Check that polling names all match a leader name in election data
unique(polling_master$leader) %in% unique(elex_master$name)
targets_master <- bind_rows(polling_master %>%
rename(name = leader) %>%
mutate(type = "poll"),
elex_master %>%
rename(date = elex_date) %>%
mutate(type = "election")
) %>%
rename(date_target = date) %>%
arrange(date_target) %>%
select(-c(votes, votes_total))
## Add GDL statistics -----
# covariates <- polling_elex_master %>%
# mutate(id = row_number(),
# year = year(date)) %>%
# # Join by region_2 because it is equal to region_1
# # if the country did not have that level to begin with
# left_join(gdl_interpo,
# by = c("year", "country", "region_2" = "gadm_region"))
#
# # Check if any regions appear twice in same year
# covariates %>% group_by(id) %>% filter(n() > 1) %>% distinct(region_1)
#
# # Check regions in the rows that don't match with GDL
# covariates %>% filter(is.na(popshare) & is.na(phone)) %>% distinct(country, region_1)
#
# anti_join(polling_elex_master %>% mutate(year = year(date)),
# gdl_interpo,
# by = c("year", "country", "region_2" = "gadm_region")) %>%
# distinct(country, region_1) %>% view()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.