# data-raw/cow_gw_years.R

library(tidyverse)
library(countrycode)
library(lubridate)
library(peacesciencer)

# I'll have to do this manually. The dates are going to be weird, but I can fix that. I'll scan mostly for code discrepancies.
# GW don't have Dominica (54), Grenada (55), St. Lucia (56), SVG (57), A&B (58), and St. Kitts and Nevis (60)
# GW have the United Provinces of Central America as gwcode == 89. This was the union of Guatemala (90), Honduras (91),
#   El Salvador (92), Costa Rica (94), and Nicaragua (93). GW have this from July 1, 1823 to Dec. 31, 1839.
#   What's at stake: all those successor states have Jan. 1, 1840 starts in GW. Start dates in CoW are a little more scattered.
#   CoW has Guatemala at Jan. 1, 1868, Honduras at Jan. 1, 1899, El Salvador at Jan. 1, 1875, Nicaragua at Jan. 1, 1900,
#   and Costa Rica at Jan. 1, 1920. That's pretty remarkable to have such a wild discrepancy.

# Here's a biggie: GW have Gran Colombia (gwcode == 99) from Aug. 30, 1821 to Sept. 22, 1830. That encompassed Colombia (duh),
# Ecuador, Panama, Venezuela, and even parts of northern Peru and northwestern Brazil. For those successor states, GW have
# Venezuela as emerging on Jan. 1, 1829 (while still in GCL, which is weird), Ecuador on May 13, 1830, Peru (which was only
# partly in GCL) on Dec. 9, 1824, and, lest we forget, Colombia emerging on Sept. 23, 1830. The country codes are identical
# to CoW.

# GW don't have Monaco (221), Liechtenstein (223), Andorra (232), and San Marino (331).

# Of note: Germany dies 1945 in the GW data. GW interpret (reasonably, but unfortunately from a CoW data perspective) that
# unification amounts to a consolidation of East Germany (265) into West Germany (260).

# The big enchilada: Serbia. GW have Serbia as 340 before and after Yugoslavia. Yugoslavia, for when it existed, is 345.
# This stands at odds with the CoW interpretation (which I tend to favor): Yugoslavia had a Serbian center. Serbia preceded
# Yugoslavia and should be understood as predecessor state, successor state, and the center of gravity (if you will) of
# Yugoslavia for when Yugoslavia was a thing.

# GW don't have Sao Tome and Principe (403) and Seychelles (591)
# CoW doesn't have Transvaal (563), Orange Free State (564)

# Another headache: GW treat YPR (680) as merging into YAR (678), the interpretation I also share. CoW treats the successor
# state as a new entity (ccode == 679).

# GW have Tibet (711). CoW does not.

# Another of the familiar headaches: GW has three Vietnams: 815 (during and preceding the French consolidation of the region),
# Vietnam proper (816), and RVN (1954-1975).


# Hmm... how about this. Let's, in particular, focus on the stateabbs. That's the closest to a perfect overlap between mutual
# observations. There are a few discrepancies though. Let's do this with a full_join(), but starting with the GW data (which
# seem to have the broadest scope, at least in terms of days). Where applicable, let's standardize to CoW.

# Expand every Gleditsch-Ward state spell into one row per calendar day.
# A handful of GW state abbreviations are first recoded so that they line up
# with the abbreviations used in the CoW data (the later join key).
gwdays <- gw_states %>%
  mutate(
    stateabb = case_when(
      stateabb == "HSD" ~ "HSG",
      # Justification for Serbia: the dates don't at all overlap, nor should they.
      stateabb == "SER" ~ "YUG",
      stateabb == "RUM" ~ "ROM",
      stateabb == "FJI" ~ "FIJ",
      TRUE ~ stateabb
    ),
    # Spells that run to the last recorded GW day (2017-12-31) are carried
    # forward to the end of 2020.
    enddate = if_else(
      enddate == as_date("2017-12-31"),
      as_date("2020-12-31"),
      enddate
    )
  ) %>%
  # Row by row, build the full daily sequence for the spell, then unnest it
  # so each day becomes its own row.
  rowwise() %>%
  mutate(day = list(seq(startdate, enddate, by = "1 day"))) %>%
  unnest(day) %>%
  # Marker column flagging that this state-day exists in the GW data.
  mutate(gwday = 1) %>%
  select(gwcode, stateabb, gw_statename = statename, day, gwday)


# Expand every Correlates of War state spell into one row per calendar day.
cowdays <- cow_states %>%
  mutate(
    # Assemble proper dates from the separate year/month/day columns.
    stdate = ymd(paste(styear, stmonth, stday, sep = "/")),
    enddate = ymd(paste(endyear, endmonth, endday, sep = "/")),
    # Spells that run to the last recorded CoW day (2016-12-31) are carried
    # forward to the end of 2020.
    enddate = if_else(
      enddate == as_date("2016-12-31"),
      as_date("2020-12-31"),
      enddate
    )
  ) %>%
  select(stateabb:statenme, stdate, enddate) %>%
  rename(cow_statename = statenme) %>%
  # Row by row, build the full daily sequence for the spell, then unnest it
  # so each day becomes its own row.
  rowwise() %>%
  mutate(day = list(seq(stdate, enddate, by = "1 day"))) %>%
  unnest(day) %>%
  # Marker column flagging that this state-day exists in the CoW data.
  mutate(cowday = 1) %>%
  select(-stdate, -enddate)

# Full outer join of the two state-day panels. The key is spelled out
# (state abbreviation + calendar day) rather than left to dplyr's natural-join
# guess, so an accidentally shared column can never silently change the match.
cow_gw_days <- full_join(gwdays, cowdays, by = c("stateabb", "day"))

# Collapse the state-day panel to one row per ccode-year.
# Within a ccode-year that still has several distinct rows, a missing gwcode
# is replaced by the largest non-missing gwcode in that group before the
# first row is kept.
# NOTE(review): rows whose ccode is missing fall into a single group per
# year, so slice(1) keeps only one of them -- confirm this is intended.
cow_gw_years <- cow_gw_days %>%
  mutate(year = year(day)) %>%
  distinct(stateabb, year, gwcode, ccode, gw_statename, cow_statename) %>%
  group_by(ccode, year) %>%
  mutate(
    gwcode = ifelse(
      is.na(gwcode) & n() > 1,
      max(gwcode, na.rm = TRUE),
      gwcode
    )
  ) %>%
  slice(1) %>%
  ungroup()

# gwdays %>% full_join(., cowdays) -> cow_gw_days
#
# cow_gw_days %>% mutate(year = year(day)) %>% distinct(stateabb, year, gwcode, ccode, gw_statename, cow_statename) -> cow_gw_years
#
# cow_gw_years %>% select(stateabb, year, gwcode, ccode, gw_statename, cow_statename) -> cow_gw_years
#
# cow_gw_years %>% filter(!is.na(ccode)) %>% group_by(ccode,year) %>% filter(n() > 1) %>% arrange(ccode) %>% data.frame

# ^ There are still about 24 duplicate ccode-years. These are cases where the ccode appears twice but the gwcode just once.

# Upon further review, there's going to be some weirdness for cases where the stateabbs differ because of a change or some other oddity.
# Consider Yemen here. GW have one big ol' Yemen from 1918 to 2017 (stateabb: YEM). CoW has the two Yemens, which has YAR for the
# predecessor (ccode: 678) and YEM for the successor state (ccode: 679). Let's manually fix this.

# We'll do the same for Germany after 1990. Justification: we used state abbs to get started, but we really want some consistency in the codes.
# We'll also manage duplicates in the functions that use the underlying data.

# Manual gwcode overrides where matching on state abbreviation is not enough:
# rows abbreviated "GMY" from 1990 onward get gwcode 260, and every row
# abbreviated "YAR" gets gwcode 678. All other rows keep their gwcode.
cow_gw_years <- cow_gw_years %>%
  mutate(
    gwcode = case_when(
      year >= 1990 & stateabb == "GMY" ~ 260,
      stateabb == "YAR" ~ 678,
      TRUE ~ gwcode
    )
  )

# So, I think this is going to have to do. You can't group-by and slice/filter(max) because you'll omit cases where there's a gwcode
# but no ccode. I'm just going to leave this as is. I think there's workable stuff in here, no matter.

# Diagnostic only (the result is not assigned): list every ccode-year that
# still appears more than once among rows with a non-missing ccode.
cow_gw_years %>%
  filter(!is.na(ccode)) %>%
  group_by(ccode, year) %>%
  filter(n() > 1) %>%
  arrange(ccode, year)

# But let's do a group_by fill, though.
# I had encountered a situation where I had thought left_join was smart enough to overcome my laziness.
# However, it meant that some gwcodes weren't registering.

# Within each ccode-year, propagate any known gwcode onto rows where it is
# missing, then keep a single row per group.
# The fill runs in both directions ("downup"): a plain downward fill can never
# reach the first row of a group, which is exactly the row slice(1) retains.
# The grouping is dropped at the end so later steps see an ungrouped tibble.
cow_gw_years <- cow_gw_years %>%
  group_by(ccode, year) %>%
  fill(gwcode, .direction = "downup") %>%
  slice(1) %>%
  ungroup()

# In case this doesn't register...
# Hard-code the GW state names for gwcodes 271 and 437 using the spellings
# below (presumably to sidestep character-encoding trouble -- confirm).
# Every other row keeps its existing gw_statename.
cow_gw_years <- cow_gw_years %>%
  mutate(
    gw_statename = case_when(
      gwcode == 271 ~ "Wuerttemberg",
      gwcode == 437 ~ "Cote D'Ivoire",
      TRUE ~ gw_statename
    )
  )

# Write the finished lookup table to the package data directory.
save(cow_gw_years, file = "data/cow_gw_years.rda")
# svmiller/peacesciencer documentation built on April 3, 2025, 1:15 p.m.