In zdaarau/c2d: Download Data from the Referendum Database (RDB), Which Covers Direct Democratic Votes Worldwide

NOTES

This file is not included in the source package because of the .nopurl suffix in its filename.
The chunks below have to be manually executed in order to regenerate the package data.
Although the datasets below are saved as "internal data" in R/sysdata.rda, they can still be exported and documented (by documenting the dataset's quoted name in the main .Rmd source file -- which only works when the dataset is also @exported), something not explicitly mentioned in the book R Packages. To do so, you first need to manually add the export() directive in the NAMESPACE file since roxygen2 won't add it automatically.

Setup

library(rlang,
        include.only = c("%|%", "%||%"))
library(magrittr,
        include.only = c("%>%", "%<>%", "%T>%", "%!>%", "%$%"))

Define data

`data_codebook`

NOTES:

In the following cases, a variable's prototype (ptype) has to be explicitly set in the raw codebook.Rmd:
- It's a floating point number. -> Set ptype = "numeric()".
- It's a complex number. -> Set ptype = "complex()".
- It's non-basic data type, i.e. a structure like a date. Set ptype to the proper value like ptype = "vctrs::new_date()".

data_codebook <- {

  codebook <- pal::toml_read("data-raw/rdb_codebook.toml")

  c(codebook$`01_main`$item,
    codebook$`02_institutional`$`01_status`$item,
    codebook$`02_institutional`$`02_trigger`$item,
    codebook$`02_institutional`$`03_object`$item,
    codebook$`02_institutional`$`04_other`$item) |>
    purrr::map(~ tibble::tibble(variable_name = .x$variable_name %||% NA_character_,
                                variable_name_print = .x$variable_name_print %||% NA_character_,
                                variable_name_unnested = .x$variable_name_unnested %||% variable_name,
                                variable_name_unnested_print = .x$variable_name_unnested_print %||% variable_name_print,
                                variable_label = .x$variable_label %||% NA_character_,
                                is_multi_valued = .x$is_multi_valued %||% FALSE,
                                is_nested = .x$is_nested %||% FALSE,
                                applicability_constraint = .x$applicability_constraint %||% NA_character_,
                                variable_values = list(.x$variable_values %||% character()),
                                value_label_prefix = .x$value_label_prefix %||% NA_character_,
                                value_labels = list(.x$value_labels %||% character()),
                                value_label_suffix = .x$value_label_suffix %||% NA_character_,
                                value_default = list(.x$value_default),
                                value_scale = .x$value_scale %||% NA_character_,
                                ptype =
                                  .x$ptype |>
                                  pal::when(!is.null(.) ~ eval(parse(text = .)),
                                            ~ .) |>
                                  list(),
                                derived_from =
                                  .x$derived_from |>
                                  pal::when(is.null(.) ~ list(character()),
                                            is.character(.) ~ list(.),
                                            ~ .),
                                gen_fn = .x$gen_fn %||% NA_character_,
                                is_opt = .x$is_opt %||% FALSE)) |>
    purrr::list_rbind() |>
    # infer missing prototypes
    dplyr::mutate(ptype = purrr::pmap(.l = list(is_multi_valued,
                                                is_nested,
                                                variable_values,
                                                value_labels,
                                                value_scale,
                                                ptype),
                                      .f = function(is_multi_valued,
                                                    is_nested,
                                                    variable_values,
                                                    value_labels,
                                                    value_scale,
                                                    ptype) {

                                        value_scale |>
                                          pal::when(!is.null(ptype) ~
                                                      ptype,
                                                    . == "undefined" || is_multi_valued || is_nested ~
                                                      list(),
                                                    . == "binary" && (length(value_labels) == 0L || all(is.logical(unlist(variable_values)))) ~
                                                      logical(),
                                                    . %in% c("nominal") && length(value_labels) == 0L ~
                                                      character(),
                                                    . %in% c("nominal", "binary", "ordinal_ascending", "ordinal_descending") ~
                                                      factor(levels = variable_values,
                                                             ordered = . %in% c("ordinal_ascending", "ordinal_descending")),
                                                    . %in% c("interval", "ratio") ~
                                                      integer())
                                      })) |>
    # integrity checks
    ## not NA
    assertr::assert(predicate = assertr::not_na,
                    variable_name,
                    variable_name_print,
                    variable_name_unnested,
                    variable_name_unnested_print,
                    variable_label,
                    value_scale,
                    is_multi_valued,
                    is_nested,
                    ptype,
                    is_opt) |>
    ## not empty string
    assertr::assert(predicate = \(x) !(x == ""),
                    variable_name,
                    variable_name_print,
                    variable_name_unnested,
                    variable_name_unnested_print,
                    variable_label,
                    value_scale)
}

ISO 3166

NOTES:

ISO monetizes the standards itselves as well as derived datasets like the Country Codes Collection instead of offering open data APIs! 🤬 So most open-source projects including the R package ISOcodes source their ISO 3166 data from Debian's iso-codes project which might deviate from the official standard and lack behind updates.
ISO 3166-1 does not specify the time dimensions of countries. It maintains only information with regard to their most recent status. A period of validity is only defined when a country is deleted from ISO 3166-1 and moved to ISO 3166-3. Thus, this information must be retrieved from another source.

Noteworthy candidates for this include:
- The Territory Information System (TIS) by the CISAC which at first glance seems to be exactly what we're looking for. The latest revision can be downloaded here. But as it turns out, it's tailored to the CISAC's specific needs and doesn't go back further than the 1990s. ☹
- The Standard Classification of Countries and Areas of Interest (SCCAI) by Statistics Canada which includes a list of Current and Historical Countries and Areas of Interest that define a start and end date for each country/area. The latest version from 2019 is available here. Countries there are identified by a five-digit SCCAI code where the last three digit correspond to the United Nations numeric code (NUM-3) for countries or areas. To get the ISO 3166-1 alpha-2 code, one can also simply join the data with the main SCCAI 2019 data. Unfortunately, the precision of the provided start and end dates is only years, not actual dates. Also it's unclear when and how SCCAI gets updated, i.e. how well it'll be maintained in the future.
- The Correlates of War project's State System Membership dataset
- Wikidata!
To verify periods of validity of countries, the following resources are useful:
- Wikipedia's List of sovereign states by date of formation
- Wikipedia's List of former sovereign states
- Wikipedia's List of national independence days

`data_iso_3166_1`

ISO 3166-1 data, corrected and extended by unofficial information for countries which are not covered by the ISO standard yet. See also the Differences between SCCAI 2019 and ISO 3166-1:2013 for inspiration on name corrections.

data_iso_3166_1 <-
  ISOcodes::ISO_3166_1 |>
  tibble::as_tibble() |>
  # add `Common_name` where it's currently missing
  # TODO: consider submitting PRs once [upstream issue #44](https://salsa.debian.org/iso-codes-team/iso-codes/-/issues/44) is answered
  # TODO: remove IR, LA and SY once ISOcodes pkg version > 2022.09.29 is released
  dplyr::mutate(Common_name = dplyr::case_when(Alpha_2 == "BN" ~ "Brunei",
                                               Alpha_2 == "CD" ~ "Democratic Republic of Congo",
                                               Alpha_2 == "CG" ~ "Congo Republic",
                                               Alpha_2 == "FK" ~ "Falkland Islands",
                                               Alpha_2 == "FM" ~ "Micronesia",
                                               Alpha_2 == "IR" ~ "Iran",
                                               Alpha_2 == "LA" ~ "Laos",
                                               Alpha_2 == "PS" ~ "Palestine",
                                               Alpha_2 == "SY" ~ "Syria",
                                               Alpha_2 == "VA" ~ "Vatican City",
                                               .default = Common_name)) |>
  # extend with unofficial information
  dplyr::add_row(Alpha_2 = "XK",
                 Alpha_3 = "XKS",
                 Numeric = NA_character_,
                 Name = "Kosovo, Republic of",
                 Official_name = "Republic of Kosovo",
                 Common_name = "Kosovo") |>
  dplyr::mutate(name_short = Common_name %|% Name,
                name_long = Official_name %|% Name) |>
  dplyr::arrange(Alpha_2) |>
  # ensure there are no duplicates
  assertr::assert(predicate = assertr::is_uniq,
                  Alpha_2,
                  Alpha_3)

`data_iso_3166_3`

data_iso_3166_3 <-
  ISOcodes::ISO_3166_3 |>
  tibble::as_tibble() |>
  dplyr::mutate(Alpha_2 = stringr::str_sub(string = Alpha_4,
                                           end = 2L),
                Alpha_2_new = stringr::str_sub(string = Alpha_4,
                                               start = 3L),
                # variation of `Alpha_2_new` that in case of multiple successor countries (*HH/*XX) holds the biggest one (1. population-, then 2. area-wise)
                # (main purpose is to be able to match with UN M49 area codes)
                Alpha_2_new_main = purrr::map_chr(Alpha_4,
                                                  \(x) {
                                                    if (stringr::str_detect(x, "(HH|XX)$")) {
                                                      return(switch(x,
                                                             ANHH = "CW",
                                                             CSHH = "CZ",
                                                             CSXX = "RS",
                                                             FQHH = "AQ",
                                                             GEHH = "KI",
                                                             NTHH = "IQ",
                                                             PCHH = "FM",
                                                             SUHH = "RU",
                                                             cli::abort(paste0("No ISO 3166-3 alpha-4 \"n-to-1\" ISO 3166-1 alpha-2 conversion rule defined ",
                                                                               "for {.val x}. Please update {.var data_iso_3166_3} accordingly and run ",
                                                                               "again."))))
                                                    } else {
                                                      return(stringr::str_sub(x, start = 3L))
                                                    }
                                                  }),
                Date_withdrawn =
                  Date_withdrawn |>
                  purrr::map(\(x) {
                    if (nchar(x) == 4L) {
                      clock::date_build(year = as.integer(x))
                    } else {
                      clock::date_parse(x, format = "%F")
                    }
                  }) |>
                  purrr::reduce(c)) |>
  dplyr::relocate(Alpha_2, Alpha_2_new, Alpha_2_new_main,
                  .after = Alpha_3) |>
  # harmonize name style
  dplyr::mutate(name_short = dplyr::case_match(Alpha_4,
                                               "BYAA" ~ "Byelorussian SSR",
                                               "FXFR" ~ "Metropolitan France",
                                               .default = stringr::str_extract(string = Name,
                                                                               pattern = "[^,]+")),
                name_long = dplyr::case_match(Alpha_4,
                                              "BYAA" ~ "Byelorussian Soviet Socialist Republic",
                                              "CSHH" ~ Name,
                                              "SUHH" ~ "Union of Soviet Socialist Republics (USSR)",
                                              "YDYE" ~ "Democratic Yemen, People's Democratic Republic of Yemen",
                                              .default = stringr::str_replace(string = Name,
                                                                              pattern = "^([^,]+), (.+)$",
                                                                              replacement = "\\2 \\1"))) |>
  dplyr::arrange(Alpha_4) |>
  # ensure there are no duplicates
  assertr::assert(predicate = assertr::is_uniq,
                  Alpha_4)

`country_codes_sudd_invalid`

country_codes_sudd_invalid <- c("MB", "ZZ")

# ensure they aren't used in ISO 3166
if (any(country_codes_sudd_invalid %in% c(data_iso_3166_1$Alpha_2,
                                          data_iso_3166_3$Alpha_2,
                                          data_iso_3166_3$Alpha_2_new_main))) {

  cli::cli_abort("At least one of {.var country_codes_sudd_invalid} is used in regular ISO 3166 alpha-2 codes. Please investigate.")
}

`data_topics`

data_topics <-
  # download file from private CCM Design repo
  yay::gh_text_file(owner = "zdaarau",
                    name = "c2d-app",
                    rev = "master",
                    path = "ch.c2d/web/themes.json") |>
  jsonlite::fromJSON(flatten = FALSE) |>
  tibble::as_tibble() |>
  # create tidy data version
  dplyr::rename(topic_tier_1 = name,
                topic_tier_2 = children) |>
  tidyr::unnest(cols = topic_tier_2,
                keep_empty = TRUE) |>
  dplyr::rename(topic_tier_2 = name,
                topic_tier_3 = children) |>
  # ensure consistent col subtypes
  dplyr::mutate(topic_tier_3 = purrr::map(topic_tier_3,
                                          \(x) { if (is.list(x)) character() else x })) |>
  tidyr::unnest(cols = topic_tier_3,
                keep_empty = TRUE)

`months_de*`

rlang::check_installed("clock")

months_de <-
  1:12 |>
  magrittr::set_names(clock::clock_labels_lookup("de")$month) |>
  as.list()

months_de_fms <- rdb:::as_fm_list(months_de)

`pkg_config`

pkg_config <-
  tibble::tibble(key = character(),
                 default_value = list(),
                 description = character()) %>%
  tibble::add_row(key = "api_username",
                  description = "RDB Services API username") %>%
  tibble::add_row(key = "api_password",
                  description = "RDB Services API password") %>%
  tibble::add_row(key = "global_max_cache_age",
                  default_value = list("30 days"),
                  description = pkgsnip::md_snip("opt_global_max_cache_age")) %>%
  tibble::add_row(key = "use_testing_server",
                  default_value = list(FALSE),
                  description = "Whether or not to use the testing servers instead of the production servers for RDB Services API calls etc.") %>%
  tibble::add_row(key = "test_testing_server",
                  default_value = list(FALSE),
                  description = "Whether or not to run the tests that use the testing servers for RDB Services API calls etc. during `devtools::test()`.")

`rfrnd_cols_order`

rfrnd_cols_order <- data_codebook$variable_name
i_loop <- 0

for (i in which(data_codebook$variable_name != data_codebook$variable_name_unnested)) {

  rfrnd_cols_order %<>% append(values = data_codebook$variable_name_unnested[i],
                               after = i + i_loop)
  i_loop <- i_loop + 1L
}

`topics_tier_#_`

Only for performance-reasons.

topics_tier_1_ <- rdb::topics(tiers = 1L)
topics_tier_2_ <- rdb::topics(tiers = 2L)
topics_tier_3_ <- rdb::topics(tiers = 3L)

`un_regions`

NOTES:

Antarctica (AQ) is not part of any UN subregion (i.e. it's directly below World).

# compile `country_code` <-> `un_country_code` dict
codes <-
  data_iso_3166_1 |>
  # temporarily add ISO 3166-1 alpha-3 codes for matching with M49 codes
  dplyr::select(country_code = Alpha_2,
                Alpha_3) |>
  # temporarily add M49 code for matching with actual UN region codes and names
  dplyr::left_join(y = ISOcodes::UN_M.49_Countries |> dplyr::select(un_country_code = Code,
                                                                    Alpha_3 = ISO_Alpha_3),
                   by = "Alpha_3") |>
  dplyr::select(-Alpha_3) |>
  # manual corrections
  dplyr::mutate(
    # certain countries share the same UN country code / don't have their own one for stupid political reasons (like China & Taiwan or Serbia & Kosovo), thus we
    # assign M49 code of
    # - China to Taiwan, cf. https://en.wikipedia.org/wiki/United_Nations_geoscheme_for_Asia#Note_on_Taiwan
    # - Serbia to Kosovo, cf. https://en.wikipedia.org/wiki/XK_(user_assigned_code)#Potential_assignment_of_an_official_ISO_3166-1_code_for_Kosovo
    un_country_code = dplyr::case_when(country_code == "TW" ~ "156",
                                       country_code == "XK" ~ "688",
                                       .default = un_country_code)
  ) |>
  # ensure there are no NAs left
  assertr::assert(predicate = assertr::not_na,
                  un_country_code)

# extract UN regions of the 3 different tiers
un_regions_tier_1 <-
  ISOcodes::UN_M.49_Regions |>
  tibble::as_tibble() |>
  dplyr::filter(Type == "Region" & Parent == "001") |>
  tidyr::separate_longer_delim(cols = Children,
                               delim = ", ") |>
  dplyr::mutate(un_region_tier_1_name = Name,
                un_region_tier_1_code = Code,
                children_tier_1 = Children,
                .keep = "none")

un_regions_tier_2 <-
  ISOcodes::UN_M.49_Regions |>
  tibble::as_tibble() |>
  dplyr::filter(Type == "Region" & Parent %in% un_regions_tier_1$un_region_tier_1_code) |>
  tidyr::separate_longer_delim(cols = Children,
                               delim = ", ") |>
  dplyr::mutate(un_region_tier_2_name = Name,
                un_region_tier_2_code = Code,
                children_tier_2 = Children,
                .keep = "none")

un_regions_tier_3 <-
  ISOcodes::UN_M.49_Regions |>
  tibble::as_tibble() |>
  dplyr::filter(Type == "Region" & Parent %in% un_regions_tier_2$un_region_tier_2_code) |>
  tidyr::separate_longer_delim(cols = Children,
                               delim = ", ") |>
  dplyr::mutate(un_region_tier_3_name = Name,
                un_region_tier_3_code = Code,
                children_tier_3 = Children,
                .keep = "none")

# combine UN regions of different tiers into single dataset and add `country_code`
un_regions <-
  un_regions_tier_1 |>
  dplyr::full_join(y = un_regions_tier_2,
                   by = dplyr::join_by(children_tier_1 == un_region_tier_2_code),
                   relationship = "one-to-many") |>
  dplyr::full_join(y = un_regions_tier_3,
                   by = dplyr::join_by(children_tier_2 == un_region_tier_3_code),
                   relationship = "one-to-many") |>
  dplyr::mutate(un_region_tier_2_code = dplyr::if_else(is.na(un_region_tier_2_name),
                                                       NA_character_,
                                                       children_tier_1),
                un_region_tier_3_code = dplyr::if_else(is.na(un_region_tier_3_name),
                                                       NA_character_,
                                                       children_tier_2),
                un_country_code = dplyr::if_else(is.na(children_tier_3),
                                                 children_tier_2,
                                                 children_tier_3),
                # add UN subregion which, except for Northern Europe, corresponds to the lowest `un_region_tier_*_name`
                un_subregion = dplyr::if_else(un_region_tier_1_name != "Europe",
                                              un_region_tier_3_name %|% un_region_tier_2_name,
                                              un_region_tier_2_name)) |>
  dplyr::select(un_country_code,
                starts_with("un_region_tier_1_"),
                starts_with("un_region_tier_2_"),
                starts_with("un_region_tier_3_"),
                un_subregion) |>
  dplyr::left_join(y = codes,
                   by = "un_country_code",
                   # NOTE: we must allow multiple matches because of the manual "corrections" in `codes` above
                   relationship = "one-to-many") |>
  dplyr::relocate(country_code) |>
  # remove rows without `country_code` (Sark)
  dplyr::filter(!is.na(country_code)) |>
  # ensure there are no NAs left
  assertr::assert(predicate = assertr::not_na,
                  un_country_code) |>
  # convert UN cols to type fct
  dplyr::mutate(
    # tier-1 name lvls are ordered alphabetically
    un_region_tier_1_name = factor(x = un_region_tier_1_name,
                                   levels = sort(unique(un_region_tier_1_name))),
    # tier-2 name lvls are ordered by tier-1 name, then ~ clockwise cardinal direction
    un_region_tier_2_name = factor(x = un_region_tier_2_name,
                                   levels = c(
                                     # Africa
                                     "Northern Africa",
                                     "Sub-Saharan Africa",
                                     # Americas
                                     "Northern America",
                                     "Latin America and the Caribbean",
                                     # Asia
                                     "Central Asia",
                                     "Eastern Asia",
                                     "South-eastern Asia",
                                     "Southern Asia",
                                     "Western Asia",
                                     # Europe
                                     "Northern Europe",
                                     "Eastern Europe",
                                     "Southern Europe",
                                     "Western Europe",
                                     # Oceania
                                     "Micronesia",
                                     "Polynesia",
                                     "Australia and New Zealand",
                                     "Melanesia")),
    # tier-3 name lvls are ordered by tier-2 name, then ~ clockwise cardinal direction
    un_region_tier_3_name = factor(x = un_region_tier_3_name,
                                   levels = c(
                                     # Africa
                                     "Middle Africa",
                                     "Eastern Africa",
                                     "Southern Africa",
                                     "Western Africa",
                                     # Americas
                                     "Caribbean",
                                     "South America",
                                     "Central America",
                                     # Europe
                                     "Channel Islands")),
    # subregion lvls inherit the order of tier-2 and -3 names
    un_subregion = factor(x = un_subregion,
                          levels = c(
                            # Africa
                            "Northern Africa",
                            "Eastern Africa",
                            "Southern Africa",
                            "Middle Africa",
                            "Western Africa",
                            # Americas
                            "Northern America",
                            "Caribbean",
                            "South America",
                            "Central America",
                            # Asia
                            "Central Asia",
                            "Eastern Asia",
                            "South-eastern Asia",
                            "Southern Asia",
                            "Western Asia",
                            # Europe
                            "Northern Europe",
                            "Eastern Europe",
                            "Southern Europe",
                            "Western Europe",
                            # Oceania
                            "Micronesia",
                            "Polynesia",
                            "Australia and New Zealand",
                            "Melanesia")),
    # code lvls are simply in ascending order
    un_country_code = factor(x = un_country_code,
                             levels = sort(unique(ISOcodes::UN_M.49_Countries$Code))),
    dplyr::across(.cols = matches("un_region_tier_\\d+_code"),
                  .fns = ~ factor(x = .x,
                                  levels = sort(unique(.x))))
  )

`var_lbls`

Only for performance-reasons.

var_lbls <-
  data_codebook$variable_label |>
  pal::strip_md() |>
  as.list() |>
  magrittr::set_names(value = data_codebook$variable_name)

`val_set`

val_set                        <- list()
val_set$country_code           <- c(data_iso_3166_1$Alpha_2,
                                    data_iso_3166_3$Alpha_4)
val_set$country_code_long      <- sort(unique(c(data_iso_3166_1$Alpha_3,
                                                data_iso_3166_3$Alpha_3)))
val_set$country_code_continual <- sort(unique(data_iso_3166_1$Alpha_2,
                                              data_iso_3166_3$Alpha_2_new_main))
val_set$country_name           <- sort(unique(c(data_iso_3166_1$name_short,
                                                data_iso_3166_3$name_short)))
val_set$country_name_long      <- sort(unique(c(data_iso_3166_1$name_long,
                                                data_iso_3166_3$name_long)))

Write data

Save all the small data objects as a single internal file R/sysdata.rda. Note that when documenting them, they must be explicitly @exported to be available to package users.

usethis::use_data(country_codes_sudd_invalid,
                  data_codebook,
                  data_iso_3166_1,
                  data_iso_3166_3,
                  data_topics,
                  months_de,
                  months_de_fms,
                  pkg_config,
                  rfrnd_cols_order,
                  topics_tier_1_,
                  topics_tier_2_,
                  topics_tier_3_,
                  un_regions,
                  var_lbls,
                  val_set,
                  internal = TRUE,
                  overwrite = TRUE,
                  compress = "xz",
                  version = 3L)

zdaarau/c2d documentation built on Dec. 18, 2024, 1:24 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

zdaarau/c2d
Download Data from the Referendum Database (RDB), Which Covers Direct Democratic Votes Worldwide

In zdaarau/c2d: Download Data from the Referendum Database (RDB), Which Covers Direct Democratic Votes Worldwide

NOTES

Setup

Define data

`data_codebook`

ISO 3166

`data_iso_3166_1`

`data_iso_3166_3`

`country_codes_sudd_invalid`

`data_topics`

`months_de*`

`pkg_config`

`rfrnd_cols_order`

`topics_tier_#_`

`un_regions`

`var_lbls`

`val_set`

Write data

R Package Documentation

Browse R Packages

We want your feedback!

zdaarau/c2d Download Data from the Referendum Database (RDB), Which Covers Direct Democratic Votes Worldwide

In zdaarau/c2d: Download Data from the Referendum Database (RDB), Which Covers Direct Democratic Votes Worldwide

NOTES

Setup

Define data

data_codebook

ISO 3166

data_iso_3166_1

data_iso_3166_3

country_codes_sudd_invalid

data_topics

months_de*

pkg_config

rfrnd_cols_order

topics_tier_#_

un_regions

var_lbls

val_set

Write data

R Package Documentation

Browse R Packages

We want your feedback!

zdaarau/c2d
Download Data from the Referendum Database (RDB), Which Covers Direct Democratic Votes Worldwide

`data_codebook`

`data_iso_3166_1`

`data_iso_3166_3`

`country_codes_sudd_invalid`

`data_topics`

`months_de*`

`pkg_config`

`rfrnd_cols_order`

`topics_tier_#_`

`un_regions`

`var_lbls`

`val_set`