NOTES

Setup

library(rlang,
        include.only = c("%|%", "%||%"))
library(magrittr,
        include.only = c("%>%", "%<>%", "%T>%", "%!>%", "%$%"))

Define data

data_codebook

NOTES:

data_codebook <- {

  codebook <- pal::toml_read("data-raw/rdb_codebook.toml")

  c(codebook$`01_main`$item,
    codebook$`02_institutional`$`01_status`$item,
    codebook$`02_institutional`$`02_trigger`$item,
    codebook$`02_institutional`$`03_object`$item,
    codebook$`02_institutional`$`04_other`$item) |>
    purrr::map(~ tibble::tibble(variable_name = .x$variable_name %||% NA_character_,
                                variable_name_print = .x$variable_name_print %||% NA_character_,
                                variable_name_unnested = .x$variable_name_unnested %||% variable_name,
                                variable_name_unnested_print = .x$variable_name_unnested_print %||% variable_name_print,
                                variable_label = .x$variable_label %||% NA_character_,
                                is_multi_valued = .x$is_multi_valued %||% FALSE,
                                is_nested = .x$is_nested %||% FALSE,
                                applicability_constraint = .x$applicability_constraint %||% NA_character_,
                                variable_values = list(.x$variable_values %||% character()),
                                value_label_prefix = .x$value_label_prefix %||% NA_character_,
                                value_labels = list(.x$value_labels %||% character()),
                                value_label_suffix = .x$value_label_suffix %||% NA_character_,
                                value_default = list(.x$value_default),
                                value_scale = .x$value_scale %||% NA_character_,
                                ptype =
                                  .x$ptype |>
                                  pal::when(!is.null(.) ~ eval(parse(text = .)),
                                            ~ .) |>
                                  list(),
                                derived_from =
                                  .x$derived_from |>
                                  pal::when(is.null(.) ~ list(character()),
                                            is.character(.) ~ list(.),
                                            ~ .),
                                gen_fn = .x$gen_fn %||% NA_character_,
                                is_opt = .x$is_opt %||% FALSE)) |>
    purrr::list_rbind() |>
    # infer missing prototypes
    dplyr::mutate(ptype = purrr::pmap(.l = list(is_multi_valued,
                                                is_nested,
                                                variable_values,
                                                value_labels,
                                                value_scale,
                                                ptype),
                                      .f = function(is_multi_valued,
                                                    is_nested,
                                                    variable_values,
                                                    value_labels,
                                                    value_scale,
                                                    ptype) {

                                        value_scale |>
                                          pal::when(!is.null(ptype) ~
                                                      ptype,
                                                    . == "undefined" || is_multi_valued || is_nested ~
                                                      list(),
                                                    . == "binary" && (length(value_labels) == 0L || all(is.logical(unlist(variable_values)))) ~
                                                      logical(),
                                                    . %in% c("nominal") && length(value_labels) == 0L ~
                                                      character(),
                                                    . %in% c("nominal", "binary", "ordinal_ascending", "ordinal_descending") ~
                                                      factor(levels = variable_values,
                                                             ordered = . %in% c("ordinal_ascending", "ordinal_descending")),
                                                    . %in% c("interval", "ratio") ~
                                                      integer())
                                      })) |>
    # integrity checks
    ## not NA
    assertr::assert(predicate = assertr::not_na,
                    variable_name,
                    variable_name_print,
                    variable_name_unnested,
                    variable_name_unnested_print,
                    variable_label,
                    value_scale,
                    is_multi_valued,
                    is_nested,
                    ptype,
                    is_opt) |>
    ## not empty string
    assertr::assert(predicate = \(x) !(x == ""),
                    variable_name,
                    variable_name_print,
                    variable_name_unnested,
                    variable_name_unnested_print,
                    variable_label,
                    value_scale)
}

ISO 3166

NOTES:

data_iso_3166_1

ISO 3166-1 data, corrected and extended by unofficial information for countries which are not covered by the ISO standard yet. See also the Differences between SCCAI 2019 and ISO 3166-1:2013 for inspiration on name corrections.

data_iso_3166_1 <-
  ISOcodes::ISO_3166_1 |>
  tibble::as_tibble() |>
  # add `Common_name` where it's currently missing
  # TODO: consider submitting PRs once [upstream issue #44](https://salsa.debian.org/iso-codes-team/iso-codes/-/issues/44) is answered
  # TODO: remove IR, LA and SY once ISOcodes pkg version > 2022.09.29 is released
  dplyr::mutate(Common_name = dplyr::case_when(Alpha_2 == "BN" ~ "Brunei",
                                               Alpha_2 == "CD" ~ "Democratic Republic of Congo",
                                               Alpha_2 == "CG" ~ "Congo Republic",
                                               Alpha_2 == "FK" ~ "Falkland Islands",
                                               Alpha_2 == "FM" ~ "Micronesia",
                                               Alpha_2 == "IR" ~ "Iran",
                                               Alpha_2 == "LA" ~ "Laos",
                                               Alpha_2 == "PS" ~ "Palestine",
                                               Alpha_2 == "SY" ~ "Syria",
                                               Alpha_2 == "VA" ~ "Vatican City",
                                               .default = Common_name)) |>
  # extend with unofficial information
  dplyr::add_row(Alpha_2 = "XK",
                 Alpha_3 = "XKS",
                 Numeric = NA_character_,
                 Name = "Kosovo, Republic of",
                 Official_name = "Republic of Kosovo",
                 Common_name = "Kosovo") |>
  dplyr::mutate(name_short = Common_name %|% Name,
                name_long = Official_name %|% Name) |>
  dplyr::arrange(Alpha_2) |>
  # ensure there are no duplicates
  assertr::assert(predicate = assertr::is_uniq,
                  Alpha_2,
                  Alpha_3)

data_iso_3166_3

data_iso_3166_3 <-
  ISOcodes::ISO_3166_3 |>
  tibble::as_tibble() |>
  dplyr::mutate(Alpha_2 = stringr::str_sub(string = Alpha_4,
                                           end = 2L),
                Alpha_2_new = stringr::str_sub(string = Alpha_4,
                                               start = 3L),
                # variation of `Alpha_2_new` that in case of multiple successor countries (*HH/*XX) holds the biggest one (1. population-, then 2. area-wise)
                # (main purpose is to be able to match with UN M49 area codes)
                Alpha_2_new_main = purrr::map_chr(Alpha_4,
                                                  \(x) {
                                                    if (stringr::str_detect(x, "(HH|XX)$")) {
                                                      return(switch(x,
                                                             ANHH = "CW",
                                                             CSHH = "CZ",
                                                             CSXX = "RS",
                                                             FQHH = "AQ",
                                                             GEHH = "KI",
                                                             NTHH = "IQ",
                                                             PCHH = "FM",
                                                             SUHH = "RU",
                                                             cli::abort(paste0("No ISO 3166-3 alpha-4 \"n-to-1\" ISO 3166-1 alpha-2 conversion rule defined ",
                                                                               "for {.val x}. Please update {.var data_iso_3166_3} accordingly and run ",
                                                                               "again."))))
                                                    } else {
                                                      return(stringr::str_sub(x, start = 3L))
                                                    }
                                                  }),
                Date_withdrawn =
                  Date_withdrawn |>
                  purrr::map(\(x) {
                    if (nchar(x) == 4L) {
                      clock::date_build(year = as.integer(x))
                    } else {
                      clock::date_parse(x, format = "%F")
                    }
                  }) |>
                  purrr::reduce(c)) |>
  dplyr::relocate(Alpha_2, Alpha_2_new, Alpha_2_new_main,
                  .after = Alpha_3) |>
  # harmonize name style
  dplyr::mutate(name_short = dplyr::case_match(Alpha_4,
                                               "BYAA" ~ "Byelorussian SSR",
                                               "FXFR" ~ "Metropolitan France",
                                               .default = stringr::str_extract(string = Name,
                                                                               pattern = "[^,]+")),
                name_long = dplyr::case_match(Alpha_4,
                                              "BYAA" ~ "Byelorussian Soviet Socialist Republic",
                                              "CSHH" ~ Name,
                                              "SUHH" ~ "Union of Soviet Socialist Republics (USSR)",
                                              "YDYE" ~ "Democratic Yemen, People's Democratic Republic of Yemen",
                                              .default = stringr::str_replace(string = Name,
                                                                              pattern = "^([^,]+), (.+)$",
                                                                              replacement = "\\2 \\1"))) |>
  dplyr::arrange(Alpha_4) |>
  # ensure there are no duplicates
  assertr::assert(predicate = assertr::is_uniq,
                  Alpha_4)

country_codes_sudd_invalid

country_codes_sudd_invalid <- c("MB", "ZZ")

# ensure they aren't used in ISO 3166
if (any(country_codes_sudd_invalid %in% c(data_iso_3166_1$Alpha_2,
                                          data_iso_3166_3$Alpha_2,
                                          data_iso_3166_3$Alpha_2_new_main))) {

  cli::cli_abort("At least one of {.var country_codes_sudd_invalid} is used in regular ISO 3166 alpha-2 codes. Please investigate.")
}

data_topics

data_topics <-
  # download file from private CCM Design repo
  yay::gh_text_file(owner = "zdaarau",
                    name = "c2d-app",
                    rev = "master",
                    path = "ch.c2d/web/themes.json") |>
  jsonlite::fromJSON(flatten = FALSE) |>
  tibble::as_tibble() |>
  # create tidy data version
  dplyr::rename(topic_tier_1 = name,
                topic_tier_2 = children) |>
  tidyr::unnest(cols = topic_tier_2,
                keep_empty = TRUE) |>
  dplyr::rename(topic_tier_2 = name,
                topic_tier_3 = children) |>
  # ensure consistent col subtypes
  dplyr::mutate(topic_tier_3 = purrr::map(topic_tier_3,
                                          \(x) { if (is.list(x)) character() else x })) |>
  tidyr::unnest(cols = topic_tier_3,
                keep_empty = TRUE)

months_de*

rlang::check_installed("clock")

months_de <-
  1:12 |>
  magrittr::set_names(clock::clock_labels_lookup("de")$month) |>
  as.list()

months_de_fms <- rdb:::as_fm_list(months_de)

pkg_config

pkg_config <-
  tibble::tibble(key = character(),
                 default_value = list(),
                 description = character()) %>%
  tibble::add_row(key = "api_username",
                  description = "RDB Services API username") %>%
  tibble::add_row(key = "api_password",
                  description = "RDB Services API password") %>%
  tibble::add_row(key = "global_max_cache_age",
                  default_value = list("30 days"),
                  description = pkgsnip::md_snip("opt_global_max_cache_age")) %>%
  tibble::add_row(key = "use_testing_server",
                  default_value = list(FALSE),
                  description = "Whether or not to use the testing servers instead of the production servers for RDB Services API calls etc.") %>%
  tibble::add_row(key = "test_testing_server",
                  default_value = list(FALSE),
                  description = "Whether or not to run the tests that use the testing servers for RDB Services API calls etc. during `devtools::test()`.")

rfrnd_cols_order

rfrnd_cols_order <- data_codebook$variable_name
i_loop <- 0

for (i in which(data_codebook$variable_name != data_codebook$variable_name_unnested)) {

  rfrnd_cols_order %<>% append(values = data_codebook$variable_name_unnested[i],
                               after = i + i_loop)
  i_loop <- i_loop + 1L
}

topics_tier_#_

Only for performance-reasons.

topics_tier_1_ <- rdb::topics(tiers = 1L)
topics_tier_2_ <- rdb::topics(tiers = 2L)
topics_tier_3_ <- rdb::topics(tiers = 3L)

un_regions

NOTES:

# compile `country_code` <-> `un_country_code` dict
codes <-
  data_iso_3166_1 |>
  # temporarily add ISO 3166-1 alpha-3 codes for matching with M49 codes
  dplyr::select(country_code = Alpha_2,
                Alpha_3) |>
  # temporarily add M49 code for matching with actual UN region codes and names
  dplyr::left_join(y = ISOcodes::UN_M.49_Countries |> dplyr::select(un_country_code = Code,
                                                                    Alpha_3 = ISO_Alpha_3),
                   by = "Alpha_3") |>
  dplyr::select(-Alpha_3) |>
  # manual corrections
  dplyr::mutate(
    # certain countries share the same UN country code / don't have their own one for stupid political reasons (like China & Taiwan or Serbia & Kosovo), thus we
    # assign M49 code of
    # - China to Taiwan, cf. https://en.wikipedia.org/wiki/United_Nations_geoscheme_for_Asia#Note_on_Taiwan
    # - Serbia to Kosovo, cf. https://en.wikipedia.org/wiki/XK_(user_assigned_code)#Potential_assignment_of_an_official_ISO_3166-1_code_for_Kosovo
    un_country_code = dplyr::case_when(country_code == "TW" ~ "156",
                                       country_code == "XK" ~ "688",
                                       .default = un_country_code)
  ) |>
  # ensure there are no NAs left
  assertr::assert(predicate = assertr::not_na,
                  un_country_code)

# extract UN regions of the 3 different tiers
un_regions_tier_1 <-
  ISOcodes::UN_M.49_Regions |>
  tibble::as_tibble() |>
  dplyr::filter(Type == "Region" & Parent == "001") |>
  tidyr::separate_longer_delim(cols = Children,
                               delim = ", ") |>
  dplyr::mutate(un_region_tier_1_name = Name,
                un_region_tier_1_code = Code,
                children_tier_1 = Children,
                .keep = "none")

un_regions_tier_2 <-
  ISOcodes::UN_M.49_Regions |>
  tibble::as_tibble() |>
  dplyr::filter(Type == "Region" & Parent %in% un_regions_tier_1$un_region_tier_1_code) |>
  tidyr::separate_longer_delim(cols = Children,
                               delim = ", ") |>
  dplyr::mutate(un_region_tier_2_name = Name,
                un_region_tier_2_code = Code,
                children_tier_2 = Children,
                .keep = "none")

un_regions_tier_3 <-
  ISOcodes::UN_M.49_Regions |>
  tibble::as_tibble() |>
  dplyr::filter(Type == "Region" & Parent %in% un_regions_tier_2$un_region_tier_2_code) |>
  tidyr::separate_longer_delim(cols = Children,
                               delim = ", ") |>
  dplyr::mutate(un_region_tier_3_name = Name,
                un_region_tier_3_code = Code,
                children_tier_3 = Children,
                .keep = "none")

# combine UN regions of different tiers into single dataset and add `country_code`
un_regions <-
  un_regions_tier_1 |>
  dplyr::full_join(y = un_regions_tier_2,
                   by = dplyr::join_by(children_tier_1 == un_region_tier_2_code),
                   relationship = "one-to-many") |>
  dplyr::full_join(y = un_regions_tier_3,
                   by = dplyr::join_by(children_tier_2 == un_region_tier_3_code),
                   relationship = "one-to-many") |>
  dplyr::mutate(un_region_tier_2_code = dplyr::if_else(is.na(un_region_tier_2_name),
                                                       NA_character_,
                                                       children_tier_1),
                un_region_tier_3_code = dplyr::if_else(is.na(un_region_tier_3_name),
                                                       NA_character_,
                                                       children_tier_2),
                un_country_code = dplyr::if_else(is.na(children_tier_3),
                                                 children_tier_2,
                                                 children_tier_3),
                # add UN subregion which, except for Northern Europe, corresponds to the lowest `un_region_tier_*_name`
                un_subregion = dplyr::if_else(un_region_tier_1_name != "Europe",
                                              un_region_tier_3_name %|% un_region_tier_2_name,
                                              un_region_tier_2_name)) |>
  dplyr::select(un_country_code,
                starts_with("un_region_tier_1_"),
                starts_with("un_region_tier_2_"),
                starts_with("un_region_tier_3_"),
                un_subregion) |>
  dplyr::left_join(y = codes,
                   by = "un_country_code",
                   # NOTE: we must allow multiple matches because of the manual "corrections" in `codes` above
                   relationship = "one-to-many") |>
  dplyr::relocate(country_code) |>
  # remove rows without `country_code` (Sark)
  dplyr::filter(!is.na(country_code)) |>
  # ensure there are no NAs left
  assertr::assert(predicate = assertr::not_na,
                  un_country_code) |>
  # convert UN cols to type fct
  dplyr::mutate(
    # tier-1 name lvls are ordered alphabetically
    un_region_tier_1_name = factor(x = un_region_tier_1_name,
                                   levels = sort(unique(un_region_tier_1_name))),
    # tier-2 name lvls are ordered by tier-1 name, then ~ clockwise cardinal direction
    un_region_tier_2_name = factor(x = un_region_tier_2_name,
                                   levels = c(
                                     # Africa
                                     "Northern Africa",
                                     "Sub-Saharan Africa",
                                     # Americas
                                     "Northern America",
                                     "Latin America and the Caribbean",
                                     # Asia
                                     "Central Asia",
                                     "Eastern Asia",
                                     "South-eastern Asia",
                                     "Southern Asia",
                                     "Western Asia",
                                     # Europe
                                     "Northern Europe",
                                     "Eastern Europe",
                                     "Southern Europe",
                                     "Western Europe",
                                     # Oceania
                                     "Micronesia",
                                     "Polynesia",
                                     "Australia and New Zealand",
                                     "Melanesia")),
    # tier-3 name lvls are ordered by tier-2 name, then ~ clockwise cardinal direction
    un_region_tier_3_name = factor(x = un_region_tier_3_name,
                                   levels = c(
                                     # Africa
                                     "Middle Africa",
                                     "Eastern Africa",
                                     "Southern Africa",
                                     "Western Africa",
                                     # Americas
                                     "Caribbean",
                                     "South America",
                                     "Central America",
                                     # Europe
                                     "Channel Islands")),
    # subregion lvls inherit the order of tier-2 and -3 names
    un_subregion = factor(x = un_subregion,
                          levels = c(
                            # Africa
                            "Northern Africa",
                            "Eastern Africa",
                            "Southern Africa",
                            "Middle Africa",
                            "Western Africa",
                            # Americas
                            "Northern America",
                            "Caribbean",
                            "South America",
                            "Central America",
                            # Asia
                            "Central Asia",
                            "Eastern Asia",
                            "South-eastern Asia",
                            "Southern Asia",
                            "Western Asia",
                            # Europe
                            "Northern Europe",
                            "Eastern Europe",
                            "Southern Europe",
                            "Western Europe",
                            # Oceania
                            "Micronesia",
                            "Polynesia",
                            "Australia and New Zealand",
                            "Melanesia")),
    # code lvls are simply in ascending order
    un_country_code = factor(x = un_country_code,
                             levels = sort(unique(ISOcodes::UN_M.49_Countries$Code))),
    dplyr::across(.cols = matches("un_region_tier_\\d+_code"),
                  .fns = ~ factor(x = .x,
                                  levels = sort(unique(.x))))
  )

var_lbls

Only for performance-reasons.

var_lbls <-
  data_codebook$variable_label |>
  pal::strip_md() |>
  as.list() |>
  magrittr::set_names(value = data_codebook$variable_name)

val_set

val_set                        <- list()
val_set$country_code           <- c(data_iso_3166_1$Alpha_2,
                                    data_iso_3166_3$Alpha_4)
val_set$country_code_long      <- sort(unique(c(data_iso_3166_1$Alpha_3,
                                                data_iso_3166_3$Alpha_3)))
val_set$country_code_continual <- sort(unique(data_iso_3166_1$Alpha_2,
                                              data_iso_3166_3$Alpha_2_new_main))
val_set$country_name           <- sort(unique(c(data_iso_3166_1$name_short,
                                                data_iso_3166_3$name_short)))
val_set$country_name_long      <- sort(unique(c(data_iso_3166_1$name_long,
                                                data_iso_3166_3$name_long)))

Write data

Save all the small data objects as a single internal file R/sysdata.rda. Note that when documenting them, they must be explicitly @exported to be available to package users.

usethis::use_data(country_codes_sudd_invalid,
                  data_codebook,
                  data_iso_3166_1,
                  data_iso_3166_3,
                  data_topics,
                  months_de,
                  months_de_fms,
                  pkg_config,
                  rfrnd_cols_order,
                  topics_tier_1_,
                  topics_tier_2_,
                  topics_tier_3_,
                  un_regions,
                  var_lbls,
                  val_set,
                  internal = TRUE,
                  overwrite = TRUE,
                  compress = "xz",
                  version = 3L)


zdaarau/c2d documentation built on Dec. 18, 2024, 1:24 p.m.