Country name and code aggregation

Intro

This is the script that scrapes and assembles the data used by the package. While both the script and the data are included, this script has a very high chance of breaking, given the likelihood of websites or URLs changing, locales differing, etc. If you'd like to assemble the data differently or are just curious how it was assembled, though, this script will be of use.

A function to refresh data is not included in the package because this script is depends on web structures that may vary, requires a lot of dependencies (see below) and is irrelevant to the use of the package. If you do choose to run it, keep in mind that it scrapes a lot of sources; please be curteous and don't re-scrape more times than necessary. Due to the number and size of sources, it will take a while to run, anyway, and so is probably best run by chunk. While structured as an RMarkdown notebook for ease of documentation and structure, the HTML file it knits is not useful and is thus not included.

Setup

Packages required:

Aside from rvest and tidyverse, most could be fairly easily refactored into an alternative.

library(rvest)
library(tidyverse)

Scraping

CIA World Factbook

# CIA World Factbook country names
country_names <- 'https://www.cia.gov/library/publications/the-world-factbook/fields/2142.html' %>%
    read_html() %>%
    html_nodes('table tr[id]') %>%
    { setNames(map(., html_nodes, 'td'), toupper(html_attr(., 'id'))) } %>%
    map_df(~data_frame(
        country = html_text(.x[1]),
        key = map(.x[2], html_nodes, css = 'strong') %>% map(html_text),
        value = map(.x[2], html_nodes, xpath = 'text()') %>%
            map(html_text, trim = TRUE) %>%
            map(discard, `==`, '')
    ), .id = 'gec') %>%
    unnest() %>%
    mutate(key = gsub(':\\s+$|`', '', key),
           key = gsub('etymolgy', 'etymology', key),
           key = gsub('official|Papiamentu', 'local', key),
           key = gsub('English', 'conventional', key),
           key = ifelse(country == 'Curacao',
                        gsub('Dutch', 'conventional', key),
                        gsub('Dutch', 'local', key)),
           country = ifelse(gec == 'AS', 'Australia', country)) %>%
    spread(key, value) %>% 
    mutate(abbreviation = ifelse(country == 'Australia', NA, abbreviation))


# CIA World Factbook country codes
country_codes <- 'https://www.cia.gov/library/publications/the-world-factbook/appendix/appendix-d.html' %>%
    read_html() %>%
    html_nodes('ul#GetAppendix_D li') %>%
    map(html_nodes, css = 'td') %>%
    map(html_text, trim = TRUE) %>%
    map(~.[c(-3, -10)]) %>%
    transpose() %>%
    simplify_all() %>%
    setNames(c('country', 'gec', 'iso2c', 'iso3c', 'iso3n', 'stanag', 'tld', 'comment')) %>%
    as_data_frame() %>%
    mutate_all(na_if, y = '-') %>%
    mutate(comment = na_if(comment, ''))

National Geospatial-Intelligence Agency

# http://geonames.nga.mil/gns/html/countrycodes.html
# http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx

genc <- rio::import('http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx', 
                    skip = 2, setclass = 'tbl_df', na = '--') %>% 
    modify(na_if, '[None]')

Wikipedia

# Wikipedia country codes
w <- 'https://en.wikipedia.org/wiki/Category:Lists_of_country_codes' %>%
    read_html() %>%
    html_nodes('a[title*="Country codes:"]') %>%
    html_attr('href') %>%
    paste0('https://en.wikipedia.org', .) %>%
    map(read_html) %>%
    map(html_nodes, 'h2 + table') %>%
    modify_depth(2, html_nodes, 'td') %>%
    map(map_df, ~list(
        key = map(.x, html_nodes, xpath = 'a|text()') %>%
            map(html_text) %>%
            map_chr(paste, collapse = '') %>%
            trimws(),
        value = html_nodes(.x, 'p') %>% html_text()
    ), .id = 'row') %>%
    map_df(spread, key, value) %>%
    select(-row) %>%
    mutate_all(na_if, y = '—') %>% modify(na_if, '-') %>%
    modify(~gsub('\\n', ', ', .x)) %>%
    setNames(c('tld', 'calling', 'mcc', 'gec', 'gs1_gtin', 'icao_aircraft', 'icao_airport', 'ioc', 'iso2c', 'iso3c', 'iso3n', 'itu_callsign', 'itu', 'itu_maritime', 'license_plate', 'marc', 'stanag', 'nato2c', 'undp', 'wmo'))


# # Wikipedia FIFA codes - unused
# fifa <- 'https://en.wikipedia.org/wiki/List_of_FIFA_country_codes' %>%
#     read_html() %>%
#     html_nodes('table.wikitable') %>%
#     map(html_table, fill = TRUE) %>%
#     .[-7:-9] %>%    # remove obsolete codes tables
#     reduce(full_join) %>%
#     mutate(FIFA = coalesce(Code, FIFA)) %>%
#     select(-Code, -Confederation) %>%
#     mutate_all(funs(gsub('\\[.*\\]', '', .))) %>% mutate_all(na_if, y = '') %>%
#     setNames(tolower(names(.))) %>% rename(iso3c = iso)

ITU

Unused except for corrections of Wikipedia innacuracies.

itu <- 'https://www.itu.int/online/mm/scripts/gensel8' %>% 
    read_html() %>% 
    html_node('table') %>% 
    html_table(header = TRUE) %>% 
    select(-5) %>% 
    set_names(c('en_iso', 'itu_region', 'itu', 'tld', 'itu_language')) %>% 
    mutate(tld = tolower(tld))

UN ISO regions

unstats <- 'https://unstats.un.org/unsd/methodology/m49/overview/' %>% 
    read_html() %>% 
    html_nodes('table') %>% 
    map_df(~html_table(.x, header = TRUE) %>% 
               mutate(language = tolower(sub('downloadTable', '', 
                                             html_attr(.x, 'id'))))) %>% 
    tbl_df()

OKFN

This data seems to have moved here.

okfn <- read_csv('https://github.com/datasets/country-codes/raw/master/data/country-codes.csv')

Unicode CLDR

# http://unicode.org/copyright.html
# http://unicode.org/repos/cldr/trunk/unicode-license.txt

# modern
langs_modern <- 'https://github.com/unicode-cldr/cldr-localenames-modern/tree/master/main' %>%
    read_html() %>%
    html_nodes('.content span a') %>% html_text()

unicode_modern <- langs_modern %>%
    set_names(
        paste0('https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/',
               ., '/territories.json'),
        .) %>%
    map(jsonlite::fromJSON) %>%
    map(c(1, 1, 2, 1)) %>%
    simplify_all() %>%
    imap(~set_names(data_frame(names(.x), .x),
                    c('code', .y))) %>%
    reduce(full_join, by = 'code')

# # unused
# lang_codes <- 'https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/en-US-POSIX/languages.json' %>%
#     jsonlite::fromJSON() %>%
#     map(c(1,2,1)) %>% .[[1]] %>%
#     simplify()
# 
# lang_code_df <- data_frame(language = lang_codes,
#                            code = names(lang_codes))

unicode_codes <- 'https://github.com/unicode-cldr/cldr-core/raw/master/supplemental/codeMappings.json' %>% 
    jsonlite::fromJSON() %>% 
    pluck(1, 2) %>% 
    bind_rows(.id = 'iso2c') %>% 
    set_names(c('iso2c', 'iso3n', 'iso3c', 'gec', 'tld')) %>% 
    select(-tld) %>% 
    mutate(iso3c = coalesce(iso3c, iso2c), 
           iso2c = ifelse(nchar(iso2c) > 2, NA, iso2c))

Cleaning

# cleaning
country_codes_c <- country_codes %>% select(-country, -comment)

country_names_c <- country_names %>%
    select(-country, -etymology, -note, -former) %>%
    mutate_all(funs(na_if(., 'none'))) %>%
    janitor::clean_names() %>%
    set_names(gsub('conventional_|_form', '', names(.))) %>%
    set_names(c('gec', paste0('en_cia_', names(.)[-1])))

country_names_c <- country_names_c %>% 
    mutate(en_cia_long = coalesce(en_cia_long, en_cia_short), 
           en_cia_local_long = coalesce(en_cia_local_long, en_cia_local_short)) %>%
    gather(code, name, -gec, -en_cia_abbreviation) %>% 
    separate(code, c('code', 'alt'), sep = '_(?=short|long)') %>% 
    spread(code, name) %>% 
    filter(., rowSums(is.na(.[-1])) < 3) %>%
    mutate(alt = na_if(alt, 'long'),
           en_cia = gsub(' \\(sometimes.*|; note.*', '', en_cia))

genc_c <- genc %>%
    janitor::clean_names() %>% select(-name) %>%
    setNames(c('iso3c', 'iso2c', 'iso3n', 'gec')) %>% 
        filter(!gec %in% c('GZ', 'SV'))

unstats_c <- unstats %>% 
    janitor::clean_names() %>%
    transmute(language, 
              un_region_code = region_code,
              un_region_name = region_name,
              un_subregion_code = sub_region_code,
              un_subregion_name = sub_region_name,
              un_intermediate_region_code = intermediate_region_code,
              un_intermediate_region_name = intermediate_region_name,
              un_name = country_or_area,
              m49 = m49_code,
              iso3c = iso_alpha3_code,
              ldc = least_developed_countries_ldc == 'x',
              lldc = land_locked_developing_countries_lldc == 'x',
              sids = small_island_developing_states_sids == 'x',
              is_developed = developed_developing_countries) %>% 
    modify(na_if, '') %>%
    gather(var, val, matches('name')) %>% 
    unite(var, language, var) %>% 
    mutate(var = gsub('_name', '', var)) %>% 
    spread(var, val)

okfn_c <- okfn %>% 
    set_names(~sub('official_name_(\\w{2})', '\\1_iso', .x)) %>% 
    select(contains('_iso'), iso2c = 7, iso3c = 8, iso3n = 9,
           MARC, EDGAR, WMO, FIFA, gec = FIPS, GAUL, IOC, TLD,
           iso4217_3c = 10,iso4217_name = 13, iso4217_3n = 14, 
           is_independent, Capital, Continent) %>%
    mutate(gec = ifelse(iso3c == 'BES', NA, gec)) %>% 
    janitor::clean_names()

w_c <- w %>% select(tld, 2:4, 6:20) %>%
    mutate(license_plate = gsub(' \\(.*\\)', '', license_plate),
           itu = case_when(iso2c == 'TL' ~ 'TLS',
                           iso2c == 'KN' ~ 'KNA',
                           TRUE ~ itu),
           nato2c = ifelse(iso2c == 'RU', 'RU', nato2c)) %>% 
    drop_na(iso2c)

unicode_modern_c <- unicode_modern %>%
    separate(code, c('code', 'alt'), sep = '-alt-', fill = 'right') %>% 
    mutate(iso3n = ifelse(nchar(code) == 3, code, NA), 
           iso2c = ifelse(nchar(code) == 2, code, NA)) %>% 
    select(-code) %>%
    janitor::clean_names()

codelist <- countrycode::codelist

countrycode_data_c <- codelist %>% 
    mutate(iso2c = coalesce(iso2c, genc2c)) %>% 
    select(iso2c, matches('regex')) %>% 
    drop_na(iso2c) %>%
    janitor::clean_names() %>% 
    set_names(~sub('country_name_', '', .x))

Joining

cia <- full_join(country_codes_c %>% drop_na(gec), country_names_c)

usg <- full_join(genc_c %>% drop_na(gec), cia, by = 'gec') %>%
    mutate(iso3c = coalesce(iso3c.x, iso3c.y),
           iso2c = coalesce(iso2c.x, iso2c.y),
           iso3n = coalesce(iso3n.x, iso3n.y)) %>%
    select(-matches('\\.'))

ok_un <- okfn_c %>% full_join(unstats_c)

us_ok_un <- full_join(ok_un, usg, by = 'iso3n') %>%
    mutate(iso3c = coalesce(iso3c.x, iso3c.y),
           iso2c = coalesce(iso2c.x, iso2c.y),
           gec = coalesce(gec.x, gec.y),
           tld = coalesce(tld.x, tld.y)) %>%
    select(-matches('\\.'))

us_ok_un_w <- full_join(us_ok_un, w_c, by = 'iso2c') %>%
    mutate(iso3c = coalesce(iso3c.x, iso3c.y),
           iso3n = coalesce(iso3n.x, iso3n.y),
           gec = coalesce(gec.x, gec.y),
           marc = coalesce(marc.x, marc.y),
           # wmo = coalesce(wmo.x, wmo.y),    # w/okfn codes differ, and no official list to verify
           ioc = coalesce(ioc.x, ioc.y),
           stanag = coalesce(stanag.x, stanag.y),
           tld = coalesce(tld.x, tld.y)) %>%
    select(-matches('\\.'))

unicode <- unicode_codes %>% 
    drop_na(iso2c) %>% 
    right_join(unicode_modern_c, by = 'iso2c') %>% 
    mutate(iso3n = coalesce(iso3n.x, iso3n.y)) %>% 
    select(-matches('\\.')) %>% 
    mutate(iso3n = case_when(iso2c == 'CP' ~ '905',
                             iso2c == 'DG' ~ '908',
                             iso2c == 'XK' ~ '901',
                             TRUE ~ iso3n),
           iso3c = ifelse(iso2c == 'XK', NA, iso3c))

countries <- full_join(unicode, us_ok_un_w, by = c('iso3n', 'iso2c', 'alt')) %>% 
    mutate(iso3c = coalesce(iso3c.x, iso3c.y), 
           gec = coalesce(gec.x, gec.y)) %>% 
    select(-matches('\\.'))

countries <- left_join(countries, countrycode_data_c)
# check code duplication
us_ok_un_w %>% names() %>% 
    map_int(~ countries %>% 
                filter(is.na(alt)) %>% 
                group_by_at(.x) %>% 
                filter(n() > 1) %>% 
                select(gec, en, iso3c, iso2c, iso3n) %>% 
                .[!is.na(.[[.x]]),] %>% 
                nrow()) %>% 
    set_names(names(us_ok_un_w))

Documenting

countries_colnames <- names(countries)

codes <- data_frame(column = countries_colnames, 
                 code = gsub('_', '-', column)) %>% 
    mutate(expansion = map(code, safely(NLP::parse_IETF_language_tag), expand = TRUE), 
           expansion = map(expansion, c(1, 1)), 
           expansion = map(expansion, ~if(length(.x) == 0) {c(Language = NA_character_)} else .x), 
           expansion = map(expansion, map_df, ~suppressWarnings(na_if(toString(na.omit(.x)), ''))), 
           expansion = map(expansion, ~set_names(.x, sub('=.*', '', names(.x))))) %>% 
    unnest() %>% 
    mutate(Language = case_when(column == 'alt' ~ NA_character_,
                                column == 'mcc' ~ NA_character_,
                                column == 'tld' ~ NA_character_,
                                TRUE ~ Language),
           Variant = ifelse(column == 'en_us_posix', 'POSIX', Variant),
           Variant = ifelse(grepl('_iso', column), 'ISO', Variant),
           Variant = ifelse(grepl('_cia', column), 'CIA World Factbook', Variant),
           Extension = ifelse(grepl('_cia_', column), 
                              sub('en_cia_', '', column), Extension),
           name = case_when(
               column == 'iso2c' ~ 'ISO 3166-1 Alpha-2 code',
               column == 'iso3c' ~ 'ISO 3166-1 Alpha-3 code',
               column == 'iso3n' ~ 'ISO 3166-1 numeric code',
               column == 'en_iso' ~ 'ISO English name',
               column == 'fr_iso' ~ 'ISO French name',
               column == 'gec' ~ 'Geopolitical Entities and Codes',
               column == 'fifa' ~ 'FIFA (Fédération Internationale de Football Association) code',
               column == 'gaul' ~ 'Global Administrative Unit Layers from the Food and Agriculture Organization (FAO) code',
               column == 'iso4217_3c' ~ 'ISO 4217 3-character currency code',
               column == 'iso4217_name' ~ 'ISO 4217 currency name',
               column == 'iso4217_3n' ~ 'ISO 4217 numeric currency code',
               column == 'is_independent' ~ 'Country sovereignty status from the CIA World Factbook',
               column == 'capital' ~ 'Capital city',
               column == 'edgar' ~ 'EDGAR country code from the SEC',
               column == 'en_cia' ~ 'Country names from the CIA World Factbook',
               column == 'en_cia_local' ~ 'Local country names from the CIA World Factbook',
               column == 'en_cia_abbreviation' ~ 'Commonly used country abbreviations from the CIA World Factbook.',
               column == 'mcc' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.212 Mobile Country Code',
               column == 'itu_callsign' ~ 'International Telecommunication Union (ITU) callsign prefixes for radio and television stations',
               column == 'itu' ~ 'International Telecommunication Union (ITU) 1-3 character country code',
               column == 'itu_maritime' ~ 'International Telecommunication Union (ITU) Maritime Identification Digits',
               column == 'license_plate' ~ 'Motor vehicle licence plate country code',
               column == 'stanag' ~ 'North Atlantic Treaty Organization (NATO/OTAN) STANAG 1059 Letter Codes for Geographical Entities',
               column == 'nato2c' ~ 'North Atlantic Treaty Organization (NATO/OTAN) 2-letter code.',
               column == 'undp' ~ 'United Nations Development Programme (UNDP) country code',
               column == 'marc' ~ 'MAchine-Readable Cataloging (MARC) codes from the US Library of Congress',
               column == 'calling' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.164 international telephone calling code',
               column == 'ioc' ~ 'International Olympic Committee country code',
               column == 'tld' ~ 'Internet Assigned Numbers Authority (IANA) country code top-level domain',
               column == 'm49' ~ 'United Nations Statistics Division (UNSD) M.49 area code',
               column == 'ldc' ~ 'United Nations (UN) Least Developed Countries',
               column == 'lldc' ~ 'United Nations (UN) Land Locked Developing Countries',
               column == 'sids' ~ 'United Nations (UN) Small Island Developing States',
               column == 'is_developed' ~ 'United Nations (UN) development status',
               grepl('_un', column) ~ 'United Nations (UN) Geoscheme region name',
               TRUE ~ NA_character_
           ),
           notes = case_when(
               column == 'gec' ~ 'Formerly FIPS Pub 10-4, which was withdrawn by NIST in 2008. Maintained until 2014 by the National Geospatial-Intelligence Agency (NGA), after which it was frozen and superceded by GENC, the US government profile of ISO 3166.',
               column == 'en_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon offical forms, including North and South Korea and Bolivia.',
               column == 'fr_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon offical forms, including North and South Korea and Bolivia.',
               column == 'en_cia' ~ 'Many short and long forms. Uses "Burma" instead of "Myanmar".',
               column == 'en_cia_local' ~ 'Many short and long forms. Includes alternatives inline. Transliterates to Latin script.',
               column == 'en_cia_abbreviation' ~ 'Only included where commonly used. Includes alternatives inline.',
               column == 'mcc' ~ 'Includes ranges.',
               column == 'itu_callsign' ~ 'Includes ranges.',
               column == 'itu_maritime' ~ 'Includes ranges.',
               column == 'nato2c' ~ 'Officially deprecated in favor of STANAG 1059 (see "stanag").',
               column == 'tld' ~ 'Includes leading period.',
               grepl('_un_region', column) ~ 'Continent',
               grepl('regex', column) ~ 'Regex used for `parse_country`.',
               TRUE ~ NA_character_
           )) %>% 
    janitor::clean_names() %>% 
    arrange(column) %>% 
    select(1:2, 8:9, 3:7)

Saving

# to avoid ASCII CRAN warning
codes <- codes %>% modify(stringi::stri_trans_general, 'Latin-ASCII')

devtools::use_data(countries, countries_colnames, 
                   internal = TRUE, overwrite = TRUE)
devtools::use_data(codes, overwrite = TRUE)

Licensing

Data is licensed according to its source, most of which are in the public domain. Exceptions include



Try the passport package in your browser

Any scripts or data that you put into this service are public.

passport documentation built on Nov. 8, 2020, 4:28 p.m.