This is the script that scrapes and assembles the data used by the package. While both the script and the data are included, this script has a very high chance of breaking, given the likelihood of websites or URLs changing, locales differing, etc. If you'd like to assemble the data differently or are just curious how it was assembled, though, this script will be of use.
A function to refresh data is not included in the package because this script is depends on web structures that may vary, requires a lot of dependencies (see below) and is irrelevant to the use of the package. If you do choose to run it, keep in mind that it scrapes a lot of sources; please be curteous and don't re-scrape more times than necessary. Due to the number and size of sources, it will take a while to run, anyway, and so is probably best run by chunk. While structured as an RMarkdown notebook for ease of documentation and structure, the HTML file it knits is not useful and is thus not included.
Packages required:
Aside from rvest and tidyverse, most could be fairly easily refactored into an alternative.
library(rvest) library(tidyverse)
# CIA World Factbook country names country_names <- 'https://www.cia.gov/library/publications/the-world-factbook/fields/2142.html' %>% read_html() %>% html_nodes('table tr[id]') %>% { setNames(map(., html_nodes, 'td'), toupper(html_attr(., 'id'))) } %>% map_df(~data_frame( country = html_text(.x[1]), key = map(.x[2], html_nodes, css = 'strong') %>% map(html_text), value = map(.x[2], html_nodes, xpath = 'text()') %>% map(html_text, trim = TRUE) %>% map(discard, `==`, '') ), .id = 'gec') %>% unnest() %>% mutate(key = gsub(':\\s+$|`', '', key), key = gsub('etymolgy', 'etymology', key), key = gsub('official|Papiamentu', 'local', key), key = gsub('English', 'conventional', key), key = ifelse(country == 'Curacao', gsub('Dutch', 'conventional', key), gsub('Dutch', 'local', key)), country = ifelse(gec == 'AS', 'Australia', country)) %>% spread(key, value) %>% mutate(abbreviation = ifelse(country == 'Australia', NA, abbreviation)) # CIA World Factbook country codes country_codes <- 'https://www.cia.gov/library/publications/the-world-factbook/appendix/appendix-d.html' %>% read_html() %>% html_nodes('ul#GetAppendix_D li') %>% map(html_nodes, css = 'td') %>% map(html_text, trim = TRUE) %>% map(~.[c(-3, -10)]) %>% transpose() %>% simplify_all() %>% setNames(c('country', 'gec', 'iso2c', 'iso3c', 'iso3n', 'stanag', 'tld', 'comment')) %>% as_data_frame() %>% mutate_all(na_if, y = '-') %>% mutate(comment = na_if(comment, ''))
# http://geonames.nga.mil/gns/html/countrycodes.html # http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx genc <- rio::import('http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx', skip = 2, setclass = 'tbl_df', na = '--') %>% modify(na_if, '[None]')
# Wikipedia country codes w <- 'https://en.wikipedia.org/wiki/Category:Lists_of_country_codes' %>% read_html() %>% html_nodes('a[title*="Country codes:"]') %>% html_attr('href') %>% paste0('https://en.wikipedia.org', .) %>% map(read_html) %>% map(html_nodes, 'h2 + table') %>% modify_depth(2, html_nodes, 'td') %>% map(map_df, ~list( key = map(.x, html_nodes, xpath = 'a|text()') %>% map(html_text) %>% map_chr(paste, collapse = '') %>% trimws(), value = html_nodes(.x, 'p') %>% html_text() ), .id = 'row') %>% map_df(spread, key, value) %>% select(-row) %>% mutate_all(na_if, y = '—') %>% modify(na_if, '-') %>% modify(~gsub('\\n', ', ', .x)) %>% setNames(c('tld', 'calling', 'mcc', 'gec', 'gs1_gtin', 'icao_aircraft', 'icao_airport', 'ioc', 'iso2c', 'iso3c', 'iso3n', 'itu_callsign', 'itu', 'itu_maritime', 'license_plate', 'marc', 'stanag', 'nato2c', 'undp', 'wmo')) # # Wikipedia FIFA codes - unused # fifa <- 'https://en.wikipedia.org/wiki/List_of_FIFA_country_codes' %>% # read_html() %>% # html_nodes('table.wikitable') %>% # map(html_table, fill = TRUE) %>% # .[-7:-9] %>% # remove obsolete codes tables # reduce(full_join) %>% # mutate(FIFA = coalesce(Code, FIFA)) %>% # select(-Code, -Confederation) %>% # mutate_all(funs(gsub('\\[.*\\]', '', .))) %>% mutate_all(na_if, y = '') %>% # setNames(tolower(names(.))) %>% rename(iso3c = iso)
Unused except for corrections of Wikipedia innacuracies.
itu <- 'https://www.itu.int/online/mm/scripts/gensel8' %>% read_html() %>% html_node('table') %>% html_table(header = TRUE) %>% select(-5) %>% set_names(c('en_iso', 'itu_region', 'itu', 'tld', 'itu_language')) %>% mutate(tld = tolower(tld))
unstats <- 'https://unstats.un.org/unsd/methodology/m49/overview/' %>% read_html() %>% html_nodes('table') %>% map_df(~html_table(.x, header = TRUE) %>% mutate(language = tolower(sub('downloadTable', '', html_attr(.x, 'id'))))) %>% tbl_df()
This data seems to have moved here.
okfn <- read_csv('https://github.com/datasets/country-codes/raw/master/data/country-codes.csv')
# http://unicode.org/copyright.html # http://unicode.org/repos/cldr/trunk/unicode-license.txt # modern langs_modern <- 'https://github.com/unicode-cldr/cldr-localenames-modern/tree/master/main' %>% read_html() %>% html_nodes('.content span a') %>% html_text() unicode_modern <- langs_modern %>% set_names( paste0('https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/', ., '/territories.json'), .) %>% map(jsonlite::fromJSON) %>% map(c(1, 1, 2, 1)) %>% simplify_all() %>% imap(~set_names(data_frame(names(.x), .x), c('code', .y))) %>% reduce(full_join, by = 'code') # # unused # lang_codes <- 'https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/en-US-POSIX/languages.json' %>% # jsonlite::fromJSON() %>% # map(c(1,2,1)) %>% .[[1]] %>% # simplify() # # lang_code_df <- data_frame(language = lang_codes, # code = names(lang_codes)) unicode_codes <- 'https://github.com/unicode-cldr/cldr-core/raw/master/supplemental/codeMappings.json' %>% jsonlite::fromJSON() %>% pluck(1, 2) %>% bind_rows(.id = 'iso2c') %>% set_names(c('iso2c', 'iso3n', 'iso3c', 'gec', 'tld')) %>% select(-tld) %>% mutate(iso3c = coalesce(iso3c, iso2c), iso2c = ifelse(nchar(iso2c) > 2, NA, iso2c))
# cleaning country_codes_c <- country_codes %>% select(-country, -comment) country_names_c <- country_names %>% select(-country, -etymology, -note, -former) %>% mutate_all(funs(na_if(., 'none'))) %>% janitor::clean_names() %>% set_names(gsub('conventional_|_form', '', names(.))) %>% set_names(c('gec', paste0('en_cia_', names(.)[-1]))) country_names_c <- country_names_c %>% mutate(en_cia_long = coalesce(en_cia_long, en_cia_short), en_cia_local_long = coalesce(en_cia_local_long, en_cia_local_short)) %>% gather(code, name, -gec, -en_cia_abbreviation) %>% separate(code, c('code', 'alt'), sep = '_(?=short|long)') %>% spread(code, name) %>% filter(., rowSums(is.na(.[-1])) < 3) %>% mutate(alt = na_if(alt, 'long'), en_cia = gsub(' \\(sometimes.*|; note.*', '', en_cia)) genc_c <- genc %>% janitor::clean_names() %>% select(-name) %>% setNames(c('iso3c', 'iso2c', 'iso3n', 'gec')) %>% filter(!gec %in% c('GZ', 'SV')) unstats_c <- unstats %>% janitor::clean_names() %>% transmute(language, un_region_code = region_code, un_region_name = region_name, un_subregion_code = sub_region_code, un_subregion_name = sub_region_name, un_intermediate_region_code = intermediate_region_code, un_intermediate_region_name = intermediate_region_name, un_name = country_or_area, m49 = m49_code, iso3c = iso_alpha3_code, ldc = least_developed_countries_ldc == 'x', lldc = land_locked_developing_countries_lldc == 'x', sids = small_island_developing_states_sids == 'x', is_developed = developed_developing_countries) %>% modify(na_if, '') %>% gather(var, val, matches('name')) %>% unite(var, language, var) %>% mutate(var = gsub('_name', '', var)) %>% spread(var, val) okfn_c <- okfn %>% set_names(~sub('official_name_(\\w{2})', '\\1_iso', .x)) %>% select(contains('_iso'), iso2c = 7, iso3c = 8, iso3n = 9, MARC, EDGAR, WMO, FIFA, gec = FIPS, GAUL, IOC, TLD, iso4217_3c = 10,iso4217_name = 13, iso4217_3n = 14, is_independent, Capital, Continent) %>% mutate(gec = ifelse(iso3c == 'BES', NA, gec)) %>% janitor::clean_names() w_c <- w %>% select(tld, 2:4, 6:20) %>% mutate(license_plate = gsub(' \\(.*\\)', '', license_plate), itu = case_when(iso2c == 'TL' ~ 'TLS', iso2c == 'KN' ~ 'KNA', TRUE ~ itu), nato2c = ifelse(iso2c == 'RU', 'RU', nato2c)) %>% drop_na(iso2c) unicode_modern_c <- unicode_modern %>% separate(code, c('code', 'alt'), sep = '-alt-', fill = 'right') %>% mutate(iso3n = ifelse(nchar(code) == 3, code, NA), iso2c = ifelse(nchar(code) == 2, code, NA)) %>% select(-code) %>% janitor::clean_names() codelist <- countrycode::codelist countrycode_data_c <- codelist %>% mutate(iso2c = coalesce(iso2c, genc2c)) %>% select(iso2c, matches('regex')) %>% drop_na(iso2c) %>% janitor::clean_names() %>% set_names(~sub('country_name_', '', .x))
cia <- full_join(country_codes_c %>% drop_na(gec), country_names_c) usg <- full_join(genc_c %>% drop_na(gec), cia, by = 'gec') %>% mutate(iso3c = coalesce(iso3c.x, iso3c.y), iso2c = coalesce(iso2c.x, iso2c.y), iso3n = coalesce(iso3n.x, iso3n.y)) %>% select(-matches('\\.')) ok_un <- okfn_c %>% full_join(unstats_c) us_ok_un <- full_join(ok_un, usg, by = 'iso3n') %>% mutate(iso3c = coalesce(iso3c.x, iso3c.y), iso2c = coalesce(iso2c.x, iso2c.y), gec = coalesce(gec.x, gec.y), tld = coalesce(tld.x, tld.y)) %>% select(-matches('\\.')) us_ok_un_w <- full_join(us_ok_un, w_c, by = 'iso2c') %>% mutate(iso3c = coalesce(iso3c.x, iso3c.y), iso3n = coalesce(iso3n.x, iso3n.y), gec = coalesce(gec.x, gec.y), marc = coalesce(marc.x, marc.y), # wmo = coalesce(wmo.x, wmo.y), # w/okfn codes differ, and no official list to verify ioc = coalesce(ioc.x, ioc.y), stanag = coalesce(stanag.x, stanag.y), tld = coalesce(tld.x, tld.y)) %>% select(-matches('\\.')) unicode <- unicode_codes %>% drop_na(iso2c) %>% right_join(unicode_modern_c, by = 'iso2c') %>% mutate(iso3n = coalesce(iso3n.x, iso3n.y)) %>% select(-matches('\\.')) %>% mutate(iso3n = case_when(iso2c == 'CP' ~ '905', iso2c == 'DG' ~ '908', iso2c == 'XK' ~ '901', TRUE ~ iso3n), iso3c = ifelse(iso2c == 'XK', NA, iso3c)) countries <- full_join(unicode, us_ok_un_w, by = c('iso3n', 'iso2c', 'alt')) %>% mutate(iso3c = coalesce(iso3c.x, iso3c.y), gec = coalesce(gec.x, gec.y)) %>% select(-matches('\\.')) countries <- left_join(countries, countrycode_data_c)
# check code duplication us_ok_un_w %>% names() %>% map_int(~ countries %>% filter(is.na(alt)) %>% group_by_at(.x) %>% filter(n() > 1) %>% select(gec, en, iso3c, iso2c, iso3n) %>% .[!is.na(.[[.x]]),] %>% nrow()) %>% set_names(names(us_ok_un_w))
countries_colnames <- names(countries) codes <- data_frame(column = countries_colnames, code = gsub('_', '-', column)) %>% mutate(expansion = map(code, safely(NLP::parse_IETF_language_tag), expand = TRUE), expansion = map(expansion, c(1, 1)), expansion = map(expansion, ~if(length(.x) == 0) {c(Language = NA_character_)} else .x), expansion = map(expansion, map_df, ~suppressWarnings(na_if(toString(na.omit(.x)), ''))), expansion = map(expansion, ~set_names(.x, sub('=.*', '', names(.x))))) %>% unnest() %>% mutate(Language = case_when(column == 'alt' ~ NA_character_, column == 'mcc' ~ NA_character_, column == 'tld' ~ NA_character_, TRUE ~ Language), Variant = ifelse(column == 'en_us_posix', 'POSIX', Variant), Variant = ifelse(grepl('_iso', column), 'ISO', Variant), Variant = ifelse(grepl('_cia', column), 'CIA World Factbook', Variant), Extension = ifelse(grepl('_cia_', column), sub('en_cia_', '', column), Extension), name = case_when( column == 'iso2c' ~ 'ISO 3166-1 Alpha-2 code', column == 'iso3c' ~ 'ISO 3166-1 Alpha-3 code', column == 'iso3n' ~ 'ISO 3166-1 numeric code', column == 'en_iso' ~ 'ISO English name', column == 'fr_iso' ~ 'ISO French name', column == 'gec' ~ 'Geopolitical Entities and Codes', column == 'fifa' ~ 'FIFA (Fédération Internationale de Football Association) code', column == 'gaul' ~ 'Global Administrative Unit Layers from the Food and Agriculture Organization (FAO) code', column == 'iso4217_3c' ~ 'ISO 4217 3-character currency code', column == 'iso4217_name' ~ 'ISO 4217 currency name', column == 'iso4217_3n' ~ 'ISO 4217 numeric currency code', column == 'is_independent' ~ 'Country sovereignty status from the CIA World Factbook', column == 'capital' ~ 'Capital city', column == 'edgar' ~ 'EDGAR country code from the SEC', column == 'en_cia' ~ 'Country names from the CIA World Factbook', column == 'en_cia_local' ~ 'Local country names from the CIA World Factbook', column == 'en_cia_abbreviation' ~ 'Commonly used country abbreviations from the CIA World Factbook.', column == 'mcc' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.212 Mobile Country Code', column == 'itu_callsign' ~ 'International Telecommunication Union (ITU) callsign prefixes for radio and television stations', column == 'itu' ~ 'International Telecommunication Union (ITU) 1-3 character country code', column == 'itu_maritime' ~ 'International Telecommunication Union (ITU) Maritime Identification Digits', column == 'license_plate' ~ 'Motor vehicle licence plate country code', column == 'stanag' ~ 'North Atlantic Treaty Organization (NATO/OTAN) STANAG 1059 Letter Codes for Geographical Entities', column == 'nato2c' ~ 'North Atlantic Treaty Organization (NATO/OTAN) 2-letter code.', column == 'undp' ~ 'United Nations Development Programme (UNDP) country code', column == 'marc' ~ 'MAchine-Readable Cataloging (MARC) codes from the US Library of Congress', column == 'calling' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.164 international telephone calling code', column == 'ioc' ~ 'International Olympic Committee country code', column == 'tld' ~ 'Internet Assigned Numbers Authority (IANA) country code top-level domain', column == 'm49' ~ 'United Nations Statistics Division (UNSD) M.49 area code', column == 'ldc' ~ 'United Nations (UN) Least Developed Countries', column == 'lldc' ~ 'United Nations (UN) Land Locked Developing Countries', column == 'sids' ~ 'United Nations (UN) Small Island Developing States', column == 'is_developed' ~ 'United Nations (UN) development status', grepl('_un', column) ~ 'United Nations (UN) Geoscheme region name', TRUE ~ NA_character_ ), notes = case_when( column == 'gec' ~ 'Formerly FIPS Pub 10-4, which was withdrawn by NIST in 2008. Maintained until 2014 by the National Geospatial-Intelligence Agency (NGA), after which it was frozen and superceded by GENC, the US government profile of ISO 3166.', column == 'en_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon offical forms, including North and South Korea and Bolivia.', column == 'fr_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon offical forms, including North and South Korea and Bolivia.', column == 'en_cia' ~ 'Many short and long forms. Uses "Burma" instead of "Myanmar".', column == 'en_cia_local' ~ 'Many short and long forms. Includes alternatives inline. Transliterates to Latin script.', column == 'en_cia_abbreviation' ~ 'Only included where commonly used. Includes alternatives inline.', column == 'mcc' ~ 'Includes ranges.', column == 'itu_callsign' ~ 'Includes ranges.', column == 'itu_maritime' ~ 'Includes ranges.', column == 'nato2c' ~ 'Officially deprecated in favor of STANAG 1059 (see "stanag").', column == 'tld' ~ 'Includes leading period.', grepl('_un_region', column) ~ 'Continent', grepl('regex', column) ~ 'Regex used for `parse_country`.', TRUE ~ NA_character_ )) %>% janitor::clean_names() %>% arrange(column) %>% select(1:2, 8:9, 3:7)
# to avoid ASCII CRAN warning codes <- codes %>% modify(stringi::stri_trans_general, 'Latin-ASCII') devtools::use_data(countries, countries_colnames, internal = TRUE, overwrite = TRUE) devtools::use_data(codes, overwrite = TRUE)
Data is licensed according to its source, most of which are in the public domain. Exceptions include
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.