##' Source code to build Country-based data source
##'
#####################################################
# Import and Clean the Data
# # First Check that you have internet connection and can access github. If not, import local data source
# has_internet <- TRUE
# tryCatch({
# is.character(getURL("https://raw.githubusercontent.com/datasets/country-codes/master/data/"))==TRUE &
# !is.null(curl::nslookup("www.github.com")) },
# error= function(x){
# has_internet <- FALSE
# print('No Internet Connection')
# }
# )
#
#
# # If you have internet and github is accessible, update the data
# if (has_internet) {
# iso_data <- NULL
#
# # Get Country Codes, Regions,
# tryCatch({
# Remote ISO/UN country-code table. Only blank and single-space cells are
# treated as missing here: the literal string "NA" is a real value in this
# table (e.g. the ISO 3166-1 alpha-2 code of Namibia), so it must not be
# converted to a missing value.
country_data <- read.csv(
  "https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv",
  stringsAsFactors = FALSE,
  na.strings = c("", " ")
)
# Local supporting tables; blank, single-space and "NA" cells are read as NA.
dhs_countrydata <- read.csv("data-raw/DHS_countrydata.csv", stringsAsFactors=FALSE, na.strings = c(""," ","NA"))
nationality_data <- read.csv("data-raw/nationalities_and_languages.csv", stringsAsFactors=FALSE, na.strings = c(""," ","NA"))
# Alternative country spellings shipped with rworldmap.
alt_country_names <- rworldmap::countrySynonyms
# ISO 3166 parts 1, 2 and 3 as shipped with the ISOcodes package.
ISOcodes_level1 <- ISOcodes::ISO_3166_1
ISOcodes_level2 <- ISOcodes::ISO_3166_2
ISOcodes_level3 <- ISOcodes::ISO_3166_3
# },
# error= function(x) print('No Internet Connection')
# )
# Persist the freshly loaded tables under data-raw/ so the local copies stay
# current. The two commented-out calls are kept disabled, as in the original.
write.csv(
  country_data,
  file = "data-raw/iso_data.csv",
  row.names = FALSE
)
#write.csv(dhs_countrydata, 'data-raw/dhs_countrydata.csv', row.names = FALSE)
#write.csv(nationality_data, 'data-raw/nationalities_and_languages.csv', row.names = FALSE)
write.csv(
  alt_country_names,
  file = "data-raw/alt_country_names.csv",
  row.names = FALSE
)
# # If not, use the local data
# } else {
# source.path.folder <- file.path(source.path, 'iso_source')
#
# if (file.exists('data-raw/iso_data.csv')) {
# iso_data <- read.csv('data-raw/iso_data.csv', header=TRUE, stringsAsFactors=FALSE, na.strings=c(""," ","NA"))
# dhs_countrydata <- read.csv('data-raw/dhs_countrydata.csv', header=TRUE, stringsAsFactors=FALSE, na.strings=c(""," ","NA"))
# nationality_data <- read.csv('data-raw/nationalities_and_languages.csv', header=TRUE, stringsAsFactors=FALSE, na.strings=c(""," ","NA"))
# alt_country_names <- read.csv('data-raw/alt_country_names.csv', header=TRUE, stringsAsFactors=FALSE, na.strings=c(""," ","NA"))
#
# } else {
# print('No access to internet/github and local files could not be located')
# stop()
# }
# }
# Build Country Reference Dataset ---------------------------------------------
# - one row per ISO 3166-1 entry; other tables link to it through the codes.
# - Name is replaced by Common_name wherever a common name is available,
#   after which the Common_name column is dropped.
country_df <- ISOcodes_level1 %>%
  dplyr::mutate(
    ISO_level = 1,
    Type = "Country",
    Name = ifelse(is.na(Common_name), Name, Common_name)
  ) %>%
  dplyr::select(
    ISO3 = Alpha_3,
    ISO2 = Alpha_2,
    Numeric,
    ISO_level,
    tidyselect::everything()
  ) %>%
  dplyr::select(-Common_name) %>%
  as.data.frame()
# Save for the package
usethis::use_data(country_df, overwrite = TRUE, compress = "xz")
# Sub-Country Locations ---------------------------------------------------
# -- Includes England, Wales, etc.
# The first two characters of the subdivision Code are the parent country's
# ISO2 code; use them to look up the parent's ISO3 code and name in country_df.
ISOcodes_level2 <- ISOcodes_level2 %>%
  dplyr::mutate(ISO2 = substr(Code, 1, 2)) %>%
  dplyr::select(ISO2, tidyselect::everything())
ISOcodes_level2 <- ISOcodes_level2 %>%
  dplyr::mutate(
    ISO3 = country_df$ISO3[match(ISO2, country_df$ISO2)],
    Parent = country_df$Name[match(ISO2, country_df$ISO2)],
    ISO_level = 2
  ) %>%
  dplyr::select(ISO3, ISO2, tidyselect::everything())
# Override the names of three UAE emirates with their plain English spellings.
ISOcodes_level2$Name <- replace(ISOcodes_level2$Name, ISOcodes_level2$Code %in% "AE-AZ", "Abu Dhabi")
ISOcodes_level2$Name <- replace(ISOcodes_level2$Name, ISOcodes_level2$Code %in% "AE-AJ", "Ajman")
ISOcodes_level2$Name <- replace(ISOcodes_level2$Name, ISOcodes_level2$Code %in% "AE-DU", "Dubai")
locations_lvl2 <- ISOcodes_level2
usethis::use_data(locations_lvl2, overwrite = TRUE, compress = "xz")
# BUILD COUNTRY NAME DATA -------------------------------------------------
# Check for missing ISOs (diagnostic only; such rows are dropped just below)
sum(is.na(country_data$ISO3166.1.Alpha.3))
# Bring the key identifier columns to the front under short names, keep every
# other column, and drop rows that have no ISO3 code.
country_data <- country_data %>%
  dplyr::select(
    ISO3 = ISO3166.1.Alpha.3,
    ISO2 = ISO3166.1.Alpha.2,
    Country = official_name_en,
    Country2 = CLDR.display.name,
    UNcode = ISO3166.1.numeric,
    tidyselect::everything()
  ) %>%
  dplyr::filter(!is.na(ISO3))
# get rid of non-ASCII
country_data$Country <- iconv(country_data$Country, from = 'UTF-8', to = 'ASCII//TRANSLIT')
country_data$Country2 <- iconv(country_data$Country2, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Check for differences with codes from ISOcodes package
not_matched <- country_data[which(is.na(match(country_data$ISO3, country_df$ISO3))),]$Country
# grep() uses only the first element of a multi-element pattern and errors on
# a zero-length one, so search once per unmatched name. Names are matched
# literally (fixed = TRUE) because they can contain regex metacharacters.
unlist(lapply(not_matched[!is.na(not_matched)], grep, x = country_df$Name, fixed = TRUE))
unlist(lapply(not_matched[!is.na(not_matched)], grep, x = ISOcodes_level2$Name, fixed = TRUE))
# --> currently includes only "Sark" which is part of the UK
# Fix missing ISOs (temporary fix)
#country_data$ISO3[is.na(country_data$ISO3)] <- country_data$Country[is.na(country_data$ISO3)]
# Fix missing Country: fall back to the CLDR display name
country_data$Country[is.na(country_data$Country)] <- country_data$Country2[is.na(country_data$Country)]
# COUNTRY NAMES -----------------------------------------------------------
# Keep only the name-like columns of country_data (plus the ISO3 key); codes
# and other details are handled in their own sections.
country_names_df <- country_data %>%
  dplyr::select(
    ISO3, Country, Country2, FIFA, IOC,
    tidyselect::contains("official_name"),
    tidyselect::contains("UNTERM"),
    ISO4217.currency_country_name
  )
# Transliterate the listed name columns from UTF-8 to plain ASCII.
country_names_df <- country_names_df %>%
  dplyr::mutate(
    FIFA = iconv(FIFA, from = "UTF-8", to = "ASCII//TRANSLIT"),
    IOC = iconv(IOC, from = "UTF-8", to = "ASCII//TRANSLIT"),
    official_name_es = iconv(official_name_es, from = "UTF-8", to = "ASCII//TRANSLIT"),
    official_name_fr = iconv(official_name_fr, from = "UTF-8", to = "ASCII//TRANSLIT"),
    UNTERM.English.Formal = iconv(UNTERM.English.Formal, from = "UTF-8", to = "ASCII//TRANSLIT"),
    UNTERM.English.Short = iconv(UNTERM.English.Short, from = "UTF-8", to = "ASCII//TRANSLIT"),
    UNTERM.French.Formal = iconv(UNTERM.French.Formal, from = "UTF-8", to = "ASCII//TRANSLIT"),
    UNTERM.French.Short = iconv(UNTERM.French.Short, from = "UTF-8", to = "ASCII//TRANSLIT"),
    UNTERM.Spanish.Formal = iconv(UNTERM.Spanish.Formal, from = "UTF-8", to = "ASCII//TRANSLIT"),
    UNTERM.Spanish.Short = iconv(UNTERM.Spanish.Short, from = "UTF-8", to = "ASCII//TRANSLIT")
  )
# Fix IOC & FIFA: empty or single-space codes become NA
country_names_df$IOC[country_names_df$IOC %in% c("", " ")] <- NA
country_names_df$FIFA[country_names_df$FIFA %in% c("", " ")] <- NA
# Merge Reference data with names data
# Full join on ISO3 (upper-cased on the names side) so rows present in only
# one of the two sources are kept.
country_names_df <- dplyr::full_join(
  country_df %>% dplyr::select(-ISO2),
  country_names_df %>% dplyr::mutate(ISO3 = toupper(ISO3)),
  by = c("ISO3" = "ISO3")
) %>%
  dplyr::select(ISO3, Numeric, Name, tidyselect::everything())
country_names_df$ISO3[country_names_df$ISO3=="" | country_names_df$ISO3==" "] <- NA
# Check for unmatched duplicates: rows without an ISO3 whose Country equals
# (case-insensitively) the Name of another row. The lookup uses `Country`;
# the `name1` column is only added by the later join with alt_country_names
# and does not exist at this point.
dups <- match(tolower((country_names_df %>% dplyr::filter(is.na(ISO3)))$Country),
              tolower(country_names_df$Name))
dups <- tolower((country_names_df %>% dplyr::filter(is.na(ISO3)))$Country)[!is.na(dups)]
if (length(dups)>0){
  dupped.a <- match(dups, tolower(country_names_df$Country))
  dupped.b <- match(dups, tolower(country_names_df$Name))
  # Copy the name columns (from `Country` to the last column) onto the
  # reference row, then drop the now-redundant unmatched row.
  country_names_df[dupped.b,(which(colnames(country_names_df)=="Country")):ncol(country_names_df)] <-
    country_names_df[dupped.a,(which(colnames(country_names_df)=="Country")):ncol(country_names_df)]
  country_names_df <- country_names_df[-dupped.a,]
}
# Rows that came only from the names table have no Name yet; use Country
country_names_df$Name[is.na(country_names_df$Name)] <- country_names_df$Country[is.na(country_names_df$Name)]
# Merge with alt_country names
# full_join is namespace-qualified like every other dplyr call in this script,
# so the step does not depend on dplyr being attached.
country_names_df <- dplyr::full_join(
  country_names_df,
  alt_country_names %>% dplyr::mutate(ISO3 = toupper(ISO3)),
  by = c("ISO3" = "ISO3")
) %>%
  dplyr::select(ISO3, Numeric, Name, tidyselect::everything())
country_names_df$ISO3[country_names_df$ISO3=="" | country_names_df$ISO3==" "] <- NA
# Check for unmatched duplicates: rows without an ISO3 whose name1 equals
# (case-insensitively) the Name of another row.
dups <- match(tolower((country_names_df %>% dplyr::filter(is.na(ISO3)))$name1),
              tolower(country_names_df$Name))
dups <- tolower((country_names_df %>% dplyr::filter(is.na(ISO3)))$name1)[!is.na(dups)]; print(dups)
if (length(dups)>0){
  dupped.a <- match(dups, tolower(country_names_df$name1))
  dupped.b <- match(dups, tolower(country_names_df$Name))
  # Copy the synonym columns (from `name1` to the last column) onto the
  # matching row, then drop the unmatched synonym row.
  country_names_df[dupped.b,(which(colnames(country_names_df)=="name1")):ncol(country_names_df)] <-
    country_names_df[dupped.a,(which(colnames(country_names_df)=="name1")):ncol(country_names_df)]
  country_names_df <- country_names_df[-dupped.a,]
}
# Fix missing Name: fall back to the first synonym
country_names_df$Name[is.na(country_names_df$Name)] <- country_names_df$name1[is.na(country_names_df$Name)]
# Check ISO Level-2 data
# Rows without a Numeric code whose (lower-cased) Name also appears in
# locations_lvl2 get their Name, ISO3 and ISO_level from the level-2 table.
dups <- tolower(country_names_df$Name[is.na(country_names_df$Numeric)])
dups <- dups[dups %in% tolower(locations_lvl2$Name)]
print(dups)
if (length(dups) > 0) {
  dupped.a <- match(dups, tolower(country_names_df$Name))
  dupped.b <- match(dups, tolower(locations_lvl2$Name))
  # The level-2 `Code` is stored in the ISO3 column for these rows.
  country_names_df[dupped.a, c("Name", "ISO3", "ISO_level")] <-
    locations_lvl2[dupped.b, c("Name", "Code", "ISO_level")]
}
# Fix missing ISO
# Keep the genuine code in ISO3true, then use the Name as a stand-in key
# wherever ISO3 is missing.
country_names_df$ISO3true <- country_names_df$ISO3
country_names_df$ISO3[is.na(country_names_df$ISO3)] <- country_names_df$Name[is.na(country_names_df$ISO3)]
# Remove West Bank (already there with palestine)
# Rows with no synonym ID must be kept explicitly: `ID != 276` is NA for them,
# and an NA row index would replace each of them with an all-NA row.
country_names_df <- country_names_df[is.na(country_names_df$ID) | country_names_df$ID != 276, ]
country_names_df <- country_names_df %>% dplyr::filter(!is.na(ISO3))
row.names(country_names_df) <- country_names_df$ISO3
# # Convert to list for easy manipulation
# country_name_list <- lapply(split(country_names_df, row.names(country_names_df)), unlist)
# # TEST
# country_name_list$USA
# Convert to long -- easier to search
# Drop the helper columns (the backup code column is `ISO3true`), then stack
# every remaining name column into name_type / names pairs keyed by ISO3,
# Name and ISO_level.
country_names <- country_names_df %>%
  dplyr::select(-ID, -Numeric, -ISO3true) %>%
  tidyr::gather(key = "name_type", value = "names", -ISO3, -Name, -ISO_level)
# Discard missing, empty and single-space names
country_names <- country_names %>% dplyr::filter(!is.na(names) & names!="" & names!=" ")
# Add "America" to USA
# Appends one row: the first three columns of the first row whose ISO3 is
# "USA", with name_type "added1" and names "America".
# NOTE(review): the positional `1:3` presumably picks ISO3, Name and
# ISO_level -- confirm the column order produced by the gather step.
country_names <- rbind(country_names, as.vector(c(country_names[which(country_names$ISO3=="USA")[1],1:3],
name_type="added1", names="America")))
# Get rid of duplicates
# name_type becomes a factor whose level order is the order of first
# appearance, so that sorting by (ISO3, name_type) is deterministic; after
# sorting, only the first occurrence of each name (case-insensitive, across
# the whole table) is kept.
name_types <- unique(country_names$name_type)
country_names <- country_names %>%
  dplyr::mutate(
    name_type = factor(name_type, levels = name_types, labels = name_types)
  ) %>%
  dplyr::arrange(ISO3, name_type) %>%
  dplyr::filter(!duplicated(toupper(names)))
# Two-column ISO3 / Name lookup table
country_iso <- country_names_df %>%
  dplyr::select(ISO3, Name)
# Save names data
usethis::use_data(country_iso, overwrite = TRUE, compress = "xz")
usethis::use_data(country_names, overwrite = TRUE, compress = "xz")
#usethis::use_data(country_name_list, overwrite = TRUE)
# COUNTRY CODES -----------------------------------------------------------
# Put the main identifiers first and keep every other column.
# (`tidyselect::everything()` must be qualified exactly once; a doubled
# `pkg::pkg::fun` is a parse error in R.)
country_codes <- country_data %>%
  dplyr::select(ISO3, Country, FIFA, IOC, tidyselect::everything())
# Drop the name, region, currency and development-status columns, leaving
# the remaining code columns.
country_codes <- country_codes %>%
  dplyr::select(-tidyselect::contains("official_name"),
                -tidyselect::contains("UNTERM"),
                -Dial, -is_independent,
                -Developed...Developing.Countries,
                -tidyselect::contains("ISO4217.currency"),
                -Small.Island.Developing.States..SIDS.,
                -tidyselect::contains("region"),
                -tidyselect::contains("Region"),
                -Country2, -Capital, -Languages,
                -Least.Developed.Countries..LDC.,
                -Global.Name, -Global.Code,
                -Land.Locked.Developing.Countries..LLDC.)
# Save codes data
#write_csv(country_codes, "data-raw/country_codes.csv")
usethis::use_data(country_codes, overwrite = TRUE, compress = 'xz')
# ALL OTHER DATA ----------------------------------------------------------
# CITY DATA #
# Need to cite the source for this !!!!
# readr and dplyr calls are namespace-qualified, matching the rest of the
# script, so this section does not depend on those packages being attached.
city_data <- readr::read_csv("data-raw/worldcities.csv") %>% as.data.frame()
city_data <- city_data %>% dplyr::rename(Name = city, ISO3 = iso3, ISO2 = iso2)
#check unique ISOs
# The column is named ISO3 after the rename above; `city_data$iso3` would be
# NULL and the check would always report 0.
length(unique(city_data$ISO3)) # should be 249 or less
usethis::use_data(city_data, overwrite = TRUE, compress = 'xz')
# TEST AND EXAMPLE DATA ---------------------------------------------------
# Example of country names (deliberately messy / misspelled inputs)
test_country_names <- c(
  "NEWZEALAND", "Sudan", "THAILLAND", "Denmark", "ITALY", "LEBANON",
  "THAILAND", "KSA", "BAHARAIN", "UAE", "INDONEISIA", "Spain",
  "U.S.A", "UK", "MALDIVE", "Oman", "PORTUGAL", "SAUDIARAB",
  "SRILANKA", "SOUDIARABIA", "Italy", "TAIWAN", "Honkong",
  "K.S.A", "Nepal", "LIBIYA", "Indoesia", "Chine", "America"
)
usethis::use_data(test_country_names, overwrite = TRUE, compress = "xz")
# Example of country names mixed with sub-country:
# the country examples above followed by four sub-country entries
test_mixed_names <- c(
  test_country_names,
  "England", "Dubai", "Dubaaay", "Wales"
)
usethis::use_data(test_mixed_names, overwrite = TRUE, compress = "xz")
# Example of city names
test_city_names <- c(
  "Chikago", "Newyork", "Dohaah", "Hong Kong", "Honkong",
  "Cayro", "Shanghay", "Dubai", "Dubaay", "Bankok", "Sanfrancisco", "Seaatel"
)
usethis::use_data(test_city_names, overwrite = TRUE, compress = "xz")
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.