library(plyr)
library(dplyr)
library(magrittr)
library(lubridate)
load_all()
get_current_sos()
names(sos)
# Create Unique ID
sosid <- paste0("SOS", 1:nrow(sos))
# Countries
countries <- sos$Countries
countries_unlist <- countries %<>%
tolower() %>%
tokenize(split_and = FALSE) %>%
unlist()
table(countries_unlist)[order(table(countries_unlist), decreasing = TRUE)]
# There's an EU entry. The EU countries are: Austria, Belgium, Bulgaria, Croatia, Republic of Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Ireland, Italy, Latvia, Lithuania, Luxembourg, Malta, Netherlands, Poland, Portugal, Romania, Slovakia, Slovenia, Spain, Sweden and the UK.
countries_unique <- unique(countries_unlist)
data(country_codes)
# These are the countries that match more than one "officialname":
lapply(countries_unique, grep, tolower(country_codes$officialname)) %>%
sapply(length) %>%
sapply(`>`, 1) %>%
countries_unique[.]
# However, I don't think this solves all of our problems.
# For instance, "uk" won't match "United Kingdom; it'll match "Ukraine".
# These are the countries that match zero "officialname" entries:
lapply(countries_unique, grep, tolower(country_codes$officialname)) %>%
sapply(length) %>%
sapply(`==`, 0) %>%
countries_unique[.]
# How the same thing for the shortname: countries matching >1
lapply(countries_unique, grep, tolower(country_codes$shortname)) %>%
sapply(length) %>%
sapply(`>`, 1) %>%
countries_unique[.]
# Countries matching 0 shortnames:
lapply(countries_unique, grep, tolower(country_codes$officialname)) %>%
sapply(length) %>%
sapply(`==`, 0) %>%
countries_unique[.]
officialname_matches <- lapply(countries_unique, grep, tolower(country_codes$officialname), value = TRUE)
names(officialname_matches) <- countries_unique
shortname_matches <- lapply(countries_unique, grep, tolower(country_codes$shortname), value = TRUE)
names(shortname_matches) <- countries_unique
officialname_indices <- lapply(countries_unique, grep, tolower(country_codes$officialname))
lapply(officialname_indices, function(x, countries) {
if (length(x) > 1) x <- x[1]
country_codes[x, ]
}, countries = country_codes)
match_country <- function(x, first_only = FALSE, include_search = FALSE) {
official_rows <- grep(x, tolower(country_codes$officialname))
short_rows <- grep(x, tolower(country_codes$shortname))
rows <- list(official_rows, short_rows)
score <- function(x) {
as.numeric(dist(c(1, length(x))))
}
scores <- function(x) {
sapply(x, score)
}
if (identical(official_rows, short_rows)) {
best_rows <- short_rows # If they're identical and multiple, we just pick one.
} else {
best_rows <- rows[[which.min(scores(rows))]]
}
to_return <- country_codes[best_rows, ]
if (nrow(to_return) == 0) return(NULL)
if (first_only == TRUE) to_return <- to_return[1, ]
if (include_search == TRUE) to_return$search <- x
return(to_return)
}
test <- lapply(countries_unique, match_country)
names(test) <- countries_unique
lapply(test, nrow) # There are still some countries which have two matches.
countries <- sos$Countries %>%
tolower() %>%
tokenize(split_and = FALSE)
names(countries) <- sosid
# Pre-processing notes
# Change 'nf', as it matches "switzerland"
sos_country_codes <- list()
i <- 659
i <- 687
i <- 759
i <- 750
i <- 5
for (i in seq_along(countries)) {
x <- countries[[i]]
if (length(x) < 1) {
warning("Empty string for match.")
next
}
countries_matched <- lapply(x, match_country, first_only = TRUE, include_search = TRUE)
countries_matched <- do.call(rbind, countries_matched)
if (is.null(countries_matched)) {
warning("No matches found.")
next
}
countries_matched$sosid <- sosid[i]
sos_country_codes[[i]] <- countries_matched
}
sos_country_codes <- do.call(rbind, sos_country_codes)
# Here's how we'll process countries:
# 1. We'll conduct some pre-processing, for countries we know won't match properly.
# 2. We'll match every country with its shortname and officialname
# 3. We'll convert that into a data frame.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.