library(httr)
library(rvest)
library(dplyr)
library(stringr)
library(readxl)
library(readr)
library(tidyr)
library(digest)
# -----------------------------------------------------------------------------
# Utility functions
# Download a file unless a local copy already exists
download_if_absent <- function(url, dest, force = FALSE) {
absent <- !file.exists(dest)
if (force && !absent) {
file.remove(dest)
absent <- TRUE
}
if (absent)
GET(url, write_disk(dest), progress())
}
# Build a table with details of each variable.
# Retain the PIAAC Name, Label, Level, and Domain.
# Recode Type with codes as those use by the readr package:
# - 'i': Integer variables
# - 'd': floating point variables
# - 'c': character variables
# - '_': ignored variables
build_variables_table <- function(path) {
# Read codebook sheet 'Variables': Drop columns 'Link to values' and following.
vars <- read_excel(path, sheet = 'Variables')[1:10]
# Variables to retain
idx <- str_detect(vars$Domain, regex("^(Background|Sampling|Scale|Not)"))
# Variables to drop
drop_vars <- vars[!idx, ] %>% mutate(Type = "_")
sel_vars <- vars[idx, ]
# String variables
str_vars <- sel_vars %>%
filter(Type == 'String/character (Unicode)') %>%
mutate(Type = "c")
# Integer variables
int_vars <- sel_vars %>%
filter(Type == 'Integer', Width < 10) %>%
mutate(Type = "i")
# Floating point variables
dbl_vars <- sel_vars %>%
filter(Type == 'Numeric/floating point' |
(Type == "Integer" & Width >= 10)) %>%
mutate(Type = "d")
# Merge all dataframes
bind_rows(drop_vars, str_vars, int_vars, dbl_vars) %>%
select(Name, Label, Type, Level, Domain)
}
# Return a list with three elements:
# - missing_codes: the codes used for missing values in the csv files.
# - codes: a dataframe with coding schemes
# - variable_code: a table that associates variables and coding schemes
build_values_tables <- function(path) {
# Read codebook sheet 'Values'
cb_values <- read_excel(path, sheet = 'Values')
# Missing value codes -------------------------------------------------
# The missing values in the CSV files are recorded as the SAS codes
# without the leading dot.
missing_codes <- cb_values %>%
filter(`Value Type` == 'Missing', !is.na(`Value (SAS)`)) %>%
distinct(`Value (SAS)`) %>%
transmute(Miss = substr(`Value (SAS)`, 2, 2))
# Apparently, variable CTRYQUAL also uses Z as a missing value. This is
# not documented in the codebook
missing_codes <- c(missing_codes$Miss, 'Z')
# Variable codings ----------------------------------------------------
# Filter out missing values
valids <- cb_values %>%
filter(`Value Type` != 'Missing') %>%
select(Name = `Variable Name`, Label = `Value Label`, Value = `Value (SAS)`)
# Use an environment as a hash table
tb_env <- new.env(parent = emptyenv())
# 1. Get the values scheme for a variable.
# 2. Compute the hash value for that scheme.
# 3. Add the scheme to the hash table if it is not already included.
# 4. Return the hash.
handle_var <- function(var) {
tb <- valids %>% filter(Name == var) %>% select(Label, Value)
md5 <- digest(tb)
if (!exists(md5, where = tb_env))
assign(md5, tb, envir = tb_env)
return(md5)
}
# Compute the hash code for each variable and update if necessary
# the hashtable.
coded_vars <- (valids %>% distinct(Name))$Name
var_hashes <- vapply(coded_vars, handle_var, "", USE.NAMES = FALSE)
# Get the hashes in the hash table
hashes <- unique(var_hashes)
# Associate each var with code scheme
variable_code <- tibble(Name = coded_vars, ID = match(var_hashes, hashes))
# Transform the hashtable into a dataframe
codes <- do.call(bind_rows, lapply(seq_along(hashes), function(i) {
get(hashes[i], envir = tb_env) %>% mutate(ID = i)
}))
list(missing_codes = missing_codes, codes = codes, variable_code = variable_code)
}
# -----------------------------------------------------------------------------
# Missing variables by country
# Returns a dataframe that holds the missing variables
# in each country dataset.
build_missing_variables_table <- function(path) {
# Read info in the "all data" sheet.
# There are some extra rows in that sheet: drop them.
miss_vars <- read_excel(path, sheet = "all data",
col_types = rep("text", 35)) %>%
filter(!is.na(Domain)) %>%
gather(Country, missing, aut:tur, na.rm = TRUE) %>%
select(Name = `Variable name`, Country)
}
# PIAAC codebook
cb_url <-
paste0("http://www.oecd.org/site/piaac/", URLencode(
"International Codebook_PIAAC Public-use File (PUF) Variables and Values.xlsx"))
# Missing variables by country
miss_vars_url <-
"http://www.oecd.org/skills/piaac/List_of_missing_variables_in_PUF_by_country.xls"
# Where the local copies are stored
cb_file <- file.path('data-raw', 'codebook.xlsx')
miss_vars_file <- file.path('data-raw', 'missing-vars.xls')
download_if_absent(cb_url, cb_file)
download_if_absent(miss_vars_url, miss_vars_file)
# Build the PIAAC metadata tables
variables <- build_variables_table(cb_file)
values <- build_values_tables(cb_file)
missing_variables <- build_missing_variables_table(miss_vars_file)
# Variable names in missing variables file are lower case.
# Capitalize them as in the codebook.
idx <- match(missing_variables$Name, str_to_lower(variables$Name))
missing_variables$Name <- variables$Name[idx]
# Drop unused variables from metadata tables
idx <- variables$Type == '_'
ignore_vars <- variables$Name[idx]
variables <- variables[!idx, ]
idx <- missing_variables$Name %in% ignore_vars
missing_variables <- missing_variables[!idx, ]
idx <- values$variable_code$Name %in% ignore_vars
values$variable_code <- values$variable_code[!idx, ]
idx <- values$codes$ID %in% unique(values$variable_code$ID)
values$codes <- values$codes[idx, ]
use_data(missing_variables, variables, values, internal = TRUE)
# URL where PIAAC data files are stored
PIAAC_URL <- 'http://vs-web-fs-1.oecd.org/piaac/puf-data/CSV'
csv_names <- html_text(html_nodes(read_html(PIAAC_URL), "a"), trim = TRUE)
csv_names <- csv_names[str_detect(csv_names, regex('\\.csv$'))]
if (!dir.exists('data'))
dir.create('data')
country_data <- function(csv) {
# Variables to read and their types
build_col_types <- function(country) {
types <- setNames(variables$Type, variables$Name)
miss_vars <- missing_variables[missing_variables$Country == country, ]$Name
types[miss_vars] <- "_"
do.call(cols_only, as.list(types[types != "_"]))
}
country <- str_sub(csv, 4, 6)
url <- paste(PIAAC_URL, csv, sep = "/")
dest <- file.path('data', paste0(country, ".rda"))
data <- read_csv(content(GET(url), "raw"),
na = values$missing_codes,
col_types = build_col_types(country))
assign(country, data)
save(list = country, file = dest, compress = "xz")
}
for (csv in csv_names) {
cat(csv, '\n')
country_data(csv)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.