R/EurostatBDF.R

# Header ------------------------------------------------------------------
#
# Title: R code for interacting with the Eurostat Bulk Download Facility
#
# Author: Graeme Walsh <graeme.walsh@hotmail.co.uk>
# Date: 17/08/2018


# Package Info ------------------------------------------------------------

#' A package for downloading datasets from the Eurostat Bulk Download Facility.
#'
#' The workhorse function of the package is \code{getEurostat()}. This function returns a
#' list containing a Eurostat dataset, the code lists for the dataset, and the dataset metadata.
#' It is optional for the function to also return the raw data (as it was downloaded)
#' and the data in CSV form (containing codes and labels). The code behind this package also
#' serves as a backend to an EViews add-in created by the author.
#'
#'
#' @section EurostatBDF functions:
#' \code{getEurostat()}
#'
#' @docType package
#' @name EurostatBDF
NULL


# Important variables -----------------------------------------------------

# New environment
EurostatBDF.env <- new.env()

# Eurostat Bulk Download Facility URL
EurostatBDF.env$base_url <- "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing"

# Query strings
EurostatBDF.env$data_query <- "?sort=1&file=data%2F"
EurostatBDF.env$dic_query  <- "?sort=1&file=dic%2Fen%2F"
EurostatBDF.env$home_query <- "?sort=1&file="

# Table dictionary
EurostatBDF.env$tdic_name <- "table_dic"

# Table of contents
EurostatBDF.env$toc_name <- "table_of_contents_en"

# File extensions
EurostatBDF.env$filext <- c(".tsv.gz", ".sdmx.zip", ".dic", ".sdmx.xml", ".txt", ".gz", ".zip")

# Flags in database
EurostatBDF.env$flags <- "[[:space:]]+[bcdefnprsuz]"


# Dictionary functions ----------------------------------------------------

#' @title Get a dictionary file from the Eurostat Bulk Download Facility
#' @description Reads a Eurostat dictionary file into R as a two-column dataframe
#' @param dic_name The name of the dictionary file of interest
#' @export
#' @importFrom utils read.table
#' @author Graeme Walsh
#' @details The dictionary file is read from the /dic/en sub-folder
getDic <- function(dic_name) {
  dic <- read.table(file = paste0(EurostatBDF.env$base_url, EurostatBDF.env$dic_query, dic_name, EurostatBDF.env$filext[3]), sep = "\t", quote = "", stringsAsFactors = FALSE, col.names = c("Code", "Label"))
  return(dic)
}

#' @title Get the table dictionary file from the Eurostat Bulk Download Facility
#' @description Reads the Eurostat table dictionary file into R as a two-column dataframe
#' @export
#' @importFrom utils read.table
#' @author Graeme Walsh
#' @details The table dictionary file is read from the /dic/en sub-folder
getTdic <- function() {
  tdic <- getDic(EurostatBDF.env$tdic_name)
  return(tdic)
}

#' @title Get the code lists for a dataset
#' @description Creates a list containing all of the dataset dimensions including values and labels
#' @param tidy_data A tidy Eurostat dataset
#' @export
#' @author Graeme Walsh
#' @details The dataset dimensions are matched to the values in the corresponding dictionary files
getCodeLists <- function(tidy_data){

  # Get dataset dimensions (e.g. geo, unit)
  dims <- strsplit(sub("\\\\", ",", names(tidy_data)[1]), split = ",")[[1]]
  dims <- dims[-length(dims)]

  # Get dataset dimension values (e.g. IE, PC_GDP)
  dim_vals <- apply(matrix(unlist(strsplit(tidy_data[,1], split = ",")), ncol = length(dims), byrow = TRUE), 2 , unique)

  # Get the dictionary files
  dics <- lapply(dims, getDic)

  # Match the dataset dimension values to their values and labels in the dictionary files
  code_lists <- list()
  for (i in c(1:length(dims))) {
    code_lists[[i]] <- dics[[i]][match(dim_vals[[i]], dics[[i]][,1]),]
    colnames(code_lists[[i]]) <- c("Code","Label")
    rownames(code_lists[[i]]) <- NULL
  }
  names(code_lists) <- dims

  return(code_lists)
}


# Table of contents functions ---------------------------------------------

#' @title Get the table of contents file from the Eurostat Bulk Download Facility
#' @description Reads the Eurostat table of contents file into R as a eight-column dataframe
#' @export
#' @importFrom utils read.table
#' @author Graeme Walsh
#' @details The table of contents file is read from the home directory
getToc <- function() {
  toc <- read.table(file = paste0(EurostatBDF.env$base_url, EurostatBDF.env$home_query, EurostatBDF.env$toc_name, EurostatBDF.env$filext[5]), sep = "\t", quote = "\"", header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
  return(toc)
}

#' @title Get the metadata associated with a dataset
#' @description Reads the metadata for a dataset from the table of contents file
#' @param dataset_name The dataset code of interest
#' @export
#' @author Graeme Walsh
#' @details The metdata contains information such as title, code, last update, etc.
getMeta <- function(dataset_name){

  # Get the table of contents file
  toc <- getToc()

  # Find the metadata by matching the dataset code name its value in the toc
  metadata <- toc[which(toc[,"code"] == dataset_name),][1,]
  rownames(metadata) <- NULL

  return(metadata)
}

#' @title Get the time period and frequency of a dataset
#' @description Gets the time period and frequency information from the table of contents file
#' @param dataset_name The dataset code of interest
#' @export
#' @author Graeme Walsh
#' @details Returns the frequency, start date, and end date of the dataset
getPeriod <- function(dataset_name){

  # Get the metadata from table of contents file
  metadata <- getMeta(dataset_name)

  # Get the start and end period of the dataset
  time_period <- metadata[,c("data start", "data end")]

  # Get the frequency of the dataset
  if(grepl("Q", time_period[1])) {
    freq <- "Q"
  } else if(grepl("M", time_period[1])){
    freq <- "M"
  } else if(grepl("H", time_period[1])){
    freq <- "H"
  } else if(grepl("W", time_period[1])){
    freq <- "W"
  } else if(grepl("D", time_period[1])){
    freq <- "D"
  } else if(!grepl("\\D", time_period[1])){
    freq <- "A"
  } else{
    freq <- "Unknown"
  }

  period <- cbind(freq, time_period)

  return(period)
}


# Read data functions -----------------------------------------------------

#' @title Get a dataset in raw format from the Eurostat Bulk Download Facility
#' @description Download data from Eurostat
#' @param dataset_name The dataset code of interest
#' @param format The format of the dataset
#' @export
#' @importFrom utils download.file read.table
#' @author Graeme Walsh
#' @details The dataset is downloaded from the Bulk Download Facility
getEurostatRaw <- function(dataset_name, format="tsv"){
  if(format == "tsv"){

    # Create .gz temp file
    temp <- tempfile(fileext = EurostatBDF.env$filext[6])

    # Download the file
    download.file(url = paste0(EurostatBDF.env$base_url, EurostatBDF.env$data_query, dataset_name, EurostatBDF.env$filext[1]), destfile = temp)

    # Read the .tsv file as dataframe
    raw_data <- read.table(file = gzfile(temp), sep = "\t", header = TRUE, stringsAsFactors = FALSE, strip.white = TRUE, check.names = FALSE)

  }else if(format == "sdmx"){

    stop("SDMX file format not yet supported by the EurostatBDF package. Try TSV format, instead.")

    # Create .zip temp file
    # temp <- tempfile(fileext = EurostatBDF.env$filext[7])

    # Download the file
    # download.file(url = paste0(EurostatBDF.env$base_url, EurostatBDF.env$data_query, dataset_name, EurostatBDF.env$filext[2]), destfile = temp)

    # Code needs to be written...
  }
  return(raw_data)
}

#' @title Get a complete dataset from the Eurostat Bulk Download Facility
#' @description Download data from Eurostat (including code lists and metadata)
#' @param dataset_name The dataset code of interest
#' @param append_raw TRUE or FALSE
#' @param append_csv TRUE or FALSE
#' @export
#' @author Graeme Walsh
#' @details The dataset is returned as a list containing data, code lists, and metadata. The raw data can also be returned.
getEurostat <- function(dataset_name, append_raw=FALSE, append_csv=FALSE){

  # Get the raw data
  raw_data <- getEurostatRaw(dataset_name)

  # Tidy the raw data
  tidy_data <- tidyRawData(raw_data)

  # Get the code lists for the dataset
  code_lists <- getCodeLists(tidy_data)

  # Get the metadata for the dataset
  meta_data <- getMeta(dataset_name)

  # Package together all of the information into a list
  eurostat_dataset <- list(data = tidy_data, code_lists = code_lists, metadata = meta_data)

  # Append raw data
  if(append_raw == TRUE){
    eurostat_dataset <- append(eurostat_dataset, list(raw_data), after = length(eurostat_dataset))
    names(eurostat_dataset)[length(eurostat_dataset)] <- "raw_data"
  }

  # Prepare and append CSV data
  if(append_csv == TRUE){
    csv_data <- prepareCSV(tidy_data)
    eurostat_dataset <- append(eurostat_dataset, list(csv_data), after = length(eurostat_dataset))
    names(eurostat_dataset)[length(eurostat_dataset)] <- "csv_data"
  }

  return(eurostat_dataset)
}


# Data manipulation functions ---------------------------------------------

#' @title Tidy a raw format Eurostat dataset (remove flags, etc.)
#' @description Tidy a raw format Eurostat dataset (remove flags, etc.)
#' @param raw_data The raw dataset of interest
#' @export
#' @author Graeme Walsh
#' @details Flags are removed, missing values are replaced (: to NA), and columns are converted from descending to ascending
tidyRawData <- function(raw_data){

  # Remove flags, replace missing values with NA, and convert from strings to numerics
  raw_data[2:ncol(raw_data)] <- lapply(raw_data[2:ncol(raw_data)], function(x) {
    x <- gsub(EurostatBDF.env$flags, "", x)
    x[x == ":"] <- NA
    x <- as.numeric(as.character(x))
  })

  # Rearrange columns from descending to ascending order
  tidy_data <- raw_data[, c(1, ncol(raw_data):2)]

  return(tidy_data)
}


# Functions for EViews ----------------------------------------------------

#' @title Prepare a dataset to be saved in CSV format (add codes and labels to dataframe)
#' @description Prepares a dataset to be saved in CSV format (containing codes and labels)
#' @param tidy_data A tidy Eurostat dataset
#' @export
#' @author Graeme Walsh
#' @details Codes and labels are added to the dataframe
prepareCSV <- function(tidy_data){

  # Get dataset dimensions (e.g. geo, unit)
  dims <- strsplit(sub("\\\\", ",", names(tidy_data)[1]), split = ",")[[1]]
  dims <- dims[-length(dims)]

  # Get dataset dimension values (e.g. IE, PC_GDP)
  dim_vals <- apply(matrix(unlist(strsplit(tidy_data[,1], split = ",")), ncol = length(dims), byrow = TRUE), 2 , unique)

  # Get the dictionary files
  dics <- lapply(dims, getDic)

  # Split the identifier into separate columns
  var_id    <- strsplit(tidy_data[,1], split = ",")
  var_codes <- do.call("rbind", var_id)

  # Match the dimension values to their labels in the dictionary files
  labs <- list()
  for (i in c(1:(length(dims)))) {
    labs[[i]] <- dics[[i]][match(var_codes[,i], dics[[i]][,1]),]
  }
  labs_cols <- do.call("cbind", labs)
  colnames(labs_cols) <- paste0(rep(dims[1:length(dims)], each=2), c("_code", "_description"))

  # Add labels to the dataframe
  csv_data <- cbind(tidy_data[,1], labs_cols, tidy_data[,-1])
  rownames(csv_data) <- NULL
  colnames(csv_data)[1] <- "identifier"

  return(csv_data)
}

#' @title Get column header properties (for use in EViews)
#' @description Gets the names and number of column headers (for use in EViews)
#' @param csv_data A tidy Eurostat dataset prepared to be exported as CSV file for EViews
#' @export
#' @author Graeme Walsh
#' @details Returns the names and number of column headers for importing a CSV file into EViews
getColhead <- function(csv_data){

  # Get column header names
  colheadnames <- names(which(sapply(csv_data, is.character)))
  colheadnames <- c("Name", colheadnames)

  # Get the number of column headers
  colheadnum <- length(colheadnames)

  # Prepare the column header names string for EViews
  colheadnamesQ <- paste0("\"", colheadnames, "\"")
  colheadnamesQC <- paste0(colheadnamesQ, ",")
  colheadnamesQC[length(colheadnamesQC)] <- colheadnamesQ[length(colheadnamesQ)]
  colheadnamesFinal <- paste(colheadnamesQC, collapse = " ")

  # Return the names and number as a list
  colhead <- list(names=colheadnamesFinal, num=colheadnum)

  return(colhead)
}
xprimexinverse/EurostatBDF documentation built on May 5, 2019, 6:56 p.m.