# data-raw/PreprocessIEAData.R

# This is an old script that was used in the past to preprocess IEA data.
# This script has been superseded by the functions in the IEATools package.
# The script should remain here until fixes to 
#   * GHA PSB
#   * GHA Industrial electricity
#   * HND fuels
# are implemented in functions.
# 
#    --- MKH, 9 March 2020


# This script loads IEA data in (nearly) its as-downloaded format (in ktoe, not EJ) and
# converts to an R object that is stored in this repository.
# The only changes you need to make to the as-downloaded format of the IEA data files are
#
# (1) Collapse the top two lines of the file to look like this:
# Country	Flow	Product	1960	1961	1962 ...
#
# (2) Add columns entitled Ledger.side (Production or Consumption) and Flow.aggregation.point.
# 
# (3) Save as tab-delimited text files somewhere besides this repository. 
#     (The files are too big.)
#     
# Then, this script munges the data into tidy format.
# 
# Thereafter, corrections are made for countries we have studied.
# 
# Finally, all data are saved to locations where they become part of the package.
# 
# The workflow for this package should be as follows:
# 
# (1) Source this script.  (This script sources the "FixIEAData.R" script, so no need to source "FixIEAData.R".)
# (2) Build the package.

library(dplyr)
library(tidyr)
library(countrycode)
library(devtools)

# This script assumes that the working directory is set to the project directory.
# When that is true, 
# the resultant .rds file will live in the data directory.

# Locations of the raw IEA data files, which live outside this repository.
# Edit these to suit your machine. Point at the real files:
# R apparently does not follow aliases!
oecd_path <- file.path(
  "~", "Documents", "Calvin stuff", "Useful Work", "IEA Data", "All Countries",
  "energy-balances-oecd-extended-energy.txt"
)
nonoecd_path <- file.path(
  "~", "Documents", "Calvin stuff", "Useful Work", "IEA Data", "All Countries",
  "energy-balances-nonoecd-extended-energy.txt"
)

# Most recent year to keep; later years are filtered out further down.
LATEST_YEAR <- 2013

# Lookup table from the countrycode package: English country name and its
# iso2c code. The name column is called "Country" so that it can be used
# as the join key against the IEA data.
CountryInfo <- select(countrycode::codelist, Country = country.name.en, iso2c)

# There are some "Countries" in the IEA data set that do not have corresponding
# iso2c abbreviations in the countrycode database.  
# None of these countries are of interest to us now (March 2018),
# so we will not try any corrections at this time.
# Later, we can add additional rows to the CountryInfo data frame to pick up ISO abbreviations
# for missing countries.
# The code might look something like this:
# bind_rows(
#   data.frame(Country = c("Former Soviet Union (if no detail)",
#                          "Former Yugoslavia (if no detail)",
#                          "Republic of Vietnam",
#                          "Tanzania",
#                          "Venezuela",
#                          "Islamic Republic of Iran",
#                          "Dem. Republic of the Congo",
#                          "Dem. People's Rep. of Korea",
#                          "People's Republic of China",
#                          "C\x99te d'Iviore"),
#   iso2c = c("SO", "YU", "VN", "TZ", "VE", "IR", "CD", "KP", "CN", "CI"))
# )

#
# Load raw data
#
# Both files are read with read.delim, using
#   * check.names = FALSE, so year column headings do not get an "X" prefix;
#   * stringsAsFactors = FALSE, so nothing is converted to a factor, which
#     eliminates warnings in bind_rows and gather.
# The two data frames are then stacked into a single one.
IEAData1 <- bind_rows(
  lapply(
    c(oecd = oecd_path, nonoecd = nonoecd_path),
    read.delim,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
)

#
# Manipulate raw data
#
# Reshape to long form. Every column other than the five identifier columns
# is stacked: its heading goes into Year and its cell values into E.ktoe.
IEAData2 <- gather(
  IEAData1,
  key = "Year",
  value = "E.ktoe",
  -Country, -Ledger.side, -Flow, -Flow.aggregation.point, -Product
)

# Trim the long-form data: drop empty/placeholder values, estimated years,
# years after LATEST_YEAR, and the aggregate rows supplied by the IEA.
IEAData3 <- IEAData2 %>%
  # Remove missing values to reduce memory footprint
  filter(!E.ktoe %in% c("..", "x", "0", "c")) %>%
  filter(!is.na(E.ktoe)) %>%
  # Estimated data: drop every row whose Year label contains an "E".
  # grepl() is evaluated per row. The previous test, `"E" %in% Year`, collapsed
  # to a single TRUE/FALSE for the whole column, so it could only keep all
  # rows or drop all rows.
  # NOTE(review): assumes estimated years are labelled like "2014E" -- confirm
  # against the raw files.
  filter(!grepl("E", Year, fixed = TRUE)) %>%
  # Year is still text here (it comes from the column headings), so convert
  # before comparing; otherwise the comparison is done on strings.
  filter(as.numeric(Year) <= LATEST_YEAR) %>%
  # Eliminate aggregation rows.  We'll do our own aggregation if we need it.  
  filter(
    !Flow %in% c(
      "Total primary energy supply",
      "Total final consumption", 
      "Transformation processes", 
      "Energy industry own use",
      "Industry",
      "Transport",
      "Other",
      "Non-energy use",
      "Memo: Non-energy use chemical/petrochemical",       # Encoded this way in OECD data
      "   Memo: Non-energy use chemical/petrochemical",    # Encoded this way in Non-OECD data. Why, oh why?
      "Electricity output (GWh)",
      "Electricity output (GWh)-main activity producer electricity plants",
      "Electricity output (GWh)-autoproducer electricity plants",
      "Electricity output (GWh)-main activity producer CHP plants",
      "Electricity output (GWh)-autoproducer CHP plants",
      "Heat output",
      "Heat output-main activity producer CHP plants",
      "Heat output-autoproducer CHP plants",
      "Heat output-main activity producer heat plants",
      "Heat output-autoproducer heat plants",
      # The aggregation data for "OECD Total" is screwed up (2x).
      # NOTE(review): "OECD Total" looks like a Country value rather than a
      # Flow, so this entry may never match -- confirm.
      "OECD Total"
    )) %>%
  # Product-level aggregates are dropped as well.
  filter(!Product %in% c("Total", "Memo: Renewables"))

# Attach country codes.
# left_join keeps every row of the IEA data; names with no match in
# CountryInfo receive NA as their code. After the rename, the original name
# is in Name and the code is in Country.
IEAData4 <- IEAData3 %>%
  left_join(CountryInfo, by = "Country") %>%
  rename(Name = Country, Country = iso2c) %>%
  # Rows named "World" get the code "World" so that they are not removed by
  # the NA filter that follows.
  mutate(Country = if_else(Name %in% "World", "World", Country)) %>%
  # Whatever still has no code is not a country (OECD, IEA, etc.) and is
  # removed from the data set.
  filter(!is.na(Country))

# Final table: keep the columns below, in this order (which drops the
# Name of the country), then set the data types of Year and E.ktoe to numeric.
AllIEAData <- IEAData4 %>%
  select(Country, Ledger.side, Flow.aggregation.point, Flow, Product, Year, E.ktoe) %>%
  mutate(
    Year = as.numeric(Year),
    E.ktoe = as.numeric(E.ktoe)
  )

#
# Clean up the environment
#
# Remove the configuration values and intermediate data frames.
rm(list = c(
  "oecd_path", "nonoecd_path", "LATEST_YEAR", "CountryInfo",
  "IEAData1", "IEAData2", "IEAData3", "IEAData4"
))

#
# Fix the IEA data in places where we have better information
#
# The path is relative, so the working directory must be the project directory.
source(file.path("data-raw", "FixIEAData.R"))

#
# Save AllIEAData as an R object for later retrieval
#
use_data(AllIEAData, overwrite = TRUE)
# MatthewHeun/IEATools documentation built on Feb. 6, 2024, 3:29 p.m.