R/fix_IEA_data_for_Industry_subsectors.R

Defines functions fix_IEA_data_for_Industry_subsectors

#' Apply corrections to IEA data needed for Industry subsectors
#' 
#' Apply corrections to IEA data to cope with fragmentary time series and 
#' replace outputs from blast furnaces and coke ovens, that are inputs into 
#' industry subsectors, by their respective inputs.  
#' The corrections done by this function are rather rudimentary and crude. This
#' gets smoothed away in regional aggregation. But do not use the resulting 
#' country-level data without additinonal scrutiny.
#' 
#' Use regional or global averages if IEA industry data lists energy use only as
#' "non-specified". 
#' Outputs from blast furnaces (\code{BLFURGS}, \code{OGASES}) and coke ovens 
#' (\code{OVENCOKE}, \code{COKEOVGS}, \code{COALTAR}, \code{NONCRUDE}), that are
#' inputs into industry subsectors, are replaced with the respective inputs 
#' based on regional averages. 
#' Used internally in \code{\link{calcIO}} for subtype 
#' \code{output_Industry_subsectors}.
#'
#' @param data MAgPIE object containing the IEA Energy Balances data
#' 
#' @param ieamatch mapping of IEA product/flow combinations to REMIND 
#'        \code{sety}/\code{fety}/\code{te} combinations as used in 
#'        \code{\link{calcIO}}.
#'
#' @return a MAgPIE object
#' 
#' @author Michaja Pehl
#' 
#' @importFrom rlang .data
#' @importFrom readr read_delim cols col_skip col_character
#' @importFrom quitte cartesian interpolate_missing_periods overwrite
#'             character.data.frame
#' @importFrom dplyr mutate inner_join group_by summarise anti_join left_join
#'             tbl_df rename
#' @importFrom assertr not_na
#' @importFrom tidyr spread gather complete nesting_
#' @importFrom magclass getRegions getYears getNames

fix_IEA_data_for_Industry_subsectors <- function(data, ieamatch) {
  
  # all industry subsector flows
  flows_to_fix <- c('IRONSTL', 'CHEMICAL', 'NONFERR', 'NONMET', 'TRANSEQ', 
                    'MACHINE','MINING', 'FOODPRO', 'PAPERPRO', 'WOODPRO', 
                    'CONSTRUC', 'TEXTILES')
  
  # all products associated with those flows
  products_to_fix <- ieamatch %>%
    filter(.data$iea_flows %in% flows_to_fix) %>%
    getElement('iea_product') %>%
    unique()

  region_mapping <- read_delim(
    file = toolMappingFile('regional', getConfig('regionmapping')),
    delim = ';',
    col_names = c('country', 'iso3c', 'region'),
    col_types = cols(country = col_skip(),
                     iso3c   = col_character(),
                     region  = col_character()),
    skip = 1)

  # ---- extend industry subsector time series ----
  # subset of data containing industry subsector products and flows
  data_industry <- data[,,cartesian(products_to_fix,
                                    c(flows_to_fix, 'TOTIND', 'INONSPEC'))] %>%
    as.data.frame() %>%
    select(iso3c = 'Region', year = 'Year', product = 'Data1', flow = 'Data2',
           value = 'Value') %>%
    mutate(year = as.integer(as.character(.data$year))) %>%
    filter(0 != .data$value) %>%
    inner_join(region_mapping, 'iso3c') %>%
    assert(not_na, .data$region)

  # all products that are consumed only in the non-specified subsector of
  # industry are "suspicious" and are therefore fixed
  data_to_fix <- inner_join(
    data_industry %>%
      filter('TOTIND' != .data$flow) %>%
      group_by(.data$iso3c, .data$region, .data$year, .data$product) %>%
      summarise(total = sum(.data$value, na.rm = TRUE)) %>%
      ungroup(),

    data_industry %>%
      filter(.data$flow %in% c('TOTIND', 'INONSPEC')) %>%
      spread(.data$flow, .data$value),

    c('iso3c', 'region', 'year', 'product')
  ) %>%
    filter(  abs(1 - (.data$total / .data$TOTIND)) > 1e-3 
           | .data$INONSPEC == .data$TOTIND) %>%
    select(.data$iso3c, .data$region, .data$year, .data$product, .data$TOTIND)

  # use all non-suspicious data to calculate reginal and global averages
  data_for_fixing <- anti_join(
    data_industry %>%
      filter('TOTIND' != .data$flow),

    data_to_fix %>%
      select(-.data$TOTIND),

    c('iso3c', 'region', 'year', 'product')
  ) %>% 
    tbl_df()

  data_for_fixing <- full_join(
    # compute global averages
    data_for_fixing %>%
      group_by(.data$year, .data$product, .data$flow) %>%
      summarise(value = sum(.data$value)) %>%
      ungroup() %>%   # FIXME: dplyr 0.7.4 seems unable to group again
      group_by(.data$year, .data$product) %>%
      mutate(global_share = .data$value / sum(.data$value)) %>%
      ungroup() %>%
      select(-.data$value) %>%
      # and expand to all regions
      mutate(region = NA_character_) %>%
      complete(nesting_(list('year', 'product', 'flow', 'global_share')),
               region = unique(region_mapping$region)),

    # compute regional averages
    data_for_fixing %>%
      group_by(.data$year, .data$region, .data$product, .data$flow) %>%
      summarise(value = sum(.data$value)) %>%
      ungroup() %>%   # FIXME: dplyr 0.7.4 seems unable to group again
      group_by(.data$year, .data$region, .data$product) %>%
      mutate(regional_share = .data$value / sum(.data$value)) %>%
      ungroup() %>%
      select(-.data$value),

    c('year', 'region', 'product', 'flow')
  ) %>%
    # use regional averages if available, global averages otherwise
    mutate(value = ifelse(!is.na(.data$regional_share), .data$regional_share,
                          .data$global_share)) %>%
    select(-.data$regional_share, -.data$global_share) %>%
    interpolate_missing_periods_(
      periods = list(year = sub('^y([0-9]{4})$', '\\1', getYears(data)) %>% 
                       as.integer() %>% 
                       sort()), 
      expand.values = TRUE, method = 'linear')

  # calculated fixed data
  data_industry_fixed <- left_join(
    data_to_fix,
    data_for_fixing,
    c('region', 'year', 'product')
  ) %>%
    # replace "suspicious" data with averages
    mutate(value = .data$TOTIND * .data$value) %>%
    select(.data$iso3c, .data$region, .data$year, .data$product, .data$flow, 
           .data$value) %>%
    assert(not_na, .data$value) %>%
    overwrite(data_industry) %>%
    select(COUNTRY = .data$iso3c, TIME = .data$year, PRODUCT = .data$product, 
           FLOW = .data$flow, Value = .data$value) %>%
    as.magpie()

  # replace fixed data
  data[getRegions(data_industry_fixed),
       getYears(data_industry_fixed),
       getNames(data_industry_fixed)] <- data_industry_fixed
  
  # ---- calculate factors to replace blast furnace outputs ----
  # get subset of data pertaining to blast furnaces
  flows_BLASTFUR <- grep('^(TOTAL|MRENEW)', 
                         grep('[ET]BLASTFUR', getNames(data), value = TRUE), 
                         value = TRUE, invert = TRUE)
  
  data_BLASTFUR <- data[,,flows_BLASTFUR] %>% 
    as.data.frame() %>% 
    tbl_df() %>% 
    select(iso3c = 'Region', year = 'Year', product = 'Data1', flow = 'Data2', 
           value = 'Value') %>% 
    filter(0 != .data$value) %>% 
    mutate(year = as.integer(as.character(.data$year)))
  
  # save output products from blast furnaces for replacement further upstream
  # (coke ovens) and downstream (industry)
  outputs_BLASTFUR <- data_BLASTFUR %>% 
    filter(0 < .data$value) %>% 
    getElement('product') %>% 
    unique() %>% 
    as.character()
  
  # aggregate over regions
  factors_BLASTFUR <- inner_join(data_BLASTFUR, region_mapping, 'iso3c') %>% 
    group_by(.data$region, .data$year, .data$product) %>% 
    summarise(value = sum(.data$value)) %>% 
    ungroup()
  
  # calculate the factor with which blast furnace outputs (i.e. industry inputs) 
  # are replaced by blast furnace inputs
  factors_BLASTFUR <- inner_join(
    # inputs into blast furnaces
    factors_BLASTFUR %>% 
      filter(0 > .data$value),
    
    # outputs from blast furnaces
    factors_BLASTFUR %>% 
      filter(0 < .data$value) %>% 
      group_by(.data$region, .data$year) %>% 
      summarise(output = sum(.data$value)) %>% 
      ungroup(),
    
    c('region', 'year')
  ) %>% 
    group_by(.data$region, .data$year, .data$product) %>% 
    summarise(factor = abs(.data$value / .data$output)) %>% 
    ungroup()
  
  # ==== replace coke oven outputs with coke oven inputs ====
  
  # get subset of data pertaining to coke ovens
  flows_COKEOVS <- grep('^(TOTAL|MRENEW)', 
                         grep('[ET]COKEOVS', getNames(data), value = TRUE), 
                         value = TRUE, invert = TRUE)

  data_COKEOVS <- data[,,flows_COKEOVS] %>% 
    as.data.frame() %>% 
    tbl_df() %>% 
    select(iso3c = 'Region', year = 'Year', product = 'Data1', flow = 'Data2',
           value = 'Value') %>% 
    filter(0 != .data$value) %>% 
    mutate(year = as.integer(as.character(.data$year)))
  
  # save output products from coke ovens for replacement downstream (industry)
  outputs_COKEOVS <- data_COKEOVS %>% 
    filter(0 < .data$value) %>% 
    getElement('product') %>% 
    unique() %>% 
    as.character()
  
  # aggregate over regions
  factors_COKEOVS <- inner_join(data_COKEOVS, region_mapping, 'iso3c') %>% 
    group_by(.data$region, .data$year, .data$product) %>% 
    summarise(value = sum(.data$value)) %>% 
    ungroup()
  
  # replace inputs into coke ovens, that are themselves outputs of blast 
  # furnaces, by the appropriate amount o blast furnace inputs
  factors_COKEOVS <- bind_rows(
    # for each Joule of blast furnace output, use factor Joules of all blast 
    # furnace inputs instead
    factors_COKEOVS %>% 
      filter(.data$product %in% outputs_BLASTFUR) %>% 
      rename(product.replace = 'product') %>% 
      inner_join(factors_BLASTFUR, c('region', 'year')) %>% 
      group_by(.data$region, .data$year, .data$product) %>% 
      summarise(value = sum(.data$value * .data$factor)) %>% 
      ungroup(),
    
    factors_COKEOVS %>% 
      filter(!.data$product %in% outputs_BLASTFUR)
  ) %>% 
    group_by(.data$region, .data$year, .data$product) %>% 
    summarise(value = sum(.data$value)) %>% 
    ungroup()
  
  # calculate the factor with which coke oven outputs (i.e. industry inputs) 
  # are replaced by coke oven inputs
  factors_COKEOVS <- inner_join(
    # inputs into coke ovens
    factors_COKEOVS %>% 
      filter(0 > .data$value),
    
    # outputs from coke ovens
    factors_COKEOVS %>% 
      filter(0 < .data$value) %>% 
      group_by(.data$region, .data$year) %>% 
      summarise(output = sum(.data$value)) %>% 
      ungroup(),
    
    c('region', 'year')
  ) %>% 
    group_by(.data$region, .data$year, .data$product) %>% 
    summarise(factor = abs(.data$value / .data$output)) %>% 
    ungroup()
  
  # ---- replace blast furnace and coke oven outputs by respective inputs ----
  
  # get data that needs to be replaced: blast furnace and coke oven outputs that
  # are inputs in industry sectors
  REMIND_industry_flows <- c(
    flows_to_fix, 
    'INONSPEC', 'AGRICULT', 'FISHING',
    'MAINELEC', 'AUTOELEC', 'MAINCHP', 'AUTOCHP', 'MAINHEAT', 'AUTOHEAT',
    'NONENUSE')
  
  replace_product.flow <- cartesian(c(outputs_BLASTFUR, outputs_COKEOVS), 
                                    REMIND_industry_flows)
  
  data_to_fix <- data[,,replace_product.flow] %>% 
    as.data.frame() %>% 
    tbl_df() %>% 
    select(iso3c = 'Region', year = 'Year', product = 'Data1', flow = 'Data2', 
           value = 'Value') %>% 
    filter(0 != .data$value) %>% 
    mutate(year = as.integer(as.character(.data$year)))
  
  # replace blast furnace outputs
  data_for_fixing <- bind_rows(
    data_to_fix %>% 
      filter(.data$product %in% outputs_BLASTFUR,
             .data$flow %in% REMIND_industry_flows,
             0 != .data$value) %>% 
      select(-.data$product) %>% 
      inner_join(region_mapping, 'iso3c') %>% 
      inner_join(factors_BLASTFUR, c('region', 'year')) %>% 
      group_by(.data$iso3c, .data$year, .data$product, .data$flow) %>% 
      summarise(value = sum(.data$value * .data$factor)) %>% 
      ungroup() %>% 
      character.data.frame(),
    
    data_to_fix %>% 
      filter( !.data$product %in% outputs_BLASTFUR 
            | !.data$flow %in% REMIND_industry_flows) %>% 
      character.data.frame()
  )
  
  # replace coke oven outputs
  data_for_fixing <- bind_rows(
    data_for_fixing %>% 
      filter(.data$product %in% outputs_COKEOVS,
             .data$flow %in% REMIND_industry_flows,
             0 != .data$value) %>% 
      select(-.data$product) %>% 
      inner_join(region_mapping, 'iso3c') %>% 
      inner_join(factors_COKEOVS, c('region', 'year')) %>% 
      group_by(.data$iso3c, .data$year, .data$product, .data$flow) %>% 
      summarise(value = sum(.data$value * .data$factor)) %>% 
      ungroup() %>% 
      character.data.frame(),
    
    data_for_fixing %>% 
      filter( !.data$product %in% outputs_COKEOVS 
            | !.data$flow %in% REMIND_industry_flows)
  ) %>% 
    group_by(.data$iso3c, .data$year, .data$product, .data$flow) %>% 
    summarise(value = sum(.data$value)) %>% 
    ungroup() %>% 
    character.data.frame() %>% 
    rename(Region = 'iso3c', Year = 'year', Data1 = 'product', Data2 = 'flow',
           Value = 'value') %>%
    as.magpie(spatial = 'Region', temporal = 'Year')
  
  # replace NAs with zeros
  data_for_fixing[is.na(data_for_fixing)] <- 0
  
  # replace
  regions_keep <- getRegions(data)
  years_keep   <- getYears(data)
  names_keep   <- setdiff(getNames(data), replace_product.flow)
  
  regions_replace <- getRegions(data_for_fixing)
  years_replace   <- getYears(data_for_fixing)
  names_replace   <- getNames(data_for_fixing)
  
  data_fixed <- new.magpie(cells_and_regions = sort(regions_keep), 
                    years = sort(years_keep),
                    names = unique(c(names_keep, names_replace)),
                    fill = 0)
  
  data_fixed[regions_keep,years_keep,names_keep] <- (
    data[regions_keep,years_keep,names_keep])
  
  data_fixed[regions_replace,years_replace,names_replace] <- (
      data_fixed[regions_replace,years_replace,names_replace]
    + data_for_fixing[regions_replace,years_replace,names_replace])
  
  return(data_fixed)
}
pik-piam/moinput documentation built on June 9, 2020, 12:23 p.m.