Settings

# Coefficient for outlier detection
# See coef argument in ?boxplot.stats
out_coef <- 1.5

## Debug logic for mapping of commodity
debughsfclmap <- TRUE
## Is it multicore?
multicore <- TRUE
# ---- libs ----
library(tradeproc)
library(stringr)
library(testthat)
library(ggplot2)
library(dplyr, warn.conflicts = F)

if(multicore) {
  suppressPackageStartupMessages(library(doParallel))
  library(foreach)
  doParallel::registerDoParallel(cores=detectCores(all.tests=TRUE))
}

Year under consideration:

year <- 2009L

Datasets from other packages for mapping of commodity and countries

# ---- datasets ----
## Data sets with hs->fcl map (from mdb files)
# and UNSD area codes (M49)
## TODO: replace by ad hoc tables

## Mapping of commodity hs to fcl dataset
data("hsfclmap2", package = "hsfclmap", envir = environment())
## Loading of the adjustments 
data("adjustments", package = "hsfclmap", envir = environment())
## Mapping of countries M49
data("unsdpartnersblocks", package = "tradeproc", envir = environment())
data("unsdpartners", package = "tradeproc", envir = environment())
## Units for fcl commodity
data("fclunits", package = "tradeproc", envir = environment())
## Fao country code with names
data("faocountrycode", package = "tradeproc", envir = environment())

# ---- hsfclmapsubset ----
# HS -> FCL map
## Filter hs->fcl links we need (based on year)

hsfclmap <- hsfclmap2 %>%
  # Filter out all records from future years
  filter_(~mdbyear <= year) %>%
  # Distance from year of interest to year in the map
  mutate_(yeardistance = ~year - mdbyear) %>%
  # Select nearest year for every reporter
  # if year == 2011 and mdbyear == 2011, then distance is 0
  # if year == 2011 and mdbyear == 2010, distance is 1
  group_by_(~area) %>%
  filter_(~yeardistance == min(yeardistance)) %>%
  ungroup() %>%
  select_(~-yeardistance) %>%
  ## and add trailing 9 to tocode, where it is shorter
  ## TODO: check how many such cases and, if possible, move to manualCorrectoins
  mutate_(tocode = ~hsfclmap::trailingDigits(fromcode,
                                             tocode,
                                             digit = 9))

Loading Tariffline (UNSD) and Eurostat data

# ---- tradeload ----

#### Get list of agri codes #### (Not needed at the moment)
## agricodeslist <- paste0(shQuote(getAgriHSCodes(), "sh"), collapse=", ")

## Function of Alex
# tldata <- getRawAgriTL(year, agricodeslist)
## Loading from local dataset
#load("../tldata_raw_from_db.RData")
#load("~/Dropbox/tradeproc/tldata_raw_from_db.RData")

## Function to read table from drive of Giorgio
load(paste0("~/Desktop/FAO/Trade/RData/tldata_",year,".RData"))
## 40262359

#### Download ES data ####

## Function of Alex
# esdata <- getRawAgriES(year, agricodeslist)
## Loading from local dataset
#load("../esdata_raw_from_db.RData")
#load("~/Dropbox/tradeproc/esdata_raw_from_db.RData")

## Function to read table from drive of Giorgio
load(paste0("~/Desktop/FAO/Trade/RData/esdata_",year,".RData"))
## 8665414
tldata <- tldata_raw
tldata = tldata %>%
  mutate_(lenhs = ~stringr::str_length(hs))

esdata <- esdata_raw
esdata = esdata %>%
  mutate_(lenhs = ~stringr::str_length(hs))

Pre-processing

Mapping country codes

# ---- geonom2fao ----
esdata <- esdata %>%
  mutate_(reporter = ~convertGeonom2FAO(reporter),
          partner = ~convertGeonom2FAO(partner))

# ---- tl_m49fao ----
## Based on Excel file from UNSD (unsdpartners..)

tldata1 <- tldata %>%
  left_join(unsdpartnersblocks %>%
              select_(wholepartner = ~rtCode,
                      part = ~formula) %>%
              # Exclude EU grouping and old countries
              filter_(~wholepartner %in% c(251, 381, 579, 581, 711, 757, 842)),
            by = c("partner" = "part")) %>%
  mutate_(partner = ~ifelse(is.na(wholepartner), partner, wholepartner),
          m49rep = ~reporter,
          m49par = ~partner,
          # Conversion from Comtrade M49 to FAO area list
          reporter = ~as.integer(tradeproc::convertComtradeM49ToFAO(m49rep)),
          partner = ~as.integer(tradeproc::convertComtradeM49ToFAO(m49par)))
distribution_hslen = function(data, title){
  to_plot = as.data.frame(table(data$lenhs))
  colnames(to_plot) = c("Lenght_hs","Frequency")
  ggplot(data = to_plot, aes(x = Lenght_hs, y = Frequency)) +
    geom_bar(position = 'dodge',stat = 'identity') +
    geom_text(aes(label=paste0(Frequency, " - ",round(Frequency/sum(Frequency)*100,2),"%")),
              position=position_dodge(width=0.9), vjust=-0.25,
              size = 2) +
    ggtitle(title)
}

# --- tariffline ---
distribution_hslen(tldata, title = paste0("Tariffline data, year ", year))

# --- eurostat ---
distribution_hslen(esdata, title = paste0("Eurostat data, year ", year))

Analysis on aggregates

For each country we stratify by flow and lenght of the hs code (lenhs) and we check if the values are the same for same flow but different hs lenght. In the plots all available data are shown, thus all lenhs available for that specific country.The y axis (total of values) are in log10 scale.

aggregate = function(data){
  result = data %>%
    group_by_(~reporter, ~year, ~flow, ~lenhs) %>%
    summarise_(n_records = ~n(),
               n_partners = ~n_distinct(partner),
               n_hs = ~n_distinct(hs),
               values = ~sum(value)) %>%
    left_join(fao_country_code, by = "reporter")
}

aggregate_tl = aggregate(tldata)
aggregate_es = aggregate(esdata)

# ---- TLdata ----
list_countries_tl = unique(aggregate_tl$reporter)
splitt = split(list_countries_tl, ceiling(seq_along(list_countries_tl)/25))
for (i in 1:length(splitt)){
  g = ggplot(data = as.data.frame(aggregate_tl %>% filter_(~reporter %in% splitt[[i]])), 
         aes(x=factor(lenhs), y=log10(values), fill = factor(flow))) +
    geom_bar(aes(fill=factor(flow)), stat="identity", position = "dodge") + 
    facet_wrap(~country_name, ncol = 5, scale = "free")
  print(g)
}

# ---- ESdata ----
ggplot(data = as.data.frame(aggregate_es), aes(x=factor(lenhs), y=log10(values), fill = factor(flow))) +
  geom_bar(stat="identity", position="dodge") + facet_wrap(~country_name)

Filter countries

For the country mapping we have a specific code in FAO country code, which aggregates all not defined countries. We have:

data("geonom2fao", package = "tradeproc", envir = environment())
table = geonom2fao %>% 
  select_(~code,~active) %>%
  filter_(~active == 252) 

print(xtable::xtable(table), type = "html", include.rownames = F)
data("m49faomap", package = "tradeproc", envir = environment())
table = m49faomap %>%
  filter_(~fao == 252)
print(xtable::xtable(table), type = "html", include.rownames = F)

We then remove, from both Eurostat and Tariffline, all the countries not having commodity mapping information from hs to fcl, because we are not able, afterwards, to map the corresponding commodity. For Tariffline we additionally remove all the European countries present in the Eurostat dataset.

# ---- es_partner_filter ----
## Eurostat data with partner = 252
esdata_discard_country = esdata %>%
  filter_(~partner == 252)

## Fitering countries with partner = 252
esdata = esdata %>%
  filter_(~partner != 252)

Eurostat, removing code 252, has r NROW(esdata) records.

# ---- drop_es_from_tl ----
# They will be replaced by ES data

tldata <- tldata %>%
  anti_join(esdata %>%
              select_(~reporter) %>%
              distinct(),
            by = "reporter")

Tariffline, removing European countries, has r NROW(tldata) records.

For Eurostat, this is the list of the countries non having hs-to-fcl mapping: (if empty not shown)

# ---- drop_reps_not_in_mdb ----
# We drop reporters what are absent in MDB hsfcl map
# because in any case we can proceed their data

## Eurostat data with not information in the hsfcl mapping
esdata_not_mapping_commodity <- esdata %>%
  filter_(~(!(reporter %in% unique(hsfclmap$area))))

## Record we loose including the hsfcl mapping information
table = esdata_not_mapping_commodity %>% select_(~reporter) %>% 
  unique() %>%
  left_join(fao_country_code, by = "reporter") %>%
print(xtable::xtable(table), type = "html", include.rownames = F)

## Eurostat dataset after complete mapping of the countries
esdata <- esdata %>%
  filter_(~reporter %in% unique(hsfclmap$area))

And removing these countries, the records, for Eurostats are r NROW(esdata).

For Tariffline, this is the list of the countries non having hs-to-fcl mapping:

tldata_not_mapping_commodity <- tldata %>%
  filter_(~(!(reporter %in% unique(hsfclmap$area))))

table = tldata_not_mapping_commodity %>% select_(~reporter) %>%
  unique() %>%
  left_join(fao_country_code, by = "reporter")

print(xtable::xtable(table), type = "html", include.rownames = F)

## Tariffline dataset after complete mapping of the countries
tldata <- tldata %>%
  filter_(~reporter %in% unique(hsfclmap$area))

And removing these countries, Tariffline has r NROW(tldata) records.

Recoding of re-Imports and re-Exports

We recode re-Import (4) to Import (1) and re-Export (3) to Export (2) for tariffline data.

Distinction between Exports and Re-exports / Imports and Re-imports

Exports of a country can be distinguished as exports of domestic goods and exports of foreign goods. The second class is generally referred to as re-exports. The exports shown in our database contain both the exports of domestic and foreign goods. Re-exports are exports of foreign goods in the same state as previously imported; they are to be included in the country exports. It is recommended that they be recorded separately for analytical purposes. This may require the use of supplementary sources of information in order to determine the origin of re-exports, i.e., to determine that the goods in question are indeed re-exports rather than the export of goods that have acquired domestic origin through processing. Re-imports are goods imported in the same state as previously exported. They are included in the country imports. It is recommended that they be recorded separately for analytical purposes. This may require the use of supplementary sources of information in order to determine the origin of re-imports, i.e., to determine that the goods in question are indeed re-imports rather than the import of goods that have acquired foreign origin through processing. There are several reasons why an exported good might return to the country of origin. The exported good might be defective, the importer might have defaulted on payments or cancelled the order, the authorities might have imposed an import barrier, or demand or prices in the country of origin might have made it worthwhile to bring the good back. UNSD

# ---- reexptoexp ----

# { "id": "1", "text": "Import" },
# { "id": "2", "text": "Export" },
# { "id": "4", "text": "re-Import" },
# { "id": "3", "text": "re-Export" }

tldata <- tldata %>%
  mutate_(flow = ~ifelse(flow == 4, 1L, ifelse(flow == 3, 2L, flow)))

Filter records with less than 6 digits for hs codes

We can now remove all the records having hs code lenght less than 6.

tldata = tldata %>%
  filter(nchar(hs) > 5)

esdata = esdata %>%
  filter(nchar(hs) > 2)

And we end up, before applying the mapping of the commodity with:



SWS-Methodology/faoswsTrade documentation built on Feb. 13, 2023, 1:04 a.m.