# Coefficient for outlier detection # See coef argument in ?boxplot.stats out_coef <- 1.5 ## Debug logic for mapping of commodity debughsfclmap <- TRUE ## Is it multicore? multicore <- TRUE
# ---- libs ---- library(tradeproc) library(stringr) library(testthat) library(ggplot2) library(dplyr, warn.conflicts = F) if(multicore) { suppressPackageStartupMessages(library(doParallel)) library(foreach) doParallel::registerDoParallel(cores=detectCores(all.tests=TRUE)) }
Year under consideration:
year <- 2009L
Datasets from other packages for mapping of commodity and countries
# ---- datasets ---- ## Data sets with hs->fcl map (from mdb files) # and UNSD area codes (M49) ## TODO: replace by ad hoc tables ## Mapping of commodity hs to fcl dataset data("hsfclmap2", package = "hsfclmap", envir = environment()) ## Loading of the adjustments data("adjustments", package = "hsfclmap", envir = environment()) ## Mapping of countries M49 data("unsdpartnersblocks", package = "tradeproc", envir = environment()) data("unsdpartners", package = "tradeproc", envir = environment()) ## Units for fcl commodity data("fclunits", package = "tradeproc", envir = environment()) ## Fao country code with names data("faocountrycode", package = "tradeproc", envir = environment()) # ---- hsfclmapsubset ---- # HS -> FCL map ## Filter hs->fcl links we need (based on year) hsfclmap <- hsfclmap2 %>% # Filter out all records from future years filter_(~mdbyear <= year) %>% # Distance from year of interest to year in the map mutate_(yeardistance = ~year - mdbyear) %>% # Select nearest year for every reporter # if year == 2011 and mdbyear == 2011, then distance is 0 # if year == 2011 and mdbyear == 2010, distance is 1 group_by_(~area) %>% filter_(~yeardistance == min(yeardistance)) %>% ungroup() %>% select_(~-yeardistance) %>% ## and add trailing 9 to tocode, where it is shorter ## TODO: check how many such cases and, if possible, move to manualCorrectoins mutate_(tocode = ~hsfclmap::trailingDigits(fromcode, tocode, digit = 9))
Loading Tariffline (UNSD) and Eurostat data
# ---- tradeload ---- #### Get list of agri codes #### (Not needed at the moment) ## agricodeslist <- paste0(shQuote(getAgriHSCodes(), "sh"), collapse=", ") ## Function of Alex # tldata <- getRawAgriTL(year, agricodeslist) ## Loading from local dataset #load("../tldata_raw_from_db.RData") #load("~/Dropbox/tradeproc/tldata_raw_from_db.RData") ## Function to read table from drive of Giorgio load(paste0("~/Desktop/FAO/Trade/RData/tldata_",year,".RData")) ## 40262359 #### Download ES data #### ## Function of Alex # esdata <- getRawAgriES(year, agricodeslist) ## Loading from local dataset #load("../esdata_raw_from_db.RData") #load("~/Dropbox/tradeproc/esdata_raw_from_db.RData") ## Function to read table from drive of Giorgio load(paste0("~/Desktop/FAO/Trade/RData/esdata_",year,".RData")) ## 8665414
tldata <- tldata_raw tldata = tldata %>% mutate_(lenhs = ~stringr::str_length(hs)) esdata <- esdata_raw esdata = esdata %>% mutate_(lenhs = ~stringr::str_length(hs))
r NROW(tldata)
records.r NROW(esdata)
records.# ---- geonom2fao ---- esdata <- esdata %>% mutate_(reporter = ~convertGeonom2FAO(reporter), partner = ~convertGeonom2FAO(partner)) # ---- tl_m49fao ---- ## Based on Excel file from UNSD (unsdpartners..) tldata1 <- tldata %>% left_join(unsdpartnersblocks %>% select_(wholepartner = ~rtCode, part = ~formula) %>% # Exclude EU grouping and old countries filter_(~wholepartner %in% c(251, 381, 579, 581, 711, 757, 842)), by = c("partner" = "part")) %>% mutate_(partner = ~ifelse(is.na(wholepartner), partner, wholepartner), m49rep = ~reporter, m49par = ~partner, # Conversion from Comtrade M49 to FAO area list reporter = ~as.integer(tradeproc::convertComtradeM49ToFAO(m49rep)), partner = ~as.integer(tradeproc::convertComtradeM49ToFAO(m49par)))
distribution_hslen = function(data, title){ to_plot = as.data.frame(table(data$lenhs)) colnames(to_plot) = c("Lenght_hs","Frequency") ggplot(data = to_plot, aes(x = Lenght_hs, y = Frequency)) + geom_bar(position = 'dodge',stat = 'identity') + geom_text(aes(label=paste0(Frequency, " - ",round(Frequency/sum(Frequency)*100,2),"%")), position=position_dodge(width=0.9), vjust=-0.25, size = 2) + ggtitle(title) } # --- tariffline --- distribution_hslen(tldata, title = paste0("Tariffline data, year ", year)) # --- eurostat --- distribution_hslen(esdata, title = paste0("Eurostat data, year ", year))
For each country we stratify by flow and lenght of the hs code (lenhs) and we check if the values are the same for same flow but different hs lenght. In the plots all available data are shown, thus all lenhs available for that specific country.The y axis (total of values) are in log10 scale.
aggregate = function(data){ result = data %>% group_by_(~reporter, ~year, ~flow, ~lenhs) %>% summarise_(n_records = ~n(), n_partners = ~n_distinct(partner), n_hs = ~n_distinct(hs), values = ~sum(value)) %>% left_join(fao_country_code, by = "reporter") } aggregate_tl = aggregate(tldata) aggregate_es = aggregate(esdata) # ---- TLdata ---- list_countries_tl = unique(aggregate_tl$reporter) splitt = split(list_countries_tl, ceiling(seq_along(list_countries_tl)/25)) for (i in 1:length(splitt)){ g = ggplot(data = as.data.frame(aggregate_tl %>% filter_(~reporter %in% splitt[[i]])), aes(x=factor(lenhs), y=log10(values), fill = factor(flow))) + geom_bar(aes(fill=factor(flow)), stat="identity", position = "dodge") + facet_wrap(~country_name, ncol = 5, scale = "free") print(g) } # ---- ESdata ---- ggplot(data = as.data.frame(aggregate_es), aes(x=factor(lenhs), y=log10(values), fill = factor(flow))) + geom_bar(stat="identity", position="dodge") + facet_wrap(~country_name)
For the country mapping we have a specific code in FAO country code, which aggregates all not defined countries. We have:
data("geonom2fao", package = "tradeproc", envir = environment()) table = geonom2fao %>% select_(~code,~active) %>% filter_(~active == 252) print(xtable::xtable(table), type = "html", include.rownames = F)
data("m49faomap", package = "tradeproc", envir = environment()) table = m49faomap %>% filter_(~fao == 252) print(xtable::xtable(table), type = "html", include.rownames = F)
We then remove, from both Eurostat and Tariffline, all the countries not having commodity mapping information from hs to fcl, because we are not able, afterwards, to map the corresponding commodity. For Tariffline we additionally remove all the European countries present in the Eurostat dataset.
# ---- es_partner_filter ---- ## Eurostat data with partner = 252 esdata_discard_country = esdata %>% filter_(~partner == 252) ## Fitering countries with partner = 252 esdata = esdata %>% filter_(~partner != 252)
Eurostat, removing code 252, has r NROW(esdata)
records.
# ---- drop_es_from_tl ---- # They will be replaced by ES data tldata <- tldata %>% anti_join(esdata %>% select_(~reporter) %>% distinct(), by = "reporter")
Tariffline, removing European countries, has r NROW(tldata)
records.
For Eurostat, this is the list of the countries non having hs-to-fcl mapping: (if empty not shown)
# ---- drop_reps_not_in_mdb ---- # We drop reporters what are absent in MDB hsfcl map # because in any case we can proceed their data ## Eurostat data with not information in the hsfcl mapping esdata_not_mapping_commodity <- esdata %>% filter_(~(!(reporter %in% unique(hsfclmap$area)))) ## Record we loose including the hsfcl mapping information table = esdata_not_mapping_commodity %>% select_(~reporter) %>% unique() %>% left_join(fao_country_code, by = "reporter") %>% print(xtable::xtable(table), type = "html", include.rownames = F) ## Eurostat dataset after complete mapping of the countries esdata <- esdata %>% filter_(~reporter %in% unique(hsfclmap$area))
And removing these countries, the records, for Eurostats are r NROW(esdata)
.
For Tariffline, this is the list of the countries non having hs-to-fcl mapping:
tldata_not_mapping_commodity <- tldata %>% filter_(~(!(reporter %in% unique(hsfclmap$area)))) table = tldata_not_mapping_commodity %>% select_(~reporter) %>% unique() %>% left_join(fao_country_code, by = "reporter") print(xtable::xtable(table), type = "html", include.rownames = F) ## Tariffline dataset after complete mapping of the countries tldata <- tldata %>% filter_(~reporter %in% unique(hsfclmap$area))
And removing these countries, Tariffline has r NROW(tldata)
records.
We recode re-Import (4) to Import (1) and re-Export (3) to Export (2) for tariffline data.
Distinction between Exports and Re-exports / Imports and Re-imports
Exports of a country can be distinguished as exports of domestic goods and exports of foreign goods. The second class is generally referred to as re-exports. The exports shown in our database contain both the exports of domestic and foreign goods. Re-exports are exports of foreign goods in the same state as previously imported; they are to be included in the country exports. It is recommended that they be recorded separately for analytical purposes. This may require the use of supplementary sources of information in order to determine the origin of re-exports, i.e., to determine that the goods in question are indeed re-exports rather than the export of goods that have acquired domestic origin through processing. Re-imports are goods imported in the same state as previously exported. They are included in the country imports. It is recommended that they be recorded separately for analytical purposes. This may require the use of supplementary sources of information in order to determine the origin of re-imports, i.e., to determine that the goods in question are indeed re-imports rather than the import of goods that have acquired foreign origin through processing. There are several reasons why an exported good might return to the country of origin. The exported good might be defective, the importer might have defaulted on payments or cancelled the order, the authorities might have imposed an import barrier, or demand or prices in the country of origin might have made it worthwhile to bring the good back. UNSD
# ---- reexptoexp ---- # { "id": "1", "text": "Import" }, # { "id": "2", "text": "Export" }, # { "id": "4", "text": "re-Import" }, # { "id": "3", "text": "re-Export" } tldata <- tldata %>% mutate_(flow = ~ifelse(flow == 4, 1L, ifelse(flow == 3, 2L, flow)))
We can now remove all the records having hs code lenght less than 6.
tldata = tldata %>% filter(nchar(hs) > 5) esdata = esdata %>% filter(nchar(hs) > 2)
And we end up, before applying the mapping of the commodity with:
Tariffline: r NROW(tldata)
Eurostat: r NROW(esdata)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.