# ---- settings ----
# Year for processing
year <- 2011L
# Coefficient for outlier detection
# See coef argument in ?boxplot.stats
out_coef <- 1.5
debughsfclmap <- TRUE
multicore <- TRUE
# ---- libs ----
library(tradeproc)
library(stringr)
library(testthat)
library(dplyr, warn.conflicts = F)
if(multicore) {
suppressPackageStartupMessages(library(doParallel))
library(foreach)
doParallel::registerDoParallel(cores=detectCores(all.tests=TRUE))
}
# ---- swsdebug ----
# Connection to SWS
# TODO: DEV MODE!!!!!!!!
## ADDED COMMENT
# faosws::GetTestEnvironment(
# # baseUrl = "https://hqlprswsas1.hq.un.fao.org:8181/sws", # intranet.fao.org/sws
# # baseUrl = "https://hqlprsws2.hq.un.fao.org:8181/sws",
# baseUrl = "https://hqlqasws1.hq.un.fao.org:8181/sws", # QA?
# # token = "349ce2c9-e6bf-485d-8eac-00f6d7183fd6") # Token for QA)
# token = "da889579-5684-4593-aa36-2d86af5d7138") # http://hqlqasws1.hq.un.fao.org:8080/sws/
# # token = "f5e52626-a015-4bbc-86d2-6a3b9f70950a") # Second token for QA
# #token = token)
# ---- datasets ----
## Data sets with hs->fcl map (from mdb files)
# and UNSD area codes (M49)
## TODO: replace by ad hoc tables
data("hsfclmap2", package = "hsfclmap", envir = environment())
data("adjustments", package = "hsfclmap", envir = environment())
data("unsdpartnersblocks", package = "tradeproc", envir = environment())
data("unsdpartners", package = "tradeproc", envir = environment())
## units for fcl
data("fclunits", package = "tradeproc", envir = environment())
# ---- hsfclmapsubset ----
# HS -> FCL map
## Filter hs->fcl links we need (based on year)
hsfclmap <- hsfclmap2 %>%
# Filter out all records from future years
filter_(~mdbyear <= year) %>%
# Distance from year of interest to year in the map
mutate_(yeardistance = ~year - mdbyear) %>%
# Select nearest year for every reporter
# if year == 2011 and mdbyear == 2011, then distance is 0
# if year == 2011 and mdbyear == 2010, distance is 1
group_by_(~area) %>%
filter_(~yeardistance == min(yeardistance)) %>%
ungroup() %>%
select_(~-yeardistance) %>%
## and add trailing 9 to tocode, where it is shorter
## TODO: check how many such cases and, if possible, move to manualCorrectoins
mutate_(tocode = ~hsfclmap::trailingDigits(fromcode,
tocode,
digit = 9))
# ---- tradeload ----
#### Get list of agri codes ####
### ADDED COMMENT
## agricodeslist <- paste0(shQuote(getAgriHSCodes(), "sh"), collapse=", ")
### Download TL data ####
# tldata <- getRawAgriTL(year, agricodeslist)
#load("../tldata_raw_from_db.RData")
load("~/Dropbox/tradeproc/tldata_raw_from_db.RData")
#### Download ES data ####
# esdata <- getRawAgriES(year, agricodeslist)
#load("../esdata_raw_from_db.RData")
load("~/Dropbox/tradeproc/esdata_raw_from_db.RData")
# ---- geonom2fao ----
esdata <- esdata %>%
mutate_(reporter = ~convertGeonom2FAO(reporter),
partner = ~convertGeonom2FAO(partner)) %>%
filter_(~partner != 252)
# ---- es_hs2fcl ----
esdata <- convertHS2FCL(esdata, hsfclmap, parallel = multicore)
# ---- es_join_fclunits ----
esdata <- esdata %>%
left_join(fclunits, by = "fcl")
# ---- tl_m49fao ----
## Based on Excel file from UNSD (unsdpartners..)
tldata <- tldata %>%
left_join(unsdpartnersblocks %>%
select_(wholepartner = ~rtCode,
part = ~formula) %>%
# Exclude EU grouping and old countries
filter_(~wholepartner %in% c(251, 381, 579, 581, 711, 757, 842)),
by = c("partner" = "part")) %>%
mutate_(partner = ~ifelse(is.na(wholepartner), partner, wholepartner),
m49rep = ~reporter,
m49par = ~partner,
# Conversion from Comtrade M49 to FAO area list
reporter = ~as.integer(tradeproc::convertComtradeM49ToFAO(m49rep)),
partner = ~as.integer(tradeproc::convertComtradeM49ToFAO(m49par)))
# ---- drop_es_from_tl ----
# They will be replaced by ES data
tldata <- tldata %>%
anti_join(esdata %>%
select_(~reporter) %>%
distinct(),
by = "reporter")
# ---- drop_reps_not_in_mdb ----
# We drop reporters what are absent in MDB hsfcl map
# because in any case we can proceed their data
tldata <- tldata %>%
filter_(~reporter %in% unique(hsfclmap$area))
# ---- reexptoexp ----
# { "id": "1", "text": "Import" },
# { "id": "2", "text": "Export" },
# { "id": "4", "text": "re-Import" },
# { "id": "3", "text": "re-Export" }
tldata <- tldata %>%
mutate_(flow = ~ifelse(flow == 4, 1L, ifelse(flow == 3, 2L, flow)))
# ---- tl_hslength ----
#### Max length of HS-codes in MDB-files ####
mapmaxlength <- hsfclmap %>%
group_by_(~area, ~flow) %>%
summarise_(mapmaxlength = ~max(stringr::str_length(fromcode))) %>%
ungroup()
### Calculate length of hs codes in TL
tldata <- tldata %>%
group_by_(~reporter, ~flow) %>%
mutate_(tlmaxlength = ~max(stringr::str_length(hs), na.rm = TRUE)) %>%
ungroup()
## Dataset with max length in TL ####
tlmaxlength <- tldata %>%
select_(~reporter,
~flow,
~tlmaxlength) %>%
group_by_(~reporter, ~flow) %>%
summarize_(tlmaxlength = ~max(tlmaxlength, na.rm = TRUE)) %>%
ungroup()
## Common max length between TL and map ####
maxlengthdf <- tlmaxlength %>%
left_join(mapmaxlength,
by = c("reporter" = "area", "flow")) %>%
group_by_(~reporter, ~flow) %>%
mutate_(maxlength = ~max(tlmaxlength, mapmaxlength, na.rm = TRUE)) %>%
# na.rm here: some reporters are absent in map
# 122 145 180 224 276
ungroup()
### Extension of HS-codes in TL ####
tldata <- tldata %>%
select_(~-tlmaxlength) %>%
left_join(maxlengthdf %>%
select_(~-tlmaxlength, ~-mapmaxlength),
by = c("reporter", "flow")) %>%
mutate_(hsext = ~as.numeric(hsfclmap::trailingDigits2(hs,
maxlength = maxlength,
digit = 0)))
### Extension of HS ranges in map ####
hsfclmap1 <- hsfclmap %>%
left_join(maxlengthdf %>%
select_(~-tlmaxlength, ~-mapmaxlength),
by = c("area" = "reporter", "flow")) %>%
filter_(~!is.na(maxlength)) ## Attention!!!
hsfclmap1 <- hsfclmap1 %>%
mutate_(fromcode = ~as.numeric(hsfclmap::trailingDigits2(fromcode, maxlength, 0)),
tocode = ~as.numeric(hsfclmap::trailingDigits2(tocode, maxlength, 9)))
# ---- tl_hs2fcl ----
tldata <- convertHS2FCL(tldata %>%
select_(~-hs) %>%
rename_(hs = ~hsext),
hsfclmap1, parallel = multicore)
#############Units of measurment in TL ####
## Add target fclunit
# What units does FAO expects for given FCL codes
tldata <- tldata %>%
left_join(fclunits,
by = "fcl")
## Units of Comtrade
data("comtradeunits",
package = "tradeproc",
envir = environment())
tldata <- tldata %>%
mutate_(qunit = ~as.integer(qunit)) %>%
left_join(comtradeunits %>%
select_(~qunit, ~wco),
by = "qunit")
## Dataset with all matches between Comtrade and FAO units
ctfclunitsconv <- tldata %>%
select_(~qunit, ~wco, ~fclunit) %>%
distinct() %>%
arrange_(~qunit)
################ Conv. factor (TL) ################
##### Table for conv. factor
ctfclunitsconv$conv <- 0
ctfclunitsconv$conv[ctfclunitsconv$qunit == 1] <- NA # Missing quantity
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "$ value only"] <- NA # Missing quantity
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "mt" &
ctfclunitsconv$wco == "m²"] <- 1
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "mt" &
ctfclunitsconv$wco == "l"] <- .001
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "heads" &
ctfclunitsconv$wco == "u"] <- 1
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "1000 heads" &
ctfclunitsconv$wco == "u"] <- .001
## This is just a trick
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "1000 heads" &
ctfclunitsconv$wco == "kg"] <- 1
## This one too :)
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "heads" &
ctfclunitsconv$wco == "kg"] <- 1
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "number" &
ctfclunitsconv$wco == "u"] <- 1
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "mt" &
ctfclunitsconv$wco == "kg"] <- .001
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "mt" &
ctfclunitsconv$wco == "m³"] <- 1
ctfclunitsconv$conv[ctfclunitsconv$fclunit == "mt" &
ctfclunitsconv$wco == "carat"] <- 5e-6
##### Add conv factor to the dataset
tldata <- tldata %>%
left_join(ctfclunitsconv,
by = c("qunit", "wco", "fclunit"))
#### Commodity specific conversion
fcl_spec_mt_conv <- tldata %>%
filter_(~fclunit == "mt" & is.na(weight) & conv == 0) %>%
select_(~fcl, ~wco) %>%
distinct() %>%
mutate_(fcldesc = ~descFCL(fcl))
fcl_spec_mt_conv$convspec <- 0
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Cigarettes" &
fcl_spec_mt_conv$wco == "1000u"] <- .01
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Cigarettes" &
fcl_spec_mt_conv$wco == "u"] <- .0001
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Hen Eggs" &
fcl_spec_mt_conv$wco == "u"] <- .00006
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Hen Eggs" &
fcl_spec_mt_conv$wco == "2u"] <- .00012
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Hen Eggs" &
fcl_spec_mt_conv$wco == "12u"] <- .00072
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Hen Eggs" &
fcl_spec_mt_conv$wco == "1000u"] <- .006
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Cigars Cheroots" &
fcl_spec_mt_conv$wco == "u"] <- 0.000008
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Cigars Cheroots" &
fcl_spec_mt_conv$wco == "1000u"] <- 0.0008
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Tobacco Products nes" &
fcl_spec_mt_conv$wco == "u"] <- 0.000008
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Tobacco Products nes" &
fcl_spec_mt_conv$wco == "1000u"] <- 0.0008
fcl_spec_mt_conv$convspec[fcl_spec_mt_conv$fcldesc == "Fruit Prepared nes" &
fcl_spec_mt_conv$wco == "U (jeu/pack)"] <- 0.0208333
### Add commodity specific conv.factors to dataset
tldata <- tldata %>%
left_join(fcl_spec_mt_conv %>%
select_(~-fcldesc),
by = c("fcl", "wco"))
## Save intermediate dataset for checks
tldata_old = tldata
########## Conversion of units
#### FCL specific conv
tldata$qtyfcl <- tldata$qty * tldata$convspec
#### Common conv
# If no specific conv. factor, we apply general
tldata$qtyfcl <- ifelse(is.na(tldata$convspec),
tldata$qty * tldata$conv,
tldata$qtyfcl)
##### No qty, but weight and target is mt: we take weight from there
tldata$qtyfcl <- ifelse(tldata$qty == 0 &
tldata$fclunit == "mt" &
is.na(tldata$qtyfcl) &
tldata$weight > 0,
tldata$weight,
tldata$qtyfcl)
######### Value from USD to thousands of USD
tldata$value <- tldata$value / 1000
## TLDATA: aggregate by fcl
## Here we select column qtyfcl which contains quantity, requested by FAO
# tldata <- tldata %>%
# select_(~year,
# ~reporter,
# ~partner,
# ~flow,
# ~fcl,
# qty = ~qtyfcl, # <----
# ~value) %>%
# group_by_(~year, ~reporter, ~partner, ~flow, ~fcl) %>%
# summarise_each_(funs(sum(., na.rm = T)), vars = c("qty", "value")) %>%
# ungroup()
# Replace weight (first quantity column) by newly produced qtyfcl column
tldata <- tldata %>%
select_(~year,
~reporter,
~partner,
~flow,
~fcl,
~hs,
weight = ~qtyfcl,
~qty,
~value)
tldata_mid = tldata
# Loading of notes/adjustments should be added here
esdata_old = esdata
esdata <- plyr::ldply(sort(unique(esdata$reporter)),
function(x) {
applyadj(x, year, adjustments, esdata)
},
.progress = ifelse(multicore, "none", "text"),
.inform = FALSE,
.parallel = multicore)
## Apply conversion EUR to USD
load("data/EURconversionUSD.RData")
esdata$value <- esdata$value * as.numeric(EURconversionUSD %>%
filter(Year == year) %>%
select(ExchangeRate))
tldata <- plyr::ldply(sort(unique(tldata$reporter)),
function(x) {
applyadj(x, year, adjustments, tldata)
},
.progress = ifelse(multicore, "none", "text"),
.inform = FALSE,
.parallel = multicore)
tradedata <- bind_rows(
tldata %>%
select_(~year, ~reporter, ~partner, ~flow,
~fcl, qty = ~weight, ~value),
# TODO Check quantity/weight
esdata %>%
mutate_(uniqqty = ~ifelse(fclunit == "mt", weight, qty)) %>%
select_(~year, ~reporter, ~partner, ~flow,
~fcl, qty = ~uniqqty, ~value)
)
tradedata <- tradedata %>%
select_(~year, ~reporter, ~partner, ~flow, ~fcl, ~qty, ~value) %>%
group_by_(~year, ~reporter, ~partner, ~flow, ~fcl) %>%
summarise_each_(funs(sum = sum(., na.rm = TRUE)), vars = c("qty", "value")) %>%
ungroup()
# Missing quantities and values
tradedata <- tradedata %>%
mutate_(no_quant = ~qty == 0 | is.na(qty), # There are no NA qty, but may be changes later
no_value = ~value == 0 | is.na(value))
# UV calculation
tradedata <- mutate_(tradedata,
uv = ~ifelse(no_quant | no_value, # Only 0 here. Should we care about NA?
NA,
value / qty))
# Outlier detection
tradedata <- tradedata %>%
group_by_(~year, ~reporter, ~flow, ~fcl) %>%
mutate_(
uv_reporter = ~median(uv, na.rm = T),
outlier = ~uv %in% boxplot.stats(uv, coef = out_coef, do.conf = F)$out) %>%
ungroup()
# Imputation of missings and outliers
tradedata <- tradedata %>%
mutate_(qty = ~ifelse(no_quant | outlier,
value / uv_reporter,
qty))
# Non reporting countries
nonreporting <- unique(tradedata$partner)[!is.element(unique(tradedata$partner),
unique(tradedata$reporter))]
## Mirroring for non reporting countries
tradedatanonrep <- tradedata %>%
filter_(~partner %in% nonreporting) %>%
mutate_(partner_mirr = ~reporter,
reporter = ~partner,
flow = ~ifelse(flow == 2, 1,
ifelse(flow == 1, 2,
NA)))
tradedata <- bind_rows(tradedata,
tradedatanonrep)
## Non mapped FCL
tldata_old %>%
filter(is.na(fcl)) %>%
select(hs6) %>%
unique() %>%
write.table(file = "non_mapped_fcl.csv",
row.names = F,quote = F)
## Rule 4 "order of magnitude"
#https://github.com/mkao006/sws_r_api/blob/040fb7f7b6af05ec35293dd5459ee131b31e5856/r_modules/trade_prevalidation/R/magnitudeOrder.R
## Kernel calculation
#https://github.com/mkao006/sws_r_api/blob/040fb7f7b6af05ec35293dd5459ee131b31e5856/r_modules/trade_prevalidation/R/kdeMode.R
## Simple reliability index
#https://github.com/mkao006/sws_r_api/blob/040fb7f7b6af05ec35293dd5459ee131b31e5856/r_modules/trade_reliability/simpleReliabilityExampleForDocumentation.R
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.