options(width = 120) 
knitr::opts_chunk$set(echo = F)
suppressWarnings(suppressPackageStartupMessages(library(dplyr)))
library(stringr)
library(hsfclmap)
library(fclcpcmap)

subdir <- "OrangeBook"
sourcedir <- "tradeR"


source(file.path(Sys.getenv("HOME"), "r_adhoc", "trade_prevalid_testing", "setupconnection.R"))

if(length(lapply(dir(file.path(Sys.getenv("HOME"), "r_adhoc", "privateFAO", subdir, sourcedir), 
                     full.names = T), 
                 source)) == 0) stop("Files for sourcing not found")

data("hsfclmap")

# source(file.path(Sys.getenv("HOME"), ".pwd.R"))

trade_src <- src_postgres("sws_data", "localhost", 5432, "trade", .pwd, 
                          options = "-c search_path=ess")

agri_db <- tbl(trade_src, sql("
select * from ess.agri
"))


hsfclmap <- hsfclmap %>% 
  mutate(tocode = trailingDigits(fromcode, tocode, digit = 9),
         validyear = as.integer(validyear),
         validyear = ifelse(is.na(validyear), 0, validyear)) %>% 
  manualCorrections() %>% 
  mutate(fao = as.integer(fao))


# Max length of HS-codes in MDB-files

mapmaxlength <- hsfclmap %>% 
  group_by(fao, flow) %>% 
  summarise(mapmaxlength = max(str_length(fromcode))) %>% 
  ungroup()





### Extract TL data and calculate length of hs codes


tldata <- agri_db %>% 
  select(-hs2, -hs4, -hs6) %>% 
  filter(year == "2011",
         flow %in% c("1", "2")) %>% #Drop reexport and reimport 
  collect() %>% 
  mutate(hs = stringr::str_extract(hs, "^[0-9]*")) %>% # Artifacts in reporters 646 and 208
  mutate(flow = as.character(factor(flow, labels = c("Import", "Export")))) %>% 
  group_by(reporter, flow) %>% 
  mutate(tlmaxlength = max(stringr::str_length(hs), na.rm = T)) %>% 
  ungroup() %>% 
  mutate(reporter_fao = tradeproc::convertComtradeM49ToFAO(reporter))


## Max length in TL


tlmaxlength <- tldata %>% 
  select(fao = reporter_fao,
         flow,
         tlmaxlength) %>% 
  group_by(fao, flow) %>% 
  summarize(tlmaxlength = max(tlmaxlength, na.rm = T)) %>% 
  ungroup()


## Common max length


maxlength <- tlmaxlength %>% 
  left_join(mapmaxlength,
            by = c("fao", "flow")) %>% 
  group_by(fao, flow) %>% 
  mutate(maxlength = max(tlmaxlength, mapmaxlength, na.rm = T)) %>% 
  # na.rm here: some reporters are absent in map
  #  122 145 180 224 276
  ungroup()



### Extension of HS-codes in TL


tldata1 <- tldata %>% 
  select(-tlmaxlength) %>% 
  left_join(maxlength %>% 
              select(-tlmaxlength, -mapmaxlength),
            by = c("reporter_fao" = "fao", "flow")) 

tldata1 <- tldata1 %>% 
  mutate(hsext = as.numeric(trailingDigits2(hs, 
                                            maxlength = maxlength,
                                            digit = 0)))

### Extension of HS ranges in map


hsfclmap1 <- hsfclmap %>% 
  left_join(maxlength %>% 
              select(-tlmaxlength, -mapmaxlength),
            by = c("fao", "flow")) %>% 
  filter(!is.na(maxlength)) %>% 
  mutate(fromcode = as.numeric(trailingDigits2(fromcode, maxlength, 0)),
         tocode = as.numeric(trailingDigits2(tocode, maxlength, 9)))



### Conversion of TL

Parallel expirements

library(doParallel)
library(foreach)
registerDoParallel(cores=detectCores(all.tests=TRUE))

df <- tldata1 %>% 
  sample_n(10000)

tests <- microbenchmark::microbenchmark(
  notpar = hsInRange(df$hsext, df$reporter_fao, df$flow, hsfclmap1, 
                    calculation = "grouping",
                    parallel = F),
  par = hsInRange(df$hsext, df$reporter_fao, df$flow, hsfclmap1, 
                    calculation = "grouping",
                    parallel = T),
  times = 10L)

print(tests) Unit: seconds expr min lq mean median uq max neval notpar 27.470985 27.54354 27.925076 27.805991 28.232779 28.850961 10 par 7.141453 7.20453 7.500217 7.574347 7.700204 7.938214 10

Totally it is expected to take r (nrow(tldata1) / 10000) * 7.57 / 3600. Half an hour? 8-0

df <- tldata1 %>% 
  select(reporter_fao, flow, hsext) %>% 
  distinct() #%>% 
  # sample_n(1000)

fcldf <- hsInRange(df$hsext, df$reporter_fao, df$flow, hsfclmap1, 
                    calculation = "grouping",
                    parallel = T)
nrow(fcldf) == nrow(df)






sum(is.na(fcldf$fcl))/nrow(fcldf)
# 0.0335684 or 6864


d <- fcldf %>% filter(is.na(fcl)) %>% sample_n(1)
inspect <- d %>% left_join(hsfclmap1,
                by = c("areacode" = "fao", "flowname" = "flow"))

inspect %>% 
  filter((fromcode >= d$hs & lag(fromcode,3 ) < d$hs) |
           (fromcode < d$hs & lead(fromcode) >= d$hs) |
           (fromcode > d$hs & lag(fromcode, 3) < d$hs)) %>% 
  select(hs_tariff = hs, faoarea = areacode, flow = flowname, fromcode, tocode, fcl = fcl.y) %>% 
  distinct() %>% 
  arrange(fromcode)


tldata1 %>% 
  filter(reporter_fao == d$areacode,
         flow == d$flowname,
         hsext == d$hs)

tldata1 %>% 
  filter(reporter_fao == d$areacode,
         flow == d$flowname) %>% 
  mutate(hslength = stringr::str_length(hs)) %>% 
  group_by(hslength) %>% 
  summarize(n = n())

tldata1 %>% 
  filter(reporter_fao == d$areacode,
         flow == d$flowname) %>% 
  sample_n(5)



hsfclmap1 %>% 
  filter(fao == d$areacode,
         flow == d$flowname) %>% sample_n(10)

hsfclmap %>% 
  filter(fao == d$areacode,
         flow == d$flowname) %>% sample_n(10)

  maxlength %>% 
    filter(fao == d$areacode, flow == d$flowname)

  mapmaxlength %>% 
    filter(fao == d$areacode, flow == d$flowname)
tldata1 <- tldata %>% 
  left_join(maxlength,
            by = c("reporter_fao" = "fao", "flow")) %>% 
  mutate(extendtl = maxlength < maxlengthmap,
         extendmap = maxlength > maxlengthmap,
         howmany = ifelse(extendtl, maxlengthmap - maxlength, 0)) %>% 
  filter(!is.na(howmany)) %>% 
  mutate(hsext = paste0(hs, 
                        vapply(howmany,
                               FUN = function(x) {
                                 paste0(rep.int(0, times = x), collapse = "")
                               },
                               FUN.VALUE = character(1)
                        )),
                        hs)

Extension of HS-codes in the map

Info about length

lengthformap <- tldata %>% 
  select(reporter_fao, flow, maxlength, maxlengthmap, extendmap) %>% 
  distinct() %>% 
  mutate(howmany = ifelse(extendmap, maxlength - maxlengthmap, 0))

Adding info to map and extend codes

trailingDigits2 <- function(client, howmany, digit) {
  paste0(client, 
         vapply(howmany,
                FUN = function(x) {
                  paste0(rep.int(digit, times = x), collapse = "")
                },
                FUN.VALUE = character(1)
         ))

}

hsfclmap1 <- hsfclmap %>% 
  left_join(lengthformap,
            by = c("fao" = "reporter_fao", "flow")) %>% 
  filter(!is.na(howmany))

hsfclmap1 <- hsfclmap1 %>% # We lose here mappings of countries which are absent in TL dataset
  mutate(fromcode = trailingDigits2(fromcode, howmany, 0),
         tocode   = trailingDigits2(tocode, howmany, 9)) 


SWS-Methodology/hsfclmap documentation built on May 9, 2019, 11:53 a.m.