options(width = 120) knitr::opts_chunk$set(echo = F) suppressWarnings(suppressPackageStartupMessages(library(dplyr))) library(stringr) library(hsfclmap) library(fclcpcmap) subdir <- "OrangeBook" sourcedir <- "tradeR" source(file.path(Sys.getenv("HOME"), "r_adhoc", "trade_prevalid_testing", "setupconnection.R")) if(length(lapply(dir(file.path(Sys.getenv("HOME"), "r_adhoc", "privateFAO", subdir, sourcedir), full.names = T), source)) == 0) stop("Files for sourcing not found") data("hsfclmap") # source(file.path(Sys.getenv("HOME"), ".pwd.R")) trade_src <- src_postgres("sws_data", "localhost", 5432, "trade", .pwd, options = "-c search_path=ess") agri_db <- tbl(trade_src, sql(" select * from ess.agri ")) hsfclmap <- hsfclmap %>% mutate(tocode = trailingDigits(fromcode, tocode, digit = 9), validyear = as.integer(validyear), validyear = ifelse(is.na(validyear), 0, validyear)) %>% manualCorrections() %>% mutate(fao = as.integer(fao)) # Max length of HS-codes in MDB-files mapmaxlength <- hsfclmap %>% group_by(fao, flow) %>% summarise(mapmaxlength = max(str_length(fromcode))) %>% ungroup() ### Extract TL data and calculate length of hs codes tldata <- agri_db %>% select(-hs2, -hs4, -hs6) %>% filter(year == "2011", flow %in% c("1", "2")) %>% #Drop reexport and reimport collect() %>% mutate(hs = stringr::str_extract(hs, "^[0-9]*")) %>% # Artifacts in reporters 646 and 208 mutate(flow = as.character(factor(flow, labels = c("Import", "Export")))) %>% group_by(reporter, flow) %>% mutate(tlmaxlength = max(stringr::str_length(hs), na.rm = T)) %>% ungroup() %>% mutate(reporter_fao = tradeproc::convertComtradeM49ToFAO(reporter)) ## Max length in TL tlmaxlength <- tldata %>% select(fao = reporter_fao, flow, tlmaxlength) %>% group_by(fao, flow) %>% summarize(tlmaxlength = max(tlmaxlength, na.rm = T)) %>% ungroup() ## Common max length maxlength <- tlmaxlength %>% left_join(mapmaxlength, by = c("fao", "flow")) %>% group_by(fao, flow) %>% mutate(maxlength = max(tlmaxlength, mapmaxlength, na.rm = T)) %>% # na.rm here: some reporters are absent in map # 122 145 180 224 276 ungroup() ### Extension of HS-codes in TL tldata1 <- tldata %>% select(-tlmaxlength) %>% left_join(maxlength %>% select(-tlmaxlength, -mapmaxlength), by = c("reporter_fao" = "fao", "flow")) tldata1 <- tldata1 %>% mutate(hsext = as.numeric(trailingDigits2(hs, maxlength = maxlength, digit = 0))) ### Extension of HS ranges in map hsfclmap1 <- hsfclmap %>% left_join(maxlength %>% select(-tlmaxlength, -mapmaxlength), by = c("fao", "flow")) %>% filter(!is.na(maxlength)) %>% mutate(fromcode = as.numeric(trailingDigits2(fromcode, maxlength, 0)), tocode = as.numeric(trailingDigits2(tocode, maxlength, 9))) ### Conversion of TL
library(doParallel) library(foreach) registerDoParallel(cores=detectCores(all.tests=TRUE)) df <- tldata1 %>% sample_n(10000) tests <- microbenchmark::microbenchmark( notpar = hsInRange(df$hsext, df$reporter_fao, df$flow, hsfclmap1, calculation = "grouping", parallel = F), par = hsInRange(df$hsext, df$reporter_fao, df$flow, hsfclmap1, calculation = "grouping", parallel = T), times = 10L)
print(tests) Unit: seconds expr min lq mean median uq max neval notpar 27.470985 27.54354 27.925076 27.805991 28.232779 28.850961 10 par 7.141453 7.20453 7.500217 7.574347 7.700204 7.938214 10
Totally it is expected to take r (nrow(tldata1) / 10000) * 7.57 / 3600
. Half an hour? 8-0
df <- tldata1 %>% select(reporter_fao, flow, hsext) %>% distinct() #%>% # sample_n(1000) fcldf <- hsInRange(df$hsext, df$reporter_fao, df$flow, hsfclmap1, calculation = "grouping", parallel = T) nrow(fcldf) == nrow(df) sum(is.na(fcldf$fcl))/nrow(fcldf) # 0.0335684 or 6864 d <- fcldf %>% filter(is.na(fcl)) %>% sample_n(1) inspect <- d %>% left_join(hsfclmap1, by = c("areacode" = "fao", "flowname" = "flow")) inspect %>% filter((fromcode >= d$hs & lag(fromcode,3 ) < d$hs) | (fromcode < d$hs & lead(fromcode) >= d$hs) | (fromcode > d$hs & lag(fromcode, 3) < d$hs)) %>% select(hs_tariff = hs, faoarea = areacode, flow = flowname, fromcode, tocode, fcl = fcl.y) %>% distinct() %>% arrange(fromcode) tldata1 %>% filter(reporter_fao == d$areacode, flow == d$flowname, hsext == d$hs) tldata1 %>% filter(reporter_fao == d$areacode, flow == d$flowname) %>% mutate(hslength = stringr::str_length(hs)) %>% group_by(hslength) %>% summarize(n = n()) tldata1 %>% filter(reporter_fao == d$areacode, flow == d$flowname) %>% sample_n(5) hsfclmap1 %>% filter(fao == d$areacode, flow == d$flowname) %>% sample_n(10) hsfclmap %>% filter(fao == d$areacode, flow == d$flowname) %>% sample_n(10) maxlength %>% filter(fao == d$areacode, flow == d$flowname) mapmaxlength %>% filter(fao == d$areacode, flow == d$flowname)
tldata1 <- tldata %>% left_join(maxlength, by = c("reporter_fao" = "fao", "flow")) %>% mutate(extendtl = maxlength < maxlengthmap, extendmap = maxlength > maxlengthmap, howmany = ifelse(extendtl, maxlengthmap - maxlength, 0)) %>% filter(!is.na(howmany)) %>% mutate(hsext = paste0(hs, vapply(howmany, FUN = function(x) { paste0(rep.int(0, times = x), collapse = "") }, FUN.VALUE = character(1) )), hs)
lengthformap <- tldata %>% select(reporter_fao, flow, maxlength, maxlengthmap, extendmap) %>% distinct() %>% mutate(howmany = ifelse(extendmap, maxlength - maxlengthmap, 0))
trailingDigits2 <- function(client, howmany, digit) { paste0(client, vapply(howmany, FUN = function(x) { paste0(rep.int(digit, times = x), collapse = "") }, FUN.VALUE = character(1) )) } hsfclmap1 <- hsfclmap %>% left_join(lengthformap, by = c("fao" = "reporter_fao", "flow")) %>% filter(!is.na(howmany)) hsfclmap1 <- hsfclmap1 %>% # We lose here mappings of countries which are absent in TL dataset mutate(fromcode = trailingDigits2(fromcode, howmany, 0), tocode = trailingDigits2(tocode, howmany, 9))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.