In stix-global/eutradeflows: Download bilateral trade flow data from Eurostat Comext

library(knitr)
opts_knit$set(root.dir="..") # file paths are relative to the root of the project directory
library(tradeflows)
library(eutradeflows)
library(dplyr)
library(tidyr)
library(ggplot2)
# Connect to the database
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows")

Introduction

Formatting data for users

Monthly data from the last 3 years are needed, people don’t like to work with flat files.

Data format: Declarant, partner, product codes, flow, with time columns

In the long term there should be a simple interface to search the SQL database.

File naming convention

8dq_wide
8dw_wide
8d_long
6dq_wide
6d_long

CSV explort example

UK import from Indonesia

tbl(con, "vld_comext_reporter") %>% collect() %>% 
    filter(grepl("Kingdom", reporter))
# UK reportercode is 6
tbl(con, "vld_comext_partner") %>% collect() %>% 
    filter(grepl("Indonesia",partner))
# Indonesia partnercode is 700
# recent data
wpukind_r <- tbl(con, "raw_comext_monthly_201709") %>% 
    filter(reportercode == 6 & partnercode == 700) %>% 
    addproreppar2tbl(con, .) %>% 
    # show_query() %>% 
    collect()

# archive data
wpukind_a <- tbl(con, "raw_comext_monthly_2016S1") %>% 
    filter(reportercode == 6 & partnercode == 700) %>% 
    addproreppar2tbl(con, .) %>% 
    collect()


# bind archive and recent data, keep only imports
wpukind <- rbind(wpukind_a, wpukind_r) %>% 
    filter(flowcode == 1)

# keep only product 44
wpukind44 <- wpukind %>% filter(productcode == 44)
wpukind44182080 <- wpukind %>% filter(productcode == 44182080)

# Keep only furniture product 94
wpukind94 <- wpukind %>% filter(productcode == 94)
wpukind94036090 <- wpukind %>% filter(productcode == 94036090)

write.csv(wpukind44, "/tmp/comext/ukindonesia44.csv")
write.csv(wpukind94, "/tmp/comext/ukindonesia94.csv")
# 2 highest traded products at the 8 digit level

write.csv(wpukind44182080, "/tmp/comext/ukindonesia44182080.csv")
write.csv(wpukind94036090, "/tmp/comext/ukindonesia94036090.csv")


# keep only product with highest import in value over the period
# Calculate highest import value
wpukind %>% group_by(productcode) %>% 
    summarise(tradevalue = sum(tradevalue)) %>% 
    arrange(desc(tradevalue))

# Wood products
ggplot(wpukind44, 
       aes(x = as.factor(period), y = tradevalue)) +
    geom_point() + ylab("Trade value in 1000€")
lubridate::ymd(c("201001"))

List of VPA countries

Spread data along the period

In short: statregime should be included, otherwise there are duplications of the trade flows. Monthly data from the last 3 years are needed, Data format: Declarant, partner, product codes, flow, with time columns

Export csv based on raw data for a test

productanalysed <- "44071091"
# Load recent montly raw data for one product
dtfr <- tbl(con, "raw_comext_monthly_201709") %>%
    filter(productcode == productanalysed) %>%
    # Add quantity units
    eutradeflows::addunit2tbl(con, maintbl = .,
                              tableunit = "vld_comext_unit")  %>%
    # Add productdescription, reporter and partner
    addproreppar2tbl(con,.) %>% 
    collect()

dtfr %>% 
    select(productcode, reportercode, partnercode, flowcode, unitcode,
           period, quantity,
           productdescription, reporter, partner, statregime) %>%
    group_by(period) %>%
    # Create a unique id (optional)
    # unite(id, reportercode, partnercode, productcode, flowcode, statregime,
          # remove = FALSE) %>% 
    spread(period, quantity) %>% 
    # Move description to the back
    select(-productdescription, productdescription) %>% 
    write.csv("/tmp/comext/raw.csv", row.names = FALSE)

Develop the function that will export validated data to csv

productanalysed <- "44071190" #"44071091"
periodstart <- (as.numeric(format(Sys.time(), "%Y")) - 3) * 100
tableread <- "vld_comext_monthly"

# Load recent montly raw data for one product
dtf <- tbl(con, tableread) %>%
    filter(productcode == productanalysed &
               period > periodstart) %>%
    # Add productdescription, reporter and partner
    addproreppar2tbl(con,.) %>% 
    collect() %>% 
    # Add flow information, import export, 
    # cannot be done in addproreppar2tbl because the little data frame
    # is not present in the database 
    left_join(data_frame(flow = c("import", "export"), flowcode = 1:2),
              by = "flowcode")  %>% 
    mutate(quantity = round(quantity),
           # Cut long partner description
           partner = substr(partner, 0, 40),
           # Replace empty partner by the partner code
           partner = ifelse(is.na(partner), partnercode, partner),
           # Rename Germany so that it's consistent with partner name
           reporter = if_else(grepl("Fr Germany",reporter),
                              "Germany", reporter)) 

csvfile <- "/tmp/comext/vld.csv"

# Spread quantity data along the period and write to a csv file
dtf %>%
    # Keep only these columns
    select(productcode, flow, statregime, flag, reporter, partner,
           reportercode, partnercode, flowcode, unitcode,
           period, quantity,
           productdescription) %>%
    # Collapse flags present in the given line together
    group_by(productcode, flowcode, reportercode, partnercode, statregime) %>% 
    mutate(flag = paste(unique(flag), collapse = ", ")) %>% 
    spread(period, quantity) %>% 
    # Move description and code to the last columns
    select(-productdescription, -reportercode, -partnercode, -flowcode, -unitcode,
           productdescription, reportercode, partnercode, flowcode, unitcode) %>% 
    arrange(productcode, flowcode, reporter, partner) %>% 
    write.csv(csvfile, row.names = FALSE, na="")

Use the write2csv_spread and write2csv_long functions on one product

swdpine <- loadflows(con, productanalysed = "44071190", periodstart = 201700, 
                     tableread = "vld_comext_monthly") %>% 
    formatflows()
# Write the wide format
write2csv_spread(swdpine, "/tmp/comext/swdpine_spread.csv")
readLines("/tmp/comext/swdpine_spread.csv", 5)
# Write the long format
write2csv_long(swdpine, "/tmp/comext/swdpine_long.csv")
readLines("/tmp/comext/swdpine_long.csv", 5)

Use the write2csv functions on many products

tableread <- "vld_comext_monthly"
vld <- tbl(con, tableread) %>%
    distinct(productcode) %>% collect()
lapply(vld$productcode,
       function(p) loadflows(con, p) %>% formatflows() %>% write2csv_spread(paste0("/tmp/",p,".csv")))

message("If there is no quantity, do not export the file")
message("Add cleanallcomextcodes(con) to the cleancomext function")

# On the server (make this into a function)
if(FALSE){
    # 
    con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows")
    tableread <- "vld_comext_monthly"
    vld <- tbl(con, tableread) %>%
        distinct(productcode) %>% collect()
    lapply(vld$productcode,
           function(p) loadflows(con, p) %>% formatflows() %>%  write2csv_spread(paste0("/mnt/sdb/data_raw/csv/8d/",p,".csv")))
}

Prepare aggregates and all products for codes at the higher level

6 digit level

All products under a given 6 digit level product

productanalysed <- 440712
periodstart <- (as.numeric(format(Sys.time(), "%Y")) - 3) * 100
tableread <- "vld_comext_monthly"

# Edit the pattern
productpattern <- paste0(productanalysed, "%")
dtf <- tbl(con, tableread) %>%
    filter(productcode %like% productpattern &
               period > periodstart) %>% 
    addproreppar2tbl(con,.) %>% 
    collect()

Use the function

Aggregates at the 6 digit level

productanalysed <- 440712
periodstart <- 201600
tableread <- "vld_comext_monthly"

# Edit the pattern
productpattern <- paste0(productanalysed, "%")
dtf <- tbl(con, tableread) %>%
    filter(productcode %like% productpattern &
               period > periodstart) %>% 
    group_by(period, reportercode, partnercode, flowcode, unit, statregime) %>% 
    summarise_at(vars(tradevalue, weight, starts_with("quantity")), 
                 sum) %>%
    # Cannot use addproreppar2tbl() because of the error below
    # addproreppar2tbl(con, .) %>% 
    # Error: `by` can't contain join column `productcode` which is missing from LHS
    left_join(tbl(con, "vld_comext_reporter"),
              by = "reportercode") %>%
    left_join(tbl(con, "vld_comext_partner"),
              by = "partnercode") %>%
    collect() %>% 
    mutate(productcode = as.character(productanalysed)) %>% 
    left_join(collect(tbl(con, "vld_comext_product")),
              by = "productcode")

Write aggregates to a csv file

Use the function

4 digit level

All products at the 4 digit level

Aggregates at the 4 digit level

# 4407

Aggregates at the 2 digit level

# 44

Issues

Issues when statregime is not included

productanalysed <- "44071091"
dtfr <- tbl(con, "raw_comext_monthly_201709") %>%
    filter(productcode == productanalysed) %>%
    collect()

# Spread the data as-is (droping the statregime column) returns an error 
if(FALSE){
    dtfr %>% 
        select(ends_with("code"), period, quantity) %>% 
        spread(period, quantity)
}
# Error: Duplicate identifiers for rows (907, 908), (933, 934), (723, 724), (744, 745),
# There are indeed duplicated rows, except for the statregime which is different
dtfr[907:908,]

# Creating an additional index as explained in StackOverflow answers
# https://stackoverflow.com/a/44511254/2641825
# https://stackoverflow.com/a/43259735
dtfr %>% 
    select(ends_with("code"), period, quantity) %>% 
    group_by(period) %>%
    mutate(id = 1:n()) %>%
    spread(period, quantity)
# Leads to a lot of row duplication and a lot of empty column values
# In fact I needed to keep the statregime column as a variable before the spread

RMariaDB::dbDisconnect(con)

stix-global/eutradeflows documentation built on Nov. 13, 2020, 9:23 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

stix-global/eutradeflows
Download bilateral trade flow data from Eurostat Comext

In stix-global/eutradeflows: Download bilateral trade flow data from Eurostat Comext

Introduction

Formatting data for users

File naming convention

CSV explort example

UK import from Indonesia

List of VPA countries

Spread data along the period

Export csv based on raw data for a test

Develop the function that will export validated data to csv

Use the write2csv_spread and write2csv_long functions on one product

Use the write2csv functions on many products

Prepare aggregates and all products for codes at the higher level

6 digit level

All products under a given 6 digit level product

Aggregates at the 6 digit level

4 digit level

All products at the 4 digit level

Aggregates at the 4 digit level

Aggregates at the 2 digit level

Issues

Issues when statregime is not included

R Package Documentation

Browse R Packages

We want your feedback!

stix-global/eutradeflows Download bilateral trade flow data from Eurostat Comext

In stix-global/eutradeflows: Download bilateral trade flow data from Eurostat Comext

Introduction

Formatting data for users

File naming convention

CSV explort example

UK import from Indonesia

List of VPA countries

Spread data along the period

Export csv based on raw data for a test

Develop the function that will export validated data to csv

Use the write2csv_spread and write2csv_long functions on one product

Use the write2csv functions on many products

Prepare aggregates and all products for codes at the higher level

6 digit level

All products under a given 6 digit level product

Aggregates at the 6 digit level

4 digit level

All products at the 4 digit level

Aggregates at the 4 digit level

Aggregates at the 2 digit level

Issues

Issues when statregime is not included

R Package Documentation

Browse R Packages

We want your feedback!

stix-global/eutradeflows
Download bilateral trade flow data from Eurostat Comext