message("A lot of the code below has to be rewritten because missing data is not zero anymore. Missing data is now correctly encoded as NULL in the database and imported as NA values in R.") # Comment out to pass tableanalysed as a parameter tableanalysed = 'raw_comext_monthly_201709' # Render this long document in a separate bash console with the command: # cd ~/R/eutradeflows && Rscript -e "tableanalysed = 'raw_comext_monthly_201709'; rmarkdown::render('docs/gapanalysis.Rmd')"
library(knitr) opts_knit$set(root.dir="..") # file paths are relative to the root of the project directory library(eutradeflows) library(dplyr) library(dbplyr) library(tidyr) library(ggplot2) # Use rotated text at some point to place country names in table heading # https://tex.stackexchange.com/questions/201696/how-to-rotate-text-in-a-cell-and-center-it-vertically-and-horizontally
Connect to the database.
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows")
Following analysis is based on the most recent monthly data downloaded from the Eurostat Comext bulk repository.
The eurostat Comext database contains information on import and export flows between european countries and over 200 partner countries. Each flow between a reporter and a partner country contains 3 informations: trade value, weight and quantity. All 3 informations given by the Comext trade database can have gaps. Gaps are measured below in terms of number of database rows for which a given value is equal to zero. There are far fewer gaps in trade value compared to weight and quantity. ```{sql comparegaps, connection = con, tab.cap="Gaps in the 3 variables present in Comext"} select * from (select count() as 'gaps in trade value' from raw_comext_monthly_201709 where tradevalue = 0 ) as x, (select count() as 'gaps in weight' from raw_comext_monthly_201709 where weight = 0) as y, (select count() as 'gaps in quantity' from raw_comext_monthly_201709 where quantity = 0) as z, (select count() as 'total number of rows' from raw_comext_monthly_201709) as t;
## Descriptive statistics of the recent Comext monthly data Descriptive queries on the database table raw_comext_monthly. Number of rows in the table: ```{sql rowsintable, connection=con} select count(*) from raw_comext_monthly_201709;
Number of products ```{sql countproduct, connection=con} select count(distinct(productcode)) from raw_comext_monthly_201709;
Number of reporter and partner countries ```{sql countreporter, connection=con} select count(distinct(reportercode)) as 'number of reporters', count(distinct(partnercode)) as 'number of partners' from raw_comext_monthly_201709;
First and last month ```{sql minperiod, connection=con} select min(period), max(period) from raw_comext_monthly_201709;
# Gaps in trade value Number of database rows for which the trade value is equal to 0. ```r gtv <- tbl(con, tableanalysed) %>% filter(tradevalue == 0) %>% group_by(productcode, reportercode, period) %>% summarise(n = n()) %>% left_join(tbl(con, "vld_comext_reporter"), by="reportercode") %>% collect() %>% ungroup() %>% mutate(productcode = as.character(productcode), prod2 = as.numeric(substr(productcode, 1,2))) gtv %>% group_by(reporter, prod2) %>% summarise(n = sum(n)) %>% left_join(tradeharvester::products2harvest, by = c("prod2" = "productcode")) %>% unite(product, prod2, productdescription) %>% spread(product, n, fill="") %>% kable() gtv %>% mutate(year = substr(as.character(period),1,4)) %>% group_by(reporter, year) %>% summarise(n = sum(n)) %>% spread(year, n, fill="") %>% kable()
Heat map plots inspired by a blog post on learnr.wordpress.com.
gtv_period <- tbl(con, tableanalysed) %>% filter(tradevalue == 0) %>% group_by(reportercode, period) %>% summarise(n = n()) %>% left_join(tbl(con, "vld_comext_reporter"), by = "reportercode") %>% collect() %>% mutate(period = as.character(period)) ggplot(gtv_period, aes(reporter, period)) + geom_tile(aes(fill = n), colour = "white") + scale_fill_gradient(low = "white", high = "steelblue") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
gtv_product <- tbl(con, tableanalysed) %>% filter(tradevalue == 0) %>% group_by(reportercode, productcode) %>% summarise(n = n()) %>% left_join(tbl(con, "vld_comext_reporter"), by = "reportercode") %>% collect() %>% mutate(productcode = as.character(productcode), p4 = substr(productcode, 1,4)) ggplot(gtv_product, aes(reporter, p4)) + geom_tile(aes(fill = n), colour = "white") + scale_fill_gradient(low = "white", high = "steelblue") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
There was a time in 2015 where Croatia had a great number of gaps in the trade value, but in 2017, this has been mostly fixed.
Number of lines for which the weight is equal to zero.
gw <- tbl(con, tableanalysed) %>% filter(weight == 0) %>% group_by(productcode, reportercode, period) %>% summarise(n = n()) %>% left_join(tbl(con, "vld_comext_reporter"), by="reportercode") %>% collect() %>% ungroup() %>% mutate(productcode = as.character(productcode), prod2 = as.numeric(substr(productcode, 1,2))) gw %>% group_by(reporter, prod2) %>% summarise(n = sum(n)) %>% left_join(tradeharvester::products2harvest, by = c("prod2" = "productcode")) %>% unite(product, prod2, productdescription) %>% spread(product, n, fill="") %>% kable() gw %>% mutate(year = substr(as.character(period),1,4)) %>% group_by(reporter, year) %>% summarise(n = sum(n)) %>% spread(year, n, fill="") %>% kable()
gw_period <- tbl(con, tableanalysed) %>% filter(weight == 0) %>% group_by(reportercode, period) %>% summarise(n = n()) %>% left_join(tbl(con, "vld_comext_reporter"), by = "reportercode") %>% collect() %>% mutate(period = as.character(period)) ggplot(gw_period, aes(reporter, period)) + geom_tile(aes(fill = n), colour = "white") + scale_fill_gradient(low = "white", high = "steelblue") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
gw_product <- tbl(con, tableanalysed) %>% filter(weight== 0) %>% group_by(reportercode, productcode) %>% summarise(n = n()) %>% left_join(tbl(con, "vld_comext_reporter"), by = "reportercode") %>% collect() %>% mutate(productcode = as.character(productcode), p4 = substr(productcode, 1,4)) ggplot(gw_product, aes(reporter, p4)) + geom_tile(aes(fill = n), colour = "white") + scale_fill_gradient(low = "white", high = "steelblue") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Number of rows which have missing weight and quantity dtf <- tbl(con, tableanalysed) %>% group_by(productcode, reportercode, flowcode) %>% mutate(weightna = is.na(weight), weight0 = weight==0, quantityna = is.na(quantity), quantity0 = quantity == 0) %>% summarise(nrow = n(), # total number of rows weightna = sum(weightna), weight0 = sum(weight0), quantityna = sum(quantityna), quantity0 = sum(quantity0)) %>% collect() %>% # keep only products at the 8 digit level filter(nchar(productcode) == 8) # Missing data by great product code dtf %>% mutate(productcode2d = substr(productcode, 1,2)) %>% group_by(productcode2d) %>% summarise(nrow = sum(nrow), weightna = sum(weightna), weight0 = sum(weight0, na.rm=TRUE), quantityna = sum(quantityna), quantity0 = sum(quantity0, na.rm = TRUE)) dtf %>% mutate(productcode2d = substr(productcode, 1,2)) %>% group_by(productcode2d) %>% summarise(weightna_percent = round(sum(weightna) /sum(nrow) * 100,1), quantityna_percent = round(sum(quantityna) / sum(nrow)*100,1)) %>% kable() # Missing data by reporter for product code 44 dtf %>% filter(substr(productcode, 1,2) == "44") %>% group_by(reportercode) %>% summarise(weightna_percent = round(sum(weightna) /sum(nrow) * 100,1), quantityna_percent = round(sum(quantityna) / sum(nrow)*100,1)) %>% kable()
dtf <- tbl(con, tableanalysed) %>% # filter(is.na(weight) & !is.na(quantity)) %>% group_by(productcode, reportercode, flowcode) %>% mutate(weightna = is.na(weight), quantityna = is.na(quantity), weightnaquantitynona = is.na(weight) & !is.na(quantity)) %>% summarise(nrow = n(), # total number of rows weightna = sum(weightna), quantityna = sum(quantityna), weightnaquantitynona = sum(weightnaquantitynona) ) %>% collect() %>% # keep only products at the 8 digit level filter(nchar(productcode) == 8) dtf %>% mutate(productcode2d = substr(productcode, 1,2)) %>% group_by(productcode2d) %>% summarise(weightnaquantitynona_percent = round(sum(weightnaquantitynona) /sum(nrow) * 100,1)) %>% kable()
How are empty values encoded in the text file? ```{bash eval=FALSE} cp ~/R/tradeharvester/data_raw/comext/201709/data/nc201501.7z /tmp/comext cd /tmp/comext/ 7zr e nc201501.7z
awk -F, '$3 ~ /^44/' nc201501.dat | less
In `less`, search for /2675.752 Empty value is represented by an empty string at the end of the line. Compared for example the end of the 2 lines below:
001,0053,44,1,4,201501,2675.752,3247.776, 001,0053,44032011,2,4,201501,35.48,391.5,870
Can this be read as NA instead of 0? How about in the middle of the line? It seems empty values are then encoded as "0" when in the middle of the line:
003,0960,44,1,4,201501,0.157,0, 003,0960,44,2,4,201501,0.141,0.003, 003,0960,44190090,2,4,201501,0.139,0.003, 003,0960,44MMM000,1,4,201501,0.157,0, 003,0960,44MMM000,2,4,201501,0.002,0,
# Product descriptions ```r tbl(con, tableanalysed) %>% distinct(productcode) %>% left_join(tbl(con, "vld_comext_product"), by = "productcode") %>% collect() %>% # correct encoding issue, see docs/productcodes.Rmd mutate(productdescription = gsub(", MA.*ARANDUBA",", MACARANDUBA", productdescription)) %>% kable()
Disconnect from the database.
RMariaDB::dbDisconnect(con)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.