library(knitr) opts_knit$set(root.dir="..") # file paths are relative to the root of the project directory library(tradeflows) library(dplyr) library(reshape2) library(tidyr)
Documentation on how I loaded raw product codes into the database: ~/R/tradeharvester/docs/productcodes.Rmd
if(FALSE){ # If codes are not present, transfer them # create emtpty database structure for raw codes createdbstructure(sqlfile = 'raw_comext.sql', dbname = 'test') # create empty database structure for validated codes createdbstructure(sqlfile = 'vld_comext.sql', dbname = 'test') # transfer raw codes con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") tradeharvester::transfertxtcodesfolder2db(con, rawdatacomextfolder = "~/R/tradeharvester/data_raw/comext/201707/text/english/") RMariaDB::dbDisconnect(con) }
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") # Load all products in a dataframe allproducts <- tbl(con, "raw_comext_product") %>% collect() RMariaDB::dbDisconnect(con) # Keep wood products only wp <- allproducts %>% filter(grepl("^44|^94", productcode)) # For information regexp filtering is also possible with a SQL statement, # see for example # https://stackoverflow.com/questions/15687136/logical-and-operator-in-mysql-regexp # and MySQL regexp documentation # https://dev.mysql.com/doc/refman/5.5/en/regexp.html # Why are dates read as character vectors?
# A data frame to view in the data explorer wpdates <- wp %>% group_by(productcode) %>% summarise(maxdatestart = max(datestart), datestart = paste(datestart, collapse=", "), dateend = paste(dateend, collapse = ", "), # keep only distinct product description productdescription = paste(unique(productdescription), collapse = ", "), n = n()) %>% arrange(desc(n)) # Wood products repeated # Keep only the most recent datestart, which still appears more than once wprep <- wp %>% group_by(productcode) %>% filter(datestart == max(datestart)) %>% summarise(datestart = paste(datestart, collapse=", "), dateend = paste(dateend, collapse = ", "), # keep only distinct product description productdescription = paste(unique(productdescription), collapse = ", "), n = n()) %>% arrange(desc(n)) %>% filter(n>1) wprep %>% kable() # It seems these lines contain identical information # In other words, distinct rows for all columns nrow(unique(wprep)) # should be equal to distinct product codes length(unique(wprep$productcode)) # This can be tested with stopifnot(identical(nrow(unique(wprep)), length(unique(wprep$productcode))))
summary(allproducts$datestart == allproducts$datestart2) summary(allproducts$dateend == allproducts$dateend2) diffdates <- allproducts %>% filter(datestart != datestart2 | dateend != dateend2)
No there are 8 rows in which this is not the case.
# See chunks above to add raw products to the database if they are missing con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") try(cleanallcomextcodes(con)) RMariaDB::dbDisconnect(con)
# first update the package # devtools::install_github("stix-global/eutradeflows") library(tradeflows) # Update the database structures for raw and validated data createdbstructure(sqlfile = "raw_comext.sql", dbname = "tradeflows") createdbstructure(sqlfile = "vld_comext.sql", dbname = "tradeflows") con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows") # Transfer raw codes from the files to the database tradeharvester::transfertxtcodesfolder2db(con, rawdatacomextfolder = "/mnt/sdb/data_raw/comext/201707/text/english/") # Clean codes cleanallcomextcodes(con) # If needed, transfer monthly data again library(tradeharvester) # Transfer monthly recent transfer7zfolder2db(con, rawdatacomextfolder = "/mnt/sdb/data_raw/comext/201708/data", productcodestart = c(44, 94), tablename = "raw_comext_monthly_201708", tabletemplate = "raw_comext_monthly") # Transfer monthly archive # raw_comext_monthly_2016S1 had approximately 30309561 rows before the update transfer7zfolder2db(con, rawdatacomextfolder = "/mnt/sdb/data_raw/comext/2016S1/data", productcodestart = c(44, 94), tablename = "raw_comext_monthly_2016S1", tabletemplate = "raw_comext_monthly")
# See steps above to recreate the database if needed con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") try(cleancode(con, "raw_comext_product", "vld_comext_product", productcode)) # In debug mode inside the cleancode function, # nrow(dtf) # [1] 23994 # nrow(unique(dtf)) # [1] 23994 # Browse[2]> nrow(distinct(dtf, !!codevariable)) # [1] 23994 # Are there no duplicates? # number of rows res <- RMariaDB::dbSendQuery(con, "SELECT COUNT(*) as nrow FROM vld_comext_product") sqltable <- RMariaDB::dbFetch(res) RMariaDB::dbClearResult(res) sqltable$nrow RMariaDB::dbDisconnect(con)
Since units are changing over time, it is not possible to get a single unit per product. This means that a unit is attached to a product and a period.
unit <- tbl(con, "raw_comext_unit") %>% collect() # Is the startdate always in January? unitdates <- unit %>% transmute(unitcode, mdstart = substr(datestart, 5, 10), mdend = substr(dateend, 5, 10)) %>% distinct() # All possible start and end months unitdates %>% distinct(mdstart, mdend) # The only weird product starting in July unitdates %>% filter(mdstart == "-07-01" | mdend == "-06-30") # Change start and end dates to a period in month unit2 <- unit %>% mutate(periodstart = gsub("-", "", substr(datestart,1,7)), periodend = gsub("-", "", substr(dateend,1,7))) %>% select(-datestart, -dateend) # Keep start and end dates only if there is a chage in unit ## Get unique combination of product codes and units unitdistinct <- unit %>% distinct(productcode, unitcode) # A look at the various starting and ending period unit2 %>% distinct(periodstart) %>% arrange(periodstart) unit2 %>% distinct(periodend) %>% arrange(periodend) # When merging units with the trade flows table, # we will take the unit which has a # periodstart <= period <= periodend # Its possible to compare period strings "201206" <= c("201201", "201207", "201206") periods <- c("201201", "201207", "201206", "201412") "201206" <= periods & periods <= "201412" # As well as integer version 201206 <= c(201201, 201207, 201206)
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") # tables product <- tbl(con, "vld_comext_product") reporter <- tbl(con, "vld_comext_reporter") partner <- tbl(con, "vld_comext_partner") monthly <- tbl(con, "raw_comext_monthly_201707") # if needed put back the data again if(FALSE){ # Run this in a separate R session, the transfer will take several minutes library(tradeharvester) setwd("~/R/tradeharvester") con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") transfer7zfolder2db(con, rawdatacomextfolder = "data_raw/comext/201707/data", productcodestart = c(44, 94), tablename = "raw_comext_monthly_201707", tabletemplate = "raw_comext_monthly") RMariaDB::dbDisconnect(con) } # Get only raw data for one product code wp <- monthly %>% filter(productcode == 44L) %>% collect() # Get raw data with product and country information for one product code query <- monthly %>% filter(productcode == 44L) %>% # join left_join(product, by = "productcode") %>% left_join(reporter, by = "reportercode") %>% left_join(partner, by = "partnercode") # Show the SQL query query %>% explain() # Get data from the database dtf <- query %>% collect() dtf %>% head %>% kable() RMariaDB::dbDisconnect(con)
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") monthly <- tbl(con, "raw_comext_monthly_201707") m44 <-monthly %>% filter(productcode == 44) %>% addproreppar2tbl(con, .) %>% collect() m44 %>% head(20) %>% kable() unique(m44[c("productcode","productdescription")])
``{sql sqlfromdplyr, connection=con}
SELECT *
FROM (SELECT
TBL_LEFT.
reportercodeAS
reportercode,
TBL_LEFT.
partnercodeAS
partnercode,
TBL_LEFT.
productcodeAS
productcode,
TBL_LEFT.
flowcodeAS
flowcode,
TBL_LEFT.
statregimeAS
statregime,
TBL_LEFT.
periodAS
period,
TBL_LEFT.
tradevalueAS
tradevalue,
TBL_LEFT.
weightAS
weight,
TBL_LEFT.
quantityAS
quantity,
TBL_LEFT.
productdescriptionAS
productdescription,
TBL_RIGHT.
reporterAS
reporterFROM (SELECT
TBL_LEFT.
reportercodeAS
reportercode,
TBL_LEFT.
partnercodeAS
partnercode,
TBL_LEFT.
productcodeAS
productcode,
TBL_LEFT.
flowcodeAS
flowcode,
TBL_LEFT.
statregimeAS
statregime,
TBL_LEFT.
periodAS
period,
TBL_LEFT.
tradevalueAS
tradevalue,
TBL_LEFT.
weightAS
weight,
TBL_LEFT.
quantityAS
quantity,
TBL_RIGHT.
productdescriptionAS
productdescriptionFROM (SELECT * FROM
raw_comext_monthly_201707WHERE (
productcode= 44))
TBL_LEFTLEFT JOIN
vld_comext_productAS
TBL_RIGHTON (
TBL_LEFT.
productcode=
TBL_RIGHT.
productcode))
TBL_LEFTLEFT JOIN
vld_comext_reporterAS
TBL_RIGHTON (
TBL_LEFT.
reportercode=
TBL_RIGHT.
reportercode))
TBL_LEFTLEFT JOIN
vld_comext_partnerAS
TBL_RIGHTON (
TBL_LEFT.
partnercode=
TBL_RIGHT.
partnercode`)
Is it possible to reduce the size of this chunk? Stackoverflow [Multiple left join and where clause](https://stackoverflow.com/a/14261094/2641825) ## Join 2 tables ```{sql eval=FALSE} SELECT * FROM raw_comext_monthly_201707 JOIN vld_comext_product ON raw_comext_monthly_201707.productcode = vld_comext_product.productcode WHERE raw_comext_monthly_201707.productcode = 44
```{sql eval=FALSE} SELECT * FROM raw_comext_monthly_201707 LEFT JOIN vld_comext_product ON raw_comext_monthly_201707.productcode = vld_comext_product.productcode LEFT JOIN vld_comext_reporter ON raw_comext_monthly_201707.reportercode = vld_comext_reporter.reportercode LEFT JOIN vld_comext_partner ON raw_comext_monthly_201707.partnercode = vld_comext_partner.partnercode WHERE raw_comext_monthly_201707.productcode = 44
The same using an alias ```{sql sqlwithalias, connection=con} SELECT * FROM raw_comext_monthly_201707 as m LEFT JOIN vld_comext_product ON m.productcode = vld_comext_product.productcode LEFT JOIN vld_comext_reporter ON m.reportercode = vld_comext_reporter.reportercode LEFT JOIN vld_comext_partner ON m.partnercode = vld_comext_partner.partnercode WHERE m.productcode = 44
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test") inputfields <- RMariaDB::dbListFields(con,"raw_comext_product") outputfields <- RMariaDB::dbListFields(con,"vld_comext_product") # validated fields should exist in the raw table stopifnot(outputfields %in% inputfields) # clean products: cleancode(con, "raw_comext_product", "vld_comext_product", productcode) # inside that function, dtf has 21833 rows, this is the number of most recent codes # returns the error # Error: identical(nrow(unique(dtf)), nrow(distinct(dtf, !(!codevariable)))) is not TRUE # Load all products in a dataframe allproducts <- tbl(con, "raw_comext_product") %>% collect() RMariaDB::dbDisconnect(con) # Select output fields # allproducts %>% select(outputfields) # allproducts[outputfields] dtf <- allproducts %>% group_by(productcode) %>% filter(datestart == max(datestart)) stopifnot(identical(nrow(unique(dtf)), nrow(distinct(dtf, productcode)))) # difference nrow(unique(dtf)) - nrow(distinct(dtf, productcode)) # What are those duplicated rows? # group by productcode dtf_c <- dtf %>% group_by(productcode) %>% mutate(n = n()) %>% arrange(desc(n)) %>% filter(n>2) dtf_c_u <- unique(dtf_c) length(unique(dtf_c_u$productcode)) # group by productcode and productdescription dtf_cd <- dtf %>% group_by(productcode, productdescription) %>% mutate(n = n()) %>% arrange(desc(n)) %>% filter(n>2) dtf_cd_u <- unique(dtf_cd) length(unique(dtf_cd_u$productcode)) # group by productcode, productdescription, datestart and dateend dtf_cdse <- dtf %>% group_by(productcode, productdescription, datestart, dateend) %>% mutate(n = n()) %>% arrange(desc(n)) %>% filter(n>2) dtf_cdse_u <- unique(dtf_cdse) length(unique(dtf_cdse_u$productcode)) # It seems the descriptions are different: dtf_c_u$productdescription[1:5]
# Looking at the source file, the duplicated codes issue seems to be due to the fact that # productcodes do not contain only numbers, but can also contain characters # Reading the text file line by line readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") %>% grep("7302;", ., value=TRUE) # For example 7302I0 or 7302I2 are product codes, which get truncated during the database load process. # I decided to change this variable to a VARCHAR type in MySQL # Trying to read the text file into a data frame # Maximum character length # product <- read.table("../tradeharvester/data_raw/comext/201707/text/english/CN.txt", # sep = "\t") # Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : # line 5534 did not have 6 elements cntxt <- readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") cntxt[5534] # but it does have 6 elements! product_20089218 <- read.table(text = cntxt[5534], header = FALSE, sep="\t") # cntxt[c(1:5533,5535,length(cntxt))] product0 <- read.table(text = paste(cntxt[1:1000],collapse="\n"), header = FALSE, sep = "\t") max(nchar(as.character(product0$V1))) # I'll make it a varchar(10)
This issue is also present in the harvested raw data. See further development in tradeharvester/docs/productcoddes.Rmd.
# Reuse a MySQL connection handle "con" from above # Issue present in validated product codes pvld <- tbl(con, "vld_comext_product") %>% filter(productcode == 03035590) %>% collect() pvld$productdescription pvld %>% kable() # Issue also present in raw product codes praw <- tbl(con, "raw_comext_product") %>% collect() # !!! only visible if I collect that product alone (data type issue?) praw03035590 <- tbl(con, "raw_comext_product") %>% filter(productcode == 03035590) %>% collect() praw03035590$productdescription praw03035590 %>% kable() # by comparison this data frame supposed to contain all products doesn't contain it praw %>% filter(productcode == 03035590) # Did something similar happen for forest products as well?
What happened in the text file?
cntxt <- readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") %>% grep("03035590", ., value=TRUE) cntxt
It appears there are 3 quotes in the middle of the description. Which seems to be traded as one long character by the function that transfer product codes.
See product code 47073090
Again the issue seems to be related to an unever number of quotation marks.
But I have alread changed the read.delim
parameter in writetxtcodes2db
The issue due to the fact that I was using
the old table name val_comext_product
instead of the new name vld_comext_product
.
cntxt <- readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") %>% grep("47073090", ., value=TRUE) cntxt cn <- read.delim("../tradeharvester/data_raw/comext/201707/text/english/CN.txt", header = FALSE, quote = "", # disable quotation stringsAsFactors = FALSE) cn[cn$V1==47073090,]$V4 # The next one cn[cn$V1==470790,]$V4 cn[cn$V1==47079010,]$V4 # Looks OK, how about in the database? # Raw table rawcn4707 <- tbl(con, "raw_comext_product") %>% collect() %>% filter(grepl("4707", productcode)) # rawcn4707 %>% kable() rawcn4707$productdescription[rawcn4707$productcode==47073090] # Validated table valcn4707 <- tbl(con, "vld_comext_product") %>% collect() %>% filter(grepl("4707", productcode)) # valcn4707 %>% kable() valcn4707$productdescription[valcn4707$productcode==47073090]
# In the tradeflows database, where most of the real data is located con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows") # Get a vector of products available in the database # Products in the recent data dtfr <- tbl(con, "raw_comext_monthly_201709") %>% distinct(productcode) %>% collect() %>% # Remove 2 digit level products, keep only 8 digit level filter(nchar(productcode)>2) # Count the number of products under each 2 digit product code dtfr %>% mutate(productcode2d = substr(productcode,1,2)) %>% count(productcode2d) # Count the number of products under each 4 digit product code dtfr %>% mutate(productcode2d = substr(productcode,1,4)) %>% count(productcode2d) %>% arrange(desc(n)) # There are 71 product codes at the 4 digit level. # The 4 digit code containing most products is 4407. # Under 4407 there are 73 product codes at the 8 digit level. # Count the number of products under each 6 digit product code dtfr %>% mutate(productcode2d = substr(productcode,1,6)) %>% count(productcode2d) %>% arrange(desc(n)) # There are 304 product codes at the 4 digit level. RMariaDB::dbDisconnect(con)
See data_raw/produc_classifications.R to see where the IMM product classification is loaded.
eutradeflows::classificationimm
is part of the package.
Based on the number of product per group below, I'll make two drop down lists. One with productgroupimm and one with the productimm. The list containing productimm will be populated from the first one.
How many product codes per imm product group?
classificationimm %>% group_by(productgroupimm) %>% tally() %>% arrange(desc(n))
How many productimm per imm product group?
classificationimm %>% distinct(productgroupimm, productimm) %>% group_by(productgroupimm) %>% tally()
# Checking if products under a productimm name share the same product code at the 4 digit level cimm <- classificationimm %>% group_by(productimm) %>% mutate(pc4 = substr(productcode,0,4)) %>% summarise(n = n(), pc4 = paste(unique(pc4),collapse=", ")) %>% arrange(desc(n)) # How many productimm under 2 digit product code and productgroupimm? classificationimm %>% mutate(pc2 = substr(productcode,0,2)) %>% distinct(pc2, productimm, productgroupimm) %>% group_by(productgroupimm) %>% summarise(n=n(), pc2 = paste(unique(pc2), collapse=", ")) %>% arrange(productgroupimm)
Are product codes unique in that table?
classificationimm %>% nrow() classificationimm %>% distinct(productcode) %>% nrow()
cleancodes should first create the database structure. The cleancode function should check which columns are present in the validated table and select only these ones.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.