library(knitr)
opts_knit$set(root.dir="..") # file paths are relative to the root of the project directory
library(tradeflows)
library(dplyr)
library(reshape2)
library(tidyr)

See also

Documentation on how I loaded raw product codes into the database: ~/R/tradeharvester/docs/productcodes.Rmd

If codes are not present, transfer them

if(FALSE){ # If codes are not present, transfer them
    # create emtpty database structure for raw codes
    createdbstructure(sqlfile = 'raw_comext.sql', dbname = 'test')
    # create empty database structure for validated codes
    createdbstructure(sqlfile = 'vld_comext.sql', dbname = 'test')
    # transfer raw codes
    con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
    tradeharvester::transfertxtcodesfolder2db(con, rawdatacomextfolder = "~/R/tradeharvester/data_raw/comext/201707/text/english/")
    RMariaDB::dbDisconnect(con)
}

Explore raw product codes

Load raw product codes in a data frame

con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")

# Load all products in a dataframe
allproducts <- tbl(con, "raw_comext_product") %>% collect()
RMariaDB::dbDisconnect(con)

# Keep wood products only
wp <- allproducts %>% 
    filter(grepl("^44|^94", productcode))

# For information regexp filtering is also possible with a SQL statement, 
# see for example 
# https://stackoverflow.com/questions/15687136/logical-and-operator-in-mysql-regexp
# and MySQL regexp documentation
# https://dev.mysql.com/doc/refman/5.5/en/regexp.html

# Why are dates read as character vectors?

Are recent product codes unique?

# A data frame to view in the data explorer
wpdates <- wp %>% 
    group_by(productcode) %>% 
    summarise(maxdatestart = max(datestart),
              datestart = paste(datestart, collapse=", "), 
              dateend = paste(dateend, collapse = ", "),
              # keep only distinct product description
              productdescription = paste(unique(productdescription), collapse = ", "),
              n = n()) %>% 
    arrange(desc(n))

# Wood products repeated
# Keep only the most recent datestart, which still appears more than once
wprep <- wp %>% 
    group_by(productcode) %>% 
    filter(datestart == max(datestart)) %>% 
    summarise(datestart = paste(datestart, collapse=", "), 
              dateend = paste(dateend, collapse = ", "),
              # keep only distinct product description
              productdescription = paste(unique(productdescription), collapse = ", "),
              n = n()) %>% 
    arrange(desc(n)) %>% 
    filter(n>1) 
wprep %>% kable()

# It seems these lines contain identical information
# In other words, distinct rows for all columns
nrow(unique(wprep)) 
# should be equal to distinct product codes
length(unique(wprep$productcode))
# This can be tested with 
stopifnot(identical(nrow(unique(wprep)), length(unique(wprep$productcode))))    

Are datestart2 and dateend2 allways identicall to datestart and dateend?

summary(allproducts$datestart == allproducts$datestart2)
summary(allproducts$dateend == allproducts$dateend2)
diffdates <- allproducts %>% 
    filter(datestart != datestart2 |
           dateend != dateend2)

No there are 8 rows in which this is not the case.

cleancodes

cleanallcomextcodes function

# See chunks above to add raw products to the database if they are missing
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
try(cleanallcomextcodes(con))
RMariaDB::dbDisconnect(con)

cleanallcomextcodes on the server

# first update the package
# devtools::install_github("stix-global/eutradeflows")
library(tradeflows)
# Update the database structures for raw and validated data
createdbstructure(sqlfile = "raw_comext.sql", dbname = "tradeflows")
createdbstructure(sqlfile = "vld_comext.sql", dbname = "tradeflows")
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows")
# Transfer raw codes from the files to the database
tradeharvester::transfertxtcodesfolder2db(con, rawdatacomextfolder = "/mnt/sdb/data_raw/comext/201707/text/english/")
# Clean codes
cleanallcomextcodes(con)


# If needed, transfer monthly data again
library(tradeharvester)
# Transfer monthly recent
transfer7zfolder2db(con,
                    rawdatacomextfolder = "/mnt/sdb/data_raw/comext/201708/data",
                    productcodestart = c(44, 94),
                    tablename = "raw_comext_monthly_201708",
                    tabletemplate = "raw_comext_monthly")

# Transfer monthly archive
# raw_comext_monthly_2016S1 had approximately 30309561 rows before the update
transfer7zfolder2db(con,
                    rawdatacomextfolder = "/mnt/sdb/data_raw/comext/2016S1/data",
                    productcodestart = c(44, 94),
                    tablename = "raw_comext_monthly_2016S1",
                    tabletemplate = "raw_comext_monthly")

Clean product codes function

# See steps above to recreate the database if needed
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
try(cleancode(con, "raw_comext_product", "vld_comext_product", productcode))

# In debug mode inside the cleancode function,
# nrow(dtf) 
# [1] 23994
# nrow(unique(dtf))
# [1] 23994
# Browse[2]> nrow(distinct(dtf, !!codevariable))
# [1] 23994
# Are there no duplicates?

# number of rows
res <- RMariaDB::dbSendQuery(con, "SELECT COUNT(*) as nrow FROM vld_comext_product")
sqltable <- RMariaDB::dbFetch(res)
RMariaDB::dbClearResult(res)
sqltable$nrow

RMariaDB::dbDisconnect(con)

Clean units

Since units are changing over time, it is not possible to get a single unit per product. This means that a unit is attached to a product and a period.

unit <- tbl(con, "raw_comext_unit") %>% collect()

# Is the startdate always in January?
unitdates <- unit %>% 
    transmute(unitcode, 
              mdstart = substr(datestart, 5, 10),
              mdend   = substr(dateend, 5, 10)) %>% 
    distinct()
# All possible start and end months
unitdates %>% distinct(mdstart, mdend) 
# The only weird product starting in July
unitdates %>% filter(mdstart == "-07-01" | mdend == "-06-30")

# Change start and end dates to a period in month
unit2 <- unit %>%
    mutate(periodstart = gsub("-", "", substr(datestart,1,7)),
           periodend   = gsub("-", "", substr(dateend,1,7))) %>% 
    select(-datestart, -dateend)

# Keep start and end dates only if there is a chage in unit
## Get unique combination of product codes and units
unitdistinct <- unit %>% distinct(productcode, unitcode)

# A look at the various starting and ending period
unit2 %>% distinct(periodstart) %>% arrange(periodstart)
unit2 %>% distinct(periodend) %>% arrange(periodend)


# When merging units with the trade flows table, 
# we will take the unit which has a 
# periodstart <= period <= periodend
# Its possible to compare period strings
"201206" <= c("201201", "201207", "201206") 
periods <- c("201201", "201207", "201206", "201412") 
"201206" <= periods & periods <= "201412"
# As well as integer version
201206 <= c(201201, 201207, 201206)    

Add information

Merge with dplyr::left_join

con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
# tables
product <- tbl(con, "vld_comext_product")
reporter <- tbl(con, "vld_comext_reporter")
partner <- tbl(con, "vld_comext_partner")
monthly <- tbl(con, "raw_comext_monthly_201707")


# if needed put back the data again
if(FALSE){
    # Run this in a separate R session, the transfer will take several minutes
    library(tradeharvester)
    setwd("~/R/tradeharvester")
    con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
    transfer7zfolder2db(con,
                        rawdatacomextfolder = "data_raw/comext/201707/data",
                        productcodestart = c(44, 94),
                        tablename = "raw_comext_monthly_201707",
                        tabletemplate = "raw_comext_monthly")
    RMariaDB::dbDisconnect(con)
}


# Get only raw data for one product code
wp <- monthly %>% 
    filter(productcode == 44L) %>% collect()
# Get raw data with product and country information for one product code
query <- monthly %>% 
    filter(productcode == 44L) %>% 
    # join    
    left_join(product, by = "productcode") %>% 
    left_join(reporter, by = "reportercode") %>% 
    left_join(partner, by = "partnercode")
# Show the SQL query
query %>% explain()
# Get data from the database
dtf <- query %>% collect()
dtf %>% head %>% kable()

RMariaDB::dbDisconnect(con)

Use the function addproreppar2tbl

con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
monthly <- tbl(con, "raw_comext_monthly_201707")
m44 <-monthly %>%
    filter(productcode == 44) %>%
    addproreppar2tbl(con, .) %>%
    collect()
m44 %>% head(20) %>%  kable()
unique(m44[c("productcode","productdescription")])

Merge with SQL LEFT JOIN

``{sql sqlfromdplyr, connection=con} SELECT * FROM (SELECTTBL_LEFT.reportercodeASreportercode,TBL_LEFT.partnercodeASpartnercode,TBL_LEFT.productcodeASproductcode,TBL_LEFT.flowcodeASflowcode,TBL_LEFT.statregimeASstatregime,TBL_LEFT.periodASperiod,TBL_LEFT.tradevalueAStradevalue,TBL_LEFT.weightASweight,TBL_LEFT.quantityASquantity,TBL_LEFT.productdescriptionASproductdescription,TBL_RIGHT.reporterASreporterFROM (SELECTTBL_LEFT.reportercodeASreportercode,TBL_LEFT.partnercodeASpartnercode,TBL_LEFT.productcodeASproductcode,TBL_LEFT.flowcodeASflowcode,TBL_LEFT.statregimeASstatregime,TBL_LEFT.periodASperiod,TBL_LEFT.tradevalueAStradevalue,TBL_LEFT.weightASweight,TBL_LEFT.quantityASquantity,TBL_RIGHT.productdescriptionASproductdescriptionFROM (SELECT * FROMraw_comext_monthly_201707WHERE (productcode= 44))TBL_LEFTLEFT JOINvld_comext_productASTBL_RIGHTON (TBL_LEFT.productcode=TBL_RIGHT.productcode))TBL_LEFTLEFT JOINvld_comext_reporterASTBL_RIGHTON (TBL_LEFT.reportercode=TBL_RIGHT.reportercode))TBL_LEFTLEFT JOINvld_comext_partnerASTBL_RIGHTON (TBL_LEFT.partnercode=TBL_RIGHT.partnercode`)

Is it possible to reduce the size of this chunk?

Stackoverflow [Multiple left join and where clause](https://stackoverflow.com/a/14261094/2641825)

## Join 2 tables
```{sql eval=FALSE}
SELECT *
FROM   raw_comext_monthly_201707
JOIN   vld_comext_product ON  raw_comext_monthly_201707.productcode = vld_comext_product.productcode
WHERE  raw_comext_monthly_201707.productcode = 44

Join 4 tables: monthly data, product, reporter, partner

```{sql eval=FALSE} SELECT * FROM raw_comext_monthly_201707 LEFT JOIN vld_comext_product ON raw_comext_monthly_201707.productcode = vld_comext_product.productcode LEFT JOIN vld_comext_reporter ON raw_comext_monthly_201707.reportercode = vld_comext_reporter.reportercode LEFT JOIN vld_comext_partner ON raw_comext_monthly_201707.partnercode = vld_comext_partner.partnercode WHERE raw_comext_monthly_201707.productcode = 44

The same using an alias
```{sql sqlwithalias, connection=con}
SELECT *
FROM   raw_comext_monthly_201707 as m
LEFT JOIN vld_comext_product ON m.productcode = vld_comext_product.productcode
LEFT JOIN vld_comext_reporter ON m.reportercode = vld_comext_reporter.reportercode
LEFT JOIN vld_comext_partner ON m.partnercode = vld_comext_partner.partnercode
WHERE  m.productcode = 44

Issues

20170816 cleancodes function debugging before the change of productcodes to a character variable

con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "test")
inputfields <- RMariaDB::dbListFields(con,"raw_comext_product")
outputfields <- RMariaDB::dbListFields(con,"vld_comext_product")
# validated fields should exist in the raw table
stopifnot(outputfields %in% inputfields)

# clean products:
cleancode(con, "raw_comext_product", "vld_comext_product", productcode)
# inside that function, dtf has 21833 rows, this is the number of most recent codes
# returns the error
# Error: identical(nrow(unique(dtf)), nrow(distinct(dtf, !(!codevariable)))) is not TRUE

# Load all products in a dataframe
allproducts <- tbl(con, "raw_comext_product") %>% collect()
RMariaDB::dbDisconnect(con)


# Select output fields
# allproducts %>% select(outputfields)
# allproducts[outputfields]

dtf <- allproducts %>% 
    group_by(productcode) %>%
    filter(datestart == max(datestart))
stopifnot(identical(nrow(unique(dtf)),
                    nrow(distinct(dtf, productcode))))
# difference
nrow(unique(dtf)) - nrow(distinct(dtf, productcode))

# What are those duplicated rows?
# group by productcode
dtf_c <- dtf %>%  
    group_by(productcode) %>% 
    mutate(n = n()) %>% 
    arrange(desc(n)) %>% 
    filter(n>2)
dtf_c_u <- unique(dtf_c)
length(unique(dtf_c_u$productcode))

# group by productcode and productdescription
dtf_cd <- dtf %>%  
    group_by(productcode, productdescription) %>% 
    mutate(n = n()) %>% 
    arrange(desc(n)) %>% 
    filter(n>2)
dtf_cd_u <- unique(dtf_cd)
length(unique(dtf_cd_u$productcode))

# group by productcode, productdescription, datestart and dateend
dtf_cdse <- dtf %>%  
    group_by(productcode, productdescription, datestart, dateend) %>% 
    mutate(n = n()) %>% 
    arrange(desc(n)) %>% 
    filter(n>2)
dtf_cdse_u <- unique(dtf_cdse)
length(unique(dtf_cdse_u$productcode))


# It seems the descriptions are different:
dtf_c_u$productdescription[1:5]

Reading the raw text file directly

# Looking at the source file, the duplicated codes issue seems to be due to the fact that 
# productcodes do not contain only numbers, but can also contain characters
# Reading the text file line by line
readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") %>% 
    grep("7302;", ., value=TRUE)
# For example 7302I0 or 7302I2 are product codes, which get truncated during the database load process.

# I decided to change this variable to a VARCHAR type in MySQL

# Trying to read the text file into a data frame
# Maximum character length
# product <- read.table("../tradeharvester/data_raw/comext/201707/text/english/CN.txt",
#                       sep = "\t")  
# Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  : 
#   line 5534 did not have 6 elements
cntxt <- readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt")
cntxt[5534]
# but it does have 6 elements!
product_20089218 <- read.table(text = cntxt[5534], header = FALSE, sep="\t")
# cntxt[c(1:5533,5535,length(cntxt))]
product0 <- read.table(text = paste(cntxt[1:1000],collapse="\n"), 
                  header = FALSE, sep = "\t")
max(nchar(as.character(product0$V1)))

# I'll make it a varchar(10)

20170823 many codes in one description

This issue is also present in the harvested raw data. See further development in tradeharvester/docs/productcoddes.Rmd.

# Reuse a MySQL connection handle "con" from above
# Issue present in validated product codes
pvld <- tbl(con, "vld_comext_product") %>% 
    filter(productcode == 03035590) %>% 
    collect()
pvld$productdescription
pvld %>% kable()

# Issue also present in raw product codes
praw <- tbl(con, "raw_comext_product") %>% collect()

# !!! only visible if I collect that product alone (data type issue?)
praw03035590 <- tbl(con, "raw_comext_product")  %>% 
    filter(productcode == 03035590) %>% 
    collect()
praw03035590$productdescription
praw03035590 %>% kable()
# by comparison this data frame supposed to contain all products doesn't contain it
praw %>% filter(productcode == 03035590) 


# Did something similar happen for forest products as well?

What happened in the text file?

cntxt <- readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") %>% 
    grep("03035590", ., value=TRUE)
cntxt

It appears there are 3 quotes in the middle of the description. Which seems to be traded as one long character by the function that transfer product codes.

20170828 again many codes in one description

See product code 47073090 Again the issue seems to be related to an unever number of quotation marks. But I have alread changed the read.delim parameter in writetxtcodes2db The issue due to the fact that I was using the old table name val_comext_product instead of the new name vld_comext_product.

cntxt <- readLines("../tradeharvester/data_raw/comext/201707/text/english/CN.txt") %>% 
    grep("47073090", ., value=TRUE)

cntxt

cn <- read.delim("../tradeharvester/data_raw/comext/201707/text/english/CN.txt",
                 header = FALSE,
                 quote = "", # disable quotation
                 stringsAsFactors = FALSE)
cn[cn$V1==47073090,]$V4
# The next one
cn[cn$V1==470790,]$V4
cn[cn$V1==47079010,]$V4
# Looks OK, how about in the database?

# Raw table
rawcn4707 <- tbl(con, "raw_comext_product") %>% 
    collect() %>% 
    filter(grepl("4707", productcode)) 
# rawcn4707 %>% kable() 
rawcn4707$productdescription[rawcn4707$productcode==47073090]

# Validated table
valcn4707 <- tbl(con, "vld_comext_product") %>% 
    collect() %>% 
    filter(grepl("4707", productcode)) 
# valcn4707 %>% kable() 
valcn4707$productdescription[valcn4707$productcode==47073090]

Explore product codes

# In the tradeflows database, where most of the real data is located
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows") 


# Get a vector of products available in the database
# Products in the recent data
dtfr <- tbl(con, "raw_comext_monthly_201709") %>% 
    distinct(productcode) %>% collect() %>% 
    # Remove 2 digit level products, keep only 8 digit level
    filter(nchar(productcode)>2)

# Count the number of products under each 2 digit product code
dtfr %>% 
    mutate(productcode2d = substr(productcode,1,2)) %>% 
    count(productcode2d)

# Count the number of products under each 4 digit product code
dtfr %>% 
    mutate(productcode2d = substr(productcode,1,4)) %>% 
    count(productcode2d) %>% 
    arrange(desc(n))
# There are 71 product codes at the 4 digit level.

# The 4 digit code containing most products is 4407.
# Under 4407 there are 73 product codes at the 8 digit level.

# Count the number of products under each 6 digit product code
dtfr %>% 
    mutate(productcode2d = substr(productcode,1,6)) %>% 
    count(productcode2d) %>% 
    arrange(desc(n))
# There are 304 product codes at the 4 digit level.

RMariaDB::dbDisconnect(con)

Product codes IMM

See data_raw/produc_classifications.R to see where the IMM product classification is loaded. eutradeflows::classificationimm is part of the package.

Based on the number of product per group below, I'll make two drop down lists. One with productgroupimm and one with the productimm. The list containing productimm will be populated from the first one.

How many product codes per imm product group?

classificationimm %>% group_by(productgroupimm) %>% 
    tally() %>% arrange(desc(n))

How many productimm per imm product group?

classificationimm %>% 
    distinct(productgroupimm, productimm) %>% 
    group_by(productgroupimm) %>% tally()
# Checking if products under a productimm name share the same product code at the 4 digit level
cimm <- classificationimm %>% group_by(productimm) %>% 
    mutate(pc4 = substr(productcode,0,4)) %>% 
    summarise(n = n(),
              pc4 = paste(unique(pc4),collapse=", ")) %>% arrange(desc(n)) 

# How many productimm under 2 digit product code and productgroupimm?
classificationimm %>% 
    mutate(pc2 = substr(productcode,0,2)) %>% 
    distinct(pc2, productimm, productgroupimm) %>% 
    group_by(productgroupimm) %>% 
    summarise(n=n(),
              pc2 = paste(unique(pc2), collapse=", ")) %>%
    arrange(productgroupimm)

Are product codes unique in that table?

classificationimm %>% nrow()
classificationimm %>% distinct(productcode) %>% nrow()

Conclusions

cleancodes should first create the database structure. The cleancode function should check which columns are present in the validated table and select only these ones.



stix-global/eutradeflows documentation built on Nov. 13, 2020, 9:23 p.m.