Purpose

For wood products, each trade flows should have an estimated volume in m3 For paper products, each trade flwos should have an estimated weight in kg

Each tradeflow goes through 2 filters: 1. Handling missing data + If the weight is available in kg. Estimate missing quantity from weight using a regional conversion factor. + If neither volume nor quantity is available, use a regional unit price to convert trade value to a quantity. 2. Handling out of bounds data + Check if the unit price is above lowerprice and below upperprice correct using medianprice if out of bounds

library(knitr)
opts_knit$set(root.dir="../../..") # file paths are relative to the root of the project directory
library(tradeflows)
library(dplyr)
library(RMySQL)

Calculate without pre-made functions

Load data from a file. (in the final system data will be loaded from a database) Calculate median unit price by region and by year. Calculate median conversion factor by year for the world. Median price calculation can be put in a function for further use.

load("data-raw/comtrade/440799.RData")
dtfraw <- dtf %>% renamecolumns %>% removeduplicatedflows

# Quantity not available but weight is available
dtfraw %>% filter(is.na(quantity) & !is.na(weight)) %>% nrow
# Quantity and weight not available
dtfraw %>% filter(is.na(quantity) & is.na(weight)) %>% nrow

dtf <- dtfraw %>% 
    filter(flow %in% c("Import", "Export")) %>%
    # Remove EU28 reporter
    filter(!reporter %in%c("EU-28")) %>%
    # Remove EU-28 and World partner
    filter(!partner %in%c("EU-28", "World")) %>%
    # Add regionreporter and regionpartner
    merge(select(reportercomtrade, reportercode, regionreporter=region), 
          all.x=TRUE) %>%
    merge(select(reportercomtrade, 
                 partnercode = reportercode, regionpartner=region),
          all.x=TRUE) %>%
    mutate(conversion = weight / quantity,
           price = tradevalue / quantity,
           # To avoid "integer overflow - use sum(as.numeric(.))" error 
           # on sum of all values
           tradevalue = as.numeric(tradevalue),
           # To avoid Error in dtf$c(NA_integer_,  : 
           # invalid subscript type 'integer'
           quantity = as.numeric(quantity))

Check for unique flows

nrow(dtf)
dtf %>% select(year, flow, reportercode, partnercode, 
               productcode, classification) %>% unique %>%nrow
message("There were duplicated lines for the following reporters:")
dtf %>% findduplicatedflows %>% select(year, reporter) %>% unique %>%
    arrange(reporter) %>% kable

Prepare Unit prices and conversion factors

Include (or not) comtrade estimates of quantity from the unit price calculation

Check filter on flag below to see if comtrade estimates are included or not.

pricesregion <- dtf %>%
    filter(!is.na(quantity) & unit !="No Quantity") %>%
    # Exclude comtrade quantity estimates
#     filter(flag==0 |flag==4 ) %>%
    group_by(flow, regionreporter, year, unit) %>%
    summarise(lowerprice = round(0.5 * quantile(price, 0.25, 
                                                names=FALSE, na.rm=TRUE)),
              medianprice = round(median(price, na.rm=TRUE)),
              upperprice = round(2 * quantile(price, 0.75,
                                              names=FALSE, na.rm=TRUE))) %>%
    arrange(-medianprice)

conversionfactorworld <- dtf %>%
    filter(!is.na(quantity) & unit !="No Quantity") %>%
    group_by(flow, year, unit) %>%
    summarise(medianconversion = round(median(conversion,na.rm=TRUE)))

Handling missing quantity data

# Quantity based on the weight using a conversion factor
dtf <- merge(dtf, conversionfactorworld, all.x=TRUE) %>%
    mutate(quantity_cf = weight / medianconversion)
# Quantity based on the trade value, using a unit price
dtf <- merge(dtf, pricesregion, all.x=TRUE) %>%
    mutate(quantity_up = tradevalue / medianprice,
           havequantity = !is.na(quantity)) 

summarychange  <- function(dtf){
    dtf %>% summarise(rows = n(),
                      quantity = sum(quantity, na.rm=TRUE) ,
                      quantity_cf = sum(quantity_cf, na.rm=TRUE),
                      cf_increase = round((quantity_cf-quantity)/ quantity *100),
                      quantity_up = sum(quantity_up, na.rm=TRUE),
                      up_increase = round((quantity_up-quantity)/ quantity *100))
    }

dtf %>% group_by(havequantity, year, flow) %>% summarychange %>% kable
dtf %>% group_by(havequantity, flag, flow) %>% summarychange %>% kable
dtf %>% group_by(flag) %>% summarychange %>% kable
dtf %>% group_by(round(log(quantity)), flag) %>% summarychange %>% kable

# Check if both tables contain the same total world trade flows


print("Percent change in world trade flows ")
wtf <- sum(dtf$quantity,na.rm=TRUE)
dtf %>% 
    #filter(!havequantity) %>% 
    group_by(havequantity, year, flow, flag) %>%
    summarise(changeinworldtrade_cf = round((sum(quantity_cf,na.rm=TRUE) - 
                             sum(quantity,na.rm=TRUE)) / wtf*100,3),
              changeinworldtrade_up = 
                  round((sum(quantity_up,na.rm=TRUE) - 
                             sum(quantity,na.rm=TRUE)) / 
                            wtf*100,3))  %>%
    kable

Handling price bounds


Handling partner information


Calculate with functions from clean.R

Compare unit prices and conversion factor tables

Depending on whether quantity estimates have been removed or not, unit prices will be very different.

dtf2 <- dtfraw %>% 
    addconversionfactorandprice %>%
    addregion
#     merge(select(reportercomtrade, 
#                  partnercode = reportercode, regionpartner=region))    
# Check for duplicated flows
nrow(findduplicatedflows(dtf2))
# Extract prices and conversion factors
pricesregion2 <- dtf2 %>% extractprices(includeqestimates = TRUE)
conversionfactorworld2 <- dtf2 %>% 
    extractconversionfactors(includeqwestimates = TRUE)

# Compare with other chunk
summary(pricesregion$medianprice - pricesregion2$medianprice)
summary(conversionfactorworld$medianconversion -
            conversionfactorworld2$medianconversion)

Handling missing quantity data and price bounds

dtfq <- dtfraw %>% 
    addconversionfactorandprice %>% 
    addregion %>%
    estimatequantity(price=pricesregion2, 
                     conversionfactor=conversionfactorworld2, shaveprice=FALSE)
nrow(findduplicatedflows(dtfq))

fr2 <- dtfq %>% filter(reporter =="France") #%>% removeduplicatedflows
# Check flags
fr <- estimatequantity(dtf, price=pricesregion2, 
                       conversionfactor=conversionfactorworld2) %>%
    filter(reporter=="France") 
fr2$partner[!fr2$partner %in% fr$partner]
# World makes the difference
dtfqn <- dtfq %>% group_by(reporter) %>% summarise(n=n()) %>% arrange(-n)
dtfn <- dtf %>% group_by(reporter) %>% summarise(n=n())%>% arrange(-n)
fr1 <- dtf %>% filter(reporter=="France")

Handling partner information

dtf_p <- addpartnerflow(dtf)

Main clean function

dtfclean <- dtfraw %>% 
    clean(shaveprice=TRUE) 
dtfdup <- dtfclean %>% findduplicatedflows
dtfclean %>% addpartnerflow
dtfclean %>% group_by(flag) %>% summarychange %>% kable

Clean from the database

Read from the database, clean, write to the database

charcoal90 <- loadrawdata(440290)
# Add missing fields as of January 2015 # should be fixed later
charcoal90$flag <- 0
# Check that all required columns are there
stopifnot(column_names$efi[column_names$raw_flow] %in% names(dtf))
missingcolumnsraw <- column_names$efi[column_names$raw_flow &
                                          !column_names$efi %in% names(dtf)]
if (length(missingcolumnsraw)>0){
    warning("Missing columns in the raw_flow table: \n",
            paste(missingcolumnsraw, collapse=", "))
}
charcoal90 <- clean(charcoal90, deleteextracolumns = FALSE)

Clean in detail again using all functions

swd99 <- readdbproduct(440799, 
                       tableread = "raw_flow_yearly") %>% 
    removeduplicatedflows %>%
    addconversionfactorandprice %>%
    addregion
priceregion <- extractprices(swd99)
conversionfactorworld <- extractconversionfactors(swd99)
swd99 <- swd99 %>%
        estimatequantity(priceregion, conversionfactorworld,
                         shaveprice = TRUE) %>%
        addpartnerflow
names(swd99)
deletedbproduct(440799, dbtable= "validated_flow_yearly")
writedbproduct(swd99, dbtable = "validated_flow_yearly" )

Clean using the main clean function

swd99 <- readdbproduct(440799, dbtable = "raw_flow_yearly")
rawnames <- names(swd99)
swd99 <- clean(swd99)
# Only 2 new columns because clean removed extracolumns
names(swd99)[!names(swd99) %in% rawnames]
writedbproduct(swd99, dbtable = "validated_flow_yearly" )

Cleandb

checkdbcolumns()
cleandbproduct(440799, tableread =  "raw_flow_yearly", tablewrite = "validated_flow_yearly")

# Wrong writetable to test failure
try(cleandbproduct(440799, tableread =  "raw_flow_yearly", tablewrite = "validated_flow_yearly_wrong"))

Other experiments with dbwrite

setdatabaseconfig(message=FALSE)
require(RMySQL)
db <- getOption("tradeflowsDB")
DBwrite <- dbConnect(MySQL(), user=db["user"], host=db["host"],
                     password=db["password"], dbname=db["dbname"])
result <- dbWriteTable(DBwrite, name="validated_flow_yearly",
                       value=data.frame(swd99), append=TRUE, row.names = FALSE)
dbDisconnect(DBwrite)

writedbpaul <-function(dtf, dbtable){
    # Write only to a validated_flow table
    stopifnot(dbtable %in% c("validated_flow_yearly", "validated_flow_monthly"))
    setdatabaseconfig(message=FALSE)
    require(RMySQL)
    db <- getOption("tradeflowsDB")
    DBwrite <- dbConnect(MySQL(), user=db["user"], host=db["host"],
                         password=db["password"], dbname=db["dbname"])
    result <- dbWriteTable(DBwrite, name="validated_flow_yearly",
                           value=data.frame(dtf), append=TRUE, row.names = FALSE)
    dbDisconnect(DBwrite)
    return(result)
    }
writedbpaul(swd99, "validated_flow_yearly")


writedbproduct <- function(dtf, dbtable){
    # Write only to a validated_flow table
    stopifnot(dbtable %in% c("validated_flow_yearly", "validated_flow_monthly"))
    setdatabaseconfig(message=FALSE)
    require(RMySQL)
    db <- getOption("tradeflowsDB")
    DBwrite <- dbConnect(MySQL(), user=db["user"], host=db["host"],
                         password=db["password"], dbname=db["dbname"])
    dtf <- data.frame(dtf)
    result <- dbWriteTable(DBwrite, name=table,
                           value=dtf, append=TRUE, row.names = FALSE)
    dbDisconnect(DBwrite)
    return(result)
}
writedbproduct(swd99, "validated_flow_yearly")

Misceleanous remarks

Issue with duplicated Sudan in reportercomtrade

message("There were duplicated lines for the following reporters:")
# For example this flow reported by Sudan is duplicated
dtf %>% filter(reporter=="Fmr Sudan" & partner=="USA") %>% select(-productdescription)

tfraw %>% renamecolumns %>%
      merge(select(reportercomtrade, 
                 partnercode = reportercode, regionpartner=region)) %>%
    duplicates %>% select(year, reporter, partner) %>% unique %>%
    arrange(reporter) %>% kable

# What happens when we 
#       merge(select(reportercomtrade, 
#                  partnercode = reportercode, regionpartner=region)) 
bli <- dtfraw %>% addconversionfactorandprice %>%
      merge(select(reportercomtrade, 
                 partnercode = reportercode, regionpartner=region)) %>%
    duplicates
# For example this flow reported by France is duplicated
dtf %>% filter(reporter=="France" & partner=="Fmr Sudan") %>% select(-productdescription)
#  partnercode reportercode classification year flowcode   flow reporter reporteriso
# 1         736          251             H2 2005        2 Export   France         FRA
# 2         736          251             H2 2005        2 Export   France         FRA
#     partner partneriso productcode unitcode                   unit quantity weight
# 1 Fmr Sudan        SDN      440799       12 Volume in cubic meters        2   2000
# 2 Fmr Sudan        SDN      440799       12 Volume in cubic meters        2   2000
#   tradevalue flag regionreporter regionpartner conversion price
# 1       2488    0         Europe        Africa       1000  1244
# 2       2488    0         Europe        Africa       1000  1244

# Issue with duplicated Sudan reportercode 729 and Fmr Sudan reportercode 729
# in the reportercomtrade table with causes duplicates in the merged tables
reportercomtrade %>% filter(reportercode %in% c(736,729))

# Are there other duplicates in the reportercomtrade table?
reportercomtrade %>% filter(duplicated(reportercode))  
#  reportercode  reporter reporteriso reportercodefao regionfao       subregion
# 1          729     Sudan         SDN             206    Africa Northern Africa
# 2          736 Fmr Sudan         SDN             206    Africa Northern Africa
#   reporternamefao region duplicate
# 1       the Sudan Africa      TRUE
# 2       the Sudan Africa      TRUE

# This is due to a merge based on duplicated iso codes for Sudan
filter(reportercomtrade, !is.na(reporteriso) & duplicated(reporteriso))
#   reportercode  reporter reporteriso reportercodefao regionfao       subregion
# 1          729     Sudan         SDN             206    Africa Northern Africa
# 2          736 Fmr Sudan         SDN             276    Africa Northern Africa
# 3          736 Fmr Sudan         SDN             206    Africa Northern Africa
#   reporternamefao region
# 1       the Sudan Africa
# 2       the Sudan Africa
# 3       the Sudan Africa
nrow(dtf)
# [1] 47493
nrow(dtf2)
# [1] 49082
# After removing duplicates
 nrow(dtf)
# [1] 47351
nrow(dtf2)
# [1] 48940

Units

In 2004 volume was reporter in litres

dtf %>% filter( unit=="Volume in litres") %>% select(year) %>% unique

Unit No quantity creates problem while merging price and conversion factor tables

# look at an issue with the absence of median price and conversion factor in 
# flag 200
dtf200 <- dtfclean %>% filter(flag==200)
dtf200 <- dtf200 %>% merge(conversionfactorworld, all.x=TRUE)
# That was meainly because of this unit "No Quantity"


EuropeanForestInstitute/tradeflows documentation built on Oct. 7, 2019, 10:57 a.m.