For wood products, each trade flows should have an estimated volume in m3 For paper products, each trade flwos should have an estimated weight in kg
Each tradeflow goes through 2 filters: 1. Handling missing data + If the weight is available in kg. Estimate missing quantity from weight using a regional conversion factor. + If neither volume nor quantity is available, use a regional unit price to convert trade value to a quantity. 2. Handling out of bounds data + Check if the unit price is above lowerprice and below upperprice correct using medianprice if out of bounds
library(knitr) opts_knit$set(root.dir="../../..") # file paths are relative to the root of the project directory library(tradeflows) library(dplyr) library(RMySQL)
Load data from a file. (in the final system data will be loaded from a database) Calculate median unit price by region and by year. Calculate median conversion factor by year for the world. Median price calculation can be put in a function for further use.
load("data-raw/comtrade/440799.RData") dtfraw <- dtf %>% renamecolumns %>% removeduplicatedflows # Quantity not available but weight is available dtfraw %>% filter(is.na(quantity) & !is.na(weight)) %>% nrow # Quantity and weight not available dtfraw %>% filter(is.na(quantity) & is.na(weight)) %>% nrow dtf <- dtfraw %>% filter(flow %in% c("Import", "Export")) %>% # Remove EU28 reporter filter(!reporter %in%c("EU-28")) %>% # Remove EU-28 and World partner filter(!partner %in%c("EU-28", "World")) %>% # Add regionreporter and regionpartner merge(select(reportercomtrade, reportercode, regionreporter=region), all.x=TRUE) %>% merge(select(reportercomtrade, partnercode = reportercode, regionpartner=region), all.x=TRUE) %>% mutate(conversion = weight / quantity, price = tradevalue / quantity, # To avoid "integer overflow - use sum(as.numeric(.))" error # on sum of all values tradevalue = as.numeric(tradevalue), # To avoid Error in dtf$c(NA_integer_, : # invalid subscript type 'integer' quantity = as.numeric(quantity))
Check for unique flows
nrow(dtf) dtf %>% select(year, flow, reportercode, partnercode, productcode, classification) %>% unique %>%nrow message("There were duplicated lines for the following reporters:") dtf %>% findduplicatedflows %>% select(year, reporter) %>% unique %>% arrange(reporter) %>% kable
Check filter on flag below to see if comtrade estimates are included or not.
pricesregion <- dtf %>% filter(!is.na(quantity) & unit !="No Quantity") %>% # Exclude comtrade quantity estimates # filter(flag==0 |flag==4 ) %>% group_by(flow, regionreporter, year, unit) %>% summarise(lowerprice = round(0.5 * quantile(price, 0.25, names=FALSE, na.rm=TRUE)), medianprice = round(median(price, na.rm=TRUE)), upperprice = round(2 * quantile(price, 0.75, names=FALSE, na.rm=TRUE))) %>% arrange(-medianprice) conversionfactorworld <- dtf %>% filter(!is.na(quantity) & unit !="No Quantity") %>% group_by(flow, year, unit) %>% summarise(medianconversion = round(median(conversion,na.rm=TRUE)))
# Quantity based on the weight using a conversion factor dtf <- merge(dtf, conversionfactorworld, all.x=TRUE) %>% mutate(quantity_cf = weight / medianconversion) # Quantity based on the trade value, using a unit price dtf <- merge(dtf, pricesregion, all.x=TRUE) %>% mutate(quantity_up = tradevalue / medianprice, havequantity = !is.na(quantity)) summarychange <- function(dtf){ dtf %>% summarise(rows = n(), quantity = sum(quantity, na.rm=TRUE) , quantity_cf = sum(quantity_cf, na.rm=TRUE), cf_increase = round((quantity_cf-quantity)/ quantity *100), quantity_up = sum(quantity_up, na.rm=TRUE), up_increase = round((quantity_up-quantity)/ quantity *100)) } dtf %>% group_by(havequantity, year, flow) %>% summarychange %>% kable dtf %>% group_by(havequantity, flag, flow) %>% summarychange %>% kable dtf %>% group_by(flag) %>% summarychange %>% kable dtf %>% group_by(round(log(quantity)), flag) %>% summarychange %>% kable # Check if both tables contain the same total world trade flows print("Percent change in world trade flows ") wtf <- sum(dtf$quantity,na.rm=TRUE) dtf %>% #filter(!havequantity) %>% group_by(havequantity, year, flow, flag) %>% summarise(changeinworldtrade_cf = round((sum(quantity_cf,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3), changeinworldtrade_up = round((sum(quantity_up,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3)) %>% kable
Depending on whether quantity estimates have been removed or not, unit prices will be very different.
dtf2 <- dtfraw %>% addconversionfactorandprice %>% addregion # merge(select(reportercomtrade, # partnercode = reportercode, regionpartner=region)) # Check for duplicated flows nrow(findduplicatedflows(dtf2)) # Extract prices and conversion factors pricesregion2 <- dtf2 %>% extractprices(includeqestimates = TRUE) conversionfactorworld2 <- dtf2 %>% extractconversionfactors(includeqwestimates = TRUE) # Compare with other chunk summary(pricesregion$medianprice - pricesregion2$medianprice) summary(conversionfactorworld$medianconversion - conversionfactorworld2$medianconversion)
dtfq <- dtfraw %>% addconversionfactorandprice %>% addregion %>% estimatequantity(price=pricesregion2, conversionfactor=conversionfactorworld2, shaveprice=FALSE) nrow(findduplicatedflows(dtfq)) fr2 <- dtfq %>% filter(reporter =="France") #%>% removeduplicatedflows # Check flags fr <- estimatequantity(dtf, price=pricesregion2, conversionfactor=conversionfactorworld2) %>% filter(reporter=="France") fr2$partner[!fr2$partner %in% fr$partner] # World makes the difference dtfqn <- dtfq %>% group_by(reporter) %>% summarise(n=n()) %>% arrange(-n) dtfn <- dtf %>% group_by(reporter) %>% summarise(n=n())%>% arrange(-n) fr1 <- dtf %>% filter(reporter=="France")
dtf_p <- addpartnerflow(dtf)
dtfclean <- dtfraw %>% clean(shaveprice=TRUE) dtfdup <- dtfclean %>% findduplicatedflows dtfclean %>% addpartnerflow dtfclean %>% group_by(flag) %>% summarychange %>% kable
charcoal90 <- loadrawdata(440290) # Add missing fields as of January 2015 # should be fixed later charcoal90$flag <- 0 # Check that all required columns are there stopifnot(column_names$efi[column_names$raw_flow] %in% names(dtf)) missingcolumnsraw <- column_names$efi[column_names$raw_flow & !column_names$efi %in% names(dtf)] if (length(missingcolumnsraw)>0){ warning("Missing columns in the raw_flow table: \n", paste(missingcolumnsraw, collapse=", ")) } charcoal90 <- clean(charcoal90, deleteextracolumns = FALSE)
swd99 <- readdbproduct(440799, tableread = "raw_flow_yearly") %>% removeduplicatedflows %>% addconversionfactorandprice %>% addregion priceregion <- extractprices(swd99) conversionfactorworld <- extractconversionfactors(swd99) swd99 <- swd99 %>% estimatequantity(priceregion, conversionfactorworld, shaveprice = TRUE) %>% addpartnerflow names(swd99) deletedbproduct(440799, dbtable= "validated_flow_yearly") writedbproduct(swd99, dbtable = "validated_flow_yearly" )
swd99 <- readdbproduct(440799, dbtable = "raw_flow_yearly") rawnames <- names(swd99) swd99 <- clean(swd99) # Only 2 new columns because clean removed extracolumns names(swd99)[!names(swd99) %in% rawnames] writedbproduct(swd99, dbtable = "validated_flow_yearly" )
checkdbcolumns() cleandbproduct(440799, tableread = "raw_flow_yearly", tablewrite = "validated_flow_yearly") # Wrong writetable to test failure try(cleandbproduct(440799, tableread = "raw_flow_yearly", tablewrite = "validated_flow_yearly_wrong"))
setdatabaseconfig(message=FALSE) require(RMySQL) db <- getOption("tradeflowsDB") DBwrite <- dbConnect(MySQL(), user=db["user"], host=db["host"], password=db["password"], dbname=db["dbname"]) result <- dbWriteTable(DBwrite, name="validated_flow_yearly", value=data.frame(swd99), append=TRUE, row.names = FALSE) dbDisconnect(DBwrite) writedbpaul <-function(dtf, dbtable){ # Write only to a validated_flow table stopifnot(dbtable %in% c("validated_flow_yearly", "validated_flow_monthly")) setdatabaseconfig(message=FALSE) require(RMySQL) db <- getOption("tradeflowsDB") DBwrite <- dbConnect(MySQL(), user=db["user"], host=db["host"], password=db["password"], dbname=db["dbname"]) result <- dbWriteTable(DBwrite, name="validated_flow_yearly", value=data.frame(dtf), append=TRUE, row.names = FALSE) dbDisconnect(DBwrite) return(result) } writedbpaul(swd99, "validated_flow_yearly") writedbproduct <- function(dtf, dbtable){ # Write only to a validated_flow table stopifnot(dbtable %in% c("validated_flow_yearly", "validated_flow_monthly")) setdatabaseconfig(message=FALSE) require(RMySQL) db <- getOption("tradeflowsDB") DBwrite <- dbConnect(MySQL(), user=db["user"], host=db["host"], password=db["password"], dbname=db["dbname"]) dtf <- data.frame(dtf) result <- dbWriteTable(DBwrite, name=table, value=dtf, append=TRUE, row.names = FALSE) dbDisconnect(DBwrite) return(result) } writedbproduct(swd99, "validated_flow_yearly")
message("There were duplicated lines for the following reporters:") # For example this flow reported by Sudan is duplicated dtf %>% filter(reporter=="Fmr Sudan" & partner=="USA") %>% select(-productdescription) tfraw %>% renamecolumns %>% merge(select(reportercomtrade, partnercode = reportercode, regionpartner=region)) %>% duplicates %>% select(year, reporter, partner) %>% unique %>% arrange(reporter) %>% kable # What happens when we # merge(select(reportercomtrade, # partnercode = reportercode, regionpartner=region)) bli <- dtfraw %>% addconversionfactorandprice %>% merge(select(reportercomtrade, partnercode = reportercode, regionpartner=region)) %>% duplicates # For example this flow reported by France is duplicated dtf %>% filter(reporter=="France" & partner=="Fmr Sudan") %>% select(-productdescription) # partnercode reportercode classification year flowcode flow reporter reporteriso # 1 736 251 H2 2005 2 Export France FRA # 2 736 251 H2 2005 2 Export France FRA # partner partneriso productcode unitcode unit quantity weight # 1 Fmr Sudan SDN 440799 12 Volume in cubic meters 2 2000 # 2 Fmr Sudan SDN 440799 12 Volume in cubic meters 2 2000 # tradevalue flag regionreporter regionpartner conversion price # 1 2488 0 Europe Africa 1000 1244 # 2 2488 0 Europe Africa 1000 1244 # Issue with duplicated Sudan reportercode 729 and Fmr Sudan reportercode 729 # in the reportercomtrade table with causes duplicates in the merged tables reportercomtrade %>% filter(reportercode %in% c(736,729)) # Are there other duplicates in the reportercomtrade table? reportercomtrade %>% filter(duplicated(reportercode)) # reportercode reporter reporteriso reportercodefao regionfao subregion # 1 729 Sudan SDN 206 Africa Northern Africa # 2 736 Fmr Sudan SDN 206 Africa Northern Africa # reporternamefao region duplicate # 1 the Sudan Africa TRUE # 2 the Sudan Africa TRUE # This is due to a merge based on duplicated iso codes for Sudan filter(reportercomtrade, !is.na(reporteriso) & duplicated(reporteriso)) # reportercode reporter reporteriso reportercodefao regionfao subregion # 1 729 Sudan SDN 206 Africa Northern Africa # 2 736 Fmr Sudan SDN 276 Africa Northern Africa # 3 736 Fmr Sudan SDN 206 Africa Northern Africa # reporternamefao region # 1 the Sudan Africa # 2 the Sudan Africa # 3 the Sudan Africa
nrow(dtf) # [1] 47493 nrow(dtf2) # [1] 49082 # After removing duplicates nrow(dtf) # [1] 47351 nrow(dtf2) # [1] 48940
In 2004 volume was reporter in litres
dtf %>% filter( unit=="Volume in litres") %>% select(year) %>% unique
# look at an issue with the absence of median price and conversion factor in # flag 200 dtf200 <- dtfclean %>% filter(flag==200) dtf200 <- dtf200 %>% merge(conversionfactorworld, all.x=TRUE) # That was meainly because of this unit "No Quantity"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.