library(knitr) opts_knit$set(root.dir="../../..") # file paths are relative to the root of the project directory opts_chunk$set(echo=FALSE, warning=FALSE) library(tradeflows) library(dplyr) library(ggplot2) library(reshape2)
# Remove this chunk in the template file # load("data-raw/comtrade/440799.RData") # replace file by database source dtf <- readdbproduct(440799, "raw_flow_yearly") regionpartner <- reportercomtrade %>% select(reportercode = reportercode, regionpartner=region) swd99 <- dtf %>% # renamecolumns %>% filter(flow %in% c("Import", "Export")) %>% # Remove EU28 reporter filter(!reporter %in%c("EU-28")) %>% # Remove World partner filter(!partner %in%c("EU-28", "World")) %>% # keep region # but as of 23 December 2014, use regionreport and regionpartner instead merge(select(reportercomtrade, reportercode, region, subregion)) %>% merge(select(reportercomtrade, reportercode, regionreporter=region)) %>% merge(select(reportercomtrade, partnercode = reportercode, regionpartner=region)) %>% mutate(conversion = weight / quantity, price = tradevalue / quantity, # To avoid "integer overflow - use sum(as.numeric(.))" error # on sum of all values tradevalue = as.numeric(tradevalue), # To avoid Error in swd99$c(NA_integer_, : # invalid subscript type 'integer' quantity = as.numeric(quantity), # tradeflow within a region withinregion = regionreporter == regionpartner)
Not all quantities are available. which percentage of the trade value cannot be taken into account for the unit price calculation?
r nrow(swd99)
flows in the r as.character(unique(swd99$productcode))
dataset. r nrow(filter(swd99, is.na(tradevalue)))
don't have a trade valuer nrow(filter(swd99, is.na(weight)))
don't have a weightr nrow(filter(swd99, is.na(quantity)))
don't have a quantityswd99 %>% mutate(has_a_quantity = !is.na(quantity)) %>% group_by(flow, has_a_quantity) %>% summarise(number_of_flows = n(), total_tradevalue = sum(tradevalue, na.rm=TRUE)) %>% mutate(percentage_of_value = round(total_tradevalue / sum(total_tradevalue, na.rm=TRUE)*100)) %>% arrange(-has_a_quantity) %>% kable
swd99 %>% mutate(has_a_weight = !is.na(weight)) %>% group_by(flow, has_a_weight) %>% summarise(number_of_flows = n(), total_tradevalue = sum(tradevalue, na.rm=TRUE)) %>% mutate(percentage_of_value = round(total_tradevalue / sum(total_tradevalue, na.rm=TRUE)*100)) %>% arrange(-has_a_weight) %>% kable
Calculate prices for product code r as.character(unique(swd99$productcode))
.
Calculate median prices and an upper and lower bound. Upper price is the third quartile multiplied by 2, lower price is the first quartile divided by 2.
In a boxplot, the upper and lower "hinges" correspond to the first and third quartiles (the 25th and 75th percentiles).
swd99region <- swd99 %>% group_by(flow, region, year) %>% summarise(lowerprice = round(0.5 * quantile(price, 0.25, names=FALSE, na.rm=TRUE)), medianprice = round(median(price, na.rm=TRUE)), upperprice = round(2 * quantile(price, 0.75, names=FALSE, na.rm=TRUE))) %>% arrange(-medianprice) swd99region %>% dcast(region + flow ~ year, value.var="medianprice") %>% kable(caption="Median unit prices") ggplot(data=swd99region, aes(x=year, y=medianprice, color=flow)) + coord_cartesian(ylim=c(0,2500)) + geom_ribbon(aes(ymin=lowerprice, ymax=upperprice, fill=flow, alpha=0.05))+ geom_point(colour="grey50", size = 4) + geom_point(size=3) + ggtitle("Median prices by region, shaded area represents a zone between our limits on lowerprice and upperprice") + facet_wrap(region~flow) cat("\n\n") p <- ggplot(data=swd99, aes(x=factor(year), y=price)) + geom_boxplot(na.rm=TRUE) + xlab("") + facet_wrap(region~flow) p + coord_cartesian(ylim=c(0,2500)) + ggtitle( "440799 zoom plot (price<2500$)") cat("\n\n") p + scale_y_log10() + ggtitle( "440799 Log scale") # Do we need a separate calculation because of NA quantity # compare to Unit price of aggregated trade flows swd99regionunitprice <- swd99 %>% filter(!is.na(quantity)) %>% group_by(region, flow, year) %>% summarise(price = round(sum(tradevalue) / sum(quantity))) swd99regionunitprice %>% dcast(region + flow~year, value.var="price") %>% kable(caption="Unit prices of aggregated trade flows")
# Median prices swd99region2 <- swd99 %>% group_by(withinregion, flow, region, year, flag) %>% summarise(lowerprice = round(0.5 * quantile(price, 0.25, names=FALSE, na.rm=TRUE)), medianprice = round(median(price, na.rm=TRUE)), upperprice = round(2 * quantile(price, 0.75, names=FALSE, na.rm=TRUE))) %>% arrange(-medianprice) ggplot(swd99region2, aes(x=year, y=medianprice, color=withinregion)) + geom_point() + facet_grid(flow+flag~ region) + ylim(c(0,1000)) swd99region2 %>% dcast(withinregion + region + flow ~ year, value.var="medianprice") %>% kable(caption="Median unit prices")
# medianprice
# Table of median flow prices swd99subregion <- swd99 %>% group_by(flow,subregion, year) %>% summarise(lowerprice = round(0.5 * quantile(price, 0.25, names=FALSE, na.rm=TRUE)), medianprice = round(median(price, na.rm=TRUE)), upperprice = round(2 * quantile(price, 0.75, names=FALSE, na.rm=TRUE))) %>% arrange(-medianprice) kable(swd99subregion)
# Transform subregion to an ordered vector by import prices f = "Import" swd99imp <- swd99subregion %>% as.data.frame %>% # Otherwise grouping variables are used before arrange() filter(flow==f &year==2011) %>% arrange(-medianprice) # Add these lower, upper and median prices on the plot swd99subregionmelt <- swd99subregion %>% as.data.frame %>% #Prevent message cannot modify grouping variable filter(year==2011) %>% mutate(subregion = ordered(subregion, levels = swd99imp$subregion)) %>% melt(id=c("flow", "subregion", "year")) %>% mutate(variable = ordered(variable, levels=c("upperprice", "medianprice","lowerprice" ))) ggplot(data=swd99subregionmelt) + geom_point(aes(x = subregion, y = value, color = variable))+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + facet_grid(flow~.) p <- ggplot(data=swd99, aes(x=subregion, y=price)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + geom_point(data=swd99subregionmelt, aes(x = subregion, y = value, color = variable))+ facet_grid(flow~.) + xlab("") cat("\n\n") p + coord_cartesian(ylim=c(0,2500)) + ggtitle( "440799 zoom plot (price<2500$)") cat("\n\n") p + scale_y_log10() + ggtitle( "440799 Log scale")
summary(select(swd99, tradevalue, weight, quantity, price))
Slice the dataset in log and show the distribution in number of rows and in value along the trade value split in log.
round(log(max(swd99$tradevalue))) swd99logtv <- swd99 %>% mutate(logtradevalue = round(log(tradevalue))) %>% group_by(logtradevalue, region, withinregion) %>% summarise(tradevalue = sum(tradevalue), number_of_flows = n()) %>% melt(id=c("logtradevalue", "region", "withinregion")) p <- ggplot(swd99logtv, aes(x=logtradevalue, y=value)) + geom_bar(stat="identity") + ylab("Number of flows | Trade value in $") p + facet_grid(variable~., scales="free_y") p + facet_grid(variable~region, scales="free_y") p + facet_grid(variable+withinregion~region, scales="free_y") + ggtitle("Trade within the region (TRUE) or with a partner outside the region (FALSE)")
round(log(max(swd99$quantity))) swd99logq <- swd99 %>% mutate(logquantity = round(log(quantity))) %>% group_by(logquantity, region) %>% summarise(quantity = sum(quantity), number_of_flows = n()) %>% melt(id=c("logquantity", "region")) p <- ggplot(filter(swd99logq, !is.na(logquantity)&logquantity!=-Inf), aes(x=logquantity, y=value)) + geom_bar(stat="identity") + ylab("Number of flows | Quantity in m3") p + facet_grid(variable~., scales="free_y") p + facet_grid(variable~region, scales="free_y")
swd99logw <- swd99 %>% mutate(logweight = round(log(weight))) %>% group_by(logweight, region) %>% summarise(weight = sum(weight), number_of_flows = n()) %>% melt(id=c("logweight", "region")) p <- ggplot(filter(swd99logw, !is.na(logweight)&logweight!=-Inf), aes(x=logweight, y=value)) + geom_bar(stat="identity") + ylab("Number of flows | weight in kg") p + facet_grid(variable~., scales="free_y") p + facet_grid(variable~region, scales="free_y")
r min(swd99$year)
and r max(swd99$year)
p <- ggplot(swd99, aes(x=price)) + geom_histogram(binwidth=100) + xlim(0,2500) + ylab("Number of trade flows") p + facet_grid(flow~.) + ggtitle("All world flows for 440799") p + facet_grid(flow~withinregion+flag) + ggtitle("All world flows for 440799") # p + facet_grid(region~flow) + ggtitle("World flows for 440799 by regions") # p + facet_grid(year~flow) + ggtitle("World flows for 440799 by year") p + facet_grid(flow + year ~flag) + ggtitle("World flows for 440799 by flag") p + facet_grid(region + flow ~ year) + ggtitle("Number of flows for 440799 by region and year vertical red line represents median price") + geom_vline(data=swd99region, aes(xintercept = medianprice), color="red")
# Calculate total trade value by region, year and show total trade value p <- swd99 %>% mutate(pricegroup = round(price/100) * 100) %>% group_by(region, flow, year, pricegroup) %>% summarise(tradevalue = sum(tradevalue)) %>% filter(!is.na(pricegroup)&pricegroup!=Inf) %>% ggplot(aes(x=pricegroup,y=tradevalue)) + geom_bar(stat="identity") + ylab("Total trade value in $") + xlim(0,2500) + xlab("Price in $") cat("\n\n") p + facet_grid(flow~.) + ggtitle("All world flows for 440799") #+ aes(fill=region) cat("\n\n") p + ggtitle("Total trade value for 440799 by region and year Vertical red line represents median unit prices (by number of flows for each region), Vertical green line represents aggregate unit prices for each region") + geom_vline(data=swd99region, aes(xintercept = medianprice), colour="red") + geom_vline(data=swd99regionunitprice, aes(xintercept = price), colour="green") + facet_grid(region + flow ~ year) # density plot # ggplot(swd99, aes(price, fill = region)) + # geom_density(alpha = 0.2) + xlim(0, 2500) + facet_wrap(~region) ####################################################################### # # Different way of showing total trade value, using ggplot weight aesthetic ###################################################################### # p <- ggplot(swd99, aes(x=price, weight=tradevalue)) + geom_histogram(binwidth=100) + xlim(0,2500) + ylab("Total trade value in $") cat("\n\n") p + facet_grid(flow~.) + ggtitle("All world flows for 440799, using price as a weight aesthetic") cat("\n\n") p + facet_grid(region + flow ~ year) + ggtitle("Total trade value for 440799 by region and year vertical red line represents median price") + geom_vline(data=swd99region, aes(xintercept = medianprice), color="red")
Vertical lines, represent large quantities which are at the same prices.
if (FALSE){ # Visualise raw prices swd99r <- readdbproduct(440799,"raw_flow_yearly") swd99 <- swd99r %>% addconversionfactorandprice() swd99 <- swd99 %>% filter(flag == 0) # Visualise validated prices swd99c <- swd99r %>% clean() swd99 <- swd99c %>% addconversionfactorandprice() # Check dtfinbound in the shaveprice function a lot of rawprices are out of range there # Due to the fact that the regional bounds on prices could not be calculated. } swd99$logprice <- round(log(swd99$price)) # Remove infinite prices cat("\n\n") p <- ggplot(filter(swd99, price!=Inf), aes(x = tradevalue, y = price)) + geom_point(alpha = 1/5) + coord_cartesian(ylim =c(0,2000)) + ggtitle("Variation of prices along the tradevalue") p + scale_x_log10(limits=c(1000, NA)) + xlab("Trade value $ (log scale)") p + scale_x_log10(limits=c(1000, NA)) + xlab("Trade value $ (log scale)") + facet_wrap(~flow) + aes(color=as.factor(flag)) summary(swd99[c("tradevalue","price")]) # Zoom on those price levels between first and third quartile firstquartile <- round(quantile(swd99$price ,probs=0.25, na.rm=TRUE)) thirdquartile <- round(quantile(swd99$price ,probs=0.75, na.rm=TRUE)) p + coord_cartesian(ylim=c(firstquartile,thirdquartile)) + # + xlim(c(0,1e6)) + scale_x_log10(limits=c(100, NA)) + xlab("Trade value $ (log scale)") + ggtitle(paste("Variation of prices along the trade value for price between the first and third quartile", firstquartile, "$ and", thirdquartile, "$")) + facet_grid(flow + region ~ year) #+ aes(color=year) # Add a line for median prices on this graph # Median price and total volume by region swd99 %>% group_by(flow, region) %>% summarise(tradevalue = sum(tradevalue, na.rm=TRUE), price = median(price, na.rm=TRUE)) %>% ggplot() + aes(x = tradevalue, y = price, label=region) + geom_text() + facet_wrap(~flow) + ggtitle("Median prices and total trade value by regions") # Median price and volume by subregion swd99 %>% group_by(flow, subregion) %>% summarise(tradevalue = median(tradevalue, na.rm=TRUE), price = median(price, na.rm=TRUE)) %>% ggplot() + aes(x = tradevalue, y = price, label=subregion) + geom_text() + facet_wrap(~flow)
swd99 %>% group_by(year, region, flow, flag) %>% summarise(medianprice = round(median(price, na.rm=TRUE))) %>% dcast(year + region + flow ~ flag, value.var="medianprice" )
swd99re <- dtf %>% renamecolumns %>% filter(flow %in% c("Re-Import", "Re-Export")) %>% merge(select(reportercomtrade, reportercode, region, subregion)) %>% mutate(conversion = weight / quantity, price = tradevalue / quantity, # To avoid "integer overflow - use sum(as.numeric(.))" error # on sum of all values tradevalue = as.numeric(tradevalue)) # Remove infinite prices p <- ggplot(filter(swd99re, price!=Inf), aes(x = tradevalue, y = price)) + geom_point(alpha = 1/10) + coord_cartesian(ylim =c(0,2000)) + ggtitle("Variation of prices along the tradevalue") p + scale_x_log10(limits=c(1000, NA)) + xlab("Trade value $ (log scale)") p + scale_x_log10(limits=c(1000, NA)) + xlab("Trade value $ (log scale)") + facet_wrap(~flow) + aes(color=year)
The slope of those graphs represents unit prices.
p <- ggplot(mutate(swd99, splitquantity=round(log(quantity)))) + aes(x=quantity, y=tradevalue) + geom_point(alpha = 1/10) cat("\n\n") p + scale_x_log10() + scale_y_log10() + ggtitle("Trade value and quantity (log scale)") cat("\n\n") p + xlim(c(0,2e3)) + ylim(c(0,1e6)) + ggtitle("Trade values below 1e6$, lines represent the median unit price by year and region") + facet_grid(flow+region~year) + # " blue line corresponds to a price of 400$/m3, # red line to a price of 800$/m3" # geom_abline(intercept = 20, slope= 400 , colour = "blue", size = 2) + # geom_abline(intercept = 20, slope= 800 , colour = "red", size = 2) + # geom_smooth(aes(group=splitquantity), method="lm") + geom_abline(data=(swd99region), aes(intercept=0, slope=medianprice))
# Very poor fit and irrelevant slope for the whole dataset mtvq <- lm(tradevalue ~ quantity, swd99) summary(mtvq) # Model with log # mtvqlog <- lm(log(tradevalue)~log(quantity), swd99, na.action=na.exclude) # For quantities above 500 the slope is a realistic price mtvq <- lm(tradevalue ~ quantity, filter(swd99, quantity>500& quantity<1000)) summary(mtvq)
p <- ggplot(swd99) + aes(x=weight, y=tradevalue) + geom_point(alpha = 1/10) #p + ggtitle("Trade value and weight") # p + xlim(c(0,1e6)) + ylim(c(0,1e6)) + ggtitle("Values below 1e6") + # facet_grid(flow ~ year) + stat_smooth(method="lm") p + scale_x_log10() + scale_y_log10() + ggtitle("Trade value and weight (log scale)")
See also the other file on conversion factors.
The slope of those graph represents conversion factors.
p <- ggplot(swd99) + aes(x=quantity, y=weight) + geom_point(alpha = 1/10) # p + ggtitle("Trade value and weight") cat("\n\n") p + xlim(c(0,1e3)) + ylim(c(0,1e6)) + ggtitle("Values below 1e6") + facet_wrap(~flow) cat("\n\n") p + scale_x_log10() + scale_y_log10() + aes(color=region) + ggtitle("Weight and quantity (log scale)") + facet_wrap(~region)
r nrow(filter(swd99, is.na(quantity)))
flows using unit pricesUse regional median prices to calculate quantities. Increase in global trade import and export due to the estimate for all flows
# Use regional median prices swd99regioncorrected <- merge(swd99, swd99region, all.x=TRUE) %>% mutate(quantity2 = tradevalue / medianprice, havequantity = !is.na(quantity)) swd99regioncorrected %>% group_by(havequantity,year, flow) %>% summarise(rows = n(), volume = sum(quantity, na.rm=TRUE) , estimatedvolume = sum(quantity2, na.rm=TRUE), increase = estimatedvolume - volume, percent = round(increase / volume*100)) %>% kable # Check if both tables contain the same total world trade flows stopifnot(sum(swd99$quantity,na.rm=TRUE) == sum(swd99regioncorrected$quantity, na.rm=TRUE)) print("Percent change in world trade flows ") wtf <- sum(swd99$quantity,na.rm=TRUE) swd99regioncorrected %>% filter(!havequantity) %>% group_by(havequantity, year, flow) %>% summarise(percentchangeinworldtrade = round((sum(quantity2,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3))
# Use sub-regional median prices swd99subregioncorrected <- merge(swd99, swd99subregion) %>% mutate(quantity3 = tradevalue / medianprice, havequantity = !is.na(quantity)) %>% group_by(havequantity, year, flow) %>% summarise(rows = n(), volume = sum(quantity, na.rm=TRUE) , estimatedvolume = sum(quantity3, na.rm=TRUE), increase = estimatedvolume - volume, percent = round(increase / volume*100)) swd99subregioncorrected %>% kable print("Total world trade increase") # Check if both tables contain the same total world trade flows stopifnot(sum(swd99$quantity,na.rm=TRUE) == sum(swd99subregioncorrected$volume)) print("Percent change in world trade flows ") swd99subregioncorrected %>% filter(!havequantity) %>% group_by(year, flow) %>% summarise(percentchangeinworldtrade = round(sum(increase)/sum(swd99$quantity,na.rm=TRUE)*100,3))
swd99regioncorrected %>% filter(price > upperprice ) %>% group_by(year, flow) %>% summarise(percentchangeinworldtrade = round((sum(quantity2,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3)) %>% kable
Double the threshold for upperprice
swd99regioncorrected %>% filter(price > 2*upperprice ) %>% group_by(year, flow) %>% summarise(percentchangeinworldtrade = round((sum(quantity2,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3)) %>% kable
swd99regioncorrected %>% filter(price < lowerprice ) %>% group_by(year, flow) %>% summarise(percentchangeinworldtrade = round((sum(quantity2,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3)) %>% kable
Half the threshold for lower price
swd99regioncorrected %>% filter(price < lowerprice/2 ) %>% group_by(year, flow) %>% summarise(percentchangeinworldtrade = round((sum(quantity2,na.rm=TRUE) - sum(quantity,na.rm=TRUE)) / wtf*100,3)) %>% kable
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.