# Generate this document from a R command line rmarkdown::render("~/R/eutradeflows/docs/pricedistribution.Rmd") # # Issues #### message("Issues:") message("Add price bounds to the plots") message("Generate density plots which include the price weighted by trade value, instead of by number of flows, according to https://stackoverflow.com/a/12625991/2641825 it's possible to pass a `weights` aesthetic to geom_density() Acording to https://stats.stackexchange.com/a/22245/68318 The vector of weights should sum to one for the density to be a true density.")
library(knitr) opts_knit$set(root.dir="..") # file paths are relative to the root of the project directory opts_chunk$set(fig.width = 10, message=FALSE, warning=FALSE) library(tradeflows) library(dplyr) library(tidyr) library(ggplot2)
Connect to the database.
con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows")
This document displays price distribution for yearly and monthly Comext data. Various plots explore the variation of the price distribution among products and through time. The data is extracted from the database and stored into 2 data frames:
The first plots, Figure 1 and 2 are histograms. For a given price range, the height of the bar represents the number of flows occuring at that price range.
Figure 3 and onwards are density plots. For a given price range, the area under the curve represents the probability that a given flow has a price withing the range. Because most of the density is less than 1, the curve has to rise higher than 1 in some points in order to have a total area of 1 as required for all probability distributions.
products <- tbl(con, "vld_comext_product") %>% filter(productcode %like% "4407%") %>% # explain() %>% collect() # Monthly archive wpm <- tbl(con, "raw_comext_monthly_2016S1") %>% filter(productcode %in% products$productcode) %>% collect() # Monthly recent # Yearly archive wpy <- tbl(con, "raw_comext_yearly_2016S2") %>% filter(productcode %in% products$productcode) %>% collect() # Add prices wpm <- wpm %>% tradeflows::addconversionfactorandprice() wpy <- wpy %>% tradeflows::addconversionfactorandprice() # Bind the monthly and yearly dataframes together # Specify monthly and yearly timeframe wpm$timeframe <- "monthly" wpy$timeframe <- "yearly" wp <- rbind(wpm, wpy) %>% mutate(year = substr(period, 1, 4))
Number of rows and total trade value
# Number of rows in millions nrow(wp)/1e6 # Total trade value in million € sum(wp$tradevalue, na.rm=TRUE) /1e6 # Time frame unique(wp$year) # Trade value per year in million € wp %>% group_by(year, flowcode, timeframe) %>% summarise(tradevalue = round(sum(tradevalue, na.rm=TRUE)/1e6)) %>% left_join(data_frame(flow = c("import", "export"), flowcode = 1:2)) %>% unite(time_flow, timeframe, flow) %>% ungroup() %>% select(-flowcode) %>% spread(time_flow, tradevalue)
Descriptive statistics of tradevalue, quantity and weight for monthly and yearly data:
summary(wpm[c("tradevalue", "quantity", "weight")]) summary(wpy[c("tradevalue", "quantity", "weight")])
Descriptive statistics of prices and convertion factor for monthly and yearly data:
summary(wpm[c("price", "pricew", "conversion")]) summary(wpy[c("price", "pricew", "conversion")])
\pagebreak
# Interleaved histograms ggplot(wp, aes(x=price, fill=timeframe)) + geom_histogram(binwidth=.1, position="dodge") + xlim(c(0,10)) ggplot(wp, aes(x=price, fill=timeframe)) + geom_histogram(binwidth=.01, position="dodge") + xlim(c(0,1))
ggplot(wp, aes(x=price, fill=timeframe)) + geom_density(alpha=.3) + xlim(c(0,10)) # Same plot scaled between 0 and 1 ggplot(wp, aes(x=price, fill=timeframe)) + geom_density(alpha=.3) + xlim(c(0,1))
ggplot(filter(wp, !is.na(tradevalue)), aes(x=price, fill=timeframe, weight = tradevalue/sum(tradevalue))) + geom_density(alpha=.3) + scale_x_continuous(breaks = c(0.5,0:10), limits = c(0,10)) # Same plot scaled between 0 and 1 ggplot(filter(wp, !is.na(tradevalue)), aes(x=price, fill=timeframe, weight = tradevalue/sum(tradevalue))) + geom_density(alpha=.3) + scale_x_continuous(breaks = 0:10/10, limits = c(0,1))
ggplot(wpm, aes(x = log(price))) + geom_histogram(binwidth = 0.1) ggplot(wp, aes(x = log(price), fill = timeframe)) + geom_density(alpha=.3) + xlim(c(-10,5))
wp$productcode6d <- substr(wp$productcode, 1,6) ggplot(wp, aes(x=price, fill=timeframe)) + geom_density(alpha=.3) + xlim(c(0,1)) + facet_wrap(~productcode6d, scales = "free_y")
Same plot weighted by the trade value
wp %>% filter(!is.na(tradevalue)) %>% group_by(productcode6d) %>% mutate(sumtradevalue = sum(tradevalue)) %>% # summarise(sum(tradevalue)/unique(sumtradevalue)) # are they all 1? ggplot(aes(x=price, fill=timeframe, weight = tradevalue/sumtradevalue)) + geom_density(alpha=.3) + xlim(c(0,1)) + facet_wrap(~productcode6d, scales = "free_y")
Plot generated externally Note: some products only have yearly data, no monthly data. Is this due to missing quantity information? Or are the monthly flows missing completely for these products?
ggplot(wp, aes(x=price, fill=timeframe)) + geom_density(alpha=.3) + xlim(c(0,1)) + facet_wrap(~productcode, scales = "free_y")
Yearly distribution by product
# Monthly distribution by year by product # Yearly distribution by year by product
Large plot generated externaly
wp$productcode6d <- substr(wp$productcode, 1,6) ggplot(wp, aes(x=price, fill=timeframe)) + geom_density(alpha=.3) + xlim(c(0,1)) + facet_grid(year~productcode6d, scales = "free_y")
library(ggridges) # Create a factor of years wpm$year <- wpm$period %/% 100 wpm$year <- factor(wpm$year, levels = rev(unique(wpm$year))) ggplot(wpm, aes(x = price, y = year)) + geom_density_ridges(scale = 1, alpha = .5) + xlim(c(0,1)) + theme_ridges()
Add price bounds to this plot.
# Add product code at the 6 digit level wpm$productcode6d <- substr(wpm$productcode, 1, 6) ggplot(wpm, aes(x = price, y = year)) + geom_density_ridges(scale = 1, alpha = .5) + xlim(c(0,1)) + theme_ridges() + facet_wrap(~productcode6d)
Products under 440799 only
wpm %>% filter(productcode6d == "440799") %>% ggplot(aes(x = price, y = year)) + geom_density_ridges(scale = 1, alpha = .5) + xlim(c(0,1)) + theme_ridges() + facet_wrap(~productcode)
\pagebreak
ggplot(wpm, aes(x = pricew, y = year)) + geom_density_ridges(scale = 1, alpha = .5, rel_min_height = 0.01) + # Cut trailing tail xlim(c(0,10)) + theme_ridges() + facet_wrap(~productcode6d)
Show red bounds using the 5th percentile and 95the percentile and blue bounds, using half the first quartile and twice the third quartile
b <- quantile(wpm$pricew,c(.25,.75), na.rm = TRUE) b b * c(0.5, 2) quantile(wpm$pricew,c(.05,.95), na.rm = TRUE) # Group by products wpm %>% filter(!is.na(pricew)) %>% group_by(productcode) %>% summarise(q5 = quantile(pricew, 0.05, names = FALSE), q25 = quantile(pricew, 0.25, names = FALSE), q75 = quantile(pricew, 0.75, names = FALSE), q95 = quantile(pricew, 0.95, names = FALSE)) %>% kable()
# With base R, # see https://stackoverflow.com/questions/43373568/dplyr-to-iterate-over-all-columns-for-quantile # sapply(wp[!is.na(wp$pricew),], # function(x) quantile(x, c(0.25, 0.5, 0.75)))%>% t # But this doesn't take care of the grouping
Disconnect from the database.
RMariaDB::dbDisconnect(con)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.