# Generate this document from a R command line
rmarkdown::render("~/R/eutradeflows/docs/pricedistribution.Rmd")
#
# Issues ####
message("Issues:")
message("Add price bounds to the plots")
message("Generate density plots which include the price weighted by trade value,
        instead of by number of flows, according to
        https://stackoverflow.com/a/12625991/2641825
        it's possible to pass a `weights` aesthetic to geom_density()
        Acording to 
        https://stats.stackexchange.com/a/22245/68318
        The vector of weights should sum to one for the density to be a true density.")
library(knitr)
opts_knit$set(root.dir="..") # file paths are relative to the root of the project directory
opts_chunk$set(fig.width = 10, message=FALSE, warning=FALSE)
library(tradeflows)
library(dplyr)
library(tidyr)
library(ggplot2)

Connect to the database.

con <- RMariaDB::dbConnect(RMariaDB::MariaDB(), dbname = "tradeflows")

Introduction

This document displays price distribution for yearly and monthly Comext data. Various plots explore the variation of the price distribution among products and through time. The data is extracted from the database and stored into 2 data frames:

The first plots, Figure 1 and 2 are histograms. For a given price range, the height of the bar represents the number of flows occuring at that price range.

Figure 3 and onwards are density plots. For a given price range, the area under the curve represents the probability that a given flow has a price withing the range. Because most of the density is less than 1, the curve has to rise higher than 1 in some points in order to have a total area of 1 as required for all probability distributions.

Compare yearly and monthly price distributions for sub products of 4407

products <- tbl(con, "vld_comext_product") %>% 
    filter(productcode %like% "4407%")  %>%
    # explain() %>% 
    collect()

# Monthly archive 
wpm <- tbl(con, "raw_comext_monthly_2016S1") %>% 
    filter(productcode %in% products$productcode) %>% 
    collect()

# Monthly recent

# Yearly archive
wpy <- tbl(con, "raw_comext_yearly_2016S2") %>% 
    filter(productcode %in% products$productcode) %>% 
    collect()

# Add prices
wpm <- wpm %>% 
    tradeflows::addconversionfactorandprice()
wpy <- wpy %>% 
    tradeflows::addconversionfactorandprice()

# Bind the monthly and yearly dataframes together
# Specify monthly and yearly timeframe
wpm$timeframe <- "monthly"
wpy$timeframe <- "yearly"
wp <- rbind(wpm, wpy) %>% 
    mutate(year = substr(period, 1, 4))

Descriptive statistics (min, max, quartiles)

Number of rows and total trade value

# Number of rows in millions
nrow(wp)/1e6
# Total trade value in million €
sum(wp$tradevalue, na.rm=TRUE) /1e6
# Time frame
unique(wp$year)
# Trade value per year in million €
wp %>% group_by(year, flowcode, timeframe) %>% 
    summarise(tradevalue = round(sum(tradevalue, na.rm=TRUE)/1e6)) %>% 
    left_join(data_frame(flow = c("import", "export"), flowcode = 1:2)) %>% 
    unite(time_flow, timeframe, flow) %>% 
    ungroup() %>% select(-flowcode) %>% 
    spread(time_flow, tradevalue)

Descriptive statistics of tradevalue, quantity and weight for monthly and yearly data:

summary(wpm[c("tradevalue", "quantity", "weight")])
summary(wpy[c("tradevalue", "quantity", "weight")])

Descriptive statistics of prices and convertion factor for monthly and yearly data:

summary(wpm[c("price", "pricew", "conversion")])
summary(wpy[c("price", "pricew", "conversion")])

\pagebreak

Price distributions

# Interleaved histograms
ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_histogram(binwidth=.1, position="dodge") +
    xlim(c(0,10))

ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_histogram(binwidth=.01, position="dodge") +
    xlim(c(0,1))

Density plots of the price variable

ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_density(alpha=.3) +
    xlim(c(0,10))

# Same plot scaled between 0 and 1
ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_density(alpha=.3) +
    xlim(c(0,1))

Density plot of the price variable weighted by the tradevalue

ggplot(filter(wp, !is.na(tradevalue)),
       aes(x=price, fill=timeframe, weight = tradevalue/sum(tradevalue))) +
    geom_density(alpha=.3) +
    scale_x_continuous(breaks = c(0.5,0:10), limits = c(0,10))

# Same plot scaled between 0 and 1
ggplot(filter(wp, !is.na(tradevalue)),
       aes(x=price, fill=timeframe, weight = tradevalue/sum(tradevalue))) +
    geom_density(alpha=.3) +
    scale_x_continuous(breaks = 0:10/10, limits = c(0,1))

log(price) distribution monthly and yearly

ggplot(wpm, aes(x = log(price))) + 
    geom_histogram(binwidth = 0.1) 

ggplot(wp, aes(x = log(price), fill = timeframe)) +
    geom_density(alpha=.3) +
    xlim(c(-10,5))

Price distribution by product

wp$productcode6d <- substr(wp$productcode, 1,6)
ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_density(alpha=.3) +
    xlim(c(0,1)) +
    facet_wrap(~productcode6d, scales = "free_y")

Same plot weighted by the trade value

wp %>% 
    filter(!is.na(tradevalue)) %>% 
    group_by(productcode6d) %>% 
    mutate(sumtradevalue = sum(tradevalue)) %>%
    # summarise(sum(tradevalue)/unique(sumtradevalue)) # are they all 1?
    ggplot(aes(x=price, fill=timeframe, weight = tradevalue/sumtradevalue)) +
    geom_density(alpha=.3) +
    xlim(c(0,1)) +
    facet_wrap(~productcode6d, scales = "free_y")

Monthly distribution by product, 8 digit level

Plot generated externally Note: some products only have yearly data, no monthly data. Is this due to missing quantity information? Or are the monthly flows missing completely for these products?

ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_density(alpha=.3) +
    xlim(c(0,1)) +
    facet_wrap(~productcode, scales = "free_y")

Yearly distribution by product

# Monthly distribution by year by product
# Yearly distribution by year by product

Price distribution by year

Large plot generated externaly

wp$productcode6d <- substr(wp$productcode, 1,6)
ggplot(wp, aes(x=price, fill=timeframe)) +
    geom_density(alpha=.3) +
    xlim(c(0,1)) +
    facet_grid(year~productcode6d, scales = "free_y")

Ridge plots

library(ggridges)
# Create a factor of years
wpm$year <- wpm$period %/% 100
wpm$year <- factor(wpm$year, levels = rev(unique(wpm$year)))
ggplot(wpm, aes(x = price, y = year)) +
       geom_density_ridges(scale = 1, alpha = .5) +
    xlim(c(0,1)) +
    theme_ridges()

Add price bounds to this plot.

# Add product code at the 6 digit level
wpm$productcode6d <- substr(wpm$productcode, 1, 6)
ggplot(wpm, aes(x = price, y = year)) +
       geom_density_ridges(scale = 1, alpha = .5) +
    xlim(c(0,1)) +
    theme_ridges() +
    facet_wrap(~productcode6d)

Products under 440799 only

wpm %>% 
    filter(productcode6d == "440799") %>% 
    ggplot(aes(x = price, y = year)) +
       geom_density_ridges(scale = 1, alpha = .5) +
    xlim(c(0,1)) +
    theme_ridges() +
    facet_wrap(~productcode)

\pagebreak

Price per weight

ggplot(wpm, aes(x = pricew, y = year)) +
       geom_density_ridges(scale = 1, alpha = .5,
                           rel_min_height = 0.01) + # Cut trailing tail
    xlim(c(0,10)) +
    theme_ridges() +
    facet_wrap(~productcode6d)

Show red bounds using the 5th percentile and 95the percentile and blue bounds, using half the first quartile and twice the third quartile

b <- quantile(wpm$pricew,c(.25,.75), na.rm = TRUE) 
b
b * c(0.5, 2)
quantile(wpm$pricew,c(.05,.95), na.rm = TRUE)

# Group by products
wpm %>% 
    filter(!is.na(pricew)) %>% 
    group_by(productcode) %>% 
    summarise(q5  = quantile(pricew, 0.05, names = FALSE),
              q25 = quantile(pricew, 0.25, names = FALSE),
              q75 = quantile(pricew, 0.75, names = FALSE),
              q95 = quantile(pricew, 0.95, names = FALSE)) %>% 
    kable()
# With base R,
# see https://stackoverflow.com/questions/43373568/dplyr-to-iterate-over-all-columns-for-quantile
# sapply(wp[!is.na(wp$pricew),],
#        function(x) quantile(x, c(0.25, 0.5, 0.75)))%>% t
# But this doesn't take care of the grouping

Disconnect from the database.

RMariaDB::dbDisconnect(con)


stix-global/eutradeflows documentation built on Nov. 13, 2020, 9:23 p.m.