inst/extdata/R-code/10-Figures.R

#' # Creating and Saving Figures with `ggplot2` {#
#' 
## ---- include=FALSE-------------------------------------------------------------------------------------------------
source('Scripts/preamble_chapters.R')

my_fig_height <- 6
my_fig_width <- 7

#' 
#' It is a well-known fact that communication is o
#' 
#' The easiness of which you can create and presen
#' 
#' Visual attractiveness
#' : Always facilitate the analysis of your audien
#' 
#' A figure must be self-contained
#' : Any technical information such as origin and 
#' 
#' The graphical analysis should help your analysi
#' : Just because you can make a plot, doesn't mea
#' 
#' Check your references
#' : Science and data analysis evolves as building
#' 
#' Said that, let's go back to our R code.
#' 
#' 
#' ## The `ggplot2` Package
#' 
#' R has built-in functions for creating figures, 
#' 
#' This deficiency was remedied by users. In 2005,
#' 
#' In this book, we will not go deep into `ggplot2
#' 
#' For most examples given here, we will work with
#' 
#' First, let's load the data and check its conten
#' 
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)

# set file and load data
my_f  <- afedR::get_data_file('SP500-Stocks-WithRet.rds')
df_sp500 <- read_rds(my_f )

# print first 5 rows
glimpse(df_sp500)

#' 
#' It is a fairly common table used in previous ch
#' 
#' 
#' ## Using Graphics Windows
#' 
#' Before studying the use of `ggplot2`, we need t
#' 
#' A more intelligent approach to managing figures
#' 
## ---- eval=FALSE----------------------------------------------------------------------------------------------------
## x11()
## plot(1:10)

#' 
#' The visual result in RStudio should be similar 
#' 

#' 
#' Each call to `x11()` will create a new empty wi
#' 
#' After creating so many windows, it is best to c
#' 
#' 
#' ## Creating Figures with Function `qplot`
#' 
#' Package `ggplot2` has an introductory function,
#' 
#' To build a time series plot with the prices of 
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width----------------------------------------------------
library(ggplot2)

# filter stock data
temp_df <- df_sp500 %>%
  filter(ticker == 'MMM')

# plot its prices
qplot(data = temp_df, 
      x = ref.date, 
      y = price.adjusted, 
      geom = 'line')

#' 
#' In the previous example, the name of the axis c
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width----------------------------------------------------
qplot(data = temp_df, 
      x = ref.date, 
      y = price.adjusted, 
      geom = 'line', 
      xlab = 'Dates', 
      ylab = 'Adjusted closing prices', 
      main = 'Prices of MMM')

#' 
#' Much better! Notice how the horizontal axis of 
#' 
#' 
#' ## Creating Figures with Function `ggplot` {#gg
#' 
#' Using function `qplot` is recommended when you 
#' 
#' Before presenting examples using `ggplot`, let'
#' 
#' The distinction between the steps of creating a
#' 
#' Look at the syntax of the following example tha
#' 
## ---- eval=TRUE, tidy=FALSE, fig.height=my_fig_height, fig.width=my_fig_width---------------------------------------
p <- ggplot(data = temp_df, 
            mapping = aes(x = ref.date, 
                          y = price.adjusted))
p <- p + geom_line()
p <- p + labs(x = 'Dates', 
              y = 'Adjusted closing prices', 
              title = 'Prices of MMM')
print(p)

#' 
#' In using `ggplot`, it is always necessary to pr
#' 
#' After defining the input information in argumen
#' 

#' 
#' Once the data and axis are defined, we save it 
#' 
## ----eval=TRUE------------------------------------------------------------------------------------------------------
library(ggplot2)
library(stringr)

# get names of functions in ggplot2
fcts <- ls('package:ggplot2')

# select those that starts with geom_
idx <- str_sub(fcts, 1, 5) == 'geom_'
fcts <- fcts[idx]

# print result
print(fcts)

#' 
#' As you can see, the `ggplot2` package offers a 
#' 
#' Going back to our example, the third line of th
#' 
#' Using the **pipeline operator** is also possibl
#' 
## -------------------------------------------------------------------------------------------------------------------
p <- temp_df %>%
  ggplot(aes(x = ref.date, y = price.adjusted)) +
  geom_line() +
  labs(x = 'Dates', 
       y = 'Adjusted Closing Prices', 
       title = 'Prices of MMM')

#' 
#' Do notice that we used symbol `+` and **not** `
#' 
#' One of the great advantages of using `ggplot` i
#' 
## ----eval=TRUE------------------------------------------------------------------------------------------------------
# fix seed
set.seed(10)

# select 4 stocks randomly
tickers <- sample(unique(df_sp500$ticker), 4)

# create temporary df
temp_df <- df_sp500 %>%
  filter(ticker %in% tickers)

#' 
#' In this code, we use operator `%in%` to find ou
#' 
## ----fig.height=my_fig_height, fig.width=my_fig_width---------------------------------------------------------------
p <- temp_df %>%
  ggplot(aes(x = ref.date, 
             y = price.adjusted, 
             color = ticker)) +
  geom_line() +
  labs(x = 'Dates', 
       y = 'Adjusted closing prices', 
       title = 'Prices of four random stocks',
       subtitle = paste0('Date from ', min(temp_df$ref.date), 
                         ' to ', max(temp_df$ref.date) ),
       caption = 'Data from Yahoo Finance')

print(p)

#' 
#' A difference from the previous examples is that
#' 
#' 
## ----fig.height=my_fig_height, fig.width=my_fig_width---------------------------------------------------------------
p <- p +
  scale_color_brewer(palette = "RdYlBu")

print(p)

#' 
#' Notice how easy and quick it was to adjust the 
#' 
#' 
#' ### The US Yield Curve
#' 
#' Now, let's use what we learned so far to create
#' 
#' In the following code, we will first download t
#' 
## ---- include=FALSE-------------------------------------------------------------------------------------------------
my_api_key <- 'Esv7Ac7zuZzJSCGxynyF'

#' 
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(GetQuandlData)
library(tidyverse)

# set symbol and dates
my_symbol <- 'USTREASURY/YIELD'
first_date <- as.Date('2010-01-01')
last_date <- Sys.Date()

# get data!
df_yc <- get_Quandl_series(id_in = my_symbol, 
                           first_date = first_date,
                           last_date = last_date,
                           api_key = my_api_key)

print(head(df_yc))

#' 
#' The result is a dataframe in the **wide format*
#' 
## -------------------------------------------------------------------------------------------------------------------
# change to long format and convert to factor
df_yc_long <- df_yc %>%
  tidyr::pivot_longer(cols = !ref_date, 
                      names_to = 'maturity', 
                      values_to = 'rate'
                      ) %>%
  mutate(rate = as.numeric(rate))

  
# keep only longer term yields (names with YR)
idx <- str_detect(df_yc_long$maturity, 'YR')
df_yc_long <- df_yc_long[idx, ]

# change name to year number with regex
# obs: regex ([0-9]+) extracts all numbers within a string
out <- str_extract_all(string = df_yc_long$maturity,
                       pattern = '([0-9]+)')
df_yc_long$maturity <- as.numeric(out)

# glimpse result
glimpse(df_yc_long)

#' 
#' We have a dataframe in the long format with thr
#' 
## -------------------------------------------------------------------------------------------------------------------
# keep only last date of each
last_date <- max(df_yc_long$ref_date)
df_yc_lastdate <- df_yc_long[df_yc_long$ref_date == last_date, ]

# plot it!
p <- ggplot(df_yc_lastdate, aes(x=maturity, y=rate)) + 
  geom_point(size = 2.5) + geom_line(size=1) + 
  labs(x = 'Maturity (years)', 
       y='Yield Rate (%)',
       title = paste0('US Yield Curve for ',last_date),
       caption = paste0('Data from Quandl table ', my_symbol, '\n',
                        'Access at ', Sys.time()))

print(p)

#' 
#' As expected, the current yield curve is upward 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set number of periods 
n_periods <- 5
my_year <- 2020

# filter for year 2019
df_yc_my_year <- df_yc_long %>%
  filter(lubridate::year(ref_date) == my_year )

# get unique dates in data
unique_dates <- unique(df_yc_my_year$ref_date)

# set sequence of observations
my_seq <- floor(seq(1, length(unique_dates), 
                    length.out = n_periods))

# get actual dates from sequence
my_dates <- unique_dates[my_seq]

# find rows for dates in df
idx <- df_yc_my_year$ref_date %in% my_dates
df_yc_periods <- df_yc_my_year[idx, ]

# plot it!
p <- ggplot(df_yc_periods, aes(x=maturity, 
                               y=rate, 
                               color= factor(ref_date))) + 
  geom_point(size = 2.5) + 
  geom_line(size = 1) + 
  labs(x = 'Maturity (years)', 
       y='Yield Rate (%)',
       title = paste0('US Yield Curve for ', my_year),
       color = 'Dates',
       caption = paste0('Data from Quandl table ', 
                        my_symbol, '\n',
                        'Access at ', Sys.time()))

print(p)

#' 
#' The yield curve is not static and will change o
#' 
#' 
#' ## Using Themes
#' 
#' One way of customizing graphics in `ggplot2` is
#' 

#' 
#' Package `ggplot` has a pre-packaged collection 
#' 
## ----eval=TRUE------------------------------------------------------------------------------------------------------
library(ggplot2)
library(stringr)

# get all functions
fcts <- ls('package:ggplot2')

# find out those that start with theme_
idx <- str_sub(fcts, 1, 6) == 'theme_'
fcts <- fcts[idx]

# print result
print(fcts)

#' 
#' Let's try it with the theme from function `them
#' 
## ----fig.height=my_fig_height, fig.width=my_fig_width---------------------------------------------------------------
p <- temp_df %>%
  ggplot(aes(x = ref.date, y = price.adjusted, color=ticker)) +
  geom_line() +
  labs(x = 'Dates', 
       y = 'Adjusted closing prices', 
       title = 'Prices of four random stocks',
       caption = 'Data from Yahoo Finance') + 
  theme_bw()

print(p)

#' 
#' As you can see, the new theme was a white backg
#' 
#' Let's now use package `gridExtra` to create a g
#' 
## ---- fig.height=9 , fig.width = 9----------------------------------------------------------------------------------
require(gridExtra)

p1 <- p + 
  theme_bw() + 
  labs(title = 'Theme BW')

p2 <- p + 
  theme_dark() + 
  labs(title = 'Theme Dark')

p3 <- p + 
  theme_grey() + 
  labs(title = 'Theme Grey')

p4 <- p + 
  theme_light() + 
  labs(title = 'Theme Light')

p5 <- p + 
  theme_classic() + 
  labs(title = 'Theme Classic')

p6 <- p + 
  theme_minimal() + 
  labs(title = 'Theme Minimal')

grid.arrange(p1, p2, p3,
             p4, p5, p6,
             ncol=2, nrow = 3)

#' 
#' 
#' You can try other themes on your computer and s
#' 
#' In the previous example, notice how the structu
#' 
#' Digging deeper, the selection of the colors fol
#' 
## ---- eval=TRUE, tidy=FALSE, fig.height=my_fig_height, fig.width=my_fig_width---------------------------------------
p <- p + 
  theme_bw() + 
  scale_colour_grey(start = 0.0, end = 0.6)

print(p)  

#' 
#' The lines of the plot are now in grey. The inpu
#' 
#' 
#' ## Creating Panels with `facet_wrap`
#' 
#' Another possibility of creating graphics for di
#' 
#' Facets are possible with function `facet_wrap`,
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE----------------------------------------
library(dplyr)
# fix seed
set.seed(10)

# select 4 stocks randomly
tickers <- sample(unique(df_sp500$ticker), 4)

p <- df_sp500 %>%
  filter(ticker %in% tickers) %>%
  ggplot(aes(x = ref.date, y = price.adjusted)) + 
  geom_line() + 
  labs(x = 'Date', 
       y = 'Adjusted closing prices',
       title = 'Prices of four random stocks',
       caption = 'Data from Yahoo Finance') + 
  facet_wrap(facets = ~ticker) + 
  theme_bw()

print(p)

#' 
#' Using panels is recommended when the data of th
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE----------------------------------------
# fix seed
set.seed(5)

# select 4 stocks randomly
tickers <- sample(unique(df_sp500$ticker), 4)

p <- df_sp500 %>%
  filter(ticker %in% tickers) %>%
  ggplot(aes(x = ref.date, y = ret)) + 
  geom_line(size=1) + 
  labs(x = 'Date', 
       y = 'Returns',
       title = 'Daily returns of four random stocks',
       caption = 'Data from Yahoo Finance') +   
  facet_wrap(facets = ~ticker) + 
  theme_bw()

print(p)

#' 
#' Notice how the vertical axis of the panels is f
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width----------------------------------------------------
p <- p + 
  facet_wrap(facets = ~ticker, scales = 'free_y')

print(p)

#' 
#' Here, both axis, x and y, have their own scale 
#' 
#' 
#' ## Using the Pipeline
#' 
#' We previously saw that `ggplot2` is a friend of
#' 
## ---- eval=TRUE, tidy=FALSE,fig.height=my_fig_height, fig.width=my_fig_width----------------------------------------
library(tidyverse)
library(ggplot2)

# calculated mean and sd of returns, plot result
my_f <- afedR::get_data_file(
  'SP500_Stocks_long_by_year.rds'
  )

df_sp500 <- read_rds(my_f)

p <- df_sp500 %>%
  na.omit() %>%
  group_by(ticker) %>%
  summarise(mean_ret = mean(ret.adjusted.prices),
            std_ret = sd(ret.adjusted.prices)) %>%
  ggplot(aes(x = std_ret, y = mean_ret)) +
  geom_point() + 
  labs(x = 'Standard Deviation of Yearly returns', 
       y = 'Average Yearly Returns',
       title = 'Expected Return and Risk for SP500 Stocks',
       subtitle = paste0('Annual price data from 2010 to 2019, ',
                         length(unique(df_sp500$ticker)), 
                         ' stocks included'),
       caption = 'Data imported from Yahoo Finance') + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous(labels = scales::percent) + 
  theme_bw()

print(p)

#' 
#' The previous code is self-contained, easy to re
#' 
#' This is another example of an elegant code prod
#' 
## Be careful when mixing operators in the `tidyverse`. Whenever passing tables between `dplyr` commands, use the operator `%>%`. In the case of sequencing layers of a `ggplot` chart, always use the sum symbol (`+`).

#' 
#' 
#' ## Creating Statistical Graphics
#' 
#' Package `ggplot` has several options for creati
#' 
#' 
#' ### Creating Histograms
#' 
#' A histogram shows the empirical distribution of
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width----------------------------------------------------
# set file and load data
my_f  <- afedR::get_data_file('SP500-Stocks-WithRet.rds')
df_sp500 <- read_rds(my_f )

# remove outliers
my_prob_outliers <- 0.01
df_sp500$ret <- afedR::afedR_replace_outliers(
  df_sp500$ret, 
  my_prob = my_prob_outliers
  )

# plot the data
p <- ggplot(data = df_sp500, aes(x = ret)) + 
  geom_histogram(bins = 100) + 
  labs(y = 'Frequency', 
       x = 'Returns',
       title = paste0('Distribution of returns for ', 
                      'all stocks in the SP500 index'),
       subtitle = paste0('Data from 2010 to 2019\n',
                         'Distribution based quantiles at the ', 
                         scales::percent(my_prob_outliers), 
                         ' were removed'),
       caption = 'Data from Yahoo Finance'
  ) + 
  scale_x_continuous(labels = scales::percent) + 
  theme_bw()

print(p)

#' 
#' Here, we only need to define the _x_ value, wit
#' 
#' We can also use groups and facets as we did for
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE----------------------------------------
# fix seed
set.seed(30)

# select 4 stocks randomly
tickers <- sample(unique(df_sp500$ticker), 4)

p <- df_sp500 %>%
  filter(ticker %in% tickers) %>%
  ggplot(aes(x = ret)) + 
  geom_histogram(bins = 50) + 
  labs(y = 'Frequency', 
       x = 'Returns',
       title = 'Distribution of returns for four random stocks',
       subtitle = paste0('Data from 2010 to 2019\n',
                         'Quantiles at the ', 
                         scales::percent(my_prob_outliers), 
                         ' of the distribution were removed'),
       caption = 'Data from Yahoo Finance'
  ) + 
  scale_x_continuous(labels = scales::percent) + 
  theme_bw() + 
  facet_wrap(facets = ~ticker)

print(p)

#' 
#' A histogram with the empirical densities of the
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE----------------------------------------
p <- df_sp500 %>%
  filter(ticker %in% tickers) %>%
  ggplot(aes(x = ret)) + 
  geom_density() + 
  facet_wrap(facets = ~ticker) + 
  labs(y = 'Interpolated Frequency', 
       x = 'Returns',
       title = paste0('Interpolated distribution of returns',
                      'for all stocks in the SP500 index'),
       subtitle = paste0('Data from 2010 to 2019\n',
                         'Quantiles at the ', 
                         scales::percent(my_prob_outliers), 
                         ' of both sides of the distribution', 
                         'were removed'),
       caption = 'Data from Yahoo Finance'
  ) + 
  scale_x_continuous(labels = scales::percent) + 
  theme_bw()

print(p)

#' 
#' The previous figure allows a clear visual compa
#' 
#' 
#' ### Creating _boxplot_ Figures
#' 
#' Figures of type _boxplot_, or box and whisker d
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE----------------------------------------
# fix seed
set.seed(30)

# select 4 stocks randomly
tickers <- sample(unique(df_sp500$ticker), 4)

p <- df_sp500 %>%
  filter(ticker %in% tickers) %>%
  ggplot(aes(x = ticker, y = price.adjusted)) + 
  geom_boxplot() + 
  labs(y = 'Adjusted Price',
       x = 'Ticker',
       title = paste0('Distribution of daily prices', 
                      ' of four random stocks'),
       caption = paste0('Data imported from Yahoo Finance', 
                        ' (2010 - 2019)')
       ) + 
  theme_bw()

print(p)

#' 
#' We define a box plot by setting the `x` and `y`
#' 
#' Another interesting application of boxplot figu
#' 
## -------------------------------------------------------------------------------------------------------------------
p <- ggplot(df_yc_long, aes(x = factor(maturity), y = rate)) + 
  geom_boxplot() + 
  labs(title = 'Distribution of Yield Rates over Maturities',
       x = 'Maturity (years)', 
       y = 'Yield Rate (% per year)',
       caption = paste0('Data from Quandl \n',
                        'Created at ', Sys.time()) ) + 
  theme_bw()

print(p)

#' 
#' The statistical plot shows the upward pattern f
#' 
#' 
#' ### Creating _QQ_ Plots
#' 
#' QQ plots show a comparison between the empirica
#' 
#' Let's try an example with some simulated data.
#' 
## ----eval=TRUE, fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE----------------------------------------
# fix seed
set.seed(40)

# set options
N <- 10000
my_mean <- 10
my_sd <- 2

# create tibble
temp_df <- tibble(y=rnorm(n = N, 
                          mean = my_mean, 
                          sd = my_sd))

# plot QQ
p <- ggplot(data = temp_df, aes(sample = y)) + 
  geom_qq(distribution = qnorm, 
          dparams = c(mean=my_mean, sd=my_sd)) + 
  labs(title = 'QQ plot of simulated Normal Distribution')

print(p)

#' 
#' In the previous code, we simulate random normal
#' 
#' Now, let's try it for our table of stock's retu
#' 
## ----eval=TRUE,fig.height=my_fig_height, fig.width=my_fig_width, tidy=FALSE-----------------------------------------
# fix seed
set.seed(10)

# select 4 stock randomly and filter
tickers <- sample(unique(df_sp500$ticker), 4)

temp_df <- df_sp500 %>%
  filter(ticker %in% tickers)

# set function for normalization
norm_vec <- function(y){
  # Normalizes a vector by subtracting mean and dividing
  # by the standard deviation
  #
  # Args:
  #   y - numerical vector
  #
  # Returns:
  #   A normalized vector
  
  y.norm <- (y-mean(y, na.rm = TRUE))/sd(y, na.rm = TRUE)
  return(y.norm)
}

# apply function  
my_l <- tapply(X = temp_df$ret, 
               INDEX = factor(temp_df$ticker), 
               FUN = norm_vec)

# reorder list (tapply sorts alphabetically)
my_l <- my_l[as.character(unique(temp_df$ticker))]

# save new column norm.ret
temp_df$norm_ret <- unlist(my_l)

# plot it!
p <- ggplot(data = temp_df, aes(sample = norm_ret)) + 
  geom_qq() + 
  facet_wrap(~ticker) + 
  labs(title = 'QQ plot for normalized returns',
       subtitle = 'Daily returns from 2010 to 2019', 
       x = 'Theoretical value (from Normal)',
       y = 'True/observed value',
       caption = 'Data from Yahoo Finance') + 
  theme_bw()

print(p)

#' 
#' 
#' As you can see, the result is not visually simi
#' 
#' 
#' ## Saving Graphics to a File
#' 
#' To save pictures created with `ggplot`, use fun
#' 
#' Consider the following example, where we create
#' 
## ----eval=TRUE, tidy=FALSE,fig.height=my_fig_height, fig.width=my_fig_width-----------------------------------------
library(tidyverse)

# fix seed
set.seed(40)

# select 4 stocks randomly
tickers <- sample(unique(df_sp500$ticker), 4)

p <- df_sp500 %>%
  filter(ticker %in% tickers) %>%
  ggplot(aes(x = ref.date, 
             y = price.adjusted, 
             color = ticker)) + 
  geom_line() + 
  labs(x = 'Date', 
       y = 'Adjusted closing prices',
       title = 'Prices of four random stocks',
       caption = 'Data from Yahoo Finance')

# save file
my_fig_file <- 'fig_ggplot/MyPrices.png'
ggsave(filename = my_fig_file, 
       plot=p,
       dpi = 600)

#' 
#' You can verify the creation of the file with fu
#' 
## -------------------------------------------------------------------------------------------------------------------
print(list.files('fig_ggplot'))

#' 
#' As expected, the file is available in folder `f
#' 
#' 
#' ## Exercises 
#' 
## ---- echo=FALSE, results='asis'------------------------------------------------------------------------------------
f_in <- list.files('../02-EOCE-Rmd/Chapter10-Figures/', 
                   full.names = TRUE)

compile_eoc_exercises(f_in, type_doc = my_engine)

#' 
#' 
msperlin/afedR documentation built on Sept. 11, 2022, 9:49 a.m.