#' # Financial Econometrics with R {#models}
#'
#'
#' The modeling tools from econometrics and statis
#'
#' The variety of models used in financial econome
#'
#' - Linear models (OLS)
#' - Generalized linear models (GLS)
#' - Panel data models
#' - Arima models (Integrated Autoregressive Movin
#' - Garch models (Generalized Autoregressive Cond
#' - Markov Regime switching models
#'
#' We will not present a full description of the u
#'
#'
#' ## Linear Models (OLS) {#linear-models}
#'
#' A linear model is, without a doubt, one of the
#'
#' In finance, the most direct and popular use of
#'
#' A linear model with _N_ explanatory variables c
#'
#' $$y _t = \alpha + \beta _1 x_{1,t} + \beta _2 x
#'
#' The left side of the equation, (`r if (my.engin
#'
#'
#' ### Simulating a Linear Model
#'
#' Consider the following equation:
#'
#' $$y _t = 0.5 + 2 x_{t} + \epsilon _t$$
#'
#' We can use R to simulate _1.000_ observations f
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(50)
# number of obs
n_T <- 1000
# set x as Normal (0, 1)
x <- rnorm(n_T)
# set coefficients
my_alpha <- 0.5
my_beta <- 2
# build y
y <- my_alpha + my_beta*x + rnorm(n_T)
#'
#' Using `ggplot`, we can create a scatter plot to
#'
## ---- eval=TRUE, tidy=FALSE, fig.height=my.fig.height, fig.width=my.fig.width---------------------------------------
library(tidyverse)
# set temp df
temp_df <- tibble(x = x,
y = y)
# plot it
p <- ggplot(temp_df, aes(x = x, y = y)) +
geom_point(size=0.5) +
labs(title = 'Example of Correlated Data') +
theme_bw()
print(p)
#'
#' Clearly, there is a positive linear correlation
#'
#'
#' ### Estimating a Linear Model {#estimating-ols}
#'
#' In R, the main function for estimating a linear
#'
## -------------------------------------------------------------------------------------------------------------------
# set df
lm_df <- tibble(x, y)
# estimate linear model
my_lm <- lm(data = lm_df, formula = y ~ x)
print(my_lm)
#'
#' The `formula` argument defines the shape of the
#'
#' Argument `formula` allows other custom options,
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(15)
# set simulated dataset
N <- 100
df <- tibble(x = runif(N),
y = runif(N),
z = runif(N),
group = sample(LETTERS[1:3],
N,
replace = TRUE ))
#'
## -------------------------------------------------------------------------------------------------------------------
# Vanilla formula
#
# example: y ~ x + z
# model: y(t) = alpha + beta(1)*x(t) + beta(2)*z(t) + error(t)
my_formula <- y ~ x + z
print(lm(data = df,
formula = my_formula))
#'
## -------------------------------------------------------------------------------------------------------------------
# vannila formula with dummies
#
# example: y ~ group + x + z
# model: y(t) = alpha + beta(1)*D_1(t)+beta(2)*D_2(t) +
# beta(3)*x(t) + beta(4)*z(t) + error(t)
# D_i(t) - dummy for group i
my_formula <- y ~ group + x + z
print(lm(data = df,
formula = my_formula))
#'
## -------------------------------------------------------------------------------------------------------------------
# Without intercept
#
# example: y ~ -1 + x + z
# model: y(t) = beta(1)*x(t) + beta(2)*z(t) + error(t)
my_formula <- y ~ -1 + x + z
print(lm(data = df,
formula = my_formula))
#'
## -------------------------------------------------------------------------------------------------------------------
# Using combinations of variables
# example: y ~ x*z
# model: y(t) = alpha + beta(1)*x(t) + beta(2)*z(t) +
# beta(3)*x(t)*z(t) + error(t)
my_formula <- y ~ x*z
print(lm(data = df,
formula = my_formula))
#'
## -------------------------------------------------------------------------------------------------------------------
# Interacting variables
# example: y ~ x:group + z
# model: y(t) = alpha + beta(1)*z(t) + beta(2)*x(t)*D_1(t) +
# beta(3)*x(t)*D_2(t) + beta(4)*x(t)*D_3(t) +
# error(t)
# D_i(t) - dummy for group i
my_formula <- y ~ x:group + z
print(lm(data = df,
formula = my_formula))
#'
#' The different options in the `formula` input al
#'
#' The output of `lm` is an special type of `list`
#'
## -------------------------------------------------------------------------------------------------------------------
# print names in model
print(names(my_lm))
#'
#' As you can see, there is a slot called `coeffic
#'
## -------------------------------------------------------------------------------------------------------------------
print(my_lm$coefficients)
#'
#' The result is a simple atomic vector that incre
#'
#' In our example of using `lm` with simulated dat
#'
#' Experienced researchers have probably noted tha
#'
## -------------------------------------------------------------------------------------------------------------------
print(summary(my_lm))
#'
#'
#' The estimated coefficients have high _T_ values
#'
#' Additional information is available in the resu
#'
## -------------------------------------------------------------------------------------------------------------------
my_summary <- summary(my_lm)
print(names(my_summary))
#'
#' Each element contains information that can be r
#'
#' Now, let's move to an example with real data. F
#'
#' $$R _t = \alpha + \beta R_{M,t} + \epsilon _t$$
#'
#' First, let's load the SP500 dataset. \index{cal
#'
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)
# load stock data
my_f <- afedR::get_data_file('SP500-Stocks-WithRet.rds')
my_df <- read_rds(my_f)
# select rnd asset and filter data
set.seed(10)
my_asset <- sample(my_df$ticker,1)
my_df_asset <- my_df[my_df$ticker == my_asset, ]
# load SP500 data
df_sp500 <- read.csv(file = 'data/SP500.csv',
colClasses = c('Date','numeric'))
# calculate return
calc_ret <- function(P) {
N <- length(P)
ret <- c(NA, P[2:N]/P[1:(N-1)] -1)
}
df_sp500$ret <- calc_ret(df_sp500$price)
# print number of rows in datasets
print(nrow(my_df_asset))
print(nrow(df_sp500))
#'
#' You can see the number of rows of the dataset f
#'
## -------------------------------------------------------------------------------------------------------------------
# find location of dates in df_sp500
idx <- match(my_df_asset$ref.date, df_sp500$ref.date)
# create column in my_df with sp500 returns
my_df_asset$ret_sp500 <- df_sp500$ret[idx]
#'
#' As a start, let's create a scatter plot with th
#'
## ---- eval=TRUE, tidy=FALSE, fig.height=my.fig.height, fig.width=my.fig.width---------------------------------------
library(ggplot2)
p <- ggplot(data = my_df_asset,
aes(x=ret_sp500, y=ret)) +
geom_point(size = 0.5) +
geom_smooth(method = 'lm') +
labs(x = 'SP500 Returns',
y = paste0(my_asset, ' Returns'),
title = paste0(my_asset, ' and ', ' the SP500' ),
caption = 'Data from Yahoo Finance') +
theme_bw()
print(p)
#'
#' The figure shows a clear linear tendency; the
#'
## -------------------------------------------------------------------------------------------------------------------
# estimate beta model
my_beta_model <- lm(data = my_df_asset,
formula = ret ~ ret_sp500)
# print it
print(summary(my_beta_model))
#'
#'
#' The output shows that stock `r my_asset` has a
#'
#'
#' ### Statistical Inference in Linear Models {#te
#'
#' After estimating a model with function `lm`, th
#'
## -------------------------------------------------------------------------------------------------------------------
n_T <- 100
df <- data.frame(y = runif(n_T),
x_1 = runif(n_T),
x_2 = runif(n_T))
my_lm <- lm(data = df,
formula = y ~ x_1 + x_2)
print(summary(my_lm))
#'
#' In this example, the F statistic is `r summary(
#'
#' Another type of test automatically executed by
#'
#' In the practice of research, it is likely that
#'
#' As a simple example, let's test a linear hypoth
## -------------------------------------------------------------------------------------------------------------------
set.seed(10)
# number of time periods
n_T <- 1000
# set parameters
my_intercept <- 0.5
my_beta <- 1.5
# simulate
x <- rnorm(n_T)
y <- my_intercept + my_beta*x + rnorm(n_T)
# set df
df <- tibble(y, x)
# estimate model
my_lm <- lm(data = df,
formula = y ~ x )
#'
#' After the estimation of the model, we use the f
#'
#' $$\underbrace{\begin{bmatrix}
#' 1 & 0 \\
#' 0 & 1
#' \end{bmatrix}}_{hypothesis.matrix}\begin{bmatri
#' \alpha \\
#' \beta
#' \end{bmatrix} = \underbrace{\begin{bmatrix}
#' 0.5 \\
#' 1.5
#' \end{bmatrix} }_{rhs} $$
#'
#' With this matrix operation, we test the joint h
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(car)
# set test matrix
test_matrix <- matrix(c(my_intercept, # alpha test value
my_beta)) # beta test value
# hypothesis matrix
hyp_mat <- matrix(c(1,0,
0,1),nrow = 2)
# do test
my_waldtest <- linearHypothesis(my_lm,
hypothesis.matrix = hyp_mat,
rhs = test_matrix)
# print result
print(my_waldtest)
#'
#' As we can see, the test fails to reject the nul
#'
#' Another family of tests commonly applied to lin
#'
#' In R, we can use the package `lmtest` [@R-lmtes
#'
## ---- results='hold'------------------------------------------------------------------------------------------------
library(lmtest)
# Breush Pagan test 1 - Serial correlation
# Null Hypothesis: No serial correlation in residual
print(bgtest(my_lm, order = 5))
# Breush Pagan test 2 - Homocesdasticity of residuals
# Null Hypothesis: homocesdasticity
# (constant variance of residuals)
print(ncvTest(my_lm))
# Durbin Watson test - Serial correlation
# Null Hypothesis: No serial correlation in residual
print(dwtest(my_lm))
# Shapiro test - Normality
# Null Hypothesis: Data is normally distributed
print(shapiro.test(my_lm$residuals))
#'
#' As expected, the model with artificial data pas
#'
#' Another interesting approach for validating lin
#'
## -------------------------------------------------------------------------------------------------------------------
library(gvlma)
# global validation of model
gvmodel <- gvlma(my_lm)
# print result
summary(gvmodel)
#'
#' The output of `gvlma` shows several tests perfo
#'
#'
#' ## Generalized Linear Models (GLM)
#'
#' The generalized linear model (GLM) is a flexibl
#'
#' We can write a general univariate GLM specifica
#'
#' $$E \left( y _t \right) = g \left(\alpha + \sum
#'
#' The main difference of a GLM model and a OLS mo
#'
#' $$g(x) = \frac{\exp(x)}{1+\exp(x)} $$
#'
#' Do notice that function _g()_ ensures any value
#'
#'
#' ### Simulating a GLM Model
#'
#' As an example, let's simulate the following GLM
#'
#' $$p _t = \frac{\exp(2+5x_t)}{1+\exp(2+5x_t)}$$
#' In R, we use the following code to build the re
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(15)
# set number of obs
n_T <- 500
# set x
x = rnorm(n_T)
my_alpha <- 2
my_beta <- 5
# set probabilities
z = my_alpha + my_beta*x
p = exp(z)/(1+exp(z))
# set response variable
y = rbinom(n = n_T,
size = 1,
prob = p)
#'
#' Function `rbinom` creates a vector of 1s and 0s
#'
## ---- eval=TRUE, tidy=FALSE, fig.height=my.fig.height, fig.width=my.fig.width---------------------------------------
summary(y)
#'
#' Object `y` contains zeros and ones, as expected
#'
#'
#' ### Estimating a GLM Model
#'
#' In R, the estimation of GLM models is accomplis
#'
#' Let's use the previously simulated data to esti
#'
## -------------------------------------------------------------------------------------------------------------------
# estimate GLM
df <- tibble(x, y)
my_family <- binomial(link = "logit")
my_glm <- glm(data = df,
formula = y ~ x ,
family = my_family)
# print it with summary
print(summary(my_glm))
#'
#' The estimated coefficients are close to what we
#'
#' Function `glm` offers many options for setting
#'
#'
#' The first step in using a GLM model is to ident
#'
#' As an example, with real data, we'll use a cred
#'
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)
# read default data
my_f <- afedR::get_data_file('UCI_Credit_Card.csv')
df_default <- read_csv(my_f,
col_types = cols())
glimpse(df_default)
#'
#' This is a comprehensive dataset with several pi
#'
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)
# read credit card data
# source:
# www.kaggle.com/uciml/default-of-credit-card-clients-dataset
# COLUMNS: GENDER: (1 = male; 2 = female).
# EDUCATION: 1 = graduate school;
# 2 = university;
# 3 = high school;
# 4 = others.
# MARRIAGE: 1 = married;
# 2 = single;
# 3 = others
df_default <- df_default %>%
mutate(default = (default.payment.next.month == 1),
D_male_gender = (SEX == 1),
age = AGE,
educ = dplyr::recode(as.character(EDUCATION),
'1' = 'Grad',
'2' = 'University',
'3' = 'High School',
'4' = 'Others',
'5' = 'Unknown',
'6' = 'Unknown'),
D_marriage = (MARRIAGE == 1)) %>%
select(default, D_male_gender, age, educ, D_marriage)
glimpse(df_default)
#'
#' Much better! Now we only have the columns of in
#'
## -------------------------------------------------------------------------------------------------------------------
# estimate glm model
glm_credit <- glm(data=df_default,
formula = default ~ D_male_gender + age +
educ + D_marriage,
family = binomial(link = "logit"))
# show output
summary(glm_credit)
#'
#' We find that the only coefficients with statist
#'
#'
#' ## Panel Data Models
#'
#' Panel data models are advised when the modeled
#'
#' The main motivation to use panel data models is
#'
#' We can represent the simplest case of a panel d
#'
#' $$y_{i,t} = \alpha _i + \beta x_{i,t} + \epsilo
#'
#' Do notice the use of index _i_ in the dependent
#'
#'
#' ### Simulating Panel Data Models
#'
#' Let's simulate a balanced panel data with fixed
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(25)
# number of obs for each case
n_T <- 5
# set number of groups
N <- 12
# set possible cases
possible_cases <- LETTERS[1:N]
# set parameters
my_alphas <- seq(-10, 10,
length.out = N)
my_beta <- 1.5
# set indep var (x) and dates
indep_var <- sapply(rep(n_T,N), rnorm)
my_dates <- Sys.Date() + 1:n_T
# create response matrix (y)
response_matrix <- matrix(rep(my_alphas,
n_T),
nrow = n_T,
byrow = TRUE) +
indep_var*my_beta + sapply(rep(n_T,N),rnorm, sd = 0.25)
# set df
sim_df <- tibble(firm = as.character(sapply(possible_cases,
rep,
times=n_T )),
dates = rep(my_dates, times=N),
y = as.numeric(response_matrix),
x = as.numeric(indep_var),
stringsAsFactors = FALSE)
# print result
glimpse(sim_df)
#'
#' The result is a `dataframe` object with `r nrow
#'
## ---- eval=TRUE, tidy=FALSE, fig.height=my.fig.height, fig.width=my.fig.width---------------------------------------
library(ggplot2)
p <- ggplot(sim_df, aes(x = x,
y = y)) +
geom_point() + geom_line() +
facet_wrap(~ firm) +
labs(title = 'Simulated Panel Data') +
theme_bw()
print(p)
#'
#' The figure shows the strong linear relationship
#'
#'
#' ### Estimating Panel Data Models
#'
#' With the artificial data simulated in the previ
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(plm)
# estimate panel data model with fixed effects
my_pdm <- plm(data = sim_df,
formula = y ~ x,
model = 'within',
index = c('firm','dates'))
# print coeficient
print(coef(my_pdm))
#'
#' As expected, the slope parameter was correctly
#'
## -------------------------------------------------------------------------------------------------------------------
print(fixef(my_pdm))
#'
#' Again, the simulated intercept values are close
#'
#' As an example with real data, let's use the dat
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(plm)
# data from Grunfeld
my_f <- afedR::get_data_file('grunfeld.csv')
df_grunfeld <- read_csv(my_f, col_types = cols())
# print it
glimpse(df_grunfeld )
#'
#' The `df_grunfeld` dataset contains company info
#'
#' A note here is important; given its high number
#'
#' First, let's explore the raw data by estimating
#'
## -------------------------------------------------------------------------------------------------------------------
est_lm <- function(df) {
# Estimates a linear model from Grunfeld data
#
# Args:
# df - dataframe from Grunfeld
#
# Returns:
# lm object
my_model <- lm(data = df,
formula = inv ~ value + capital)
return(my_model)
}
# estimate model for each firm
my_l <- by(df_grunfeld,
INDICES = df_grunfeld$firm,
FUN = est_lm)
# print result
my_coefs <- sapply(my_l, coef)
print(my_coefs)
#'
#'
#' The results show a great discrepancy between th
#'
## -------------------------------------------------------------------------------------------------------------------
# test if all coef are the same across firms
my_pooltest <- pooltest(inv ~ value + capital,
data = df_grunfeld,
model = "pooling")
# print result
print(my_pooltest)
#'
#' The high F test and small p-value suggest the r
#'
#' Before estimating the model, we need to underst
#'
#' We can test the model specification using the `
#'
## -------------------------------------------------------------------------------------------------------------------
# set options for Hausman test
my_formula <- inv ~ value + capital
my_index <- c('firm','year')
# do Hausman test
my_hausman_test <- phtest(x = my_formula,
data = df_grunfeld,
model = c('within', 'random'),
index = my_index)
# print result
print(my_hausman_test)
#'
#' The p-value of `r format(my_hausman_test$p.valu
#'
#' After identifying the model, let's estimate it
#'
## -------------------------------------------------------------------------------------------------------------------
# set panel data model with random effects
my_model <- 'random'
my_formula <- inv ~ value + capital
my_index <- c('firm','year')
# estimate it
my_pdm_random <- plm(data = df_grunfeld,
formula = my_formula,
model = my_model,
index = my_index)
# print result
print(summary(my_pdm_random))
#'
#'
#' As expected, the coefficients are significant a
#'
#' For one last example of using R in panel models
#'
#' The `systemfit` package offers a function with
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(systemfit)
# set pdataframe
p_Grunfeld <- pdata.frame(df_grunfeld, c( "firm", "year" ))
# estimate sur
my_SUR <- systemfit(formula = inv ~ value + capital,
method = "SUR",
data = p_Grunfeld)
print(my_SUR)
#'
#' The output object `my_SUR` contains the estimat
#'
#'
#' ## Arima Models
#'
#' Arima is a special type of model that uses the
#'
#' A simple example of an Arima model is defined b
#'
#' $$y _t = 0.5 y_{t-1} - 0.2 \epsilon _{t-1} + \e
#'
#' In this example, we have an ARIMA(AR = 1, D = 0
#'
#'
#' ### Simulating Arima Models
#'
#' First, let's simulate an Arima model using func
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
set.seed(1)
# set number of observations
my_T <- 5000
# set model's parameters
my_model <- list(ar = 0.5,
ma = -0.1)
my_sd <- 1
# simulate model
my_ts <- arima.sim(n = my_T,
model = my_model ,
sd = my_sd)
#'
#' We can look at the result of the simulation by
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
library(ggplot2)
# set df
temp_df <- data.frame(y = unclass(my_ts),
date = Sys.Date() + 1:my_T)
p <- ggplot(temp_df, aes(x = date, y = y)) +
geom_line(size=0.25) +
labs(title = 'Simulated ARIMA Model') +
theme_bw()
print(p)
#'
#' The graph shows a time series with an average c
#'
#'
#' ### Estimating Arima Models {#arima-estimating}
#'
#' To estimate an Arima model, we use function `ar
#'
## -------------------------------------------------------------------------------------------------------------------
# estimate arima model
my_arima <- arima(my_ts, order = c(1,0,1))
# print result
print(coef(my_arima))
#'
#' As expected, the estimated parameters are close
#'
## ---- eval=FALSE----------------------------------------------------------------------------------------------------
## # not evaluated due to large output
## # please run it in your own R session for details
## attributes(summary(my_arima))
#'
#' We have the adjustment criteria in `aic`, resid
#'
#' The identification of the Arima model, definin
#'
#' In the next example, we use the function `auto.
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(BatchGetSymbols)
df_SP500 <- BatchGetSymbols(tickers = '^GSPC',
first.date = '2015-01-01',
last.date = '2019-01-01')$df.tickers
#'
#' Before estimating the model, we need to check t
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(tseries)
print(adf.test(na.omit(df_SP500$ret.adjusted.prices)))
#'
#' The result of the test shows a small p-value th
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
print(adf.test(df_SP500$price.close))
#'
#' This time, we easily fail to reject the null hy
#'
#' One issue in working with Arima models is with
#'
## -------------------------------------------------------------------------------------------------------------------
library(forecast)
# estimate arima model with automatic identification
my_autoarima <- auto.arima(x = df_SP500$ret.closing.prices)
# print result
print(my_autoarima)
#'
#' The result tells us the best model for the retu
#'
#'
#' ### Forecasting Arima Models
#'
#' We can obtain the forecasts of an Arima model w
#'
## -------------------------------------------------------------------------------------------------------------------
# forecast model
print(forecast(my_autoarima, h = 5))
#'
#'
#' ## GARCH Models
#'
#' GARCH (Generalized Autoregressive Conditional H
#'
#' A GARCH model is modular. In its simplest forma
#'
#' $$
#' \begin{aligned}
#' y _t &= \mu + \theta y_{t-1} + \epsilon _t \\
#' \epsilon _t &\sim N \left(0, h _t \right ) \\
#' h _t &= \omega + \alpha \epsilon ^2 _{t-1}+ \be
#' \end{aligned}
#' $$
#'
#' The `r if (my.engine!='epub3') {'$y_t$'} else {
#'
#'
#' ### Simulating Garch Models
#'
#' In CRAN, we can find two main packages related
#'
#' In `fGarch`, we simulate a model using function
#'
## ---- tidy=FALSE, message=FALSE-------------------------------------------------------------------------------------
library(fGarch)
# set list with model spec
my_model = list(omega=0.001,
alpha=0.15,
beta=0.8,
mu=0.02,
ar = 0.1)
# set garch spec
spec = garchSpec(model = my_model)
# print it
print(spec)
#'
#' The previous code defines a Garch model equival
#'
#' $$
#' \begin{aligned}
#' y _t &= 0.02 + 0.1 y_{t-1} + \epsilon _t \\
#' \epsilon _t &\sim N \left(0, h _t \right ) \\
#' h _t &= 0.001 + 0.15 \epsilon ^2 _{t-1}+ 0.8 h_
#' \end{aligned}
#' $$
#'
#' To simulate _1000_ observations of this model,
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(20)
# simulate garch model
sim_garch = garchSim(spec, n = 1000)
#'
#' We can visualize the artificial time series gen
#'
## ---- eval=TRUE, tidy=FALSE, fig.height=my.fig.height, fig.width=my.fig.width---------------------------------------
# set df for ggplot
temp_df <- tibble(sim_ret = sim_garch$garch,
idx=seq_along(sim_garch$garch))
p <- ggplot(temp_df, aes(x = idx,
y = sim_ret)) +
geom_line() +
labs(title = 'Simulated time series of garch model',
y = 'Value of Time Series',
x = '') +
theme_bw()
print(p)
#'
#' The behavior of the simulated series is similar
#'
#'
#' ### Estimating Garch Models {#estimating-garch}
#'
#' The estimation of the parameters from a GARCH m
#'
#' In the following example, we estimate a Garch m
#'
## ---- message=FALSE, warning=FALSE----------------------------------------------------------------------------------
# estimate garch model
my_form <- sim_ret ~ arma(1,0) + garch(1,1)
my_garchfit <- garchFit(
data = sim_garch,
formula = my_form,
trace = FALSE)
#'
#' To learn more about the estimated model, we can
#'
## -------------------------------------------------------------------------------------------------------------------
print(my_garchfit)
#'
#' The resulting parameters from the estimation ar
#'
#' Now, we will conduct another example of Garch m
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(MTS)
# test for Arch effects
archTest(rt = na.omit(df_SP500$ret.adjusted.prices))
#'
#' The evidence is strong for Arch effects in the
#'
## ---- warning=FALSE-------------------------------------------------------------------------------------------------
# set object for estimation
df_est <- as.timeSeries(na.omit(df_SP500))
# estimate garch model for SP500
my_garchfit_SP500 <- garchFit(
data = df_est,
formula = ret.adjusted.prices ~ arma(1,0) +
garch(1,1),
trace = FALSE)
# print model
print(my_garchfit_SP500)
#'
#' As expected, all Garch coefficients are signifi
#'
#'
#' ### Forecasting Garch Models
#'
#' Forecasting Garch models involves two elements:
#'
#' In package `fGarch`, both forecasts are calcula
#'
## -------------------------------------------------------------------------------------------------------------------
# static forecast for garch
my_garch_forecast <- fGarch::predict(my_garchfit_SP500,
n.ahead = 3)
# print df
print(my_garch_forecast)
#'
#' The first column of the previous result is the
#'
#'
#' ## Regime Switching Models
#'
#' Markov regime-switching models are a specificat
#'
#' If we want to motivate the model, we need to co
#'
#' $$y_t=\mu_{S_t} + \epsilon_t $$
#'
#' where `r if (my.engine!='epub3') {'$S_t=1..k$'}
#'
#' Now, let's assume the previous model has two st
#'
#' $$
#' \begin{aligned}
#' y_t&=\mu_{1} + \epsilon_t \qquad \mbox{for Stat
#' y_t&=\mu_{2} + \epsilon_t \qquad \mbox{for Stat
#' \end{aligned}
#' $$
#'
#' where:
#'
#' $$
#' \begin{aligned}
#' \epsilon_t &\sim (0,\sigma^2_{1}) \qquad \mbox{
#' \epsilon_t &\sim (0,\sigma^2_{2}) \qquad \mbox{
#' \end{aligned}
#' $$
#'
#' This representation implies two processes for t
#'
#' We will now look at a financial example where t
#'
#' The different volatilities represent higher unc
#'
#' The changes in the states in the model can be s
#'
#' Markov switching is a special type of model for
#'
#' $$
#' P=\left[ \begin{array}{ccc}
#' p_{11} & \ldots & p_{1k} \\
#' \vdots & \ddots & \vdots \\
#' p_{k1} & \ldots & p_{kk} \\
#' \end{array} \right ]
#' $$
#'
#' In the previous matrix, row _i_, column _j_ con
#'
#'
#' ### Simulating Regime Switching Models
#'
#' In R, two packages are available for handling u
#'
## ---- eval=FALSE----------------------------------------------------------------------------------------------------
## install.packages("fMarkovSwitching",
## repos="http://R-Forge.R-project.org")
#'
#'
#' Once it is installed, let's look at its functio
#'
## -------------------------------------------------------------------------------------------------------------------
library(fMarkovSwitching)
print(ls('package:fMarkovSwitching'))
#'
#' The package includes functions for simulating,
#'
#' $$
#' \begin{aligned}
#' y_{t}&= +0.5x_t+\epsilon_{t} \qquad \mbox{State
#' y_{t}&=-0.5x_t+\epsilon_{t} \qquad \mbox{State
#' \epsilon _t &\sim N(0,0.25) \qquad \mbox{State
#' \epsilon _t &\sim N(0,1) \qquad \mbox{State 2}
#' \end{aligned}
#' $$
#'
#' The transition matrix will be given by:
#'
#' $$
#' P=\left[ \begin{array}{ccc}
#' 0.90 & 0.2 \\
#' 0.10 & 0.8
#' \end{array} \right ]
#' $$
#'
#' This model has two states with different volati
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(10)
library(fMarkovSwitching)
# number of obs
n_T <- 500
# distribution of residuals
distrib <- "Normal"
# number of states
k <- 2
# set transition matrix
P <- matrix(c(.9 ,.2,
.1 ,.8),
nrow = 2,
byrow = T)
# set switching flag
S <- c(0,1)
# set parameters of model (see manual for details)
nS_param <- matrix(0)
S_param <- matrix(0,sum(S),k)
S_param[,1] <- .5
S_param[,2] <- -.5
# set variance of model
sigma <- matrix(0, 1, k)
sigma[1,1] <- sqrt(0.25) # state 1
sigma[1,2] <- 1 # state 2
# build list
Coeff <- list(P = P ,
S = S ,
nS_param = nS_param ,
S_param = S_param ,
sigma = sigma )
# simulate model
my_ms_simul <- MS_Regress_Simul(nr = n_T,
Coeff = Coeff,
k = k,
distrib = distrib)
#'
#' In the simulation function, argument `nS_param`
#'
#' Once the model is simulated and available, let'
#'
## ---- fig.height=my.fig.height, fig.width=my.fig.width--------------------------------------------------------------
library(ggplot2)
df_to_plot <- tibble(y = my_ms_simul@dep,
x = Sys.Date()+1:my_ms_simul@nr,
states = my_ms_simul@trueStates[, 1])
p <- ggplot(data = df_to_plot,
aes(y = y, x = seq_along(y))) +
geom_line() +
labs(title = 'Simulated markov switching process',
x = '',
y = 'Value') +
theme_bw()
print(p)
#'
#' We can also look at the simulated states:
#'
## ---- eval=TRUE, tidy=FALSE, fig.height=my.fig.height, fig.width=my.fig.width---------------------------------------
library(ggplot2)
df_to_plot <- tibble(y = my_ms_simul@dep,
x = Sys.Date()+1:my_ms_simul@nr,
states = my_ms_simul@trueStates[,1])
p <- ggplot(data = df_to_plot,
aes(y = states, x = x)) +
geom_line() +
labs(y = 'Probability of state 1') +
theme_bw()
print(p)
#'
#' As expected, the model is switching from one st
#'
#'
#' ### Estimating Regime Switching Models {#estima
#'
#' We can estimate a univariate Markov switching m
#'
## ---- message=FALSE, results='hide', cache=TRUE---------------------------------------------------------------------
# set dep and indep
dep <- my_ms_simul@dep
indep <- my_ms_simul@indep
# set switching parameters and distribution
S <- c(0,1)
k <- 2
distIn <- "Normal"
# estimate the model
my_ms_model <- MS_Regress_Fit(dep, indep, S, k)
#'
#' Argument `dep` and `indep` sets the variables i
#'
## ---- eval=FALSE----------------------------------------------------------------------------------------------------
## # print estimation output
## # not evaluated due to large output (execute in your r session)
## glimpse(my_ms_model)
#'
#' The estimated coefficients are close to the one
#'
#' As an example with real data, let's estimate th
#'
## ---- message=FALSE,results='hide', cache=TRUE----------------------------------------------------------------------
library(BatchGetSymbols)
df_SP500 <- BatchGetSymbols(tickers = '^GSPC',
first.date = '2010-01-01',
last.date = '2019-01-01')$df.tickers
# set input objects to MS_Regress_Fit
ret <- na.omit(df_SP500$ret.closing.prices)
dep <- matrix(ret, nrow = length(ret))
indep <- matrix(rep(1, length(dep)),nrow = length(dep))
S <- c(1) # where to switch (in this case in the only indep)
k <- 2 # number of states
distIn <- "Normal" #distribution assumption
# estimating the model
my_SP500_MS_model <- MS_Regress_Fit(dep, indep, S, k)
#'
#' And now, we check the result.
#'
## ---- eval=FALSE----------------------------------------------------------------------------------------------------
## # printing output (not evaluated due to large output)
## glimpse(my_SP500_MS_model)
#'
#'
#'
#' The model identified two volatility regimes fro
#'
#' A common figure in the analysis of Markov switc
#'
#'
## ---- fig.height=my.fig.height, fig.width=my.fig.width--------------------------------------------------------------
# get variables for plot
smooth.prob <- as.numeric(my_SP500_MS_model @smoothProb[ , 1])
price <- df_SP500$price.close[2:nrow(df_SP500)]
ref_dates <- df_SP500$ref.date[2:nrow(df_SP500)]
# build long df to plot
df.to.plot <- tibble(type = c(rep('Probabilities Bull Market',
length(smooth.prob)),
rep('SP500',
length(smooth.prob))),
ref.date = rep(ref_dates ,2),
value = c(smooth.prob,
price) )
# plot with ggplot
p <- ggplot(df.to.plot,
aes(y=value, x =ref.date)) +
geom_line(size = 0.5) +
facet_wrap(~type, nrow = 2, scales = 'free_y') +
labs(x = '',
y = 'Value',
title = 'SP500 and its bull market states',
subtitle = 'Prob. from a markov regime switching model',
caption = 'Data from Yahoo Finance')
# plot it!
print(p)
#'
#' The figure shows how the price increases in sta
#'
#'
#' ### Forecasting Regime Switching Models
#'
#' Package `MS_Regress` provides function `MS_Regr
#'
## -------------------------------------------------------------------------------------------------------------------
# make static forecast of regime switching model
newIndep <- 1
my_for <- MS_Regress_For(my_SP500_MS_model , newIndep)
# print output
print(my_for)
#'
#'
#' The model predicts, the day after the last date
#'
#'
#' ## Dealing with Several Models
#'
#' In practice, it is very unlikelly that the rese
#'
#' ### Using `tapply` and `sapply`
#'
#' In chapter \@ref(programming), we learned we co
#'
## -------------------------------------------------------------------------------------------------------------------
set.seed(10)
# set number of stocks
n_stocks <- 4
# load data from .rds
my_f <- afedR::get_data_file('SP500-Stocks-WithRet.rds')
df_stocks <- read_rds(my_f)
# select tickers
my_tickers <- sample(unique(df_stocks$ticker), n_stocks)
# set my_df
df_temp <- df_stocks %>%
dplyr::filter(ticker %in% my_tickers)
# renew factors in ticker
df_temp$ticker <- as.factor(as.character(df_temp$ticker))
#'
#' Now, what we want to do with this data is separ
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
my_l <- tapply(X = df_temp$ret,
INDEX = df_temp$ticker,
FUN = arima,
order = c(1, 0, 0))
#'
#' Each model is available in `my_l`. To retrieve
#'
## -------------------------------------------------------------------------------------------------------------------
# print all coefficients
print(sapply(X = my_l,
FUN = coef))
#'
#' ### Using `by`
#'
#' A limitation of using `tapply` is that we are r
#'
#' For an example, we are going to estimate severa
#'
## -------------------------------------------------------------------------------------------------------------------
# load SP500 data
my_f <- afedR::get_data_file('SP500.csv')
# calculate return
df_sp500$ret <- calc.ret(df_sp500$price.close)
# find location of dates in df_sp500
idx <- match(df_stocks$ref.date,
df_sp500$ref.date)
# create column in my_df with sp500 returns
df_stocks$ret.sp500 <- df_sp500$ret[idx]
#'
#' The next step is to create a function that will
#'
## -------------------------------------------------------------------------------------------------------------------
estimate_beta <- function(df) {
# Function to estimate beta from dataframe of stocks returns
#
# Args:
# df - Dataframe with columns ret and ret.sp500
#
# Returns:
# The value of beta
my_model <- lm(data = df,
formula = ret ~ ret.sp500)
return(coef(my_model)[2])
}
#'
#' Now, we can use the previous function with `by`
#'
## -------------------------------------------------------------------------------------------------------------------
# calculate beta for each stock
my_betas <- by(data = df_stocks,
INDICES = df_stocks$ticker,
FUN = estimate_beta)
glimpse(as.numeric(my_betas))
#'
#' The values of the different `betas` are availab
#'
## ---- eval=TRUE, message=FALSE, fig.height=my.fig.height, fig.width=my.fig.width, warning=FALSE---------------------
library(ggplot2)
df_to_plot <- tibble(betas = as.numeric(my_betas))
p <- ggplot(df_to_plot, aes(x = my_betas)) +
geom_histogram(bins = 40) +
labs(x = 'Betas',
y = 'Frequency',
title = 'Histogram of Betas for SP500 stocks',
subtitle = paste0('Market models estimated with data from ',
min(df_stocks$ref.date), ' to ',
max(df_stocks$ref.date), '\n',
length(unique(df_stocks$ticker)),
' stocks included'),
caption = 'Data from Yahoo Finance') +
theme_bw()
print(p)
#'
#' For the SP500 data, we find no negative value o
#'
#'
#' ### Using `dplyr::group_by`
#'
#' Another way of storing and managing several mod
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
library(dplyr)
my_tab <- df_stocks %>%
group_by(ticker) %>%
do(my_model = arima(x = .$ret, order = c(1,0,0)))
glimpse(my_tab)
#'
#' We have a list-column, called `my_model`, stori
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
my_model_tab <- df_stocks %>%
group_by(ticker) %>%
do(my_model = arima(x = .$ret, order = c(1,0,0))) %>%
mutate(alpha = coef(my_model)[2],
ar1 = coef(my_model)[1])
glimpse(my_model_tab)
#'
#'
#' ## Exercises
#'
## ---- echo=FALSE, results='asis'------------------------------------------------------------------------------------
f_in <- list.files('../02-EOCE-Rmd/Chapter11-FinEcon/',
full.names = TRUE)
compile_eoc_exercises(f_in, type_doc = my_engine)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.