library(data.table)
library(mrisk)
library(leanr)
library(httr)
library(rvest)
library(ggplot2)
library(purrr)
library(cpm)
library(runner)
# risk models
library(GAS)
library(evir)
library(quarks)
# performance
library(parallel)
library(future.apply)
# IMPORT DATA -------------------------------------------------------------
# import data
save_path <- "D:/risks/risk-factors"
freq <- "minute"
save_path <- file.path(save_path, freq)
stocks <- import_lean('D:/market_data/equity/usa/hour/trades_adjusted')
# helper columns: number of observations per symbol and simple close-to-close returns
stocks[, N_ := .N, by = symbol]
setorder(stocks, symbol, datetime)
stocks[, returns := close / shift(close) - 1, by = symbol]
stocks <- na.omit(stocks)
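# Quick illustrative sanity check (not required for the pipeline): inspect the
# first few return observations for one symbol to confirm the columns above.
head(stocks[symbol == symbol[1], .(datetime, close, returns)], 5)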
#
# helper function: summarise a forecast path (first value, ~1 day, ~1 week, full window, sd)
get_series_statistics <- function(series) {
  var_1 <- series[1]                           # one-step-ahead forecast
  var_day <- mean(series[1:8], na.rm = TRUE)   # ~1 trading day of hourly bars
  var_week <- mean(series[1:40], na.rm = TRUE) # ~1 trading week
  var_month <- mean(series, na.rm = TRUE)      # full forecast window
  var_std <- sd(series, na.rm = TRUE)
  return(list(var_1 = var_1, var_day = var_day, var_week = var_week,
              var_month = var_month, var_std = var_std))
}
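# Example (illustrative only): summarise a dummy forecast path with the helper
# above; `example_stats` is a throwaway name used just for this check.
example_stats <- get_series_statistics(seq(0.01, 0.02, length.out = 150))
str(example_stats)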
# VAR AND ES RISK MEASURES -----------------------------------------------------
# parameters
symbol <- setdiff(unique(stocks$symbol), 'SPY')
p <- c(0.95, 0.975, 0.99)
win_size <- c(100, 200, 400, 800)
model <- c("EWMA", "GARCH")
method <- c("plain", "age", "vwhs", "fhs")
forecast_length <- 150
params <- expand.grid(symbol, p, model, method, win_size, forecast_length, stringsAsFactors = FALSE)
colnames(params) <- c("symbol", "p", "model", "method", "win_size", "forecast_length")
params <- params[!(params$model == 'GARCH' & params$method == "fhs"), ] # exclude GARCH + fhs combinations
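# Illustrative check of the parameter grid defined above: how many
# symbol/parameter combinations will be estimated, and a preview of the grid.
nrow(params)
head(params)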
# get forecasts
for (i in 1:nrow(params)) {
  # parameters for this iteration
  params_ <- params[i, ]
  print(params_)
  # create the symbol directory if it doesn't already exist
  if (!dir.exists(file.path(save_path, tolower(params_$symbol)))) {
    dir.create(file.path(save_path, tolower(params_$symbol)))
  }
  # create the file name (p is temporarily scaled by 1000 so the name has no decimal point)
  params_$p <- params_$p * 1000
  file_name <- paste0(paste0(paste(params_, sep = "_"), collapse = "_"), ".csv")
  file_name <- file.path(save_path, tolower(params_$symbol), file_name)
  params_$p <- params_$p / 1000
  # skip if the file already exists
  if (file.exists(file_name)) {
    next
  }
  # take the sample for this symbol
  sample_ <- stocks[symbol == params_$symbol]
  # skip if there are fewer rows than params_$win_size + params_$forecast_length
  if (nrow(sample_) < params_$win_size + params_$forecast_length) {
    next
  }
  # set up cluster for the rolling estimation
  cl <- makeCluster(16)
  clusterExport(cl, c("sample_", "params_", "get_series_statistics"), envir = environment())
  # rolling VaR/ES estimation with quarks::rollcast
  roll_quarks <- runner(
    x = data.frame(y = sample_$returns),
    f = function(x) {
      library(quarks)
      library(data.table)
      print(x[1])
      if (params_$method == "fhs") {
        y <- rollcast(x,
                      p = params_$p,
                      model = params_$model,
                      method = params_$method,
                      nout = params_$forecast_length,
                      nwin = params_$win_size,
                      nboot = 1000)
      } else {
        y <- rollcast(x,
                      p = params_$p,
                      model = params_$model,
                      method = params_$method,
                      nout = params_$forecast_length,
                      nwin = params_$win_size)
      }
      VaR <- as.data.table(get_series_statistics(y$VaR))
      ES <- as.data.table(get_series_statistics(y$ES))
      colnames(ES) <- gsub("var_", "es_", colnames(ES))
      return(cbind.data.frame(VaR, ES))
    },
    k = params_$win_size + params_$forecast_length,
    na_pad = TRUE,
    cl = cl
  )
  stopCluster(cl)
  # bind the rolling results and drop the NA padding column
  risks <- rbindlist(lapply(roll_quarks, as.data.frame), fill = TRUE)[, -1]
  risks <- cbind.data.frame(datetime = sample_$datetime, risks)
  # save
  fwrite(risks, file_name)
}
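# A minimal sketch (assuming the per-symbol folder layout written above) of how
# the saved VaR/ES forecast files could be collected back into one table;
# `risk_files`, `risk_list` and `risk_all` are illustrative names only.
risk_files <- list.files(save_path, pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
risk_list <- lapply(risk_files, fread)
names(risk_list) <- basename(risk_files)
risk_all <- rbindlist(risk_list, fill = TRUE, idcol = "file")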
# CHANGEPOINTS (FROM HERE OLD CODE) ---------------------------------------
# estimate changepoints using cpm package
arl0 <- c(370, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000,
          7000, 8000, 9000, 10000, 20000, 30000, 40000, 50000)
get_changepoints <- function(returns, method, arl0) {
  # sequential change-point detection over the return series
  detectiontimes <- numeric()
  changepoints <- numeric()
  cpm <- makeChangePointModel(cpmType = method, ARL0 = arl0, startup = 200)
  i <- 0
  while (i < length(returns)) {
    i <- i + 1
    # process each observation in turn
    cpm <- processObservation(cpm, returns[i])
    # if a change has been found, log it, and reset the CPM
    if (changeDetected(cpm) == TRUE) {
      detectiontimes <- c(detectiontimes, i)
      # the change point estimate is the maximum D_kt statistic
      Ds <- getStatistics(cpm)
      tau <- which.max(Ds)
      if (length(changepoints) > 0) {
        tau <- tau + changepoints[length(changepoints)]
      }
      changepoints <- c(changepoints, tau)
      # reset the CPM
      cpm <- cpmReset(cpm)
      # resume monitoring from the observation following the change point
      i <- tau
    }
  }
  points <- cbind.data.frame(detectiontimes, changepoints)
  breaks <- rep(FALSE, length(returns))
  breaks[detectiontimes] <- TRUE
  change <- rep(FALSE, length(returns))
  change[changepoints] <- TRUE
  return(cbind.data.frame(breaks, change))
}
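# Toy example (simulated data, illustrative only): a variance shift halfway
# through the series should be flagged by the Mood statistic; `sim_returns`
# and `sim_cpts` are throwaway names.
set.seed(123)
sim_returns <- c(rnorm(400, 0, 0.01), rnorm(400, 0, 0.03))
sim_cpts <- get_changepoints(sim_returns, method = "Mood", arl0 = 500)
which(sim_cpts$change) # estimated change-point locations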
# estimate breaks
for (i in c(370, 500, 1000, 5000)) {
stocks[, paste(c('breaks', 'changes'), i, sep = '_') := get_changepoints(returns, method = 'Mood', i), by = .(symbol)]
}
# save
column_save <- c('symbol', 'datetime', colnames(stocks)[grep('breaks|chang', colnames(stocks))])
fwrite(stocks[, ..column_save], file.path(save_path, 'changepoints.csv'))
# GPD --------------------------------------------------------
# prepare data
DT <- as.data.table(stocks)
DT <- DT[, .(symbol, datetime, close, returns)]
setnames(DT, colnames(DT), c('symbol', 'date', 'close', 'return'))
DT <- DT[DT[, .N, by = .(symbol)][N > 2500], on = "symbol"] # keep stocks with at least ~1 year of hourly data
DT <- na.omit(DT)
# GPD ES and VaR
roll_windows <- c(100, 250) # rolling windows in hourly bars (~12 and ~31 trading days at 8 bars/day)
thresholds <- c(0.01, 0.015)
parameters <- expand.grid(roll_windows, thresholds, stringsAsFactors = FALSE)
colnames(parameters) <- c('w', 'threshold')
p <- c(0.999, 0.9999)
# estimate rolling GPD tail statistics (quantiles and expected shortfall at levels p)
estimate_gpd <- function(returns, threshold, window, p, ncores = 8L) {
  cl <- makeCluster(ncores)
  clusterExport(cl, c("returns", "threshold", "window", "p"), envir = environment())
  gpd_roll <- runner::runner(
    x = returns,
    f = function(x) {
      # fit a GPD to the exceedances in the window; return NA if the fit fails
      gpd_est_last <- tryCatch({
        library(data.table)
        gpd_est <- evir::gpd(x, threshold = threshold, method = 'ml', information = 'expected')
        es_q <- evir::riskmeasures(gpd_est, p)
        es_q <- as.data.frame(t(as.vector(es_q[, 2:ncol(es_q)])))
        es_q <- cbind.data.frame(es_q, gpd_est$n.exceed, gpd_est$par.ests[1], gpd_est$par.ests[2])
        # column names encode the measure (q/e), level, window and threshold
        colnames(es_q) <- c(paste0(apply(expand.grid(c('q_', 'e_'), p * 10000), 1, function(x) paste0(x, collapse = '')),
                                   '_', window, '_', threshold * 1000),
                            paste('n_exceed', window, threshold * 1000, sep = '_'),
                            paste('xi', window, threshold * 1000, sep = '_'),
                            paste('beta', window, threshold * 1000, sep = '_'))
        es_q
      }, error = function(e) data.table(NA))
      return(gpd_est_last)
    },
    k = window,
    na_pad = TRUE,
    cl = cl
  )
  stopCluster(cl)
  gpd_roll <- rbindlist(gpd_roll, fill = TRUE)
  return(gpd_roll)
}
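# Single-window illustration on simulated heavy-tailed data of the quantile (q_)
# and expected-shortfall (e_) figures produced above; `sim_losses` and `gpd_fit`
# are made-up names for this example only.
set.seed(42)
sim_losses <- abs(rt(1000, df = 3)) * 0.01
gpd_fit <- evir::gpd(sim_losses, threshold = 0.01, method = 'ml', information = 'expected')
evir::riskmeasures(gpd_fit, c(0.999, 0.9999)) # columns: p, quantile, sfall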
# wrapper: estimate GPD risks across all parameter combinations
gpd_risks_estimate <- function(returns, no_cores = 16) {
  gpd_params <- lapply(1:nrow(parameters), function(i) {
    # V1 is the NA column produced by windows where the GPD fit failed
    estimate_gpd(returns, parameters[i, 2], parameters[i, 1], p, no_cores)[, V1 := NULL]
  })
  gpd_params <- do.call(cbind, gpd_params)
  return(gpd_params)
}
# GPD risks for the left (losses, sign flipped) and right (gains) tails
gpd_risks_left_tail <- DT[, gpd_risks_estimate(return * -1, no_cores = 25), by = .(symbol)]
gpd_risks_right_tail <- DT[, gpd_risks_estimate(return, no_cores = 25), by = .(symbol)]
# convert to numeric because some columns come back as character
cols <- colnames(gpd_risks_right_tail)[grep('q_|e_', colnames(gpd_risks_right_tail))]
gpd_risks_right_tail[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
gpd_risks_left_tail[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
# cbind date column
gpd_risks_right_tail <- cbind(date = DT$date, gpd_risks_right_tail)
gpd_risks_left_tail <- cbind(date = DT$date, gpd_risks_left_tail)
# read previously saved results and merge in the new estimates (full outer join)
gpd_risks_left_tail_old <- fread(file.path(save_path, 'gpd_risks_left_tail.csv'), sep = ';')
gpd_risks_right_tail_old <- fread(file.path(save_path, 'gpd_risks_right_tail.csv'), sep = ';')
gpd_risks_left_tail_old <- merge(gpd_risks_left_tail_old, gpd_risks_left_tail,
by = c("symbol", 'date'), all.x = TRUE, all.y = TRUE)
gpd_risks_right_tail_old <- merge(gpd_risks_right_tail_old, gpd_risks_right_tail,
by = c("symbol", 'date'), all.x = TRUE, all.y = TRUE)
# save
fwrite(gpd_risks_left_tail_old, file.path(save_path, 'gpd_risks_left_tail.csv'), sep = ';')
fwrite(gpd_risks_right_tail_old, file.path(save_path, 'gpd_risks_right_tail.csv'), sep = ';')
# PTSUITE -----------------------------------------------------------------
# function to calculate all Pareto-tail measures (this reuses the name estimate_gpd, shadowing the helper above)
estimate_gpd <- function(x, hill_threshold = 0.02, suffix = '_right') {
  columns_names <- c(
    'ptest',
    'hill_shape',
    'scales_geometric_percentiles_method',
    'scales_least_squares',
    'scales_method_of_moments',
    'scales_modified_percentiles_method',
    'scales_weighted_least_squares',
    'shapes_geometric_percentiles_method',
    'shapes_least_squares',
    'shapes_method_of_moments',
    'shapes_modified_percentiles_method',
    'shapes_weighted_least_squares'
  )
  columns_names <- paste0(columns_names, suffix)
  if (length(x) == 0) {
    # no tail observations in the window: return zeros
    risks <- data.table(t(rep(0, length(columns_names))))
    setnames(risks, colnames(risks), columns_names)
  } else {
    ptest <- pareto_test(x)$`p-value`
    estimates <- as.data.table(generate_all_estimates(x))
    shapes <- data.table::dcast(estimates[, 1:2], . ~ `Method.of.Estimation`, value.var = 'Shape.Parameter')
    shapes <- shapes[, 2:ncol(shapes)]
    colnames(shapes) <- paste0('shapes_', colnames(clean_names(shapes)))
    scales <- data.table::dcast(estimates[, c(1, 3)], . ~ `Method.of.Estimation`, value.var = 'Scale.Parameter')
    scales <- scales[, 2:ncol(scales)]
    colnames(scales) <- paste0('scales_', colnames(clean_names(scales)))
    hill_estimate <- alpha_hills(x, hill_threshold, FALSE)
    hill_shape <- hill_estimate$shape
    risks <- as.data.table(data.frame(as.list(c(scales, shapes))))
    risks <- cbind(ptest, hill_shape, risks)
    colnames(risks) <- paste0(colnames(risks), suffix)
  }
  return(risks)
}
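# Illustrative ptsuite usage on simulated Pareto data (shape = 3, scaled to the
# 2% threshold); only functions already used in the helper above are called,
# and `sim_tail` is made up for this example.
library(ptsuite)
set.seed(1)
sim_tail <- 0.02 / runif(300)^(1 / 3) # Pareto draws above the 2% threshold
pareto_test(sim_tail)$`p-value`       # goodness-of-fit p-value
generate_all_estimates(sim_tail)      # shape/scale estimates by several methods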
# rolling Pareto-tail measures from the ptsuite package
roll_gpd <- function(returns, window_ = 500, threshold = 0.02) {
  library(parallel)
  # calculate tail measures on rolling windows
  cl <- makeCluster(16)
  clusterExport(cl, c("estimate_gpd", "window_", "threshold"), envir = environment())
  evt <- runner::runner(
    x = returns,
    f = function(x) {
      library(data.table)
      library(ptsuite)
      library(janitor)
      # pareto left tail (losses, sign flipped)
      x_left <- x[x < -threshold] * -1
      risks_left <- estimate_gpd(x_left, hill_threshold = 0.02, suffix = '_left')
      # pareto right tail (gains)
      x_right <- x[x > threshold]
      risks_right <- estimate_gpd(x_right, hill_threshold = 0.02)
      # combine both tails
      pareto_tests <- cbind(risks_left, risks_right)
      return(pareto_tests)
    },
    k = window_,
    na_pad = TRUE,
    type = 'auto',
    cl = cl
  )
  stopCluster(cl)
  risks <- rbindlist(evt, fill = TRUE)
  return(risks)
}
# roll gpd across params (placeholder, not yet implemented)
roll_ptsuite_params <- function() {
}
# estimate indicators
DT_sample <- na.omit(stocks[N_ > 1000])
# get output column names from a short dry run, then estimate per symbol
cols <- colnames(roll_gpd(DT_sample[1:50, (returns)], window_ = 20, threshold = 0.02))
DT_sample[, (cols) := roll_gpd(returns), by = symbol]
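# Illustrative peek at the newly added rolling Pareto-tail columns; the first
# rows are NA because of the rolling-window padding.
DT_sample[1:3, c("symbol", "datetime", head(cols, 3)), with = FALSE]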
# GAS PREDICTION ----------------------------------------------------------
# rolling forecasts from a generalized autoregressive score (GAS) model
estimate_gas <- function(date_return, dist = 'sstd',
                         scale_type = 'Identity', insample_length = 1000,
                         alphas = c(0.001, 0.01, 0.025, 0.05, 0.95, 0.99, 0.999),
                         no_cores = 25) {
  # GAS specification
  GASSpec <- UniGASSpec(
    Dist = dist,
    ScalingType = scale_type,
    GASPar = list(location = TRUE, scale = TRUE, skewness = TRUE, shape = TRUE))
  # perform 1-step-ahead rolling forecasts with refit
  cluster <- makeCluster(no_cores)
  Roll <- UniGASRoll(
    as.xts.data.table(date_return),
    GASSpec,
    Nstart = insample_length,
    RefitEvery = 5,
    RefitWindow = c("moving"),
    cluster = cluster)
  stopCluster(cluster)
  # forecast statistics: density forecasts, VaR and ES at the requested levels
  forecasts_gas <- getForecast(Roll)
  var_gas <- quantile(Roll, probs = alphas) # VaR
  colnames(var_gas) <- paste0('var_gas_', as.character(alphas * 1000))
  es_gas <- GAS::ES(Roll, probs = alphas)
  colnames(es_gas) <- paste0('es_gas_', as.character(alphas * 1000))
  # combine and tag columns with the parameter combination
  data <- as.data.table(cbind(forecasts_gas, var_gas, es_gas))
  colnames(data) <- paste(dist, scale_type, insample_length, colnames(data), sep = '_')
  return(data)
}
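# Minimal GAS illustration on simulated returns (in-sample fit plus a one-step
# forecast, not the rolling setup above); `sim_ret`, `gas_spec` and `gas_fit`
# are throwaway names for this sketch.
set.seed(7)
sim_ret <- rnorm(1000, 0, 0.01)
gas_spec <- UniGASSpec(Dist = "sstd", ScalingType = "Identity",
                       GASPar = list(location = TRUE, scale = TRUE,
                                     skewness = TRUE, shape = TRUE))
gas_fit <- UniGASFit(gas_spec, sim_ret)
UniGASFor(gas_fit, H = 1) # one-step-ahead density forecast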
# estimate GAS risks over the parameter grid
risk_estimate_gas <- function(date_return, params, ncores = 25) {
  output <- lapply(1:nrow(params), function(i) {
    estimate_gas(date_return, params[i, 1], params[i, 2], params[i, 3], ncores)
  })
  output <- do.call(cbind, output)
  return(output)
}
# params
distributions <- c('sstd')
scale_type <- c("Identity", "Inv")
insample_length <- c(500, 1000, 2000)
params <- expand.grid(distributions, scale_type, insample_length, stringsAsFactors = FALSE)
# prepare data
DT <- as.data.table(stocks)
DT <- DT[, .(symbol, datetime, close, returns)]
setnames(DT, colnames(DT), c('symbol', 'date', 'close', 'return'))
DT <- DT[DT[, .N, by = .(symbol)][N > 2500], on = "symbol"] # keep stocks with at least ~1 year of hourly data
DT <- na.omit(DT)
# GAS risks
DT <- DT[symbol %in% c("MMM", "SPY")]
date_return <- DT[symbol == "SPY", .(date, return)]
gas_risks <- DT[, risk_estimate_gas(data.table(date, return), params, ncores = 20), by = .(symbol)]
# BACKTEST ----------------------------------------------------------------
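# A minimal backtesting sketch (simulated returns and a constant normal VaR,
# purely for illustration): GAS::BacktestVaR runs the unconditional/conditional
# coverage and dynamic quantile tests on realised returns vs. VaR forecasts;
# `bt_returns` and `bt_var` are made-up names.
set.seed(11)
bt_returns <- rnorm(250, 0, 0.01)
bt_var <- rep(qnorm(0.05, 0, 0.01), 250) # constant 5% left-tail VaR forecast
GAS::BacktestVaR(data = bt_returns, VaR = bt_var, alpha = 0.05)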
# PLAYGROUND --------------------------------------------------------------
# merge all risk factors for one symbol
symb <- 'ETSY'
stock <- stocks[symbol == symb]
risk_left <- gpd_risks_left_tail[symbol == symb]
colnames(risk_left) <- paste0('left_', colnames(risk_left))
risk_right <- gpd_risks_right_tail[symbol == symb]
colnames(risk_right) <- paste0('right_', colnames(risk_right))
s <- cbind(stock, risk_left, risk_right)
s <- s[left_e_9999_2000_15 < 1 & left_e_9999_2000_15 > -1]
# plot
ggplot(s, aes(x = datetime)) +
  geom_line(aes(y = close)) +
  geom_line(aes(y = left_e_9999_1000_15 * 10), color = 'red') +
  geom_line(aes(y = right_e_9999_1000_15 * 10), color = 'blue')
ggplot(s[datetime %between% c('2015-01-01', '2021-01-01')], aes(x = datetime)) +
  geom_line(aes(y = close)) +
  geom_line(aes(y = left_e_9999_1000_15 * 100), color = 'red') +
  geom_line(aes(y = right_e_9999_1000_15 * 100), color = 'blue')
ggplot(s, aes(x = datetime)) +
  geom_line(aes(y = close)) +
  geom_line(aes(y = left_n_exceed_2000_15), color = 'red')
# portfolio investing
# SAVE DATA FOR ML MODEL --------------------------------------------------
# convert risk to data.table
risks_dt <- lapply(risks, as.data.table)
names(risks_dt) <- names(estimated)
risks_dt <- lapply(risks_dt, setnames, 'index', 'Datetime')
# risks_dt <- rbindlist(risks_dt, idcol = TRUE)
save(risks_dt, file = file.path('mldata', 'risks.rda'))
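# Illustrative read-back of the saved object (same path as above), assuming the
# list names set earlier are symbols; `risks_panel` is an example name that
# mirrors the commented-out rbindlist step.
load(file.path('mldata', 'risks.rda')) # restores `risks_dt`
risks_panel <- rbindlist(risks_dt, idcol = "symbol")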