library(data.table)
library(RollingWindow)
library(DescTools)
library(TTR)
library(janitor)
library(PerformanceAnalytics)
library(erf)
library(foreach)
library(parallel)
library(doParallel)
# # SET UP ------------------------------------------------------------------
# # global vars
# PATH = "F:/data/equity/us"
# PRICE DATA --------------------------------------------------------------
# Import QC daily data
prices = fread("F:/lean/data/stocks_daily.csv")
setnames(prices, gsub(" ", "_", c(tolower(colnames(prices)))))
# Remove duplicates
prices = unique(prices, by = c("symbol", "date"))
# Remove duplicates - there are same for different symbols (eg. phun and phun.1)
dups = prices[, .(symbol , n = .N),
by = .(date, open, high, low, close, volume, adj_close,
symbol_first = substr(symbol, 1, 1))]
dups = dups[n > 1]
dups[, symbol_short := gsub("\\.\\d$", "", symbol)]
symbols_remove = dups[, .(symbol, n = .N),
by = .(date, open, high, low, close, volume, adj_close,
symbol_short)]
symbols_remove[n >= 2, unique(symbol)]
symbols_remove = symbols_remove[n >= 2, unique(symbol)]
symbols_remove = symbols_remove[grepl("\\.", symbols_remove)]
prices = prices[symbol %notin% symbols_remove]
# Adjust all columns
prices[, adj_rate := adj_close / close]
prices[, let(
open = open*adj_rate,
high = high*adj_rate,
low = low*adj_rate
)]
setnames(prices, "close", "close_raw")
setnames(prices, "adj_close", "close")
prices[, let(adj_rate = NULL)]
setcolorder(prices, c("symbol", "date", "open", "high", "low", "close", "volume"))
# Remove observations where open, high, low, close columns are below 1e-008
# This step is opional, we need it if we will use finfeatures package
prices = prices[open > 1e-008 & high > 1e-008 & low > 1e-008 & close > 1e-008]
# Sort
setorder(prices, symbol, date)
# Calculate returns
prices[, returns := close / shift(close, 1) - 1]
# Remove missing values
prices = na.omit(prices)
# Set SPY returns as market returns
spy_ret = na.omit(prices[symbol == "spy", .(date, market_ret = returns)])
prices = spy_ret[prices, on = "date"]
# Minimal observations per symbol is 253 days
remove_symbols = prices[, .(symbol, n = .N), by = symbol][n < 253, symbol]
prices = prices[symbol %notin% remove_symbols]
# Free memory
gc()
# FILTERING ---------------------------------------------------------------
# Add label for most liquid asssets
prices[, dollar_volume := close_raw * volume]
# Create list of symbols arranged by dollar liquidity from 2020 to today
symbols_liquid = prices[date >= as.Date("2020-01-01"),
.(dollar_volume_sum = sum(dollar_volume, na.rm = TRUE)),
by = symbol][order(-dollar_volume_sum)]
# Remove columns we don't need
prices[, dollar_volume := NULL]
# PREDICTORS --------------------------------------------------------------
# Rolling beta
setorder(prices, symbol, date)
prices = prices[, beta := RollingBeta(market_ret, returns, 252, na_method = "ignore"),
by = symbol]
# Highest beta by date - 5% of symbols by highest beta
prices[, beta_rank := frank(abs(beta), ties.method = "dense", na.last = "keep"), by = date]
prices[, beta_rank_pct := beta_rank / max(beta_rank, na.rm = TRUE), by = date]
prices[, beta_rank_largest_99 := 0, by = date]
prices[beta_rank_pct > 0.99, beta_rank_largest_99 := 1, by = date]
prices[, beta_rank_largest_95 := 0, by = date]
prices[beta_rank_pct > 0.95, beta_rank_largest_95 := 1, by = date]
prices[, beta_rank_largest_90 := 0, by = date]
prices[beta_rank_pct > 0.90, beta_rank_largest_90 := 1, by = date]
setorder(prices, symbol, date)
# Momentum predictors
months_size = c(3, 6, 9, 12)
mom_vars = paste0("momentum_", months_size)
f_ = function(x, n) {
shift(x, 21) / shift(x, n * 21) - 1
}
prices[, (mom_vars) := lapply(months_size, function(x) f_(close, x)), by = symbol]
# Momentum ensambles
weights_ = c(12, 6, 3, 1) / sum(c(12, 6, 3, 1))
prices[, momentum_average := momentum_3 * weights_[1] +
momentum_6 * weights_[2] +
momentum_9 * weights_[3] +
momentum_12 * weights_[4]]
# Dolar volume z-score
dv_cols = paste0("dollar_volume_zscore_winsorized", months_size)
f_ = function(x, y, z) RollingZscore(as.numeric(x * y), z, na_method = "ignore")
prices[, (dv_cols) := lapply(months_size, function(s) as.vector(f_(close_raw, volume, s * 21))),
by = symbol]
prices[, (dv_cols) := lapply(.SD, function(x) Winsorize(x, val = c(-5, 5))),
.SDcols = dv_cols]
# Tecnical indicators
prices[, rsi := RSI(close, n = 14), by = symbol]
# Remove columns we don't need
prices[, c("beta_rank", "beta_rank_pct", "open", "high", "low", "volume") := NULL]
# ESTIMATION --------------------------------------------------------------
# Keep data we need
dt = na.omit(prices[symbol %in% symbols_liquid[, head(symbol, 200)]])
# Create target variable
setorder(dt, symbol, date)
dt[, target := shift(close, 1, type = "lead") / close - 1, by = symbol]
dt = na.omit(dt)
# Shift predictors
nonx_cols = c("symbol", "date", "close", "close_raw", "returns", "target")
predictors = dt[, colnames(.SD), .SDcols = -nonx_cols]
# Select some date interval
dates = seq.Date(from = as.Date("2019-01-01"), to = Sys.Date(), by = "day")
dates = as.Date(intersect(dates, dt[, unique(date)]))
# Loop over date and symbols and train erf model extract predictions
quantile_levels = c(0.01, 0.05, 0.95, 0.99)
setorder(dt, symbol ,date)
for (s in dt[, unique(symbol)]) {
# debug
print(s)
# Check if already estimated
file_name = paste0("F:/strategies/evtq/", s, ".csv")
if (file.exists(file_name)) next
# sample data
dt_ = dt[symbol == s]
# estimation
s_ = Sys.time()
cl = makeCluster(4L)
registerDoParallel(cl)
# l = foreach(i = 1:100) %do% {
l = foreach(i = 1:length(dates),
.packages = c("data.table", "erf", "janitor"),
.export = c("dt_", "s")) %dopar% {
d = dates[i]
# for (d in dates[1:20]) {
# d = dates[1]
# Train data
dtd = dt_[date < d]
if (nrow(dtd) < 252) return(NULL)
if (as.Date(d) - dt_[, as.Date(max(date))] > 2) return(NULL)
# Test data
test_data = dt_[date == d]
if (nrow(test_data) == 0) return(NULL)
# Fit model for upper
train_data_upper = dtd[returns > 0]
erf_model_upper = erf(
X = as.matrix(train_data_upper[, .SD, .SDcols = predictors]),
Y = train_data_upper[, target],
min.node.size = 5,
lambda = 0.001,
intermediate_quantile = 0.8
)
# Fit model for lower
train_data_lower = dt_[returns < 0]
erf_model_lower = erf(
X = as.matrix(train_data_lower[, .SD, .SDcols = predictors]),
Y = -train_data_lower[, target],
min.node.size = 5,
lambda = 0.001,
intermediate_quantile = 0.8
)
# Predict
erf_predictions_upper = predict(
erf_model_upper,
as.matrix(test_data[, .SD, .SDcols = predictors]),
quantiles = quantile_levels
)
erf_predictions_lower = predict(
erf_model_lower,
as.matrix(test_data[, .SD, .SDcols = predictors]),
quantiles = quantile_levels
)
erf_predictions_upper = clean_names(as.data.frame(erf_predictions_upper))
colnames(erf_predictions_upper) = paste0("upper_", colnames(erf_predictions_upper))
erf_predictions_lower = clean_names(as.data.frame(erf_predictions_lower))
colnames(erf_predictions_lower) = paste0("lower_", colnames(erf_predictions_lower))
cbind(symbol = s, date = d, erf_predictions_upper, erf_predictions_lower,
targetr = test_data[, target])
}
stopCluster(cl)
# Check time
e = Sys.time()
print(e-s_)
# Clean and save
x = rbindlist(l)
fwrite(x, file_name)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.