library(data.table)
library(AzureStor)
library(httr)
library(roll)
library(ggplot2)
library(duckdb)
library(lubridate)
library(gausscov)
library(fs)
library(future.apply)
library(forecast)
library(matrixStats)
# DATA --------------------------------------------------------------------
# get securities data from QC
get_sec = function(symbol) {
con <- dbConnect(duckdb::duckdb())
query <- sprintf("
SELECT *
FROM 'F:/lean_root/data/all_stocks_daily.csv'
WHERE Symbol = '%s'
", symbol)
data_ <- dbGetQuery(con, query)
dbDisconnect(con)
data_ = as.data.table(data_)
data_ = data_[, .(date = Date, close = `Adj Close`)]
data_[, returns := close / shift(close) - 1]
data_ = na.omit(data_)
return(data_)
}
spy_dt = get_sec("spy")
tlt_dt = get_sec("tlt")
######## THIS TO QUANTDATA ###########
# # import FRED meta
# fred_meta = fread("F:/macro/fred_series_meta.csv")
# fred_meta = unique(fred_meta, by = "id")
#
# # import FRED series data
# fred_files = dir_ls("F:/macro/fred")
# # plan("multisession", workers = 6)
# fred_l = lapply(fred_files, function(x) tryCatch(fread(x), error = function(e) NULL))
# length(fred_l)
# fred_l = fred_l[!sapply(fred_l, is.null)]
# fred_l = fred_l[!sapply(fred_l, function(x) nrow(x) == 0)]
# fred_l = future_lapply(fred_l, function(dt_) dt_[, value := as.numeric(value)])
#
# # solve date problem
# fred_l = future_lapply(fred_l, function(dt) {
# if (dt[, all(vintage == 1)]) {
# dt[, date_real := realtime_start]
# } else {
# dt[, date_real := date]
# }
# })
#
# # merge all series
# fred_dt = rbindlist(fred_l, fill = TRUE)
# setorder(fred_dt, series_id, date)
#
# # merge meta data
# fred_dt = fred_meta[, .(series_id = id, frequency_short, popularity, observation_start, units, id_category)][
# fred_dt, on = "series_id"]
#
# # save fred data
# fwrite(fred_dt, "F:/macro/fred.csv")
######## THIS TO QUANTDATA ###########
# import fred series
fred_dt = fread("F:/macro/fred.csv")
# remove unnecessary columns
fred_dt = fred_dt[, .(series_id, date = date_real, frequency_short, popularity,
units, id_category, value)]
# filter daily and weekly data
fred_dt_w = fred_dt[frequency_short %in% c("D", "W", "BW", "M")]
# diff if necessary
fred_dt_w = fred_dt_w[, ndif := ndiffs(value), by = series_id]
fred_dt_w[, unique(ndif), by = series_id][, .N, by = V1]
fred_dt_w[ndif > 0, value_diff := c(rep(NA, unique(ndif)), diff(value, unique(ndif))), by = series_id]
# check for duplicates
date_series = fred_dt_w[, .N, by = .(date, series_id)]
fred_dt_w = unique(fred_dt_w,
fromLast = TRUE,
by = c("series_id", "date"))
# reshape - variables to columns
fred_dt_w = dcast(fred_dt_w[, .(date, series_id, value_diff)],
date ~ series_id,
value.var = "value_diff")
setorder(fred_dt_w, date)
fred_dt_w[1:10, 1:10]
# locf vars
predictors = colnames(fred_dt_w)[2:ncol(fred_dt_w)]
fred_dt_w[, (predictors) := lapply(.SD, nafill, type = "locf"), .SDcols = predictors]
# downsample TLT to weekly
tlt = copy(tlt_dt)
tlt[, week := floor_date(date, "month")]
tlt[, close := tail(close, 1), by = week]
tlt = tlt[, tail(.SD, 1), by = week]
# add target variable
DT = fred_dt_w[tlt[, .(date, close)], on = "date", roll = Inf]
DT = DT[date > as.Date("2002-01-01")]
cols = c("date", "close", predictors)
DT = DT[, ..cols]
DT[, `:=`(
ret_1 = shift(close, n = -1, type = "shift") / close -1,
ret_2 = shift(close, n = -2, type = "shift") / close -1,
ret_3 = shift(close, n = -3, type = "shift") / close -1,
ret_4 = shift(close, n = -4, type = "shift") / close -1,
ret_12 = shift(close, n = -12, type = "shift") / close -1
)]
# DT[, `:=`(
# ret_d = shift(close, n = -1, type = "shift") / close -1,
# ret_w = shift(close, n = -5, type = "shift") / close -1,
# ret_bw = shift(close, n = -10, type = "shift") / close -1,
# ret_m = shift(close, n = -22, type = "shift") / close -1,
# ret_q = shift(close, n = -66, type = "shift") / close -1
# )]
# lag all variables - IMPORTANT !
DT[, (predictors) := lapply(.SD, shift), .SDcols = predictors]
# MODEL -------------------------------------------------------------------
# define predictors
non_pred_cols = c("date", "date_merge_dt", "week", "close", "volume",
"date_merge_tlt", "ret_1", "ret_2", "ret_3", "ret_4", "ret_12")
predictors = setdiff(colnames(DT), non_pred_cols)
DT = na.omit(DT, cols = "ret_1")
X = DT[, ..predictors]
y = DT[, ret_1]
dim(X)
length(y)
# remove columns with many NA values
nas_cols_train = colSums(is.na(X))
nas_cols_train = nas_cols_train / nrow(X)
non_na = names(nas_cols_train)[nas_cols_train < 0.1]
X = X[, ..non_na]
dim(X)
X = as.matrix(X)
isna_ind = rowAnyNAs(X)
X = X[!isna_ind, ]
y = y[!isna_ind]
dim(X)
length(y)
rowany
# # remove INF values if exists
# X_cols = c(predictors, "y_week")
# any(vapply(dt_model[, ..X_cols], function(x) any(is.infinite(x)), FUN.VALUE = logical(1L)))
# prepare X and y
# char_cols = X[, colnames(.SD), .SDcols = is.factor]
# X[, (char_cols) := lapply(.SD, as.integer), .SDcols = char_cols]
# X[, ..char_cols]
# X = na.omit(X)
# formula <- as.formula(paste(" ~ (", paste(colnames(X), collapse = " + "), ")^2"))
# X = model.matrix(formula, X)
# X = as.matrix(X)
# insample feature importance using gausscov
f1st_res = f1st(y = y, x = X, p0=0.01)
f1st_res = f1st_res[[1]]
f1st_res_index = f1st_res[f1st_res[, 1] != 0, , drop = FALSE]
colnames(X)[f1st_res_index[, 1]]
lm(y ~ X[, colnames(X)[f1st_res_index[, 1]]])
f3st_res = f3st(y = y, x = X, m = 3, p = 0.1)
res_index <- unique(as.integer(f3st_res[[1]][1, ]))[-1]
res_index <- res_index [res_index != 0]
colnames(X)[res_index]
# visualize most important variables
important_vars = c("GS10",
"change_in_nonrept_long_all:ret_3")
X_imp = cbind(date = dataset[, date],
close = dataset[, close],
X[, important_vars], y)
X_imp = as.data.table(X_imp)
X_imp[, date := as.POSIXct(date)]
X_imp[, signal := ifelse(shift(`change_in_noncomm_spead_all:ret_6`) < 300, 0, 1)]
X_imp[, signal2 := ifelse(shift(`change_in_nonrept_long_all:ret_3`) < 100, 0, 1)]
ggplot(X_imp, aes(x = date)) +
geom_line(aes(y = `change_in_noncomm_spead_all:ret_6`))
ggplot(X_imp, aes(x = date)) +
geom_line(aes(y = `change_in_nonrept_long_all:ret_3`))
ggplot(X_imp[1:200], aes(x = date)) +
geom_line(aes(y = `change_in_noncomm_spead_all:ret_6`))
ggplot(X_imp[800:1000], aes(x = date)) +
geom_line(aes(y = `change_in_noncomm_spead_all:ret_6`))
ggplot(X_imp[1000:nrow(X_imp)], aes(x = date)) +
geom_line(aes(y = `change_in_noncomm_spead_all:ret_6`))
ggplot(na.omit(X_imp), aes(x = date, color = as.factor(signal))) +
geom_line(aes(y = close))
ggplot(na.omit(X_imp[1000:nrow(X_imp)]), aes(x = date, y = close, color = signal)) +
geom_line(size = 1)
ggplot(na.omit(X_imp[800:1000]), aes(x = date, y = close, color = signal)) +
geom_line(size = 1)
ggplot(na.omit(X_imp[1000:nrow(X_imp)]), aes(x = date, y = close, color = signal2)) +
geom_line(size = 1)
ggplot(na.omit(X_imp[800:1000]), aes(x = date, y = close, color = signal2)) +
geom_line(size = 1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.