library(HistDAWass)
library(data.table)
library(ggplot2)
library(QuantTools)
library(future.apply)
library(PerformanceAnalytics)
library(runner)
library(tiledb)
library(highfrequency)
library(lubridate)
library(timechange)
library(patchwork)
library(MASS)
library(univariateML)
# HELPER FUNCTIONS ---------------------------------------------------------
# backtest function: stay long the period after the indicator was in a buy cluster
backtest <- function(returns, indicator, buy_cluster, return_cumulative = TRUE) {
  sides <- vector("integer", length(indicator))
  for (i in seq_along(sides)) {
    if (i == 1 || is.na(indicator[i - 1])) {
      sides[i] <- NA
    } else if (indicator[i - 1] %in% buy_cluster) {
      sides[i] <- 1
    } else {
      sides[i] <- 0
    }
  }
  sides <- ifelse(is.na(sides), 1, sides)
  returns_strategy <- returns * sides
  if (return_cumulative) {
    return(PerformanceAnalytics::Return.cumulative(returns_strategy))
  } else {
    return(returns_strategy)
  }
}
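# minimal sanity check of backtest() on synthetic data (a sketch, not part of the
# analysis): hold a long position only after the indicator was in the buy cluster
set.seed(1)
ret_toy <- rnorm(10, 0, 0.01)               # fake period returns
ind_toy <- sample(1:2, 10, replace = TRUE)  # fake cluster labels
backtest(ret_toy, ind_toy, buy_cluster = 1)                             # cumulative return
backtest(ret_toy, ind_toy, buy_cluster = 1, return_cumulative = FALSE)  # per-period strategy returns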
# predict clusters (source: https://github.com/Airpino/HistDAWass/issues/4)
predict_WH_kmeans<- function(res, test, clu = 3){
ID_pred<-rep(0,nrow(test@M))
for (i in 1:length(ID_pred)){
mindist<- Inf
for (j in 1:clu){
tmpdist<- 0
for (v in 1:get.MatH.ncols(test)){
tmpdist<-tmpdist+WassSqDistH(test@M[i,v][[1]], #the test point
res$solution$centers@M[j,v][[1]]) #the center of cluster j
}
if (tmpdist<mindist){
mindist<- tmpdist
ID_pred[i]<-j
}
}
}
names(ID_pred)<-get.MatH.rownames(test)
return(ID_pred)
}
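# usage sketch (hypothetical objects): `fit` is a WH_kmeans() result and `new_obs`
# a MatH of unseen histograms with the same single variable as the training data
# fit <- WH_kmeans(train_mat, k = 3)
# ID_new <- predict_WH_kmeans(fit, new_obs, clu = 3)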
# configure s3
config <- tiledb_config()
config["vfs.s3.aws_access_key_id"] <- Sys.getenv("AWS-ACCESS-KEY")
config["vfs.s3.aws_secret_access_key"] <- Sys.getenv("AWS-SECRET-KEY")
config["vfs.s3.region"] <- Sys.getenv("AWS-REGION")
context_with_config <- tiledb_ctx(config)
# DUKASCOPY SPY DATA ----------------------------------------------------------
# import data
arr <- tiledb_array("D:/equity-usa-quotes-dukascopy",
as.data.frame = TRUE,
selected_ranges = list(symbol = cbind("SPYUSUSD", "SPYUSUSD")))
system.time(quotes_raw <- arr[])
tiledb_array_close(arr)
# prepare data for highfrequency package
quotes <- as.data.table(quotes_raw)
setnames(quotes,
c("symbol", "bidp", "bidv", "askp", "askv", "date"),
c("SYMBOL", "BID", "BIDSIZ", "OFR", "OFRSIZ", "DT"))
quotes[, EX := "P"]
cols <- colnames(sampleQDataRaw)
quotes <- quotes[, ..cols]
quotes[, DT := as.nanotime(DT, tz = "GMT")]
# clean quotes data
dim(quotes)
quotes <- noZeroQuotes(quotes)
dim(quotes)
# keep only exchange hours (my method because highfrequency approach is too slow)
dates_ <- as.nanotime(unique(nano_floor(quotes$DT, nanoperiod(days = 1), tz = "UTC")))
dates_start <- dates_ + as.nanoduration("14:30:00")
dates_end <- dates_ + as.nanoduration("21:00:00")
intervals_ <- nanoival(dates_start, dates_end)
quotes <- quotes[DT %in% intervals_]
dim(quotes)
# remove negative spreads
quotes <- rmNegativeSpread(quotes)
dim(quotes)
# remove large spreads
quotes <- rmLargeSpread(quotes)
dim(quotes)
# # merge quotes time stamp with selection criteria
# # here I decided to use POSIXct instead of nanotime, but could keep nanotime...
# quotes[, DT := nano_ceiling(DT, as.nanoduration("00:00:01"))]
# quotes[, DT := as.POSIXct(DT, tz = "UTC")]
# system.time(quotes <- mergeQuotesSameTimestamp(test, "median"))
# dim(quotes)
# remove outliers
quotes[, DT := nano_ceiling(DT, as.nanoduration("00:00:01"))]
quotes[, DT := as.POSIXct(DT, tz = "UTC")]
system.time(quotes <- rmOutliersQuotes(quotes, type = "advanced"))
dim(quotes)
# inspect intraday timestamp frequencies
test <- quotes[, as.POSIXct(DT)]
test <- as.ITime(test)
test_dt <- as.data.table(test)[, .N, by = test]
setorder(test_dt, N)
# aggregate quotes to 5-minute bars
quotes_agg <- highfrequency::aggregateQuotes(quotes, alignBy = "minutes", alignPeriod = 5)
# FINAM TICK DATA ---------------------------------------------------------
# import tick data from a local TileDB array
arr <- tiledb_array("D:/equity-usa-tick-finam",
as.data.frame = TRUE,
selected_ranges = list(symbol = cbind("AAPL", "AAPL")))
system.time(tick_raw <- arr[])
tiledb_array_close(arr)
# prepare data for highfrequency package
tick_raw <- as.data.table(tick_raw)
setnames(tick_raw,
c("symbol", "volume", "price", "time"),
c("SYMBOL", "SIZE", "PRICE", "DT"))
tick_raw[, EX := "P"]
tick_raw[, COND := "I"]
tick_raw[, CORR := 0]
cols <- colnames(sampleTDataRaw)
tick_raw <- tick_raw[, ..cols]
attr(tick_raw$DT, "tz") <- "UTC"
tick_raw[, DT := with_tz(DT, tzone = "America/New_York")]
# # clean tick data
# dim(tick_raw)
# tick_dt <- noZeroPrices(tick_raw)
# dim(tick_dt)
#
# # keep only exchange hours (my method because highfrequency approach is too slow)
# tick_dt[, time := as.ITime(DT)]
# tick_dt <- tick_dt[time %between% c("09:30:00", "16:00:00")]
# dim(tick_dt)
#
# # merge trades with same timestamp
# tick_dt <- mergeTradesSameTimestamp(tick_dt)
# dim(tick_dt)
# VOLUME PROFILE ----------------------------------------------------------
# calculate volume profile
tick_dt <- tick_raw[, .(DT, PRICE, SIZE)]
setnames(tick_dt, c("time", "price", "volume"))
tick_dt <- tick_dt[volume > 0]
tick_dt[, volume := log(volume)]
vp = roll_volume_profile(tick_dt, 60 * 60 * 4, 0.01, 0.98, 100)
length(vp)
length(vp[[1]])
length(vp[[2]])
vp[[1]][1:3]
vp[[2]][1:3]
# visualize
data_plot <- as.data.frame(vp[[2]][[1]])
ggplot(data_plot, aes(price)) + geom_bar(aes(weight = volume))
# create histograms from real data
volumes <- lapply(vp[[2]], `[[`, 3)
volumes_dist <- lapply(volumes, data2hist)
plot(volumes_dist[[66]])
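# inspect one histogram's slots (a HistDAWass distributionH object):
# @x = breaks, @p = cdf values, @m = mean, @s = standard deviation
volumes_dist[[66]]@m
volumes_dist[[66]]@s
head(volumes_dist[[66]]@x)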
# create MatH object
# note: the second MatH() call overwrites the first, keeping only the most
# recent 1000 histograms
new_mat <- MatH(x = volumes_dist,
                nrows = length(volumes_dist),
                ncols = 1,
                rownames = vp[[1]],
                varnames = "volumep")
new_mat <- MatH(x = tail(volumes_dist, 1000),
                nrows = 1000,
                ncols = 1,
                rownames = tail(vp[[1]], 1000),
                varnames = "volumep")
# save object
# saveRDS(new_mat, "D:/features/volumes_dist_vol_h4.rds")
# 1. Kmeans clustering
k_ <- 2
res_cluster <- WH_kmeans(new_mat, k = k_, rep = 10, simplify = TRUE, qua = 20, standardize = TRUE)
dates <- as.POSIXct(as.integer(rownames(new_mat@M)), origin = as.POSIXct("1970-01-01 00:00:00"), tz = "America/New_York")
clusters_1 <- data.table(date = dates, cluster_1 = res_cluster$solution$IDX)
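# a small model-selection sketch: refit for a few k and compare fit quality.
# Assumption: WH_kmeans() returns a `quality` element (share of explained
# inertia); check names(res_cluster) if the field differs in your version.
quality_by_k <- sapply(2:5, function(k) {
  WH_kmeans(new_mat, k = k, rep = 10, simplify = TRUE, qua = 20,
            standardize = TRUE)$quality
})
quality_by_k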
# 2. Hierarchical clustering
# res_hcluster <- WH_hclust(new_mat, simplify = TRUE, method = "complete")
# plot(res_hcluster) # it plots the dendrogram
# clusters_2 <- cutree(res_hcluster, k = 3) # it returns the labels for 3 clusters
# 3. Fuzzy c-means of a dataset of histogram-valued data
res_fcmeans <- WH_fcmeans(new_mat, k = k_, m = 1.5, rep = 10, simplify = TRUE, qua = 20, standardize = TRUE)
clusters_3 <- data.table(date = dates, cluster_3 = res_fcmeans$solution$IDX)
# 4. K-means of a dataset of histogram-valued data using adaptive Wasserstein distances
res_kmeans <- WH_adaptive.kmeans(new_mat, k = k_, rep = 10, simplify = TRUE, qua = 20, standardize = TRUE)
clusters_4 <- data.table(date = dates, cluster_4 = res_kmeans$IDX)
# combine cluster results and dates
clusters_data <- Reduce(function(x, y) merge(x, y, by = "date", all.x = TRUE, all.y = FALSE),
list(clusters_1, clusters_3, clusters_4))
# get hourly AAPL data (the tick-based clusters above are for AAPL), needed for
# the merge below
arr <- tiledb_array("D:/equity-usa-hour-fmpcloud-adjusted", as.data.frame = TRUE,
                    selected_ranges = list(symbol = cbind("AAPL", "AAPL")))
spy_daily <- arr[]
tiledb_array_close(arr)
spy_daily <- as.data.table(spy_daily)
setnames(spy_daily, "time", "date")
# merge spy and clusters
clusters_data <- merge(clusters_data, spy_daily, by = "date", all.x = TRUE, all.y = FALSE)
# create returns
clusters_data[, returns := close / shift(close) - 1]
clusters_data <- na.omit(clusters_data)
# plot data
ggplot(clusters_data, aes(x = date, y = close, colour = factor(cluster_1), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2020-01-01", "2021-01-01")], aes(x = date, y = close, colour = factor(cluster_1), group = 1)) +
geom_line()
# show good distributions
n_ <- 120
g1 <- plot(new_mat[which(clusters_data$cluster_4 == 1)[n_]])
g2 <- plot(new_mat[which(clusters_data$cluster_4 == 2)[n_]])
g1 / g2
table(clusters_data$cluster_4)
plot(new_mat[which(clusters_data$cluster_4 == 1)[22:23]])
plot(new_mat[which(clusters_data$cluster_4 == 2)[22:23]])
# test for exponential distribution: fit the rate per histogram, then run a KS test
cluster_pvalues_exp <- list()
for (cluster_ in sort(unique(clusters_data$cluster_4))) {
  x <- new_mat[which(clusters_data$cluster_4 == cluster_)]
  p_values <- c()
  for (i in 1:nrow(x@M)) {
    obs_ <- x@M[[i]]@x
    rate_ <- 1 / mean(obs_)  # ML estimate of the exponential rate
    y <- ks.test(obs_, "pexp", rate_)
    p_values <- c(p_values, y$p.value)
  }
  cluster_pvalues_exp[[cluster_]] <- p_values
}
mean(cluster_pvalues_exp[[1]])
mean(cluster_pvalues_exp[[2]])
# estimate the best-fitting distribution per histogram with univariateML::model_select
cluster_dist_est <- list()
for (cluster_ in sort(unique(clusters_data$cluster_4))) {
  x <- new_mat[which(clusters_data$cluster_4 == cluster_)]
  dist_est <- list()
  for (i in 1:nrow(x@M)) {
    y <- model_select(x@M[[i]]@x)
    dist_est[[i]] <- attributes(y)
  }
  cluster_dist_est[[cluster_]] <- dist_est
}
dists_1 <- lapply(cluster_dist_est[[1]], function(x) x$model)
dists_2 <- lapply(cluster_dist_est[[2]], function(x) x$model)
table(unlist(dists_1))
table(unlist(dists_2))
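# share of selected families per cluster, easier to compare than raw counts
prop.table(table(unlist(dists_1)))
prop.table(table(unlist(dists_2)))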
# estimate Gumbel distribution parameters (mlgumbel returns location mu and scale s)
gumbel_est <- list()
for (cluster_ in sort(unique(clusters_data$cluster_4))) {
  x <- new_mat[which(clusters_data$cluster_4 == cluster_)]
  gumbel_est_ <- list()
  for (i in 1:nrow(x@M)) {
    y <- mlgumbel(x@M[[i]]@x)
    gumbel_est_[[i]] <- as.vector(y)
  }
  gumbel_est[[cluster_]] <- gumbel_est_
}
gumbel_est[[1]]
# compare the scale parameter across the clusters (k_ = 2, so only two exist)
scale_1 <- lapply(gumbel_est[[1]], function(x) x[2])
scale_2 <- lapply(gumbel_est[[2]], function(x) x[2])
hist(unlist(scale_1))
hist(unlist(scale_2))
mean(unlist(scale_1))
mean(unlist(scale_2))
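# a quick visual check (sketch): overlay the fitted Gumbel density on the first
# histogram of cluster 1; the density is written out by hand so no extra
# packages are assumed
obs_ <- new_mat@M[[which(clusters_data$cluster_4 == 1)[1]]]@x
fit_ <- mlgumbel(obs_)
mu_ <- fit_[1]
s_  <- fit_[2]
grid_ <- seq(min(obs_), max(obs_), length.out = 200)
z_ <- (grid_ - mu_) / s_
dens_ <- exp(-(z_ + exp(-z_))) / s_  # Gumbel pdf
hist(obs_, freq = FALSE, main = "Cluster 1: data vs fitted Gumbel")
lines(grid_, dens_, lwd = 2)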
# backtest function: this version takes sell clusters and stays long otherwise,
# overriding the buy-cluster version defined at the top
backtest <- function(returns, clusters, sell_clusters, return_cumulative = TRUE) {
  sides <- vector("integer", length(clusters))
  for (i in seq_along(sides)) {
    if (i == 1 || is.na(clusters[i - 1])) {
      sides[i] <- NA
    } else if (clusters[i - 1] %in% sell_clusters) {
      sides[i] <- 0
    } else {
      sides[i] <- 1
    }
  }
  sides <- ifelse(is.na(sides), 1, sides)
  returns_strategy <- returns * sides
  if (return_cumulative) {
    return(PerformanceAnalytics::Return.cumulative(returns_strategy))
  } else {
    return(returns_strategy)
  }
}
# backtest
backtest(clusters_data$returns, clusters_data$cluster_4, c(2))
res <- backtest(clusters_data$returns, clusters_data$cluster_4, c(2), FALSE)
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[, .(date, returns)], res)))
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[4000:nrow(clusters_data), .(date, returns)], res[4000:nrow(clusters_data)])))
# SPY DATA ----------------------------------------------------------------
# minute SPY data
arr <- tiledb_array("D:/equity-usa-minute-fmpcloud-adjusted",
as.data.frame = TRUE,
selected_ranges = list(symbol = cbind("AAPL", "AAPL")))
system.time(spy <- arr[])
tiledb_array_close(arr)
attr(spy$time, "tz") <- "UTC"
spy <- as.data.table(spy)
spy[, time := with_tz(time, tzone = "America/New_York")]
# keep only trading hours
spy <- spy[format.POSIXct(time, format = "%H:%M:%S") %between% c("09:30:00", "16:30:00")]
# calculate returns
spy[, returns := close / shift(close) - 1]
# remove missing values
spy <- na.omit(spy)
# prepare data for HistDAWass package
spy[, day := as.Date(time, tz = "America/New_York")]
system.time(list_of_t_spy <- spy[ , list(list(data2hist(returns))), by = c("day")])
# create MatH object
new_mat<-MatH(x=list_of_t_spy$V1,
nrows=nrow(list_of_t_spy),
ncols = 1,
rownames = unique(list_of_t_spy$day),
varnames = "returns")
plot(new_mat[150:170], type="DENS") # to see the data
# rolling cluster predictions
estimated_clusters <- list()
for (i in 1:nrow(new_mat@M)) {
if (i < 500) {
estimated_clusters[[i]] <- NA_integer_
} else {
estimated_clusters[[i]] <- WH_kmeans(new_mat[1:i], k = 2, rep = 10, simplify = TRUE, qua = 40)
}
}
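# the rolling refit above is slow; the same loop as a parallel sketch with
# future.apply (attached at the top). Running it replaces the loop's results.
plan(multisession, workers = 4L)
estimated_clusters <- future_lapply(1:nrow(new_mat@M), function(i) {
  if (i < 500) return(NA_integer_)
  WH_kmeans(new_mat[1:i], k = 2, rep = 10, simplify = TRUE, qua = 40)
}, future.seed = TRUE)
plan(sequential)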
# get predictions (guard the NA placeholders from the 500-observation warm-up,
# which are not lists and would break the $ access)
dates <- lapply(estimated_clusters, function(x) {
  if (!is.list(x)) return(NA_integer_)
  dates_ <- as.integer(names(x$solution$IDX))
  tail(dates_, 1)
})
dates <- as.Date(unlist(dates), origin = "1970-01-01")
last_clusters <- lapply(estimated_clusters, function(x) {
  if (!is.list(x)) return(NA_integer_)
  tail(x$solution$IDX, 1)
})
last_clusters <- unlist(last_clusters)
# the buy cluster is the one whose center has the higher mean return
buy_cluster <- lapply(estimated_clusters, function(x) {
  if (!is.list(x)) {
    return(NA_integer_)
  } else if (x$solution$centers@M[[1]]@m > x$solution$centers@M[[2]]@m) {
    return(1L)
  } else {
    return(2L)
  }
})
buy_cluster <- unlist(buy_cluster)
predictions <- ifelse(last_clusters == buy_cluster, 1, 0)
predictions <- cbind.data.frame(date = dates, predictions)
predictions <- predictions[!is.na(predictions$date), ]
# get daily spy data
arr <- tiledb_array("s3://equity-usa-daily-fmpcloud", as.data.frame = TRUE,
selected_ranges = list(symbol = cbind("SPY", "SPY")))
spy_daily <- arr[]
tiledb_array_close(arr)
spy_daily <- as.data.table(spy_daily)
# merge spy and predictions
clusters_data <- merge(spy_daily, predictions, by = "date", all.x = TRUE, all.y = FALSE)
# create returns
clusters_data[, returns := close / shift(close) - 1]
clusters_data <- na.omit(clusters_data)
# summary stats
table(clusters_data$predictions)
# plot data
ggplot(clusters_data, aes(x = date, y = close, colour = factor(predictions), group = 1)) +
geom_line()
# backtest
backtest(clusters_data$returns, clusters_data$predictions, 0)
res <- backtest(clusters_data$returns[500:nrow(clusters_data)], clusters_data$predictions[500:nrow(clusters_data)], 0, FALSE)
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[500:nrow(clusters_data), .(date, returns)], res)))
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[4000:nrow(clusters_data), .(date, returns)], res[4000:nrow(clusters_data)])))
# cluster analysis
res_cluster <- WH_kmeans(new_mat, k=3)
clusters_1 <- data.table(date = unique(spy$day), cluster_1 = res_cluster$solution$IDX)
# 2. Hierarchical clustering
res_hcluster <- WH_hclust(new_mat, simplify = TRUE, method = "complete")
plot(res_hcluster) # it plots the dendrogram
clusters_2 <- cutree(res_hcluster, k = 3) # it returns the labels for 3 clusters
# 3. Fuzzy c-means of a dataset of histogram-valued data
res <- WH_fcmeans(new_mat, k = 3, m = 1.5, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
clusters <- data.table(1:length(res$solution$IDX), clusters = res$solution$IDX)
clusters$date <- rownames(res$solution$membership)
clusters <- clusters[, .(date, clusters)]
# 4. K-means of a dataset of histogram-valued data using adaptive Wasserstein distances
res_kmeans <- WH_adaptive.kmeans(new_mat, k = 3, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
clusters_4 <- data.table(date = unique(spy$day), cluster_4 = res_kmeans$IDX)
# combine cluster results and dates
clusters_data <- merge(clusters_1, clusters_4, by = "date", all.x = TRUE, all.y = FALSE)
# get daily spy data
arr <- tiledb_array("s3://equity-usa-daily-fmpcloud", as.data.frame = TRUE,
selected_ranges = list(symbol = cbind("SPY", "SPY")))
spy_daily <- arr[]
tiledb_array_close(arr)
spy_daily <- as.data.table(spy_daily)
clusters_data <- merge(clusters_data, spy_daily, by = "date", all.x = TRUE, all.y = FALSE)
# create returns
clusters_data[, returns := close / shift(close) - 1]
clusters_data <- na.omit(clusters_data)
# summary stats
table(clusters_data$cluster_1)
table(clusters_data$cluster_4)
# plot data
ggplot(clusters_data, aes(x = date, y = close, colour = factor(cluster_4), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2004-01-01", "2007-01-01")], aes(x = date, y = close, colour = factor(cluster_1), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2007-01-01", "2008-01-01")], aes(x = date, y = close, colour = factor(cluster_1), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2008-01-01", "2009-01-01")], aes(x = date, y = close, colour = factor(cluster_1), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2009-01-01", "2010-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2010-01-01", "2013-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2013-01-01", "2016-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2016-01-01", "2020-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2020-01-01", "2021-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2021-01-01", "2021-11-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
# backtest
backtest(clusters_data$returns, clusters_data$cluster_4, c(2))
res <- backtest(clusters_data$returns, clusters_data$cluster_4, c(2), FALSE)
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[, .(date, returns)], res)))
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[4000:nrow(clusters_data), .(date, returns)], res[4000:nrow(clusters_data)])))
# backtest_data <- clusters_data[, .(date, cluster, adjClose)]
# backtest_data[, returns := (adjClose / shift(adjClose)) - 1]
# backtest_data <- na.omit(backtest_data)
# backtest_data[, indicator := ifelse(cluster == 1, 1, 0)]
# results <- backtest(returns = backtest_data$returns, indicator = backtest_data$cluster, c(3), return_cumulative = FALSE)
# results_xts <- cbind(as.xts.data.table(backtest_data[, .(date, returns)]), results)
# Return.cumulative(results)
# charts.PerformanceSummary(results_xts)
# charts.PerformanceSummary(results_xts["2007-01/2010-01"])
# charts.PerformanceSummary(results_xts["2020-01/2021-01"])
# SharpeRatio(cbind(as.xts.data.table(backtest_data[, .(date, returns)]), results))
# CROSS SECTION CLUSTERING ------------------------------------------------
# import hour ohlcv
arr <- tiledb_array("D:/equity-usa-hour-fmpcloud-adjusted", as.data.frame = TRUE)
hour_data <- arr[]
tiledb_array_close(arr)
hour_data_dt <- as.data.table(hour_data)
attr(hour_data_dt$time, "tz") <- Sys.getenv("TZ")
hour_data_dt[, time := with_tz(time, "America/New_York")]
# calculate returns
hour_data_dt[, returns := close / shift(close) - 1]
hour_data_dt <- na.omit(hour_data_dt)
# keep trading hours
DT <- hour_data_dt[as.ITime(time) %between% c(as.ITime("10:00:00"), as.ITime("16:00:00"))]
DT[, N := .N, by = time]
DT <- DT[N > 250]
# prepare data for HistDAWass package
list_of_t <- DT[, list(list(data2hist(returns))), by = time]
# create MatH object
new_mat <- MatH(x = list_of_t$V1,
                nrows = nrow(list_of_t),
                ncols = 1,
                rownames = list_of_t$time,
                varnames = "returns")
plot(new_mat[100:115], type="DENS") # to see the data
# cluster analysis
# 1. K MEANS
res_kmeans <- WH_kmeans(new_mat, k = 3, rep = 10, simplify = TRUE, qua = 100, standardize = TRUE)
clusters_kmeans <- data.table(date = list_of_t$time, cluster_kmeans = res_kmeans$solution$IDX)
# 2. Hierarchical clustering
# res <- WH_hclust(new_mat, simplify = TRUE, method = "complete")
# plot(res) # it plots the dendrogram
# clusters <- cutree(res, k = 3) # it returns the labels for 3 clusters
# # 3. Fuzzy c-means of a dataset of histogram-valued data
# res <- WH_fcmeans(new_mat, k = 2, m = 1.5, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
# clusters <- data.table(1:length(res$solution$IDX), clusters = res$solution$IDX)
# 4. K-means of a dataset of histogram-valued data using adaptive Wasserstein distances
res_akmeans <- WH_adaptive.kmeans(new_mat, k = 3, rep = 10, simplify = TRUE, qua = 100, standardize = TRUE)
clusters_akmeans <- data.table(date = list_of_t$time, cluster_akmeans = res_akmeans$IDX)
# combine cluster results and dates
clusters_data <- merge(clusters_kmeans, clusters_akmeans,
by = "date", all.x = TRUE, all.y = FALSE)
# get hourly SPY data
arr <- tiledb_array("D:/equity-usa-hour-fmpcloud-adjusted", as.data.frame = TRUE,
selected_ranges = list(symbol = cbind("SPY", "SPY")))
spy_daily <- arr[]
tiledb_array_close(arr)
spy_daily <- as.data.table(spy_daily)
setnames(spy_daily, "time", "date")
attr(spy_daily$date, "tz") <- Sys.getenv("TZ")
spy_daily[, date := with_tz(date, "America/New_York")]
clusters_data <- merge(clusters_data, spy_daily, by = "date", all.x = TRUE, all.y = FALSE)
# create returns
clusters_data <- na.omit(clusters_data)
clusters_data[, returns := close / shift(close) - 1]
clusters_data <- na.omit(clusters_data)
# plot data
ggplot(clusters_data, aes(x = date, y = close, colour = factor(cluster_kmeans), group = 1)) +
geom_line()
ggplot(clusters_data[30000:nrow(clusters_data)], aes(x = date, y = close, colour = factor(cluster_kmeans), group = 1)) +
geom_line()
# backtest
backtest(clusters_data$returns, clusters_data$cluster_kmeans, c(2, 3))
res <- backtest(clusters_data$returns, clusters_data$cluster_kmeans, c(2, 3), FALSE)
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[, .(date, returns)], res)))
charts.PerformanceSummary(as.xts.data.table(cbind(clusters_data[4000:nrow(clusters_data), .(date, returns)], res[4000:nrow(clusters_data)])))
# plot clusters results (note: this block expects `clusters` and `sample` from an
# earlier iteration of the analysis and is kept for reference)
clusters_data <- as.data.table(clusters, keep.rownames = TRUE)
clusters_data$datetime <- unique(sample$day)
clusters_data <- clusters_data[, .(datetime, clusters)]
setnames(clusters_data, c("date", "cluster"))
# spy_hourly <- sample[symbol == "SPY" & datetime %in% clusters_data$datetime]
# fmpc_price_history() comes from the fmpcloudr package (not attached above)
spy_daily <- as.data.table(fmpc_price_history("SPY", min(clusters_data$date), max(clusters_data$date)))
clusters_data <- merge(clusters_data, spy_daily, by = "date", all.x = TRUE, all.y = FALSE)
# summary stats
table(clusters_data$cluster)
# plot data
ggplot(clusters_data, aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2004-01-01", "2007-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2007-01-01", "2008-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2008-01-01", "2009-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2009-01-01", "2010-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2010-01-01", "2013-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2013-01-01", "2016-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2016-01-01", "2020-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2020-01-01", "2021-01-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2021-01-01", "2021-11-01")], aes(x = date, y = close, colour = factor(cluster), group = 1)) +
geom_line()
# SPY HOUR ROLLING ------------------------------------------------------------
# hour spy (assumes `market_data` with a returns column, as built in the
# CROSS SECTION MINUTE section below)
spy_hour <- market_data[symbol == "SPY"]
# prepare data for HistDAWass package: rolling histograms over windows of
# h1 = 35 hourly returns, stepped every h2 = 8 observations
h1 = 35
h2 = 8
l = runner(x = spy_hour$returns,
           f = function(x) tryCatch(data2hist(x), error = function(e) NA),
           k = h1,
           at = seq(1, nrow(spy_hour), h2),
           na_pad = TRUE)
l_dates = runner(x = spy_hour$datetime,
                 f = function(x) tail(x, 1),
                 k = h1,
                 at = seq(1, nrow(spy_hour), h2),
                 na_pad = TRUE)
# drop failed windows with a single mask so histograms and dates stay aligned
keep <- !is.na(l)
l <- l[keep]
l_dates <- l_dates[keep]
l_dates <- as.POSIXct(unlist(l_dates), origin = "1970-01-01", tz = "EST")
# create MatH object
new_mat <- MatH(x=l,
nrows=length(l),
ncols = 1,
rownames = l_dates,
varnames = "returns")
plot(new_mat[1:15], type="DENS") # to see the data
# prediction: predict_WH_kmeans() needs a fitted WH_kmeans result and a MatH of
# new histograms; see the out-of-sample sketch above
# cluster analysis
# 1. K MEANS
res<-WH_kmeans(new_mat,k=3)
clusters <- res$solution$IDX
# # 2. Hierarchical clustering
# res <- WH_hclust(new_mat, simplify = TRUE, method = "complete")
# plot(res) # it plots the dendrogram
# clusters <- cutree(res, k = 3) # it returns the labels for 3 clusters
# # 3. Fuzzy c-means of a dataset of histogram-valued data
# res <- WH_fcmeans(new_mat, k = 2, m = 1.5, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
# clusters <- data.table(1:length(res$solution$IDX), clusters = res$solution$IDX)
# # 4. K-means of a dataset of histogram-valued data using adaptive Wasserstein distances
# res <- WH_adaptive.kmeans(new_mat, k = 2, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
# plot clusters results
clusters_data <- as.data.table(clusters, keep.rownames = TRUE)
setnames(clusters_data, c("datetime", "cluster"))
clusters_data$datetime <- as.POSIXct(as.numeric(clusters_data$datetime), origin = "1970-01-01", tz = "EST")
clusters_data <- merge(spy_hour, clusters_data, by = "datetime", all.x = TRUE, all.y = FALSE)
# summary stats
table(clusters_data$cluster)
# plot data
ggplot(clusters_data, aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2004-01-01", "2007-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2007-01-01", "2008-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2008-01-01", "2009-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2009-01-01", "2010-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2010-01-01", "2013-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2013-01-01", "2016-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2016-01-01", "2020-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[datetime %between% c("2020-01-01", "2020-01-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
ggplot(clusters_data[date %between% c("2021-01-01", "2021-11-01")], aes(x = datetime, y = close, colour = factor(cluster), group = 1)) +
geom_line()
# backtest function: redefined with buy-cluster semantics for the rolling backtest below
backtest <- function(returns, indicator, buy_cluster, return_cumulative = TRUE) {
  sides <- vector("integer", length(indicator))
  for (i in seq_along(sides)) {
    if (i == 1 || is.na(indicator[i - 1])) {
      sides[i] <- NA
    } else if (indicator[i - 1] %in% buy_cluster) {
      sides[i] <- 1
    } else {
      sides[i] <- 0
    }
  }
  sides <- ifelse(is.na(sides), 1, sides)
  returns_strategy <- returns * sides
  if (return_cumulative) {
    return(PerformanceAnalytics::Return.cumulative(returns_strategy))
  } else {
    return(returns_strategy)
  }
}
# backtest
backtest_data <- clusters_data[, .(datetime, cluster, close)]
backtest_data[, returns := (close / shift(close)) - 1]
backtest_data[, cluster := nafill(cluster, type = "locf")]
backtest_data <- na.omit(backtest_data)
results <- backtest(returns = backtest_data$returns, indicator = backtest_data$cluster, c(1), return_cumulative = FALSE)
results_xts <- cbind(as.xts.data.table(backtest_data[, .(datetime, returns)]), results)
Return.cumulative(results)
charts.PerformanceSummary(results_xts)
charts.PerformanceSummary(results_xts["2007-01/2010-01"])
charts.PerformanceSummary(results_xts["2020-01/2021-01"])
SharpeRatio(cbind(as.xts.data.table(backtest_data[, .(datetime, returns)]), results))
# CROSS SECTION MINUTE CLUSTERING AGGREGATION -----------------------------
# choose symbols
symbols = c("SPY", "AAPL", "GM", "AMZN", "AAL")
# import minute data (CONTMIN is assumed to be an AzureStor storage container)
market_data_list <- lapply(symbols, function(x) {
  print(x)
  y <- tryCatch(storage_read_csv(CONTMIN, paste0(x, ".csv")), error = function(e) NA)
  if (is.null(y) || all(is.na(y))) return(NULL)
  y <- cbind(symbol = x, y)
  return(y)
})
market_data <- rbindlist(market_data_list)
# change timezone
market_data[, datetime := as.POSIXct(as.numeric(formated), origin = as.POSIXct("1970-01-01", tz = "EST"), tz = "EST")]
market_data[, formated := NULL]
market_data[, t := NULL]
# keep only trading hours
market_data <- market_data[format.POSIXct(datetime, format = "%H:%M:%S") %between% c("09:30:00", "16:30:00")]
# adjust market data (adjust() is a project-specific helper, not from an attached package)
market_data <- adjust(data = market_data, symbol_col = "symbol", ohlc_cols = c("o", "h", "c", "l"))
setnames(market_data, c("symbol", "datetime", "open", "high", "close", "low", "volume"))
# calculate returns
market_data[, returns := close / shift(close) - 1]
# remove missing values
market_data <- na.omit(market_data)
# choose parameters
freq = "minute"
n_clusters <- 2
minimal_number_of_clusters = 1000
# prepare data for HistDAWass package
market_data[, day := as.Date(datetime)]
list_of_ts <- market_data[ , list(list(data2hist(returns))), by = c("symbol", "day")]
### LOOP STARTS HERE
# create MatH object
s <- unique(list_of_ts$symbol)[1]
sample_ <- list_of_ts[symbol == s]
# inner loop to calculate clusters; collect each pass in a list instead of
# overwriting kmean_clusters every iteration
rolling_clusters <- list()
for (i in minimal_number_of_clusters:1005) { # nrow(sample_)
  sample_cluster <- sample_[1:i]
  new_mat <- MatH(x = sample_cluster$V1,
                  nrows = nrow(sample_cluster),
                  ncols = 1,
                  rownames = unique(sample_cluster$day),
                  varnames = "returns")
  # rolling k means
  res <- WH_kmeans(new_mat, k = n_clusters)
  kmean_clusters <- res$solution$IDX
  kmean_clusters <- as.data.table(kmean_clusters, keep.rownames = TRUE)
  kmean_clusters[, date := as.Date(as.numeric(rn))]
  kmean_clusters <- kmean_clusters[, .(date, kmean_clusters)]
  rolling_clusters[[as.character(i)]] <- kmean_clusters
}
# rolling clusters via runner: with x as row indices and no k, each window is an
# expanding set of indices, so f receives the indices and rebuilds the MatH
roll_mats <- runner(
  x = 1:nrow(sample_),
  f = function(idx) {
    sample_window <- sample_[idx]
    MatH(x = sample_window$V1,
         nrows = nrow(sample_window),
         ncols = 1,
         rownames = unique(sample_window$day),
         varnames = "returns")
  },
  at = minimal_number_of_clusters:1010
)
new_mat<-MatH(x=sample_$V1,
nrows=nrow(sample_),
ncols = 1,
rownames = unique(sample_$day),
varnames = "returns")
# plot(new_mat[1:15], type="DENS") # to see the data
# rolling k means
# runner(x = new_mat)  # leftover call: runner() needs a vector/list input; superseded by the runner() sketch above
res <- WH_kmeans(new_mat, k = n_clusters)
kmean_clusters <- res$solution$IDX
kmean_clusters <- as.data.table(kmean_clusters, keep.rownames = TRUE)
kmean_clusters[, date := as.Date(as.numeric(rn))]
kmean_clusters <- kmean_clusters[, .(date, kmean_clusters)]
# # Hierarchical clustering
# res_hclust <- WH_hclust(new_mat, simplify = TRUE, method = "complete")
# hclust_clusters <- cutree(res_hclust, k = n_clusters)
# hclust_clusters <- as.data.table(hclust_clusters, keep.rownames = TRUE)
# hclust_clusters[, date := as.Date(as.numeric(rn))]
# hclust_clusters <- hclust_clusters[, .(date, hclust_clusters)]
#
# # Fuzzy c-means
# res_fuzzy_cmeans <- WH_fcmeans(new_mat, k = n_clusters, m = 1.5, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
# fuzzy_cmeans_clusters <- data.table(date = as.Date(as.numeric(rownames(new_mat@M))),
# clusters_fuzzy_cmeans = res_fuzzy_cmeans$solution$IDX)
#
# # K-means using adaptive Wasserstein distances
# res_kmeans_adaptive <- WH_adaptive.kmeans(new_mat, k = n_clusters, rep = 10, simplify = TRUE, qua = 10, standardize = TRUE)
# kmeans_adaptive_clusters <- data.table(date = as.Date(as.numeric(rownames(new_mat@M))),
# kmeans_adaptive_clusters = res_kmeans_adaptive$IDX)
### LOOP ENDS HERE
# combine cluster results and dates (hclust_clusters, fuzzy_cmeans_clusters and
# kmeans_adaptive_clusters require uncommenting the sections above)
clusters <- Reduce(function(x, y) merge(x, y, by = c("date"), all.x = TRUE, all.y = FALSE),
                   list(kmean_clusters, hclust_clusters, fuzzy_cmeans_clusters, kmeans_adaptive_clusters))
clusters <- cbind(symbol = s, clusters)
# END LOOP
# summary stats
table(clusters$kmean_clusters)
# WH REGRESSION DEMO (HistDAWass built-in BLOOD dataset) -----------------------
BLOOD
class(BLOOD)
model.parameters <- WH.regression.two.components(data = BLOOD, Yvar = 1, Xvars = c(2:3))
Predicted.BLOOD <- WH.regression.two.components.predict(data = BLOOD[, 2:3],
                                                        parameters = model.parameters)
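# goodness-of-fit sketch for the two-component regression, using the package's
# WH.regression.GOF (omega-based measure, per the HistDAWass docs)
WH.regression.GOF(observed = BLOOD[, 1], predicted = Predicted.BLOOD)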