library(data.table)
library(fs)
library(cryptoQuotes)
library(xts)
library(ggplot2)
library(arrow)
# setup
PATH = "F:/data/crypto/binance"
# import futures data
dir_ls(path(PATH))
read_ = function(x, clean = TRUE) {
dt_ = read_parquet(path(PATH, paste0(x, ".parquet")))
if (clean) {
dt_[, let(
open_time = as.POSIXct(as.numeric(open_time / 1000)),
close_time = as.POSIXct(as.numeric(close_time / 1000))
)]
}
dt_= unique(dt_)
return(dt_)
}
cm_indexpriceklines_hour = read_("futures_cm_indexpriceklines_1h")
cm_indexpriceklines_daily = read_("futures_cm_indexpriceklines_1d")
futures_cm_klines_hour = read_("futures_cm_klines_1h")
futures_cm_klines_daily = read_("futures_cm_klines_1d")
spot_klines_daily = read_("spot_klines_1d")
spot_klines_hour = read_("spot_klines_1h")
cm_funding_rates = read_("futures_cm_fundingrate", FALSE)
cm_funding_rates[, let(calc_time = as.POSIXct(as.numeric(calc_time / 1000)))]
# Upasample funding rates to daily
cm_funding_rates[, date := as.IDate(calc_time)]
funding_daily = cm_funding_rates[
, .(rate = sum(last_funding_rate, na.rm = TRUE)), by = c("symbol", "date")
]
# Summary statistics
cm_funding_rates[, .(
mean = mean(last_funding_rate, na.rm = TRUE),
median = median(last_funding_rate, na.rm = TRUE),
sd = sd(last_funding_rate, na.rm = TRUE),
min = min(last_funding_rate, na.rm = TRUE),
max = max(last_funding_rate, na.rm = TRUE)
)]
# Visualize some rates
symb_ = cm_funding_rates[grep("BTC", symbol), unique(symbol)]
dt_ = cm_funding_rates[symbol == symb_, .(calc_time, last_funding_rate)]
xts_ = as.xts.data.table(unique(dt_[, .(calc_time, last_funding_rate)]))
plot(xts_, main=paste0(symb_, " funding rate"))
# Average daily funding
avg_daily_fun = funding_daily[, .(rate = mean(rate, na.rm = TRUE)), by = "symbol"]
all_coins_avg = avg_daily_fun[, mean(rate, na.rm = TRUE)]
all_coins_avg * 100 * 365
avg_daily_fun %>%
ggplot(aes(x = symbol, y = rate*100*365)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(
title = "Average daily funding, annualised",
y = "Annualised funding rate, %"
)
# costs
cost = 4*0.07/100
horizon = cost/all_coins_avg
# horizon = 2
# Predict next 22 days rates from 5 days history
cols = paste0("rate_", 1:4)
funding_daily[, (cols) := shift(rate, 1:4), by = symbol]
funding_daily[, predicted_funding := as.integer(horizon) * rowMeans(.SD, na.rm = TRUE)
, .SDcols = cols]
funding_daily[, (cols) := NULL]
cols = paste0("rate_", 1:as.integer(horizon))
funding_daily[, (cols) := shift(rate, -1:-as.integer(horizon), type = "shift")
, by = symbol]
funding_daily[, actual := rowSums(.SD, na.rm = TRUE), .SDcols = cols]
funding_daily[, (cols) := NULL]
# Here's a plot of predicted funding vs actual funding:
funding_daily %>%
na.omit() %>%
ggplot(aes(x = actual, y = predicted_funding)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Predicted vs actual funding, all data")
# And with overlapping data removed:
keep_rows = funding_daily[, .I %% as.integer(horizon) == 0, by = symbol]
funding_daily[keep_rows[, V1]] %>%
na.omit() %>%
ggplot(aes(x = actual, y = predicted_funding)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Predicted vs actual funding, overlapping data removed")
# winsorize
data_plot = copy(funding_daily)
data_plot = data_plot[keep_rows[, V1]]
data_plot[, actual := DescTools::Winsorize(actual, probs = c(0.01, 0.99))]
data_plot[, predicted_funding := DescTools::Winsorize(predicted_funding,
probs = c(0.01, 0.99),
na.rm = TRUE)]
data_plot %>%
na.omit() %>%
ggplot(aes(x = actual, y = predicted_funding)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Predicted vs actual funding, overlapping data removed")
# Residuals
data_plot[, residual := predicted_funding - actual]
data_plot %>%
ggplot(aes(x = actual, y = residual)) +
geom_point() +
geom_smooth(method = "loess") +
labs(title = "Model residuals, overlapping data removed")
# cutoff value
cutoff = 1e6 # remove from universe all assets with volume < cutoff
# merge futures and spot data
futures_cm_klines_daily[, unique(symbol)]
universe_df = futures_cm_klines_daily[symbol %like% "PERP", .(
symbol = gsub("_PERP|USD", "", symbol),
date = as.Date(close_time),
close_perp = close,
volume_perp = volume
)]
spot_klines_daily[symbol %like% "^BTC.*USD", unique(symbol)]
universe_df = spot_klines_daily[, .(
symbol = gsub("USDT", "", symbol),
date = as.Date(close_time),
close_spot = close
)][universe_df, on = c("symbol", "date") ]
# plot spot and future prices for some symbols
data_ = as.xts.data.table(universe_df[symbol %chin% c("BTC"), .(date, close_spot, close_perp)])
plot(data_)
plot(tail(data_, 50))
data_ = as.xts.data.table(universe_df[symbol %chin% c("BTC"), .(date, volume_perp)])
plot(data_)
# missing values
universe_df[!is.na(close_spot)]
universe_df = na.omit(universe_df)
# sort
setorder(universe_df, symbol, date)
# get 5-day rolling mean volume
universe_df[, roll_mean_volume := frollmean(volume_perp, 5), by = symbol]
universe_df[, is_universe := 0]
universe_df[shift(roll_mean_volume) > cutoff, is_universe := 1]
head(universe_df, 99)
# Check how many perpetuals are in our universe given these criteria
universe_df[, .(universe_size = sum(is_universe, na.rm = TRUE)), by = date] |>
ggplot(aes(x = date, y = universe_size)) +
geom_line() +
labs(title = sprintf("Universe size, 5-day average volume > %d", cutoff))
# Create daily funging rates and calculate predicted rates
funding_daily = copy(cm_funding_rates)
funding_daily[, rate := shift(last_funding_rate, -1, type = "shift"), by = symbol]
funding_daily = funding_daily[, .(rate = sum(rate, na.rm = TRUE)), by = c("symbol", "date")]
cols = paste0("rate_", 1:4)
funding_daily[, (cols) := shift(rate, 1:4), by = symbol]
funding_daily[, predicted_funding := as.integer(horizon) * rowMeans(.SD, na.rm = TRUE)
, .SDcols = cols]
funding_daily[, (cols) := NULL]
funding_daily[, symbol := gsub("_PERP|USD", "", symbol)]
# backtest data
backtest_df = merge(universe_df, funding_daily, by = c("symbol", "date"),
all.x = TRUE, all.y = FALSE)
backtest_df[, let(
return_perp = close_perp / shift(close_perp) - 1,
return_spot = close_spot / shift(close_spot) - 1
), by = "symbol"]
backtest_df[, let(
fwd_return_perp = shift(return_perp, -1, "shift"),
fwd_return_spot = shift(return_spot, -1, "shift"),
fwd_funding = shift(rate, -2, type = "shift")
), by = "symbol"]
## MY LINE
backtest_df = na.omit(backtest_df, cols = c("rate"))
# Next we take positions in the top ten by predicted funding, over some threshold.
threshold_ann = 0.2 # annualised funding
threshold = threshold_ann/(365/7.) # next 7-day's funding
top_n = 10
positions_df = backtest_df[is_universe == TRUE]
positions_df = positions_df[predicted_funding > threshold]
setorder(positions_df, date, -predicted_funding)
positions_df[, rank := rowid(date)]
data.table::first(positions_df, 20)
positions_df[, pos := 0]
positions_df[rank <= top_n, pos := 1]
positions_df[, num_positions := sum(pos), by = date]
# plot positions
positions_df |>
ggplot(aes(x = date, y = num_positions)) +
geom_line() +
labs(title = "Check number of positions per day")
# inspect why are there more than 10 positions
positions_df[num_positions > 15]
positions_df[date == as.Date("2023-01-18")]
# Next, we'll estimate costs in basis points.
cost = 0.12/100
# 3)
backtest_dt = copy(positions_df)
backtest_dt[is.na(pos), pos := 0]
backtest_dt[, let(diff_pos = c(1, abs(diff(pos)))), by = symbol]
backtest_dt[diff_pos == 1]
backtest_dt[, costs := 0]
backtest_dt[diff_pos == 1, costs := cost]
# 2)
# backtest_dt = backtest_df[, .(symbol, date, fwd_return_perp, fwd_return_spot, fwd_funding)]
# backtest_dt = backtest_dt[positions_df[, .(symbol, date, pos)], on = .(symbol, date)]
# backtest_dt[!is.na(pos)]
# backtest_dt[is.na(pos), pos := 0]
# backtest_dt[, let(diff_pos = c(1, abs(diff(pos)))), by = symbol]
# backtest_dt[diff_pos == 1]
# backtest_dt[, costs := 0]
# backtest_dt[diff_pos == 1, costs := cost]
# 1)
# backtest_dt = backtest_df[, .(symbol, date, fwd_return_perp, fwd_return_spot, fwd_funding)]
# backtest_dt = backtest_dt[positions_df, on = .(symbol, date), nomatch = 0]
# backtest_dt[is.na(pos), pos := 0]
# backtest_dt[, `:=`(
# diff_pos = c(1, abs(diff(pos))),
# costs = 0 # Initialize costs column
# ), by = symbol]
# backtest_dt[diff_pos == 1, costs := cost]
backtest_dt %>%
dplyr::group_by(date) %>%
dplyr::summarise(
num_positions = sum(pos),
tot_cost = sum(costs)/num_positions) %>%
ggplot(aes(x = date, y = tot_cost*10000)) +
geom_line() +
labs(
title = "Check on daily total costs",
y = "Total cost (bps)"
)
# P&L
capital = 1000
library(dplyr)
results_df = backtest_dt %>%
dplyr::group_by(date) %>%
dplyr::summarise(
num_positions = sum(pos),
# pnl to being equal weight in the short perp positions (half our capital is in perps)
perp_pnl = dplyr::case_when(num_positions > 0 ~ sum(-pos*fwd_return_perp*capital/(2*num_positions), na.rm = TRUE), TRUE ~ 0.),
# pnl to being equal weight in the long spot positions (half our capital is in spot)
spot_pnl = dplyr::case_when(num_positions > 0 ~ sum(pos*fwd_return_spot*capital/(2*num_positions), na.rm = TRUE), TRUE ~ 0.),
# pnl from change in basis
basis_change_pnl = (perp_pnl + spot_pnl),
# pnl from funding on positions (we get funding on the half of our capital which is short perps)
funding_pnl = case_when(num_positions > 0 ~ sum(pos*fwd_funding*capital/(2*num_positions)), TRUE ~ 0.),
# costs accrue proportionally to each position (noting no minimum commission)
costs = case_when(num_positions > 0 ~ -sum(costs*capital/num_positions, na.rm = TRUE), TRUE ~ 0.),
total_pnl = basis_change_pnl + funding_pnl + costs
)
results_df %>%
select(date, basis_change_pnl, funding_pnl, costs, total_pnl) %>%
tidyr::pivot_longer(-date, names_to = "pnl_source", values_to = "pnl") %>%
ggplot(aes(x = date, y = pnl, colour = pnl_source)) +
geom_line() +
labs(title = "PnL by source")
options(repr.plot.width = 14, repr.plot.height=15)
results_df %>%
filter(date >= as.data.table(results_df)[, min(date)]) %>%
mutate(
basis_cum_pnl = cumsum(basis_change_pnl),
funding_cum_pnl = cumsum(funding_pnl),
costs = cumsum(costs),
total_cum_pnl = cumsum(total_pnl)
) %>%
select(date, basis_cum_pnl, funding_cum_pnl, costs, total_cum_pnl) %>%
tidyr::pivot_longer(-date, names_to = "pnl_source", values_to = "pnl") %>%
ggplot(aes(x = date, y = pnl, colour = pnl_source)) +
geom_line() +
facet_wrap(~pnl_source, scales = "free_y", ncol = 1) +
labs(
title = "PnL to price change, funding, and costs. Starting capital ",
subtitle = "Note different y-axes"
)
options(repr.plot.width = 14, repr.plot.height=7)
na.omit(results_df) %>%
summarise(
Ann.Return = mean(total_pnl)*365/capital,
Ann.Vol = sd(total_pnl/capital)*sqrt(365),
Ann.Sharpe = Ann.Return/Ann.Vol
) %>%
mutate(across(.cols = everything(), .fns = ~ round(.x, 3)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.