library(data.table)
library(arrow)
library(dplyr)
library(qlcal)
library(ggplot2)
# SET UP ------------------------------------------------------------------
# Global vars
PATH = "F:/data/equity/us"
PATH_DATASET = "F:/data/strategies/pead"
# Set NYSE calendar
setCalendar("UnitedStates/NYSE")
# Negated %in%, used below (not exported by data.table or base R)
`%notin%` = Negate(`%in%`)
# TODO: not sure if this constant is needed
# update = TRUE
# EARNINGS ANNOUNCEMENT DATA ----------------------------------------------
# get events data
events = read_parquet(fs::path(PATH, "fundamentals", "earning_announcements", ext = "parquet"))
setDT(events) # read_parquet returns a tibble; convert for data.table syntax below
# Plot number of rows by date
if (interactive()) {
  ggplot(events[, .N, by = date][order(date)], aes(x = date, y = N)) +
    geom_line() +
    theme_minimal()
}
# Keep only observations strictly before today
events = events[date < Sys.Date()]
# Inspect and remove duplicated symbol / date pairs
events[, .N, by = c("symbol", "date")][N > 1] # show duplicates, if any
events = unique(events, by = c("symbol", "date"))
# Get investing.com data
investingcom_ea = read_parquet(
fs::path(
PATH,
"fundamentals",
"earning_announcements_investingcom",
ext = "parquet"
)
)
setDT(investingcom_ea) # convert tibble to data.table
# Keep needed columns, convert time to date, and prefix column names
investingcom_ea = investingcom_ea[, .(symbol, time, eps, eps_forecast, revenue, revenue_forecast, right_time)]
investingcom_ea[, date_investingcom := as.Date(time)]
investingcom_ea = investingcom_ea[date_investingcom <= qlcal::advanceDate(Sys.Date(), 2)]
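# qlcal::advanceDate steps forward by business days on the NYSE calendar set
# above; a quick illustration with an assumed date (2024-11-01 is a Friday,
# so +2 business days lands on Tuesday):
if (interactive()) {
  qlcal::advanceDate(as.Date("2024-11-01"), 2) # 2024-11-05
}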
setnames(investingcom_ea,
colnames(investingcom_ea)[2:6],
paste0(colnames(investingcom_ea)[2:6], "_investingcom"))
setorder(investingcom_ea, date_investingcom)
# Spot-check one week of announcements
investingcom_ea[date_investingcom %between% c("2024-11-01", "2024-11-08")][!is.na(eps_investingcom)]
# Plot number of rows by date
if (interactive()) {
  ggplot(investingcom_ea[, .N, by = date_investingcom][order(date_investingcom)],
         aes(x = date_investingcom, y = N)) +
    geom_line() +
    theme_minimal()
}
# Get earnings surprises data from FMP
es = read_parquet(
fs::path(
PATH,
"fundamentals",
"earning_surprises",
ext = "parquet"
)
)
setDT(es) # convert tibble to data.table
if (interactive()) {
  ggplot(es[, .N, by = date][order(date)], aes(x = date, y = N)) +
    geom_line() +
    theme_minimal()
}
# Merge FMP events with Investing.com earnings data
events = merge(
events,
investingcom_ea,
by.x = c("symbol", "date"),
by.y = c("symbol", "date_investingcom"),
all.x = TRUE,
all.y = FALSE
)
events = merge(events, es, by = c("symbol", "date"), all = TRUE)
# Keep only observations available in both datasets (non-missing dates on both sides)
events = events[!is.na(date) & !is.na(as.Date(time_investingcom))]
# Check whether announcement times agree across sources
events[!is.na(right_time) & right_time == "marketClosed ", right_time := "amc"]
events[!is.na(right_time) & right_time == "marketOpen ", right_time := "bmo"]
events[, same_announce_time := time == right_time]
# If both FMP and Investing.com data exist, keep only rows where announcement times agree
print(paste0("Number of observations removed because announcement times differ: ",
             sum(!(events$same_announce_time == TRUE), na.rm = TRUE), " or ",
             round(sum(!(events$same_announce_time == TRUE), na.rm = TRUE) / nrow(events), 4) * 100, "%."))
events = events[same_announce_time == TRUE]
# Remove duplicated events
events = unique(events, by = c("symbol", "date"))
# Keep only rows where EPS is similar (within +/- eps_threshold) in at least
# one of the datasets (Investing.com or FMP surprises)
eps_threshold = 0.1
events_ic_similar = events[eps >= eps_investingcom * (1 - eps_threshold) &
                             eps <= eps_investingcom * (1 + eps_threshold)]
events_fmp_similar = events[eps >= actualEarningResult * (1 - eps_threshold) &
                              eps <= actualEarningResult * (1 + eps_threshold)]
events = unique(rbind(events_ic_similar,
events_fmp_similar,
events[date > (Sys.Date() - 1)]))
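# Toy check of the tolerance band on hypothetical values: 2.05 vs. 2.00 is
# within +/-10% and passes, 1.50 vs. 2.00 is not
if (interactive()) {
  eps_pairs = data.table(eps = c(2.05, 1.50), eps_other = c(2.00, 2.00))
  eps_pairs[eps >= eps_other * (1 - eps_threshold) & eps <= eps_other * (1 + eps_threshold)]
}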
# Plot number of rows by date
if (interactive()) {
  ggplot(events[, .N, by = date][order(date)], aes(x = date, y = N)) +
    geom_line() +
    theme_minimal()
}
# Remove columns we don't need
cols_remove = c(
"eps_investingcom", "eps_forecast_investingcom", "revenue_investingcom",
"revenue_forecast_investingcom", "right_time", "actualEarningResult",
"estimatedEarning", "same_announce_time", "time_investingcom", "updatedFromDate")
events[, (cols_remove) := NULL]
# Check for business days
events[, all(isBusinessDay(date))]
events[, sum(!isBusinessDay(date))]
round((events[, sum(!isBusinessDay(date))] / nrow(events)) * 100, 2) # share in %
# Drop all symbols that have any event on a non-business day
events = events[symbol %notin% events[!isBusinessDay(date), symbol]]
events[, all(isBusinessDay(date))]
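# isBusinessDay follows the NYSE calendar set above; e.g. the July 4th holiday
# vs. the following regular trading Friday (assumed dates for illustration):
if (interactive()) {
  isBusinessDay(as.Date(c("2024-07-04", "2024-07-05"))) # FALSE TRUE
}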
# Checks
events[, max(date)] # last date
events[date == max(date), symbol] # Symbols for last date
# MARKET DATA AND FUNDAMENTALS ---------------------------------------------
# Get factors
path_to_parquet = fs::path(PATH, "predictors_daily", "factors", "prices_factors.parquet")
events_symbols = c(events[, unique(symbol)], "SPY")
prices_dt = open_dataset(path_to_parquet, format = "parquet") |>
  dplyr::filter(date > as.Date("2008-01-01"), symbol %in% events_symbols) |>
  dplyr::arrange(symbol, date) |>
  collect() |>
  setDT()
# Checks and summaries
prices_dt[, max(date, na.rm = TRUE)]
# Filter dates and symbols
prices_dt = unique(prices_dt, by = c("symbol", "date"))
prices_n = prices_dt[, .N, by = symbol]
prices_n = prices_n[N > 700] # keep symbols with more than 700 observations
prices_dt = prices_dt[symbol %chin% prices_n[, symbol]]
# Remove symbols that have (almost) constant close prices
prices_dt[, sd_roll := roll::roll_sd(close, 22 * 6), by = symbol]
symbols_remove = prices_dt[sd_roll == 0, unique(symbol)]
prices_dt = prices_dt[symbol %notin% symbols_remove]
# Set key
setkey(prices_dt, "symbol")
setorder(prices_dt, symbol, date)
key(prices_dt)
# Rank symbols by dollar volume within each date (1 = most liquid)
prices_dt[, dollar_vol_rank := frankv(close_raw * volume, order = -1L), by = date]
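# frankv with order = -1L ranks in descending order, so the largest dollar
# volume gets rank 1; a minimal illustration:
if (interactive()) {
  frankv(c(10, 30, 20), order = -1L) # 3 1 2
}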
# SPY data
spy = open_dataset("F:/data/equity/daily_fmp_all.csv", format = "csv") |>
dplyr::filter(symbol == "SPY") |>
dplyr::select(date, adjClose) |>
dplyr::rename(close = adjClose) |>
collect()
setDT(spy)
spy[, returns := close / shift(close) - 1]
spy = na.omit(spy)
# Free memory
gc()
# Check whether dates from events are aligned with dates from prices.
# These checks should be done after the market closes, but my data is updated
# after 00:00, so take this into account.
events[, max(date)]
last_trading_day = events[, data.table::last(sort(unique(date)), 3)[1]] # third-from-last event date
last_trading_day_corrected = events[, data.table::last(sort(unique(date)), 4)[1]] # fourth-from-last event date
prices_dt[, max(date)]
prices_dt[date == last_trading_day]
prices_dt[date == last_trading_day_corrected]
# REGRESSION LABELING ----------------------------------------------------------
# Calculate forward returns entered at the next day's close:
# ret_k = close[t+k+1] / close[t+1] - 1
prices_dt[, ret_5 := shift(close, 6L, type = "lead") / shift(close, 1L, type = "lead") - 1, by = "symbol"]
prices_dt[, ret_22 := shift(close, 22L, type = "lead") / shift(close, 1L, type = "lead") - 1, by = "symbol"]
prices_dt[, ret_44 := shift(close, 44L, type = "lead") / shift(close, 1L, type = "lead") - 1, by = "symbol"]
prices_dt[, ret_66 := shift(close, 66L, type = "lead") / shift(close, 1L, type = "lead") - 1, by = "symbol"]
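# Sanity check of the lead logic on a toy series: with close = 1:10, the first
# element of ret_5 is close[7] / close[2] - 1 = 2.5
if (interactive()) {
  z = 1:10
  shift(z, 6L, type = "lead") / shift(z, 1L, type = "lead") - 1
}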
# calculate rolling sd
prices_dt[, sd_5 := roll::roll_sd(returns, 5), by = "symbol"]
prices_dt[, sd_22 := roll::roll_sd(close / shift(close, 1L) - 1, 22), by = "symbol"]
prices_dt[, sd_44 := roll::roll_sd(close / shift(close, 1L) - 1, 44), by = "symbol"]
prices_dt[, sd_66 := roll::roll_sd(close / shift(close, 1L) - 1, 66), by = "symbol"]
# calculate spy returns
spy[, ret_5_spy := shift(close, 6L, type = "lead") / shift(close, 1L, type = "lead") - 1]
spy[, ret_22_spy := shift(close, 22L, type = "lead") / shift(close, 1L, type = "lead") - 1]
spy[, ret_44_spy := shift(close, 44L, type = "lead") / shift(close, 1L, type = "lead") - 1]
spy[, ret_66_spy := shift(close, 66L, type = "lead") / shift(close, 1L, type = "lead") - 1]
# calculate excess returns
prices_dt <- merge(prices_dt,
spy[, .(date, ret_5_spy, ret_22_spy, ret_44_spy, ret_66_spy)],
by = "date", all.x = TRUE, all.y = FALSE)
prices_dt[, ret_5_excess := ret_5 - ret_5_spy]
prices_dt[, ret_22_excess := ret_22 - ret_22_spy]
prices_dt[, ret_44_excess := ret_44 - ret_44_spy]
prices_dt[, ret_66_excess := ret_66 - ret_66_spy]
prices_dt[, `:=`(ret_5_spy = NULL, ret_22_spy = NULL, ret_44_spy = NULL, ret_66_spy = NULL)]
setkey(prices_dt, symbol)
setorder(prices_dt, symbol, date)
# Calculate standardized excess returns
prices_dt[, ret_excess_stand_5 := ret_5_excess / shift(sd_5, -4L), by = "symbol"]
prices_dt[, ret_excess_stand_22 := ret_22_excess / shift(sd_22, -21L), by = "symbol"]
prices_dt[, ret_excess_stand_44 := ret_44_excess / shift(sd_44, -43L), by = "symbol"]
prices_dt[, ret_excess_stand_66 := ret_66_excess / shift(sd_66, -65L), by = "symbol"]
# Calculate standardized returns
prices_dt[, ret_stand_5 := ret_5 / shift(sd_5, -4L), by = "symbol"]
prices_dt[, ret_stand_22 := ret_22 / shift(sd_22, -21L), by = "symbol"]
prices_dt[, ret_stand_44 := ret_44 / shift(sd_44, -43L), by = "symbol"]
prices_dt[, ret_stand_66 := ret_66 / shift(sd_66, -65L), by = "symbol"]
# Remove unnecessary columns
prices_dt[, `:=`(sd_5 = NULL, sd_22 = NULL, sd_44 = NULL, sd_66 = NULL)]
# prices_dt[, `:=`(ret_5 = NULL, ret_22 = NULL, ret_44 = NULL, ret_66 = NULL)]
# Standardized excess returns are not used downstream, so drop them too
prices_dt[, `:=`(ret_excess_stand_5 = NULL, ret_excess_stand_22 = NULL,
                 ret_excess_stand_44 = NULL, ret_excess_stand_66 = NULL)]
# Calculate one day return. We will use that in results, to calculate portfolio
# returns easier.
prices_dt[, ret_1_lead := shift(close, type = "lead") / close - 1, by = "symbol"]
# MERGE MARKET DATA, EVENTS AND CLASSIFICATION LABELS --------------------------
# Merge events and prices with a backward rolling join: each event row is
# matched to the most recent available price row
events[, date_event := date]
prices_dt[, date_prices := date]
dataset = prices_dt[events, on = c("symbol", "date"), roll = Inf]
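# Minimal illustration of the rolling join on hypothetical data: the event on
# Jan 3 has no price row, so roll = Inf carries Jan 2's row forward
if (interactive()) {
  px = data.table(symbol = "AAA", date = as.Date("2024-01-01") + c(0, 1, 4), close = c(10, 11, 12))
  ev = data.table(symbol = "AAA", date = as.Date("2024-01-03"))
  px[ev, on = c("symbol", "date"), roll = Inf]
}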
# Define possible target columns
possible_target_vars = colnames(dataset)[grepl("^ret_", colnames(dataset))]
# Extreme labeling (potential look-ahead bias: quantiles are computed on the full sample)
bin_extreme_col_names <- paste0("bin_extreme_", possible_target_vars)
dataset[, (bin_extreme_col_names) := lapply(.SD, function(x) {
y <- cut(x,
quantile(x, probs = c(0, 0.2, 0.8, 1), na.rm = TRUE),
labels = c(-1, NA, 1),
include.lowest = TRUE)
as.factor(droplevels(y))
}), .SDcols = possible_target_vars]
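# The same cut-with-NA-label idiom on made-up numbers: roughly the bottom 20%
# maps to -1, the top 20% to 1, and the middle to NA (dropped by droplevels)
if (interactive()) {
  x_ = c(-5, -1, 0, 0.5, 1, 6)
  cut(x_, quantile(x_, probs = c(0, 0.2, 0.8, 1)), labels = c(-1, NA, 1), include.lowest = TRUE)
}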
# Around-zero labeling: the third of observations closest to zero (by absolute
# value) gets label 0; values above / below that band get 1 / -1
labeling_around_zero <- function(x) {
  x_abs <- abs(x)
  bin <- cut(x_abs, quantile(x_abs, probs = c(0, 0.3333), na.rm = TRUE),
             labels = 0L, include.lowest = TRUE)
  max_0 <- max(x[bin == 0], na.rm = TRUE)
  min_0 <- min(x[bin == 0], na.rm = TRUE)
  levels(bin) <- c(levels(bin), 1L, -1L)
  bin[x > max_0] <- as.character(1L)
  bin[x < min_0] <- as.character(-1L)
  return(bin)
}
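# Quick check on made-up returns: the pair closest to zero gets 0
if (interactive()) {
  labeling_around_zero(c(-0.10, -0.02, -0.01, 0.01, 0.02, 0.10)) # -1 -1 0 0 1 1
}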
bin_aroundzero_col_names <- paste0("bin_aroundzero_", possible_target_vars)
dataset[, (bin_aroundzero_col_names) := lapply(.SD, labeling_around_zero), .SDcols = possible_target_vars]
# Simple labeling (ret > 0 -> 1, else 0)
bin_simple_col_names <- paste0("bin_simple_", possible_target_vars)
dataset[, (bin_simple_col_names) := lapply(.SD, function(x) {
as.factor(ifelse(x > 0, 1, 0))
}), .SDcols = possible_target_vars]
# decile labeling
bin_decile_col_names <- paste0("bin_decile_", possible_target_vars)
dataset[, (bin_decile_col_names) := lapply(.SD, function(x) {
y <- cut(x,
quantile(x, probs = c(0, seq(0.1, 0.9, 0.1), 1), na.rm = TRUE),
labels = 1:10,
include.lowest = TRUE)
as.factor(droplevels(y))
}), .SDcols = possible_target_vars]
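# Decile labels on simulated values for intuition: each label should cover
# roughly 10% of observations
if (interactive()) {
  x_ = rnorm(1000)
  table(cut(x_, quantile(x_, probs = c(0, seq(0.1, 0.9, 0.1), 1)),
            labels = 1:10, include.lowest = TRUE))
}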
# Sort dataset and set key
setkey(dataset, symbol)
setorderv(dataset, c("symbol", "date"))
# Checks
dataset[, .(date, date_event, date_prices)]
dataset[date_event != date_prices, .(date, date_event, date_prices)] # observations filled by the roll
events[, max(date)]
last_trading_day = events[, data.table::last(sort(unique(date)), 3)[1]]
last_trading_day_corrected = events[, data.table::last(sort(unique(date)), 4)[1]]
prices_dt[, max(date)]
prices_dt[date == last_trading_day]
prices_dt[date == last_trading_day_corrected]
symbols_ = dataset[date == max(date), symbol]
prices_dt[date == max(date) & symbol %in% symbols_, .SD, .SDcols = c("date", "symbol", "maxret")]
dataset[date == max(date), .SD, .SDcols = c("date", "symbol", "maxret")]
dataset[, max(date_prices, na.rm = TRUE)]
dataset[date_prices == max(date_prices, na.rm = TRUE), .SD, .SDcols = c("date", "symbol", "maxret")]
cols_ = c("date", "symbol", "maxret", "indmom")
dataset[date == last_trading_day_corrected, .SD, .SDcols = cols_]
# PEAD EFFECT -------------------------------------------------------------
# Calculate PEAD effect
back = dataset[, .(symbol, date, date_event, date_prices, time, marketCap, open,
close, volume, close_raw, returns, dollar_vol_rank,
eps, epsEstimated, revenue, revenueEstimated,
ret_5, ret_22, ret_44, ret_66)]
# EPS surprise: actual minus estimate, scaled by the absolute estimate
back[, eps_surprise := (eps - epsEstimated) / abs(epsEstimated)]
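# E.g. an actual EPS of 1.10 against an estimate of 1.00 gives a surprise of
# (1.10 - 1.00) / |1.00| = 0.10, i.e. a 10% beat (hypothetical numbers)
if (interactive()) {
  (1.10 - 1.00) / abs(1.00) # 0.1
}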
# Summaries: mean forward returns for all events, positive surprises, and large positive surprises
back[, .(
mret5 = mean(ret_5, na.rm = TRUE),
mret22 = mean(ret_22, na.rm = TRUE),
mret44 = mean(ret_44, na.rm = TRUE),
mret66 = mean(ret_66, na.rm = TRUE)
)]
back[eps_surprise > 0, .(
mret5 = mean(ret_5, na.rm = TRUE),
mret22 = mean(ret_22, na.rm = TRUE),
mret44 = mean(ret_44, na.rm = TRUE),
mret66 = mean(ret_66, na.rm = TRUE)
)]
back[eps_surprise > 0.1, .(
mret5 = mean(ret_5, na.rm = TRUE),
mret22 = mean(ret_22, na.rm = TRUE),
mret44 = mean(ret_44, na.rm = TRUE),
mret66 = mean(ret_66, na.rm = TRUE)
)]
# Regression: forward 5-day return vs. EPS surprise
ggplot(back, aes(x = eps_surprise, y = ret_5)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()
ggplot(back[eps_surprise %between% c(-10, 100)], aes(x = eps_surprise, y = ret_5)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()
ggplot(back[eps_surprise > 0], aes(x = eps_surprise, y = ret_5)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()
ggplot(back[eps_surprise > 0], aes(x = eps_surprise, y = ret_5)) +
  geom_point() +
  geom_smooth(method = "glm") +
  theme_minimal()
ggplot(back[eps_surprise > 0], aes(x = eps_surprise, y = ret_5)) +
  geom_point() +
  geom_smooth(method = "gam") +
  theme_minimal()
# Bars: average forward returns by EPS-surprise decile (equal-count bins via ranks)
back[, eps_surprise_bins_equal := frank(eps_surprise)]
back[, eps_surprise_bins_equal := cut(
  eps_surprise_bins_equal,
  quantile(eps_surprise_bins_equal, probs = seq(0, 1, by = 0.1), na.rm = TRUE),
  include.lowest = TRUE)]
back[, .N, by = eps_surprise_bins_equal]
# Mean and median of each forward-return horizon by surprise decile
for (stat_fun in c("mean", "median")) {
  for (ret_col in c("ret_5", "ret_22", "ret_44", "ret_66")) {
    p = back[, .(value = do.call(stat_fun, list(get(ret_col), na.rm = TRUE))),
             by = eps_surprise_bins_equal] |>
      ggplot(aes(x = eps_surprise_bins_equal, y = value)) +
      geom_bar(stat = "identity") +
      labs(title = paste(stat_fun, "of", ret_col, "by EPS-surprise decile")) +
      theme_minimal()
    print(p)
  }
}
# TODO: examine the impact of dollar volume on the PEAD effect
# QC VS LOCAL -------------------------------------------------------------
# Compare a straddle over earnings between QuantConnect (QC) and local data
symbol_ = "BAC"
date_1 = as.Date("2021-12-20")
date_2 = as.Date("2022-01-20")
dataset[symbol == symbol_ & date %between% c(date_1, date_2), .(symbol, date, date_event, date_prices, time)]