#'@title STL Decompose Volume Data.
#'@description A function that decompose volume data automatically.
#'
#'@details This function will generate volume data automatically, and save the plots for transformation and STL Decomposition.
#'
#'Decomposition for both intra day periodicity and intra week periodicity will be applied.
#'
#'@param All_dta_dir The directory for the CSV file containing the raw volume data.
#'@param lag_days The maximum lagging days of acf plot, default = 5.
#'@param long_memory_tests Wheather perform long memory tests or not, default = TRUE.
#'@param sec Seconds between intervals, default = 1800.
#'@param peak_param The threashold for peak values quantiles, default = 0.995.
#'@param sub_title The subtitle of the figures, default = "".
#'@param acf Wheather to save acf plots or not, default = TRUE.
#'@param mixreg Wheather to perform analysis of mixture regression or not, default = FALSE.
#'@param splines Should the mixture regression being linear or spline expanded, default = FALSE.
#'
#'@return figures and a table will be saved under a folder.
#'
#'@examples
#'
#'\dontrun{
#'Return_data <- read.dta("MSFT 1800 Sec Summary Return Data.dta")
#'SherryChapter1::Decompose_Volume(Return_data)
#'}
#'
#'@import ggplot2
#'@import magrittr
#'@import reshape2
#'@import MASS
#'@import forecast
#'@import LongMemoryTS
#'@import flexmix
#'@import splines
#'
#'@export
#'
Decompose_Volume <- function(Return_data,
lag_days = 10,
long_memory_tests = TRUE,
sec = 1800,
peak_param = 0.995,
sub_title = "",
acf = TRUE,
mixreg = FALSE,
splines = FALSE) {
figure_dir = "Volume_decompose_results"
if(!file.exists(figure_dir)) dir.create(figure_dir)
Return_data$volume[is.na(Return_data$volume)] = 0
#Convert the num_trades data into a matrix, with columns correspond to days.
all_days_lst <- split(Return_data$volume, Return_data$date)
Volume_M <- t(matrix(unlist(all_days_lst),
nrow = length(unique(Return_data$date)),
byrow = T))
rm(all_days_lst)
colnames(Volume_M) = unique(Return_data$date)
ts_M <- Volume_M
feature_M <- data.frame(V_raw = colSums(ts_M, na.rm = T))
ts_M[ts_M < quantile(ts_M, peak_param, na.rm = T)] = NA
feature_M$V_raw_peak = colSums(ts_M, na.rm = T)
rm(ts_M)
#Define the quantities in the time series.
M = nrow(Volume_M) #within day sampling frequency
t = ncol(Volume_M) #total number of days
elapse = 1/M #normalized time elapse between observations
plot_df <- data.frame(value = c(rowMeans(Volume_M),
rowVars(Volume_M)),
interval = rep(seq_len(M),2),
Stats = rep(c(paste0("Avg(Volume)"),
paste0("Var(Volume)")), each = M))
p1 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value), colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
labs(x = paste0(sec, " sec interval"), title = paste0("Original Intraday Volume"), subtitle = sub_title)
ggsave(file.path(figure_dir,"1.Intraday_Volume_Original.pdf"), p1, width = 5.5, height = 2.8)
plot_df <- data.frame(value = c(rowMeans(log(Volume_M+1)),
rowVars(log(Volume_M+1))),
interval = rep(seq_len(M),2),
Stats = rep(c(paste0("Avg(ln(Volume+1))"),
paste0("Var(ln(Volume+1))")), each = M))
p2 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value), colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
labs(x = paste0(sec, " sec interval"), title = paste0("Transformed Intraday Volume"), subtitle = sub_title)
ggsave(file.path(figure_dir,"2.Intraday_Volume_log.pdf"), p2, width = 5.5, height = 2.8)
#Plot the periodicity of mean absolute return.
plot_df <- data.frame(value = c(rowMeans(log(Volume_M+1)),
rowMeans(matrix(abs(Return_data$r),
nrow = M,
byrow = F))),
interval = rep(seq_len(M),2),
Stats = rep(c(paste0("Avg(ln(Volume+1))"),
paste0("Avg(|Return|)")), each = M))
p3 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value),colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
labs(x = paste0(sec, " sec interval"), title = paste0("Periodicity of Average Absolute Return and Volume"), subtitle = sub_title)
ggsave(file.path(figure_dir,"3.Intraday_Absolute_Return_log.pdf"), p3, width = 5.5, height = 2.8)
plot_df2 <- data.frame(y = plot_df$value[plot_df$Stats == "Avg(|Return|)"],
x = plot_df$value[plot_df$Stats == "Avg(ln(Volume+1))"])
Rsq <- paste0("R.squared = ", round(summary(lm(y~x,data = plot_df2))$r.squared,3))
p4 <- ggplot(plot_df2, aes(x = x, y = y)) +
geom_point(colour = "dark blue", alpha = .5) +
geom_smooth(se = F, method = "lm") +
geom_label(data = data.frame(label = Rsq,
x = min(plot_df2$x)+(max(plot_df2$x)-min(plot_df2$x))*0.3,
y = min(plot_df2$y)+(max(plot_df2$y)-min(plot_df2$y))*0.8),
aes(label = label,
x = x,
y = y)) +
labs(y = "Avg(|Return|)", x = "Avg(ln(Volume+1))", title = "Correlation Plot", subtitle = sub_title) +
theme_classic()
ggsave(file.path(figure_dir,"4.Correlation_plot.pdf"), p4, width = 3, height = 2.8)
if(mixreg){
set.seed(1802)
if(splines){
knots <- quantile(plot_df2$x, probs = c(0.025, 0.125, 0.25, 0.375, 0.5, 0.75, 0.975)) + c(+0.01, 0, 0, 0, 0, 0, -0.01)
ex1 <- flexmix(y ~ ns(x, knots = knots), data = plot_df2, k = 2, control = list(verb = 5, iter = 100))
} else {
ex1 <- flexmix(y ~ x, data = plot_df2, k = 2, control = list(verb = 5, iter = 100))
}
fitted_curves <- predict(ex1,data = plot_df2$x)
larger_class <- which.max(c(quantile(fitted_curves$Comp.1,.95),
quantile(fitted_curves$Comp.2,.95)))
posterior_larger_class <- ex1@posterior$scaled[,larger_class]
plot_df3 <- plot_df2
plot_df3$posterior_c1 = posterior_larger_class
plot_df3$fity_c1 = fitted_curves[[larger_class]]
plot_df3$fity_c2 = fitted_curves[[-larger_class]]
p5 <- ggplot(plot_df3, aes(x = x)) +
geom_point(aes(colour = posterior_c1, y = y), alpha = .5) +
geom_line(aes(y = fity_c1), alpha = .8, colour = "#4daf4a") +
geom_line(aes(y = fity_c2), alpha = .8, colour = "#e41a1c") +
labs(y = "Avg(|Return|)", x = "Avg(ln(Volume+1))", title = "Mixture of 2 Linear Regression Models", subtitle = sub_title) +
theme_classic()
ggsave(file.path(figure_dir,"5.Posterior_dot.pdf"),p5,width = 4, height = 2.8)
rescale_i <- function(x) (x - mean(x))/sd(x)
plot_df4 <- data.frame(value = c(rescale_i(rowMeans(log(Volume_M+1))),
rescale_i(rowMeans(matrix(abs(Return_data$r),
nrow = M,
byrow = F))),
posterior_larger_class),
interval = rep(seq_len(M),3),
Stats = rep(c(paste0("Avg(ln(Volume+1)) z score"),
paste0("Avg(|Return|) z score" ),
"Posterior Prob c1"),
each = M),
Facet_var = rep(c("Z scores of average statistics",
"Z scores of average statistics",
"Posterior Probability C1"),
each = M))
plot_df4$Facet_var = factor(plot_df4$Facet_var, levels = c("Z scores of average statistics","Posterior Probability C1"))
p6 <- ggplot(plot_df4) + geom_line(aes(x = interval, y = value, colour = Stats), size = 0.3) + theme_bw() +
facet_wrap(~Facet_var,nrow = 2,scales = "free") +
labs(x = paste0(sec, " sec interval"),
title = paste0("Posterior Probability of the Mixing Component 1"),
subtitle = sub_title) +
scale_colour_manual(values = c("#377eb8","#4daf4a","#e41a1c"))
ggsave(file.path(figure_dir,"6.Posterior_day.pdf"),p6,width = 5.5, height = 4)
rm(plot_df,plot_df2,plot_df3,plot_df4)
}else{
rm(plot_df,plot_df2)
}
#Time series decomposition with stl BY DAY
volume_daily_ts <- ts(log(Return_data$volume+1),
start = 0,
end = t-elapse,
frequency = M)
stl.object <- stl(volume_daily_ts,
s.window = M, #Apply daily smooth periodic estimation
robust = T) #Use robust loess estimation
stl.decomposed <- as.data.frame(stl.object$time.series)
seasonal_removed_ts <- exp(stl.decomposed$remainder + stl.decomposed$trend) - 1
rm(stl.object)
#Plot the seasonal component against the original log means
plot_df <- data.frame(value = c(rowMeans(log(Volume_M+1)) - mean(rowMeans(log(Volume_M+1))),
stl.decomposed$seasonal[1:M]),
interval = rep(seq_len(M),2),
Stats = rep(c(paste0("Centered Avg(ln(Volume+1))"),
paste0("STL s component estimates")), each = M))
p5 <- ggplot(plot_df) +
geom_line(aes(x = interval, y = value, color = Stats, linetype = Stats), size = 0.2, alpha = 0.8) +
theme_classic() +
labs(x = paste0(sec, " sec interval"),
title = paste0("STL Seasonal Component Estimates"),
subtitle = sub_title) + scale_color_manual(values = c("dark blue","red"))
ggsave(file.path(figure_dir,"5.STL_Component_Daily.pdf"), p5, width = 5.5, height = 2.8)
#Plot the ACF plot after removal of periodicity
if (acf){
plot_df <- data.frame(
Filtered = acf(seasonal_removed_ts, lag.max = lag_days*M, plot = F)$acf,
Raw = acf(Return_data$volume, lag.max = lag_days*M, plot = F)$acf
)
plot_df <- melt(plot_df)
plot_df$Daily_lag <- seq(0,lag_days,length.out = nrow(plot_df)/2)
lower_bound <- round(min(plot_df$value) - IQR(plot_df$value),1)
pacf <- ggplot(plot_df) + geom_line(aes(x = Daily_lag,
y = value,
colour = variable,
linetype = variable)) +
theme_classic() +
scale_x_continuous(breaks = 0:lag_days) +
labs(x = "Daily lag", y = "Autocorrelation", subtitle = sub_title) +
scale_y_continuous(limits = c(lower_bound,1), breaks = seq(lower_bound,1,by = 0.1)) +
scale_colour_brewer(palette = "Set1") + scale_linetype_manual(values = c(2,1))
ggsave(file.path(figure_dir, "6.acf_daily_filter.pdf"), pacf, width = 6, height = 3)
rm(plot_df)
}
ts_M <- t(matrix(seasonal_removed_ts,
nrow = t,
byrow = T))
feature_M$V_daily_stl = colSums(ts_M,na.rm = T)
ts_M[ts_M < quantile(ts_M,peak_param,na.rm = T)] = NA
feature_M$V_daily_stl_peak = colSums(ts_M, na.rm = T)
rm(ts_M)
if(long_memory_tests){
m = floor(length(Return_data$volume)^0.65)
test_raw_lst = local.W(Return_data$volume,m = m)
test_filtered_lst = local.W((seasonal_removed_ts),m = m)
M_LMT = data.frame(d = c(NA,NA,NA), s.e. = c(NA,NA,NA))
rownames(M_LMT) = c("raw_volume","filtered_volume_day","filtered_volume_week")
M_LMT[1,1] = test_raw_lst$d
M_LMT[1,2] = test_raw_lst$s.e.
M_LMT[2,1] = test_filtered_lst$d
M_LMT[2,2] = test_filtered_lst$s.e.
}
#Time series decomposition with stl BY WEEK
N = 5 #Span of the week
volume_weekly_ts <- ts(Return_data$volume,
start = 0,
end = (t-elapse)/N,
frequency = M*N)
stl.object <- stl(log(volume_weekly_ts + 1),
s.window = M*N, #Apply smooth periodic estimation
robust = T) #Use robust loess estimation (outlier detection) and missing value interperlation
stl.decomposed <- as.data.frame(stl.object$time.series)
seasonal_removed_ts <- exp(stl.decomposed$remainder+stl.decomposed$trend) - 1
rm(stl.object,stl.decomposed)
if(acf){
plot_df <- data.frame(
Filtered = acf(seasonal_removed_ts, lag.max = lag_days*M, plot = F)$acf,
Raw = acf(Return_data$volume, lag.max = lag_days*M, plot = F)$acf
)
plot_df <- melt(plot_df)
plot_df$Daily_lag <- seq(0,lag_days,length.out = nrow(plot_df)/2)
lower_bound <- round(min(plot_df$value) - IQR(plot_df$value),1)
pacf <- ggplot(plot_df) + geom_line(aes(x = Daily_lag,
y = value,
colour = variable,
linetype = variable)) +
theme_classic() +
scale_x_continuous(breaks = 0:lag_days) +
labs(x = "Daily lag", y = "Autocorrelation", subtitle = sub_title) +
scale_y_continuous(limits = c(lower_bound,1), breaks = seq(lower_bound,1,by = 0.1)) +
scale_colour_brewer(palette = "Set1") + scale_linetype_manual(values = c(2,1))
ggsave(file.path(figure_dir, "7.acf_weekly_filter.pdf"), pacf, width = 6, height = 3)
rm(plot_df)
}
ts_M <- t(matrix(seasonal_removed_ts,
nrow = t,
byrow = T))
feature_M$V_weekly_stl = colSums(ts_M,na.rm = T)
ts_M[ts_M < quantile(ts_M,peak_param,na.rm = T)] = NA
feature_M$V_weekly_stl_peak = colSums(ts_M, na.rm = T)
rm(ts_M)
if(long_memory_tests){
m = floor(length(Return_data$volume)^0.65)
test_filtered_lst = local.W((seasonal_removed_ts),m = m)
M_LMT[3,1] = test_filtered_lst$d
M_LMT[3,2] = test_filtered_lst$s.e.
write.csv(M_LMT,file.path(figure_dir, "8.Volume_LM_test.csv"))
}
return(feature_M)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.