R/Decompose_Volume.R

Defines functions Decompose_Volume

Documented in Decompose_Volume

#'@title STL Decompose Volume Data.
#'@description A function that decompose volume data automatically.
#'
#'@details This function will generate volume data automatically, and save the plots for transformation and STL Decomposition.
#'
#'Decomposition for both intra day periodicity and intra week periodicity will be applied.
#'
#'@param All_dta_dir The directory for the CSV file containing the raw volume data.
#'@param lag_days The maximum lagging days of acf plot, default = 5.
#'@param long_memory_tests Wheather perform long memory tests or not, default = TRUE.
#'@param sec Seconds between intervals, default = 1800.
#'@param peak_param The threashold for peak values quantiles, default = 0.995.
#'@param sub_title The subtitle of the figures, default = "".
#'@param acf Wheather to save acf plots or not, default = TRUE.
#'@param mixreg Wheather to perform analysis of mixture regression or not, default = FALSE.
#'@param splines Should the mixture regression being linear or spline expanded, default = FALSE.
#'
#'@return figures and a table will be saved under a folder.
#'
#'@examples
#'
#'\dontrun{
#'Return_data <- read.dta("MSFT 1800 Sec Summary Return Data.dta")
#'SherryChapter1::Decompose_Volume(Return_data)
#'}
#'
#'@import ggplot2
#'@import magrittr
#'@import reshape2
#'@import MASS
#'@import forecast
#'@import LongMemoryTS
#'@import flexmix
#'@import splines
#'
#'@export
#'
Decompose_Volume <- function(Return_data,
                             lag_days = 10,
                             long_memory_tests = TRUE,
                             sec = 1800,
                             peak_param = 0.995,
                             sub_title = "",
                             acf = TRUE,
                             mixreg = FALSE,
                             splines = FALSE) {

figure_dir = "Volume_decompose_results"

if(!file.exists(figure_dir)) dir.create(figure_dir)

Return_data$volume[is.na(Return_data$volume)] = 0

#Convert the num_trades data into a matrix, with columns correspond to days.
all_days_lst <- split(Return_data$volume, Return_data$date)
Volume_M <- t(matrix(unlist(all_days_lst),
                         nrow = length(unique(Return_data$date)),
                         byrow = T))
rm(all_days_lst)

colnames(Volume_M) = unique(Return_data$date)

ts_M <- Volume_M

feature_M <- data.frame(V_raw = colSums(ts_M, na.rm = T))

ts_M[ts_M < quantile(ts_M, peak_param, na.rm = T)] = NA

feature_M$V_raw_peak = colSums(ts_M, na.rm = T)

rm(ts_M)

#Define the quantities in the time series.
M = nrow(Volume_M) #within day sampling frequency
t = ncol(Volume_M) #total number of days
elapse = 1/M #normalized time elapse between observations

plot_df <- data.frame(value = c(rowMeans(Volume_M),
                                rowVars(Volume_M)),
                      interval = rep(seq_len(M),2),
                      Stats =  rep(c(paste0("Avg(Volume)"),
                                     paste0("Var(Volume)")), each = M))
p1 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value), colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
  labs(x = paste0(sec, " sec interval"), title = paste0("Original Intraday Volume"), subtitle = sub_title)

ggsave(file.path(figure_dir,"1.Intraday_Volume_Original.pdf"), p1, width = 5.5, height = 2.8)

plot_df <- data.frame(value = c(rowMeans(log(Volume_M+1)),
                                rowVars(log(Volume_M+1))),
                      interval = rep(seq_len(M),2),
                      Stats =  rep(c(paste0("Avg(ln(Volume+1))"),
                                     paste0("Var(ln(Volume+1))")), each = M))
p2 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value), colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
  labs(x = paste0(sec, " sec interval"), title = paste0("Transformed Intraday Volume"), subtitle = sub_title)

ggsave(file.path(figure_dir,"2.Intraday_Volume_log.pdf"), p2, width = 5.5, height = 2.8)

#Plot the periodicity of mean absolute return.

plot_df <- data.frame(value = c(rowMeans(log(Volume_M+1)),
                                rowMeans(matrix(abs(Return_data$r),
                                                nrow = M,
                                                byrow = F))),
                      interval = rep(seq_len(M),2),
                      Stats =  rep(c(paste0("Avg(ln(Volume+1))"),
                                     paste0("Avg(|Return|)")), each = M))
p3 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value),colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
  labs(x = paste0(sec, " sec interval"), title = paste0("Periodicity of Average Absolute Return and Volume"), subtitle = sub_title)

ggsave(file.path(figure_dir,"3.Intraday_Absolute_Return_log.pdf"), p3, width = 5.5, height = 2.8)

plot_df2 <- data.frame(y = plot_df$value[plot_df$Stats == "Avg(|Return|)"],
                       x = plot_df$value[plot_df$Stats == "Avg(ln(Volume+1))"])

Rsq <- paste0("R.squared = ", round(summary(lm(y~x,data = plot_df2))$r.squared,3))

p4 <- ggplot(plot_df2, aes(x = x, y = y)) +
  geom_point(colour = "dark blue", alpha = .5) +
  geom_smooth(se = F, method = "lm") +
  geom_label(data = data.frame(label = Rsq,
                               x = min(plot_df2$x)+(max(plot_df2$x)-min(plot_df2$x))*0.3,
                               y = min(plot_df2$y)+(max(plot_df2$y)-min(plot_df2$y))*0.8),
             aes(label = label,
                 x = x,
                 y = y)) +
  labs(y = "Avg(|Return|)", x = "Avg(ln(Volume+1))", title = "Correlation Plot", subtitle = sub_title) +
  theme_classic()

ggsave(file.path(figure_dir,"4.Correlation_plot.pdf"), p4, width = 3, height = 2.8)


if(mixreg){
set.seed(1802)

if(splines){
  knots <- quantile(plot_df2$x, probs = c(0.025, 0.125, 0.25, 0.375, 0.5,  0.75, 0.975)) + c(+0.01, 0, 0, 0, 0, 0, -0.01)
  ex1 <- flexmix(y ~ ns(x, knots = knots), data = plot_df2, k = 2, control = list(verb = 5, iter = 100))
} else {
  ex1 <- flexmix(y ~ x,  data = plot_df2, k = 2, control = list(verb = 5, iter = 100))
}

fitted_curves <- predict(ex1,data = plot_df2$x)
larger_class <- which.max(c(quantile(fitted_curves$Comp.1,.95),
                            quantile(fitted_curves$Comp.2,.95)))
posterior_larger_class <- ex1@posterior$scaled[,larger_class]

plot_df3 <- plot_df2
plot_df3$posterior_c1 = posterior_larger_class
plot_df3$fity_c1 = fitted_curves[[larger_class]]
plot_df3$fity_c2 = fitted_curves[[-larger_class]]
p5 <- ggplot(plot_df3, aes(x = x)) +
  geom_point(aes(colour = posterior_c1, y = y), alpha = .5) +
  geom_line(aes(y = fity_c1), alpha = .8, colour = "#4daf4a") +
  geom_line(aes(y = fity_c2), alpha = .8, colour = "#e41a1c") +
  labs(y = "Avg(|Return|)", x = "Avg(ln(Volume+1))", title = "Mixture of 2 Linear Regression Models", subtitle = sub_title) +
  theme_classic()

ggsave(file.path(figure_dir,"5.Posterior_dot.pdf"),p5,width = 4, height = 2.8)

rescale_i <- function(x) (x - mean(x))/sd(x)

plot_df4 <- data.frame(value = c(rescale_i(rowMeans(log(Volume_M+1))),
                                 rescale_i(rowMeans(matrix(abs(Return_data$r),
                                                           nrow = M,
                                                           byrow = F))),
                                 posterior_larger_class),
                       interval = rep(seq_len(M),3),
                       Stats =  rep(c(paste0("Avg(ln(Volume+1)) z score"),
                                      paste0("Avg(|Return|) z score" ),
                                      "Posterior Prob c1"),
                                    each = M),
                       Facet_var = rep(c("Z scores of average statistics",
                                         "Z scores of average statistics",
                                         "Posterior Probability C1"),
                                       each = M))

plot_df4$Facet_var = factor(plot_df4$Facet_var, levels = c("Z scores of average statistics","Posterior Probability C1"))

p6 <- ggplot(plot_df4) + geom_line(aes(x = interval, y = value, colour = Stats), size = 0.3) + theme_bw() +
  facet_wrap(~Facet_var,nrow = 2,scales = "free") +
  labs(x = paste0(sec, " sec interval"),
       title = paste0("Posterior Probability of the Mixing Component 1"),
       subtitle = sub_title) +
  scale_colour_manual(values = c("#377eb8","#4daf4a","#e41a1c"))

ggsave(file.path(figure_dir,"6.Posterior_day.pdf"),p6,width = 5.5, height = 4)

rm(plot_df,plot_df2,plot_df3,plot_df4)
}else{
rm(plot_df,plot_df2)
}

#Time series decomposition with stl BY DAY

volume_daily_ts <- ts(log(Return_data$volume+1),
                      start = 0,
                      end = t-elapse,
                      frequency = M)

stl.object <- stl(volume_daily_ts,
                  s.window = M, #Apply daily smooth periodic estimation
                  robust = T) #Use robust loess estimation

stl.decomposed <- as.data.frame(stl.object$time.series)

seasonal_removed_ts <- exp(stl.decomposed$remainder + stl.decomposed$trend) - 1

rm(stl.object)

#Plot the seasonal component against the original log means
plot_df <- data.frame(value = c(rowMeans(log(Volume_M+1)) - mean(rowMeans(log(Volume_M+1))),
                                stl.decomposed$seasonal[1:M]),
                      interval = rep(seq_len(M),2),
                      Stats =  rep(c(paste0("Centered Avg(ln(Volume+1))"),
                                     paste0("STL s component estimates")), each = M))
p5 <- ggplot(plot_df) +
      geom_line(aes(x = interval, y = value, color = Stats, linetype = Stats), size = 0.2, alpha = 0.8) +
      theme_classic() +
      labs(x = paste0(sec, " sec interval"),
           title = paste0("STL Seasonal Component Estimates"),
           subtitle = sub_title) + scale_color_manual(values = c("dark blue","red"))

ggsave(file.path(figure_dir,"5.STL_Component_Daily.pdf"), p5, width = 5.5, height = 2.8)


#Plot the ACF plot after removal of periodicity

if (acf){

plot_df <- data.frame(
  Filtered = acf(seasonal_removed_ts, lag.max = lag_days*M, plot = F)$acf,
  Raw = acf(Return_data$volume, lag.max = lag_days*M, plot = F)$acf
)
plot_df <- melt(plot_df)
plot_df$Daily_lag <- seq(0,lag_days,length.out = nrow(plot_df)/2)

lower_bound <- round(min(plot_df$value) - IQR(plot_df$value),1)

pacf <- ggplot(plot_df) + geom_line(aes(x = Daily_lag,
                                y = value,
                                colour = variable,
                                linetype = variable)) +
  theme_classic() +
  scale_x_continuous(breaks = 0:lag_days) +
  labs(x = "Daily lag", y = "Autocorrelation", subtitle = sub_title) +
  scale_y_continuous(limits = c(lower_bound,1), breaks = seq(lower_bound,1,by = 0.1)) +
  scale_colour_brewer(palette = "Set1") + scale_linetype_manual(values = c(2,1))

ggsave(file.path(figure_dir, "6.acf_daily_filter.pdf"), pacf, width = 6, height = 3)

rm(plot_df)

}

ts_M <- t(matrix(seasonal_removed_ts,
                 nrow = t,
                 byrow = T))

feature_M$V_daily_stl = colSums(ts_M,na.rm = T)

ts_M[ts_M < quantile(ts_M,peak_param,na.rm = T)] = NA

feature_M$V_daily_stl_peak = colSums(ts_M, na.rm = T)

rm(ts_M)

if(long_memory_tests){

  m = floor(length(Return_data$volume)^0.65)
  test_raw_lst = local.W(Return_data$volume,m = m)
  test_filtered_lst = local.W((seasonal_removed_ts),m = m)
  M_LMT = data.frame(d = c(NA,NA,NA), s.e. = c(NA,NA,NA))
  rownames(M_LMT) = c("raw_volume","filtered_volume_day","filtered_volume_week")
  M_LMT[1,1] = test_raw_lst$d
  M_LMT[1,2] = test_raw_lst$s.e.
  M_LMT[2,1] = test_filtered_lst$d
  M_LMT[2,2] = test_filtered_lst$s.e.

}

#Time series decomposition with stl BY WEEK
N = 5 #Span of the week
volume_weekly_ts <- ts(Return_data$volume,
                       start = 0,
                       end = (t-elapse)/N,
                       frequency = M*N)


stl.object <- stl(log(volume_weekly_ts + 1),
                  s.window = M*N, #Apply smooth periodic estimation
                  robust = T) #Use robust loess estimation (outlier detection) and missing value interperlation

stl.decomposed <- as.data.frame(stl.object$time.series)

seasonal_removed_ts <- exp(stl.decomposed$remainder+stl.decomposed$trend) - 1

rm(stl.object,stl.decomposed)

if(acf){

plot_df <- data.frame(
  Filtered = acf(seasonal_removed_ts, lag.max = lag_days*M, plot = F)$acf,
  Raw = acf(Return_data$volume, lag.max = lag_days*M, plot = F)$acf
)

plot_df <- melt(plot_df)

plot_df$Daily_lag <- seq(0,lag_days,length.out = nrow(plot_df)/2)

lower_bound <- round(min(plot_df$value) - IQR(plot_df$value),1)


pacf <- ggplot(plot_df) + geom_line(aes(x = Daily_lag,
                                        y = value,
                                        colour = variable,
                                        linetype = variable)) +
  theme_classic() +
  scale_x_continuous(breaks = 0:lag_days) +
  labs(x = "Daily lag", y = "Autocorrelation", subtitle = sub_title) +
  scale_y_continuous(limits = c(lower_bound,1), breaks = seq(lower_bound,1,by = 0.1)) +
  scale_colour_brewer(palette = "Set1") + scale_linetype_manual(values = c(2,1))

ggsave(file.path(figure_dir, "7.acf_weekly_filter.pdf"), pacf, width = 6, height = 3)

rm(plot_df)

}

ts_M <- t(matrix(seasonal_removed_ts,
                 nrow = t,
                 byrow = T))

feature_M$V_weekly_stl = colSums(ts_M,na.rm = T)

ts_M[ts_M < quantile(ts_M,peak_param,na.rm = T)] = NA

feature_M$V_weekly_stl_peak = colSums(ts_M, na.rm = T)

rm(ts_M)

if(long_memory_tests){

  m = floor(length(Return_data$volume)^0.65)
  test_filtered_lst = local.W((seasonal_removed_ts),m = m)
  M_LMT[3,1] = test_filtered_lst$d
  M_LMT[3,2] = test_filtered_lst$s.e.

  write.csv(M_LMT,file.path(figure_dir, "8.Volume_LM_test.csv"))
}

return(feature_M)
}
ZhenWei10/Sherry-Chapter1 documentation built on Oct. 31, 2019, 1:48 a.m.