R/Decompose_TradeNum.R

Defines functions Decompose_TradeNum

Documented in Decompose_TradeNum

#'@title STL Decompose Data of trading number.
#'
#'@description A function that decompose trade number data automatically.
#'
#'@details This function will generate trade number data automatically, and save the plots for transformation and STL Decomposition.
#'
#'Decomposition for both intra day periodicity and intra week periodicity will be applied.
#'
#'@param All_dta_dir The directory for the CSV file containing the raw volume data.
#'@param lag_days The maximum lagging days of acf plot, default = 5.
#'@param long_memory_tests Wheather perform long memory tests or not, default = TRUE.
#'@param sec Seconds between intervals, default = 1800.
#'@param peak_param The threashold for peak values quantiles, default = 0.995.
#'@param sub_title The subtitle of the figures, default = "".
#'@param acf Wheather to save acf plots or not, default = TRUE.
#'@param mixreg Wheather to perform analysis of mixture regression or not, default = FALSE.
#'@param splines Should the mixture regression being linear or spline expanded, default = FALSE.
#'
#'@return figures and a table will be saved under a folder.
#'
#'@examples
#'
#'\dontrun{
#'Return_data <- read.dta("MSFT 1800 Sec Summary Return Data.dta")
#'SherryChapter1::Decompose_TradeNum(Return_data)
#'}
#'
#'@import matrixStats
#'@import ggplot2
#'@import magrittr
#'@import reshape2
#'@import MASS
#'@import foreign
#'@import flexmix
#'@import splines
#'
#'@export
Decompose_TradeNum <- function(Return_data,
                               lag_days = 10,
                               long_memory_tests = TRUE,
                               sec = 1800,
                               peak_param = 0.985,
                               sub_title = "",
                               acf = TRUE,
                               mixreg = FALSE,
                               splines = FALSE) {

  figure_dir = "TradeNum_decompose_results"

  if(!file.exists(figure_dir)) dir.create(figure_dir)

  Return_data$tickcount[is.na(Return_data$tickcount)] = 0

  #Convert the num_trades data into a matrix, with columns correspond to days.
  all_days_lst <- split(Return_data$tickcount, Return_data$date)
  TradeNum_M <- t(matrix(unlist(all_days_lst),
                       nrow = length(unique(Return_data$date)),
                       byrow = T))
  rm(all_days_lst)

  colnames(TradeNum_M) = unique(Return_data$date)

  ts_M <- TradeNum_M

  feature_M <- data.frame(TN_raw = colSums(ts_M, na.rm = T))

  ts_M[ts_M < quantile(ts_M, peak_param, na.rm = T)] = NA

  feature_M$TN_raw_peak = colSums(ts_M, na.rm = T)

  rm(ts_M)

  #Define the quantities in the time series.
  M = nrow(TradeNum_M) #within day sampling frequency
  t = ncol(TradeNum_M) #total number of days
  elapse = 1/M #normalized time elapse between observations

  plot_df <- data.frame(value = c(rowMeans(TradeNum_M),
                                  rowVars(TradeNum_M)),
                        interval = rep(seq_len(M),2),
                        Stats =  rep(c(paste0("Avg(TradeNum)"),
                                       paste0("Var(TradeNum)")), each = M))
  p1 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value),colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
    labs(x = paste0(sec, " sec interval"), title = paste0("Original Intraday Trade Number"), subtitle = sub_title)

  ggsave(file.path(figure_dir,"1.Intraday_TradeNum_Original.pdf"), p1, width = 5.5, height = 2.8)

  plot_df <- data.frame(value = c(rowMeans(log(TradeNum_M+1)),
                                  rowVars(log(TradeNum_M+1))),
                        interval = rep(seq_len(M),2),
                        Stats =  rep(c(paste0("Avg(ln(TradeNum+1))"),
                                       paste0("Var(ln(TradeNum+1))")), each = M))
  p2 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value),colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
    labs(x = paste0(sec, " sec interval"), title = paste0("Transformed Intraday Trade Number"), subtitle = sub_title)

  ggsave(file.path(figure_dir,"2.Intraday_TradeNum_log.pdf"), p2, width = 5.5, height = 2.8)

  #Plot the periodicity of mean absolute return.

  plot_df <- data.frame(value = c(rowMeans(log(TradeNum_M+1)),
                                  rowMeans(matrix(abs(Return_data$r),
                                                  nrow = M,
                                                  byrow = F))),
                        interval = rep(seq_len(M),2),
                        Stats =  rep(c(paste0("Avg(ln(TradeNum+1))"),
                                       paste0("Avg(|Return|)")), each = M))
  p3 <- ggplot(plot_df) + geom_line(aes(x = interval, y = value),colour = "dark blue") + facet_wrap(~Stats, scales = "free") + theme_classic() +
    labs(x = paste0(sec, " sec interval"), title = paste0("Periodicity of Average Absolute Return and Trade Number"), subtitle = sub_title)

  ggsave(file.path(figure_dir,"3.Intraday_Absolute_Return_log.pdf"), p3, width = 5.5, height = 2.8)

  plot_df2 <- data.frame(y = plot_df$value[plot_df$Stats == "Avg(|Return|)"],
                         x = plot_df$value[plot_df$Stats == "Avg(ln(TradeNum+1))"])

  Rsq <- paste0("R.squared = ", round(summary(lm(y~x,data = plot_df2))$r.squared,3))

  p4 <- ggplot(plot_df2, aes(x = x, y = y)) +
    geom_point(colour = "dark blue", alpha = .5) +
    geom_smooth(se = F, method = "lm") +
    geom_label(data = data.frame(label = Rsq,
                                 x = min(plot_df2$x)+(max(plot_df2$x)-min(plot_df2$x))*0.3,
                                 y = min(plot_df2$y)+(max(plot_df2$y)-min(plot_df2$y))*0.8),
               aes(label = label,
                   x = x,
                   y = y)) +
    labs(y = "Avg(|Return|)", x = "Avg(ln(TradeNum+1))", title = "Correlation Plot", subtitle = sub_title) +
    theme_classic()

  ggsave(file.path(figure_dir,"4.Correlation_plot.pdf"), p4, width = 3, height = 2.8)

  if(mixreg){
  set.seed(1802)

  if(splines){
  knots <- quantile(plot_df2$x, probs = c(0.025, 0.125, 0.25, 0.375, 0.5,  0.75, 0.975)) + c(+0.01, 0, 0, 0, 0, 0, -0.01)
  ex1 <- flexmix(y ~ ns(x, knots = knots), data = plot_df2, k = 2, control = list(verb = 5, iter = 100))
  } else {
  ex1 <- flexmix(y ~ x,  data = plot_df2, k = 2, control = list(verb = 5, iter = 100))
  }

  fitted_curves <- predict(ex1,data = plot_df2$x)
  larger_class <- which.max(c(quantile(fitted_curves$Comp.1,.95),
                              quantile(fitted_curves$Comp.2,.95)))
  posterior_larger_class <- ex1@posterior$scaled[,larger_class]

  plot_df3 <- plot_df2
  plot_df3$posterior_c1 = posterior_larger_class
  plot_df3$fity_c1 = fitted_curves[[larger_class]]
  plot_df3$fity_c2 = fitted_curves[[-larger_class]]
  p5 <- ggplot(plot_df3, aes(x = x)) +
    geom_point(aes(colour = posterior_c1, y = y), alpha = .5) +
    geom_line(aes(y = fity_c1), alpha = .8, colour = "#4daf4a") +
    geom_line(aes(y = fity_c2), alpha = .8, colour = "#e41a1c") +
    labs(y = "Avg(|Return|)", x = "Avg(ln(TradeNum+1))", title = "Mixture of 2 Linear Regression Models", subtitle = sub_title) +
    theme_classic()

  ggsave(file.path(figure_dir,"5.Posterior_dot.pdf"),p5,width = 4, height = 2.8)

  rescale_i <- function(x) (x - mean(x))/sd(x)

  plot_df4 <- data.frame(value = c(rescale_i(rowMeans(log(TradeNum_M+1))),
                                   rescale_i(rowMeans(matrix(abs(Return_data$r),
                                                             nrow = M,
                                                             byrow = F))),
                                   posterior_larger_class),
                         interval = rep(seq_len(M),3),
                         Stats =  rep(c(paste0("Avg(ln(TradeNum+1)) z score"),
                                        paste0("Avg(|Return|) z score" ),
                                        "Posterior Prob c1"), each = M),
                         Facet_var = rep(c("Z scores of average statistics",
                                           "Z scores of average statistics",
                                           "Posterior Probability C1"), each = M))

  plot_df4$Facet_var = factor(plot_df4$Facet_var, levels = c("Z scores of average statistics","Posterior Probability C1"))

  p6 <- ggplot(plot_df4) + geom_line(aes(x = interval, y = value, colour = Stats), size = 0.3) + theme_bw() +
    facet_wrap(~Facet_var,nrow = 2,scales = "free") + labs(x = paste0(sec, " sec interval"), title = paste0("Posterior Probability of the Mixing Component 1"), subtitle = sub_title) + scale_colour_manual(values = c("#377eb8","#4daf4a","#e41a1c"))

  ggsave(file.path(figure_dir,"6.Posterior_day.pdf"),p6,width = 5.5, height = 4)

  rm(plot_df,plot_df2,plot_df3,plot_df4)

}else{
  rm(plot_df,plot_df2)
}

  #Time series decomposition with stl BY DAY
  #Plot the ACF plot after removal of periodicity

  TradeNum_daily_ts <- ts(log(Return_data$tickcount+1),
                        start = 0,
                        end = t-elapse,
                        frequency = M)

  stl.object <- stl(TradeNum_daily_ts,
                    s.window = M, #Apply daily smooth periodic estimation
                    robust = T) #Use robust loess estimation + missing value interperlation

  stl.decomposed <- as.data.frame(stl.object$time.series)

  seasonal_removed_ts <- exp(stl.decomposed$remainder + stl.decomposed$trend) - 1

  rm(stl.object)

  #Plot the seasonal component against the original log means
  plot_df <- data.frame(value = c(rowMeans(log(TradeNum_M+1)) - mean(rowMeans(log(TradeNum_M+1))),
                                  stl.decomposed$seasonal[1:M]),
                        interval = rep(seq_len(M),2),
                        Stats =  rep(c(paste0("Centered Avg(ln(Volume+1))"),
                                       paste0("STL s component estimates")), each = M))
  p5 <- ggplot(plot_df) +
    geom_line(aes(x = interval, y = value, color = Stats, linetype = Stats), size = 0.2, alpha = 0.8) +
    theme_classic() +
    labs(x = paste0(sec, " sec interval"),
         title = paste0("STL Seasonal Component Estimates"),
         subtitle = sub_title) + scale_color_manual(values = c("dark blue","red"))

  ggsave(file.path(figure_dir,"7.STL_Component_Daily.pdf"), p5, width = 5.5, height = 2.8)

  if(acf){
  #Plot the ACF plot after removal of periodicity
  TradeNum_daily_ts[is.na(TradeNum_daily_ts)] = 0

  plot_df <- data.frame(
    Filtered = acf(seasonal_removed_ts, lag.max = lag_days*M, plot = F)$acf,
    Raw = acf(Return_data$tickcount, lag.max = lag_days*M, plot = F)$acf
  )
  plot_df <- melt(plot_df)
  plot_df$Daily_lag <- seq(0,lag_days,length.out = nrow(plot_df)/2)

  lower_bound <- round(min(plot_df$value) - IQR(plot_df$value),1)

  pacf <- ggplot(plot_df) + geom_line(aes(x = Daily_lag,
                                          y = value,
                                          colour = variable,
                                          linetype = variable)) +
    theme_classic() +
    scale_x_continuous(breaks = 0:lag_days) +
    labs(x = "Daily lag", y = "Autocorrelation",title = sub_title) +
    scale_y_continuous(limits = c(lower_bound,1), breaks = seq(lower_bound,1,by = 0.1)) +
    scale_colour_brewer(palette = "Set1") + scale_linetype_manual(values = c(2,1))

  ggsave(file.path(figure_dir, "8.acf_daily_filter.pdf"), pacf, width = 6, height = 3)

  rm(plot_df)

}

  ts_M <- t(matrix(seasonal_removed_ts,
                   nrow = t,
                   byrow = T))

  feature_M$TN_daily_stl = colSums(ts_M,na.rm = T)

  ts_M[ts_M < quantile(ts_M,peak_param,na.rm = T)] = NA

  feature_M$TN_daily_stl_peak = colSums(ts_M, na.rm = T)

  rm(ts_M)

  if(long_memory_tests){

    m = floor(length(Return_data$tickcount)^0.65)
    test_raw_lst = local.W(Return_data$tickcount,m = m)
    test_filtered_lst = local.W((seasonal_removed_ts),m = m)
    M_LMT = data.frame(d = c(NA,NA,NA), s.e. = c(NA,NA,NA))
    rownames(M_LMT) = c("raw_TradeNum","filtered_TradeNum_day","filtered_TradeNum_week")
    M_LMT[1,1] = test_raw_lst$d
    M_LMT[1,2] = test_raw_lst$s.e.
    M_LMT[2,1] = test_filtered_lst$d
    M_LMT[2,2] = test_filtered_lst$s.e.

  }
  #Time series decomposition with stl BY WEEK
  N = 5 #Span of the week
  TradeNum_weekly_ts <- ts(log(Return_data$tickcount+1),
                         start = 0,
                         end = (t-elapse)/N,
                         frequency = M*N)


  stl.object <- stl(TradeNum_weekly_ts,
                    s.window = M*N, #Apply smooth periodic estimation
                    robust = T)

  stl.decomposed <- as.data.frame(stl.object$time.series)

  seasonal_removed_ts <- exp(stl.decomposed$remainder + stl.decomposed$trend) - 1

  rm(stl.object,stl.decomposed)

  TradeNum_weekly_ts[is.na(TradeNum_weekly_ts)] = 0

  if(acf){

  plot_df <- data.frame(
    Filtered = acf(seasonal_removed_ts, lag.max = lag_days*M, plot = F)$acf,
    Raw = acf(Return_data$tickcount, lag.max = lag_days*M, plot = F)$acf
  )

  plot_df <- melt(plot_df)

  plot_df$Daily_lag <- seq(0,lag_days,length.out = nrow(plot_df)/2)

  lower_bound <- round(min(plot_df$value) - IQR(plot_df$value),1)

  pacf <- ggplot(plot_df) + geom_line(aes(x = Daily_lag,
                                          y = value,
                                          colour = variable,
                                          linetype = variable)) +
    theme_classic() +
    scale_x_continuous(breaks = 0:lag_days) +
    labs(x = "Daily lag", y = "Autocorrelation",subtitle = sub_title) +
    scale_y_continuous(limits = c(lower_bound,1), breaks = seq(lower_bound,1,by = 0.1)) +
    scale_colour_brewer(palette = "Set1") + scale_linetype_manual(values = c(2,1))

  ggsave(file.path(figure_dir, "9.acf_weekly_filter.pdf"), pacf, width = 6, height = 3)

  rm(plot_df)

  }

  ts_M <- t(matrix(seasonal_removed_ts,
                   nrow = t,
                   byrow = T))

  feature_M$TN_weekly_stl = colSums(ts_M,na.rm = T)

  ts_M[ts_M < quantile(ts_M,peak_param,na.rm = T)] = NA

  feature_M$TN_weekly_stl_peak = colSums(ts_M, na.rm = T)

  rm(ts_M)

  if(long_memory_tests){

    m = floor(length(Return_data$tickcount)^0.65)
    test_filtered_lst = local.W((seasonal_removed_ts),m = m)
    M_LMT[3,1] = test_filtered_lst$d
    M_LMT[3,2] = test_filtered_lst$s.e.

    write.csv(M_LMT,file.path(figure_dir,"19.TradeNum_LM_test.csv"))
  }

  return(feature_M)
}
ZhenWei10/Sherry-Chapter1 documentation built on Oct. 31, 2019, 1:48 a.m.