# Set the knitr chunk option `echo` to TRUE as the default for later chunks.
knitr::opts_chunk$set(echo = TRUE)

# Visualize R Squared

#Load data 
library(ggplot2)
library(reshape2)
# Listing of the OLS results folder. Not read again in the visible script;
# kept so the global remains defined.
R2_OLS_dir <- list.files("OLR_R2_all")

# Read one "<stock>_<interval>_OLR_adj_R2.csv" table per sampling interval and
# stock, melt it to long format keyed on column X, tag it with its stock and
# interval, and stack everything with a single rbind per level. This replaces
# growing the frame inside the loop from an all-NA seed row.
plot_df <- do.call(rbind, lapply(c(5, 30, 60, 300, 1800), function(i) {
  do.call(rbind, lapply(c("MSFT", "SPY", "XRX"), function(j) {
    csv_ij <- read.csv(
      file.path("OLR_R2_all", paste0(j, "_", i, "_OLR_adj_R2.csv"))
    )
    long_ij <- melt(csv_ij, id.vars = "X")
    # Flatten factor columns to plain vectors so the stacked columns have the
    # same types the seeded rbind() version produced (character, not factor).
    long_ij[] <- lapply(long_ij, function(col) {
      if (is.factor(col)) as.vector(col) else col
    })
    long_ij$stock <- j
    long_ij$time_interval <- i
    long_ij
  }))
}))

colnames(plot_df) <- c(
  "predictors", "RV_Filter", "Rsquared", "stock", "time_interval"
)
# Fix the interval order explicitly so it follows numeric size.
plot_df$time_interval <- factor(
  plot_df$time_interval,
  levels = c(5, 30, 60, 300, 1800)
)


# Listing of the random forest results folder (the variable name still says
# OLS). Not read again in the visible script; kept so the global stays defined.
R2_OLS_dir <- list.files("RandomForest_R2_all")

# Read one "<stock>_<interval>_RF_cv_R2.csv" table per sampling interval and
# stock, melt it to long format keyed on column X, tag it with its stock and
# interval, and stack everything with a single rbind per level. This replaces
# growing the frame inside the loop from an all-NA seed row.
plot_df2 <- do.call(rbind, lapply(c(5, 30, 60, 300, 1800), function(i) {
  do.call(rbind, lapply(c("MSFT", "SPY", "XRX"), function(j) {
    csv_ij <- read.csv(
      file.path("RandomForest_R2_all", paste0(j, "_", i, "_RF_cv_R2.csv"))
    )
    long_ij <- melt(csv_ij, id.vars = "X")
    # Flatten factor columns to plain vectors so the stacked columns have the
    # same types the seeded rbind() version produced (character, not factor).
    long_ij[] <- lapply(long_ij, function(col) {
      if (is.factor(col)) as.vector(col) else col
    })
    long_ij$stock <- j
    long_ij$time_interval <- i
    long_ij
  }))
}))

colnames(plot_df2) <- c(
  "predictors", "RV_Filter", "Rsquared", "stock", "time_interval"
)
# Fix the interval order explicitly so it follows numeric size.
plot_df2$time_interval <- factor(
  plot_df2$time_interval,
  levels = c(5, 30, 60, 300, 1800)
)

# Label each table with the model it came from, then stack the two tables.
plot_df[["regressor"]] <- "Linear regression"
plot_df2[["regressor"]] <- "Random Forest regression"
plot_df <- rbind(plot_df, plot_df2)

# Order the response-filter codes and give them display labels in one step.
plot_df[["RV_Filter"]] <- factor(
  plot_df[["RV_Filter"]],
  levels = c("RV_raw", "RV_fD", "RV_fC"),
  labels = c("no filter", "constant", "day")
)

# Dodged bars of R squared against sampling interval for the "add_V" rows,
# one bar per response filter, faceted by regressor (rows) and stock (columns).
p1 <- ggplot(
  data = plot_df[plot_df[["predictors"]] == "add_V", ],
  mapping = aes(x = time_interval, y = Rsquared, fill = RV_Filter)
) +
  geom_col(position = "dodge", width = 0.7, colour = 1, size = 0.3) +
  facet_grid(regressor ~ stock, scales = "free") +
  theme_bw() +
  labs(
    x = "Sampling Interval (Secs)",
    y = "R Squared",
    title = "R Squared of Default Model"
  )

ggsave("R_squared_RV_filter.pdf", plot = p1, width = 7.5, height = 4.3)
#Load data 
library(reshape2)
# Listing of the OLS results folder. Not read again in the visible script;
# kept so the global remains defined.
R2_OLS_dir <- list.files("OLR_R2_all")

# Read one "<stock>_<interval>_OLR_increase_in_R2.csv" table per sampling
# interval and stock, melt it to long format keyed on column X, tag it, and
# stack everything with a single rbind per level. This replaces growing the
# frame inside the loop from an all-NA seed row.
plot_df <- do.call(rbind, lapply(c(5, 30, 60, 300, 1800), function(i) {
  do.call(rbind, lapply(c("MSFT", "SPY", "XRX"), function(j) {
    csv_ij <- read.csv(
      file.path("OLR_R2_all", paste0(j, "_", i, "_OLR_increase_in_R2.csv"))
    )
    long_ij <- melt(csv_ij, id.vars = "X")
    # Flatten factor columns to plain vectors so the stacked columns have the
    # same types the seeded rbind() version produced (character, not factor).
    long_ij[] <- lapply(long_ij, function(col) {
      if (is.factor(col)) as.vector(col) else col
    })
    long_ij$stock <- j
    long_ij$time_interval <- i
    long_ij
  }))
}))

colnames(plot_df) <- c(
  "predictors", "response", "Rsquared_change", "stock", "time_interval"
)

# Facet labels such as "5 Sec", ordered by interval length.
plot_df$time_interval <- paste(plot_df$time_interval, "Sec")
plot_df$time_interval <- factor(
  plot_df$time_interval,
  levels = paste(c(5, 30, 60, 300, 1800), "Sec")
)

# Classify each predictor name by the filter tag it contains ("fd" or "fw");
# names with neither tag stay "Pred_raw". A name containing both ends up as
# "Pred_fw" because that assignment runs last.
plot_df$predictors_filter <- "Pred_raw"
plot_df$predictors_filter[grepl("fd", plot_df$predictors)] <- "Pred_fd"
plot_df$predictors_filter[grepl("fw", plot_df$predictors)] <- "Pred_fw"

# Strip the filter tag to recover the underlying predictor set.
plot_df$predictors_add <- gsub("fd|fw", "", plot_df$predictors)
plot_df$predictors_add <- factor(plot_df$predictors_add)

# The labels are applied positionally to the sorted factor levels, and the
# assignment errors unless there are exactly three levels.
# NOTE(review): confirm the sorted raw names correspond to trade number,
# volume, both, in that order.
levels(plot_df$predictors_add) <- c("+ Trade Number", "+ Volume", "+ Both")

# Keep only the rows where predictor filter and response filter are paired:
# fd with RV_fC, fw with RV_fD, raw with RV_raw.
indx_keep <- (plot_df$predictors_filter == "Pred_fd" & plot_df$response == "RV_fC") |
             (plot_df$predictors_filter == "Pred_fw" & plot_df$response == "RV_fD") |
             (plot_df$predictors_filter == "Pred_raw" & plot_df$response == "RV_raw")
plot_df <- plot_df[indx_keep, ]

# Legend label for each retained pairing.
plot_df$Filter <- "no filter"
plot_df$Filter[(plot_df$predictors_filter == "Pred_fw" & plot_df$response == "RV_fD")] <- "by week"
plot_df$Filter[(plot_df$predictors_filter == "Pred_fd" & plot_df$response == "RV_fC")] <- "by day"
plot_df$Filter <- factor(
  plot_df$Filter,
  levels = c("no filter", "by day", "by week")
)

# Dodged bars of the R-squared change for each added predictor set, one bar
# per Filter level, in a stock (rows) by sampling interval (columns) grid.
p2 <- ggplot(
  data = plot_df,
  mapping = aes(x = predictors_add, y = Rsquared_change, fill = Filter)
) +
  geom_col(position = "dodge", width = 0.7, colour = 0, size = 1) +
  scale_fill_brewer(palette = "Paired") +
  facet_grid(stock ~ time_interval, scales = "fixed") +
  theme_bw() +
  # Slant the x tick labels so the predictor names do not overlap.
  theme(axis.text.x = element_text(angle = 310, hjust = 0)) +
  labs(
    x = "New Predictors added",
    y = "Increase on adj R Squared",
    title = "Increase on Adjusted R Squared of OLR"
  )

ggsave("R_squared_increase_OLS.pdf", plot = p2, width = 7.5, height = 4.7)
#Load data 
library(reshape2)
# Listing of the random forest results folder (the variable name still says
# OLS). Not read again in the visible script; kept so the global stays defined.
R2_OLS_dir <- list.files("RandomForest_R2_all")

# Read one "<stock>_<interval>_RF_increase_in_R2.csv" table per sampling
# interval and stock, melt it to long format keyed on column X, tag it, and
# stack everything with a single rbind per level. This replaces growing the
# frame inside the loop from an all-NA seed row.
plot_df <- do.call(rbind, lapply(c(5, 30, 60, 300, 1800), function(i) {
  do.call(rbind, lapply(c("MSFT", "SPY", "XRX"), function(j) {
    csv_ij <- read.csv(
      file.path(
        "RandomForest_R2_all",
        paste0(j, "_", i, "_RF_increase_in_R2.csv")
      )
    )
    long_ij <- melt(csv_ij, id.vars = "X")
    # Flatten factor columns to plain vectors so the stacked columns have the
    # same types the seeded rbind() version produced (character, not factor).
    long_ij[] <- lapply(long_ij, function(col) {
      if (is.factor(col)) as.vector(col) else col
    })
    long_ij$stock <- j
    long_ij$time_interval <- i
    long_ij
  }))
}))

colnames(plot_df) <- c(
  "predictors", "response", "Rsquared_change", "stock", "time_interval"
)

# Facet labels such as "5 Sec", ordered by interval length.
plot_df$time_interval <- paste(plot_df$time_interval, "Sec")
plot_df$time_interval <- factor(
  plot_df$time_interval,
  levels = paste(c(5, 30, 60, 300, 1800), "Sec")
)

# Classify each predictor name by the filter tag it contains ("fd" or "fw");
# names with neither tag stay "Pred_raw". A name containing both ends up as
# "Pred_fw" because that assignment runs last.
plot_df$predictors_filter <- "Pred_raw"
plot_df$predictors_filter[grepl("fd", plot_df$predictors)] <- "Pred_fd"
plot_df$predictors_filter[grepl("fw", plot_df$predictors)] <- "Pred_fw"

# Strip the filter tag to recover the underlying predictor set.
plot_df$predictors_add <- gsub("fd|fw", "", plot_df$predictors)
plot_df$predictors_add <- factor(plot_df$predictors_add)

# The labels are applied positionally to the sorted factor levels, and the
# assignment errors unless there are exactly three levels.
# NOTE(review): confirm the sorted raw names correspond to trade number,
# volume, both, in that order.
levels(plot_df$predictors_add) <- c("+ Trade Number", "+ Volume", "+ Both")

# Keep only the rows where predictor filter and response filter are paired:
# fd with RV_fC, fw with RV_fD, raw with RV_raw.
indx_keep <- (plot_df$predictors_filter == "Pred_fd" & plot_df$response == "RV_fC") |
             (plot_df$predictors_filter == "Pred_fw" & plot_df$response == "RV_fD") |
             (plot_df$predictors_filter == "Pred_raw" & plot_df$response == "RV_raw")
plot_df <- plot_df[indx_keep, ]

# Legend label for each retained pairing.
plot_df$Filter <- "no filter"
plot_df$Filter[(plot_df$predictors_filter == "Pred_fw" & plot_df$response == "RV_fD")] <- "by week"
plot_df$Filter[(plot_df$predictors_filter == "Pred_fd" & plot_df$response == "RV_fC")] <- "by day"
plot_df$Filter <- factor(
  plot_df$Filter,
  levels = c("no filter", "by day", "by week")
)

# Dodged bars of the R-squared change for each added predictor set, one bar
# per Filter level, in a stock (rows) by sampling interval (columns) grid.
p3 <- ggplot(
  data = plot_df,
  mapping = aes(x = predictors_add, y = Rsquared_change, fill = Filter)
) +
  geom_col(position = "dodge", width = 0.7, colour = 0, size = 1) +
  scale_fill_brewer(palette = "Paired") +
  facet_grid(stock ~ time_interval, scales = "fixed") +
  theme_bw() +
  # Slant the x tick labels so the predictor names do not overlap.
  theme(axis.text.x = element_text(angle = 310, hjust = 0)) +
  labs(
    x = "New Predictors added",
    y = "Increase on CV R Squared",
    title = "Increase on R Squared of Random Forest with 30 folds CV"
  )

ggsave("R_squared_increase_RF.pdf", plot = p3, width = 7.5, height = 4.7)
#Load data 
library(reshape2)
# Listing of the OLS results folder. Not read again in the visible script;
# kept so the global remains defined.
R2_OLS_dir <- list.files("OLR_R2_all")

# Read one "<stock>_<interval>_OLR_increase_in_R2.csv" table per sampling
# interval and stock, melt it to long format keyed on column X, tag it, and
# stack everything with a single rbind per level. This replaces growing the
# frame inside the loop from an all-NA seed row.
plot_df <- do.call(rbind, lapply(c(5, 30, 60, 300, 1800), function(i) {
  do.call(rbind, lapply(c("MSFT", "SPY", "XRX"), function(j) {
    csv_ij <- read.csv(
      file.path("OLR_R2_all", paste0(j, "_", i, "_OLR_increase_in_R2.csv"))
    )
    long_ij <- melt(csv_ij, id.vars = "X")
    # Flatten factor columns to plain vectors so the stacked columns have the
    # same types the seeded rbind() version produced (character, not factor).
    long_ij[] <- lapply(long_ij, function(col) {
      if (is.factor(col)) as.vector(col) else col
    })
    long_ij$stock <- j
    long_ij$time_interval <- i
    long_ij
  }))
}))

colnames(plot_df) <- c(
  "predictors", "response", "Rsquared_change", "stock", "time_interval"
)

# Facet labels such as "5 Sec", ordered by interval length.
plot_df$time_interval <- paste(plot_df$time_interval, "Sec")
plot_df$time_interval <- factor(
  plot_df$time_interval,
  levels = paste(c(5, 30, 60, 300, 1800), "Sec")
)

# Classify each predictor name by the filter tag it contains ("fd" or "fw");
# names with neither tag stay "Pred_raw". A name containing both ends up as
# "Pred_fw" because that assignment runs last.
plot_df$predictors_filter <- "Pred_raw"
plot_df$predictors_filter[grepl("fd", plot_df$predictors)] <- "Pred_fd"
plot_df$predictors_filter[grepl("fw", plot_df$predictors)] <- "Pred_fw"

# Strip the filter tag to recover the underlying predictor set.
plot_df$predictors_add <- gsub("fd|fw", "", plot_df$predictors)
plot_df$predictors_add <- factor(plot_df$predictors_add)

# The labels are applied positionally to the sorted factor levels, and the
# assignment errors unless there are exactly three levels.
# NOTE(review): confirm the sorted raw names correspond to trade number,
# volume, both, in that order.
levels(plot_df$predictors_add) <- c("+ Trade Number", "+ Volume", "+ Both")

# Hold the response fixed at "RV_fD" so only the predictor filter varies.
indx_keep <- (plot_df$response == "RV_fD")
plot_df <- plot_df[indx_keep, ]

# Legend label for the predictor filter of each remaining row.
plot_df$Pred_Filter <- "no filter"
plot_df$Pred_Filter[(plot_df$predictors_filter == "Pred_fw")] <- "by week"
plot_df$Pred_Filter[(plot_df$predictors_filter == "Pred_fd")] <- "by day"
plot_df$Pred_Filter <- factor(
  plot_df$Pred_Filter,
  levels = c("no filter", "by day", "by week")
)

# Dodged bars of the R-squared change for each added predictor set, one bar
# per Pred_Filter level, in a stock (rows) by sampling interval (columns) grid.
# This reassigns p2.
p2 <- ggplot(
  data = plot_df,
  mapping = aes(x = predictors_add, y = Rsquared_change, fill = Pred_Filter)
) +
  geom_col(position = "dodge", width = 0.7, colour = 0, size = 1) +
  scale_fill_brewer(palette = "Paired") +
  facet_grid(stock ~ time_interval, scales = "fixed") +
  theme_bw() +
  # Slant the x tick labels so the predictor names do not overlap.
  theme(axis.text.x = element_text(angle = 310, hjust = 0)) +
  labs(
    x = "New Predictors added",
    y = "Increase on adj R Squared",
    title = "Increase on Adjusted R Squared of OLR"
  )

ggsave("R_squared_increase_OLS_2.pdf", plot = p2, width = 7.5, height = 4.7)
#Load data 
library(reshape2)
# Listing of the random forest results folder (the variable name still says
# OLS). Not read again in the visible script; kept so the global stays defined.
R2_OLS_dir <- list.files("RandomForest_R2_all")

# Read one "<stock>_<interval>_RF_increase_in_R2.csv" table per sampling
# interval and stock, melt it to long format keyed on column X, tag it, and
# stack everything with a single rbind per level. This replaces growing the
# frame inside the loop from an all-NA seed row.
plot_df <- do.call(rbind, lapply(c(5, 30, 60, 300, 1800), function(i) {
  do.call(rbind, lapply(c("MSFT", "SPY", "XRX"), function(j) {
    csv_ij <- read.csv(
      file.path(
        "RandomForest_R2_all",
        paste0(j, "_", i, "_RF_increase_in_R2.csv")
      )
    )
    long_ij <- melt(csv_ij, id.vars = "X")
    # Flatten factor columns to plain vectors so the stacked columns have the
    # same types the seeded rbind() version produced (character, not factor).
    long_ij[] <- lapply(long_ij, function(col) {
      if (is.factor(col)) as.vector(col) else col
    })
    long_ij$stock <- j
    long_ij$time_interval <- i
    long_ij
  }))
}))

colnames(plot_df) <- c(
  "predictors", "response", "Rsquared_change", "stock", "time_interval"
)

# Facet labels such as "5 Sec", ordered by interval length.
plot_df$time_interval <- paste(plot_df$time_interval, "Sec")
plot_df$time_interval <- factor(
  plot_df$time_interval,
  levels = paste(c(5, 30, 60, 300, 1800), "Sec")
)

# Classify each predictor name by the filter tag it contains ("fd" or "fw");
# names with neither tag stay "Pred_raw". A name containing both ends up as
# "Pred_fw" because that assignment runs last.
plot_df$predictors_filter <- "Pred_raw"
plot_df$predictors_filter[grepl("fd", plot_df$predictors)] <- "Pred_fd"
plot_df$predictors_filter[grepl("fw", plot_df$predictors)] <- "Pred_fw"

# Strip the filter tag to recover the underlying predictor set.
plot_df$predictors_add <- gsub("fd|fw", "", plot_df$predictors)
plot_df$predictors_add <- factor(plot_df$predictors_add)

# The labels are applied positionally to the sorted factor levels, and the
# assignment errors unless there are exactly three levels.
# NOTE(review): confirm the sorted raw names correspond to trade number,
# volume, both, in that order.
levels(plot_df$predictors_add) <- c("+ Trade Number", "+ Volume", "+ Both")

# Hold the response fixed at "RV_fD" so only the predictor filter varies.
indx_keep <- (plot_df$response == "RV_fD")
plot_df <- plot_df[indx_keep, ]

# Legend label for the predictor filter of each remaining row.
plot_df$Pred_Filter <- "no filter"
plot_df$Pred_Filter[(plot_df$predictors_filter == "Pred_fw")] <- "by week"
plot_df$Pred_Filter[(plot_df$predictors_filter == "Pred_fd")] <- "by day"
plot_df$Pred_Filter <- factor(
  plot_df$Pred_Filter,
  levels = c("no filter", "by day", "by week")
)

# Dodged bars of the R-squared change for each added predictor set, one bar
# per Pred_Filter level, in a stock (rows) by sampling interval (columns) grid.
# This reassigns p3.
p3 <- ggplot(
  data = plot_df,
  mapping = aes(x = predictors_add, y = Rsquared_change, fill = Pred_Filter)
) +
  geom_col(position = "dodge", width = 0.7, colour = 0, size = 1) +
  scale_fill_brewer(palette = "Paired") +
  facet_grid(stock ~ time_interval, scales = "fixed") +
  theme_bw() +
  # Slant the x tick labels so the predictor names do not overlap.
  theme(axis.text.x = element_text(angle = 310, hjust = 0)) +
  labs(
    x = "New Predictors added",
    y = "Increase on CV R Squared",
    title = "Increase on R Squared of Random Forest with 30 folds CV"
  )

ggsave("R_squared_increase_RF_2.pdf", plot = p3, width = 7.5, height = 4.7)


# ZhenWei10/Sherry-Chapter1 documentation built on Oct. 31, 2019, 1:48 a.m.