In dimagor/railActive: Dashboard for Predicting Delays for NJ Transit Trains

library(dplyr)
# library(njtPredict)
library(lubridate)
library(ggplot2)
library(tidyr)
library(knitr)
library(caret)
# devtools::load_all() #Toggle if not in the mood to rebuild after change
data("njt_features")

opts_chunk$set(echo = TRUE,
               message = FALSE,
               prompt = FALSE,
               warning = FALSE,
               cache = TRUE)

set.seed(42)
tr_data <- njt_features %>% 
  ungroup %>% 
  # Need to remove these fields for the complete.cases function (these aren't used as features) 
  select(-Actual_End_Time, -delay_length, -ttl_dep_arv_line)

tr_data <- tr_data[complete.cases(tr_data), ] #Removes only about 8K entries

# Register the # of cores to use
library(doMC)
registerDoMC(cores = 7)

# Specify a fixed control object for accurate comparison between samples
train_control <- trainControl(method = "repeatedcv",
                              number = 5,
                              repeats = 5,
                              ## Estimate class probabilities
                              classProbs = TRUE,
                              ## Evaluate performance using 
                              ## the following function
                              summaryFunction = twoClassSummary)

line_formula <- Class ~ Line +
  as.character(dep_hour) + 
  as.character(dep_wday) +
  as.character(dep_mon) + 
  ttl_line +
  Temp_F + 
  Visibility + 
  WindSpeed

runDelayRF <- function(delay_thresh){
  tr_data <- njt_features %>% 
    ungroup %>% 
    mutate(is_delayed = delay_length >= delay_thresh | is.na(delay_length),
           Line = gsub(" " , "_", Line)) %>%
    mutate(is_delayed = factor(is_delayed, levels = c(TRUE,FALSE), labels = c("Yes","No")))

  # Using downsampling per "Line" to account for the Unbalanced training data
  data_downsampled <- tr_data %>% group_by(Line) %>% do(downSample(., .$is_delayed))

  rf_fit <- train(line_formula,
                  data = data_downsampled,
                  tuneLength = 5,
                  trControl = train_control,
                  metric = "ROC", 
                  method = "rf",
                  ntree = 80)
  return(rf_fit)
}

intervals <- seq(6,16,2)
names(intervals) <- paste0("n_",intervals)
cutoff_compare <- lapply(intervals, tryCatch(runDelayRF,
                                             error = function(c) NULL))
resamp <- resamples(cutoff_compare)

resamps_df <- resamp$values %>% 
  gather(method_metric, value, -Resample) %>% 
  separate(method_metric, c("method","metric"),sep = "~") %>% 
  rowwise() %>% 
  mutate(method = as.numeric(gsub("n_","", method)))

resamps_df %>% 
  ggplot(aes(factor(method), value)) + 
  geom_boxplot() + 
  facet_wrap(~metric) + 
  theme_bw() +
  xlab("Delay Cutoff (Minutes)") +
  ylab("")
saveRDS(resamps_df, file = "data/delayed_cutoffcompare.rds")
# bwplot(resamp, layout = c(3, 1))