library(dplyr)
# library(njtPredict)
library(lubridate)
library(ggplot2)
library(tidyr)
library(knitr)
library(caret)
# devtools::load_all() #Toggle if not in the mood to rebuild after change
data("njt_features")

opts_chunk$set(echo = TRUE,
               message = FALSE,
               prompt = FALSE,
               warning = FALSE,
               cache = TRUE)
set.seed(42)
tr_data <- njt_features %>% 
  ungroup() %>% 
  mutate(Line = gsub(" ", "_", Line),
         is_delayed = factor(is_delayed, levels = c(TRUE, FALSE), labels = c("Yes", "No"))) %>% 
  # Drop fields that aren't used as features so complete.cases() doesn't filter on them
  select(-Actual_End_Time, -delay_length, -ttl_dep_arv_line)

tr_data <- tr_data[complete.cases(tr_data), ] # Removes only about 8K entries

# Downsample within each "Line" to correct for the unbalanced training data.
# caret::downSample() appends the balanced outcome as a new "Class" column,
# which is the response the models below predict.
data_downsampled <- tr_data %>% group_by(Line) %>% do(downSample(., .$is_delayed))
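
# Sanity check (assuming downSample's default yname = "Class"): each line
# should now contain an equal number of "Yes" and "No" departures
data_downsampled %>% count(Line, Class) %>% spread(Class, n)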

Model Selection

# Register the # of cores to use
library(doMC)
registerDoMC(cores = 7)

# Specify a fixed control object for accurate comparison between samples
train_control <- trainControl(method = "repeatedcv",
                              number = 5,
                              repeats = 10,
                              ## Estimate class probabilities
                              classProbs = TRUE,
                              ## Evaluate performance using 
                              ## the following function
                              summaryFunction = twoClassSummary)

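# Wrapping the time fields in as.character() below makes caret treat them as
# categorical (dummy-coded) predictors rather than as numeric ones.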
line_formula <- Class ~ Line +
  as.character(dep_hour) + 
  as.character(dep_wday) +
  as.character(dep_mon) + 
  ttl_line +
  Temp_F + 
  Visibility + 
  WindSpeed

# Withhold 20% of the data for an independent evaluation
in_train <- createDataPartition(data_downsampled$Class, p = .80, list = FALSE)
training <- data_downsampled[in_train, ]
testing <- data_downsampled[-in_train, ]
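
# Quick check that the 80/20 split preserved the balanced classes
table(training$Class)
table(testing$Class)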


save(in_train, training, testing, train_control, file = "data/training_data.rda")
# Logistic Regression
logit_fit <- train(line_formula,
                   data = training,
                   tuneLength = 10, # No effect here: glm has no tuning parameters
                   trControl = train_control,
                   method = "glm",
                   family = "binomial",
                   metric = "ROC")
save(logit_fit, file = "data/logit_fit.rda")

# Regularized Logistic Regression
glmnet_fit <- train(line_formula,
                    data = training,
                    tuneLength = 10,
                    trControl = train_control,
                    method = "glmnet",
                    family = "binomial",
                    metric = "ROC")
save(glmnet_fit, file = "data/glmnet_fit.rda")

# Random Forest
rf_fit <- train(line_formula,
                data = training,
                tuneLength = 10,
                trControl = train_control,
                metric = "ROC", 
                method = "rf",
                ntree = 80)
save(rf_fit, file = "data/rf_fit.rda")

# Gradient Boosted Model
gbm_fit <- train(line_formula,
                 data = training,
                 tuneLength = 10,
                 trControl = train_control,
                 metric = "ROC", 
                 method = "gbm")
save(gbm_fit, file = "data/gbm_fit.rda")
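
# Optional: inspect the gbm tuning profile over the candidate grid
# (caret provides a ggplot method for train objects)
ggplot(gbm_fit)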

# K-nearest neighbors
knn_fit <- train(line_formula,
                 data = training,
                 tuneLength = 10,
                 trControl = train_control,
                 metric = "ROC", 
                 method = "knn",
                 preProc = c("center", "scale"))
save(knn_fit, file =  "data/knn_fit.rda")

# Naive Bayes
nb_fit <- train(line_formula,
                data = training,
                tuneLength = 10,
                trControl = train_control,
                metric = "ROC", 
                method = "nb",
                preProc = c("center", "scale"))
save(nb_fit, file = "data/nb_fit.rda")


## SVM
# Too slow, not run
# svmlin_fit <- train(line_formula,
#                 data = training,
#                 tuneLength = 5,
#                 trControl = train_control,
#                 metric = "ROC", 
#                 method = "svmLinear",
#                 preProc = c("center", "scale"))
# save(svmlin_fit, file = "data/svmlin_fit.rda")

Model Comparison

# Reload the fitted models saved above if starting from a fresh session
data(list = c("logit_fit", "glmnet_fit", "rf_fit", "gbm_fit", "knn_fit", "nb_fit"))
resamps <- resamples(list(
  "Logistic Regression" = logit_fit, 
  "Regularized Regression" = glmnet_fit, 
  "Random Forest" = rf_fit, 
  "K-Nearest Neighbors" = knn_fit,
  "Naive Bayes" = nb_fit))

# Pairwise differences in the resampled ROC/Sens/Spec between models
diff(resamps) %>% summary()
bwplot(resamps, layout = c(3, 1))
# Convert resamples values to a data frame for use in ggplot
resamps_df <- resamps$values %>% 
  gather(method_metric, value, -Resample) %>% 
  separate(method_metric, c("method", "metric"), sep = "~")

resamps_df %>% 
  ggplot(aes(method, value)) + 
  geom_boxplot() + 
  facet_wrap(~metric, scales = "free_y") + 
  theme_bw()

Testing the Model

# Class predictions for the confusion matrix; class probabilities for the ROC curve
model_classes <- predict(rf_fit, testing)
model_predictions <- predict(rf_fit, testing, type = "prob")
confusionMatrix(model_classes, testing$Class)
library(pROC)
roc_test <- testing
roc_test$pred_prob <- model_predictions$Yes  
roc_test <- roc_test %>% 
  arrange(desc(pred_prob)) %>% 
  mutate(Class = ifelse(Class == "Yes",1,0))

roc_obj <- roc(roc_test$Class, roc_test$pred_prob)
roc_df <- data.frame(Sensitivity = roc_obj$sensitivities, Specificity = roc_obj$specificities)
roc_df %>% 
  ggplot(aes(1-Specificity, Sensitivity)) + 
  geom_line(color = "red2", cex = 1) + 
  geom_abline(intercept = 0, slope = 1, color = "black", linetype = 2) + 
  theme_classic() + scale_x_continuous(expand = c(0, 0)) + 
  scale_y_continuous(expand = c(0, 0))
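
# Report the area under the curve for the held-out test set
auc(roc_obj)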

Save the Model

# This version of caret seems to be missing the trim() method for train objects,
# so it is entered manually here to decrease the size of the model on disk
# (for the dashboard).
# https://github.com/topepo/caret/blob/master/pkg/caret/R/trim.R

trim <- function(object, ...) UseMethod("trim")
trim.train <- function(object, ...) {
  # Drop the heavyweight components of the train object
  removals <- c("results", "pred", "bestTune", "call", "dots",
                "metric", "trainingData", "resample", "resampledCM",
                "perfNames", "maximize", "times")
  for (i in removals) {
    if (i %in% names(object)) object[i] <- NULL
  }
  # Drop control settings that aren't needed for prediction
  c_removals <- c('method', 'number', 'repeats', 'p', 'initialWindow', 
                  'horizon', 'fixedWindow', 'verboseIter', 'returnData', 
                  'returnResamp', 'savePredictions', 'summaryFunction', 
                  'selectionFunction', 'index', 'indexOut', 'timingSamps', 
                  'trim', 'yLimits')
  for (i in c_removals) {
    if (i %in% names(object$control)) object$control[i] <- NULL
  }
  if (!is.null(object$modelInfo$trim))
    object$finalModel <- object$modelInfo$trim(object$finalModel)
  object
}

# Refit with method = "none" on the chosen tuning values so no resampling
# artifacts are stored, then trim the object manually below
modelfit <- train(line_formula, method = "rf", data = training, 
                  trControl = trainControl(method = "none", classProbs = FALSE),
                  tuneGrid = data.frame(mtry = 14), ntree = 80)
modelfit <- trim(modelfit)
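
# Rough size check; the trimmed object should be substantially smaller on disk
print(object.size(modelfit), units = "Mb")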

saveRDS(modelfit, file = "railactive/data/modelfit.rds")

mf_predict <- predict(modelfit, testing)
confusionMatrix(mf_predict, testing$Class)

Classifier per line

modelfit_byline <- data_downsampled %>% group_by(Line) %>%
  do(fit = train(Class ~ 
                   as.character(dep_hour) + 
                   as.character(dep_wday) +
                   as.character(dep_mon) + 
                   ttl_line +
                   Temp_F + 
                   Visibility + 
                   WindSpeed,
                 data = ., method = "rf", 
                 trControl = trainControl(method = "cv",
                                          number = 5,
                                          savePredictions = TRUE,
                                          classProbs = TRUE),
                 ntree = 80))
save(modelfit_byline, file = "../../DelayPredictor/modelfit_byline.rda")
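
# Example: pull out and print a single line's fit for inspection
# ("Northeast_Corridor" is illustrative; substitute any value of Line)
modelfit_byline %>% filter(Line == "Northeast_Corridor") %>% pull(fit)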

Separate Holdout (TODO: Update Tuning)

library(pROC)
tr_ctrl <- trainControl(method = "cv",
                        number = 5,
                        savePredictions = TRUE,
                        classProbs = TRUE)

testModelHoldout <- function(df, tr_ctrl, method = "glm", formula, ...){
  ds_df <- df %>% droplevels()
  in_partition <- createDataPartition(ds_df$Class, p=.75, list=FALSE)
  fit <- train(formula, data = ds_df, method = method, trControl = tr_ctrl, subset = in_partition, ...)

  test_set <- ds_df[-in_partition, ]
  test_set$pred <- predict(fit, test_set)
  test_set$pred_prob <- predict(fit, test_set, type = "prob")$Yes
  cm <- confusionMatrix(test_set$pred, test_set$Class)

  # AUC on the holdout set
  roc_eval <- test_set %>% 
    arrange(desc(pred_prob)) %>% 
    mutate(Class = ifelse(Class == "Yes", 1, 0)) %>%
    summarise(auc = auc(roc(Class, pred_prob)))
  data.frame(t(c(cm$overall, cm$byClass)),"AUC" = roc_eval$auc)
}

formula_byline <- Class ~ 
  as.character(dep_hour) + 
  as.character(dep_wday) + 
  as.character(dep_mon) +
  ttl_line +
  Temp_F + 
  Visibility + 
  WindSpeed
results_ho_byline <- data_downsampled %>% 
  group_by(Line) %>% 
  do(testModelHoldout(., tr_ctrl, method = "rf", formula_byline, ntree = 80))
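
# Tabulate holdout performance per line (Accuracy comes from cm$overall,
# AUC from the pROC call above); kable() is from knitr, loaded earlier
results_ho_byline %>% select(Line, Accuracy, AUC) %>% kable(digits = 3)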

# Time-based split for a separate holdout: train on pre-2015 data, test on 2015+.
# The cutoff must be parsed as a date; a bare 2015-01-01 is arithmetic (i.e. 2013).
ts_testing <- tr_data %>% filter(Scheduled_Start_Time >= ymd("2015-01-01"))
ts_train_full <- tr_data %>% filter(Run_Date < ymd("2015-01-01"))

# Rebuild the random 80/20 partition (mirrors the split used above)
in_train <- createDataPartition(data_downsampled$Class, p = .80, list = FALSE)
training <- data_downsampled[in_train, ]
testing <- data_downsampled[-in_train, ]

