library(dplyr) # library(njtPredict) library(lubridate) library(ggplot2) library(tidyr) library(knitr) library(caret) # devtools::load_all() #Toggle if not in the mood to rebuild after change data("njt_features") opts_chunk$set(echo = TRUE, message = FALSE, prompt = FALSE, warning = FALSE, cache = TRUE)
set.seed(42) tr_data <- njt_features %>% ungroup %>% mutate(Line = gsub(" " , "_", Line), is_delayed = factor(is_delayed, levels = c(TRUE,FALSE), labels = c("Yes","No"))) %>% # Need to remove these fields for the complete.cases function (these aren't used as features) select(-Actual_End_Time, -delay_length, -ttl_dep_arv_line) tr_data <- tr_data[complete.cases(tr_data), ] #Removes only about 8K entries # Using downsampling per "Line" to account for the Unbalanced training data data_downsampled <- tr_data %>% group_by(Line) %>% do(downSample(., .$is_delayed))
# Register the # of cores to use library(doMC) registerDoMC(cores = 7) # Specify a fixed control object for accurate comparison between samples train_control <- trainControl(method = "repeatedcv", number = 5, repeats = 10, ## Estimate class probabilities classProbs = TRUE, ## Evaluate performance using ## the following function summaryFunction = twoClassSummary) line_formula <- Class ~ Line + as.character(dep_hour) + as.character(dep_wday) + as.character(dep_mon) + ttl_line + Temp_F + Visibility + WindSpeed # Withold 20% of the data for an independent evaluation in_train <- createDataPartition(data_downsampled$Class, p=.80, list=FALSE) training <- data_downsampled[in_train, ] testing <- data_downsampled[-in_train, ] save(in_train, training, testing, train_control, file = "data/training_data.rda")
# Logistic Regression logit_fit <- train(line_formula, data = training, tuneLength = 10, #Not necessary trControl = train_control, method = "glm", family = "binomial", metric = "ROC") save(logit_fit, file = "data/logit_fit.rda") # Regularized Logistic Regression glmnet_fit <- train(line_formula, data = training, tuneLength = 10, trControl = train_control, method = "glmnet", family = "binomial", metric = "ROC") save(glmnet_fit, file = "data/glmnet_fit.rda") # Random Forest rf_fit <- train(line_formula, data = training, tuneLength = 10, trControl = train_control, metric = "ROC", method = "rf", ntree = 80) save(rf_fit, file = "data/rf_fit.rda") # Gradient Boosted Model gbm_fit <- train(line_formula, data = training, tuneLength = 10, trControl = train_control, metric = "ROC", method = "gbm") save(gbm_fit, file = "data/gbm_fit.rda") # K-nearest neighbors knn_fit <- train(line_formula, data = training, tuneLength = 10, trControl = train_control, metric = "ROC", method = "knn", preProc = c("center", "scale")) save(knn_fit, file = "data/knn_fit.rda") # Naive Bayes nb_fit <- train(line_formula, data = training, tuneLength = 10, trControl = train_control, metric = "ROC", method = "nb", preProc = c("center", "scale")) save(nb_fit, file = "data/nb_fit.rda") ## SVM # Too slow, not run # smvlin_fit <- train(line_formula, # data = training, # tuneLength = 5, # trControl = train_control, # metric = "ROC", # method = "svmLinear", # preProc = c("center", "scale")) # save(svmlin_fit, file = "data/svmlin_fit.rda")
data(list = list("glmnet_fit","rf_fit","knn_fit","gbm_fit")) resamps <- resamples(list( "Logistic Regression" = logit_fit, "Regularized Regression" = glmnet_fit, "Random Forest" = rf_fit, "K-Nearest Neighbors" = knn_fit, "Naive Bayes" = nb_fit)) diff(resamps) %>% summary bwplot(resamps, layout = c(3, 1))
#Convert resamples values to dataframe for use in gplot resamps_df <- resamps$values %>% gather(method_metric, value, -Resample) %>% separate(method_metric, c("method","metric"),sep = "~") resamps_df %>% ggplot(aes(method, value)) + geom_boxplot() + facet_wrap(~metric, scale = "free_y") + theme_bw()
model_predictions <- predict(rf_fit, testing, type = "prob" ) confusionMatrix(model_predictions, testing$Class)
library(pROC) roc_test <- testing roc_test$pred_prob <- model_predictions$Yes roc_test <- roc_test %>% arrange(desc(pred_prob)) %>% mutate(Class = ifelse(Class == "Yes",1,0)) roc_obj <- roc(roc_test$Class,roc_test$pred_prob) roc_df <- data.frame(Sensitivity = roc_obj$sensitivities, Specificity = roc_obj$specificities) roc_df %>% ggplot(aes(1-Specificity, Sensitivity)) + geom_line(color = "red2", cex = 1) + geom_abline(intercept = 0, slope = 1, color = "black", linetype = 2) + theme_classic() + scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))
# Caret version seems to be missing the trim method for train, manually entering it to decrease the size of the # model on disk (for the dashboard) # https://github.com/topepo/caret/blob/master/pkg/caret/R/trim.R trim <- function(object, ...) UseMethod("trim") trim.train <- function(object, ...) { removals <- c("results", "pred", "bestTune", "call", "dots", "metric", "trainingData", "resample", "resampledCM", "perfNames", "maxmimize", "times") for(i in removals) if(i %in% names(object)) object[i] <- NULL c_removals <- c('method', 'number', 'repeats', 'p', 'initialWindow', 'horizon', 'fixedWindow', 'verboseIter', 'returnData', 'returnResamp', 'savePredictions', 'summaryFunction', 'selectionFunction', 'index', 'indexOut', 'timingSamps', 'trim', 'yLimits') for(i in c_removals) if(i %in% names(object$control)) object$control[i] <- NULL if(!is.null(object$modelInfo$trim)) object$finalModel <- object$modelInfo$trim(object$finalModel) object } modelfit <- train(line_formula, method = "rf", data = training, trControl = trainControl(method = "none", classProbs = FALSE), tuneGrid = data.frame(mtry = 14), ntree = 80, trim = TRUE) modelfit <- trim(modelfit) saveRDS(modelfit, file = "railactive/data/modelfit.rds") mf_predict <- predict(modelfit, testing) confusionMatrix(mf_predict, testing$Class)
modelfit_byline <- data_downsampled %>% group_by(Line) %>% do(fit = train(Class ~ as.character(dep_hour) + as.character(dep_wday) + as.character(dep_mon) + ttl_line + Temp_F + Visibility + WindSpeed, data = ., method = "rf", trControl = trainControl(method = "cv", number = 5, savePredictions = TRUE, classProb = TRUE), ntree = 80)) save(modelfit_byline, file = "../../DelayPredictor/modelfit_byline.rda")
library(pROC) tr_ctrl = trainControl(method = "cv", number = 5, savePredictions = TRUE, classProb = TRUE) testModelHoldout <- function(df, tr_ctrl, method = "glm", formula, ...){ ds_df <- df %>% droplevels() in_partition <- createDataPartition(ds_df$Class, p=.75, list=FALSE) fit <- train(formula, data = ds_df, method = method, trControl = tr_ctrl, subset = in_partition, ...) test_set <- ds_df[-in_partition,] test_set$pred <- predict(fit, ds_df[-in_partition,]) test_set$pred_prob <- predict(fit, ds_df[-in_partition,], type = "prob")$Yes cm <- confusionMatrix(test_set$pred, test_set$Class) # Get AUC per Fold roc_eval <- test_set %>% arrange(desc(pred_prob)) %>% mutate(Class = ifelse(Class == "Yes",1,0)) %>% summarise(auc = auc(roc(Class,pred_prob))) data.frame(t(c(cm$overall, cm$byClass)),"AUC" = roc_eval$auc) } formula_byline <- Class ~ as.character(dep_hour) + as.character(dep_wday) + as.character(dep_mon) + ttl_line + Temp_F + Visibility + WindSpeed results_ho_byline <- data_downsampled %>% group_by(Line) %>% do(testModelHoldout(.,tr_ctrl, method = "rf", formula_byline, ntree = 80))
ts_testing <- tr_data %>% filter(Scheduled_Start_Time >= 2015-01-01) ts_train_full <- tr_data %>% filter(Run_Date < 2015-01-01) in_train <- training <- data_downsampled[in_train, ] testing <- data_downsampled[-in_train, ]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.