library(dplyr)
library(njtPredict)
library(lubridate)
library(ggplot2)
# library(broom)
library(tidyr)
library(knitr)
# devtools::load_all()

opts_chunk$set(echo = TRUE,
               message = FALSE,
               prompt = FALSE,
               warning = FALSE,
               cache = TRUE, 
               fig.width = 14,
               fig.height = 8)
data("njt_features")
njt_features <- njt_features %>% mutate(is_delayed = factor(is_delayed, levels = c(TRUE,FALSE), labels = c("Yes","No")))

Overview

The purpose of this notebook is to characterize & explore the njt data and weather data.

For Live/Hourly Data

Exploring General Attributes

njt_features %>% 
  count(Line, is_delayed) %>% 
  spread(is_delayed, n) %>%
  mutate(pct_delays = Yes/(No+Yes)*100) %>%
  arrange(desc(pct_delays)) %>%
  kable

Not suprisingly, Corridor has the most amount of delays.

Exploring Calendar Attributes

Delays By Day of Weekday/Line

#To easly adjust all graphs, standarizing via function
plotPctFacet <- function(df,x_var){
  ggplot(df,aes_string(x_var, "pct_delays")) + 
  geom_bar(stat="identity") + 
  facet_wrap(~Line, scale = "free", ncol = 5) +
  theme_classic()
}
njt_features %>% 
  count(Line, dep_wday, is_delayed) %>% 
  spread(is_delayed,n) %>% 
  mutate(pct_delays = Yes/(No+Yes)*100) %>% 
  plotPctFacet(x_var = "dep_wday")

Delays By Day of Month/Line

njt_features %>% 
  count(Line, dep_mon, is_delayed) %>% 
  spread(is_delayed,n) %>% 
  mutate(pct_delays = Yes/(No+Yes)*100) %>% 
  plotPctFacet(x_var = "dep_mon")

Delays By Day of Hour/Line

njt_features %>% 
  count(Line, dep_hour, is_delayed) %>% 
  spread(is_delayed,n) %>% 
  mutate(pct_delays = Yes/(No+Yes)*100) %>% 
  plotPctFacet(x_var = "dep_hour")

Time to Last Delay

Fraction of delays when previous delay occurred in the last 90 minutes

previous_delay_window = 90

Using any delay on the line

plotPrDelayFacet <- function(df){
  df %>% 
    filter(!is.na(prdelay)) %>% 
    count(Line, prdelay, is_delayed) %>%
    spread(is_delayed,n) %>% 
    mutate(pct_delays = Yes/(No+Yes)*100) %>% 
    ggplot(aes(prdelay, pct_delays)) + 
    geom_bar(stat="identity", position = "dodge") + 
    facet_wrap(~Line, scale = "free_y", ncol = 5) +
    theme_classic() +
    xlab(paste("Delay Occurred withing the last",previous_delay_window,"minutes")) +
    ylab("% of all Trains")
}
njt_features %>% 
  mutate(prdelay = ttl_line <= previous_delay_window) %>%
  plotPrDelayFacet

Using delays with fixed departure station

njt_features %>%
  mutate(prdelay = ttl_dep_line <= previous_delay_window) %>% 
  plotPrDelayFacet

Using delays with fixed departure & arrival stations

njt_features %>% 
  mutate(prdelay = ttl_dep_arv_line <= previous_delay_window) %>%
  plotPrDelayFacet

Using Weather Data

TODO

Comparing Delay Reasons

Need to propogate Reasons (MVP++)

Analyzing Daily Trends



dimagor/railActive documentation built on May 15, 2019, 8:44 a.m.