knitr::opts_chunk$set(echo = TRUE)

Getting the Data

Base

#Map lines to ttld & Name
data("njt_features")
line_info <- data.frame(Line = unique(njt_features$Line))
line_ttld <- read.table("external_data/line_model_map.txt", header = TRUE)
main_info <- inner_join(line_info, line_ttld, by = "Line") %>% mutate(Line = gsub(" ","_",Line))

write.csv(main_info, file = "../shinyapp/main_info.csv", row.names = FALSE)
pred_df <- main_info %>% mutate(Current_Date = Sys.time(),
                     dep_hour = hour(Current_Date),
         dep_mon = month(Current_Date, label = TRUE),
         dep_wday = wday(Current_Date, label = TRUE))

Last Delay

max_delay = 24*60
ttld_file <- read.csv("http://52.88.4.39/ttld_log.txt")
pred_df <- ttld_file %>% 
  select(ttld_Line = Line, Scheduled_Start_Time) %>% 
  right_join(pred_df) %>% 
  mutate(ttl_line = as.numeric(difftime(Current_Date, Scheduled_Start_Time, units = "mins"))) %>% 
  mutate(ttl_line = ifelse(is.na(ttl_line)|ttl_line > max_delay, max_delay, ttl_line))

Weather

# hourly_newark <- "http://api.wunderground.com/api/547710b840da62b4/conditions/q/NJ/Newark.json"
hourly_newark <- "http://forecast.weather.gov/MapClick.php?lat=40.7242&lon=-74.1726&FcstType=json"
w_get <- GET(hourly_newark)
w_text <- content(w_get, as = "text") %>% fromJSON()
pred_df$Temp_F <- as.numeric(w_text$currentobservation$Temp)
pred_df$Visibility <- as.double(w_text$currentobservation$Visibility)
pred_df$WindSpeed <- as.numeric(w_text$currentobservation$Winds)
predict_prob <- inner_join(pred_df, modelfit_byline, by = "Line")  %>%
      mutate(Line_Name = Full.Name) %>%
      group_by(Line_Name)  %>%
      do(data.frame(p=predict(.$fit, ., type = "prob")[[1]][[1]][1]))

12 Hour Forecast

w_get_hourly <- GET("http://api.wunderground.com/api/547710b840da62b4/hourly/q/NJ/Newark.json")
w_text_hourly <- content(w_get_hourly, as = "text") %>% fromJSON()

hourly_forecast <- w_text_hourly$hourly_forecast %>% flatten %>% 
  transmute(WindSpeed = wspd.english,
            Temp_F = temp.english,
    Date_Time = paste(FCTTIME.hour_padded, 
                             FCTTIME.min, 
                             FCTTIME.mon_padded, 
                             FCTTIME.mday_padded, 
                             FCTTIME.year)) %>%
  mutate(Date_Time = as.POSIXct(strptime(Date_Time, format = "%H %M %m %d %Y")),
         Visibility = pred_df$Visibility[1],
         dep_hour = hour(Date_Time),
         dep_mon = month(Date_Time, label = TRUE),
         dep_wday = wday(Date_Time, label = TRUE)) %>% 
  slice(1:12)

Conditional Probability Tables

data("njt_features")
njt_features <- njt_features %>% ungroup %>% 
  mutate(is_delayed = factor(is_delayed, levels = c(TRUE,FALSE), labels = c("Yes","No")),
                                        Line = gsub(" ","_",Line))
cpt_base <- njt_features %>% 
  count(Line, is_delayed) %>% 
  spread(is_delayed, n) %>%
  transmute(Line, base_pct = Yes/(No+Yes)*100) %>%
  arrange(desc(base_pct))
cpt_wday <- njt_features %>% 
  count(Line, is_delayed, dep_wday) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, dep_wday, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "dep_wday", feature_val = dep_wday, pct, p_fold = log2(pct/base_pct))

cpt_mon <- njt_features %>% 
  count(Line, is_delayed, dep_mon) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, dep_mon, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "dep_mon", feature_val = dep_mon, pct, p_fold = log2(pct/base_pct))

cpt_hour <- njt_features %>% 
  count(Line, is_delayed, dep_hour) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, dep_hour, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "dep_hour", feature_val = as.factor(dep_hour), pct, p_fold = log2(pct/base_pct))
breaks_WindSpeed = quantile(njt_features$WindSpeed, probs= seq(0,1,.25))
cpt_WindSpeed <- njt_features %>% group_by(Line) %>%
  mutate(bin = cut(WindSpeed, breaks_WindSpeed , include.lowest = TRUE)) %>% 
  count(Line, is_delayed, bin) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, bin, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "WindSpeed_break", feature_val = as.factor(bin), pct, p_fold = log2(pct/base_pct))

breaks_Temp_F = c(min(njt_features$Temp_F),seq(33,126,20), max(njt_features$Temp_F))
cpt_Temp_F <- njt_features %>% group_by(Line) %>%
  mutate(bin = cut(Temp_F, breaks_Temp_F , include.lowest = TRUE)) %>% 
  count(Line, is_delayed, bin) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, bin, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "Temp_F_break", feature_val = as.factor(bin), pct, p_fold = log2(pct/base_pct))


breaks_Visibility = c(seq(1,45,10),max(njt_features$Visibility))
cpt_Visibility <- njt_features %>% group_by(Line) %>%
  mutate(bin = cut(Visibility, breaks_Visibility , include.lowest = TRUE)) %>% 
  count(Line, is_delayed, bin) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, bin, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "Visibility_break", feature_val = as.factor(bin), pct, p_fold = log2(pct/base_pct))
breaks_ttl_line = c(seq(0,180,30),max(njt_features$ttl_line))
cpt_ttl_line <- njt_features %>% group_by(Line) %>%
  mutate(bin = cut(ttl_line, breaks_ttl_line , include.lowest = TRUE)) %>% 
  count(Line, is_delayed, bin) %>% 
  spread(is_delayed, n) %>%
  mutate(Line, bin, pct = Yes/(No+Yes)*100) %>% 
  inner_join(cpt_base, by = "Line") %>% 
  transmute(Line, feature = "ttl_line_break", feature_val = as.factor(bin), pct, p_fold = log2(pct/base_pct))
cpt_complete <- do.call(rbind,list(cpt_wday, cpt_mon, cpt_hour, cpt_Visibility, cpt_Temp_F, cpt_WindSpeed, cpt_ttl_line))
# save(cpt_complete, breaks_WindSpeed, breaks_ttl_line, breaks_Visibility, breaks_Temp_F,  file="shinyapp/cpt_complete.rda")
pred_cpt <- pred_df %>% mutate(
  ttl_line_break = cut(ttl_line, breaks_ttl_line , include.lowest = TRUE),
  Temp_F_break = cut(Temp_F, breaks_Temp_F , include.lowest = TRUE),
  WindSpeed_break = cut(WindSpeed, breaks_WindSpeed , include.lowest = TRUE),
  Visibility_break = cut(Visibility, breaks_Visibility , include.lowest = TRUE)
) %>% 
  select(Line, dep_hour:Visibility_break) %>% gather(feature, feature_val, -Line, -ttl_line:-WindSpeed) %>% 
  inner_join(cpt_complete) %>% 
  mutate(feature = factor(feature, 
                          levels = c("ttl_line",
                                     "Temp_F", 
                                     "WindSpeed",
                                     "Visibility", 
                                     "dep_hour",
                                     "dep_wday",
                                     "dep_mon"),
                          labels = c("Time To Last Delay",
                                     "Current Temperature",
                                     "Current Wind Speed",
                                     "Current Visibility",
                                     "Time of Day",
                                     "Day of the Week",
                                     "Current Month")
                          ))
pred_cpt  %>% ggplot(aes(feature,p_fold, fill = p_fold)) + geom_bar(stat = "identity") + facet_wrap(~Line, ncol = 3) + coord_flip() + theme_minimal() + scale_fill_gradient2(low = "blue", mid = "black", high = "red", midpoint = 0) + ylim(-1,1)


dimagor/railActive documentation built on May 15, 2019, 8:44 a.m.