library(dplyr) library(stringr) # library(njtPredict) library(lubridate) #Make sure the latest version is installed to avoid error in rounding dates devtools::load_all() data("njt_trains") opts_chunk$set(echo = TRUE, message = FALSE, prompt = FALSE, warning = FALSE, cache = FALSE, eval = FALSE)
This data was obtained directly from NJT and contains ~5 years of departure & arrival times for NJT trains.
data("njt_trains") delay_thresh = 10 njt_features <- njt_trains %>% filter(Line != "ACES_AC_EXPRE") %>% mutate(delay_length = as.numeric(difftime(Scheduled_End_Time,Actual_End_Time, units = "mins")), #Ignore early arrivals delay_length = ifelse(delay_length < 0, 0, delay_length), # NA's are likely cancellations, consider them as part of delays is_delayed = delay_length >= delay_thresh | is.na(delay_length), Line = gsub(" " , "_", Line)) %>% group_by(Line)
#Add features based on date/time njt_features <- njt_features %>% mutate(round_time = round_date(Scheduled_Start_Time, unit = "hour"), # Used as key to match to weather dep_hour = hour(round_time), dep_mon = month(round_time, label = TRUE), dep_wday = wday(round_time, label = TRUE))
getLastDelay <- function(df, group = "Line"){ #Set Max Time to prevent NA's and the very far delays max_delay <- 24 * 60 #24 Hours #Great solution adapted from http://stackoverflow.com/questions/30391333/calculate-days-since-last-event-in-r df <- df[order(df$Scheduled_Start_Time, decreasing = FALSE),] last_event_index <- cumsum(df$is_delayed) + 1 # shift it by one to the right last_event_index <- c(1, last_event_index[1:length(last_event_index) - 1]) # get the dates of the events and index the vector with the last_event_index, # added an NA as the first date because there was no event last_event_date <- c(as.POSIXct(NA), df[which(df$is_delayed), "Scheduled_Start_Time"])[last_event_index] # substract the event's date with the date of the last event last_delay <- difftime(df$Scheduled_Start_Time, last_event_date, units = "mins") last_delay_fixed <- pmin(last_delay, max_delay) last_delay_fixed[is.na(last_delay_fixed)] <- max_delay df[,paste0("ttl_",group)] <- last_delay_fixed df } njt_features <- njt_features %>% group_by(Line) %>% do(getLastDelay(. ,group = "line")) %>% #Delay length on line group_by(Line, Scheduled_Departure_Terminal) %>% do(getLastDelay(., group = "dep_line")) %>% #Delay length considering departure station group_by(Line, Scheduled_Departure_Terminal, Actual_Arrival_Terminal) %>% do(getLastDelay(., group = "dep_arv_line")) %>% group_by(Line, Actual_Arrival_Terminal) %>% do(getLastDelay(., group = "arv_line"))#Delay length only arrival station
data("weather_hourly") weather_hourly_newark <- filter(weather_hourly, WBAN == 14734) %>% mutate(round_time = round_date(Time,unit = "hour")) %>% select(round_time, Visibility, Temp_F = DryBulbFarenheit, Temp_C = DryBulbCelsius, WindSpeed, HourlyPrecip) %>% mutate(Visibility = as.numeric(Visibility), Temp_F = as.numeric(Temp_F), Temp_C = as.numeric(Temp_C), WindSpeed = as.numeric(WindSpeed), HourlyPrecip = as.numeric(HourlyPrecip)) %>% group_by(round_time) %>% slice(1) #Take only the first entry when more than one entry exists per hour # Clean up weather data weather_hourly_newark[,-1] <- apply(weather_daily_newark[,-1],2, function(f){ f[f == "M"] <- NA f[f == " T"] <- 0 #Trace can be considered at 0 f}) njt_features <- inner_join(njt_features, weather_hourly_newark, by = "round_time")
# Clean up reasons reason_types <- njt_features %>% filter(Delay_Reason != "") %>% count(Delay_Reason) %>% select(Delay_Reason) %>% mutate(reason_entity = gsub("\\.","", word(Delay_Reason,1)), reason_type = word(Delay_Reason,start = 2, end = -1)) njt_features <- left_join(njt_features, reason_types, by = "Delay_Reason") # Fix NA for use in completecases later njt_features <- njt_features %>% mutate(reason_entity = ifelse(is.na(reason_entity),"",reason_entity), reason_type = ifelse(is.na(reason_type),"",reason_type))
devtools::use_data(njt_features, overwrite = TRUE)
To predict number of delays (greater than 10 minutes) per day.
njt_featdaily <- njt_features %>% group_by(Run_Date, Line) %>% summarise(number_delays = sum(is_delayed)) %>% mutate(dep_mon = month(Run_Date, label = TRUE), dep_wday = wday(Run_Date, label = TRUE))
data("weather_daily") weather_daily_newark <- filter(weather_daily, WBAN == 14734) weather_daily_newark <- weather_daily_newark %>% select(Run_Date = YearMonthDay, Tmax, Tmin, Tavg, PrecipTotal, SnowFall, WindSpeed = Max5Speed) weather_daily_newark[,-1] <- apply(weather_daily_newark[,-1],2, function(f){ f[f == "M"] <- NA f[f == " T"] <- 0 as.numeric(f)}) # weather_daily_newark <- weather_daily_newark[complete.cases(weather_daily_newark),] #Toss NA's weather_daily_newark <- weather_daily_newark %>% group_by(Run_Date) %>% slice(1) #Take only the first entry when more than one entry exists per hour njt_featdaily <- njt_featdaily %>% inner_join(weather_daily_newark, by = "Run_Date" )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.