library(dplyr)
library(lubridate)
# Global knitr chunk options for this document.
# eval = FALSE stops the import chunks below from running on every knit;
# set eval = TRUE to regenerate the saved data files while knitting.
# opts_chunk is namespace-qualified because knitr is not attached with
# library() in this file, so the bare name would not be found.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  prompt = FALSE,
  warning = FALSE,
  cache = FALSE,
  eval = FALSE
)
# Station lookup table: read the tab-delimited station extract and store it
# as package data, replacing any previously saved copy.
njt_ids <- read.delim(
  file = "../external_data/njt_data/STATIONS_EXTRACT.TXT",
  sep = "\t"
)
devtools::use_data(njt_ids, overwrite = TRUE)

# Column names for the train extracts, in file order (provided by the Word
# document that accompanies the data).
njt_colnames <- c(
  "Run_Date",
  "Train_ID",
  "Scheduled_Departure_Terminal",
  "Scheduled_Start_Time",
  "Actual_Arrival_Terminal",
  "Scheduled_End_Time",
  "Actual_End_Time",
  "Delay_Reason",
  "Line"
)

# Column classes, matched by position to njt_colnames. Kept as a character
# vector, which is the form read.table() documents for colClasses. The date
# and time fields are read as character here and parsed into POSIXct later.
njt_coltypes <- c(
  "character", # Run_Date
  "numeric",   # Train_ID
  "factor",    # Scheduled_Departure_Terminal
  "character", # Scheduled_Start_Time
  "factor",    # Actual_Arrival_Terminal
  "character", # Scheduled_End_Time
  "character", # Actual_End_Time
  "factor",    # Delay_Reason
  "factor"     # Line
)

# Fixed width info obtained from "Record-Layout-ALL-TRAINS.doc"
njt_widths <- c(8, 6, 4, 4, 4, 4, 4, 30, 13)

# Read one year's fixed-width train extract using the shared layout above.
# `year` is the four-digit year embedded in the file name; returns a
# data.frame with the columns named in njt_colnames.
# NOTE(review): the station extract is read from ../external_data/ while these
# files are read from ../data/ -- confirm which directory is correct.
read_njt_year <- function(year) {
  read.fwf(
    file.path("../data/njt_data", paste0("TRAIN_EXTRACT_", year, ".TXT")),
    widths = njt_widths,
    colClasses = njt_coltypes,
    col.names = njt_colnames
  )
}

njt_2010 <- read_njt_year(2010)
njt_2011 <- read_njt_year(2011)
njt_2012 <- read_njt_year(2012)
njt_2013 <- read_njt_year(2013)
njt_2014 <- read_njt_year(2014)
njt_2015 <- read_njt_year(2015)

# Stack all years into a single table.
njt_trains <- rbind(njt_2010, njt_2011, njt_2012, njt_2013, njt_2014, njt_2015)

# Convert the time-of-day columns to POSIXct.
## POSIXct cannot hold a bare time, so each HHMM value is combined with the
## row's Run_Date (still a YYYYMMDD string at this point) before parsing.
for (time_col in c("Scheduled_Start_Time", "Scheduled_End_Time", "Actual_End_Time")) {
  stamp <- paste(njt_trains$Run_Date, njt_trains[[time_col]])
  njt_trains[[time_col]] <- as.POSIXct(strptime(stamp, format = "%Y%m%d %H%M"))
}
## Run_Date is converted last because the loop above needs its raw string form.
njt_trains$Run_Date <- as.POSIXct(strptime(njt_trains$Run_Date, format = "%Y%m%d"))

# Push end times to the following day for trains that arrived the next day.
# A row qualifies when both timestamps are present and the hour of the
# scheduled start is greater than the hour of the end time. Rows are updated
# through a logical mask because ifelse() tends to destroy POSIX formatting.
has_start <- !is.na(njt_trains$Scheduled_Start_Time)
start_hour <- hour(njt_trains$Scheduled_Start_Time)

sched_overnight <- !is.na(njt_trains$Scheduled_End_Time) & has_start &
  start_hour > hour(njt_trains$Scheduled_End_Time)
njt_trains$Scheduled_End_Time[sched_overnight] <-
  njt_trains$Scheduled_End_Time[sched_overnight] + days(1)

actual_overnight <- !is.na(njt_trains$Actual_End_Time) & has_start &
  start_hour > hour(njt_trains$Actual_End_Time)
njt_trains$Actual_End_Time[actual_overnight] <-
  njt_trains$Actual_End_Time[actual_overnight] + days(1)

# Strip the surrounding whitespace from the terminal and delay-reason fields
# so that values can be matched exactly when filtering.
padded_cols <- c(
  "Scheduled_Departure_Terminal",
  "Actual_Arrival_Terminal",
  "Delay_Reason"
)
njt_trains[padded_cols] <- lapply(njt_trains[padded_cols], trimws)

# Drop rows whose scheduled start time is missing.
njt_trains <- filter(njt_trains, !is.na(Scheduled_Start_Time))

# Save the cleaned table as package data, replacing any existing copy.
devtools::use_data(njt_trains, overwrite = TRUE)
# Daily and hourly weather observations (obtained from NOAA).
# NOTE(review): the daily file is read from ../external_data/ but the hourly
# file from ../data/ -- confirm which directory is correct.

# Daily observations: parse the YYYYMMDD day key into POSIXct.
weather_daily <- read.csv(file = "../external_data/weather_data/weather/NY_NJ-daily_weather.csv") %>%
  mutate(YearMonthDay = as.POSIXct(strptime(YearMonthDay, format = "%Y%m%d")))

# Hourly observations: Time is read as character so it can be parsed with
# the "%H%M" format below.
weather_hourly <- read.csv(
  file = "../data/weather_data/weather/NY_NJ-hourly_weather.csv",
  colClasses = c("Time" = "character")
)
# Time is built first because it is pasted from the raw Date value, which the
# next line overwrites with a POSIXct.
weather_hourly$Time <- as.POSIXct(strptime(paste(weather_hourly$Date, weather_hourly$Time), format = "%Y%m%d %H%M"))
weather_hourly$Date <- as.POSIXct(strptime(weather_hourly$Date, format = "%Y%m%d"))

# overwrite = TRUE matches the other use_data() calls in this file, so
# re-running the import does not stop on data files that already exist.
devtools::use_data(weather_daily, overwrite = TRUE)
devtools::use_data(weather_hourly, overwrite = TRUE)


# dimagor/railActive documentation built on May 15, 2019, 8:44 a.m.