# Removing problems functions
#' Split a GTFS object based on trip_ids
#'
#' @param gtfs gtfs list
#' @param trip_ids a vector of trips ids
#' @return Returns a named list of two gtfs objects. The `true` list contains
#' trips that matched `trip_ids` the `false` list contains all other trips.
#'
#' @export
gtfs_split_ids <- function(gtfs, trip_ids) {
agency <- gtfs$agency
stops <- gtfs$stops
routes <- gtfs$routes
trips <- gtfs$trips
stop_times <- gtfs$stop_times
calendar <- gtfs$calendar
calendar_dates <- gtfs$calendar_dates
trips_true <- trips[trips$trip_id %in% trip_ids, ]
trips_false <- trips[!trips$trip_id %in% trip_ids, ]
stop_times_true <- stop_times[stop_times$trip_id %in% trips_true$trip_id, ]
stop_times_false <- stop_times[stop_times$trip_id %in% trips_false$trip_id, ]
routes_true <- routes[routes$route_id %in% trips_true$route_id, ]
routes_false <- routes[routes$route_id %in% trips_false$route_id, ]
calendar_true <- calendar[calendar$service_id %in% trips_true$service_id, ]
calendar_false <- calendar[calendar$service_id %in% trips_false$service_id, ]
calendar_dates_true <- calendar_dates[calendar_dates$service_id %in% trips_true$service_id, ]
calendar_dates_false <- calendar_dates[calendar_dates$service_id %in% trips_false$service_id, ]
stops_true <- stops[stops$stop_id %in% stop_times_true$stop_id, ]
stops_false <- stops[stops$stop_id %in% stop_times_false$stop_id, ]
agency_true <- agency[agency$agency_id %in% routes_true$agency_id, ]
agency_false <- agency[agency$agency_id %in% routes_false$agency_id, ]
gtfs_true <- list(agency_true, stops_true, routes_true, trips_true, stop_times_true, calendar_true, calendar_dates_true)
gtfs_false <- list(agency_false, stops_false, routes_false, trips_false, stop_times_false, calendar_false, calendar_dates_false)
names(gtfs_true) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates")
names(gtfs_false) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates")
result <- list(gtfs_true, gtfs_false)
names(result) <- c("true", "false")
return(result)
}
#' Find fast trips
#' @description Fast trips can identify problems with the input data or
#' conversion process. This function returns trip_ids for trips that exceed
#' `maxspeed`.
#' @param gtfs list of gtfs tables
#' @param maxspeed the maximum allowed speed in metres per second default 83 m/s
#' (about 185 mph the max speed of trains on HS1 line)
#' @param routes logical, do one trip per route, faster but may miss some trips
#' @details The function looks a straight line distance between each stop and
#' detects the fastest segment of the journey. A common cause of errors is
#' that a stop is in the wrong location so a bus can appear to teleport
#' across the country in seconds.
#' @export
gtfs_fast_trips <- function(gtfs, maxspeed = 83, routes = TRUE) {
if(routes){
gtfs$trips <- gtfs$trips[!duplicated(gtfs$trips$route_id),]
gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$trip_id %in% gtfs$trips$trip_id,]
}
trips <- gtfs$stop_times
#times$stop_sequence <- as.integer(times$stop_sequence)
trips <- dplyr::left_join(trips, gtfs$stops, by = "stop_id")
trips$distance <- geodist::geodist(as.matrix(trips[,c("stop_lon","stop_lat")]), sequential = TRUE, pad = TRUE)
trips$distance[trips$stop_sequence == 1] <- NA
trips$time <- dplyr::if_else(is.na(trips$arrival_time), trips$departure_time, trips$arrival_time)
if(inherits(trips$time, "character")){
trips$time <- as.POSIXct(trips$time, format="%H:%M:%S", origin = "1970-01-01")
}
if(inherits(trips$time, "Period")){
trips$time2 <- c(NA, as.numeric(trips$time[2:nrow(trips)] - trips$time[1:(nrow(trips)-1)]))
} else {
trips$time <- as.POSIXct(trips$time, format="%H:%M:%S", origin = "1970-01-01")
trips$time2 <- c(NA, difftime(trips$time[2:nrow(trips)], trips$time[1:(nrow(trips)-1)]))
}
trips$speed <- trips$distance / trips$time2
trips$speed[trips$speed == Inf] <- NA
times <- dplyr::group_by(trips, trip_id)
times <- dplyr::summarise(times,
max_speed = max(speed, na.rm = TRUE)
)
times <- times[times$max_speed > maxspeed,]
return(times$trip_id)
}
#' Find fast stops
#' @description A varient of gtfs_fast_trips that can detect stops that may be in the wrong location
#' @param gtfs list of gtfs tables
#' @param maxspeed the maximum allowed speed in metres per second default 83 m/s
#' (about 185 mph the max speed of trains on HS1 line)
#' @details The function looks a straight line distance between each stop and
#' detects the fastest segment of the journey. A common cause of errors is
#' that a stop is in the wrong location so a bus can appear to teleport
#' across the country in seconds.
#' @export
gtfs_fast_stops <- function(gtfs, maxspeed = 83) {
trips <- gtfs$stop_times
trips <- dplyr::left_join(trips, gtfs$stops, by = "stop_id")
trips$distance <- geodist::geodist(as.matrix(trips[,c("stop_lon","stop_lat")]), sequential = TRUE, pad = TRUE)
trips$distance[trips$stop_sequence == 1] <- NA
trips$time <- dplyr::if_else(is.na(trips$arrival_time), trips$departure_time, trips$arrival_time)
if(inherits(trips$time, "character")){
trips$time <- as.POSIXct(trips$time, format="%H:%M:%S", origin = "1970-01-01")
}
if(inherits(trips$time, "Period")){
trips$time2 <- c(NA, as.numeric(trips$time[2:nrow(trips)] - trips$time[1:(nrow(trips)-1)]))
} else {
trips$time <- as.POSIXct(trips$time, format="%H:%M:%S", origin = "1970-01-01")
trips$time2 <- c(NA, difftime(trips$time[2:nrow(trips)], trips$time[1:(nrow(trips)-1)]))
}
trips$speed <- trips$distance / trips$time2
trips$speed[trips$speed == Inf] <- NA
trips$speed_after <- c(trips$speed[2:nrow(trips)],NA)
trips$distance_after <- c(trips$distance[2:nrow(trips)],NA)
times <- dplyr::group_by(trips, stop_id)
times <- dplyr::summarise(times,
max_speed = round(max(c(speed,speed_after), na.rm = TRUE), 1),
min_speed = round(min(c(speed,speed_after), na.rm = TRUE), 1),
mean_speed = round(mean(c(speed,speed_after), na.rm = TRUE), 1),
max_distance = round(max(c(distance,distance_after), na.rm = TRUE), 0),
min_distance = round(min(c(distance,distance_after), na.rm = TRUE), 0),
mean_distance = round(mean(c(distance,distance_after), na.rm = TRUE), 0),
trips = dplyr::n()
)
times <- times[times$max_speed > maxspeed,]
times <- dplyr::left_join(times, gtfs$stops, by = "stop_id")
times <- sf::st_as_sf(times, coords = c("stop_lon","stop_lat"), crs = 4326)
return(times)
}
# gtfs_fast_trips <- function(gtfs, maxspeed = 30) {
# times <- gtfs$stop_times
# times$stop_sequence <- as.integer(times$stop_sequence)
# times <- dplyr::group_by(times, trip_id)
# times <- dplyr::summarise(times,
# nstops = dplyr::n(),
# time_start = arrival_time[stop_sequence == min(stop_sequence)],
# time_end = if (nstops == 2) {
# arrival_time[stop_sequence == max(stop_sequence)]
# } else {
# arrival_time[stop_sequence == stop_sequence[floor(stats::median(1:nstops))]]
# },
# stop_start = stop_id[stop_sequence == min(stop_sequence)],
# stop_end = if (nstops == 2) {
# stop_id[stop_sequence == max(stop_sequence)]
# } else {
# stop_id[stop_sequence == stop_sequence[floor(stats::median(1:nstops))]]
# }
# )
#
# stops <- gtfs$stops
# stops <- stops[, c("stop_id", "stop_lon", "stop_lat")]
# names(stops) <- c("stop_id", "from_lon", "from_lat")
# times <- dplyr::left_join(times, stops, by = c("stop_start" = "stop_id"))
# names(stops) <- c("stop_id", "to_lon", "to_lat")
# times <- dplyr::left_join(times, stops, by = c("stop_end" = "stop_id"))
# times$time_start <- lubridate::hms(times$time_start)
# times$time_end <- lubridate::hms(times$time_end)
# times$duration <- as.numeric(times$time_end - times$time_start)
# times$distance <- geodist::geodist(
# x = as.matrix(times[, c("from_lon", "from_lat")]),
# y = as.matrix(times[, c("to_lon", "to_lat")]),
# paired = TRUE
# )
# times$speed <- times$distance / times$duration
#
# fast_trips <- times$trip_id[times$speed > maxspeed]
# return(fast_trips)
# }
PUBLIC_SERVICE_CATEGORY = c("OL", "OU", "OO", "OW", "XC", "XD", "XI",
"XR", "XU", "XX", "XZ", "BR", "BS", "SS" )
#' Clean simple errors from GTFS files
#'
#' @param gtfs gtfs list
#' @param public_only Logical, if TRUE remove routes with route_type missing
#' @details
#' Task done:
#'
#' 0. Remove stops with no coordinates
#' 1. Remove stops with no location information
#' 2. Remove trips with less than two stops
#' 3. Remove stops that are never used
#' 4. Replace missing agency names with "MISSINGAGENCY"
#' 5. If service is not public and public_only=TRUE then remove it (freight, 'trips' aka charters)
#' (these have a null route_type, so loading into OpenTripPlanner fails if these are present)
#' 6. If public_only=TRUE then remove services with 'train_category' not for public use. e.g. EE (ECS-Empty Coaching Stock)
#' 7. Remove shapes that no longer have any trips
#' 8 Remove frequencies that no longer have any trips
#'
#' @export
gtfs_clean <- function(gtfs, public_only = FALSE) {
# 0 Remove stops with no coordinates
gtfs$stops <- gtfs$stops[!is.na(gtfs$stops$stop_lon) & !is.na(gtfs$stops$stop_lat), ]
# 1 Remove stop times with no locations
gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% unique(gtfs$stops$stop_id), ]
# 2 Remove trips with less than two stops
stop_count <- gtfs$stop_times
stop_count <- data.table::as.data.table(stop_count)
stop_count <- stop_count[, .N, by = "trip_id"]
gtfs$trips <- gtfs$trips[!("trip_id" %in% stop_count[N<2]$trip_id)]
# 3 Remove stops that are never used
gtfs$stops <- gtfs$stops[gtfs$stops$stop_id %in% unique(gtfs$stop_times$stop_id), ]
# 4 Replace "" agency_id with dummy name
gtfs$agency$agency_id[gtfs$agency$agency_id == ""] <- "MISSINGAGENCY"
gtfs$routes$agency_id[gtfs$routes$agency_id == ""] <- "MISSINGAGENCY"
gtfs$agency$agency_name[gtfs$agency$agency_name == ""] <- "MISSINGAGENCY"
# 5 remove calls, trips and routes that have an empty route_type (non public services)
# in addition to all the previous filtering - ECS moves were still making it into the output GTFS file, this removes them
if (public_only)
{
joinedTrips <- merge(gtfs$trips, gtfs$routes, by = "route_id", all.x = TRUE)
joinedCalls <- merge(gtfs$stop_times, joinedTrips, by = "trip_id", all.x = TRUE)
if ("train_category" %in% names(joinedCalls) )
{
filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type) &
joinedCalls$train_category %in% PUBLIC_SERVICE_CATEGORY, ]
}
else
{
filteredCalls <- joinedCalls[ !is.na( joinedCalls$route_type), ]
}
gtfs$stop_times <- filteredCalls[, names( gtfs$stop_times ), with=FALSE]
#what is this batty code I hear you cry ?!
gtfs$stop_times$arrival_time = gtfs$stop_times$arrival_time[ 1: nrow(gtfs$stop_times) ]
gtfs$stop_times$departure_time = gtfs$stop_times$departure_time[ 1: nrow(gtfs$stop_times) ]
#well, it's a bug workaround. Not entirely sure of the trigger, but when we have 30M stop times, and filter down to 1M
#the hour and minute component of the Period 'object' report a length of 30M, when there is only supposed to be 1M of them.
#The nrow() in the data.table says 1M, and the number of seconds in the period 'object' says 1M.
#Clearly 'object' is in big air quotes......
#as a result it blows up in gtfs_write() when writing because the sprintf moans about the input vectors being different lengths.
#This fixes it.
# see also stops_interpolate() "Needed because rbindlist doesn't work with periods for some reason" which smells like a similar workaround.
#stamping on the gc() button at the end of this fn for good measure.
#- remember kids, R is not suitable for production use.....
#after merging GTFS files we may have compressed the calendar and calendar_dates so a service pattern is used by
#multiple trips - so don't remove calendar and calendar_dates that link to routes with NA route_type in case
#it's in use by multiple trips/routes
if ("train_category" %in% names(joinedTrips) )
{
filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ) &
joinedTrips$train_category %in% PUBLIC_SERVICE_CATEGORY, ]
}
else
{
filteredTrips <- joinedTrips[ !is.na( joinedTrips$route_type ), ]
}
gtfs$trips <- filteredTrips[, names( gtfs$trips ), with=FALSE]
if ("train_category" %in% names(gtfs$routes) )
{
gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ) &
gtfs$routes$train_category %in% PUBLIC_SERVICE_CATEGORY, ]
}
else
{
gtfs$routes <- gtfs$routes[ !is.na( gtfs$routes$route_type ), ]
}
rm(joinedCalls)
rm(joinedTrips)
rm(filteredCalls)
rm(filteredTrips)
gc()
}
# 7 remove shapes that no longer have any trips
if ("shapes" %in% names(gtfs))
{
gtfs$shapes <- gtfs$shapes[gtfs$shapes$shape_id %in% gtfs$trips$shape_id, ]
}
# 8 remove frequencies that no longer have any trips
if ("frequencies" %in% names(gtfs))
{
gtfs$frequencies <- gtfs$frequencies[gtfs$frequencies$trip_id %in% gtfs$trips$trip_id, ]
}
return(gtfs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.