## code to prepare `station_info` dataset goes here
rD <- RSelenium::rsDriver(browser = "firefox", port = 4546L, verbose = FALSE)
base_url <- "http://www.railwaycodes.org.uk/stations/"
station_info <- NULL
for (i in seq_along(letters)) {
message("Processing stations starting by: ", LETTERS[i])
rD$client$navigate(paste0(base_url, "station", letters[i], ".shtm"))
tryCatch({
out <- rD$client$getPageSource()[[1]] %>%
xml2::read_html() %>% # parse HTML
rvest::html_nodes("table#tablesort") # extract table node
# Find small notes (<em>NOTE</em>)
em_nodes <- out %>%
rvest::html_nodes("em")
# Remove notes
xml2::xml_remove(em_nodes)
# Parse page to table
out <- out %>%
rvest::html_table(fill = TRUE) %>%
.[[1]] %>%
magrittr::set_colnames(c("station",
"elr",
"mileage",
"status",
"owner",
"operator",
"longitude",
"latitude",
"grid_ref"))
station_info <- rbind(station_info, out)
}, error = function(e){
warning(e)
})
Sys.sleep(2)
}
aux <- trainR::station_codes %>%
dplyr::left_join(station_info, by = c("name" = "station")) %>%
dplyr::filter(!grepl("nderground", name, ignore.case = TRUE)) # Filter underground stations
aux[is.na(aux$elr), ]
match_names <- function(out, idx, source) {
string <- tolower(out[idx, 1])
i <- nchar(string)
# browser()
while (i > 0) {
idx2 <- pmatch(substr(string, 1, i), tolower(source[, 1]))
if (!is.na(idx2)) {
print(paste0("Found: ", idx2, " - Matched: '", out[idx, 1], "' to '", source[idx2, 1], "'"))
# browser()
out[idx, -c(1:2)] <- source[idx2, -1]
break
}
i <- i - 1
}
out
}
##### Concept
# Match missing stations
unmatch_idx <- which(is.na(aux$elr))
aux[unmatch_idx[1],]
idx <- pmatch("Adlington (Lanc", station_info$station)
aux[unmatch_idx[1], -c(1:2)] <- station_info[idx, -1]
### Manually inspect
#### Ashford International (Eurostar)
aux[100, -c(1:2)] <- station_info[101, -1]
aux <- aux[-99, ] # Duplicated Ashford International (Eurostar)
#### Devon Dockyard
idx <- pmatch("Dockyard (Devonport)", station_info$station)
idx2 <- pmatch("Devonport Dockyard", aux$name)
aux[idx2, -c(1:2)] <- station_info[idx, -1]
#### Loop
# Match missing stations
unmatch_idx <- which(is.na(aux$elr))
aux2 <- aux
for (i in unmatch_idx)
aux2 <- match_names(aux2, i, station_info)
# 2nd round
unmatch_idx <- aux2[is.na(aux2$elr), ]
#### Dinas Rhondda
idx <- pmatch("Dinas Rhondda", station_info$station)
idx2 <- pmatch("Dinas (Rhondda)", aux$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
#### Dorking Deepdene
idx <- pmatch("Dorking (Deepdene)", station_info$station)
idx2 <- pmatch("Dorking Deepdene", aux$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
#### Edenbridge (Kent)
idx <- pmatch("Edenbridge (Kent)", station_info$station)
idx2 <- pmatch("Edenbridge", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
#### Edinburgh
idx2 <- pmatch("Edinburgh", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[824, -1]
#### Garth (Mid Glamorgan)
idx <- pmatch("Garth (Bridgend County)", station_info$station)
idx2 <- pmatch("Garth (Mid Glamorgan)", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
#### Heathrow
##### Terminals 2 and 3
idx <- pmatch("Heathrow Terminals 2 and 3 (Rail Station Only)", station_info$station)
idx2 <- pmatch("Heathrow Airport Terminals 1, 2 and 3", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
##### Terminal 4
idx <- pmatch("Heathrow Terminal 4 (Rail Station Only)", station_info$station)
idx2 <- pmatch("Heathrow Airport Terminal 4", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
##### Terminal 5
idx <- pmatch("Heathrow Terminal 5 (Rail Station Only)", station_info$station)
idx2 <- pmatch("Heathrow Airport Terminal 5", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[idx, -1]
#### London St Pancras (Intl)
idx <- pmatch("London St Pancras International", station_info$station)
idx2 <- pmatch("London St Pancras (Intl)", aux2$name)
aux2[idx2, -c(1:2)] <- station_info[1516, -1]
#### Remove stations without information:
##### Corfe Castle
aux2 <- aux2[!(aux2$name %in% c("Corfe Castle")), ]
##### Dublin Ferryport and Dublin Port - Stena
aux2 <- aux2[!(aux2$name %in% c("Dublin Ferryport", "Dublin Port - Stena")), ]
aux2[is.na(aux2$elr), ]
#### Convert latitude and longitude to numeric
aux3 <- aux2 %>%
dplyr::mutate(latitude = as.numeric(latitude),
longitude = as.numeric(longitude))
#### Find records with NAs (these records have more than one line)
idx <- which(is.na(aux3$latitude) | is.na(aux3$longitude))
#### Parse each location and extract only the top records
tmp <- aux2[idx, ] %>%
tidyr::separate_rows(latitude, sep = " ", convert = TRUE) %>%
tidyr::separate_rows(longitude, sep = " ", convert = TRUE) %>%
tidyr::separate_rows(grid_ref, sep = " ") %>%
dplyr::filter((is.na(latitude) & is.na(longitude)) |
(!is.na(latitude) & !is.na(longitude))) %>%
dplyr::distinct(name, .keep_all = TRUE)
aux3[idx, ] <- tmp
station_info <- aux3
# Drop unused variables
station_info$mileage <- NULL
usethis::use_data(station_info, overwrite = TRUE)
# aux3_lat <- aux2 %>%
# tidyr::separate_rows(latitude) %>%
# dplyr::filter(latitude != "")
# aux3_lon <- aux2 %>%
# tidyr::separate_rows(longitude) %>%
# dplyr::filter(longitude != "")
# aux3_grid <- aux2 %>%
# tidyr::separate_rows(grid_ref) %>%
# dplyr::filter(longitude != "")
#
# aux3 <- aux2 %>%
# tidyr::separate_rows(longitude) %>%
# dplyr::filter(longitude != "") %>%
# tidyr::separate_rows(latitude) %>%
# dplyr::distinct(longitude, latitude, .keep_all = TRUE)
#
# df <- tibble::tibble(lon = station_info$longitude,
# lat = station_info$latitude)
# Generate map with stations
station_info %>%
ggmap::qmplot(longitude, latitude, data = ., size = I(0.5))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.