#' Download, unzip, and read raw OpenSky States Files into R
#'
#' If the states files have not been downloaded yet, this function downloads, unzips,
#' and freads them into R. If the files are already on disk, set download = FALSE to
#' skip the download and only read them into R.
#'
#' @param date A single date, in the format "YYYY-MM-DD"
#' @param range The hours of the day to include. Defaults to all hours (0:23); if a download fails partway through, you can resume from where it stopped (e.g. range = 5:23)
#' @param download Logical. Download the data from OpenSky and unzip it first? Set to FALSE if the files are already downloaded and you only want to read them into R
#' @param fread Logical. Read the data into R? Set to FALSE to download only
#' @param dir Directory to download files into, or where to look for already downloaded files. Defaults to the working directory
#' @return A data.table ("states") containing one row per aircraft at 10-second intervals. This is passed to the downstream functions that transform the data into a usable format.
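#' @examples
#' \dontrun{
#' # Illustrative sketch only; assumes network access to the OpenSky states archive.
#' # Download, unzip, and read all 24 hourly files for one day:
#' states <- readOpenSkyStates("2020-05-25", dir = tempdir())
#'
#' # Resume a download that failed partway through the day:
#' states <- readOpenSkyStates("2020-05-25", range = 5:23, dir = tempdir())
#'
#' # Files already on disk: skip the download and only read them in:
#' states <- readOpenSkyStates("2020-05-25", download = FALSE, dir = "path/to/states")
#' }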
#' @export
readOpenSkyStates <- function (date, range = 0:23,
download = TRUE, fread = TRUE,
dir = NULL) {
url <- "https://opensky-network.org/datasets/states/"
hours <- sprintf("%02d", 0:23)[range + 1]
fn <- paste0(url, date, "/", hours, "/states_", date, "-", hours, ".csv.tar")
if(is.null(dir)){
dir <- getwd()
}
return( rbindlist(
lapply(fn, function(x){
file <- file.path(dir, paste0(basename(tools::file_path_sans_ext(x)), ".gz"))
if (download){
download.file(x, destfile=file.path(dir,"tmp.tar.gz"))
untar(file.path(dir,"tmp.tar.gz"), exdir = dir)
}
if (fread){
states <- fread(file)
# remove rows that have missing location data
states <- na.omit(states, cols = c("lon", "lat", "geoaltitude"))
return(states)
}
})
))
}
#' Read RDSs from folders
#'
#' When multiple geo.RDS and df.RDS files are stored in two folders, pass those folders
#' to this function to combine all of the data into one object.
#' If dfFolder is NULL, only the geo matrices are read and returned; if geoFolder is
#' NULL, only the flight data frames are read and combined.
#'
#' @param geoFolder Folder containing the geo.RDS files
#' @param dfFolder Folder containing the df.RDS files
#' @param drop Passed through to createFlightgeo() when both folders are supplied (e.g. drop = "velocity")
#' @return An sf dataframe, flightgeo, containing the flight data and geometry (or, if only one folder is supplied, the combined contents of that folder).
#'
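#' @examples
#' \dontrun{
#' # Illustrative sketch; the folder paths below follow the layout used by addMondays().
#' # Combine parsed flight data frames with their geo matrices:
#' flightgeo <- readFlightgeoFromFolder(geoFolder = "inst/extdata/cleaned/geo",
#'                                      dfFolder = "inst/extdata/cleaned/df",
#'                                      drop = "velocity")
#'
#' # Read only the geo matrices:
#' geos <- readFlightgeoFromFolder(geoFolder = "inst/extdata/cleaned/geo")
#' }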
readFlightgeoFromFolder <- function(geoFolder = NULL, dfFolder = NULL, drop){
if(is.null(dfFolder) && !is.null(geoFolder)){
geoFiles <- list.files(geoFolder)
geos <- lapply(1:length(geoFiles), function(x){
return(get(load(file.path(geoFolder, geoFiles[x]))))
})
geos <- unlist(geos, recursive = F)
return(geos)
} else if (!is.null(dfFolder) && is.null(geoFolder)){
dfFiles <- list.files(dfFolder)
dfs <- lapply(1:length(dfFiles), function(x){
return(get(load(file.path(dfFolder, dfFiles[x]))))
})
return(do.call(rbind, dfs))
} else {
geoFiles <- list.files(geoFolder)
dfFiles <- list.files(dfFolder)
dfs <- lapply(1:length(geoFiles), function(x){
geoReadRDS <- get(load(file.path(geoFolder, geoFiles[x])))
flightsReadRDS <- get(load(file.path(dfFolder, dfFiles[x])))
return(createFlightgeo(geoReadRDS, flightsReadRDS, drop = drop))
})
return(do.call(rbind, dfs))
}
}
###########################################################################
#################### Main Transformation Function #########################
###########################################################################
#' Parse flight state data
#'
#' Loops through callsigns and subsets the state data to each callsign. Records separated
#' by more than 30 minutes are split into separate flights. Each flight is kept only if it
#' is complete: it starts and ends below 4500 m elevation, lasts more than 10 minutes, is
#' not flown by a Cessna or Beech, has no jump/data error of more than two decimal degrees,
#' and both starts and ends within 500 m elevation and 0.1 decimal degrees of latitude and
#' longitude of the nearest airport.
#'
#' @param states A data.table of flight state data, one row per aircraft per 10-second interval (as returned by readOpenSkyStates())
#' @param callsigns A character vector of callsigns to process if only interested in a few; defaults to all callsigns in states
#' @return A list of "geomatrix" (a list of matrices, one per flight) and "flights" (a data.table with one row per flight)
#'
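#' @examples
#' \dontrun{
#' # Illustrative sketch: parse one day of raw states into flights and geometry.
#' states <- readOpenSkyStates("2020-05-25", dir = tempdir())
#' parsed <- transformStates(states)
#' geomatrix <- parsed[[1]]  # list of matrices (lon, lat, geoaltitude, time, velocity)
#' flights   <- parsed[[2]]  # one row per complete flight
#'
#' # Restrict parsing to a few callsigns (the callsigns below are placeholders):
#' parsed <- transformStates(states, callsigns = c("DAL1159", "UAL2402"))
#' }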
transformStates <- function(states,
callsigns = unique(states[ , callsign])) {
# read in airports & aircraft data
# these are used as covariates but also used to determine if flightpath is real & complete
airports <- fread("inst/extdata/helpers/airports.csv")
# for airports without iata, replace with icao
airports[iata == "N/A", iata:=icao]
airports <- st_as_sf(airports, coords = c("X", "Y"),
crs = 4326, agr = "constant")
aircraft <- fread("inst/extdata/helpers/aircraftDatabase.csv")
# initialize before loop
starttime <- c()
endtime <- c()
callsign <- c()
icao <- c()
airportStart <- c()
airportEnd <- c()
airportStartName <- c()
airportEndName <- c()
aveSpeed <- c()
country <- c()
id <- c()
alert <- c()
routeLines <- list()
f <- 0 # initialize flight count ticker
# loop through callsign
for (i in 1:length(callsigns)) {
# every 1000 callsigns, checkpoint the results to the global variables geomatrix and
# flights in case an error causes the function to quit partway through
if(i %% 1000 == 0){
print(i)
keepIndices <- sapply(routeLines, function(x) !is.null(x) && !all(is.na(x)))
if(length(keepIndices) > 0 && sum(keepIndices) > 0) {
craftMatch <- unlist(lapply(icao[keepIndices], function(x){
which(aircraft$icao24 == x)[1]
}))
geomatrix <<- routeLines[keepIndices]
flights <<- data.table("start" = starttime[keepIndices],
"end" = endtime[keepIndices],
"date" = lubridate::date(as.POSIXct(as.numeric(starttime[keepIndices]), origin = "1970-01-01")),
"duration" = (as.numeric(endtime[keepIndices])-as.numeric(starttime[keepIndices]))/60,
"callsign" = callsign[keepIndices],
"plane" = icao[keepIndices],
"airline" = aircraft$owner[craftMatch],
"manufacturer" = aircraft$manufacturername[craftMatch],
"model" = aircraft$model[craftMatch],
"airportStart" = airportStart[keepIndices],
"airportEnd" = airportEnd[keepIndices],
"airportStartName" = airportStartName[keepIndices],
"airportEndName" = airportEndName[keepIndices],
"country" = country[keepIndices],
"aveSpeed" = aveSpeed[keepIndices],
"id" = id[keepIndices],
"alert" = alert[keepIndices])
}
}
flightCallsign <- states[callsign == callsigns[i]]
# we don't care about cessnas - leave out low flights
if (any(!is.na(flightCallsign$geoaltitude)) && mean(unique(flightCallsign$geoaltitude), na.rm = TRUE) > 1500) {
# Find records with a 30 min gap
# This will define a new flight, later we'll check endpoints to only
# include flights that started & ended at an airport
# Don't want to include flights with gap in coverage over ocean
flightCut <- c(0, which(diff(flightCallsign$time) > 1800), nrow(flightCallsign))
flightCut <- flightCut[which(!is.na(flightCut))]
# loop through flights for current callsign
for (j in 1:(length(flightCut)-1)) {
flight <- flightCallsign[(flightCut[j]+1):flightCut[j+1]]
# ~4500m is the highest airport in the world. If higher than this at
# first or last entry we know it's an incomplete flight
if(nrow(flight) > 1 &&
flight$geoaltitude[1] < 4500 &&
flight$geoaltitude[nrow(flight)] < 4500){
# only look at flights longer than 10 min
if ((flight$time[nrow(flight)] - flight$time[1]) > 600){
# check for cessnas more directly
aircraftMaker <- aircraft[icao24 == flight$icao24[1], manufacturername]
if(length(aircraftMaker) == 0 || !aircraftMaker[1] %in% c("Cessna", "Beech")){
## remove rows that are duplicated at beginning and end to have correct time
firstDup <- duplicated(flight[,c(3:7, 13:15)], fromLast = T)
lastDup <- duplicated(flight[,c(3:7, 13:15)])
flight <- flight[min(which(!(firstDup))) : max(which(!(lastDup))), ]
#making XYZ points
locMat <- matrix(c(flight[,lon], flight[,lat],
flight[,geoaltitude], flight[,time],
flight[,velocity]),
nrow = nrow(flight))
locMat <- locMat[rowSums(is.na(locMat)) == 0,]
routeLine <- NA
## leave out flights that have a jump of 2 decimal degrees
if(is.matrix(locMat) && all(abs(diff(locMat[,c(1,2)])) < 2)){
# don't need time or velocity for airport calculations
routeLine <- st_linestring(locMat[,c(1:3)])
}
if(inherits(routeLine, "LINESTRING") && length(routeLine) > 0){
suppressMessages( #"although coordinates are longitude/latitude, st_nearest_feature assumes that they are planar"
nearStart <- st_nearest_feature(st_point(routeLine[1,1:2]), airports)
)
suppressMessages(
nearEnd <- st_nearest_feature(st_point(routeLine[nrow(routeLine),1:2]), airports)
)
if(airports$iata[nearStart] != airports$iata[nearEnd]){
# test to see if the beginning and end are close enough in elevation to the airport to be a complete flight
fullStart <- NA
fullEnd <- NA
# hard to know what cutoffs should be...
# .1 decimal degrees = 11.1km for lat everywhere and for lon at equator
# cases of complete flights being 370m above airport and .06 dd away (common to be over .01 away)
fullStart <- abs(flight$geoaltitude[1] - airports$alt[nearStart]) < 500 &&
abs(flight$lat[1] - airports$geometry[nearStart][[1]][2]) < .1 &&
abs(flight$lon[1] - airports$geometry[nearStart][[1]][1]) < .1
fullEnd <- abs(flight$geoaltitude[nrow(flight)] - airports$alt[nearEnd]) < 500 &&
abs(flight$lat[nrow(flight)] - airports$geometry[nearEnd][[1]][2]) < .1 &&
abs(flight$lon[nrow(flight)] - airports$geometry[nearEnd][[1]][1]) < .1
# if it is a full flight, save its data
if(!is.na(fullStart) && !is.na(fullEnd)
&& fullStart && fullEnd) {
f <- f + 1
routeLines[[f]] <- locMat
starttime[f] <- as.character(flight$time[1])
endtime[f] <- as.character(flight$time[nrow(flight)])
callsign[f] <- as.character(flight$callsign[1])
icao[f] <- as.character(flight$icao24[1])
airportStart[f] <- as.character(airports$iata[nearStart])
airportEnd[f] <- as.character(airports$iata[nearEnd])
airportStartName[f] <- as.character(airports$name[nearStart])
airportEndName[f] <- as.character(airports$name[nearEnd])
country[f] <- as.character(airports$country[nearStart])
aveSpeed[f] <- mean(flight$velocity)
id[f] <- i
alert[f] <- as.character(sum(flight$alert) / nrow(flight))
}
}
}
}
}
}
}
}
}
keepIndices <- sapply(routeLines, function(x) !is.null(x) && !all(is.na(x)))
if(length(keepIndices) > 0 && sum(keepIndices) > 0) {
craftMatch <- unlist(lapply(icao[keepIndices], function(x){
which(aircraft$icao24 == x)[1]
}))
flights <- data.table("start" = starttime[keepIndices],
"end" = endtime[keepIndices],
"date" = lubridate::date(as.POSIXct(as.numeric(starttime[keepIndices]), origin = "1970-01-01")),
# duration in min
"duration" = (as.numeric(endtime[keepIndices]) - as.numeric(starttime[keepIndices])) / 60,
"callsign" = callsign[keepIndices],
"plane" = icao[keepIndices],
"airline" = aircraft$owner[craftMatch],
"manufacturer" = aircraft$manufacturername[craftMatch],
"model" = aircraft$model[craftMatch],
"operator" = aircraft$operatorcallsign[craftMatch],
"airportStart" = airportStart[keepIndices],
"airportEnd" = airportEnd[keepIndices],
"airportStartName" = airportStartName[keepIndices],
"airportEndName" = airportEndName[keepIndices],
"country" = country[keepIndices],
"aveSpeed" = aveSpeed[keepIndices],
"id" = id[keepIndices],
"alert" = alert[keepIndices]
)
geomatrix <- routeLines[keepIndices]
return(list(geomatrix, flights))
}
}
############################# Map Geo Object ###############################
#' Map flight
#'
#' Use leaflet to show flights on top of a basemap. When a flight is clicked
#' additional information is shown in a tooltip.
#'
#' @param flightgeo An sf dataframe of flight information, including geometry, for the routes of interest
#' @return An interactive leaflet map
#'
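#' @examples
#' \dontrun{
#' # Illustrative sketch: read saved flight data and geometry, then map it.
#' flightgeo <- readFlightgeoFromFolder(geoFolder = "inst/extdata/cleaned/geo",
#'                                      dfFolder = "inst/extdata/cleaned/df",
#'                                      drop = "velocity")
#' mapFlightgeo(flightgeo)
#' }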
mapFlightgeo <- function(flightgeo) {
states_popup <- paste("Callsign:", flightgeo$callsign,
"</br>",
"Origin:", flightgeo$airportStart,
"</br>",
"Destination:", flightgeo$airportEnd,
"</br>",
"Start Time:", as.POSIXlt(as.numeric(as.character(flightgeo$start)),
tz = "UTC", origin = "1970-01-01"),
"</br>",
"End Time:", as.POSIXlt(as.numeric(as.character(flightgeo$end)),
tz = "UTC", origin = "1970-01-01"),
"</br>",
"Line ID:", flightgeo$id)
leaflet() %>%
addProviderTiles("CartoDB.Positron", group = "Carto") %>%
addPolylines(data=st_zm(flightgeo),
color = "#77aaee", opacity = .5, weight = 1,
popup = states_popup,
highlightOptions = highlightOptions(
color='#ff0000', opacity = 1, weight = 2, fillOpacity = 1,
bringToFront = TRUE, sendToBack = TRUE))
}
############################# Add Monday Data ###############################
#' Add multiple Mondays worth of data
#'
#' Run this function if it has been several weeks since the data were last updated.
#' It updates the delay, storm, sigmet, and flight data.
#'
#' @param appDir Path to the MonaRk package directory; the working directory is set here
#' @param statesDataDir The directory where the large raw flight state data files are downloaded and stored
#' @param routeCountCutoff Minimum number of flights a route must have to be included in the app
#' @return No object is returned; updated data files are written to disk
#'
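#' @examples
#' \dontrun{
#' # Illustrative sketch; the paths below are placeholders for a local setup.
#' addMondays(appDir = "~/MonaRk",
#'            statesDataDir = "~/openskyStates",
#'            routeCountCutoff = 50)
#' }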
addMondays <- function(appDir, statesDataDir, routeCountCutoff = 50){
setwd(appDir)
if(format(as.Date(lubridate::now("GMT")),'%w') == 1){
stop("It's Monday in UTC time, so new Monday data is being created now. Please wait until tomorrow to update data.")
}
# what was most recent Monday?
prevMonday <- function(x) {
for(i in 1:7){
if(format(as.Date(x) - i, '%w') == 1){
d <- as.Date(x) - i
}
}
return(d)
}
# is data up to date?
recentMonday <- prevMonday(Sys.Date())
##### UPDATE STORMS ######
# check if storms need to be downloaded
threeMonthsAgo <- as.Date(recentMonday) %m-% months(3)
# old storms file
stormsGeo <- get(load("inst/extdata/cleaned/stormsGeo.RDS"))
recentStorm <- max(format(as.Date(stormsGeo$END_YMDHMS_UTC), "%Y-%m"))
if(recentStorm < format(threeMonthsAgo, "%Y-%m")){
stormFolder <- "inst/extdata/raw/rawStorms"
# does user need to download?
files <- list.files(stormFolder, full.names = T)
lastMonthDownload <- max(unlist(lapply(files, function(x){
fread(x, select = c("BEGIN_YEARMONTH"))
})))
lastMonthDownload <- format(as.Date(paste0(substr(lastMonthDownload, 1,4), "-", substr(lastMonthDownload, 5,6), "-28")), "%Y-%m")
if(lastMonthDownload < format(threeMonthsAgo, "%Y-%m")){
stop(paste("Please download most recent storm files into", stormFolder))
} else {
save(stormsGeo, file = "inst/extdata/cleaned/stormsGeo_old.RDS")
storms <- cleanStorms(stormFolder)
stormsGeo <- polygonizeStorms(storms)
save(stormsGeo, file = "inst/extdata/cleaned/stormsGeo.RDS")
}
}
##### UPDATE DELAY ######
# check if current delay data is updated
delay <- get(load("inst/extdata/cleaned/delayGeo.RDS"))
recentDelay <- delay$FlightDate
# if the most recent month in the delay data is older than three months ago, the delay data needs updating
if(format(max(as.Date(recentDelay)), "%Y-%m") < format(threeMonthsAgo, "%Y-%m")) {
delayFolder <- "inst/extdata/raw/rawDelay"
# check if delay need to be downloaded
delayDownloadedMonths <- unlist(lapply(list.files(delayFolder), function(x){
dateparts <- strsplit(strsplit(x, split = ")|\\.")[[1]][2], split="_")[[1]]
return(paste0(dateparts[2],"-", dateparts[3], "-28"))
}))
allDataMonths <- format(seq(as.Date(min(delay$FlightDate)), as.Date(threeMonthsAgo), by = "month"), "%Y-%m")
needsDownload <- allDataMonths[unlist(lapply(allDataMonths, function(x){
!(x %in% format(as.Date(delayDownloadedMonths), "%Y-%m"))
}))]
# redownload most recent month also
needsDownload <- unique(c(needsDownload, max(format(as.Date(delay$FlightDate), "%Y-%m"))))
if(length(needsDownload) > 1){
stop(paste("Please download most recent raw delay files from the date",
needsDownload,
"into",
delayFolder,
"."))
} else {
save(delay, file = "inst/extdata/cleaned/delayGeo_old.RDS")
delay <- cleanDelay(delayFolder)
save(delay, file = "inst/extdata/cleaned/delayGeo.RDS")
}
}
##################################################
##### update sigmet data with multiple dates
# old sigmet file
sigmetgeo <- get(load("appData/allsigs.RDS"))
# where new files are
sigmetDataDir <- "inst/extdata/raw/rawSigmetScraping"
dateseq <- as.Date((max(sigmetgeo$date) + 1) : recentMonday, origin = "1970-01-01")
if(length(dateseq) >= 7){
##### Save allsigs_old as a backup
save(sigmetgeo, file = "inst/extdata/cleaned/allsigs_old.RDS")
datesUpdateSig <- dateseq[seq(7, length(dateseq), by = 7)]
# DO THEY NEED TO DOWNLOAD
needDownload <- unlist(lapply(datesUpdateSig, function(x){
!(any(grepl(x, list.files(sigmetDataDir))))
}))
if(any(needDownload)){
stop(paste("Please download raw Sigmet files from the date",
datesUpdateSig[which(needDownload)],
"into",
sigmetDataDir,
"."))
}
for(i in 1:length(datesUpdateSig)){
updateFolder <- which(grepl(datesUpdateSig[i], list.files(sigmetDataDir)))
rawSigmet <- readSigmetFiles(list.files(sigmetDataDir, full.names = TRUE)[updateFolder])
sigmetgeoDay <- try(parseSigmet(rawSigmet, datesUpdateSig[i]))
if(!inherits(sigmetgeoDay, "try-error")){
sigmetgeoDay$id <- seq(nrow(sigmetgeo) + 1, nrow(sigmetgeo) + nrow(sigmetgeoDay))
sigmetgeo <- rbind(sigmetgeo, sigmetgeoDay)
}
}
##### Add new sigmets to the end, with new ids, and resave
save(sigmetgeo, file = "appData/allsigs.RDS")
}
##### STATES DATA #####
# look where parsed States are, find most recent
files <- strsplit(list.files("inst/extdata/cleaned/df"), split = "_|\\.")
stateDates <- as.Date(unlist(lapply(files, function(x){
x[2]
})))
maxdateDf <- max(stateDates)
if(maxdateDf != recentMonday){
# find dates that need updating
dateseq <- as.Date((maxdateDf + 1) : recentMonday, origin = "1970-01-01")
if(length(dateseq) >= 7){
datesUpdate <- dateseq[seq(7, length(dateseq), by = 7)]
} else {
datesUpdate <- NULL
}
# loop through datesUpdate
for(i in seq_along(datesUpdate)){
# are states already downloaded?
if(length(which(grepl("states_", list.files(statesDataDir)) &
grepl(datesUpdate[i], list.files(statesDataDir)))) == 24){
downloadStates <- FALSE
} else {
downloadStates <- TRUE
}
print(paste("Parsing", datesUpdate[i]))
states <- readOpenSkyStates(datesUpdate[i], download = downloadStates,
fread = T, dir = statesDataDir)
fullstates <- transformStates(states)
geomatrix <- fullstates[[1]] # list of matrices with 5 cols
flights <- fullstates[[2]]
flightgeo <- createFlightgeo(geomatrix, flights, drop = "velocity")
flightgeo_covars <- addCovars(flightgeo, sigmetgeo)
flightgeo_covars$g <- NULL
## Save new df and geo to existing folder
save(flightgeo_covars, file = file.path("inst/extdata/cleaned/df", paste0("df_", datesUpdate[i], ".RDS")))
save(geomatrix, file = file.path("inst/extdata/cleaned/geo", paste0("geo_", datesUpdate[i],".RDS")))
}
## RE ADD COVARS for delay and storm data that needs updating
# subset between recentDelay and 3 months ago, send to addCovars again
updateCovarDate <- which(stateDates > max(recentDelay) &
stateDates < datesUpdate[1])
for (i in updateCovarDate){
updateDf <- get(load( file.path("inst/extdata/cleaned/df", paste0("df_", stateDates[i], ".RDS"))))
updateGeo <- get(load( file.path("inst/extdata/cleaned/geo", paste0("geo_", stateDates[i], ".RDS"))))
updateFlightGeo <- createFlightgeo(updateGeo, updateDf, drop = "velocity")
updateFlightGeo$stormEventID <- thisStormEvent(updateFlightGeo)
updateFlightGeo$stormEvent <- !(updateFlightGeo$stormEventID == "0")
delayCovars <- thisDelay(updateFlightGeo)
updateFlightGeo$weatherDelay <- delayCovars[,1]
updateFlightGeo$nasDelay <- delayCovars[,2]
updateFlightGeo$carrierDelay <- delayCovars[,3]
updateFlightGeo$securityDelay <- delayCovars[,4]
updateFlightGeo$lateAircraftDelay <- delayCovars[,5]
updateFlightGeo$g <- NULL
save(updateFlightGeo, file = file.path("inst/extdata/cleaned/df", paste0("df_", stateDates[i], ".RDS")))
}
}
# Once all dates have covars, build distance comparison df
print("Combining with previous data")
geomatrix_distcovars <- readFlightgeoFromFolder(geoFolder = "inst/extdata/cleaned/geo")
flightgeo_distcovars <- readFlightgeoFromFolder(dfFolder = "inst/extdata/cleaned/df")
flightgeo_distcovars <- compareFlights(flightgeo_distcovars)
print("Removing flights longer than 100,000 Km (bad data)")
print(paste("Subseting to routes with at least", routeCountCutoff, "flights"))
keep <- which(flightgeo_distcovars$routeCount >= routeCountCutoff & flightgeo_distcovars$distance < 100000000)
if(length(keep) < nrow(flightgeo_distcovars)){
flightgeo_distcovars <- flightgeo_distcovars[keep,]
geomatrix_distcovars <- geomatrix_distcovars[keep]
}
flightgeo_distcovars$g <- NULL
print("Saving in route folders")
createRouteFolders(flightgeo_distcovars, geomatrix_distcovars,
file.path(appDir, "appData/appRoutes"))
print("Summarizing route data")
routes <- createRouteLevelData(flightgeo_distcovars)
print("Saving route data")
save(routes, file = file.path(appDir, "appData/routes.RDS"))
print("Saving data for total cost calculations")
costData <- flightgeo_distcovars[ , c("date", "continent", "route", "manufacturer",
"minincrease", "kmincreaseFlight",
"thisWeather", "regionWeather",
"stormEvent", "weatherDelay", "nasDelay")]
# convert the delay columns to logical flags (any delay vs. none)
costData$weatherDelay <- costData$weatherDelay > 0
costData$nasDelay <- costData$nasDelay > 0
costData$hourincrease <- costData$minincrease / 60
save(costData, file = file.path(appDir, "appData/costData.RDS"))
}