knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) library(sf) library(rnaturalearth) # own functions my_funs <- list.files(path = "./R/", pattern = "\\.R", full.names = TRUE) purrr::walk(my_funs, .f = source) # local data storage pth <- "../__DATA/OSN-COVID-ECTRL/"
This Rmarkdown report documents the data preparatory steps for cleaning the Opensky Network data.
TODO
Opensky Network publishes on a weekly basis records of their global data (@osn_covid_dataset, @schafer_osn_2014, @olive_traffic_2019) . These data is transferred as zipped csv files. Given the file size the data is read in and stored with {fst}.
# local central data storage for this and other projects pth <- "../__DATA/OSN-COVID-ECTRL/" # create iterator of filenames fns <- list.files(path = pth, pattern = "^flightlist_2021.*", full.names = TRUE) ds <- fns %>% purrr::map_dfr(.f=readr::read_csv)
After the refactoring, the source data fst files are augmented with position references for lat/lon positions without ADEP/ADES association. The augmented files are stored as xxx_clean fst.
# ds <- fst::read_fst(path = paste0(pth, "OSN_2019_fst")) %>% tibble() # ds <- fst::read_fst(path = paste0(pth, "OSN_2020_fst")) %>% tibble() # ds <- fst::read_fst(path = paste0(pth, "OSN_2021_fst")) %>% tibble() # first setup of a meta file # meta <- list() # initialise meta data file # meta$OSN_2019 <- meta_dataset_description(ds, 2019) # write-out: eadr::write_csv(x = meta, file="./data/meta.csv") # read and update meta meta <- readr::read_csv("./data/meta.csv") next_meta <- meta_dataset_description(ds, 2021) # replace current year line with updated meta # antijoin kills current year, bind-rows add new values (prob a better solution) meta <- meta %>% anti_join(next_meta, by = "YEAR") %>% bind_rows(next_meta) # write out update meta file readr::write_csv(x = meta, file="./data/meta.csv")
Geographic references for mapping of lat/lon positions to country and/or EEZ. Marineregions.org provides shapefile for EEZ and land boundaries (@EEZland_2020). Land country borders are taken from the {rnaturalearth} package.
world <- ne_countries(scale = "medium", returnclass = "sf") world <- world %>% select(iso_a3) EEZ_land_shp <- list.files("../__DATA/xWorld_EEZ_and_land_v3", pattern = ".shp$", full.names=TRUE) eez_land <- sf::read_sf(EEZ_land_shp) eez_land <- eez_land %>% select(ISO_TER1) EEZ_12NM_shp <- list.files("../__DATA/xWorld_12NM_v3", pattern = ".shp$", full.names=TRUE) eez_12NM <- sf::read_sf(EEZ_12NM_shp) eez_12NM <- eez_12NM %>% select(ISO_TER1)
# assign unique key ds <- ds %>% mutate(id = row_number()) # extract missing ADEPs and convert to sf pts rq <- ds %>% select(id, icao24, origin, destination,latitude_1, longitude_1) %>% filter(is.na(origin)) %>% pts_latlon_to_sf(latitude_1, longitude_1, FALSE) # rq 2020: 5,572,449 # rq 2021 feb 28: 848,591 # perform spational join - soil rq <- rq %>% sf::st_join(world) rq <- rq %>% rename(POS1_S = iso_a3) # perform spational join with 12NM territory # runs for about 9 minutes/2019 rq <- rq %>% sf::st_join(eez_12NM) rq <- rq %>% rename(POS1_TW = ISO_TER1) # 2020 nas 88,215 # 2021 feb 28: 9,167 # EEZ -------------------------------------- rq <- rq %>% sf::st_join(eez_land) rq <- rq %>% rename(POS1_EEZ = ISO_TER1) # TODO - CHECK FOR DOUBLE HITS # rq %>% group_by(id) %>% summarise(N = n()) %>% filter(N > 1) # resulted 2020 in 255 double hits! # 2021 feb 28: 9 # remove doubles ---------------------- check todo # another costly operation! about 10 mins rq <- rq %>% group_by(id) %>% slice(1) %>% ungroup()
rq %>% sf::st_set_geometry(NULL) %>% readr::write_csv("ds_origin_clean.csv")
# load cleaner version of data # rq <- readr::read_csv("ds_origin_clean.csv") # trim rq for join rq <- rq %>% sf::st_drop_geometry() # add POS1_TW and EEZ to ds ds <- ds %>% left_join(rq %>% select(id, POS1_S, POS1_TW, POS1_EEZ), by = "id")
tackle arrival side
rq <- ds %>% select(id, icao24, origin, destination,latitude_2, longitude_2) %>% filter(is.na(destination)) # 2019 nas: 7214146 # 2020 nas: 3,973,629 rq_nope <- rq %>% filter(is.na(latitude_2) | is.na(longitude_2)) # 2019 nopes 131 # 2020 nopes 78 # 2021 Feb 28: 14 rq <- rq %>% filter(! id %in% rq_nope$id) %>% pts_latlon_to_sf(latitude_2, longitude_2, FALSE) # perform spational join rq <- rq %>% sf::st_join(world) rq <- rq %>% rename(POS2_S = iso_a3) # perform spational join with 12NM territory # runs for about 9 minutes # 2019 NAs 12NM 109,152 # 2020 74,594 # no additional hits for EEZ rq <- rq %>% sf::st_join(eez_12NM) # rename variable rq <- rq %>% rename(POS2_TW = ISO_TER1) ## check for EEZ hits rq <- rq %>% sf::st_join(eez_land) rq <- rq %>% rename(POS2_EEZ = ISO_TER1) ## rq %>% filter(is.na(POS2_TW) & !is.na(POS2_EEZ)) # force single rq <- rq %>% group_by(id) %>% slice(1) %>% ungroup() # when done drop geometry rq <- rq %>% sf::st_drop_geometry() # combine with data ds # ds <- ds %>% left_join(rq %>% select(id, POS2_S, POS2_TW, POS2_EEZ), by = "id")
ds <- ds %>% left_join(rq %>% select(id, POS2_TW, POS2_EEZ), by = "id") sf drop geometry
when you break down into quarters, months, etc. make sure that the ids are newly assigned in the annual file! ds %>% for_fst_coerce_date_to_character() %>% fst::write_fst(ds, path = paste0(pth, "OSN_2021_03_clean"))
Potential sources for water-land-zones boundaries
Description of the dataset
One file per month is provided as a csv file with the following features:
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.