knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(sf)
library(rnaturalearth)

# own functions
my_funs <- list.files(path = "./R/", pattern = "\\.R", full.names = TRUE)
purrr::walk(my_funs, .f = source)

# local data storage
pth <- "../__DATA/OSN-COVID-ECTRL/"

Overview

This Rmarkdown report documents the data preparatory steps for cleaning the Opensky Network data.

TODO

Data Ingestion

Opensky Network publishes on a weekly basis records of their global data (@osn_covid_dataset, @schafer_osn_2014, @olive_traffic_2019) . These data is transferred as zipped csv files. Given the file size the data is read in and stored with {fst}.

# local central data storage for this and other projects
pth <- "../__DATA/OSN-COVID-ECTRL/"

# create iterator of filenames
fns <- list.files(path = pth, pattern = "^flightlist_2021.*", full.names = TRUE) 
ds <- fns %>% 
  purrr::map_dfr(.f=readr::read_csv)

After the refactoring, the source data fst files are augmented with position references for lat/lon positions without ADEP/ADES association. The augmented files are stored as xxx_clean fst.

Data Preparation

# ds <- fst::read_fst(path = paste0(pth, "OSN_2019_fst")) %>% tibble() 
# ds <- fst::read_fst(path = paste0(pth, "OSN_2020_fst")) %>% tibble() 
# ds <- fst::read_fst(path = paste0(pth, "OSN_2021_fst")) %>% tibble() 

# first setup of a meta file 
# meta <- list()   # initialise meta data file
# meta$OSN_2019 <- meta_dataset_description(ds, 2019)
# write-out: eadr::write_csv(x = meta, file="./data/meta.csv")

# read and update meta
meta      <- readr::read_csv("./data/meta.csv")
next_meta <- meta_dataset_description(ds, 2021)

# replace current year line with updated meta
# antijoin kills current year, bind-rows add new values (prob a better solution)
meta <- meta %>% anti_join(next_meta, by = "YEAR") %>% bind_rows(next_meta)

# write out update meta file
readr::write_csv(x = meta, file="./data/meta.csv")

Geographic references for mapping of lat/lon positions to country and/or EEZ. Marineregions.org provides shapefile for EEZ and land boundaries (@EEZland_2020). Land country borders are taken from the {rnaturalearth} package.

world <- ne_countries(scale = "medium", returnclass = "sf")
world <- world %>% select(iso_a3)

EEZ_land_shp <- list.files("../__DATA/xWorld_EEZ_and_land_v3", pattern = ".shp$", full.names=TRUE)
eez_land <- sf::read_sf(EEZ_land_shp)
eez_land <- eez_land %>% select(ISO_TER1)


EEZ_12NM_shp <- list.files("../__DATA/xWorld_12NM_v3", pattern = ".shp$", full.names=TRUE)
eez_12NM <- sf::read_sf(EEZ_12NM_shp)
eez_12NM <- eez_12NM %>% select(ISO_TER1)
# assign unique key
ds <- ds %>% mutate(id = row_number())

# extract missing ADEPs and convert to sf pts
rq <- ds %>% select(id, icao24, origin, destination,latitude_1, longitude_1) %>%
  filter(is.na(origin)) %>% pts_latlon_to_sf(latitude_1, longitude_1, FALSE)
# rq 2020: 5,572,449
# rq 2021 feb 28: 848,591

# perform spational join - soil
rq <- rq %>% sf::st_join(world)
rq <- rq %>% rename(POS1_S = iso_a3)

# perform spational join with 12NM territory
# runs for about 9 minutes/2019
rq <- rq %>% 
  sf::st_join(eez_12NM)
rq <- rq %>% rename(POS1_TW = ISO_TER1)
# 2020 nas 88,215
# 2021 feb 28: 9,167

# EEZ --------------------------------------
rq <- rq %>% 
  sf::st_join(eez_land)
rq <- rq %>% rename(POS1_EEZ = ISO_TER1)

# TODO - CHECK FOR DOUBLE HITS
# rq %>% group_by(id) %>% summarise(N = n()) %>% filter(N > 1)
# resulted 2020 in 255 double hits!
# 2021 feb 28: 9

# remove doubles ---------------------- check todo
# another costly operation! about 10 mins
rq <- rq %>% group_by(id) %>% slice(1) %>% ungroup()

rq %>% sf::st_set_geometry(NULL) %>% readr::write_csv("ds_origin_clean.csv")

# load cleaner version of data
# rq <- readr::read_csv("ds_origin_clean.csv")

# trim rq for join
rq <- rq %>% sf::st_drop_geometry()
# add POS1_TW and EEZ to ds
ds <- ds %>% left_join(rq %>% select(id, POS1_S, POS1_TW, POS1_EEZ), by = "id")

tackle arrival side

rq <- ds %>% select(id, icao24, origin, destination,latitude_2, longitude_2) %>%
  filter(is.na(destination))
# 2019 nas: 7214146
# 2020 nas: 3,973,629
rq_nope <- rq %>% filter(is.na(latitude_2) | is.na(longitude_2))
# 2019 nopes 131
# 2020 nopes 78
# 2021 Feb 28: 14

rq <- rq %>% filter(! id %in% rq_nope$id) %>%
  pts_latlon_to_sf(latitude_2, longitude_2, FALSE)

# perform spational join
rq <- rq %>% sf::st_join(world)
rq <- rq %>% rename(POS2_S = iso_a3)

# perform spational join with 12NM territory
# runs for about 9 minutes
# 2019 NAs 12NM 109,152
# 2020          74,594
# no additional hits for EEZ
rq <- rq %>% 
  sf::st_join(eez_12NM)
# rename variable
rq <- rq %>% rename(POS2_TW = ISO_TER1)

## check for EEZ hits
rq <- rq %>% sf::st_join(eez_land)
rq <- rq %>% rename(POS2_EEZ = ISO_TER1)
## rq %>% filter(is.na(POS2_TW) & !is.na(POS2_EEZ))

# force single
rq <- rq %>% group_by(id) %>% slice(1) %>% ungroup()

# when done drop geometry
rq <- rq %>% sf::st_drop_geometry()

# combine with data ds
# ds <- ds %>% left_join(rq %>% select(id, POS2_S, POS2_TW, POS2_EEZ), by = "id")

ds <- ds %>% left_join(rq %>% select(id, POS2_TW, POS2_EEZ), by = "id") sf drop geometry

when you break down into quarters, months, etc. make sure that the ids are newly assigned in the annual file! ds %>% for_fst_coerce_date_to_character() %>% fst::write_fst(ds, path = paste0(pth, "OSN_2021_03_clean"))

Appendix

Shapefile borders

Potential sources for water-land-zones boundaries

Opensky Network Data Dictionary

Description of the dataset

One file per month is provided as a csv file with the following features:

References



rainer-rq-koelle/COVID19airtraffic documentation built on March 26, 2021, 5:18 a.m.