knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) # local data storage pth <- "../__DATA/OSN-COVID-ECTRL/"
To cite this product: Flanders Marine Institute (2020). Union of the ESRI Country shapefile and the Exclusive Economic Zones (version 3). Available online at https://www.marineregions.org/. https://doi.org/10.14284/403
This Rmarkdown report documents the data preparatory steps for cleaning the Opensky Network data.
TODO
Opensky Network publishes on a weekly basis records of their global data. These data is transferred as zipped csv files. Given the file size the data is read in and stored with {fst}.
# document read zips and save out as fst
ds <- fst::read_fst(path = paste0(pth, "OSN_2019_fst")) %>% tibble() ds_ok <- ds %>% filter(!is.na(origin) & !is.na(destination)) ds_dep <- ds %>% filter(!is.na(origin) & is.na(destination)) # ADEP known ds_arr <- ds %>% filter( is.na(origin) & !is.na(destination)) # ADES known ds_nok <- ds %>% filter( is.na(origin) & is.na(destination)) # both unknown
meta data / data description 2019 nrow:= 30989481, missing ADEP 5126423
For the first round of matching to geography we use rnaturalearth
library(rnaturalearth) world <- ne_countries(scale = "medium", returnclass = "sf") world <- world %>% select(iso_a2) # load helper function to assign geo region source('~/RProjects/COVID-air-traffic/R/check_nngeo.R')
Dealing with departures in 2019
pts_sf <- ds_dep %>% mutate(id = row_number()) %>% select(id, icao24, latitude_2, longitude_2) pts_sf_nope <- pts_sf %>% filter(is.na(latitude_2) | is.na(longitude_2)) pts_sf <- pts_sf %>% filter(!id %in% pts_sf_nope$id) %>% sf::st_as_sf(coords = c("longitude_2","latitude_2"), crs=4326) pts_sf[1:50000,] <- check_nngeo(pts_sf[1:50000,], world)
Working with the missing arrivals in 2019 as deps are crunched on home PC.
# assign id for split-apply, if needed pts_sf <- ds_arr %>% mutate(id = row_number()) %>% select(id, icao24, latitude_1, longitude_1) # split sample that is.na for lat/lon and would through error pts_sf_nope <- pts_sf %>% filter(is.na(latitude_1) | is.na(longitude_1)) # coerce lat/lon to sf for nearest neighbour calc pts_sf <- pts_sf %>% filter(!id %in% pts_sf_nope$id) %>% sf::st_as_sf(coords = c("longitude_1","latitude_1"), crs=4326)
source('~/RProjects/COVID19airtraffic/R/pts_latlon_to_sf.R') ds_dep_sf <- ds_dep %>% pts_latlon_to_sf(latitude_1, longitude_1)
ds_dep_sf <- ds_dep_sf %>% sf::st_join(world) # runs for about 5 mins
EEZ_shps <-list.files("../__DATA/xWorld_EEZ_v11", pattern = ".shp$", full.names=TRUE) eez_sf <- sf::read_sf(EEZ_shps[2]) eez_id_sf <- eez_sf %>% select(ISO_TER1)
EEZ_land_shp <- list.files("../__DATA/xWorld_EEZ_and_land_v3", pattern = ".shp$", full.names=TRUE) eez_land <- sf::read_sf(EEZ_land_shp) eez_land <- eez_land %>% select(ISO_TER1)
rq <- readr::read_csv("x_deps_sf.csv")
rq <- rq %>% tidyr::separate(geometry, c("lon","lat"), sep = ", ") %>% mutate(lon = stringr::str_remove(lon, "c\(") %>% as.numeric() ,lat = stringr::str_remove(lat, "\)") %>% as.numeric() )
rq <- ds_dep_sf %>% filter(is.na(iso_a2))
rq <- rq %>% sf::st_join(eez_id_sf) # runs for 2 mins!
rq <- rq %>% dplyr::mutate(lat = sf::st_coordinates(.)[,1], lon = sf::st_coordinates(.)[,2] )
test initially with small set before kicking off long iterator rq <- pts_sf[1:1000, ]
split_df_by_groupsize <- function(.df, .group_size = 2500){ my_list <- .df %>% dplyr::group_split(grp = (dplyr::row_number() - 1) %/% (.group_size)) return(my_list) }
rq %>% split_df_by_groupsize(200) %>% purrr::walk(.f = calc_nngeo_and_save,.poly = world, .prefix = "ARR")_ fst::read_fst("../__DATA/OSN-COVID-ECTRL/tmp/ARR_400")
group_size <- 2500 split_and_nngeo_arr <- . %>% split_df_by_groupsize(.group_size = group_size) %>% purrr::walk(.f = calc_nngeo_and_save,.poly = world, .prefix = "ARR") pts_sf %>% split_and_nngeo_arr()
150001-152500
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.