```r
# libs for data
library(data.table)
library(stringi)
library(glue)

# for analysis
library(dbscan)

# library for atlas data
library(ggplot2)
library(atlastools)
```
Prepare files.
```r
# list files
files <- list.files("data/processed/data_id", full.names = TRUE)
```
This pre-processing pipeline is parameterised based on exploratory data analysis, which can be found in the supplementary material.
```r
# remove old preprocessing log
if (file.exists("data/log_preprocessing.log")) {
  message("removing old preprocessing log")
  file.remove("data/log_preprocessing.log")
} else {
  message(glue("No preprocessing log as of {Sys.time()}, new log to be made"))
}

# some parameters
min_daily_fixes <- 500 # discard data below n rows
moving_window_ppa <- 7 # moving window for sparrow, bulbul, warbler (PPA)
moving_window_h <- 7 # moving window for swallow
speed_threshold_ppa <- 20 # speed threshold (m/s) for PPA species
speed_threshold_h <- 20 # speed threshold (m/s) for swallow
angle_threshold <- 10 # in degrees
hour_start <- 5
hour_end <- 20
dbscan_eps <- 3 # for PPA only
sd_limit <- 20

# sink date time to log
sink(file = "data/log_preprocessing.log", append = FALSE)
glue("Holeybirds Pre-processing log from {Sys.time()}\n\n")
sink()

# open sink
sink(file = "data/log_preprocessing.log", append = TRUE)

# run pre-processing
for (file in files) {
  # read in file
  df <- fread(file)
  tag_id <- unique(df$TAG)
  species <- unique(df$sp)

  # informative messages
  print(
    glue("Pre-processing tag_id = {tag_id}; {species}; N rows raw = {nrow(df)}")
  )

  # add time
  df[, time := as.integer(as.numeric(TIME) / 1000)]

  # remove so-called attractor points
  df <- atl_filter_bounds(
    data = df,
    x = "X", y = "Y",
    x_range = 257000.0 - c(300, -300),
    y_range = 780000.0 - c(300, -300)
  )
  print(
    glue("-- removed attractor points near X = {257000.0}, Y = {780000.0}")
  )

  # remove other attractors
  df[, count := .N, by = c("X", "Y")]
  df <- df[count < 5, ]
  print(
    glue("-- removed other attractor points; counts above {5}")
  )
  df$count <- NULL

  # filter on time of day -- this is being redone for safety
  # the column 'd' already marks daytime positions
  df[, c("date", "hour") := list(
    lubridate::date(as.POSIXct(as.numeric(TIME) / 1000,
      origin = "1970-01-01",
      tz = "Asia/Jerusalem"
    )),
    lubridate::hour(as.POSIXct(as.numeric(TIME) / 1000,
      origin = "1970-01-01",
      tz = "Asia/Jerusalem"
    ))
  )]
  df <- df[hour >= hour_start & hour < hour_end, ]
  print(
    glue("-- removed nighttime positions outside {hour_start}:00 to {hour_end}:00")
  )

  # calculate location error SD and filter for SD <= sd_limit metres
  df[, SD := sqrt(VARX + VARY + (2 * COVXY))]
  df <- df[SD <= sd_limit]
  print(
    glue("-- removed SD > {sd_limit}")
  )

  # split by date
  df_l <- split(df, by = "date")

  # remove data with few rows
  df_l <- df_l[vapply(df_l, function(le) {
    nrow(le) > min_daily_fixes
  }, FUN.VALUE = TRUE)]

  #### Filtering on speed OR density based scan (DBSCAN) ####
  # warn for all data lost, and skip to next individual if all data lost
  # else continue with pre-processing
  if (purrr::is_empty(df_l)) {
    print(
      glue::glue("List of data is empty, all data had fewer than {min_daily_fixes} rows")
    )
    print(
      glue("\n\n***\n\n")
    )
    next
  } else {
    # pre-processing continues here
    print(glue::glue("-- Pre-processing {length(df_l)} days' data separately"))

    # first calculate speeds and turning angles, handle 0 angles
    df_l <- lapply(df_l, function(le) {
      # check if filtering works
      le[, c("speed_in", "speed_out", "angle") := list(
        atl_get_speed(data = le, x = "X", y = "Y", time = "time", type = "in"),
        atl_get_speed(data = le, x = "X", y = "Y", time = "time", type = "out"),
        atl_turning_angle(data = le, x = "X", y = "Y", time = "time")
      )]

      # fix missing angles due to same position, assign 0
      le[, angle := nafill(angle, type = "const", fill = 0)]
    })

    # check if species is Hirundo (swallow) and handle differently
    # CURRENTLY HANDLED THE SAME!
    if (species == "Hirundo") {
      df_l <- lapply(df_l, function(le) {
        atl_filter_covariates(
          le,
          filters = c(
            glue::glue("(angle < {angle_threshold}) | \\
            ((speed_in < {speed_threshold_ppa}) & \\
            (speed_out < {speed_threshold_ppa}))")
          )
        )
      })
      print(glue::glue("-- Filtered out spikes using speeds > {speed_threshold_ppa} AND angle > {angle_threshold}"))
    } else {
      df_l <- lapply(df_l, function(le) {
        atl_filter_covariates(
          le,
          filters = c(
            glue::glue("(angle < {angle_threshold}) | \\
            ((speed_in < {speed_threshold_ppa}) & \\
            (speed_out < {speed_threshold_ppa}))")
          )
        )
      })
      print(glue::glue("-- Filtered out spikes using speeds > {speed_threshold_ppa} AND angle > {angle_threshold}"))
    }

    #### Median smoothing ####
    # remove data with few rows
    df_l <- df_l[vapply(df_l, function(le) {
      nrow(le) > min_daily_fixes
    }, FUN.VALUE = TRUE)]

    # check again if any data remains and if not, move to next ID
    if (purrr::is_empty(df_l)) {
      print(
        glue::glue("List of data is empty, all data removed by speed filter")
      )
      print(
        glue("\n\n***\n\n")
      )
      next
    } else {
      if (species == "Hirundo") {
        # pre-processing continues
        # apply a median smooth
        df_l <- lapply(df_l, atl_median_smooth,
          x = "X", y = "Y", time = "time",
          moving_window = moving_window_h
        )
        print(
          glue("-- Median smoothed coordinates, moving window = {moving_window_h}")
        )
      } else {
        # apply a median smooth
        df_l <- lapply(df_l, atl_median_smooth,
          x = "X", y = "Y", time = "time",
          moving_window = moving_window_ppa
        )
        print(
          glue("-- Median smoothed coordinates, moving window = {moving_window_ppa}")
        )
      } # if else for H or PPA ends

      #### Merging data and saving to file ####
      df <- rbindlist(df_l)

      # save data to file
      save_file <- glue(
        "data/processed/data_preprocessed/data_preproc_{species}_{tag_id}.csv"
      )
      print(
        glue("-- N rows pre-processed = {nrow(df)}")
      )

      # really save the data
      fwrite(df, file = save_file)
      print(
        glue("-- saved to {save_file}")
      )
      print(
        glue("\n\n***\n\n")
      )
    } # else case for data after filtering ends, data to file
  } # else case for min daily fixes data ends
} # for loop ends

sink()
```
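To make the spike rule above concrete, here is a minimal, self-contained sketch that re-implements the same keep-rule on a toy track using plain data.table arithmetic rather than atlastools. The toy coordinates and the helper names (`step`, `heading`, `turn`, `speed_threshold`, `toy_clean`) are illustrative stand-ins, not part of the pipeline; `atl_get_speed()` and `atl_turning_angle()` compute analogous quantities on the real data.

```r
library(data.table)

# toy 1 Hz track along a line; the 4th fix is a ~100 m spike
toy <- data.table(
  x = c(0, 10, 20, 120, 30, 40),
  y = rep(0, 6),
  time = 1:6
)

# straight-line speed (m/s) into and out of each fix
step <- toy[, sqrt(diff(x)^2 + diff(y)^2) / diff(time)]
toy[, speed_in := c(NA, step)]
toy[, speed_out := c(step, NA)]

# turning angle (degrees) between incoming and outgoing steps:
# 0 = straight ahead, 180 = full reversal
heading <- toy[, atan2(diff(y), diff(x))]
turn <- abs((heading[-1] - heading[-length(heading)] + pi) %% (2 * pi) - pi)
toy[, angle := c(NA, turn * 180 / pi, NA)]

# same keep-rule as the filter expression above:
# keep a fix if the turn is shallow OR both speeds are plausible
angle_threshold <- 10
speed_threshold <- 20
toy[is.na(angle), angle := 0] # mirrors the nafill() step in the pipeline
toy_clean <- toy[
  (angle < angle_threshold) |
    (speed_in < speed_threshold & speed_out < speed_threshold)
]

# the spike at x = 120 is dropped; the fix right after it is also caught,
# because its incoming speed is inherited from the spike
toy_clean$x
```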
```r
files <- list.files(
  path = "data/processed/data_preprocessed",
  full.names = TRUE
)

npos <- lapply(files, fread) |>
  sapply(nrow) |>
  sum()
```
```r
# list pre-processed data files
files <- list.files(
  path = "data/processed/data_preprocessed",
  full.names = TRUE
)
```
```r
# read all data and count fixes per minute per calendar day
data <- lapply(files, function(df) {
  df_ <- fread(df)
  df_ <- df_[, list(
    tracking_duration_min = (max(time) - min(time)) / (60), # div by 60 for mins
    nfixes = length(X)
  ), by = c("sp", "TAG", "date")]
  df_[, fix_per_min := nfixes / (tracking_duration_min)][]
})

# read WGI and attach, because we only want metrics for birds we
# actually use, i.e. those with an RRV score
wgi <- fread("data/results/data_tracking_metrics_rrv.csv")

# bind list and merge with the RRV data
data <- rbindlist(data)
data <- merge(
  data,
  wgi,
  by = c("TAG", "sp")
)

# how many birds with preprocessed data and rrv
length(unique(data$TAG))

# save data to file
fwrite(data, file = "data/results/data_fix_per_min_daily.csv")
```
Check how many individuals of each species remain after pre-processing, among birds with an RRV score.
```r
# read all preprocessed data and merge
data_preproc <- lapply(files, fread) |>
  rbindlist()

data_preproc[TAG %in% wgi$TAG, ][, list(n_id = length(unique(TAG))), by = "sp"]
```
```r
# summarise the data per species
data_summary_fix_rate <- data[, list(
  mean_fix_pm = mean(fix_per_min, na.rm = TRUE),
  sd_fix_pm = sd(fix_per_min, na.rm = TRUE)
), by = c("sp")]

# save to file
fwrite(data_summary_fix_rate, file = "data/results/data_summary_fix_rate.csv")
```
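The per-species summary can also be shown as a table in the knitted output; a minimal sketch, assuming knitr is available (the column labels are illustrative and not part of the pipeline):

```r
# optional: display the fix-rate summary inline; assumes knitr is installed
knitr::kable(
  data_summary_fix_rate,
  digits = 2,
  col.names = c("Species", "Mean fixes / min", "SD fixes / min")
)
```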
This code finds positions returned by ATLAS when it fails to calculate a good localisation; these appear as the same coordinates repeated many times (attractor points).
```r
# find all raw data files
files <- list.files("data/processed/data_id", full.names = TRUE)

# find attractor locations
data_attractors <- lapply(files, function(file) {
  # read in file
  df <- fread(file)
  tag_id <- unique(df$TAG)
  species <- unique(df$sp)

  # add time
  df[, time := as.integer(as.numeric(TIME) / 1000)]

  # keep repeated (attractor) positions
  df[, count := .N, by = c("X", "Y")]
  df <- df[count > 5, ]

  df
})

# bind list and plot locations
data_attractors <- rbindlist(data_attractors)
```
```r
# get time of day - are these sleeping birds?
data_attractors[, hour := hour(as.POSIXct(time,
  origin = "1970-01-01",
  tz = "Asia/Jerusalem"
))]
data_attractors[, light := fifelse(hour >= 5 & hour < 20, "day", "night")]

# save data on attractors
fwrite(
  data_attractors,
  file = "data/results/data_attractors.csv"
)
```
```r
(
  ggplot(data_attractors) +
    geom_histogram(
      aes(hour)
    ) +
    geom_vline(
      xintercept = c(5, 20),
      col = "red"
    ) +
    facet_wrap(~sp) +
    labs(
      x = "Hour of day",
      y = "# Duplicated positions"
    )
) |>
  ggsave(
    filename = "figures/fig_duplicated_positions.png"
  )
```
Read in spatial data to plot the attractors in their correct locations.
```r
# read in spatial data
library(sf)
landcover <- st_read("data/spatial/hula_lc_vector/HulaValley.shp")
```
```r
p_attractor <- ggplot(data_attractors) +
  geom_sf(
    data = landcover,
    fill = "lightgrey"
  ) +
  geom_point(
    aes(X, Y, fill = sp, size = count),
    shape = 21
  ) +
  scale_size_continuous(
    range = c(0.1, 10)
  ) +
  coord_sf(
    xlim = st_bbox(landcover)[c("xmin", "xmax")],
    ylim = st_bbox(landcover)[c("ymin", "ymax")]
  ) +
  facet_wrap(
    ~light
  )

ggsave(
  p_attractor,
  filename = "figures/fig_attractor_locations.png"
)
```
```r
atrac_sf <- st_as_sf(data_attractors[, c("sp", "X", "Y")],
  coords = c("X", "Y"),
  crs = 2039
)
```
```r
lc_attractor <- terra::extract(
  terra::vect(landcover[, c("Name")]),
  terra::vect(atrac_sf)
)
```
```r
# WIP - may need to be processed further - fails due to replicate
# landcovers from points on boundaries
data_attractors$lc <- lc_attractor$Name
```
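One possible way to resolve the duplication flagged above is to keep only the first polygon matched to each point before attaching the landcover class. This is a sketch under assumptions, not the final processing step: it assumes the point-index column is the first column of the `terra::extract()` result, and that polygon names are in `Name` as above.

```r
# hedged sketch: collapse the extract() result to one landcover class per point;
# the point-index column is assumed to be the first column of lc_attractor
lc_first <- lc_attractor[!duplicated(lc_attractor[[1]]), ]

# attach by point index so points with no intersecting polygon stay NA
data_attractors$lc <- lc_first$Name[
  match(seq_len(nrow(data_attractors)), lc_first[[1]])
]
```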