In this section, we prepare a final list of covariates, after taking into account spatial sampling bias, temporal bias and observer expertise scores (examined in previous sections).
# load libs for data library(dplyr) library(readr) library(stringr) library(purrr) library(glue) library(tidyr) # check for velox and install library(devtools) if (!"velox" %in% installed.packages()) { install_github("hunzikp/velox") } # load spatial library(raster) library(rgeos) library(velox) library(sf) # load saved data object load("results/02_data_prelim_processing.rdata")
Sampling bias can be introduced into citizen science due to the often ad-hoc nature of data collection (Sullivan et al. 2014). For eBird, this translates into checklists reported when convenient, rather than at regular or random points in time and space, leading to non-independence in the data if observations are spatio-temporally clustered (Johnston et al. 2021). Spatio-temporal autocorrelation in the data can be reduced by sub-sampling at an appropriate spatial resolution, and by avoiding temporal clustering. We estimated two simple measures of spatial clustering: the distance from each site to the nearest road (road data from OpenStreetMap) and the nearest-neighbor distance for each site. Sites were strongly tied to roads (mean distance to road ± SD = 390.77 ± 859.15 m; range = 0.28 m – 7.64 km) and were on average only 297 m away from another site (SD = 553 m; range = 0.14 m – 12.85 km) (Figure 3). This analysis was done in the previous section.
Here, to further reduce spatial autocorrelation, we divided the study area into a grid of 1km wide square cells and picked checklists from one site at random within each grid cell.
# grid based spatial thinning gridsize <- 500 # grid size in metres effort_distance_max <- 1000 # removing checklists with this distance # make grids across the study site hills <- st_read("data/spatial/hillsShapefile/Nil_Ana_Pal.shp") %>% st_transform(32643) grid <- st_make_grid(hills, cellsize = gridsize) # filtering on !pres_abs keeps absences # this absence data will be thinned data_thin_absences <- filter(dataGrouped, !pres_abs) data_presences <- filter(dataGrouped, pres_abs) # split data by species data_thin_absences <- split( x = data_thin_absences, f = data_thin_absences$scientific_name )
data_presence_prop <- count(data_presences, scientific_name, name = "presences") %>% mutate( absences = map_int(data_thin_absences, nrow), presence_prop = presences / (presences + absences) ) # mean and sd of presence prop data_presence_prop %>% summarise(mean(presence_prop), sd(presence_prop))
# spatial thinning on each species retains # site with maximum visits per grid cell data_thin_absences <- map(data_thin_absences, function(df) { # count visits per locality df <- group_by(df, locality) %>% mutate(tot_effort = length(sampling_event_identifier)) %>% ungroup() # remove sites with distances above spatial independence df <- df %>% dplyr::filter(effort_distance_km <= effort_distance_max) %>% st_as_sf(coords = c("longitude", "latitude")) %>% `st_crs<-`(4326) # transform to regional UTM 43N and add id df <- df %>% st_transform(32643) %>% mutate(coordId = 1:nrow(.)) %>% bind_cols(as_tibble(st_coordinates(.))) # whcih cell has which coords grid_overlap <- st_contains(grid, df) %>% unclass() %>% purrr::discard(.p = is_empty) # count length of grid overlap list # this is the number of cells with points in them sampled_cells <- length(grid_overlap) # make tibble grid_overlap <- tibble( uid_cell = seq(length(grid_overlap)), # the uid_cell is specific to this sp. coordId = grid_overlap ) # unnest grid_overlap <- unnest(grid_overlap, cols = "coordId") # join grid cell overlap with coordinate data df <- left_join(df, grid_overlap, by = "coordId" ) %>% st_drop_geometry() # for each uid_cell, select coord where effort is max points_max <- df %>% group_by(uid_cell) %>% dplyr::filter(tot_effort == max(tot_effort)) %>% # there may be multiple rows with max effort, select first dplyr::filter(row_number() == 1) # check for number of samples assertthat::assert_that( assertthat::are_equal(sampled_cells, nrow(points_max), msg = "spatial thinning error: more samples than\\ sampled cells" ) ) # check that there is one sample per cell assertthat::assert_that( assertthat::are_equal( max(count(points_max, uid_cell)$n), 1 ) ) # return data without UID cell and coordinate Id dplyr::select(ungroup(points_max), -uid_cell, -coordId, -tot_effort) }) # remove old data rm(dataGrouped)
data_presence_prop <- data_presence_prop %>% mutate( absences_sp_thin = map_int(data_thin_absences, nrow) )
Additionally, from each selected site, we randomly selected a maximum of 10 absence checklists, which reduced temporal clustering. We kept all presence checklists.
# subsample data for random 10 observations dataSubsample <- map(data_thin_absences, function(df) { df <- ungroup(df) df_to_locality <- split(x = df, f = df$locality) df_samples <- map_if( .x = df_to_locality, .p = function(x) { nrow(x) > 10 }, .f = function(x) sample_n(x, 10, replace = FALSE) ) bind_rows(df_samples) })
data_presence_prop <- data_presence_prop %>% mutate( absences_tmp_thin = map_int(dataSubsample, nrow), presence_prop_post_thin = presences / (presences + absences_tmp_thin) ) # save data write_csv(data_presence_prop, "results/08_data-class-balance.csv")
# bind all spatially and temporally thinned absences rows for data frame dataSubsample <- bind_rows(dataSubsample) # convert presence data to UTM 43 N and long-lat to X-Y data_presences <- bind_cols( data_presences, as_tibble( st_as_sf( data_presences, coords = c("longitude", "latitude"), crs = 4326 ) %>% st_transform(32643) %>% st_coordinates() ) ) # drop long lat data_presences <- dplyr::select(data_presences, -longitude, -latitude) # join ALL PRESENCES and THINNED ABSENCES dataSubsample <- bind_rows(dataSubsample, data_presences) # check joined data assertthat::assert_that( max(apply(dataSubsample, 2, function(x) sum(is.na(x)))) == 0, msg = "some columns missing from one of the datasets" ) # remove previous data rm(data_thin_absences)
Load the CCI computed in the previous section. The CCI was the lone observer’s expertise score for single-observer checklists, and the highest expertise score among observers for group checklists.
# read in obs score and extract numbers expertiseScore <- read_csv("results/04_data-obsExpertise-score.csv") %>% mutate(numObserver = str_extract(observer, "\\d+")) %>% dplyr::select(-observer) # group seis consist of multiple observers # in this case, seis need to have the highest expertise observer score # as the associated covariate # get unique observers per sei dataSeiScore <- distinct( dataSubsample, sampling_event_identifier, observer_id ) %>% # make list column of observers mutate(observers = str_split(observer_id, ",")) %>% unnest(cols = c(observers)) %>% # add numeric observer id mutate(numObserver = str_extract(observers, "\\d+")) %>% # now get distinct sei and observer id numeric distinct(sampling_event_identifier, numObserver) # now add expertise score to sei dataSeiScore <- left_join(dataSeiScore, expertiseScore, by = "numObserver" ) %>% # get max expertise score per sei group_by(sampling_event_identifier) %>% summarise(expertise = max(score)) # add to dataCovar dataSubsample <- left_join(dataSubsample, dataSeiScore, by = "sampling_event_identifier" ) # remove data without expertise score dataSubsample <- filter(dataSubsample, !is.na(expertise))
Reload climate and land cover predictors prepared previously.
# list landscape covariate stacks landscape_files <- "data/spatial/landscape_resamp01_km.tif" # read in as stacks landscape_data <- stack(landscape_files) # get proper names elev_names <- c("elev", "slope", "aspect") chelsa_names <- c("bio_1", "bio_12") names(landscape_data) <- as.character(glue('{c(elev_names, chelsa_names, "landcover")}'))
Every checklist on eBird is associated with a latitude and longitude. However, the coordinates entered by an observer may not accurately depict the location at which a species was detected. This can occur for two reasons: first, traveling checklists are often associated with a single location along the route travelled by observers; and second, checklist locations could be assigned to a ‘hotspot’ – a location that is marked by eBird as being frequented by multiple observers. In many cases, an observation might be assigned to a hotspot even though the observation was not made at the precise location of the hotspot (Praveen J 2017). Johnston et al. (2021) showed that a large proportion of observations occurred within a 3km grid, even for those checklists up to 5km in length. Hence to adjust for spatial precision, we considered a minimum radius of 2.5km around each unique locality when sampling environmental covariate values.
# assign neighbourhood radius in m sample_radius <- 2.5 * 1e3 # get distinct points and make buffer ebird_buff <- dataSubsample %>% ungroup() %>% distinct(X, Y) %>% # remove NAs drop_na() # convert to spatial features ebird_buff <- st_as_sf(ebird_buff, coords = c("X", "Y"), crs = 32643) %>% # add long lat bind_cols(as_tibble(st_coordinates(.))) %>% # make buffer around points st_buffer(dist = sample_radius)
All climatic covariates are sampled by considering the mean values within a 2.5km radius as discussed above and prefixed "am_".
# get area mean for all preds except landcover, which is the last one stk <- raster::dropLayer(landscape_data, "landcover") # removing landcover here velstk <- velox(stk) # velox raster value extraction dextr <- velstk$extract( sp = ebird_buff, df = TRUE, fun = function(x) mean(x, na.rm = T) ) # assign names for joining names(dextr) <- c("id", names(stk)) env_area_mean <- as_tibble(dextr) # add id to buffer data ebird_buff <- mutate(ebird_buff, id = seq(nrow(ebird_buff)) ) # join to buffer data ebird_buff <- inner_join(ebird_buff, env_area_mean)
All land cover covariates were sampled by considering the proportion of each land cover type within a 2.5km radius.
# get the last element of each stack from the list # this is the landcover at that resolution lc <- landscape_data[["landcover"]] # accessing landcover here lc_velox <- velox(lc) lc_vals <- lc_velox$extract(sp = ebird_buff, df = TRUE) names(lc_vals) <- c("id", "lc") # get landcover proportions lc_prop <- count(lc_vals, id, lc) %>% group_by(id) %>% mutate( lc = glue('lc_{str_pad(lc, 2, pad = "0")}'), prop = n / sum(n) ) %>% dplyr::select(-n) %>% tidyr::pivot_wider( names_from = lc, values_from = prop, values_fill = list(prop = 0) ) %>% ungroup() # join to data ebird_buff <- mutate(ebird_buff, lc_prop)
# drop geometry ebird_buff <- st_drop_geometry(ebird_buff) # link to dataSubsample dataSubsample <- inner_join(dataSubsample, ebird_buff, by = c("X", "Y") )
Save data to file.
# write to file write_csv(dataSubsample, path = glue("results/08_data-covars-2.5km.csv"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.