```r
# Load libraries

# for ebird data
library(auk)
library(ebirdst)

# general data
library(tidyverse)
library(data.table)
library(lubridate)
library(openxlsx)
library(raster) # probably unnecessary

# for models
library(unmarked)
library(MuMIn)
library(AICcmodavg)
library(fields)

# for computation
library(doParallel)
library(snow)
library(ecodist)

# Source necessary functions
source("R/fun_screen_cor.R")
source("R/fun_model_estimate_collection.r")
```
Here, we load the prepared dataframe, which contains up to 10 random visits per site along with environmental covariates prepared at a spatial scale of 2.5 sq. km. We also rescaled all covariates to the range 0–1 (see the scaling step below). Next, we ensured that only Traveling and Stationary checklists were considered. Although stationary counts have no distance traveled, we assigned all stationary checklists an effective distance of 100 m, which we consider the average maximum detection radius for most bird species in our area.
```r
# Load in the prepared dataframe
dat <- fread("results/08_data-covars-2.5km.csv", header = TRUE)
dat <- as_tibble(dat)
head(dat)
```
Filter by protocol type and assign 0.1 km to stationary checklists.
```r
# Some more pre-processing to get the right data structures

# Ensure that only Traveling and Stationary checklists are considered
names(dat)
dat <- dat %>%
  filter(protocol_type %in% c("Traveling", "Stationary"))

# Give all stationary counts a distance of 100 m (i.e. 0.1 km), as that is
# approximately the maximum normal hearing distance for observers doing
# point counts.
dat <- dat %>%
  mutate(effort_distance_km = if_else(
    effort_distance_km == 0 & protocol_type == "Stationary",
    0.1, effort_distance_km
  ))
```
Convert the date to a Julian date and the start time to minutes since midnight.
```r
# Convert the time observations started to a numeric column of
# minutes since midnight (min_obs_started)
dat <- dat %>%
  mutate(
    min_obs_started = as.integer(
      as.difftime(
        time_observations_started,
        format = "%H:%M:%S", units = "mins"
      )
    )
  )

# Add the Julian date to the dataframe
dat <- dat %>%
  mutate(julian_date = lubridate::yday(observation_date))

# Recode the Julian date so the study season, which spans the new year
# (roughly December through May), forms a continuous sequence
# (day 334 -> 1, ..., day 152 -> 183) and can be modeled as a linear predictor
dat <- dat %>%
  mutate(
    newjulianDate = case_when(
      julian_date >= 334 & julian_date <= 365 ~ (julian_date - 333),
      julian_date >= 1 & julian_date <= 152 ~ (julian_date + 31)
    )
  ) %>%
  drop_na(newjulianDate)

# Recode the start time as minutes from solar noon (720 min after midnight),
# keeping checklists started between 05:00 and 19:00, so it can be modeled
# as a linear predictor
dat <- dat %>%
  mutate(
    newmin_obs_started = case_when(
      min_obs_started >= 300 & min_obs_started <= 720 ~ abs(min_obs_started - 720),
      min_obs_started >= 720 & min_obs_started <= 1140 ~ abs(720 - min_obs_started)
    )
  ) %>%
  drop_na(newmin_obs_started)
```
```r
# Remove unnecessary columns from the dataframe and create a clean one

names(dat)

# select relevant columns BY NAME
dat <- dplyr::select(
  dat,
  c(
    "duration_minutes", "effort_distance_km",
    "locality", "locality_type", "locality_id",
    "observer_id", "observation_date",
    "scientific_name", "observation_count",
    "protocol_type", "number_observers", "pres_abs"
  ),
  # rename X-Y, but NOTE THAT THEY ARE IN UTM COORDINATES
  longitude = "X", latitude = "Y",
  expertise,
  # elevation and climate layers
  elev, bio4a, bio15a,
  # all LANDCOVER COLUMNS
  matches("lc"),
  # set new columns to old column names
  julian_date = "newjulianDate",
  min_obs_started = "newmin_obs_started"
)

# add year and convert presence-absence to integer
dat.1 <- dat %>%
  mutate(
    year = year(observation_date),
    pres_abs = as.integer(pres_abs) # occupancy modeling requires an integer response
  )

# Scale detection and occupancy covariates
dat.scaled <- dat.1

# Note: never refer to columns by numbers; numbers change, names remain
cols_to_scale <- c(
  "duration_minutes", "effort_distance_km", "number_observers",
  "expertise", "elev", "bio4a", "bio15a",
  "lc_02", "lc_09", "lc_01", "lc_05", "lc_04", "lc_07", "lc_03",
  "julian_date", "min_obs_started"
)

# this rescales the relevant columns to the range 0-1
dat.scaled <- mutate(
  dat.scaled,
  across(
    .cols = all_of(cols_to_scale), # the columns to transform
    .fns = scales::rescale # the rescale function
  )
)

# save data to file
fwrite(dat.scaled, "results/09_scaled-covars-2.5km.csv")
```
```r
# Reload the scaled covariate data
dat.scaled <- fread("results/09_scaled-covars-2.5km.csv", header = TRUE)
dat.scaled <- as_tibble(dat.scaled)
head(dat.scaled)

# Ensure the observation_date column is in the right format
dat.scaled$observation_date <- format(
  as.Date(
    dat.scaled$observation_date, "%m/%d/%Y"
  ),
  "%Y-%m-%d"
)
```
```r
# Test for correlations before running further analyses
# The majority are uncorrelated, since we decided to keep the climatic and
# land cover predictors and removed elevation.
source("R/fun_screen_cor.R")

# SELECT COLUMNS to check BY NAME
cols_to_check <- c(
  "expertise", "bio4a", "bio15a",
  "lc_02", "lc_09", "lc_01", "lc_05", "lc_04", "lc_07", "lc_03"
)

# screen covariates for correlation
screen.cor(dat.scaled[, cols_to_check], threshold = 0.3)

# total number of presences by species
# (ranges from a minimum of 224 to a maximum of 7725)
presSpecies <- dat.scaled %>%
  group_by(scientific_name) %>%
  filter(pres_abs == 1) %>%
  summarise(n = n())

# convert locality_id to a factor
dat.scaled$locality_id <- as.factor(dat.scaled$locality_id)
```
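The `screen.cor()` function is sourced from `R/fun_screen_cor.R`, which is not shown here. As a rough illustration only, a pairwise correlation screen might look like the sketch below; the real function's behavior and output format may differ.

```r
# Hypothetical sketch of a pairwise correlation screen; the actual
# fun_screen_cor.R may differ. Flags covariate pairs whose absolute
# Pearson correlation exceeds a threshold.
screen_cor_sketch <- function(df, threshold = 0.3) {
  cm <- cor(df, use = "pairwise.complete.obs")
  flagged <- which(abs(cm) > threshold & upper.tri(cm), arr.ind = TRUE)
  data.frame(
    var1 = rownames(cm)[flagged[, 1]],
    var2 = colnames(cm)[flagged[, 2]],
    correlation = cm[flagged]
  )
}

# Example usage:
# screen_cor_sketch(dat.scaled[, cols_to_check], threshold = 0.3)
```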
```r
# All null models are stored in the list below
all_null <- list()

# define species and a counter
species <- unique(dat.scaled$scientific_name)
counter <- 0

# Add a text-based progress bar for the loop
pb <- txtProgressBar(
  min = 0,
  max = length(species),
  style = 3
)

# loop over species
for (i in species) {

  # filter data by species
  data <- dat.scaled %>%
    filter(scientific_name == i)

  # Prepare data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # the full study period is treated as a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id",
      "lc_01", "lc_02", "lc_05", "lc_04",
      "lc_09", "lc_07", "lc_03",
      "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object
  # to start fitting occupancy models
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  # Fit the null model; the list is automatically named by species
  all_null[[i]] <- occu(~1 ~ 1, data = occ_um)

  # increase the counter
  counter <- counter + 1
  setTxtProgressBar(pb, counter)
}
close(pb)

# Store all the model outputs for each species (plain-text dump)
capture.output(all_null, file = "results/09_null_models.csv")
```
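The null-model intercepts are on the logit scale; to inspect them as probabilities, they can be back-transformed. A minimal sketch, assuming `all_null` has been populated by the loop above:

```r
# Back-transform the null-model intercepts to probabilities for one species:
# type = "state" gives the occupancy probability (psi),
# type = "det" gives the detection probability (p).
fm <- all_null[[1]]
backTransform(fm, type = "state")
backTransform(fm, type = "det")

# Equivalently, apply the inverse-logit by hand to both intercepts
plogis(coef(fm))
```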
Here, we use the `unmarked` package in R (Fiske and Chandler 2011) to identify the detection-level covariates that are important for each species, and we use AICc to select the top models (Burnham et al. 2002; Burnham et al. 2011).
```r
# All dredged models are stored in the lists below
det_dred <- list()

# Subset of models with deltaAICc < 4 (Burnham et al. 2011)
top_det <- list()

# Model-averaged coefficients and relative importance scores
det_avg <- list()
det_imp <- list()

# Model estimates
det_modelEst <- list()

# Add a text-based progress bar for the loop
pb <- txtProgressBar(
  min = 0,
  max = length(unique(dat.scaled$scientific_name)),
  style = 3
)

for (i in 1:length(unique(dat.scaled$scientific_name))) {
  data <- dat.scaled %>%
    filter(scientific_name == unique(dat.scaled$scientific_name)[i])

  # Prepare data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # the full study period is treated as a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id",
      "lc_01", "lc_02", "lc_05", "lc_04",
      "lc_09", "lc_07", "lc_03",
      "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  # Fit a global model with all detection-level covariates
  global_mod <- occu(~ min_obs_started + julian_date + duration_minutes +
    effort_distance_km + number_observers + expertise ~ 1, data = occ_um)

  # Set up the cluster
  clusterType <- if (length(find.package("snow", quiet = TRUE))) "SOCK" else "PSOCK"
  clust <- try(makeCluster(getOption("cl.cores", 5), type = clusterType))
  clusterEvalQ(clust, library(unmarked))
  clusterExport(clust, "occ_um")

  # Dredge all subsets of the global detection model in parallel
  det_dred[[i]] <- pdredge(global_mod, clust)
  names(det_dred)[i] <- unique(dat.scaled$scientific_name)[i]

  # Get the top models, which we define as those with deltaAICc < 4
  top_det[[i]] <- get.models(det_dred[[i]], subset = delta < 4, cluster = clust)
  names(top_det)[i] <- unique(dat.scaled$scientific_name)[i]

  # Obtain model-averaged coefficients
  if (length(top_det[[i]]) > 1) {
    a <- model.avg(top_det[[i]], fit = TRUE)
    det_avg[[i]] <- as.data.frame(a$coefficients)
    names(det_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    det_modelEst[[i]] <- data.frame(
      Coefficient = coefTable(a, full = TRUE)[, 1],
      SE = coefTable(a, full = TRUE)[, 2],
      lowerCI = confint(a)[, 1],
      upperCI = confint(a)[, 2],
      z_value = (summary(a)$coefmat.full)[, 3],
      Pr_z = (summary(a)$coefmat.full)[, 4]
    )
    names(det_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]

    det_imp[[i]] <- as.data.frame(MuMIn::importance(a))
    names(det_imp)[i] <- unique(dat.scaled$scientific_name)[i]
  } else {
    # only one top model: take its estimates directly
    det_avg[[i]] <- as.data.frame(coef(top_det[[i]][[1]]))
    names(det_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    lowDet <- data.frame(lowerCI = confint(top_det[[i]][[1]], type = "det")[, 1])
    upDet <- data.frame(upperCI = confint(top_det[[i]][[1]], type = "det")[, 2])
    zDet <- data.frame(summary(top_det[[i]][[1]])$det[, 3])
    Pr_zDet <- data.frame(summary(top_det[[i]][[1]])$det[, 4])
    Coefficient <- coefTable(top_det[[i]][[1]])[, 1]
    SE <- coefTable(top_det[[i]][[1]])[, 2]

    det_modelEst[[i]] <- data.frame(
      Coefficient = Coefficient[2:length(Coefficient)],
      SE = SE[2:length(SE)],
      lowerCI = lowDet,
      upperCI = upDet,
      z_value = zDet,
      Pr_z = Pr_zDet
    )
    names(det_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]
  }

  setTxtProgressBar(pb, i)
  stopCluster(clust)
}
close(pb)
```
Storing the output from the above models in Excel sheets:

```r
# 1. Store all the model outputs for each species (variable: det_dred, see above)
write.xlsx(det_dred, file = "results/09_det-dred.xlsx")

# 2. Store the model-averaged outputs for each species and the relative
#    importance scores
write.xlsx(det_avg, file = "results/09_det-avg.xlsx", rowNames = TRUE, colNames = TRUE)
write.xlsx(det_imp, file = "results/09_det-imp.xlsx", rowNames = TRUE, colNames = TRUE)
write.xlsx(det_modelEst, file = "results/09_det-modelEst.xlsx", rowNames = TRUE, colNames = TRUE)

# Note: if you are unable to write to a file, drop empty list elements first,
# for example:
a <- purrr::map(det_imp, ~ purrr::compact(.)) %>%
  purrr::keep(~ length(.) != 0)
```
Occupancy models estimate the probability that a given species occurs at a site while controlling for the probability of detection, allowing us to model the factors affecting occurrence and detection independently. The flexible eBird observation process is the largest source of variation in the likelihood of detecting a particular species; hence, we included seven covariates that influence the probability of detection for each checklist: ordinal day of year, duration of observation, distance traveled, protocol type, time observations started, number of observers, and the checklist calibration index (CCI).
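In outline, the single-season occupancy model fit by `occu()` treats true occupancy at site $i$ as a latent Bernoulli state and conditions detection on visit $j$ on that state, with logit links for both sub-models:

$$
z_i \sim \mathrm{Bernoulli}(\psi_i), \qquad
y_{ij} \mid z_i \sim \mathrm{Bernoulli}(z_i \, p_{ij}),
$$

$$
\mathrm{logit}(\psi_i) = \beta_0 + \boldsymbol{\beta}^\top \mathbf{x}_i, \qquad
\mathrm{logit}(p_{ij}) = \alpha_0 + \boldsymbol{\alpha}^\top \mathbf{w}_{ij},
$$

where $\mathbf{x}_i$ are the site-level occupancy covariates (land cover and climate) and $\mathbf{w}_{ij}$ are the checklist-level detection covariates.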
Using a multi-model information-theoretic approach, we tested how strongly our occurrence data fit our candidate set of environmental covariates (Burnham et al. 2002). We fitted single-species occupancy models for each species to simultaneously estimate a probability of detection ($p$) and a probability of occupancy ($\psi$). For each species, we fit a set of candidate models, each with a unique combination of the climate and land cover occupancy covariates and all seven detection covariates.
Across the models tested for each species, the model with the highest support was determined using AICc scores. However, for the majority of species, no single model had overwhelming support. Hence, for each species, we examined those models with $\Delta$AICc < 4, as these top models were considered to explain a large proportion of the association between the species-specific probability of occupancy and environmental drivers (Burnham et al. 2011; Elsen et al. 2017). Using these restricted model sets for each species, we calculated a model-averaged coefficient estimate for each predictor and assessed its direction and significance. We considered a predictor to be significantly associated with occupancy if the 95% confidence interval around the model-averaged coefficient did not contain zero.
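For reference, the standard quantities behind this procedure (Burnham et al. 2002): AICc penalizes the usual AIC for small samples, each top model receives an Akaike weight based on its $\Delta$AICc, and the averaged coefficient is the weighted sum across the top set:

$$
\mathrm{AICc} = -2\ln\mathcal{L} + 2k + \frac{2k(k+1)}{n-k-1}, \qquad
w_m = \frac{\exp(-\Delta_m/2)}{\sum_r \exp(-\Delta_r/2)}, \qquad
\hat{\bar{\beta}} = \sum_m w_m \, \hat{\beta}_m,
$$

where $k$ is the number of parameters, $n$ the sample size, and $\Delta_m$ the AICc difference between model $m$ and the best model in the set.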
```r
# All dredged models are stored in the lists below
lc_clim <- list()

# Subset of models with deltaAICc < 4 (Burnham et al. 2011)
top_lc_clim <- list()

# Model-averaged coefficients and relative importance scores
lc_clim_avg <- list()
lc_clim_imp <- list()

# Model estimates
lc_clim_modelEst <- list()

# Add a text-based progress bar for the loop
pb <- txtProgressBar(
  min = 0,
  max = length(unique(dat.scaled$scientific_name)),
  style = 3
)

for (i in 1:length(unique(dat.scaled$scientific_name))) {
  data <- dat.scaled %>%
    filter(scientific_name == unique(dat.scaled$scientific_name)[i])

  # Prepare data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # the full study period is treated as a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id",
      "lc_01", "lc_02", "lc_05", "lc_04",
      "lc_09", "lc_07", "lc_03",
      "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  # Global model: all detection covariates and all occupancy covariates
  model_lc_clim <- occu(~ min_obs_started + julian_date + duration_minutes +
    effort_distance_km + number_observers + expertise
  ~ lc_01 + lc_02 + lc_05 + lc_04 + lc_09 + lc_07 + lc_03 +
    bio4a + bio15a, data = occ_um)

  # Set up the cluster
  clusterType <- if (length(find.package("snow", quiet = TRUE))) "SOCK" else "PSOCK"
  clust <- try(makeCluster(getOption("cl.cores", 5), type = clusterType))
  clusterEvalQ(clust, library(unmarked))
  clusterExport(clust, "occ_um")

  # Detection terms are fixed; only occupancy terms are dredged
  det_terms <- c(
    "p(duration_minutes)", "p(effort_distance_km)", "p(expertise)",
    "p(julian_date)", "p(min_obs_started)", "p(number_observers)"
  )

  lc_clim[[i]] <- pdredge(model_lc_clim, clust, fixed = det_terms)
  names(lc_clim)[i] <- unique(dat.scaled$scientific_name)[i]

  # Identify the top subset of models with deltaAICc < 4 (Burnham et al. 2011)
  top_lc_clim[[i]] <- get.models(lc_clim[[i]], subset = delta < 4, cluster = clust)
  names(top_lc_clim)[i] <- unique(dat.scaled$scientific_name)[i]

  # Obtain model-averaged coefficients for the candidate model subsets
  if (length(top_lc_clim[[i]]) > 1) {
    a <- model.avg(top_lc_clim[[i]], fit = TRUE)
    lc_clim_avg[[i]] <- as.data.frame(a$coefficients)
    names(lc_clim_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    lc_clim_modelEst[[i]] <- data.frame(
      Coefficient = coefTable(a, full = TRUE)[, 1],
      SE = coefTable(a, full = TRUE)[, 2],
      lowerCI = confint(a)[, 1],
      upperCI = confint(a)[, 2],
      z_value = (summary(a)$coefmat.full)[, 3],
      Pr_z = (summary(a)$coefmat.full)[, 4]
    )
    names(lc_clim_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]

    lc_clim_imp[[i]] <- as.data.frame(MuMIn::importance(a))
    names(lc_clim_imp)[i] <- unique(dat.scaled$scientific_name)[i]
  } else {
    # only one top model: take its estimates directly
    lc_clim_avg[[i]] <- as.data.frame(coef(top_lc_clim[[i]][[1]]))
    names(lc_clim_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    lowSt <- data.frame(lowerCI = confint(top_lc_clim[[i]][[1]], type = "state")[, 1])
    lowDet <- data.frame(lowerCI = confint(top_lc_clim[[i]][[1]], type = "det")[, 1])
    upSt <- data.frame(upperCI = confint(top_lc_clim[[i]][[1]], type = "state")[, 2])
    upDet <- data.frame(upperCI = confint(top_lc_clim[[i]][[1]], type = "det")[, 2])
    zSt <- data.frame(z_value = summary(top_lc_clim[[i]][[1]])$state[, 3])
    zDet <- data.frame(z_value = summary(top_lc_clim[[i]][[1]])$det[, 3])
    Pr_zSt <- data.frame(Pr_z = summary(top_lc_clim[[i]][[1]])$state[, 4])
    Pr_zDet <- data.frame(Pr_z = summary(top_lc_clim[[i]][[1]])$det[, 4])

    lc_clim_modelEst[[i]] <- data.frame(
      Coefficient = coefTable(top_lc_clim[[i]][[1]])[, 1],
      SE = coefTable(top_lc_clim[[i]][[1]])[, 2],
      lowerCI = rbind(lowSt, lowDet),
      upperCI = rbind(upSt, upDet),
      z_value = rbind(zSt, zDet),
      Pr_z = rbind(Pr_zSt, Pr_zDet)
    )
    names(lc_clim_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]
  }

  gc()
  setTxtProgressBar(pb, i)
  stopCluster(clust)
}
close(pb)

# 1. Store all the model outputs for each species (for both land cover and climate)
write.xlsx(lc_clim, file = "results/09_lc-clim.xlsx")

# 2. Store the model-averaged outputs for each species and the relative
#    importance scores
write.xlsx(lc_clim_avg, file = "results/09_lc-clim-avg.xlsx", rowNames = TRUE, colNames = TRUE)
write.xlsx(lc_clim_imp, file = "results/09_lc-clim-imp.xlsx", rowNames = TRUE, colNames = TRUE)

# 3. Store all model estimates
write.xlsx(lc_clim_modelEst, file = "results/09_lc-clim-modelEst.xlsx", rowNames = TRUE, colNames = TRUE)

# Note: if you are unable to write to a file, drop empty list elements first,
# for example:
a <- purrr::map(lc_clim_modelEst, ~ purrr::compact(.)) %>%
  purrr::keep(~ length(.) != 0)
```
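As described above, a predictor is treated as significantly associated with occupancy when its 95% confidence interval excludes zero. A minimal sketch of pulling those predictors out of the stored estimates, assuming `lc_clim_modelEst` has been populated by the loop above:

```r
# For one species, keep only coefficients whose 95% CI excludes zero
# (i.e., both bounds lie on the same side of zero)
est <- lc_clim_modelEst[[1]]
sig <- est[est$lowerCI > 0 | est$upperCI < 0, ]
sig
```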
Adequate model fit was assessed with a chi-square goodness-of-fit test, using 1,000 parametric bootstrap simulations on a global model that included all occupancy and detection covariates (MacKenzie & Bailey, 2004).
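In outline, the MacKenzie-Bailey test computes a chi-square statistic over the observed detection-history cohorts and compares it to its parametric bootstrap distribution; the overdispersion estimate $\hat{c}$ reported by `mb.gof.test()` is the observed statistic divided by the mean of the bootstrap statistics:

$$
\chi^2 = \sum_h \frac{(O_h - E_h)^2}{E_h}, \qquad
\hat{c} = \frac{\chi^2_{\mathrm{obs}}}{\frac{1}{B}\sum_{b=1}^{B}\chi^2_b},
$$

where $O_h$ and $E_h$ are the observed and expected numbers of sites with detection history $h$, and $B$ is the number of bootstrap replicates.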
```r
goodness_of_fit <- data.frame()

# Add a text-based progress bar for the loop
pb <- txtProgressBar(
  min = 0,
  max = length(unique(dat.scaled$scientific_name)),
  style = 3
)

for (i in 1:length(unique(dat.scaled$scientific_name))) {
  data <- dat.scaled %>%
    filter(scientific_name == unique(dat.scaled$scientific_name)[i])

  # Prepare data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # the full study period is treated as a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "protocol_type",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id",
      "lc_01", "lc_02", "lc_05", "lc_04",
      "lc_09", "lc_07", "lc_03",
      "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  # Global model with all detection and occupancy covariates
  model_lc_clim <- occu(~ min_obs_started + julian_date + duration_minutes +
    effort_distance_km + number_observers + protocol_type + expertise
  ~ lc_01 + lc_02 + lc_05 + lc_04 + lc_09 + lc_07 + lc_03 +
    bio4a + bio15a, data = occ_um)

  # MacKenzie-Bailey goodness-of-fit test
  # note: reduce nsim if needed, as this takes a very long time even with
  # parallelization
  occ_gof <- mb.gof.test(model_lc_clim,
    nsim = 1000, parallel = TRUE, ncores = 5,
    plot.hist = FALSE
  )

  p.value <- occ_gof$p.value
  c.hat <- occ_gof$c.hat.est
  scientific_name <- unique(data$scientific_name)

  a <- data.frame(scientific_name, p.value, c.hat)
  goodness_of_fit <- rbind(a, goodness_of_fit)

  setTxtProgressBar(pb, i)
}
close(pb)

write.csv(goodness_of_fit, "results/09_goodness-of-fit-2.5km.csv", row.names = FALSE)
```
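To flag species whose global model shows evidence of poor fit or overdispersion, the saved table can be screened. A minimal sketch using common rules of thumb (bootstrap p-value below 0.05 for lack of fit, $\hat{c}$ well above 1 for overdispersion); the thresholds here are illustrative, not values prescribed by this analysis:

```r
# Flag species with evidence of poor fit (small bootstrap p-value) or
# overdispersion (c-hat well above 1); thresholds are rules of thumb
poor_fit <- subset(goodness_of_fit, p.value < 0.05 | c.hat > 2)
poor_fit
```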