#' Install a particular dataset via the rdataretriever
#'
#' @param dataset name of the dataset to install
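#' @examples
#' \dontrun{
#' # Usage sketch: install the BBS dataset into the local sqlite file
#' # (dataset name as used elsewhere in this file).
#' install_dataset('breed-bird-survey')
#' }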
install_dataset <- function(dataset){
# Install a dataset using the rdataretriever
rdataretriever::install(dataset, 'sqlite', db_file='./data/bbsforecasting.sqlite')
}
#' Single wrapper for all database actions
#'
#' We require only a few simple SQL methods: 1. writing an entire dataframe
#' directly to the database as its own table, 2. reading tables back as
#' dataframes, possibly with some modification using SQL statements, and
#' 3. checking whether a table exists. If a particular table exists, it is
#' assumed to contain all the required data.
#'
#' read returns a dataframe
#' write returns nothing
#' check returns a boolean
#'
#' @param action Action to perform in db call. Either read, write, or check
#' @param db name of database. A file if using sqlite
#' @param sql_query SQL statement if action is read
#' @param df Dataframe of data if action is write. Will copy the dataframe verbatim to its own table with name new_table_name
#' @param new_table_name Table name for new data being written
#' @param table_to_check Table name to check if it exists for when action is check
#' @importFrom dplyr copy_to src_sqlite src_tbls collect tbl sql
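#' @examples
#' \dontrun{
#' # Usage sketch of the three actions; the table name and dataframe
#' # here are hypothetical.
#' db_engine(action = 'write', df = data.frame(x = 1:3),
#'   new_table_name = 'example_table')
#' db_engine(action = 'check', table_to_check = 'example_table')
#' db_engine(action = 'read', sql_query = 'SELECT * FROM example_table')
#' }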
db_engine=function(action, db='./data/bbsforecasting.sqlite', sql_query=NULL,
df=NULL, new_table_name=NULL, table_to_check=NULL){
if(!dir.exists("data")){dir.create("data")}
con <- src_sqlite(db, create=TRUE)
if(action=='read'){
to_return=collect(tbl(con, sql(sql_query)), n=Inf)
} else if(action=='write') {
copy_to(con, df, name=new_table_name, temporary = FALSE)
to_return=NA
} else if(action=='check') {
#Only works with sqlite for now.
to_return=tolower(table_to_check) %in% tolower(src_tbls(con))
} else {
stop(paste0('DB action: ',action,' not found'))
}
#Drop the connection reference before returning results; the underlying
#connection is closed when it is garbage collected.
rm(con)
return(to_return)
}
#' Filter poorly sampled BBS species
#'
#' Removes waterbirds, shorebirds, owls, kingfishers, nightjars, and
#' dippers. These species are poorly sampled due to their aquatic or
#' nocturnal nature. Also removes taxa that were either partially unidentified
#' (e.g. "sp."), were considered hybrids (e.g. "A x B"), or were listed as more
#' than one species (e.g. "A / B")
#'
#' @param df dataframe containing a species_id column
#'
#' @return dataframe, filtered version of initial dataframe
#' @importFrom dplyr "%>%" inner_join do rowwise select filter group_by ungroup full_join n_distinct semi_join left_join
filter_species <- function(df){
species_table = get_species_data()
is_unidentified = function(names) {
#Before filtering, account for this one hybrid of 2 subspecies so it's kept
names[names=='auratus auratus x auratus cafer']='auratus auratus'
grepl('sp\\.| x |\\/', names)
}
# The aou code ranges below exclude the aquatic and nocturnal groups
# described above
valid_taxa = species_table %>%
filter(!is_unidentified(species)) %>%
filter(aou > 2880) %>%
filter(aou < 3650 | aou > 3810) %>%
filter(aou < 3900 | aou > 3910) %>%
filter(aou < 4160 | aou > 4210) %>%
filter(aou != 7010)
filter(df, species_id %in% valid_taxa$aou)
}
#' Combine subspecies into their common species
#'
#' @param df dataframe with site_id, year, species_id, lat, long, and abundance columns
#'
#' @importFrom dplyr "%>%" filter slice group_by summarise ungroup pull
#' @importFrom stringr word
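#' @examples
#' \dontrun{
#' # Usage sketch: counts recorded under subspecies AOU codes are
#' # reassigned to the parent species and their abundances summed
#' # (raw_bbs_counts is a hypothetical dataframe with the columns above).
#' combined <- combine_subspecies(raw_bbs_counts)
#' }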
combine_subspecies = function(df){
species_table = get_species_data()
# Subspecies have two spaces separated by non-spaces
subspecies_names = species_table %>%
filter(aou %in% unique(df$species_id)) %>%
pull(spanish_common_name) %>%
grep(" [^ ]+ ", ., value = TRUE)
subspecies_ids = species_table %>%
filter(spanish_common_name %in% subspecies_names) %>%
pull(aou)
# Drop all but the first two words to get the root species name,
# then find the AOU code
new_subspecies_ids = species_table %>%
slice(match(word(subspecies_names, 1,2),
species_table$spanish_common_name)) %>%
pull(aou)
# replace the full subspecies names with species-level names
for (i in seq_along(subspecies_ids)) {
df$species_id[df$species_id == subspecies_ids[i]] = new_subspecies_ids[i]
}
df %>%
group_by(site_id, year, species_id, lat, long) %>%
summarise(abundance = sum(abundance)) %>%
ungroup()
}
#' Get the BBS species table, reading from a local csv cache if available
#' and otherwise querying the database and caching the result.
get_species_data = function() {
data_path <- paste('./data/', 'bbs', '_species.csv', sep = "")
if (file.exists(data_path)) {
return(read.csv(data_path))
}else{
species_table=db_engine(action = 'read', sql_query = 'SELECT * FROM breed_bird_survey_species')
write.csv(species_table, file = data_path, row.names = FALSE, quote = FALSE)
save_provenance(species_table)
return(species_table)
}
}
#' Get the primary BBS data, which combines the counts, route info, and
#' weather tables. Installs the dataset via rdataretriever if needed.
#'
#' @export
#' @importFrom dplyr "%>%" group_by
#' @importFrom readr read_csv
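#' @examples
#' \dontrun{
#' # The first call installs/queries the database and caches the result
#' # as ./data/bbs_data.csv; later calls read the cache.
#' bbs <- get_bbs_data()
#' }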
get_bbs_data <- function(){
data_path <- paste('./data/', 'bbs', '_data.csv', sep="")
if (file.exists(data_path)){
return(read_csv(data_path))
}
else{
if (!db_engine(action='check', table_to_check = 'breed_bird_survey_counts')){
install_dataset('breed-bird-survey')
}
#Primary BBS dataframe
bbs_query ="SELECT
(counts.statenum*1000) + counts.Route AS site_id,
Latitude AS lat,
Longitude AS long,
aou AS species_id,
counts.Year AS year,
speciestotal AS abundance
FROM
breed_bird_survey_counts AS counts
JOIN breed_bird_survey_weather
ON counts.statenum=breed_bird_survey_weather.statenum
AND counts.route=breed_bird_survey_weather.route
AND counts.rpid=breed_bird_survey_weather.rpid
AND counts.year=breed_bird_survey_weather.year
JOIN breed_bird_survey_routes
ON counts.statenum=breed_bird_survey_routes.statenum
AND counts.route=breed_bird_survey_routes.route
WHERE breed_bird_survey_weather.runtype=1 AND breed_bird_survey_weather.rpid=101"
bbs_data=db_engine(action='read', sql_query = bbs_query) %>%
filter_species() %>%
group_by(site_id) %>%
combine_subspecies()
save_provenance(bbs_data)
write.csv(bbs_data, file = data_path, row.names = FALSE, quote = FALSE)
return(bbs_data)
}
}
#' Get route locations in a SpatialPointsDataFrame
#'
#' Route coordinates are returned in the WGS84 longlat projection.
#'
#' @return a SpatialPointsDataFrame including site_id, long, and lat
#' @importFrom sp SpatialPointsDataFrame CRS
#' @importFrom dplyr collect copy_to src_sqlite src_tbls tbl %>%
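#' @examples
#' \dontrun{
#' routes <- get_route_data()
#' sp::plot(routes)
#' }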
get_route_data <- function(){
p=CRS('+proj=longlat +datum=WGS84')
bbs_data <- get_bbs_data()
route_locations <- unique(dplyr::select(bbs_data, site_id, long, lat))
route_locations %>%
dplyr::select(long, lat) %>%
SpatialPointsDataFrame(data=route_locations, proj4string=p)
}
#' Master function for acquiring all environmental data in a single table.
#' Returns either past observed data from PRISM + NDVI, or future
#' forecasted data from CMIP5 and a naive NDVI forecast.
#'
#' @param timeframe str Either 'past' or 'future' for 1981-2013 or 2014-2050, respectively.
#' @param ndvi_forecast_source_years vector Years of observed NDVI data to use for making
#' a naive NDVI forecast with site specific means. Ignored if timeframe is 'past'
#' @param future_data_years vector Years of forecast environmental data to return. Ignored
#' if timeframe is 'past'
#'
#' @export
#' @importFrom dplyr "%>%" filter group_by summarise ungroup inner_join full_join
#' @importFrom tidyr gather spread
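#' @examples
#' \dontrun{
#' # Observed environmental data (PRISM + NDVI):
#' env_past <- get_env_data(timeframe = 'past')
#' # Forecast environmental data (CMIP5 + site-mean NDVI) for 2014-2050:
#' env_future <- get_env_data(timeframe = 'future')
#' }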
get_env_data <- function(timeframe = 'past', ndvi_forecast_source_years=2000:2013,
future_data_years=2014:2050){
ndvi_data_raw <- get_bbs_gimms_ndvi()
#Offset the NDVI year by 6 months so that the yearly window will be July 1 - June 30.
#See https://github.com/weecology/bbs-forecasting/issues/114
ndvi_data_raw$year = with(ndvi_data_raw, ifelse(month %in% c('jul','aug','sep','oct','nov','dec'), year+1, year))
#Seasonal NDVI means: ndvi_sum (summer), ndvi_win (winter), ndvi_ann (annual)
ndvi_data_summer <- ndvi_data_raw %>%
filter(!is.na(ndvi), month %in% c('apr', 'may', 'jun'), year > 1981) %>%
group_by(site_id, year) %>%
summarise(ndvi_sum = mean(ndvi)) %>%
ungroup()
ndvi_data_winter <- ndvi_data_raw %>%
filter(!is.na(ndvi), month %in% c('dec', 'jan', 'feb'), year > 1981) %>%
group_by(site_id, year) %>%
summarise(ndvi_win = mean(ndvi)) %>%
ungroup()
ndvi_data_ann <- ndvi_data_raw %>%
filter(!is.na(ndvi), year > 1981) %>%
group_by(site_id, year) %>%
summarise(ndvi_ann = mean(ndvi)) %>%
ungroup()
ndvi_data = inner_join(ndvi_data_summer, ndvi_data_winter, by=c('site_id','year')) %>%
inner_join(ndvi_data_ann, by=c('site_id','year'))
#If projecting forward, use NDVI averages for future NDVI values, and use
#CMIP5 data for bioclim values
if(timeframe == 'future') {
ndvi_long_term_averages = ndvi_data %>%
filter(year %in% ndvi_forecast_source_years) %>%
gather(season,value, -site_id, -year) %>%
group_by(site_id, season) %>%
summarise(value = mean(value)) %>%
ungroup()
all_sites = unique(ndvi_data$site_id)
ndvi_data = expand.grid(site_id = all_sites, year=future_data_years) %>%
left_join(ndvi_long_term_averages, by='site_id') %>%
spread(season, value)
bioclim_data = get_bioclim_data(source='cmip5') %>%
filter(year>=2014)
} else if(timeframe=='past') {
bioclim_data = get_bioclim_data(source='prism')
} else {
stop(paste0('timeframe unknown: ',timeframe))
}
elev_data <- get_elev_data()
env_data <- full_join(bioclim_data, ndvi_data, by = c('site_id', 'year')) %>%
full_join(elev_data, by = c('site_id'))
save_provenance(env_data)
return(env_data)
}
#' Get BBS population time-series data with environmental variables
#'
#' Selects data from the subset of sites that were visited in at least
#' `min_year_percentage` percent of years in the training set, and from
#' all years between `start_yr` and `end_yr`.
#'
#' Also attaches associated environmental data
#'
#' @param start_yr num first year of time-series
#' @param end_yr num last year of time-series
#' @param last_train_year num last year of the time-series' training portion
#' @param min_year_percentage num percent of years of non-missing data between
#' start_yr & last_train_year
#'
#' @return dataframe with site_id, lat, long, year, species_id, and abundance
#' @export
#' @importFrom dplyr "%>%" filter select
get_pop_ts_env_data <- function(start_yr, end_yr, last_train_year,
min_year_percentage){
pop_ts_env_data = get_bbs_data() %>%
filter_ts(start_yr, end_yr, last_train_year, min_year_percentage) %>%
complete(site_id, year) %>%
add_env_data() %>%
filter(!is.na(bio1), !is.na(ndvi_sum), !is.na(elevs)) %>%
#Filter min_num_years again after accounting for missing environmental data
filter_ts(start_yr, end_yr, last_train_year, min_year_percentage) %>%
ungroup() %>%
add_observers()
save_provenance(pop_ts_env_data)
return(pop_ts_env_data)
}
#' Get BBS richness time-series data with environmental variables. Will
#' fill in NA values for richness in missing years as long as a site
#' meets the year requirements and has environmental data.
#'
#' @param start_yr num first year of time-series
#' @param end_yr num last year of time-series
#' @param last_train_year num last year of time series' training portion
#' @param min_year_percentage num minimum number of years of data between start_yr & end_yr
#'
#' @return dataframe with site_id, year, and richness
#' @export
#' @importFrom dplyr "%>%" left_join select distinct group_by summarise ungroup filter
#' @importFrom tidyr complete
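#' @examples
#' \dontrun{
#' # Same arguments as get_pop_ts_env_data (values hypothetical):
#' richness_ts <- get_richness_ts_env_data(start_yr = 1982, end_yr = 2013,
#'   last_train_year = 2003, min_year_percentage = 50)
#' }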
get_richness_ts_env_data <- function(start_yr, end_yr, last_train_year,
min_year_percentage){
richness_ts_env_data = get_pop_ts_env_data(start_yr, end_yr, last_train_year,
min_year_percentage) %>%
collapse_to_richness()
save_provenance(richness_ts_env_data)
return(richness_ts_env_data)
}
#' Replace species-level information with richness values, eliminating redundant rows
#' @param df a data frame, such as produced by get_bbs_data or get_pop_ts_env_data
#' @export
#' @importFrom dplyr n group_by mutate select ungroup distinct right_join
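#' @examples
#' \dontrun{
#' # Toy example: two species at one site collapse to a richness of 2.
#' df <- data.frame(site_id = 1, year = 2000,
#'   species_id = c(10, 20), abundance = c(3, 1))
#' collapse_to_richness(df)
#' }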
collapse_to_richness = function(df){
# Code below assumes that NA abundance always means no observations for the
# whole transect
stopifnot(all(is.na(df$abundance) == is.na(df$species_id)))
df %>%
group_by(site_id, year) %>%
mutate(richness = sum(abundance > 0)) %>%
select(-species_id, -abundance) %>%
ungroup() %>%
distinct() %>%
right_join(distinct(select(df, -species_id, -abundance)))
}
#' Add environmental data to BBS data frame
#'
#' @param bbs_data dataframe that contains BBS site_id and year columns
#'
#' @return dataframe with original data and associated environmental data
#' @importFrom dplyr "%>%" inner_join
add_env_data <- function(bbs_data){
env_data <- get_env_data()
bbs_data %>%
inner_join(env_data, by = c("site_id", "year"), copy = TRUE)
}
#' Add observer IDs to data
#' @param bbs_data dataframe that contains BBS site_id and year columns
#' @importFrom dplyr %>% left_join
#' @importFrom readr read_csv
add_observers = function(bbs_data) {
data_path = "data/bbs_observers.csv"
if (file.exists(data_path)) {
observer_info = read_csv(data_path)
} else {
observer_query = '
SELECT
breed_bird_survey_weather.obsn as observer_id,
year,
(statenum*1000)+route as site_id
FROM
breed_bird_survey_weather
WHERE
runtype=1 AND rpid=101'
observer_info = db_engine(action = 'read', sql_query = observer_query)
write.csv(observer_info, file = data_path, row.names = FALSE, quote = FALSE)
}
bbs_data %>%
left_join(observer_info, by = c('site_id','year'))
}
#' Filter BBS to specified time series period and number of samples
#'
#' @param bbs_data dataframe that contains BBS site_id and year columns
#' @param start_yr num first year of time-series
#' @param end_yr num last year of time-series
#' @param last_train_year last year of the time-series' training portion
#' @param min_year_percentage num minimum percentage of years between
#' start_yr & last_train_year that have data
#'
#' @return dataframe filtered to the specified years and to sites that meet
#' the minimum sampling requirement
#' @importFrom dplyr "%>%" filter group_by summarise ungroup
filter_ts <- function(bbs_data, start_yr, end_yr, last_train_year,
min_year_percentage){
min_num_years = (last_train_year - start_yr + 1) * min_year_percentage / 100
sites_to_keep = bbs_data %>%
filter(year >= start_yr, year <= last_train_year) %>%
group_by(site_id) %>%
summarise(num_years=length(unique(year))) %>%
ungroup() %>%
filter(num_years >= min_num_years)
bbs_data %>%
filter(year >= start_yr, year <= end_yr) %>%
filter(site_id %in% sites_to_keep$site_id)
}
#' Get the longest run of contiguous years from a site's time-series
#'
#' @param df data.frame including a year column
get_longest_contig_ts <- function(df){
full_ts <- seq(from=min(df$year), to=max(df$year), by=1)
years <- unique(df$year)
#na.contiguous finds the longest stretch of years with no gaps
contig_pos <- na.contiguous(match(full_ts, years))
filter(df, year %in% years[contig_pos])
}
#' Filter time-series to sites whose longest contiguous run of years
#' meets a minimum length
#'
#' @param df data.frame including site_id and year columns
#' @param min_ts_length num minimum number of contiguous years required
get_filtered_ts <- function(df, min_ts_length){
data_by_site <- group_by(df, site_id)
contig_ts <- do(data_by_site, get_longest_contig_ts(.))
contig_ts_by_site <- group_by(contig_ts, site_id)
contig_ts_length <- dplyr::summarize(contig_ts_by_site, n_years = n_distinct(year))
long_ts <- filter(contig_ts_length, n_years >= min_ts_length)
semi_join(contig_ts, long_ts, by = 'site_id')
}
#' Get data for all contiguous samples ending in a given year
#'
#' Get data for all contiguous samples ending in a given year.
#' If only one year of contiguous data are available keep that year.
#'
#' @param df data.frame including a year column
#' @param stopyear year of the most recent sample
#'
#' @return Contiguous time-series data frame (data.frame).
#' A version of the original data frame only including years that
#' are in a contiguous series with stopyear.
get_modern_contig_ts <- function(df, stopyear) {
#Sketch based on the description above: walk back from stopyear, keeping
#every year that is part of an unbroken run ending at stopyear. If only
#stopyear itself is available, just that year is kept.
years <- unique(df$year)
contig_years <- stopyear
while ((min(contig_years) - 1) %in% years) {
contig_years <- c(min(contig_years) - 1, contig_years)
}
filter(df, year %in% contig_years)
}
#' Add zeros to population dynamics data
#'
#' For population dynamics data that lacks real zeros, add them
#'
#' @param df data.frame including site_id, year, species_id, and abundance columns
#'
#' @return Data frame with real zeros added
get_popdyn_data_w_zeros <- function(commdyn_data, start_year, stop_year) {
sp_years <- expand.grid(site_id = unique(commdyn_data$site_id), year = start_year:stop_year, species_id = unique(commdyn_data$species_id))
popdyn_data <- commdyn_data %>%
do(left_join(sp_years, ., by=c("site_id", "year", "species_id")))
popdyn_data[is.na(popdyn_data)] <- 0
return(popdyn_data)
}
#' Perform a suite of time-series only forecasts
#'
#' Performs time-series only forecasting using naive, average, and ARIMA models
#' for each group in a grouped data frame of time-series.
#'
#' @param grouped_tsdata data.frame is a grouped data frame where the group
#' indicates how to split the data frame for running time-series models on each
#' group
#' @param timecol str name of the column that includes the time variable
#' @param responsecol str name of the column that includes the response variable
#' @param exogcols str names of the columns that include exogenous predictors
#' for the exogenous ARIMA model
#' @param lag numeric how many time-steps to forecast and also how many
#' time-steps in the current time-series to ignore when fitting.
#' @param pred_int_levels numeric vector of prediction interval levels
#'
#' @return data.frame
#'
#' TODO: separate time-steps to hold out from fitting from time-steps to
#' forecast, since when not hindcasting these may be different
#' @importFrom forecast auto.arima ets forecast meanf naive
#' @importFrom dplyr do
#' @export
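#' @examples
#' \dontrun{
#' # Usage sketch: forecast the last year of each site's richness
#' # time-series (column names follow get_richness_ts_env_data output).
#' richness_ts <- get_richness_ts_env_data(1982, 2013, 2003, 50)
#' fcasts <- get_ts_forecasts(dplyr::group_by(richness_ts, site_id),
#'   timecol = 'year', responsecol = 'richness',
#'   exogcols = c('bio1', 'ndvi_sum'), lag = 1)
#' }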
get_ts_forecasts <- function(grouped_tsdata, timecol, responsecol, exogcols,
lag = 1, pred_int_levels = c(80, 95)){
get_train_data <- function(df, colnames) as.matrix(df[, colnames])[1:(nrow(df) - lag),]
get_test_data <- function(df, colnames) as.matrix(df[, colnames])[(nrow(df) - lag + 1):nrow(df),]
tsmodel_forecasts <- do(grouped_tsdata,
timeperiod = get_test_data(., timecol),
cast_naive = naive(get_train_data(., responsecol), lag, level = pred_int_levels),
cast_avg = meanf(get_train_data(., responsecol), lag, level = pred_int_levels),
cast_arima = forecast(auto.arima(get_train_data(., responsecol), seasonal = FALSE), h = lag, level = pred_int_levels),
cast_exog_arima = forecast(auto.arima(get_train_data(., responsecol), xreg = get_train_data(., exogcols), seasonal = FALSE), xreg = get_test_data(., exogcols), level = pred_int_levels),
cast_ets = forecast(ets(get_train_data(., responsecol)), lag, level = pred_int_levels),
test_set = get_test_data(., responsecol)
)
cleaned_ts_model_forecasts <- cleanup_ts_forecasts(tsmodel_forecasts)
ts_fcasts <- restruct_ts_forecasts(cleaned_ts_model_forecasts)
save_provenance(ts_fcasts)
return(ts_fcasts)
}
#' Cleanup time-series forecast output
#'
#' Transforms a data frame with one row per site and columns whose cells
#' contain the full forecast output of a number of different models,
#' into one where each row contains the point forecast and forecast intervals
#' for a single site, time period, and model combination.
#'
#' @param tsmodel_forecasts dataframe with site_id, timeperiod, cast_naive,
#' cast_avg, cast_arima, cast_exog_arima, cast_ets, and test_set columns.
#' Each cast_ column contains the full forecast object for each year of
#' the forecast
#'
#' @return data.frame
cleanup_ts_forecasts <- function(tsmodel_forecasts){
tsmodel_forecasts %>%
do(
{timeperiod <- as.data.frame(.$timeperiod)
colnames(timeperiod) <- c('timeperiod')
site <- .$site
obs <- .$test_set
#Flatten one forecast object into a dataframe of point forecasts and
#prediction intervals, labeled with its model name
flatten_cast <- function(cast, model_name){
out <- cbind(timeperiod, as.data.frame(cast))
out$model <- model_name
out$site_id <- site
out$obs <- obs
out
}
df <- rbind(flatten_cast(.$cast_naive, 'naive'),
flatten_cast(.$cast_avg, 'avg'),
flatten_cast(.$cast_arima, 'arima'),
flatten_cast(.$cast_exog_arima, 'exog_arima'),
flatten_cast(.$cast_ets, 'ets'))
df %>% dplyr::select(site_id, model, timeperiod, obs, everything())
}
)
}
#' Split time-series forecasts into point forecast and prediction interval dfs
#'
#' @param cleaned_ts_model_forecasts dataframe with site_id, model, timeperiod,
#' obs, pt_fcast, and Lo and Hi columns for each level of prediction interval
#'
#' @return list of data.frames. The first value is the pt_fcast part of the
#' input data frame. The second value is a long version of the full data.frame
#' with columns for the level and interval characterizing each prediction
#' interval
restruct_ts_forecasts <- function(cleaned_ts_model_fcasts){
cleaned_ts_model_fcasts <- dplyr::rename(cleaned_ts_model_fcasts, pt_fcast = `Point Forecast`)
ts_pt_fcasts <- dplyr::select(cleaned_ts_model_fcasts, site_id:pt_fcast)
ts_fcasts_pred_int <- cleaned_ts_model_fcasts %>%
tidyr::gather(level, interval_edge, -site_id, -model, -timeperiod, -obs, -pt_fcast) %>%
tidyr::separate(level, into = c("hilo", "levels")) %>%
dplyr::group_by(site_id, model, timeperiod, obs, pt_fcast, levels) %>%
dplyr::summarize(lo = min(interval_edge), hi = max(interval_edge))
ts_fcasts_pred_int$levels <- as.numeric(ts_fcasts_pred_int$levels)
ts_fcasts <- list(pt_est = ts_pt_fcasts, intervals = ts_fcasts_pred_int)
save_provenance(ts_fcasts)
return(ts_fcasts)
}
#' Compute standard error measures for a set of forecasts
#'
#' @param obs numeric vector of observed values
#' @param pred numeric vector of predicted values
#'
#' @return dataframe with ME, RMSE, MAE, MPE, and MAPE columns
#' @export
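#' @examples
#' \dontrun{
#' # Toy example: predictions one unit high give ME = -1, RMSE = 1,
#' # MAE = 1, MPE = -7.5, and MAPE = 7.5.
#' get_error_measures(obs = c(10, 20), pred = c(11, 21))
#' }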
get_error_measures <- function(obs, pred){
error <- obs - pred
percenterror <- error / obs * 100
me <- mean(error, na.rm=TRUE)
rmse <- sqrt(mean(error^2, na.rm=TRUE))
mae <- mean(abs(error), na.rm=TRUE)
mpe <- mean(percenterror, na.rm=TRUE)
mape <- mean(abs(percenterror), na.rm=TRUE)
results <- data.frame(t(unlist(c(me, rmse, mae, mpe, mape))))
colnames(results) <- c('ME', 'RMSE', 'MAE', 'MPE', 'MAPE')
save_provenance(results)
return(results)
}
#' Visualize a forecast
#'
#' Visualizes a forecast for a single focal site from a group of forecasts
#'
#' @param data dataframe of observed richness with site_id and year columns
#' @param forecasts dataframe of point forecasts with site_id, timeperiod,
#' pt_fcast, and model columns
#' @param focal_site site_id of the site to plot
#' @importFrom dplyr filter
#' @importFrom ggplot2 ggplot aes geom_point geom_line scale_color_brewer labs
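#' @examples
#' \dontrun{
#' # Usage sketch (the focal site_id value is hypothetical):
#' viz_forecast(data = richness_ts, forecasts = ts_fcasts$pt_est,
#'   focal_site = 2001)
#' }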
viz_forecast <- function(data, forecasts, focal_site){
forecasts_focal <- filter(forecasts, site_id == focal_site)
train_set <- filter(data, site_id == focal_site, year < min(forecasts$timeperiod))
test_set <- filter(data, site_id == focal_site, year >= min(forecasts$timeperiod))
examp_fcasts <- ggplot(train_set, aes(year, richness)) +
geom_point(size = 2) +
geom_line(size = 0.75) +
geom_point(data = test_set, size = 2) +
geom_line(data = test_set, linetype = "dashed", size = 0.75) +
geom_line(data = forecasts_focal,
aes(x = timeperiod, y = pt_fcast, color = model),
size = 1) +
scale_color_brewer(palette = "Set3") +
labs(x = "Year", y = "Diversity")
examp_fcasts
}