vignette1.R
In BioTIMEr: Tools to Use and Explore the 'BioTIME' Database

## ----setup, include=FALSE-----------------------------------------------------
library(knitr)
opts_chunk$set(echo = TRUE)

library(vegan)
library(dplyr)
library(tidyr)
library(ggplot2)
library(dggridR)

set.seed(19)

## ----bt_package_loading, warning=FALSE, message=FALSE, eval=TRUE--------------
# Install and load the latest version of BioTIMEr
library(BioTIMEr)

## ----data_description, warning=FALSE, message=FALSE, eval=TRUE, include=TRUE----
# you can run the following commands to retrieve more information about the subsets.
?BTsubset_meta
?BTsubset_data

## ----package info, warning=FALSE, message=FALSE, eval=TRUE, include=TRUE------
# you can also see a full list of BioTIMEr functions and help pages by:
??BioTIMEr

## ----gridding_ex, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE----
grid_samples <- gridding(meta = BTsubset_meta, btf = BTsubset_data, res = 12, resByData = FALSE)

# Get a look at the output
# grid_samples |> head() |> kable()

## ----gridding_ex_plot, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE----
check_res_1 <- grid_samples |>
  group_by(STUDY_ID, StudyMethod) |>
  summarise(n_cell = n_distinct(cell), n_aID = n_distinct(assemblageID), res = "res12")

check_res_1 |> head(10) |> kable()

# How many samples were there in each study?
ggplot(data = check_res_1) +
  geom_bar(mapping = aes(x = as.character(STUDY_ID), y = n_aID, fill = res),
           stat = "identity") +
  scale_fill_discrete(type = c("#155f49")) +
  xlab("StudyID") + ylab("Number of assemblages in a study") +
  theme_bw() +
  theme(legend.position = "none")


## ----gridding_ex2, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE,results='hold'----
# define an alternative resolution of 14 (~10.7 km2 cells)
grid_samples_14 <- gridding(meta = BTsubset_meta, btf = BTsubset_data, res = 14, resByData = FALSE)

# allow the spatial extent of the data to define the resolution
grid_samples_auto <- gridding(meta = BTsubset_meta, btf = BTsubset_data, res = 12, resByData = TRUE)
# this option also returns a message with the automatically picked resolution:

## ----gridding_ex3, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE----
check_res_2 <- grid_samples_14 |>
  group_by(StudyMethod, STUDY_ID) |>
  summarise(n_cell = n_distinct(cell), n_aID = n_distinct(assemblageID), res = "res14")

check_res_3 <- grid_samples_auto |>
  group_by(StudyMethod, STUDY_ID) |>
  summarise(n_cell = n_distinct(cell), n_aID = n_distinct(assemblageID), res = "res15")

checks <- rbind(check_res_1, check_res_2, check_res_3)

ggplot(data = checks) +
  geom_bar(mapping = aes(x = as.character(STUDY_ID), y = n_aID, fill = res),
           stat = "identity", position = "dodge") +
  scale_fill_discrete(type = c("#155f49","#66c1d1","#d9d956")) +
  xlab("StudyID") + ylab("Number of assemblages in a study") +
  theme_bw()

## ----gridding_map, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'----
## The following example is built on the demonstrations in
## https://cran.r-project.org/web/packages/dggridR/vignettes/dggridR.html.

# First we build the ~96 km2 global grid
dgg_12 <- dggridR::dgconstruct(res = 12)

# To simplify, we only map the grid cell boundaries for cells which
# have observations.
# NOTE: if you are working with the full BioTIME database, this step may take some time.
grid_12 <- dggridR::dgcellstogrid(dgg_12, grid_samples$cell)

# Now let's follow the same steps and build a ~10.7 km2 global grid:
dgg_14 <- dggridR::dgconstruct(res = 14)
grid_14 <- dggridR::dgcellstogrid(dgg_14, grid_samples_14$cell)

# And we get some polygons for each country of the world, to create a background:
countries <- ggplot2::map_data("world")

# Now you could map the whole world, but let's just zoom in the UK and have a look at
# STUDY 466 (Marine Fish):
map_uk_locations <- ggplot() +
  geom_polygon(data = countries, aes(x = long, y = lat, group = group), fill = NA, color = "grey") +
  geom_sf(data = grid_12, aes(), color = alpha("blue", 0.4)) +
  coord_sf(xlim = c(-20, 10), ylim = c(50, 60)) +
  geom_rect(aes(xmin = -11, xmax = -0.7, ymin = 57.2, ymax = 59), colour = "red", fill = NA) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(text = element_text(size = 8))

zoom_in_map <- ggplot() +
  geom_polygon(data = countries, aes(x = long, y = lat, group = group), fill = NA,
               color = "grey") +
  geom_sf(data = grid_12, aes(), color = alpha("blue", 0.4)) +
  geom_sf(data = grid_14, aes(), color = alpha("red", 0.4)) +
  coord_sf(xlim = c(-11, -0.7), ylim = c(57.2, 59)) +
  theme_bw()

grid::grid.newpage()
main <- grid::viewport(width = 1, height = 1, x = 0.5, y = 0.5)  # the main map
inset <- grid::viewport(width = 0.4, height = 0.4, x = 0.82, y = 0.45)  # the inset in bottom left

# The resulting distribution of different size cells appears as follows:
print(zoom_in_map, vp = main)
print(map_uk_locations, vp = inset)

## ----resampling_ex, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE----
# First, if you are not sure you need this step,
# you can always check how many samples there are in every year of the different time series:
check_samples <- grid_samples |>
  group_by(STUDY_ID, assemblageID, YEAR) |>
  summarise(n_samples = n_distinct(SAMPLE_DESC), n_species = n_distinct(Species))

check_samples |> head(10) |> kable()


## ----resampling_ex1, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE----
# Let's apply resampling() then, using the data frame of the gridded data:
grid_rare <- resampling(x = grid_samples, measure = "ABUNDANCE",
                       resamps = 1, conservative = FALSE)

## ----tests1, cache=TRUE, echo=TRUE, message=FALSE, tidy=TRUE, include=FALSE----
#let's apply it then:

#g rid_samples_temp <- subset(grid_samples, !grid_samples$BIOMASS==0) #to be deleted after Faye reviews the data and makes all 0=NAs
# grid_rare <- resampling( x= grid_samples_test, measure ="BIOMASS", resamps = 1, conservative = FALSE)

## ----resampling_ex12, cache=TRUE, echo=TRUE, message=FALSE, tidy=TRUE, include=TRUE----
# Keep only observations with both abundance and biomass
grid_rare_ab <- resampling(x = grid_samples, measure = c("ABUNDANCE", "BIOMASS"),
                           resamps = 1, conservative = FALSE)

# Keep only sampling events where all observations within had both abundance and biomass to start with
grid_rare_abT <- resampling(x = grid_samples, measure = c("ABUNDANCE", "BIOMASS"),
                            resamps = 1, conservative = TRUE)

## ----resampling_ex3, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'----
# What is the number of samples in the year with the lowest sampling effort?
ggplot(data = check_samples[check_samples$assemblageID == "18_335699",], aes(x = YEAR, y = n_samples)) +
  geom_col(aes(x = YEAR, y = n_samples), fill = "red", alpha = 0.5) +
  geom_segment(aes(x = 1926, y = min(n_samples) + 3,
                   xend = 1927, yend = min(n_samples)),
               arrow = arrow(length = unit(0.2, "cm"))) +
  xlab("Year") + ylab("Number of samples") +
  theme_bw()

# In this case,the year 1927 had the lowest sampling effort, with 3 samples (arrow).

## ----resampling_ex4, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'----
# Let's implement the sample-based rarefaction by resampling the dataset 10 times.
grid_rare_n10 <- resampling(x = grid_samples, measure = "ABUNDANCE", resamps = 10,
                            conservative = FALSE)

# Note that you may want to resample many more times (e.g. at least 30-100+ times, but up to 199
# if e.g. working with the whole BioTIME data), depending on how many iterations you want a
# subsequent bootstrap analysis to have.
# This may also take some computation time, so if you are working with a big subset of BioTIME
# is advisable to break it down in smaller subsets.

# Each resampling iteration will be identified as resamp = 1:n, in this case 1:10.
# Now we can check if there are differences across the first 3 of these iterations:

check_resamps <- grid_rare_n10[grid_rare_n10$resamp < 4,] |>
  group_by(STUDY_ID, assemblageID, resamp) |>
  summarise(n_obs = n(), n_species = n_distinct(Species), n_year = n_distinct(YEAR))

check_resamps |> head(10) |> kable()

## ----metrics, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE, figures='hold'----
# Get alpha metrics estimates:
alpha_metrics <- getAlphaMetrics(x = grid_rare, measure = "ABUNDANCE")
# see also help("getAlphaMetrics") for more details on the metrics

# Have a quick look at the output
alpha_metrics |> head(6) |> kable()

# Get beta metrics estimates:
beta_metrics <- getBetaMetrics(x = grid_rare, measure = "ABUNDANCE")
#see also help("getBetaMetrics") for more details on the metrics

# Have a quick look at the output
beta_metrics |> head(6) |> kable()
# NOTE the functions used the rarefied data with only one resampling iteration

## ----tests2, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=FALSE, figures='hold'----
# # Get alpha metrics estimates:
# alpha_metrics <- getAlphaMetrics(x = grid_rare_ab, measure = "BIOMASS")
# #see also help("getAlphaMetrics") for more details on the metrics
#
# #Have a quick look at the output
# alpha_metrics |> head(6) |> kable()
#
# # Get beta metrics estimates:
# beta_metrics <- getBetaMetrics(x = grid_rare_ab, measure = "BIOMASS")
# #see also help("getBetaMetrics") for more details on the metrics
#
# #Have a quick look at the output
# beta_metrics |> head(6) |> kable()

## ----trends, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'----
# Let's apply it then:
alpha_slopes <- getLinearRegressions(x = alpha_metrics, divType = "alpha",
                                     pThreshold = 0.05) #for alpha metrics


# Have a quick look at the output
alpha_slopes |> head(6) |> kable()

beta_slopes <- getLinearRegressions(x = beta_metrics, divType = "beta",
                                    pThreshold = 0.05)  #for beta metrics

# Have a quick look at the output
# beta_slopes |> head(6) |> kable()

## ----trends2, cache=TRUE, echo=TRUE, message=FALSE, fig.width=10, fig.height=7,tidy=FALSE, include=TRUE----
# First, how many assemblages in our dataset show a moderate evidence (P < 0.05) of change in alpha diversity?
check_alpha_trend <- alpha_slopes |>
  group_by(metric) |>
  filter(significance == 1) |>   # or use filter(pvalue<0.05)
  summarise(n_sig = n_distinct(assemblageID), mean(slope))

check_alpha_trend |> kable()

# We can see that only a few (<40) of the assemblage time series actually show a significant
# trend of change over time, independently of the metric used. This indicates that in most
# time series in the studies we analysed alpha diversity is not really changing through time.

## ----trends3, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=5,tidy=FALSE, include=TRUE,results='hold'----

# Get a slope per assemblageID and metric
alpha_slopes_simp <- alpha_slopes |>
  group_by(assemblageID, metric, pvalue) |>
  filter(significance == 1) |>   #select only the assemblages with significant trends
  summarise(slope = unique(slope))

# Calculate the mean slope and CIs
stats_alpha <- alpha_slopes_simp |>
  group_by(metric) |>
  summarise(mean_slope = mean(slope), #mean
            ci_slope = qt(0.95, df = length(slope) - 1) * (sd(slope, na.rm = TRUE) / sqrt(length(slope)))) #margin of error

# Let's put it all together
ggplot(data = alpha_slopes_simp) +
  geom_histogram(aes(x = slope, fill = metric), bins = 25) +
  #geom_density(alpha=0.5)+ #in case you what a density plot instead
  geom_vline(aes(xintercept = 0), linetype = 3, colour = "grey",linewidth = 0.5) + #add slope=0 line
  geom_vline(data = stats_alpha, aes(xintercept = mean_slope), linetype = 1,
             linewidth = 0.5,colour = "black") + #mean
  geom_vline(data = stats_alpha, aes(xintercept = mean_slope - ci_slope),
             linetype = 2, linewidth = 0.5, colour = "black") + #lower confidence interval
  geom_vline(data = stats_alpha, aes(xintercept = mean_slope + ci_slope),
             linetype = 2, linewidth = 0.5, colour = "black") + #upper confidence interval
  facet_wrap(~metric, scales = "free") +
  scale_fill_biotime() +  #using the customize BioTIME colour scale. See help("scale_color_biotime") for more options.
  ggtitle("Alpha diversity change") +
  theme_bw() +
  theme(legend.position = "none",plot.title = element_text(size = 11, hjust = 0.5))


## ----trends4, cache=TRUE, echo=FALSE, message=FALSE, fig.width=7.2, fig.height=1.9,tidy=FALSE, include=TRUE,results='hold'----

# Get a slope per assemblageID and metric
beta_slopes_simp <- beta_slopes |>
  group_by(assemblageID, metric, pvalue) |>
  filter(significance == 1) |>     #select only the assemblages with significant trends
  summarise(slope = unique(slope))

# Calculate the mean slope and CIs
stats_beta <- beta_slopes_simp |>
  group_by(metric) |>
  summarise(mean_slope = mean(slope), #mean
            ci_slope = qt(0.95, df = length(slope) - 1) * (sd(slope, na.rm  = TRUE) / sqrt(length(slope)))) #margin of error

#Let's put it all together
ggplot(data = beta_slopes_simp) +
  geom_histogram(aes(x = slope, fill = metric), bins = 25) +
  #geom_density(alpha=0.5)+ #in case you what a density plot instead
  geom_vline(aes(xintercept = 0), linetype = 3, colour = "grey", linewidth = 0.5) + #add slope=0 line
  geom_vline(data = stats_beta, aes(xintercept = mean_slope), linetype = 1,
             linewidth = 0.5, colour = "black") + #mean
  geom_vline(data = stats_beta, aes(xintercept = mean_slope - ci_slope),
             linetype = 2, linewidth = 0.5, colour = "black") + #lower confidence interval
  geom_vline(data = stats_beta, aes(xintercept = mean_slope + ci_slope),
             linetype = 2, linewidth = 0.5, colour = "black") + #upper confidence interval
  facet_wrap(~metric, scales = "free") +
  scale_fill_biotime() +  #using the customize BioTIME colour scale. See help("scale_color_biotime") for more options.
  ggtitle("Beta diversity change") +
  theme_bw() +
  theme(legend.position = "none",plot.title = element_text(size = 11, hjust = 0.5))


## ----trends5, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=1.9,tidy=FALSE, include=TRUE,results='hold'----
# Hint: If you wish to plot all metrics together, simply merge you alpha_slopes and
# beta_slopes dataframes beforehand.

## ----trends6, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=2.3,tidy=FALSE, include=TRUE,results='hold'----
# First, we need to check the meta file and retrieve the information for the studies of interest
#head(BTsubset_meta)
meta <- select(BTsubset_meta, STUDY_ID, TAXA, REALM, CLIMATE)

# Get a slope per assemblageID and metric
alpha_slopes_simp <- alpha_slopes |>
  group_by(assemblageID, metric, 'pvalue', significance) |>
  summarise(slope = unique(slope))

# Get back the Study ID by separating our assemblageID column into multiple other columns.
alpha_slopes_simp <- alpha_slopes_simp |>
  separate_wider_delim(
    cols = assemblageID,
    names = c("STUDY_ID", "cell"),
    delim = "_",
    cols_remove = FALSE
  ) |>
  as.data.frame()

# Merge it all
alpha_slopes_meta <- merge(alpha_slopes_simp, meta, by = "STUDY_ID")

# Select relevant data for plotting
alpha_slopes_set1 <- subset(
  alpha_slopes_meta,
  alpha_slopes_meta$metric == "Shannon" &
    (alpha_slopes_meta$TAXA == "Fish" | alpha_slopes_meta$TAXA == "Birds")
)

# Let's put it all together
ggplot(data = alpha_slopes_set1, aes(x = slope, fill = TAXA)) +
  geom_histogram(fill = "grey", bins = 25) + #all assemblages
  geom_histogram(
    data = alpha_slopes_set1[alpha_slopes_set1$significance == 1, ],
    bins = 25
  ) + #only significant change assemblages
  geom_vline(
    aes(xintercept = 0),
    linetype = 3,
    colour = "black",
    linewidth = 0.5
  ) + #add slope=0 line
  facet_wrap(~TAXA, scales = "free") +
  scale_fill_biotime() +
  ggtitle("Shannon-Weiner Species Diversity Index") +
  theme_bw() +
  theme(plot.title = element_text(size = 11, hjust = 0.5))


## ----trends7, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=1.9, include=TRUE,results='hold'----
# Now we see the full distribution of slopes of change for the Shannon index for the birds
# and fish time series in our subset: all slopes are shown in grey, while in color we show
# the subset of assemblages for which evidence (P < 0.05) of change was detected.

## ----trends8, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE,fig.width=7.2, fig.height=5, eval=TRUE,results='hold', figures='hold'----

# Let's go back to the data frame with the yearly values and select our relevant data for plotting
alpha_metrics_set <- subset(alpha_metrics, assemblageID == "18_335699")

# Transform data
alpha_metrics_set_long <- pivot_longer(alpha_metrics_set,
                                   cols = c("S", "N", "maxN", "Shannon", "expShannon",
                                   "Simpson", "invSimpson", "PIE", "DomMc"),
                                   names_to = "metric",
                                   names_transform = as.factor)

# Get assemblage ID slope of change
alpha_slopes_set2 <- subset(alpha_slopes, assemblageID == "18_335699")

# Get assemblage ID
name_assemblage <- unique(alpha_slopes_set2$assemblageID)

# Merge info
alpha_metr_slop<- left_join(alpha_slopes_set2, alpha_metrics_set_long,
                                join_by(assemblageID, metric))

# Let's put it all together
ggplot(data = alpha_metr_slop, aes(x = YEAR, y = value)) +
    geom_point(colour = "#155f49", size = 1) +   #plot the yearly estimates
    stat_smooth(method = "lm", se = FALSE, linetype = 2, colour = "grey") + #draw all regression line
    stat_smooth(data = subset(alpha_metr_slop, alpha_metr_slop$significance == 1),
                aes(x = YEAR, y = value), method = "lm", se = FALSE,
                linetype = 2, colour = "#155f49") + #color only trends that are significant
    facet_wrap(~metric, scales = "free") +
    scale_fill_biotime() +
    ggtitle(paste("Assemblage", name_assemblage)) + ylab("Diversity") +
    theme_bw() + theme(plot.title = element_text(size = 11, hjust = 0.5))


## ----trends9, cache=TRUE, echo=TRUE, message=FALSE, fig.width=10, fig.height=7,tidy=FALSE, include=FALSE,results='hold', figures='hold'----
# Hint: If you want to draw the p-value on the plot, you can try using the #ggpmisc::stat_fit_glance which takes anything passed through lm() in R and
#allows it to processed and printed using ggplot2.

## ----citation, cache=TRUE, echo=FALSE, message=FALSE, fig.width=10, fig.height=7,tidy=FALSE, include=FALSE,results='hold', figures='hold'----
citation("BioTIMEr")
Any scripts or data that you put into this service are public.
BioTIMEr documentation built on June 8, 2025, 10:02 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
BioTIMEr
Tools to Use and Explore the 'BioTIME' Database

inst/doc/vignette1.R
In BioTIMEr: Tools to Use and Explore the 'BioTIME' Database

Try the BioTIMEr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BioTIMEr Tools to Use and Explore the 'BioTIME' Database

inst/doc/vignette1.R In BioTIMEr: Tools to Use and Explore the 'BioTIME' Database

Try the BioTIMEr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BioTIMEr
Tools to Use and Explore the 'BioTIME' Database

inst/doc/vignette1.R
In BioTIMEr: Tools to Use and Explore the 'BioTIME' Database