In sustainablefarms/sflddata: Data Extraction Methods for the Linking Data Project

Kassel stopped working on this document as it appeared that much study on the importance of variables has already been completed. Further analysis here seems superfluous at this point in time.

knitr::opts_chunk$set(echo = TRUE, out.width = "100%")

out <- lapply(c("sf", "tsibble", 'lubridate', "viridis",
                'ggplot2', 'tidyr', 'grid', 'gridExtra', 
                'feasts', 'dplyr', 'gtable', 'fable',
                'mgcv'),
       library, character.only = TRUE)
out <- lapply(paste0("./R/", list.files("./R/")), source)

Import Data

bird_richness <- readRDS("./private/data/clean/sws_bird_richness.rds") # contains all bird spp
birds <- readRDS("./private/data/clean/sws_birds.rds") # contains only common bird spp
stopifnot(all.equal(bird_richness[, names(birds)], birds))

sites_rs <- readRDS("./private/data/clean/sws_sites_rs.rds") %>%
    mutate(farm = substr(SiteCode, 1, 4),
         sitenum = factor(as.integer(substr(SiteCode, 5, 5))))
sites_rs$log_plus_one_woody_cover <- log(sites_rs$woody_cover + 1)
traits <- readRDS("./private/data/clean/sws_traits.rds")

obsa <- cbind(sites_rs, bird_richness)

# make binary
for(i in 1:ncol(birds)){
  birds[which(birds[, i] > 0), i] <- 1
}
for(i in 1:ncol(bird_richness)){
  bird_richness[which(bird_richness[, i] > 0), i] <- 1
}
obsb <- cbind(sites_rs, bird_richness)

Non-Graphical Summary of Traits

traits <- traits %>%
  mutate_at(c("family", "habitat", "diet", "substrate", "movements", "nest"),
            as.factor)
traits %>%  summary()

Visualise Abundance

obsa %>%
  #filter(farm == "BELL") %>%
  ggplot() +
  facet_wrap(vars(farm)) +
  geom_point(position = "jitter", #position_jitter(width = 1E6, height = 0),
    aes(x = SurveyDate, y = `Australian Magpie`)
             ) +
  scale_y_log10() +
  ggtitle("Australian Magpie Numbers Observed")

Numbers of detected Austaralian Magpie appears to have roughly constant density.

obsa %>%
  #filter(farm == "BELL") %>%
  ggplot() +
  facet_wrap(vars(farm)) +
  geom_point(position = "jitter", #position_jitter(width = 1E6, height = 0),
    aes(x = SurveyDate, y = `Apostlebird`)
             ) +
  scale_y_log10() +
  ggtitle("Apostlebird Numbers Observed")

obsa %>%
  #filter(farm == "BELL") %>%
  ggplot() +
  # facet_wrap(vars(farm)) +
  geom_point(aes(x = farm, y = `Apostlebird`, col = year(SurveyDate))
             ) +
  scale_color_viridis(name = "Survey Year") +
  scale_y_log10() +
  ggtitle("Apostlebird Numbers Observed")

That the same farms have a variety of colours suggests that the things that are changing in time are not an enormous factor on Apostlebirds.

p <- obsa %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "numbers") %>%
  #filter(farm == "BELL") %>%
  ggplot() +
  # facet_wrap(vars(species)) +
  # ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13) +
  geom_point(position = position_jitter(height = 0.2),
    aes(x = farm, y = numbers, col = year(SurveyDate))
             ) +
  scale_color_viridis(name = "Survey Year") +
  scale_y_log10() +
  theme(strip.text.x = element_text(size = 8)) +
  ggtitle("Observed Bird Numbers")
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 1)
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 2)
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 3)

Looks like the Noisy Friarbird was widely around in early years and later years, but not years around 2010. There are a number of species that were only observed much less in later years (some of this could be confounded with any farms/sites that dropped out of the study).

Visualise Presence/Absence Binary Simplification

obsb %>%
  ggplot() +
  facet_wrap(vars(farm)) +
  geom_point(position = position_jitter(width = 0.5, height = 0.05),
    aes(x = SurveyDate, y = `Australian Magpie`)
             ) +
  # theme(axis.text.x = element_text(angle = 90)) +
  theme(strip.text.x = element_text(size = 4)) +
  ggtitle("Australian Magpie Presence Absence")

obsb %>%
  ggplot() +
  geom_point(position = position_jitter(width = 0.5, height = 0.05),
    aes(x = SurveyDate, y = `Australian Magpie`)
             ) +
  # theme(axis.text.x = element_text(angle = 90)) +
  theme(strip.text.x = element_text(size = 4)) +
  ggtitle("Australian Magpie Presence Absence")
obsb %>%
  # filter(!(is.na(longitude))) %>%
  # filter(!(is.na(latitude))) %>%
  # sf::st_as_sf(coords = c("longitude", "latitude"), crs = "+proj=longlat +datum=WGS84") %>%
  mutate(`Australian Magpie` = as.logical(`Australian Magpie`)) %>%
  ggplot() +
  geom_point(position = position_jitter(width = 0.01, height = 0.01),
    aes(x = longitude, y = latitude,
                 col = year(SurveyDate),
        alpha = `Australian Magpie`,
        shape = `Australian Magpie`)) +
  coord_fixed() +
  scale_shape_manual(values = c("TRUE" = 17, "FALSE" = 3)) +
  scale_color_viridis() +
  ggtitle("Australian Magpie Presence Absence",
          subtitle = "Locations jittered")

obsb %>%
  mutate(`Apostlebird` = as.logical(`Apostlebird`)) %>%
  ggplot() +
  geom_point(position = position_jitter(width = 0.01, height = 0.01),
    aes(x = longitude, y = latitude,
                 col = year(SurveyDate),
        alpha = `Apostlebird`,
        shape = `Apostlebird`)) +
  coord_fixed() +
  scale_shape_manual(values = c("TRUE" = 17, "FALSE" = 3)) +
  scale_color_viridis() +
  ggtitle("Apostlebird Presence Absence",
          subtitle = "Locations jittered")

p <- obsb %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "detected") %>%
  #filter(farm == "BELL") %>%
  ggplot() +
  # facet_wrap(vars(species)) +
  # ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13) +
  geom_point(position = position_jitter(height = 0.05),
    aes(x = farm, y = detected, col = year(SurveyDate))
             ) +
  scale_color_viridis(name = "Survey Year") +
  theme(strip.text.x = element_text(size = 8)) +
  ggtitle("Detected Birds")
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 1)
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 2)
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 3)


p <- obsb %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "detected") %>%
  filter(!is.na(detected)) %>%
  mutate(detected = as.logical(detected)) %>%
  ggplot() +
  geom_point(position = position_jitter(width = 0.01, height = 0.01),
    aes(x = longitude, y = latitude,
                 col = year(SurveyDate)),
        shape = 3,
    size = 2) +
  coord_fixed() +
  scale_color_viridis() +
  theme(strip.text.x = element_text(size = 6)) +
  ggtitle("Bird Presence",
          subtitle = "Locations jittered")
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 1) 
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 2) 
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 3) 
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 4) 
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 5) 
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 6) 
p + ggforce::facet_wrap_paginate(vars(species, detected), nrow = 4, ncol = 12, page = 7)

The above plots are not overly helpful. Some species are clustered into a subregion. For example the Little Lorikeet, the White-Faced Honey Eater, the Dollarbird and Horsfield’s Bushlark. are these clusters environmental (correlation purely due to predictors)? or due to colonisation costs (conditional spatial correlation)?

The species with many many detections do not have much observable spatial patttern. There are also many species with very few detections, which means no spatial patterns could be observed.

Covariates in Data

covarnames <- names(sites_rs)
summary(obsb[, covarnames])

From literature it is anticipated that GrowthType, Planting Size, Native Veg, os_cover, ms_cover, stems_to_30cm, stems_above_30cm, mean_dead_trees, SurveyYear, Wind and more impact the species detected. In otherwords all of the covariates that come with the observational data will be important.

I expect to find similar literature for elevation, temperature and clouds.

For many of these values we'll have to decide if they are predictors or nuisance variables. For example, will be predicting biodiveristy on a calm day or an average-wind day...?

Below is an example of visualising GrowthType effects. It is very quick and doesn't consider lurking variables.

Growth Type

p <- obsa %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "numbers") %>%
  #filter(farm == "BELL") %>%
  ggplot() +
  # facet_wrap(vars(species)) +
  # ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13) +
  geom_point(position = position_jitter(height = 0.2),
    aes(x = farm, y = numbers, col = GrowthType)
             ) +
  scale_color_viridis_d() +
  scale_y_log10() +
  theme(strip.text.x = element_text(size = 8)) +
  ggtitle("Observed Bird Numbers", subtitle = "Colored by Vegetation Growth Type")
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 1)
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 2)
p + ggforce::facet_wrap_paginate(vars(species), nrow = 4, ncol = 13, page = 3)

obsb %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "detected") %>%
  filter(!is.na(GrowthType)) %>%
  group_by(species, GrowthType, farm, SurveyYear) %>%
  summarise(PropDetected = sum(detected) / n()) %>% #proportion within a single season at a farm
  summarise(PropDetected = mean(PropDetected), #across 'independent' seasons
            df = n()) %>%
  summarise(PropDetected = mean(PropDetected), #across 'independent' farms
            df = sum(df)) %>%
  mutate(se = sqrt(PropDetected * (1 - PropDetected) / df)) %>%
  inner_join(traits, by = c(species = "common_name")) %>%
  filter(!is.na(movements)) %>%
  ggplot() +
  geom_col(position = "dodge2",
           aes(x = species, y = PropDetected,
               group = GrowthType,
               fill = GrowthType)) +
  geom_errorbar(position = "dodge2",
           aes(x = species, ymin = PropDetected - 2*se,
               ymax = PropDetected + 2*se,
               group = GrowthType)) +
  scale_fill_viridis_d() +
  coord_flip() +
  facet_wrap(vars(movements), scales = "free_y") +
  theme(legend.position = "bottom") +
  ylab("Proportion of Visits at which Species Detected") +
  ggtitle("Birds: Detection")

There are many species for which growth type could be important: Red-Capped Robin, Brown Treecreeper...

GrowthType is plausibly available as maps across the full landscape. It is also possible that GrowthType is correlated with other predictors.

obsa %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "numbers") %>%
  filter(!is.na(GrowthType)) %>%
  filter(numbers > 0) %>%
  inner_join(traits, by = c(species = "common_name")) %>%
  filter(movements == "Sedentary") %>%
  ggplot() +
  geom_violin(aes(GrowthType, numbers),
              draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_point(
  #   position = position_jitter(width = 0.4, height = 0),
  #   aes(x = GrowthType, y = numbers, col = SurveyYear)) +
  # scale_color_viridis() +
  scale_y_log10() +
  coord_flip() +
  facet_wrap(vars(species)) +
  # theme(legend.position="bottom") +
  ggtitle("Sedentary Birds: Abundance vs GrowthType")

This method is very rough and quick

Error bars in above are very rough. They assume that revisits within a year at a single farm are highly correlated, and that vists on different seasons and different farms are independent. It would seem reasonable to go further and assume that each visit, even across seasons is measuring the same thing (characteristics of the site will stay constant), so that really there are very few degrees of freedom. They are also based on a Gaussian approximation of 95% confidence intervals. [Martin has probably performed better analysis for this question]

Cropping and Grazing

sws_landuse <- read.table("./private/data/raw/LandUses SWS Farms.csv",
                          sep = ",", skip = 1,
                          as.is = TRUE)
colnames(sws_landuse) <- sws_landuse[1, ]
sws_landuse <- sws_landuse[-1, ]
sws_landuse$DmnntLandUse <- as.factor(sws_landuse$DmnntLandUse)

obsb %>%
  pivot_longer(cols = names(bird_richness), names_to = "species", values_to = "detected") %>%
  filter(!is.na(detected)) %>%
  inner_join(sws_landuse, by = c(farm = "FarmUnit")) %>%
  filter(!is.na(DmnntLandUse)) %>%
  group_by(species, DmnntLandUse, farm, SurveyYear) %>%
  summarise(PropDetected = sum(detected) / n()) %>% #proportion within a single season at a farm
  summarise(PropDetected = mean(PropDetected), #across 'independent' seasons
            df = n()) %>%
  summarise(PropDetected = mean(PropDetected), #across 'independent' farms
            df = sum(df)) %>%
  mutate(se = sqrt(PropDetected * (1 - PropDetected) / df)) %>%
  inner_join(traits, by = c(species = "common_name")) %>%
  filter(!is.na(movements)) %>%
  ggplot() +
  geom_col(position = "dodge2",
           aes(x = species, y = PropDetected,
               group = DmnntLandUse,
               fill = DmnntLandUse)) +
  geom_errorbar(position = "dodge2",
           aes(x = species, ymin = PropDetected - 2*se,
               ymax = PropDetected + 2*se,
               group = DmnntLandUse)) +
  scale_fill_viridis_d() +
  coord_flip() +
  facet_wrap(vars(movements), scales = "free_y") +
  theme(legend.position = "bottom") +
  ylab("Proportion of Visits at which Species Detected") +
  ggtitle("Birds: Detection for DmnntLandUse")

There are multiple species where the farms labelled cropping, grazing or mixed have very different detection rates. Cropping vs Grazing vs Mixed may have statistically significant affect on: Noisy Miner, White-plumed Honeyeater, Dusky Woodswallow, Weebill, Jacky Winter, Grey Shrike-thrush, Galah...

It seems the effect is much smaller than GrowthType.

This method is very rough and quick

Error bars in above are very rough. They assume that revisits within a year at a single farm are highly correlated, and that vists on different seasons and different farms are independent. It would seem reasonable to go further and assume that each visit, even across seasons is measuring the same thing (characteristics of the site will stay constant), so that there are very few degrees of freedom (8 to 13). They are also based on a Gaussian approximation of 95% confidence intervals. [A mixed quick mixed effects model might be more appropriate]

Lurking variables are also an issue here.

A quick mixed effects model

Interested in effect of DmntLandUse
Effects nested within DmntLandUse are: farm, year, season, site (GrowthType)

I'm not sure what repeat number means. I thought it was the index of the visit (when sites visited in consecutive days) But there would be a row for each visit.

obsb %>%
  group_by(SurveyYear, SurveySeasonId, farm, SiteCode) %>%
  ggplot() +
  geom_histogram(aes(x = RepeatNumber))

obsb %>%
  # filter(RepeatNumber == 2) %>%
  filter(SiteCode == "ARCH1", SurveyYear == 2004)

library(lme4)

indata <- obsb %>%
  inner_join(sws_landuse, by = c(farm = "FarmUnit")) %>%
  mutate(SurveyYear = scale(SurveyYear))

mod <- glmer(Galah  ~ DmnntLandUse + SurveyYear + (1 | farm : GrowthType)  ,
      data = indata,
      family = binomial(link = "logit"))
summary(mod)

If the above model is good then it suggests DmnntLandUse is important for Galahs. However, I need to look up diagnostic plots for binomial models.

Conclusions

Very preliminary exploration suggests the following is useful:

Year
GrowthType
Cropping/Grazing/Mixed

sustainablefarms/sflddata documentation built on April 19, 2022, 11:19 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sustainablefarms/sflddata
Data Extraction Methods for the Linking Data Project

In sustainablefarms/sflddata: Data Extraction Methods for the Linking Data Project

Import Data

Non-Graphical Summary of Traits

Visualise Abundance

Visualise Presence/Absence Binary Simplification

Covariates in Data

Growth Type

This method is very rough and quick

Cropping and Grazing

This method is very rough and quick

A quick mixed effects model

Conclusions

R Package Documentation

Browse R Packages

We want your feedback!

sustainablefarms/sflddata Data Extraction Methods for the Linking Data Project

In sustainablefarms/sflddata: Data Extraction Methods for the Linking Data Project

Import Data

Non-Graphical Summary of Traits

Visualise Abundance

Visualise Presence/Absence Binary Simplification

Covariates in Data

Growth Type

This method is very rough and quick

Cropping and Grazing

This method is very rough and quick

A quick mixed effects model

Conclusions

R Package Documentation

Browse R Packages

We want your feedback!

sustainablefarms/sflddata
Data Extraction Methods for the Linking Data Project