In sustainablefarms/sflddata: Data Extraction Methods for the Linking Data Project

knitr::opts_knit$set(root.dir = rprojroot::find_root(rprojroot::has_file("DESCRIPTION")))
devtools::load_all(rprojroot::find_root(rprojroot::has_file("DESCRIPTION")))
library(sf)
library(leaflet)
library(ggplot2)

data_7_2_7 <- readRDS("./private/data/clean/7_2_4_input_data.rds")
Xocc <- rbind(data_7_2_7$insampledata$Xocc, data_7_2_7$holdoutdata$Xocc)
yXobs_grnd <- rbind(data_7_2_7$insampledata$yXobs, data_7_2_7$holdoutdata$yXobs)
species <- data_7_2_7$species

locs <- read.csv("private/data/clean/site_locations_cleaned.csv", stringsAsFactors = FALSE)
locs <- st_as_sf(locs, coords = c("MeanLON", "MeanLAT"))
locs <- st_set_crs(locs, 4326) #4326 is the epsg code for WGS84, make this default crs I'll return to

gpp <- readRDS("./private/data/remote_sensed/8d_gpp.rds")
woody <- readRDS("./private/data/remote_sensed/woody500.rds")
names(woody) <- gsub("^X", "", names(woody))
twi <- readRDS("./private/data/remote_sensed/TWI_boxgum_sites.rds")

Components to check:

Mean GPP for each location
TWI for each location
Woody veg per year (i.e. visit)
GPP difference for each visit

TWI for each location

twi %>%
  ggplot() +
  geom_histogram(aes(x = TWI), bins = 40)
twi %>%
  filter(TWI > 12)

There are 6 sites with extremely high TWI values.

twi_loc <- st_as_sf(inner_join(twi, locs[, "SiteCode"], by = "SiteCode"))

pal <- colorNumeric(viridisLite::viridis(10), twi_loc$TWI)
map <- leaflet() %>%
  # addTiles() %>%
  # addProviderTiles("Esri.WorldImagery") %>%
  # addProviderTiles("Esri.DeLorme") %>%
  addProviderTiles("Esri.WorldShadedRelief") %>%
  addScaleBar(options = scaleBarOptions(maxWidth = 200, imperial = FALSE)) 
  # addOpacitySlider("elevation")
map <- map %>%
  addCircleMarkers(lng = st_coordinates(twi_loc)[, "X"], lat = st_coordinates(twi_loc)[, "Y"],
                   label = paste(twi_loc$TWI, twi_loc$SiteCode),
                   color = pal(twi_loc$TWI),
                   opacity = 1) %>%
  addLegend(pal = pal, values = seq(min(twi_loc$TWI), max(twi_loc$TWI), length = 20),
            title = "TWI")
map

The TWI at CLOT-S is unusally high. Neighbouring CLOT-C is much lower and there seems to be little elevation difference between the two. It appears, roughly, that nearby points higher on the Earth's surface have lower TWI.

Check against a raster map.

library(slga) #specialist library for reading the data
library(raster)
library(rasterVis)
library(leaflet.opacity)
twi_ras <- get_lscape_data(slga_product_info %>%
                   dplyr::filter(Product == "Topographic Wetness Index") %>%
                   dplyr::select(Short_Name) %>%
                   as.character(),
                   aoi = locs[locs$StudyCode == "Nanangroe Natural Experiment", ])

map %>%
  fitBounds(extent(twi_ras)@xmin, extent(twi_ras)@ymin, extent(twi_ras)@xmax, extent(twi_ras)@ymax) %>%
  addRasterImage(twi_ras, colors = pal, layerId = "raster") %>%
  addOpacitySlider(layerId = "raster")

# gplot(twi_ras) + 
#   geom_tile(aes(fill = value)) +
#   scale_fill_viridis_c() +
#   geom_sf(data = locs[locs$StudyCode == "Nanangroe Natural Experiment", ], inherit.aes = FALSE, stat = "sf") +
#   coord_sf()
# ggplot() + geom_sf(data = twi_loc)

A raster extraction of TWI has values consistent with the values at the site locations. The raster map is also consistent with the ESRI terrain, confirming that the projections are approximately correct.

Woody Vegetation, 500m Buffer

anyNA(woody, recursive = TRUE) #there are no NA values in the woody data

woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  ggplot() +
  geom_histogram(aes(x = woody500m), bins = 50)

woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  ggplot() +
  facet_wrap(~year, scales = "free_y") +
  geom_histogram(aes(x = woody500m), bins = 30)

woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  dplyr::filter(woody500m <= 1) #less than 1% 60ish site years

There are some very low (<1%) woody cover locations and years. There are also some very high (>60%) woody cover data points. Some of these anomalies may not be present in the data we are interested in because we are not interested in every year at every location.

The standard deviation of the woody veg values for each location will give an idea of any anomolous locations that have clearing or planting nearby.

woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  group_by(SiteCode) %>%
  summarise(sd = sd(woody500m)) %>%
  ggplot() +
  geom_point(aes(x = sd, y = SiteCode))

woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  group_by(SiteCode) %>%
  summarise(sd = sd(woody500m)) %>%
  filter(sd > 7)

woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  filter(SiteCode %in% c("WEBB-2", "KEA-1", "WISE-B")) %>%
  ggplot() +
  facet_wrap(vars(SiteCode)) +
  geom_point(aes(x = year, y = woody500m))

The site WISE-B has an incredibly large sample standard deviation. This is driven by low woody veg values from 2003 through to 2010, and high woody veg values in other years. WISE-B is at the edge of a lake-like section of the Murray river, near Albury, could it be that the values of woody veg are not filtering out water well?

WEBB-2, also an outlier, appears to neighbour a plantation forest.

tile <- tilearoundobj(locs[locs$SiteCode == "WISE-B", ], 0.01)
wiseb_region <- brick_woodycover(as_Spatial(tile), 2000:2019)
wiseb_region[wiseb_region > 100] <- 0 #assume NA values mean no woody veg
plot(wiseb_region)

wf <- raster::focalWeight(wiseb_region, d = 500, type = "circle")
woody_bs <- focal_bylayer(wiseb_region, wf, fun = sum)
names(woody_bs) <- names(wiseb_region)
WISEB_woody_vals <- extract(woody_bs, as_Spatial(locs[locs$SiteCode == "WISE-B", ]), df = TRUE)
checkvals <- woody_vals_buffer(tile, locs[locs$SiteCode == "WISE-B", ], 2000:2019, 500)
print(WISEB_woody_vals)
print(checkvals)

WISE-B's woody veg values are heavily affected by the nearby water body. The cover value appears to 157, and must represent NA. Replacing all values above 100 with NA will hopefully lead to a better result.

But are there other values above 100? Try with a histogram.

bigwoody <- raster("http://dapds00.nci.org.au/thredds/dodsC/ub8/au/LandCover/DEA_ALC/13_-41/fc_metrics_13_-41_2000.nc", varname = "WCF")
woodyvals <- as.matrix(bigwoody)
unique(woodyvals[woodyvals > 100])

Woody Veg values for visits

woody_at_visit <- woody %>% 
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  mutate(year = as.integer(year)) %>%
  right_join(yXobs_grnd[, c("SiteCode", "SurveyYear")], by = c("SiteCode", "year" = "SurveyYear"))

sum(is.na(woody_at_visit)) / nrow(woody_at_visit)

woody_at_visit %>%
  filter(is.na(woody500m)) %>%
  dplyr::select(year) %>%
  unique()

Visits in 1998 and 2019 have no woody veg data, they correspond to 6.6% of visits. How would I impute them? Using the closest value?

woody_at_visit %>%
  filter(woody500m <= 1)

woody_at_visit %>%
  ggplot() +
  geom_histogram(aes(x = woody500m), bins = 30)

Woody Veg Imputation Model

woody[sample(1:nrow(woody), 36), ] %>%
  pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>%
  ggplot() +
  facet_wrap(vars(SiteCode)) +
  geom_point(aes(x = year, y = woody500m))

It looks like the nearest value is a good predictor of woody vegetation. Lets test it on 1 year ahead and 2 years back.

impute1ahead_residual <- woody$`2018` - woody$`2017`
hist(impute1ahead_residual)
impute1ahead_residual %>%
  as_tibble() %>%
  ggplot() +
  geom_qq(aes(sample = value)) +
  geom_qq_line(aes(sample = value))
range(impute1ahead_residual)

It looks like the residual of using the previous year is not normally distributed, but it has a range of only -12 to 4, which is ok for our modeling situation for now. Of course, this will not work if the year 2019 is drastically different to the year 2018. Similarly for 1998. Would be better to get the 2019 woody veg data corrected.

impute2behind_residual <- woody$`2000` - woody$`2002`
hist(impute2behind_residual)
impute2behind_residual %>%
  as_tibble() %>%
  ggplot() +
  geom_qq(aes(sample = value)) +
  geom_qq_line(aes(sample = value))
range(impute2behind_residual)

The residual for backwards in time two years has a much larger range, and appears close to normally distributed. The larger range could be concerning, but I feel we have little choice. It will be worth investigating if the residuals are larger for 1998 and 2019. Or checking if the leverage of these values are larger.

woody$`1998` <- woody$`2000`
woody$`2019` <- woody$`2018`

GPP

GPP values are not available for earlier than 2000, and they are very hard to predict. Thus we cannot use the visits from earlier than 2000.

datename_2_date <- function(x) {
  posixltval <- lubridate::fast_strptime(x, format = "X%Y.%m.%d", tz = "Australia/Sydney")
  out <- as.POSIXct(posixltval)
  return(out)
}
range(datename_2_date(names(gpp[, -length(gpp)])))
sum(is.na(gpp))

gpp_long <- gpp %>%
  pivot_longer(-SiteCode, names_to = "Date", values_to = "GPP") %>%
  mutate(Date = datename_2_date(Date))

gpp_long %>%
  ggplot() +
  geom_histogram(aes(x = GPP), bins = 30) +
  ggtitle("GPP Values")

gpp_means <- rowMeans(gpp[, -ncol(gpp)])
gpp_means <- data.frame(GPPmean = gpp_means, SiteCode = gpp$SiteCode)

gpp_means %>%
  ggplot() +
  geom_histogram(aes(x = GPPmean))

gpp_diff <- gpp[, -ncol(gpp)] - gpp_means$GPPmean  #high gpp_diff means higher than mean GPP values
gpp_diff$SiteCode <- gpp$SiteCode
all.equal(gpp_diff[, 5], gpp[, 5] - gpp_means$GPPmean)

# interpolate to days in between for matching to visit dates
obsdates <- datename_2_date(names(gpp_diff)[-ncol(gpp_diff)])
xout <- seq(min(obsdates), max(obsdates), by = '1 day')
gpp_diff_interp_l <- apply(gpp_diff[, -ncol(gpp_diff)], MARGIN = 1, 
      FUN = function(x) {
        interp <- approx(obsdates, x, xout = xout)
        return(interp)}
      )
gpp_diff_interp <- simplify2array(lapply(gpp_diff_interp_l, function(x) x[["y"]]))
gpp_diff_interp <- as.data.frame(t(gpp_diff_interp))
colnames(gpp_diff_interp) <- format(xout, "%Y-%m-%d")
gpp_diff_interp$SiteCode <- gpp_diff$SiteCode



gpp_diff_interp_long <- gpp_diff_interp %>%
  pivot_longer(-SiteCode, names_to = "Date", values_to = "GPP - mean")

gpp_diff_modelsite <- gpp_diff_interp_long %>%
  right_join(yXobs_grnd[, c("SurveyDate", "SurveyYear", "SiteCode")], by = c("Date" = "SurveyDate", "SiteCode")) %>%
  group_by(SurveyYear, SiteCode) %>%
  summarise(meanGPPdiff = mean(`GPP - mean`)) 

gpp_diff_modelsite %>%
  ggplot() +
  geom_histogram(aes(x = meanGPPdiff), bins = 30) +
  ggtitle("GPP Difference Values for Each ModelSite")

Mean GPP is nicely distributed, and very low relative to the range of the actual GPP values.

Difference to the mean, averaged within each ModelSite, is also nicely distributed.

Correlation between GPP mean and TWI

a <- inner_join(gpp_means, twi, by = "SiteCode")
plot(a$GPPmean, a$TWI)
cor(a$GPPmean, a$TWI)

TWI and GPPmean appear to have very little correlation. Strange.

Conclusions

Use visits from 2000 onwards.
For 2019 woody veg data, using the 2018 data.
For GPP data, use the mean GPP within a ModelSite, differenced to the mean GPP of the location.
Use woody veg data that treats water and cover types represented as 157 as 0.
Investigate the following 6 sites for impact of high TWI values: CLOT-S
JRAY-S
MSPE-S
PCAM-C
BELL-2
PATT-2