knitr::opts_knit$set(root.dir = rprojroot::find_root(rprojroot::has_file("DESCRIPTION"))) devtools::load_all(rprojroot::find_root(rprojroot::has_file("DESCRIPTION"))) library(sf) library(leaflet) library(ggplot2)
data_7_2_7 <- readRDS("./private/data/clean/7_2_4_input_data.rds") Xocc <- rbind(data_7_2_7$insampledata$Xocc, data_7_2_7$holdoutdata$Xocc) yXobs_grnd <- rbind(data_7_2_7$insampledata$yXobs, data_7_2_7$holdoutdata$yXobs) species <- data_7_2_7$species locs <- read.csv("private/data/clean/site_locations_cleaned.csv", stringsAsFactors = FALSE) locs <- st_as_sf(locs, coords = c("MeanLON", "MeanLAT")) locs <- st_set_crs(locs, 4326) #4326 is the epsg code for WGS84, make this default crs I'll return to gpp <- readRDS("./private/data/remote_sensed/8d_gpp.rds") woody <- readRDS("./private/data/remote_sensed/woody500.rds") names(woody) <- gsub("^X", "", names(woody)) twi <- readRDS("./private/data/remote_sensed/TWI_boxgum_sites.rds")
Components to check:
twi %>% ggplot() + geom_histogram(aes(x = TWI), bins = 40) twi %>% filter(TWI > 12)
There are 6 sites with extremely high TWI values.
twi_loc <- st_as_sf(inner_join(twi, locs[, "SiteCode"], by = "SiteCode")) pal <- colorNumeric(viridisLite::viridis(10), twi_loc$TWI) map <- leaflet() %>% # addTiles() %>% # addProviderTiles("Esri.WorldImagery") %>% # addProviderTiles("Esri.DeLorme") %>% addProviderTiles("Esri.WorldShadedRelief") %>% addScaleBar(options = scaleBarOptions(maxWidth = 200, imperial = FALSE)) # addOpacitySlider("elevation") map <- map %>% addCircleMarkers(lng = st_coordinates(twi_loc)[, "X"], lat = st_coordinates(twi_loc)[, "Y"], label = paste(twi_loc$TWI, twi_loc$SiteCode), color = pal(twi_loc$TWI), opacity = 1) %>% addLegend(pal = pal, values = seq(min(twi_loc$TWI), max(twi_loc$TWI), length = 20), title = "TWI") map
The TWI at CLOT-S is unusally high. Neighbouring CLOT-C is much lower and there seems to be little elevation difference between the two. It appears, roughly, that nearby points higher on the Earth's surface have lower TWI.
Check against a raster map.
library(slga) #specialist library for reading the data library(raster) library(rasterVis) library(leaflet.opacity) twi_ras <- get_lscape_data(slga_product_info %>% dplyr::filter(Product == "Topographic Wetness Index") %>% dplyr::select(Short_Name) %>% as.character(), aoi = locs[locs$StudyCode == "Nanangroe Natural Experiment", ]) map %>% fitBounds(extent(twi_ras)@xmin, extent(twi_ras)@ymin, extent(twi_ras)@xmax, extent(twi_ras)@ymax) %>% addRasterImage(twi_ras, colors = pal, layerId = "raster") %>% addOpacitySlider(layerId = "raster") # gplot(twi_ras) + # geom_tile(aes(fill = value)) + # scale_fill_viridis_c() + # geom_sf(data = locs[locs$StudyCode == "Nanangroe Natural Experiment", ], inherit.aes = FALSE, stat = "sf") + # coord_sf() # ggplot() + geom_sf(data = twi_loc)
A raster extraction of TWI has values consistent with the values at the site locations. The raster map is also consistent with the ESRI terrain, confirming that the projections are approximately correct.
anyNA(woody, recursive = TRUE) #there are no NA values in the woody data woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% ggplot() + geom_histogram(aes(x = woody500m), bins = 50) woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% ggplot() + facet_wrap(~year, scales = "free_y") + geom_histogram(aes(x = woody500m), bins = 30) woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% dplyr::filter(woody500m <= 1) #less than 1% 60ish site years
There are some very low (<1%) woody cover locations and years. There are also some very high (>60%) woody cover data points. Some of these anomalies may not be present in the data we are interested in because we are not interested in every year at every location.
The standard deviation of the woody veg values for each location will give an idea of any anomolous locations that have clearing or planting nearby.
woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% group_by(SiteCode) %>% summarise(sd = sd(woody500m)) %>% ggplot() + geom_point(aes(x = sd, y = SiteCode)) woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% group_by(SiteCode) %>% summarise(sd = sd(woody500m)) %>% filter(sd > 7) woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% filter(SiteCode %in% c("WEBB-2", "KEA-1", "WISE-B")) %>% ggplot() + facet_wrap(vars(SiteCode)) + geom_point(aes(x = year, y = woody500m))
The site WISE-B has an incredibly large sample standard deviation. This is driven by low woody veg values from 2003 through to 2010, and high woody veg values in other years. WISE-B is at the edge of a lake-like section of the Murray river, near Albury, could it be that the values of woody veg are not filtering out water well?
WEBB-2, also an outlier, appears to neighbour a plantation forest.
tile <- tilearoundobj(locs[locs$SiteCode == "WISE-B", ], 0.01) wiseb_region <- brick_woodycover(as_Spatial(tile), 2000:2019) wiseb_region[wiseb_region > 100] <- 0 #assume NA values mean no woody veg plot(wiseb_region) wf <- raster::focalWeight(wiseb_region, d = 500, type = "circle") woody_bs <- focal_bylayer(wiseb_region, wf, fun = sum) names(woody_bs) <- names(wiseb_region) WISEB_woody_vals <- extract(woody_bs, as_Spatial(locs[locs$SiteCode == "WISE-B", ]), df = TRUE) checkvals <- woody_vals_buffer(tile, locs[locs$SiteCode == "WISE-B", ], 2000:2019, 500) print(WISEB_woody_vals) print(checkvals)
WISE-B's woody veg values are heavily affected by the nearby water body. The cover value appears to 157, and must represent NA. Replacing all values above 100 with NA will hopefully lead to a better result.
But are there other values above 100? Try with a histogram.
bigwoody <- raster("http://dapds00.nci.org.au/thredds/dodsC/ub8/au/LandCover/DEA_ALC/13_-41/fc_metrics_13_-41_2000.nc", varname = "WCF") woodyvals <- as.matrix(bigwoody) unique(woodyvals[woodyvals > 100])
woody_at_visit <- woody %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% mutate(year = as.integer(year)) %>% right_join(yXobs_grnd[, c("SiteCode", "SurveyYear")], by = c("SiteCode", "year" = "SurveyYear")) sum(is.na(woody_at_visit)) / nrow(woody_at_visit) woody_at_visit %>% filter(is.na(woody500m)) %>% dplyr::select(year) %>% unique()
Visits in 1998 and 2019 have no woody veg data, they correspond to 6.6% of visits. How would I impute them? Using the closest value?
woody_at_visit %>% filter(woody500m <= 1) woody_at_visit %>% ggplot() + geom_histogram(aes(x = woody500m), bins = 30)
woody[sample(1:nrow(woody), 36), ] %>% pivot_longer(-SiteCode, names_to = "year", values_to = "woody500m") %>% ggplot() + facet_wrap(vars(SiteCode)) + geom_point(aes(x = year, y = woody500m))
It looks like the nearest value is a good predictor of woody vegetation. Lets test it on 1 year ahead and 2 years back.
impute1ahead_residual <- woody$`2018` - woody$`2017` hist(impute1ahead_residual) impute1ahead_residual %>% as_tibble() %>% ggplot() + geom_qq(aes(sample = value)) + geom_qq_line(aes(sample = value)) range(impute1ahead_residual)
It looks like the residual of using the previous year is not normally distributed, but it has a range of only -12 to 4, which is ok for our modeling situation for now. Of course, this will not work if the year 2019 is drastically different to the year 2018. Similarly for 1998. Would be better to get the 2019 woody veg data corrected.
impute2behind_residual <- woody$`2000` - woody$`2002` hist(impute2behind_residual) impute2behind_residual %>% as_tibble() %>% ggplot() + geom_qq(aes(sample = value)) + geom_qq_line(aes(sample = value)) range(impute2behind_residual)
The residual for backwards in time two years has a much larger range, and appears close to normally distributed. The larger range could be concerning, but I feel we have little choice. It will be worth investigating if the residuals are larger for 1998 and 2019. Or checking if the leverage of these values are larger.
woody$`1998` <- woody$`2000` woody$`2019` <- woody$`2018`
GPP values are not available for earlier than 2000, and they are very hard to predict. Thus we cannot use the visits from earlier than 2000.
datename_2_date <- function(x) { posixltval <- lubridate::fast_strptime(x, format = "X%Y.%m.%d", tz = "Australia/Sydney") out <- as.POSIXct(posixltval) return(out) } range(datename_2_date(names(gpp[, -length(gpp)]))) sum(is.na(gpp))
gpp_long <- gpp %>% pivot_longer(-SiteCode, names_to = "Date", values_to = "GPP") %>% mutate(Date = datename_2_date(Date)) gpp_long %>% ggplot() + geom_histogram(aes(x = GPP), bins = 30) + ggtitle("GPP Values") gpp_means <- rowMeans(gpp[, -ncol(gpp)]) gpp_means <- data.frame(GPPmean = gpp_means, SiteCode = gpp$SiteCode) gpp_means %>% ggplot() + geom_histogram(aes(x = GPPmean))
gpp_diff <- gpp[, -ncol(gpp)] - gpp_means$GPPmean #high gpp_diff means higher than mean GPP values gpp_diff$SiteCode <- gpp$SiteCode all.equal(gpp_diff[, 5], gpp[, 5] - gpp_means$GPPmean) # interpolate to days in between for matching to visit dates obsdates <- datename_2_date(names(gpp_diff)[-ncol(gpp_diff)]) xout <- seq(min(obsdates), max(obsdates), by = '1 day') gpp_diff_interp_l <- apply(gpp_diff[, -ncol(gpp_diff)], MARGIN = 1, FUN = function(x) { interp <- approx(obsdates, x, xout = xout) return(interp)} ) gpp_diff_interp <- simplify2array(lapply(gpp_diff_interp_l, function(x) x[["y"]])) gpp_diff_interp <- as.data.frame(t(gpp_diff_interp)) colnames(gpp_diff_interp) <- format(xout, "%Y-%m-%d") gpp_diff_interp$SiteCode <- gpp_diff$SiteCode gpp_diff_interp_long <- gpp_diff_interp %>% pivot_longer(-SiteCode, names_to = "Date", values_to = "GPP - mean") gpp_diff_modelsite <- gpp_diff_interp_long %>% right_join(yXobs_grnd[, c("SurveyDate", "SurveyYear", "SiteCode")], by = c("Date" = "SurveyDate", "SiteCode")) %>% group_by(SurveyYear, SiteCode) %>% summarise(meanGPPdiff = mean(`GPP - mean`)) gpp_diff_modelsite %>% ggplot() + geom_histogram(aes(x = meanGPPdiff), bins = 30) + ggtitle("GPP Difference Values for Each ModelSite")
Mean GPP is nicely distributed, and very low relative to the range of the actual GPP values.
Difference to the mean, averaged within each ModelSite, is also nicely distributed.
a <- inner_join(gpp_means, twi, by = "SiteCode") plot(a$GPPmean, a$TWI) cor(a$GPPmean, a$TWI)
TWI and GPPmean appear to have very little correlation. Strange.
157
as 0.Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.