knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6 )
devtools::load_all(here::here("../vertical-lidar-paper")) library(GET) library(patchwork) library(tibble) library(ggplot2) library(readr) library(dplyr) library(tidyr) library(purrr) ggplot2::theme_set(theme_pub()) alpha <- 0.05 / 3 # (Bonferroni correction) nperm <- 100 # number of permutations
The observations are imbalanced. The most important imbalance is between species, then age. The crown closures are more balanced, but still not perfectly balanced.
What's the magnitude of the problem and how are the differences between the age and crown closure classes combined? Here I'll use the filtered data.
# read data # wave can be create once with `source("data-raw/lidar_wave.R")` wave <- wave %>% mutate( age_class = sort_num_factor(age_class), crown_closure = sort_num_factor(crown_closure) )
group_size <- wave %>% group_by(dominant_species, age_class, crown_closure) %>% tally(name = "count") %>% group_by(dominant_species) %>% add_count(name = "total") group_size %>% ggplot(aes(crown_closure, age_class, fill = count / total, label = count)) + geom_tile(show.legend = FALSE) + scale_fill_continuous(trans = "log") + facet_wrap(~dominant_species) + geom_text(color = "grey80", size = 3) + coord_equal()
Aspen is the smallest category, with very little observations in each category. Also balsam fir has substantially more observations than any other species. Another imbalance is that some species are more represented in some classes. Black and white spruces have lot of young stands, while paper birch stands are mostly mature. I would expect the null model to account for these differences.
Downsample observations to mimic Aspen distribution for species comparison.
set.seed(2020) ec_bf_down_aspen <- wave %>% group_by(dominant_species, age_class, crown_closure) %>% nest() %>% left_join( group_size %>% ungroup() %>% filter(dominant_species == "aspen") %>% select(-dominant_species), by = c("crown_closure", "age_class") ) %>% mutate( count = replace_na(count, 0), data = map2(data, count, ~sample_n(.x, round(.y), replace = TRUE)) ) %>% unnest(data) %>% ungroup() # visualize results ec_bf_down_aspen %>% group_by(dominant_species, age_class, crown_closure) %>% tally(name = "count") %>% group_by(dominant_species) %>% add_count(name = "total") %>% mutate( count = ifelse(count < 1, NA, count) ) %>% ggplot(aes(crown_closure, age_class, fill = count, label = count)) + geom_tile(show.legend = FALSE) + scale_fill_continuous(trans = "log") + facet_wrap(~dominant_species) + geom_text(color = "grey80", size = 3) + coord_equal()
We setup the analysis
mobs <- ec_bf_down_aspen %>% dplyr::select(starts_with("ss_")) %>% as.matrix() cset <- create_curve_set(list(r = 1:39, obs = t(mobs))) independent_factors <- ec_bf_down_aspen %>% select( dominant_species, age_class, crown_closure ) %>% as.data.frame()
functional_lm <- function(null_model, nsim = nperm, contrasts = TRUE) { graph.flm( nsim = nsim, formula.full = Y ~ dominant_species + age_class + crown_closure, formula.reduced = null_model, curve_sets = cset, factors = independent_factors, contrasts = contrasts, GET.args = list(alpha = alpha) ) } res_flm_sp <- functional_lm(Y ~ age_class + crown_closure) res_flm_cc <- functional_lm(Y ~ dominant_species + age_class, contrasts = FALSE) res_flm_age <- functional_lm(Y ~ dominant_species + crown_closure, contrasts = FALSE)
my_plot_contrasts(res_flm_sp, 39, full = TRUE)
My understanding is that I should balance the other factors for each analysis (i.e. balance crown closure per cc class, balance age per age class), but let's see what it does to other comparisons.
my_plot_continuous(res_flm_cc, 39) + scale_y_continuous("Scaled density difference (×10⁻³)", labels = function(x) paste0(x * 100)) my_plot_continuous(res_flm_age, 39) + scale_y_continuous("Scaled density difference (×10⁻³)", labels = function(x) paste0(x * 100))
It seems that age is very much affected because aspen only has three age classes sufficently represented (30, 50, 70), crown closure is less affected because it is better represented, but the differences are still smaller than with the complete data.
Another strategy is to upsample our observations by padding the data with observations drawn at random. Each missing observation becomes a penalty, since it is replaced by a random observation from the global data.
set.seed(2020) target_size <- 30 wave_upsample <- wave %>% # pad empty classes expand(dominant_species, age_class, crown_closure) %>% left_join(wave, by = c("dominant_species", "age_class", "crown_closure")) %>% # sample group_by(dominant_species, age_class, crown_closure) %>% nest() %>% mutate( count = target_size, # # Permutation only (nothing exits the envelope) # data = map2(data, count, ~sample_n(wave %>% select(matches("ss_")), round(.y), replace = TRUE)) # # Pick within the available observations for the specific combination # data = map2(data, count, ~sample_n(.x, round(.y), replace = TRUE)) # If sufficient observations, sample a fraction of the data # If missing observations, randomly pick observations from the pool data = map2(data, count, ~sample_n_if_needed(.x %>% drop_na(), round(.y), fill = wave)) ) %>% unnest(data) %>% ungroup() # visualize results wave_upsample %>% group_by(dominant_species, age_class, crown_closure) %>% tally(name = "count") %>% group_by(dominant_species) %>% add_count(name = "total") %>% mutate( count = ifelse(count < 1, NA, count) ) %>% ggplot(aes(crown_closure, age_class, fill = count, label = count)) + geom_tile(show.legend = FALSE) + scale_fill_continuous(trans = "log") + facet_wrap(~dominant_species) + geom_text(color = "grey80", size = 3)
Setup the functional format
mobs <- wave_upsample %>% dplyr::select(starts_with("ss_")) %>% as.matrix() cset <- create_curve_set(list(r = 1:39, obs = t(mobs))) independent_factors <- wave_upsample %>% select( dominant_species, age_class, crown_closure ) %>% as.data.frame() res_flm_sp <- functional_lm(Y ~ age_class + crown_closure) res_flm_cc <- functional_lm(Y ~ dominant_species + age_class, contrasts = FALSE) res_flm_age <- functional_lm(Y ~ dominant_species + crown_closure, contrasts = FALSE)
Species
my_plot_contrasts(res_flm_sp, 39, full = TRUE)
( my_plot_continuous(res_flm_cc, 39) + scale_y_continuous("Scaled density difference (×10⁻³)", labels = function(x) paste0(x * 100)) + ggtitle("Crown closure") ) / ( my_plot_continuous(res_flm_age, 39) + scale_y_continuous("Scaled density difference (×10⁻³)", labels = function(x) paste0(x * 100)) + ggtitle("Age"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.