# setwd('reports')
# load knitr for markdown
library(knitr)
knitr::opts_chunk$set(echo=FALSE, 
                      warning=FALSE,
                      # error = FALSE,
                      message=FALSE)
#options(tinytex.verbose = TRUE)
options(knitr.kable.NA = '-')


# what kind of document is being created?
doc.type <- knitr::opts_knit$get('rmarkdown.pandoc.to')
# when knitting to Word, use this
if(doc.type == 'docx') {
  options(knitr.table.format = "pandoc")
}


library(kableExtra)
# load needed packages
library(tidyverse)
library(sf)
library(janitor)
library(ggpubr)
library(ggrepel)
library(scales)
library(ggspatial)
library(QRFcapacity)

# theme_set(theme_bw())
theme_set(theme_pubr(base_size = 10))
knitr::write_bib(c("base",
                   "survey", 
                   "sf", 
                   "knitr", 
                   "rmarkdown"),
                 file = 'packages.bib')
rch_res_list = c('juv_summer',
                 'juv_summer_dash',
                 'redds') %>%
  as.list() %>%
  rlang::set_names() %>%
  map(.f = function(x) {
    load(paste0('../output/modelFits/extrap_200rch_RF_', x, '.rda'))
    model_df = model_rf_df

    list(rch_preds = all_preds,
         rch_in_covar_range = model_df$pred_all_rchs[[1]] %>%
           select(UniqueID, in_covar_range),
         extrap_covars = extrap_covars) %>%
      return()
  })

all_rch_preds = rch_res_list %>%
  map_df(.id = "model_choice",
         .f = "rch_preds")

in_covar_range_rch = rch_res_list %>%
  map_df(.id = "model_choice",
         .f = "rch_in_covar_range")

extrap_covars_rch = rch_res_list %>%
  map_df(.f = function(x) {
    tibble(covars = x$extrap_covars)
  }) %>%
  distinct() %>%
  pull(covars)

data("chnk_domain")
data("rch_200")

rch_200_df = rch_200 %>%
  st_drop_geometry() %>%
  as_tibble() %>%
  mutate_at(vars(regime),
            list(~ as.factor(as.character(.))))

rch_200_cap = rch_200 %>%
  select(UniqueID, GNIS_Name, reach_leng:HUC8_code, 
         chnk, chnk_use, chnk_ESU_DPS:chnk_NWR_NAME,
         sthd, sthd_use, sthd_ESU_DPS:sthd_NWR_NAME) %>%
  left_join(all_rch_preds %>%
              select(-HUC8_code)) %>%
  mutate_at(vars(chnk_ESU_DPS:chnk_NWR_NAME, sthd_ESU_DPS:sthd_NWR_NAME),
            list(fct_explicit_na)) %>%
  mutate(chnk_tot = chnk_per_m * reach_leng,
         chnk_tot_se = chnk_per_m_se * reach_leng) %>%
  mutate(sthd_tot = sthd_per_m * reach_leng,
         sthd_tot_se = sthd_per_m_se * reach_leng) %>%
  mutate_at(vars(starts_with("chnk_tot")),
            list(~ if_else(!chnk, NA_real_, .))) %>%
  mutate_at(vars(starts_with("sthd_tot")),
            list(~ if_else(!sthd, NA_real_, .))) %>%
  # focus on anadromous ranges
  filter(chnk | sthd) %>%
  left_join(in_covar_range_rch) %>%
  st_transform(st_crs(chnk_domain)) %>%
  select(everything(), geometry) %>%
  arrange(UniqueID,
          model_choice)

# rm(mod_data_weights, model_svy_df, extrap_covars, rch_200, all_preds)
rch_glm_list = c('juv_summer',
                 'juv_summer_dash',
                 'redds') %>%
  as.list() %>%
  rlang::set_names() %>%
  map(.f = function(x) {
    load(paste0('../output/modelFits/extrap_200rch_', x, '.rda'))
    model_df = model_svy_df

    list(rch_preds = all_preds,
         rch_in_covar_range = model_df$pred_all_rchs[[1]] %>%
           select(UniqueID, in_covar_range),
         extrap_covars = extrap_covars) %>%
      return()
  })

all_glm_preds = rch_glm_list %>%
  map_df(.id = "model_choice",
         .f = "rch_preds")

rch_200_cap_glm = rch_200 %>%
  select(UniqueID, GNIS_Name, reach_leng:HUC8_code, 
         chnk, chnk_use, chnk_ESU_DPS:chnk_NWR_NAME,
         sthd, sthd_use, sthd_ESU_DPS:sthd_NWR_NAME) %>%
  left_join(all_glm_preds) %>%
  mutate_at(vars(chnk_ESU_DPS:chnk_NWR_NAME, sthd_ESU_DPS:sthd_NWR_NAME),
            list(fct_explicit_na)) %>%
  mutate(chnk_tot = chnk_per_m * reach_leng,
         chnk_tot_se = chnk_per_m_se * reach_leng) %>%
  mutate(sthd_tot = sthd_per_m * reach_leng,
         sthd_tot_se = sthd_per_m_se * reach_leng) %>%
  mutate_at(vars(starts_with("chnk_tot")),
            list(~ if_else(!chnk, NA_real_, .))) %>%
  mutate_at(vars(starts_with("sthd_tot")),
            list(~ if_else(!sthd, NA_real_, .))) %>%
  # focus on anadromous ranges
  filter(chnk | sthd) %>%
  left_join(in_covar_range_rch) %>%
  st_transform(st_crs(chnk_domain)) %>%
  select(everything(), geometry) %>%
  arrange(UniqueID,
          model_choice)
champ_ids = rch_200_cap %>%
  filter(model == 'QRF') %>%
  # filter(model %in% c("QRF",
  #                     "CHaMP")) %>%
  pull(UniqueID) %>%
  unique()

extrap_class_rch = rch_200_df %>%
  summarise(across(one_of(extrap_covars_rch),
                   class))
# which ones are numeric?
extrap_num_rch = names(extrap_class_rch)[extrap_class_rch %in% c('integer', 'numeric')]
# which ones are categorical?
extrap_catg_rch = names(extrap_class_rch)[extrap_class_rch %in% c('factor', 'character', 'ordered')]

# create the comparison data.frame
comp_rch_df = rch_200_df %>%
  mutate(Source = if_else(UniqueID %in% champ_ids,
                          "CHaMP Reaches",
                          "non-CHaMP Reaches")) %>%
  filter(strm_order <= 6,
         !GNIS_Name %in% c("Columbia River",
                           "Snake River")) %>%
  filter(sthd | chnk) %>%
  pivot_longer(cols = any_of(extrap_num_rch),
               names_to = "Metric",
               values_to = "value") %>%
  filter(!(Metric == "S2_02_11" & value < -9000)) %>%
  filter(!is.na(value))

# grab some summary statistics of CHaMP reaches
range_max_all = comp_rch_df %>%
  filter(Source == 'CHaMP Reaches') %>%
  select(UniqueID,
         Metric, value) %>%
  group_by(Metric) %>%
  summarise(across(value,
                   tibble::lst(min, max),
                   na.rm = T),
            .groups = "drop") %>%
  left_join(comp_rch_df %>%
              filter(Source == 'CHaMP Reaches') %>%
              select(UniqueID,
                     Metric, value) %>%
              group_by(Metric) %>%
              summarise(across(value,
                               list(lower_quant = ~ quantile(., probs = 0.25, na.rm = T),
                                    upper_quant = ~ quantile(., probs = 0.75, na.rm = T))),
                        .groups = "drop")) %>%
  pivot_longer(starts_with("value"),
               names_to = "type") %>%
  mutate(type = str_remove(type,
                           "value_")) %>%
  mutate(range_type = if_else(type %in% c('min', 'max'),
                              'Range',
                              'IQR'))

comp_rch_df %<>%
  left_join(range_max_all %>%
              select(-range_type) %>%
              pivot_wider(names_from = "type",
                          values_from = "value")) %>%
  mutate(in_range = if_else(value >= min & value <= max,
                            T, F),
         in_iqr = if_else(value >= lower_quant & value <= upper_quant,
                          T, F))


champ_range_rch_p = comp_rch_df %>%
  mutate(Source = str_remove(Source, " Reaches$")) %>%
  ggplot(aes(x = Source,
             y = value,
             fill = Source)) +
  geom_boxplot() +
  facet_wrap(~ Metric,
             scales = 'free_y') +
  geom_hline(data = range_max_all %>%
               filter(range_type == "Range"),
             aes(yintercept = value),
             linetype = 2,
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  theme_pubr(base_size = 10) +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_blank()) +
  labs(fill = "Reaches")

# break it down by HUC6
# log these covariates for plotting purposes
log_mets = c('alp_accum',
             'fines_accu',
             'flow_accum',
             'grav_accum',
             'p_accum',
             'fp_cur')

# 2 plots, one with logged covariates, one without
covar_comp_log = comp_rch_df %>%
  mutate(HUC6_name = as.character(HUC6_name),
         HUC6_name = if_else(Source == "CHaMP Reaches",
                             'CHaMP',
                             HUC6_name),
         HUC6_name = fct_relevel(HUC6_name,
                                 'CHaMP')) %>%
  filter(Metric %in% log_mets) %>%
  mutate(across(value:upper_quant,
                ~ if_else(. == 0,
                          . + 0.5,
                          .))) %>%
  ggplot(aes(x = HUC6_name,
             y = value,
             fill = HUC6_name)) +
  scale_fill_brewer(palette = "Set3") +
  geom_boxplot(outlier.color = 'gray70',
               outlier.size = 0.5) +
  geom_hline(data = range_max_all %>%
               filter(Metric %in% log_mets) %>%
               mutate(across(value,
                             ~ if_else(. == 0,
                                       . + 0.5,
                                       .))),
             aes(yintercept = value,
                 linetype = range_type),
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  facet_wrap(~ Metric,
             ncol = 2,
             scales = 'free_y') +
  scale_y_continuous(trans = "log",
                     breaks = scales::pretty_breaks()) +
  theme_pubr(base_size = 10) +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 6),
        axis.text.y = element_text(size = 6))

covar_comp_nolog = comp_rch_df %>%
  mutate(HUC6_name = as.character(HUC6_name),
         HUC6_name = if_else(Source == "CHaMP Reaches",
                             'CHaMP',
                             HUC6_name),
         HUC6_name = fct_relevel(HUC6_name,
                                 'CHaMP')) %>%
  filter(!Metric %in% log_mets) %>%
  ggplot(aes(x = HUC6_name,
             y = value,
             fill = HUC6_name)) +
  scale_fill_brewer(palette = "Set3") +
  geom_boxplot(outlier.color = 'gray70',
               outlier.size = 0.5) +
  geom_hline(data = range_max_all %>%
               filter(!Metric %in% log_mets),
             aes(yintercept = value,
                 linetype = range_type),
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  facet_wrap(~ Metric,
             ncol = 3,
             scales = 'free_y') +
  scale_y_continuous(trans = "identity",
                     breaks = scales::pretty_breaks()) +
  theme_pubr(base_size = 10) +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 6),
        axis.text.y = element_text(size = 6))

huc6_rch_p = ggpubr::ggarrange(plotlist = list(covar_comp_log,
                                               covar_comp_nolog),
                               nrow = 1,
                               widths = c(2, 3),
                               common.legend = T,
                               legend = "bottom")
pts_res_list = c('juv_summer',
                 'juv_summer_dash',
                 'redds') %>%
  as.list() %>%
  rlang::set_names() %>%
  map(.f = function(x) {
    load(paste0('../output/modelFits/extrap_mastPts_', x, '.rda'))

    list(pts_preds = all_preds,
         pt_in_covar_range = model_svy_df$pred_all_rchs[[1]] %>%
           select(Site, in_covar_range),
         extrap_covars = extrap_covars,
         model_df = model_svy_df) %>%
      return()
  })

all_pts_preds = pts_res_list %>%
  map_df(.id = "model_choice",
         .f = "pts_preds")

in_covar_range_pts = pts_res_list %>%
  map_df(.id = "model_choice",
         .f = "pt_in_covar_range")

extrap_covars_pts = pts_res_list %>%
  map_df(.f = function(x) {
    tibble(covars = x$extrap_covars)
  }) %>%
  distinct() %>%
  pull(covars)

# model_df_pts = pts_res_list %>%
#   map(.f = "model_df")
# 
# extrap_summ_pts = model_df_pts %>%
#   map_df(.id = "model_choice",
#          .f = function(z) {
#            z %>%
#              mutate(tidy_summ = map(mod_no_champ,
#                                     .f = function(y) {
#                                       broom::tidy(y)               
#                                     })) %>%
#              select(Species, response,
#                     tidy_summ) %>%
#              unnest(cols = tidy_summ)
#          }) %>%
#   mutate(model = "non-CHaMP") %>%
#   bind_rows(model_df_pts %>%
#               map_df(.id = "model_choice",
#                      .f = function(z) {
#                        z %>%
#                          mutate(tidy_summ = map(mod_champ,
#                                                 .f = function(y) {
#                                                   broom::tidy(y)               
#                                                 })) %>%
#                          select(Species, response,
#                                 tidy_summ) %>%
#                          unnest(cols = tidy_summ)
#                      }) %>%
#               mutate(model = "CHaMP"))


# add a few others
extrap_covars_pts_org = extrap_covars_pts
extrap_covars_pts = c(extrap_covars_pts,
                      "Precip",
                      "GDD",
                      "NatPrin2")

data("chnk_domain")
data("gaa")

master_pts = gaa %>%
  select(Site, GNIS_Name = GNIS_NA, 
         HUC6_code = HUC_6,
         HUC6_name = HUC6NmNRCS,
         HUC8_code = HUC_8,
         Lat, Lon) %>%
  filter(!is.na(Lon), !is.na(Lat)) %>%
  st_as_sf(coords = c('Lon', 'Lat'),
           crs = 4326) %>%
  st_transform(crs = st_crs(chnk_domain))

# re-define species domains to match 200m reaches

# Chinook
chnk_domain = rch_200 %>%
  filter(chnk) %>%
  select(UniqueID, GNIS_Name, reach_leng:HUC8_code, 
         starts_with("chnk"))

chnk_buff = chnk_domain %>%
  st_buffer(dist = 200,
            endCapStyle = 'FLAT')

# filter out points outside Chinook domain
keep_rows = master_pts %>%
  st_covered_by(chnk_buff) %>%
  map_df(.id = 'row_id',
         .f = function(x) {
           tibble(n_polys = length(x))
         }) %>%
  filter(n_polys > 0) %>%
  pull(row_id) %>%
  unique() %>%
  sort() %>%
  as.numeric()

chnk_pts_cap = master_pts %>%
  slice(keep_rows) %>%
  st_join(chnk_domain %>%
            select(-one_of(names(master_pts))),
          join = st_nearest_feature) %>%
  left_join(all_pts_preds)

rm(keep_rows)

chnk_strm_lng = chnk_domain %>%
  mutate(length_m = st_length(.)) %>%
  mutate_at(vars(chnk_ESU_DPS, chnk_NWR_POPID, chnk_NWR_NAME, chnk_MPG, GNIS_Name),
            list(fct_explicit_na)) %>%
  group_by(chnk_ESU_DPS, chnk_MPG, chnk_NWR_POPID, chnk_NWR_NAME, GNIS_Name) %>%
  summarise_at(vars(length_m, reach_leng),
               list(sum))

chnk_strm_cap = chnk_strm_lng %>%
  full_join(chnk_pts_cap %>%
              mutate_at(vars(chnk_ESU_DPS, chnk_NWR_POPID, chnk_NWR_NAME, chnk_MPG, GNIS_Name),
                        list(fct_explicit_na)) %>%
              group_by(model_choice, chnk_ESU_DPS, chnk_MPG, chnk_NWR_POPID, chnk_NWR_NAME, GNIS_Name) %>%
              summarise(n_pts = n_distinct(Site),
                        chnk_per_m = mean(chnk_per_m, na.rm = T),
                        chnk_per_m_se = mean(chnk_per_m_se, na.rm = T)) %>%
              st_drop_geometry() %>%
              as_tibble()) %>%
  mutate(chnk_tot = chnk_per_m * length_m,
         chnk_tot_se = chnk_per_m_se * length_m) %>%
  mutate(across(starts_with('chnk_tot'),
                as.numeric))

# steelhead
sthd_domain = rch_200 %>%
  filter(sthd) %>%
  select(UniqueID, GNIS_Name, reach_leng:HUC8_code, 
         starts_with("sthd"))

sthd_buff = sthd_domain %>%
  st_buffer(dist = 200,
            endCapStyle = 'FLAT')

# filter out points outside Chinook domain
keep_rows = master_pts %>%
  st_covered_by(sthd_buff) %>%
  map_df(.id = 'row_id',
         .f = function(x) {
           tibble(n_polys = length(x))
         }) %>%
  filter(n_polys > 0) %>%
  pull(row_id) %>%
  unique() %>%
  sort() %>%
  as.numeric()

sthd_pts_cap = master_pts %>%
  slice(keep_rows) %>%
  st_join(sthd_domain %>%
            select(-one_of(names(master_pts))),
          join = st_nearest_feature) %>%
  left_join(all_pts_preds)
rm(keep_rows)

sthd_strm_lng = sthd_domain %>%
  mutate(length_m = st_length(.)) %>%
  mutate_at(vars(sthd_ESU_DPS, sthd_NWR_POPID, sthd_NWR_NAME, sthd_MPG, GNIS_Name),
            list(fct_explicit_na)) %>%
  group_by(sthd_ESU_DPS, sthd_MPG, sthd_NWR_POPID, sthd_NWR_NAME, GNIS_Name) %>%
  summarise_at(vars(length_m, reach_leng),
               list(sum))

sthd_strm_cap = sthd_strm_lng %>%
  full_join(sthd_pts_cap %>%
              mutate_at(vars(sthd_ESU_DPS, sthd_NWR_POPID, sthd_NWR_NAME, sthd_MPG, GNIS_Name),
                        list(fct_explicit_na)) %>%
              group_by(model_choice, sthd_ESU_DPS, sthd_MPG, sthd_NWR_POPID, sthd_NWR_NAME, GNIS_Name) %>%
              summarise(n_pts = n_distinct(Site),
                        sthd_per_m = mean(sthd_per_m, na.rm = T),
                        sthd_per_m_se = mean(sthd_per_m_se, na.rm = T)) %>%
              # summarise_at(vars(sthd_per_m,
              #                   sthd_per_m_se),
              #              list(mean),
              #              na.rm = T) %>%
              st_drop_geometry() %>%
              as_tibble()) %>%
  mutate(sthd_tot = sthd_per_m * length_m,
         sthd_tot_se = sthd_per_m_se * length_m) %>%
  mutate(across(starts_with('sthd_tot'),
                as.numeric))
champ_sites = all_pts_preds %>%
  filter(model == 'QRF') %>%
  # filter(model %in% c("QRF",
  #                     "CHaMP")) %>%
  pull(Site) %>%
  unique()


extrap_class_pts = gaa %>%
  rename(Channel_Type = ChanlType) %>%
  summarise(across(one_of(extrap_covars_pts),
                   class))
# which ones are numeric?
extrap_num_pts = names(extrap_class_pts)[extrap_class_pts %in% c('integer', 'numeric')]
# which ones are categorical?
extrap_catg_pts = names(extrap_class_pts)[extrap_class_pts %in% c('factor', 'character', 'ordered')]

# create the comparison data.frame
comp_pts_df = gaa %>%
  filter(Site %in% union(sthd_pts_cap$Site,
                         chnk_pts_cap$Site)) %>%
  mutate(Source = if_else(Site %in% champ_sites,
                          "CHaMP Reaches",
                          "non-CHaMP Reaches")) %>%
  filter(SO_v1 <= 6,
         !GNIS_NA %in% c("Columbia River",
                         "Snake River")) %>%
  # filter(sthd | chnk) %>%
  pivot_longer(cols = any_of(extrap_num_pts),
               names_to = "Metric",
               values_to = "value") %>%
  filter(!(Metric == "S2_02_11" & value < -9000)) %>%
  filter(!is.na(value)) %>%
  mutate(Metric = fct_relevel(Metric,
                              extrap_num_pts[extrap_num_pts %in% extrap_covars_pts_org],
                              after = 0))

# grab some summary statistics of CHaMP reaches
range_max_all = comp_pts_df %>%
  filter(Source == 'CHaMP Reaches') %>%
  select(Site,
         Metric, value) %>%
  group_by(Metric) %>%
  summarise(across(value,
                   tibble::lst(min, max),
                   na.rm = T),
            .groups = "drop") %>%
  left_join(comp_pts_df %>%
              filter(Source == 'CHaMP Reaches') %>%
              select(Site,
                     Metric, value) %>%
              group_by(Metric) %>%
              summarise(across(value,
                               list(lower_quant = ~ quantile(., probs = 0.25, na.rm = T),
                                    upper_quant = ~ quantile(., probs = 0.75, na.rm = T))),
                        .groups = "drop")) %>%
  pivot_longer(starts_with("value"),
               names_to = "type") %>%
  mutate(type = str_remove(type,
                           "value_")) %>%
  mutate(range_type = if_else(type %in% c('min', 'max'),
                              'Range',
                              'IQR'))

comp_pts_df %<>%
  left_join(range_max_all %>%
              select(-range_type) %>%
              pivot_wider(names_from = "type",
                          values_from = "value")) %>%
  rowwise() %>%
  mutate(in_range = if_else(value >= min & value <= max,
                            T, F),
         in_iqr = if_else(value >= lower_quant & value <= upper_quant,
                          T, F)) %>%
  ungroup()


champ_range_pts_p = comp_pts_df %>%
  mutate(Source = str_remove(Source, " Reaches$")) %>%
  ggplot(aes(x = Source,
             y = value,
             fill = Source)) +
  geom_boxplot() +
  facet_wrap(~ Metric,
             scales = 'free_y') +
  geom_hline(data = range_max_all %>%
               filter(range_type == "Range"),
             aes(yintercept = value),
             linetype = 2,
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  theme_pubr(base_size = 10) +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_blank()) +
  labs(fill = "Sites")


huc6_pts_p = comp_pts_df %>%
  mutate(HUC6_name = as.character(HUC6NmNRCS),
         HUC6_name = if_else(Source == "CHaMP Reaches",
                             'CHaMP',
                             HUC6_name),
         HUC6_name = fct_relevel(HUC6_name,
                                 'CHaMP')) %>%
  ggplot(aes(x = HUC6_name,
             y = value,
             fill = HUC6_name)) +
  scale_fill_brewer(palette = "Set3") +

  geom_boxplot(outlier.color = 'gray70',
               outlier.size = 0.5) +
  # geom_violin(draw_quantiles = 0.5,
  #             scale = "width") +
  geom_hline(data = range_max_all,
             aes(yintercept = value,
                 linetype = range_type),
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  facet_wrap(~ Metric,
             ncol = 4,
             scales = 'free_y') +
  scale_y_continuous(trans = "identity",
                     breaks = scales::pretty_breaks()) +
  theme_pubr(base_size = 10) +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 6),
        axis.text.y = element_text(size = 6))
comp_pop = chnk_strm_cap %>%
  group_by(model_choice,
           ESU_DPS = chnk_ESU_DPS, 
           MPG = chnk_MPG, 
           NWR_POPID = chnk_NWR_POPID, 
           NWR_NAME = chnk_NWR_NAME) %>%
  summarise(reach_leng = sum(reach_leng, na.rm = T),
            n_ids = sum(n_pts, na.rm = T),
            cap_tot = sum(chnk_tot, na.rm = T),
            cap_tot_se = sqrt(sum(chnk_tot_se^2, na.rm = T)),
            .groups = "drop") %>%
  st_drop_geometry() %>%
  as_tibble() %>%
  mutate(Method = 'Points') %>%
  bind_rows(rch_200_cap %>%
              filter(chnk) %>%
              group_by(model_choice,
                       ESU_DPS = chnk_ESU_DPS, 
                       MPG = chnk_MPG, 
                       NWR_POPID = chnk_NWR_POPID, 
                       NWR_NAME = chnk_NWR_NAME) %>%
              summarise(reach_leng = sum(reach_leng),
                        n_ids = n_distinct(UniqueID),
                        cap_tot = sum(chnk_tot, na.rm = T),
                        cap_tot_se = sqrt(sum(chnk_tot_se^2, na.rm = T)),
                        .groups = "drop") %>%
              st_drop_geometry() %>%
              as_tibble() %>%
              mutate(Method = 'Reaches')) %>%
  mutate(Species = "Chinook") %>%
  select(Species, everything()) %>%
  bind_rows(sthd_strm_cap %>%
              group_by(model_choice,
                       ESU_DPS = sthd_ESU_DPS, 
                       MPG = sthd_MPG, 
                       NWR_POPID = sthd_NWR_POPID, 
                       NWR_NAME = sthd_NWR_NAME) %>%
              summarise(reach_leng = sum(reach_leng, na.rm = T),
                        n_ids = sum(n_pts, na.rm = T),
                        cap_tot = sum(sthd_tot, na.rm = T),
                        cap_tot_se = sqrt(sum(sthd_tot_se^2, na.rm = T)),
                        .groups = "drop") %>%
              st_drop_geometry() %>%
              as_tibble() %>%
              mutate(Method = 'Points') %>%
              bind_rows(rch_200_cap %>%
                          filter(sthd) %>%
                          group_by(model_choice,
                                   ESU_DPS = sthd_ESU_DPS, 
                                   MPG = sthd_MPG, 
                                   NWR_POPID = sthd_NWR_POPID, 
                                   NWR_NAME = sthd_NWR_NAME) %>%
                          summarise(reach_leng = sum(reach_leng),
                                    n_ids = n_distinct(UniqueID),
                                    cap_tot = sum(sthd_tot, na.rm = T),
                                    cap_tot_se = sqrt(sum(sthd_tot_se^2, na.rm = T)),
                                    .groups = "drop") %>%
                          st_drop_geometry() %>%
                          as_tibble() %>%
                          mutate(Method = 'Reaches')) %>%
              mutate(Species = "Steelhead") %>%
              select(Species, everything()))

# remove a few points that aren't useful
comp_pop %<>%
  filter(NWR_NAME != "na") %>%
  filter(NWR_NAME != "(Missing)") %>%
  filter(!grepl("Outside legal", ESU_DPS)) %>%
  mutate(cap_dens = cap_tot / reach_leng) %>%
  # filter(cap_dens < 21) %>%
  mutate(pop_label = str_split(NWR_NAME, " - ", simplify = T)[,2],
         pop_label = str_remove(pop_label, " River"))

pts_rch_cor = comp_pop %>%
  pivot_wider(id_cols = c(Species, 
                          model_choice,
                          ESU_DPS:NWR_NAME),
              names_from = "Method",
              values_from = c("reach_leng", "n_ids", "cap_tot", "cap_tot_se")) %>%
  filter(cap_tot_Points > 0,
         cap_tot_Reaches > 0) %>%
  group_by(Species, model_choice) %>%
  nest() %>%
  mutate(corr = map(data,
                    .f = function(x) {
                      x %>%
                        select(cap_tot_Points, cap_tot_Reaches) %>%
                        corrr::correlate(method = 'pearson')
                    })) %>%
  summarise(corr = map_dbl(corr,
                           .f = function(x) {
                             x[1,3] %>%
                               as.numeric()
                             }))

cap_xy_p = comp_pop %>%
  mutate(model_choice = recode(model_choice,
                               'juv_summer' = 'CHaMP',
                               'juv_summer_dash' = 'DASH',
                               'redds' = 'Redds')) %>%
  pivot_wider(id_cols = c(Species, 
                          model_choice,
                          ESU_DPS:NWR_NAME,
                          pop_label),
              names_from = "Method",
              values_from = c("reach_leng", "cap_tot", "cap_tot_se")) %>%
  # filter(reach_leng > 0)
  filter(cap_tot_Points > 0,
         cap_tot_Reaches > 0) %>%
  ggplot(aes(x = cap_tot_Points/10000,
             y = cap_tot_Reaches/10000,
             color = MPG)) +
  # scale_color_brewer(palette = "Set1",
  #                    name = 'MPG',
  #                    guide = guide_legend(nrow = 3)) +
  geom_abline(linetype = 2) +
  geom_point(size = 4) +
  # geom_label_repel(aes(label = pop_label),
  #                  size = 3,
  #                  fill = 'gray80',
  #                  show.legend = F) +
  theme_pubr(base_size = 10) +
  theme(legend.position = "right") +
  facet_wrap(~ model_choice + Species,
             ncol = 2,
             scales = 'free') +
  labs(x = "Master Sample Points (x10,000)",
       y = "200 m Reaches (x10,000)",
       title = "Capacity")

rel_diff_p = comp_pop %>%
  mutate(model_choice = recode(model_choice,
                               'juv_summer' = 'CHaMP',
                               'juv_summer_dash' = 'DASH',
                               'redds' = 'Redds')) %>%
  mutate_at(vars(ESU_DPS:NWR_NAME, pop_label),
            list(as.factor)) %>%
  mutate(pop_label = fct_reorder(pop_label,
                                 MPG,
                                 .fun = function(x) median(as.integer(x)))) %>%
  pivot_wider(id_cols = c(Species, 
                          model_choice,
                          ESU_DPS:NWR_NAME,
                          pop_label),
              names_from = "Method",
              values_from = c("reach_leng", "cap_tot", "cap_tot_se")) %>%
  filter(cap_tot_Points > 0,
         cap_tot_Reaches > 0) %>%
  mutate(diff = cap_tot_Reaches - cap_tot_Points,
         rel_diff = diff / cap_tot_Points) %>%
  filter(rel_diff < 7) %>%
  ggplot(aes(x = pop_label,
             y = rel_diff,
             color = MPG)) +
  geom_hline(yintercept = 0,
             linetype = 2) +
  geom_point(size = 4) +
  theme_pubr(base_size = 10) +
  theme(legend.position = "right",
        axis.text.x = element_blank()) +
        # axis.text.x = element_text(angle = 45,
        #                            size = 4,
        #                            hjust = 1)) +
  facet_grid(model_choice ~ Species,
             scales = 'free_y') +
  labs(x = "Population",
       y = "Relative Difference",
       title = "Capacity")

\newpage

Introduction

The quantile random forest (QRF) capacity model we have developed uses paired fish abundance and detailed habitat data from selected sites around the Columbia River Basin to estimate the carrying capacity at the 200-500 meter reach scale, where such detailed habitat data is available. Initially, and to date, the sites where such data was collected were monitored by CHaMP (Columbia Habitat Monitoring Program). This aspect of the QRF model is useful for examining empirical fish/habitat relationships, determining what habitat factors may be limiting capacity at a particular location, and examining the improvement to capacity after rehabilitation actions. However, there is also a need to generate capacity estimates on larger spatial scales (e.g. tributary, watershed, population, etc.).

To date, inference to areas without detailed habitat data and at larger spatial scales has relied on master sample points and attributes associated with them [@Larsen2016]. This method was developed because that dataset was available at the time, and covered the entire Columbia River Basin. However, each master sample point does not actually represent a stretch of river, rather they are a single location (latitude/longitude coordinates) that is meant to be representative of about a kilometer of stream. Some points are closer together than a kilometer though, due to tributary junctions or other issues. Because of how they're constructed and what they actually represent, the interpretation of capacity estimates at master sample points is slightly more complicated than we desire.

Recently, a group of NOAA researchers has developed a line layer that breaks down the Columbia River Basin into 200 meter reaches, with various attributes assigned to each reach. It covers the same area as the master sample point dataset, but provides better interpretation and visualization properties than the master sample point layer. In this document, we present details about how we have used both the master sample points and the line network as extrapolation tools, and include some comparisons of the results between them.

Methods

QRF Models

We examined results from a total of six different QRF models, three each for spring/summer Chinook and steelhead. These consisted of a model for redds, and two versions of a QRF model for summer juveniles. The first of these used what we considered the best choice of metrics from the entire CHaMP dataset. The second focused on metrics that are collected by DASH (Drone Assisted Stream Habitat protocol) that can be calculated from CHaMP data as well. This version allows direct QRF estimates to be made for areas sampled by DASH, since the CHaMP protocol is no longer in use.

Master Sample Points

The master sample points were generated in the design phase of CHaMP. These 551,046 sites were selected from the NHD Plus 1:100,000 stream layer covering WA, OR and ID at an average density of one site per kilometer [@Larsen2016]. Each CHaMP site where direct QRF capacity estimates were made corresponds to one of these master sample points, identified and selected using a generalized randomized tessellation stratification (GRTS) design [@Stevens2004; @Olsen2012]. CHaMP generated a number of attributes for each master sample point, referred to here as globally available attributes (GAAs) because they are associated with every master sample point across all watersheds. We chose r length(extrap_covars_pts_org) to include in the extrapolation model (Table \@ref(tab:covars-pts-tab)).

gaa_meta = readxl::read_excel('../data/raw/master_sample_pts/GAA_Metadata_20150506.xlsx') %>%
  clean_names() %>%
  mutate(field_name = recode(field_name,
                             "ChanlType" = "Channel_Type")) %>%
  rename(ShortName = field_name,
         Name = display_name, 
         Description = description) %>%
  bind_rows(tibble(ShortName = 'S2_02_11',
                   Name = "Average August Temperature",
                   Description = "NorWeST 10 year average August mean stream temperatures for 2002-2011"))
tibble(ShortName = extrap_covars_pts_org) %>%
  left_join(gaa_meta %>%
              select(ShortName,
                     Name,
                     Description)) %>%
  kable(booktabs = T,
        linesep = "",
        caption = "Attributes available at every master sample point, used as covariates in extrapolation model.") %>%
  kable_styling() %>%
  column_spec(3,
              width = "3in")

The original extrapolation model used the log of capacity estimates at each CHaMP site (fish / m) as the response, and selected GAAs as covariates. The model was fit using the svyglm function in the survey [@survey2004] package with R software [@R-base], accounting for the various survey design weights within each CHaMP watershed. We then used that model to predict capacity at every master sample point that was not a CHaMP site. In other words, we fit a linear regression to establish associations between estimated habitat capacity, from QRF, at CHaMP sites and globally available attributes from those sites and then used those associations at locations where CHaMP habitat data was not available to predict capacity at those master sample points.

The design weights were based on the particular stratification used in each CHaMP watershed to select monitoring sites. The most common stratification used three categories of valley segment type (source, transport and depositional) and selected a fixed number of sites from each strata. Because the strata are not equally distributed across the watershed, the design weights account for that unequal distribution. There are potential consequences to ignoring those weights when analyzing data from these sites [@Nahorniak2015].

To roll up capacity estimates to larger spatial scales, the average predicted capacity of master sample points along a stream was multiplied by the length of that stream, and then combinations of streams could be added together to generate overall capacity estimates for a watershed.

Line Network

We adapted this method to using a stream layer created by Morgan Bond and Tyler Nodine at the Northwest Fisheries Science Center. This layer consisted of a line file divided into 200m reaches with various attributes attached to each reach. The line file is based on the National Hydrography Dataset High Resolution (NHDPlus HR) dataset, which has a higher resolution, 1:24,000, compared to the older layer that the master sample points were chosen from.

tibble(ShortName = extrap_covars_rch) %>%
  left_join(gaa_meta %>%
              select(ShortName,
                     Description) %>%
              bind_rows(read_csv('../data/raw/stream_network/crb_streams_24k_datadictionary.csv') %>%
                          rename(ShortName = `Field Name`) %>%
                          filter(!is.na(ShortName)) %>%
                          select(-Units))) %>%
  kable(booktabs = T,
        linesep = "",
        caption = "Attributes available at every 200m reach, used as covariates in extrapolation model.") %>%
  kable_styling()

We determined which reach was closest to each CHaMP site, and used the predicted QRF capacity density of those CHaMP reaches as the response with the attributes attached to each 200m reach as covariates (Table \@ref(tab:covars-rch-tab)). We also took this opportunity to move to a random forest modeling framework. This accommodates possible non-linear or saturating effects of some of these covariates on capacity predictions, and prevents the extrapolation model from predicting capacity values well above or well below the range of predictions at CHaMP sites.

Range of Covariates

We examined the range of the covariates used in each method, for wadeable streams, and compared it to the range of values found at CHaMP sites or reaches. This exercise provides some context about how representative the suite of CHaMP sites are compared to the rest of the Columbia River Basin. These figures are found at the end of this document.

Capacity Comparisons

We computed the total capacity of each species in each population using both methods, for summer juveniles (using both CHaMP and DASH habitat metrics) and redds, and compared them. The correlations between the two estimates are shown in Table \@ref(tab:corr-tab).

pts_rch_cor %>%
  mutate(model_choice = recode(model_choice,
                               'juv_summer' = 'CHaMP',
                               'juv_summer_dash' = 'DASH',
                               'redds' = 'Redds')) %>%
  rename(r = corr,
         Model = model_choice) %>%
  kable(booktabs = T,
        digits = 3,
        caption = "Correlation coefficient between capacity estimates at the population scale using each method.") %>%
  kable_styling()

We plotted one estimate against the other in Figure \@ref(fig:comp-xy), and showed the relative difference in Figure \@ref(fig:comp-rel).

cap_xy_p
# cap_xy_p +
#   guides(color = guide_legend(ncol = 3))
rel_diff_p
# rel_diff_p +
#   guides(color = guide_legend(ncol = 3)) +
#   theme(axis.text.x = element_text(size = 5))

Maps

This shows the difference in how the results can be visualized.

lem_rch = rch_200_cap %>%
  filter(model_choice == 'juv_summer_dash') %>%
  filter(grepl('Lemhi', chnk_NWR_NAME)) %>%
  filter(chnk) %>%
  ggplot(aes(color = chnk_per_m2)) +
  geom_sf(size = 1.5) +
  scale_color_viridis_c(direction = -1,
                        name = expression("Parr /" ~ m^2))

lem_pts = chnk_pts_cap %>%
  filter(model_choice == 'juv_summer_dash') %>%
  filter(grepl('Lemhi', chnk_NWR_NAME)) %>%
  ggplot(aes(color = chnk_per_m2)) +
  geom_sf() +
  scale_color_viridis_c(direction = -1,
                        name = expression("Parr /" ~ m^2))

ggarrange(plotlist = list(lem_pts, lem_rch),
          nrow = 1,
          labels = 'AUTO',
          common.legend = T,
          legend = 'bottom')
x_center = -1382900
y_center = 2579748
zoom_amt = 3000

lem_rch_zm = lem_rch +
  coord_sf(xlim = x_center + c(-1, 1) * zoom_amt,
           ylim = y_center + c(-1, 1) * zoom_amt) +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank()) +
  ggspatial::annotation_scale()

# get the flowlines for the zoomed in reach
# to add to the points plot
bbox = c(x_center + c(-1, 1) * zoom_amt,
         y_center + c(-1, 1) * zoom_amt)[c(1,3,2,4)]

flow_lines = st_polygon(list(matrix(c(bbox[c(1,2)],
                                      bbox[c(1,4)],
                                      bbox[c(3,4)],
                                      bbox[c(3,2)],
                                      bbox[c(1,2)]),
                                    ncol = 2,
                                    byrow = T))) %>%
  st_bbox() %>%
  st_as_sfc() %>%
  st_as_sf(crs = st_crs(rch_200_cap)$input) %>%
  get_flowlines(wtsd_polygon = .)

lem_pts_zm = lem_pts +
  geom_sf(data = flow_lines,
          color = 'blue') +
  geom_sf(size = 5) +
  coord_sf(xlim = x_center + c(-1, 1) * zoom_amt,
           ylim = y_center + c(-1, 1) * zoom_amt) +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank()) +
  ggspatial::annotation_scale()


ggarrange(plotlist = list(lem_pts_zm, lem_rch_zm),
          nrow = 1,
          labels = 'AUTO',
          common.legend = T,
          legend = 'bottom')

Discussion

Extrapolations of QRF predictions are useful for higher-level spatial analyses or comparisons, such as at the watershed level. Examining predictions at individual master sample points or 200m reaches should be discouraged. For that scale, detailed habitat data should be collected, by using a protocol like DASH, and direct estimates of capacity can be made using a QRF model. On the other hand, extrapolation summaries of capacity at the watershed scale, for various species and life-stages, can be useful in broad prioritization discussions, to determine what life-stages and watersheds to target for rehabilitation.

For most of the GAAs, the range of values represented at CHaMP sites or reaches overlapped with the range of values in other places, with a few exceptions. The most notable is modeled precipitation (Precip) in the Clearwater basin (Figure \@ref(fig:huc6-pts-fig)). We did not use Precip as a covariate in the master sample points extrapolation model, but it does indicate that something about the conditions in the Clearwater may be different from other places with the interior Columbia River Basin, and therefore extrapolations to that area should be scrutinized carefully. The 2nd PCA of the natural classification (NatPrin2) also shows some deviation from the CHaMP dataset in the Willamette, Lower Columbia and Salmon watersheds. It could be worth investigating what part of that PCA (or combination of parts) are driving that deviation.

For both species, across all three QRF models, the two extrapolation models resulted in estimates of total capacity at the population scale that are very highly correlated (Table \@ref(tab:corr-tab)). The linear network estimates were often greater than the master sample point estimates, to a greater or lesser degree, but not always (Figure \@ref(fig:comp-xy)).

Changing the modeling framework from linear regression to a random forest has several benefits. Primarily, it provides a method to constrain extrapolation predictions naturally, even when the extrapolation covariates are beyond the range found at CHaMP sites. In addition, random forests accommodate potential non-linear associations between capacity predictions and GAAs while handling correlations among GAAs. The sample size of CHaMP sites with QRF predictions of capacity is sufficient to fit a random forest model, so we have no concerns about the "data-hungry" nature of this framework for this situation.

Although the master sample point method has been used for several years, there is no reason to believe estimates from that method are inherently superior to using a line network, so even in the cases when the two models result in different estimates of capacity, it is difficult to say which is "better". On the other hand, there are several reasons to support using the line network method, apart from the actual results, primarily based on the ease of interpretation. Extrapolation to a line network involves capacity predictions at actual 200 m reaches along a stream network, while the master sample point method provides estimates at instantaneous "points" on the landscape. The summation of capacity to larger spatial scales is more straightforward when using a line network, and the maps that can be created are easier to interpret (Figure \@ref(fig:lem-map)).

Therefore, we conclude that the extrapolation to a linear network method presented here is superior to the master sample point method, and should be adopted moving forward for examining QRF outputs at large spatial scales.

References

\newpage

Covariate Range Figures

champ_range_pts_p
huc6_pts_p
champ_range_rch_p
huc6_rch_p


KevinSee/QRFcapacity documentation built on Feb. 27, 2023, 3:57 p.m.