analysis/extrap_qrf_200rch.R

# Author: Kevin See
# Purpose: Extrapolate QRF model to all 200 m reaches
# Created: 3/20/2020
# Last Modified: 12/15/2021
# Notes: 

#-----------------------------------------------------------------
# load needed libraries
library(QRFcapacity)
library(tidyverse)
library(janitor)
library(magrittr)
library(sf)
library(quantregForest)
library(survey)

# set default theme for ggplot
theme_set(theme_bw())

#-----------------------------------------------------------------
# load model fit and habitat data associated with that model
#-----------------------------------------------------------------
mod_choice = c('juv_summer',
               'juv_summer_dash',
               'redds',
               'juv_winter')[1]

load(paste0('output/modelFits/qrf_', mod_choice, '.rda'))

#-----------------------------------------------------------------
# predict capacity at all CHaMP sites
#-----------------------------------------------------------------
# what quantile is a proxy for capacity?
pred_quant = 0.9
set.seed(5)
# for overwintering juveniles, predictions done on channel unit scale, then summed up for each CHaMP site.
if(mod_choice == "juv_winter") {
  # note that some Tier1 designations have to be imputed. Even if they're wrong, at least it will be some estimate of capacity, which seems better than 0, when summing at the CHaMP site scale
  hab_impute = hab_avg %>%
    filter(!is.na(Discharge)) %>%
    mutate(across(c(Watershed,
                    Channel_Type,
                    Tier1),
                  fct_drop)) %>%
    mutate(across(Tier1,
                  fct_recode,
                  NULL = "(Missing)")) %>%
    impute_missing_data(data = .,
                        covars = unique(sel_hab_mets$Metric),
                        impute_vars = c('Watershed', 
                                        'Elev_M', 
                                        'Channel_Type', 
                                        'CUMDRAINAG'),
                        method = 'missForest') %>%
                        # method = "Hmisc") %>%
                        # method = "randomForestSRC") %>%
    select(Site, Watershed, LON_DD, LAT_DD, 
           VisitID,
           Lgth_Wet, Area_Wet, 
           ChUnitNumber, AreaTotal, 
           any_of(unique(sel_hab_mets$Metric)))
  
  pred_hab_sites = hab_impute %>%
    # filter(Tier1 %in% levels(qrf_mod_df$Tier1))
    mutate(chnk_per_m2 = predict(qrf_mods[['Chinook']],
                                 newdata = select(., one_of(unique(sel_hab_mets$Metric))),
                                 what = pred_quant),
           chnk_per_m2 = exp(chnk_per_m2) - dens_offset,
           chnk_tot = chnk_per_m2 * AreaTotal) %>%
    mutate(sthd_per_m2 = predict(qrf_mods[['Steelhead']],
                                 newdata = select(., one_of(unique(sel_hab_mets$Metric))),
                                 what = pred_quant),
           sthd_per_m2 = exp(sthd_per_m2) - dens_offset,
           sthd_tot = sthd_per_m2 * AreaTotal) %>%
    group_by(Site,
             Watershed,
             LON_DD,
             LAT_DD,
             VisitID,
             Lgth_Wet,
             Area_Wet) %>%
    summarise(across(c(chnk_tot, sthd_tot),
                     sum,
                     na.rm = T),
              .groups = "drop") %>%
    mutate(chnk_per_m = chnk_tot / Lgth_Wet,
           chnk_per_m2 = chnk_tot / Area_Wet,
           sthd_per_m = sthd_tot / Lgth_Wet,
           sthd_per_m2 = sthd_tot / Area_Wet)
  
} else {
  hab_impute = hab_avg %>%
    mutate_at(vars(Watershed, Channel_Type),
              list(fct_drop)) %>%
    impute_missing_data(data = .,
                        covars = unique(sel_hab_mets$Metric),
                        impute_vars = c('Watershed', 
                                        'Elev_M', 
                                        'Channel_Type', 
                                        'CUMDRAINAG'),
                        method = 'missForest') %>%
    select(Site, Watershed, LON_DD, LAT_DD, VisitYear, Lgth_Wet, Area_Wet, one_of(unique(sel_hab_mets$Metric)))
  
  pred_hab_sites = hab_impute %>%
    mutate(chnk_per_m = predict(qrf_mods[['Chinook']],
                                newdata = select(., one_of(unique(sel_hab_mets$Metric))),
                                what = pred_quant),
           chnk_per_m = exp(chnk_per_m) - dens_offset,
           chnk_per_m2 = chnk_per_m * Lgth_Wet / Area_Wet) %>%
    mutate(sthd_per_m = predict(qrf_mods[['Steelhead']],
                                newdata = select(., one_of(unique(sel_hab_mets$Metric))),
                                what = pred_quant),
           sthd_per_m = exp(sthd_per_m) - dens_offset,
           sthd_per_m2 = sthd_per_m * Lgth_Wet / Area_Wet)
}

# only use sites that are in the 200 m reach dataset
data("champ_site_rch")

pred_hab_sites %<>%
  inner_join(champ_site_rch) %>%
  mutate_at(vars(Watershed),
            list(fct_drop))

# split by species, and filter by each species domain
data("rch_200")

pred_hab_df = pred_hab_sites %>%
  select(-starts_with('chnk')) %>%
  rename(cap_per_m = sthd_per_m,
         cap_per_m2 = sthd_per_m2) %>%
  mutate(Species = 'Steelhead') %>%
  bind_rows(pred_hab_sites %>%
              select(-starts_with('sthd')) %>%
              rename(cap_per_m = chnk_per_m,
                     cap_per_m2 = chnk_per_m2) %>%
              mutate(Species = 'Chinook')) %>%
  select(Species, UniqueID, Site:Area_Wet, starts_with("cap_")) %>%
  left_join(rch_200 %>%
              st_drop_geometry() %>%
              select(UniqueID, chnk, sthd)) %>%
  filter((Species == 'Steelhead' & sthd) |
           (Species == 'Chinook' & chnk)) %>%
  # filter(cap_per_m > 0,
  #        cap_per_m2 > 0) %>%
  mutate_at(vars(Watershed),
            list(fct_drop))



#----------------------------------------
# prep 200 m reaches for extrapolation
#----------------------------------------
rch_200_df = rch_200 %>%
  st_drop_geometry() %>%
  as_tibble() %>%
  mutate_at(vars(regime),
            list(~ as.factor(as.character(.))))

extrap_covars = names(rch_200_df)[c(18:20,
                                    23:29,
                                    37, 40:42)]

# what type of covariate is each GAA?
extrap_class = rch_200_df %>%
  select(one_of(extrap_covars)) %>%
  as.list() %>%
  map_chr(.f = function(x) class(x)[1])

# which ones are numeric?
extrap_num = names(extrap_class)[extrap_class %in% c('integer', 'numeric')]
# which ones are categorical?
extrap_catg = names(extrap_class)[extrap_class %in% c('factor', 'character', 'ordered')]

# compare range of covariates from model dataset and prediction dataset
range_comp = bind_rows(rch_200_df %>%
                         filter(!UniqueID %in% unique(pred_hab_sites$UniqueID)) %>%
                         select(UniqueID,
                                one_of(extrap_num)) %>%
                         gather(Metric, value, -UniqueID) %>%
                         mutate(Source = 'non-CHaMP Reaches'),
                       rch_200_df %>%
                         filter(UniqueID %in% unique(pred_hab_sites$UniqueID)) %>%
                         select(UniqueID,
                                one_of(extrap_num)) %>%
                         distinct() %>%
                         gather(Metric, value, -UniqueID) %>%
                         mutate(Source = 'CHaMP Reaches')) %>%
  mutate_at(vars(Source, Metric),
            list(as.factor))

range_max = range_comp %>%
  group_by(Metric, Source) %>%
  summarise_at(vars(value),
               tibble::lst(min, max),
               na.rm = T) %>%
  filter(Source == 'CHaMP Reaches') %>%
  ungroup() %>%
  gather(type, value, -Metric, -Source)

range_max_all = range_max %>%
  bind_rows(range_comp %>%
              group_by(Metric, Source) %>%
              summarize(across(value,
                               tibble::lst(quantile),
                               probs = c(0.25),
                               na.rm = T)) %>%
              rename(lower_quant = value_quantile) %>%
              left_join(range_comp %>%
                          group_by(Metric, Source) %>%
                          summarize(across(value,
                                           tibble::lst(quantile),
                                           probs = c(0.75),
                                           na.rm = T)) %>%
                          rename(upper_quant = value_quantile)) %>%
              filter(Source == 'CHaMP Reaches') %>%
              ungroup() %>%
              pivot_longer(cols = ends_with('quant'),
                           names_to = "type",
                           values_to = "value")) %>%
  mutate(range_type = if_else(type %in% c('min', 'max'),
                              'Range',
                              'IQR'))

covar_range_p = range_comp %>%
  mutate(Source = str_remove(Source, " Reaches$")) %>%
  ggplot(aes(x = Source,
             y = value,
             fill = Source)) +
  geom_boxplot() +
  facet_wrap(~ Metric,
             scales = 'free') +
  geom_hline(data = range_max,
             aes(yintercept = value),
             lty = 2,
             color = 'darkgray') +
  theme_minimal() +
  theme(legend.position = "bottom")

# covar_range_p
# ggsave('output/figures/GAA_200rch_comparison.pdf',
#        covar_range_p,
#        height = 7,
#        width = 10)

# another way to look at it
# break it down by HUC6
comp_df = rch_200_df %>%
  mutate(Source = if_else(UniqueID %in% unique(pred_hab_sites$UniqueID),
                          "CHaMP Reaches",
                          "non-CHaMP Reaches")) %>%
  filter(strm_order <= 6,
         !GNIS_Name %in% c("Columbia River",
                           "Snake River")) %>%
  filter(sthd | chnk) %>%
  pivot_longer(cols = any_of(extrap_num),
               names_to = "Metric",
               values_to = "value") %>%
  left_join(range_max_all %>%
              select(-range_type) %>%
              pivot_wider(names_from = "type",
                          values_from = "value") %>%
              select(-Source)) %>%
  mutate(in_range = if_else(value >= min & value <= max,
                            T, F),
         in_iqr = if_else(value >= lower_quant & value <= upper_quant,
                          T, F)) %>%
  filter(!(Metric == "S2_02_11" & value < -9000)) %>%
  filter(!is.na(value))

# log these covariates for plotting purposes
log_mets = c('alp_accum',
             'fines_accu',
             'flow_accum',
             'grav_accum',
             'p_accum',
             'fp_cur')

# 2 plots, one with logged covariates, one without
covar_comp_log = comp_df %>%
  filter(Metric %in% log_mets) %>%
  ggplot(aes(x = HUC6_name,
             y = value,
             fill = HUC6_name)) +
  scale_fill_brewer(palette = "Spectral") +
  geom_hline(data = range_max_all %>%
               filter(Metric %in% log_mets),
             aes(yintercept = value,
                 linetype = range_type),
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  geom_boxplot(outlier.color = 'gray70') +
  facet_wrap(~ Metric,
             ncol = 3,
             scales = 'free') +
  scale_y_continuous(trans = "log",
                     breaks = scales::pretty_breaks()) +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 6),
        axis.text.y = element_text(size = 6))

covar_comp_nolog = comp_df %>%
  filter(!Metric %in% log_mets) %>%
  ggplot(aes(x = HUC6_name,
             y = value,
             fill = HUC6_name)) +
  scale_fill_brewer(palette = "Spectral") +
  geom_hline(data = range_max_all %>%
               filter(!Metric %in% log_mets),
             aes(yintercept = value,
                 linetype = range_type),
             color = 'darkblue') +
  scale_linetype(guide = 'none') +
  geom_boxplot(outlier.color = 'gray70') +
  facet_wrap(~ Metric,
             nrow = 2,
             scales = 'free') +
  scale_y_continuous(trans = "identity",
                     breaks = scales::pretty_breaks()) +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.title = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 6),
        axis.text.y = element_text(size = 6))

covar_comp_p = ggpubr::ggarrange(plotlist = list(covar_comp_log,
                                                 covar_comp_nolog),
                                 nrow = 1,
                                 common.legend = T,
                                 legend = "bottom")
# save the plot
# ggsave('output/figures/GAA_200rch_comparison_huc6.pdf',
#        covar_comp_p,
#        height = 8,
#        width = 11)

# another way to summarize these results
comp_summ = comp_df %>%
  mutate(across(HUC6_name,
                fct_drop)) %>%
  group_by(Metric,
           HUC6_name,
           min, 
           lower_quant, upper_quant,
           max) %>%
  summarise(n_rch = n_distinct(UniqueID),
            n_champ = n_distinct(UniqueID[Source == 'CHaMP Reaches']),
            n_in_range = n_distinct(UniqueID[in_range]),
            n_out_range = n_distinct(UniqueID[!in_range]),
            # n_in_range = n_distinct(UniqueID[in_iqr]),
            # n_out_range = n_distinct(UniqueID[!in_iqr]),
            pct_in_range = n_in_range / n_rch,
            max_huc = max(value, na.rm = T),
            min_huc = min(value, na.rm = T),
            .groups = "drop") %>%
  mutate(pct_max_over = if_else(max_huc > max,
                                (max_huc - max) / max,
                                NA_real_),
         pct_min_under = if_else(min_huc < min,
                                 (min_huc - min) / min,
                                 NA_real_))

# tabyl(comp_summ,
#       HUC6_name)

# comp_summ %>%
  # filter(!is.na(pct_min_under))
  # arrange(pct_in_range)
  # arrange(desc(pct_max_over))

# comp_summ %>%
#   filter(HUC6_name == "Clearwater") %>%
#   pull(pct_in_range)

# comp_df %>%
#   filter(HUC6_name == "Upper Columbia",
#          !in_range) %>%
#   group_by(GNIS_Name) %>%
#   summarise(n_rch = n_distinct(UniqueID),
#             .groups = "drop") %>%
#   arrange(desc(n_rch))



# # correlation between numeric covariates
# rch_200_df %>%
#   select(one_of(extrap_num)) %>%
#   cor(method = 'spearman',
#       use = "pairwise")


# Center the covariates
# filter out reaches with covariates outside range of covariates used to fit extrapolation model
out_range_rchs = rch_200_df %>%
  select(one_of(extrap_num), UniqueID) %>%
  gather(Metric, value, -UniqueID) %>%
  left_join(select(range_max, -Source) %>%
              spread(type, value)) %>%
  group_by(Metric) %>%
  filter(value > max |
           value < min) %>%
  ungroup() %>%
  pull(UniqueID) %>%
  unique()

#----------------------------------------
# for linear models
#----------------------------------------
# center covariates
extrap_summ = inner_join(pred_hab_df %>%
                        select(UniqueID) %>%
                        distinct(),
                      rch_200_df %>%
                        select(UniqueID, one_of(extrap_num))) %>%
  gather(metric_nm, value, -UniqueID) %>%
  group_by(metric_nm) %>%
  summarise(metric_mean = mean(value, na.rm=T),
            metric_sd = sd(value, na.rm=T)) %>%
  ungroup()

# extrapolation model data set, with normalized covariates
mod_data = inner_join(pred_hab_df,
                      rch_200_df %>%
                        select(UniqueID, one_of(extrap_num))) %>%
  gather(metric_nm, value, one_of(extrap_num)) %>%
  left_join(extrap_summ) %>%
  mutate(norm_value = (value - metric_mean) / metric_sd) %>%
  select(-(value:metric_sd)) %>%
  spread(metric_nm, norm_value) %>%
  left_join(rch_200_df %>%
              select(UniqueID, one_of(extrap_catg)))

sum(is.na(mod_data))

# mod_data %<>%
#   bind_cols(mod_data %>%
#               is.na() %>%
#               as_tibble() %>%
#               select(one_of(extrap_covars)) %>%
#               transmute(n_NA = rowSums(.))) %>%
#   filter(n_NA == 0) %>%
#   mutate_at(vars(Watershed, one_of(extrap_catg)),
#             list(fct_drop))

# where to make extrapolation predictions
rch_pred = rch_200_df %>%
  select(UniqueID, one_of(extrap_num)) %>%
  pivot_longer(cols = one_of(extrap_num),
               names_to = "metric_nm",
               values_to = "value") %>%
  left_join(extrap_summ) %>%
  mutate(norm_value = (value - metric_mean) / metric_sd) %>%
  select(-(value:metric_sd)) %>%
  pivot_wider(names_from = "metric_nm",
              values_from = "norm_value") %>%
  left_join(rch_200_df %>%
              select(UniqueID, !any_of(extrap_num))) %>%
  select(any_of(names(rch_200_df))) %>%
  mutate(in_covar_range = if_else(UniqueID %in% out_range_rchs,
                                  F, T))

#----------------------------------------
# pull in survey design related data
#----------------------------------------
# Calculate GRTS design weights.
data("gaa")

# pull in info about what strata each CHaMP site was assigned to (using 2014 as reference year)
site_strata = pred_hab_df %>%
  select(Species, Site, Watershed) %>%
  distinct() %>%
  left_join(gaa %>%
              select(Site,
                     strata = AStrat2014)) %>%
  mutate(site_num = str_split(Site, '-', simplify = T)[,2]) %>%
  mutate(strata = if_else(Watershed == 'Asotin',
                          site_num,
                          if_else(Watershed == 'Entiat' & grepl('ENT00001', Site),
                                  paste('EntiatIMW', site_num, sep = '_'),
                                  strata))) %>%
  mutate(strata = if_else(grepl('EntiatIMW', strata),
                          str_remove(strata, '[[:digit:]]$'),
                          strata),
         strata = if_else(grepl('EntiatIMW', strata),
                          str_remove(strata, '[[:digit:]]$'),
                          strata)) %>%
  filter(!is.na(strata)) %>%
  mutate(strata = paste(Watershed, strata, sep = '_')) %>%
  select(-site_num)

# read in data from the CHaMP frame
champ_frame_df = read_csv('data/prepped/champ_frame_data.csv') %>%
  mutate(Target2014 = ifelse(is.na(AStrat2014), 'Non-Target', Target2014)) %>%
  mutate(AStrat2014 = ifelse(AStrat2014 == 'Entiat IMW', paste('EntiatIMW', GeoRchIMW, sep = '_'), AStrat2014)) %>%
  mutate(UseTypCHSP = ifelse(CHaMPshed == 'Lemhi' & AStrat2014 == 'Little Springs',
                             "Spawning and rearing", UseTypCHSP),
         UseTypSTSU = ifelse(CHaMPshed == 'Lemhi' & AStrat2014 %in% c('Big Springs', 'Little Springs'),
                             "Spawning and rearing", UseTypSTSU)) %>%
  filter(Target2014 == 'Target') %>%
  rename(Watershed = CHaMPshed)

# what strata do we have?
frame_strata = champ_frame_df %>%
  mutate(strata = paste(Watershed, AStrat2014, sep='_')) %>%
  select(Watershed,
         strata) %>%
  distinct()

# how long is each strata, by species?
chnk_strata_length = champ_frame_df %>%
  filter(!is.na(UseTypCHSP)) %>%
  mutate(strata = paste(Watershed, AStrat2014, sep='_')) %>%
  select(Watershed, matches("Strat"), FrameLeng) %>%
  group_by(Watershed, strata) %>%
  summarise(tot_length_km = sum(FrameLeng) / 1000) %>%
  ungroup() %>%
  mutate_at(vars(Watershed, strata),
            list(as.factor)) %>%
  arrange(Watershed, strata)

sthd_strata_length = champ_frame_df %>%
  filter(!is.na(UseTypSTSU)) %>%
  mutate(strata = paste(Watershed, AStrat2014, sep='_')) %>%
  select(Watershed, matches("Strat"), FrameLeng) %>%
  group_by(Watershed, strata) %>%
  summarise(tot_length_km = sum(FrameLeng) / 1000) %>%
  bind_rows(tibble(Watershed = 'Asotin',
                   strata = paste('Asotin', c('CC', 'NF', 'SF'), sep = '_'),
                   tot_length_km = 12)) %>%
  ungroup() %>%
  mutate_at(vars(Watershed, strata),
            list(as.factor)) %>%
  arrange(Watershed, strata)

strata_length = chnk_strata_length %>%
  mutate(Species = 'Chinook') %>%
  bind_rows(sthd_strata_length %>%
              mutate(Species = 'Steelhead')) %>%
  select(Species, everything())

# how many sites in each strata? and what is the length of each strata?
strata_tab = pred_hab_df %>%
  select(Species, Site, Watershed, matches('per_m')) %>%
  left_join(site_strata) %>%
  filter(strata != 'Entiat_Entiat IMW') %>%
  mutate_at(vars(Watershed),
            list(fct_drop)) %>%
  group_by(Species, Watershed, strata) %>%
  summarise(n_sites = n_distinct(Site)) %>%
  ungroup() %>%
  full_join(strata_length) %>%
  mutate(n_sites = if_else(is.na(n_sites),
                           as.integer(0),
                           n_sites)) %>%
  # calculate the weight of each site in each strata
  mutate(site_weight = if_else(n_sites > 0,
                               tot_length_km / n_sites,
                               as.numeric(NA)))


# test to see if we've accounted for all strata and most of each watershed
strata_test = frame_strata %>%
  full_join(strata_tab) %>%
  mutate_at(vars(Watershed),
            list(fct_drop)) %>%
  mutate(n_sites = if_else(is.na(n_sites),
                           as.integer(0),
                           n_sites)) %>%
  select(Species, everything()) %>%
  arrange(Species, Watershed, strata)

# # what frame strata don't have any sites in them?
# strata_test %>%
#   filter(n_sites == 0,
#          !is.na(tot_length_km)) %>%
#   arrange(Species, Watershed, strata) %>%
#   as.data.frame()
#
# # what strata that we have sites for are not in the frame strata?
# strata_test %>%
#   filter(n_sites > 0,
#          (is.na(tot_length_km) |
#             tot_length_km == 0)) %>%
#   as.data.frame()

# champ_frame_df %>%
#   filter(Watershed == 'Lemhi') %>%
#   select(Watershed, AStrat2014, Target2014, UseTypSTSU, FrameLeng) %>%
#   filter(!grepl('Mainstem', AStrat2014)) %>%
#   group_by(AStrat2014) %>%
#   summarise(use_length = sum(FrameLeng[!is.na(UseTypSTSU)]),
#             nonuse_length = sum(FrameLeng[is.na(UseTypSTSU)]))

# # how much of each watershed is not accounted for with current sites / strata?
# strata_test %>%
#   group_by(Species, Watershed) %>%
#   summarise_at(vars(tot_length_km),
#                list(sum),
#                na.rm = T) %>%
#   left_join(strata_test %>%
#               filter(n_sites == 0) %>%
#               group_by(Species, Watershed) %>%
#               summarise_at(vars(missing_length = tot_length_km),
#                            list(sum),
#                            na.rm = T)) %>%
#   mutate_at(vars(missing_length),
#             list(~ if_else(is.na(.), 0, .))) %>%
#   mutate(perc_missing = missing_length / tot_length_km) %>%
#   mutate_at(vars(perc_missing),
#             list(~ if_else(is.na(.), 0, .))) %>%
#   arrange(desc(perc_missing))


# calculate adjusted weights for all predicted QRF capacity sites
mod_data_weights = mod_data %>%
  left_join(site_strata) %>%
  left_join(strata_tab) %>%
  # if site not in a strata, it gets weight proportionate to it's length
  mutate(site_weight = if_else(is.na(site_weight),
                               Lgth_Wet / 1000,
                               site_weight)) %>%
  group_by(Species, Watershed) %>%
  mutate(sum_weights = sum(site_weight)) %>%
  ungroup() %>%
  mutate(adj_weight = site_weight / sum_weights)

#-------------------------------------------------------------
# clean up some memory
#-------------------------------------------------------------
rm(champ_frame_df, champ_site_rch,
   chnk_strata_length, sthd_strata_length, fish_hab,
   frame_strata, strata_length, strata_tab, strata_test,
   gaa,
   hab_avg,
   hab_data,
   hab_impute,
   rch_200)

#-------------------------------------------------------------
# Set up the survey design.
#-------------------------------------------------------------
# getOption('survey.lonely.psu')
# this will prevent strata with only 1 site from contributing to the variance
options(survey.lonely.psu = 'certainty')
# this centers strata with only 1 site to the sample grand mean; this is conservative
# options(survey.lonely.psu = 'adjust')

# extrapolation model formula
full_form = as.formula(paste('log_qrf_cap ~ -1 + (', paste(extrap_covars, collapse = ' + '), ')'))

# fit various models
model_svy_df = mod_data_weights %>%
  gather(response, qrf_cap, matches('per_m')) %>%
  select(-(n_sites:sum_weights)) %>%
  mutate(log_qrf_cap = log(qrf_cap)) %>%
  group_by(Species, response) %>%
  nest() %>%
  mutate(design = map(data,
                      .f = function(x) {
                        svydesign(id = ~ 1,
                                  data = x,
                                  strata = ~ Watershed,
                                  # strata = ~ strata,
                                  weights = ~ adj_weight)
                      })) %>%
  mutate(mod_no_champ = map(design,
                        .f = function(x) {
                          svyglm(full_form,
                                 design = x)
                        }),
         mod_champ = map(design,
                            .f = function(x) {
                              svyglm(update(full_form, .~ . + Watershed),
                                     design = x)
                            })) %>%
  arrange(Species, response) %>%
  ungroup()

# make predictions at all possible reaches, using both models
model_svy_df %<>%
  mutate(pred_all_rchs = list(rch_pred %>%
                                select(UniqueID, one_of(extrap_covars)) %>%
                                na.omit() %>%
                                left_join(rch_pred)),
         # which reaches are in CHaMP watersheds?
         pred_champ_rchs = map2(pred_all_rchs,
                                mod_champ,
                               .f = function(x,y) {
                                 x %>%
                                   left_join(mod_data_weights %>%
                                               select(UniqueID, Watershed) %>%
                                               left_join(rch_200_df %>%
                                                           select(UniqueID, HUC8_code)) %>%
                                               select(HUC8_code, Watershed) %>%
                                               distinct()) %>%
                                   filter(!is.na(Watershed)) %>%
                                   filter(Watershed %in% y$xlevels$Watershed) %>%
                                   mutate_at(vars(Watershed),
                                             list(as.factor))
                               })) %>%
  mutate(pred_no_champ = map2(mod_no_champ,
                              pred_all_rchs,
                              .f = function(x, y) {
                                y %>%
                                  select(UniqueID) %>%
                                  bind_cols(predict(x,
                                                    newdata = y,
                                                    se = T,
                                                    type = 'response') %>%
                                              as_tibble()) %>%
                                  rename(log_fit = response,
                                         log_se = SE) %>%
                                  mutate(pred_cap = exp(log_fit) * (1 + log_se^2 / 2),
                                         pred_se = pred_cap * log_se)
                              }),
         pred_champ = map2(mod_champ,
                           pred_champ_rchs,
                           .f = function(x, y) {
                             y %>%
                               select(UniqueID) %>%
                               bind_cols(predict(x,
                                                 newdata = y,
                                                 se = T,
                                                 type = 'response') %>%
                                           as_tibble()) %>%
                               rename(log_fit = response,
                                      log_se = SE) %>%
                               mutate(pred_cap = exp(log_fit) * (1 + log_se^2 / 2),
                                      pred_se = pred_cap * log_se)
                           }))

# quick comparison of capacity predictons with both models
comp_pred_p = model_svy_df %>%
  select(Species, type = response,
         pred_no_champ) %>%
  unnest(cols = pred_no_champ) %>%
  rename(resp_no_champ = pred_cap,
         se_no_champ = pred_se) %>%
  select(-log_fit, -log_se) %>%
  left_join(model_svy_df %>%
              select(Species, type = response,
                     pred_champ) %>%
              unnest(cols = pred_champ) %>%
              rename(resp_champ = pred_cap,
                     se_champ = pred_se) %>%
              select(-log_fit, -log_se)) %>%
  left_join(rch_200_df %>%
              select(UniqueID, HUC8_code) %>%
              left_join(mod_data_weights %>%
                          select(UniqueID, Watershed) %>%
                          left_join(rch_200_df %>%
                                      select(UniqueID, HUC8_code)) %>%
                          select(HUC8_code, Watershed) %>%
                          distinct())) %>%
  filter(!is.na(resp_champ)) %>%
  # if mod_choice == "redds", filter out crazy values
  # filter(resp_champ < 0.2,
  #        resp_no_champ < 0.2) %>%
  ggplot(aes(x = resp_champ,
             y = resp_no_champ)) +
  geom_point() +
  geom_abline(linetype = 2,
              color = 'red') +
  facet_wrap(~ Watershed + Species + type,
             scales = 'free') +
  labs(x = 'CHaMP',
       y = 'Non-CHaMP')

# comp_pred_p

# pull out all predictions, create data.frame with one row per reach
all_preds = model_svy_df %>%
  select(Species, type = response,
         pred_no_champ) %>%
  unnest(cols = pred_no_champ) %>%
  rename(resp_no_champ = pred_cap,
         se_no_champ = pred_se) %>%
  select(-log_fit, -log_se) %>%
  left_join(model_svy_df %>%
              select(Species, type = response,
                     pred_champ) %>%
              unnest(cols = pred_champ) %>%
              rename(resp_champ = pred_cap,
                     se_champ = pred_se) %>%
              select(-log_fit, -log_se)) %>%
  # add in direct QRF estimates
  left_join(pred_hab_sites %>%
              select(UniqueID, Watershed,
                     matches("per_m")) %>%
              pivot_longer(cols = matches('per_m'),
                           names_to = "type",
                           values_to = "resp_qrf") %>%
              mutate(Species = if_else(grepl('chnk', type),
                                       'Chinook',
                                       'Steelhead')) %>%
              mutate(type = str_replace(type, 'chnk', 'cap'),
                     type = str_replace(type, 'sthd', 'cap')) %>%
              group_by(UniqueID, Watershed, Species, type) %>%
              # slice(which.max(resp_qrf)) %>%
              summarise_at(vars(resp_qrf),
                           list(mean),
                           na.rm = T) %>%
              ungroup()) %>%
  # choose which response to use: if QRF is available, use that, then if CHaMP model is available use that, then go with non-CHaMP model
  mutate(model = if_else(!is.na(resp_qrf),
                         "QRF",
                         if_else(!is.na(resp_champ),
                                 "CHaMP",
                                 "non-CHaMP")),
         response = if_else(model == 'QRF',
                            resp_qrf,
                            if_else(model == 'CHaMP',
                                    resp_champ,
                                    resp_no_champ)),
         SE = if_else(model == 'QRF',
                      0,
                      if_else(model == 'CHaMP',
                              se_champ,
                              se_no_champ))) %>%
  select(Species, type, UniqueID, model, response, SE) %>%
  # add watershed name (and HUC8 code)
  left_join(rch_200_df %>%
              select(UniqueID, HUC8_code, chnk, sthd) %>%
              left_join(mod_data_weights %>%
                          select(UniqueID, Watershed) %>%
                          left_join(rch_200_df %>%
                                      select(UniqueID, HUC8_code)) %>%
                          select(HUC8_code, Watershed) %>%
                          distinct())) %>%
  mutate(model = if_else(!is.na(Watershed) & Watershed == 'Asotin' & model != 'QRF',
                         'CHaMP',
                         model)) %>%
  pivot_longer(cols = c(response, SE),
               names_to = "key",
               values_to = "value") %>%
  mutate(key = if_else(key == 'response',
                       if_else(Species == 'Chinook',
                               str_replace(type, 'cap', 'chnk'),
                               str_replace(type, 'cap', 'sthd')),
                       if_else(Species == 'Chinook',
                               paste0(str_replace(type, 'cap', 'chnk'), "_se"),
                               paste0(str_replace(type, 'cap', 'sthd'), "_se")))) %>%
  select(UniqueID, Watershed, HUC8_code, model, chnk, sthd, key, value) %>%
  pivot_wider(names_from = "key",
              values_from = "value")

# save the results
save(extrap_covars,
     full_form,
     mod_data_weights,
     model_svy_df,
     all_preds,
     file = paste0('output/modelFits/extrap_200rch_', mod_choice, '.rda'))

#---------------------------
# create a shapefile
load(paste0('output/modelFits/extrap_200rch_', mod_choice, '.rda'))
data("rch_200")

rch_200_cap = rch_200 %>%
  select(UniqueID, GNIS_Name, reach_leng:HUC8_code,
         chnk, chnk_use, chnk_ESU_DPS:chnk_NWR_NAME,
         sthd, sthd_use, sthd_ESU_DPS:sthd_NWR_NAME) %>%
  left_join(all_preds %>%
              select(-HUC8_code)) %>%
  filter(reach_leng < 500)
# 
# rm(mod_data_weights, model_svy_df, extrap_covars)
# rm(rch_200, all_preds)
# 
# save it
# as GPKG
# st_write(rch_200_cap,
#          dsn = paste0('output/gpkg/Rch_Cap_', mod_choice, '.gpkg'),
#          driver = 'GPKG')
# 
# 
# try splitting it up and appending each one subsequently, to help speed it up.
rch_200_cap %>%
  mutate_at(vars(HUC6_name),
            list(fct_explicit_na)) %>%
  tabyl(HUC6_name) %>%
  adorn_totals()

rch_200_cap_split = rch_200_cap %>%
  group_split(HUC6_name)
for(i in 1:length(rch_200_cap_split)) {
  cat(paste("Working on group", i, "of", length(rch_200_cap_split),
            "with", nrow(rch_200_cap_split[[i]]), " rows\n"))

  st_write(rch_200_cap_split[[i]],
           dsn = paste0('output/gpkg/Rch_Cap_', mod_choice, '.gpkg'),
           driver = 'GPKG',
           append = if_else(i == 1, F, T))
}
# 
# # as shapefile
# st_write(rch_200_cap,
#          dsn = paste0('output/shapefiles/Rch_Cap_', mod_choice, '.shp'),
#          driver = 'ESRI Shapefile')
# 
# 
# #-------------------------------------------------------------
# # build other extrapolation models
# # linear model without design weights
# # random forest
# #-------------------------------------------------------------
# # fit various models that don't account for survey design
# model_lm_df = mod_data %>%
#   gather(response, qrf_cap, matches('per_m')) %>%
#   mutate(log_qrf_cap = log(qrf_cap)) %>%
#   group_by(Species, response) %>%
#   nest() %>%
#   mutate(mod_no_champ = map(data,
#                             .f = function(x) {
#                               lm(full_form,
#                                  data = x)
#                             }),
#          mod_champ = map(data,
#                          .f = function(x) {
#                            lm(update(full_form, .~ . + Watershed),
#                               data = x)
#                          })) %>%
#   arrange(Species, response) %>%
#   ungroup() %>%
#   # make predictions at all possible reaches, using both models
#   mutate(pred_all_rchs = list(rch_pred %>%
#                                 select(UniqueID, one_of(extrap_covars)) %>%
#                                 na.omit() %>%
#                                 left_join(rch_pred)),
#          # which reaches are in CHaMP watersheds? 
#          pred_champ_rchs = map2(pred_all_rchs,
#                                 mod_champ,
#                                 .f = function(x,y) {
#                                   x %>%
#                                     left_join(mod_data %>%
#                                                 select(UniqueID, Watershed) %>%
#                                                 left_join(rch_200_df %>%
#                                                             select(UniqueID, HUC8_code)) %>%
#                                                 select(HUC8_code, Watershed) %>%
#                                                 distinct()) %>%
#                                     filter(!is.na(Watershed)) %>%
#                                     filter(Watershed %in% y$xlevels$Watershed) %>%
#                                     mutate_at(vars(Watershed),
#                                               list(as.factor))
#                                 })) %>%
#   mutate(pred_no_champ = map2(mod_no_champ,
#                               pred_all_rchs,
#                               .f = function(x, y) {
#                                 y %>%
#                                   select(UniqueID) %>%
#                                   bind_cols(predict(x,
#                                                     newdata = y,
#                                                     se = T,
#                                                     type = 'response') %>%
#                                               as_tibble() %>%
#                                               select(response = fit,
#                                                      SE = se.fit))
#                               }),
#          pred_champ = map2(mod_champ,
#                            pred_champ_rchs,
#                            .f = function(x, y) {
#                              y %>%
#                                select(UniqueID) %>%
#                                bind_cols(predict(x,
#                                                  newdata = y,
#                                                  se = T,
#                                                  type = 'response') %>%
#                                            as_tibble() %>%
#                                            select(response = fit,
#                                                   SE = se.fit))
#                            }))
# 
# # pull out all predictions, create data.frame with one row per reach
# all_preds = model_lm_df %>%
#   select(Species, type = response, 
#          pred_no_champ) %>%
#   unnest(cols = pred_no_champ) %>%
#   rename(resp_no_champ = response,
#          se_no_champ = SE) %>%
#   left_join(model_lm_df %>%
#               select(Species, type = response, 
#                      pred_champ) %>%
#               unnest(cols = pred_champ) %>%
#               rename(resp_champ = response,
#                      se_champ = SE)) %>%
#   mutate_at(vars(starts_with("resp"), 
#                  starts_with("se")),
#             list(exp)) %>%
#   # add in direct QRF estimates
#   left_join(pred_hab_sites %>%
#               select(UniqueID, Watershed,
#                      matches("per_m")) %>%
#               pivot_longer(cols = matches('per_m'),
#                            names_to = "type",
#                            values_to = "resp_qrf") %>%
#               mutate(Species = if_else(grepl('chnk', type),
#                                        'Chinook',
#                                        'Steelhead')) %>%
#               mutate(type = str_replace(type, 'chnk', 'cap'),
#                      type = str_replace(type, 'sthd', 'cap')) %>%
#               group_by(UniqueID, Watershed, Species, type) %>%
#               # slice(which.max(resp_qrf)) %>%
#               summarise_at(vars(resp_qrf),
#                            list(mean),
#                            na.rm = T) %>%
#               ungroup()) %>%
#   # choose which response to use: if QRF is available, use that, then if CHaMP model is available use that, then go with non-CHaMP model
#   mutate(model = if_else(!is.na(resp_qrf),
#                          "QRF",
#                          if_else(!is.na(resp_champ),
#                                  "CHaMP",
#                                  "non-CHaMP")),
#          response = if_else(model == 'QRF',
#                             resp_qrf,
#                             if_else(model == 'CHaMP',
#                                     resp_champ,
#                                     resp_no_champ)),
#          SE = if_else(model == 'QRF',
#                       0,
#                       if_else(model == 'CHaMP',
#                               se_champ,
#                               se_no_champ))) %>%
#   select(Species, type, UniqueID, model, response, SE) %>%
#   # add watershed name (and HUC8 code)
#   left_join(rch_200_df %>%
#               select(UniqueID, HUC8_code, chnk, sthd) %>%
#               left_join(mod_data_weights %>%
#                           select(UniqueID, Watershed) %>%
#                           left_join(rch_200_df %>%
#                                       select(UniqueID, HUC8_code)) %>%
#                           select(HUC8_code, Watershed) %>%
#                           distinct())) %>%
#   mutate(model = if_else(!is.na(Watershed) & Watershed == 'Asotin' & model != 'QRF',
#                          'CHaMP',
#                          model)) %>%
#   pivot_longer(cols = c(response, SE),
#                names_to = "key", 
#                values_to = "value") %>%
#   mutate(key = if_else(key == 'response',
#                        if_else(Species == 'Chinook',
#                                str_replace(type, 'cap', 'chnk'),
#                                str_replace(type, 'cap', 'sthd')),
#                        if_else(Species == 'Chinook',
#                                paste0(str_replace(type, 'cap', 'chnk'), "_se"),
#                                paste0(str_replace(type, 'cap', 'sthd'), "_se")))) %>%
#   select(UniqueID, Watershed, HUC8_code, model, chnk, sthd, key, value) %>%
#   pivot_wider(names_from = "key", 
#               values_from = "value")

# fit various random forest models
# to account for design weights, might need to create new dataset by resampling original data, with probabilities weighted by design weights - update: this may not be a good idea, I think it biases the out-of-bag error rates

# extrapolation model formula
full_form = as.formula(paste('log_qrf_cap ~ -1 + (', paste(extrap_covars, collapse = ' + '), ')'))

model_rf_df = inner_join(pred_hab_df,
                         rch_200_df %>%
                           select(UniqueID, one_of(extrap_covars))) %>%
  gather(response, qrf_cap, matches('per_m')) %>%
  # mutate(log_qrf_cap = log(qrf_cap)) %>%
  group_by(Species, response) %>%
  nest() %>%
  ungroup() %>%
  mutate(mod_no_champ = map(data,
                            .f = function(x) {
                              randomForest(update(full_form, qrf_cap ~ .),
                                           data = x,
                                           ntree = 2000)
                            }),
         mod_champ = map(data,
                         .f = function(x) {
                           randomForest(update(full_form, qrf_cap ~. + Watershed),
                                        data = x,
                                        ntree = 2000)
                         })) %>%
  # make predictions at all possible reaches, using both models
  mutate(pred_all_rchs = list(rch_200_df %>%
                                mutate(in_covar_range = ifelse(UniqueID %in% out_range_rchs, F, T)) %>%
                                select(UniqueID, in_covar_range, everything()))) %>%
  # drop reaches with missing covariates
  mutate(pred_all_rchs = map(pred_all_rchs,
                             .f = function(x) {
                               x %>%
                                 select(UniqueID, one_of(extrap_covars)) %>%
                                 na.omit() %>%
                                 left_join(x)
                               }),
         # which reaches are in CHaMP watersheds? 
         pred_champ_rchs = map2(pred_all_rchs,
                                mod_champ,
                                .f = function(x,y) {
                                  x %>%
                                    left_join(pred_hab_df %>%
                                                select(UniqueID, Watershed) %>%
                                                left_join(rch_200_df %>%
                                                            select(UniqueID, HUC8_code)) %>%
                                                select(HUC8_code, Watershed) %>%
                                                distinct()) %>%
                                    filter(!is.na(Watershed)) %>%
                                    filter(Watershed %in% unique(pred_hab_df$Watershed)) %>%
                                    mutate_at(vars(Watershed),
                                              list(as.factor))
                                })) %>%
  mutate(pred_no_champ = map2(mod_no_champ,
                              pred_all_rchs,
                              .f = function(x, y) {
                                # this doesn't work because I run out of memory. So I can't get a SE on non-CHaMP predictions
                                # preds = predict(x,
                                #                 newdata = y,
                                #                 predict.all = T)
                                # 
                                # y %>%
                                #   select(UniqueID) %>%
                                #   bind_cols(tibble(pred_cap = preds$aggregate,
                                #                    pred_se = preds$individual %>%
                                #                      apply(1, sd)))
                                
                                # split into smaller datasets by HUC
                                y %>%
                                  group_by(HUC8_code) %>%
                                  group_split() %>%
                                  map_df(.f = function(z) {
                                    preds = predict(x,
                                                    newdata = z,
                                                    predict.all = T)
                                    
                                    z %>%
                                      select(UniqueID) %>%
                                      bind_cols(tibble(pred_cap = preds$aggregate,
                                                       pred_se = preds$individual %>%
                                                         apply(1, sd)))
                                  })
                                
                                

                                # # this version doesn't provide any standard errors
                                # y %>%
                                #   select(UniqueID) %>%
                                #   bind_cols(tibble(pred_cap = predict(x,
                                #                                       newdata = y)))
                                  
                              }),
         pred_champ = map2(mod_champ,
                           pred_champ_rchs,
                           .f = function(x, y) {
                             # preds = predict(x,
                             #                 newdata = y,
                             #                 predict.all = T)
                             # 
                             # y %>%
                             #   select(UniqueID) %>%
                             #   bind_cols(tibble(pred_cap = preds$aggregate,
                             #                    pred_se = preds$individual %>%
                             #                      apply(1, sd)))
                             
                             # split into smaller datasets by HUC
                             y %>%
                               group_by(HUC8_code) %>%
                               group_split() %>%
                               map_df(.f = function(z) {
                                 preds = predict(x,
                                                 newdata = z,
                                                 predict.all = T)
                                 
                                 z %>%
                                   select(UniqueID) %>%
                                   bind_cols(tibble(pred_cap = preds$aggregate,
                                                    pred_se = preds$individual %>%
                                                      apply(1, sd)))
                               })
                             
                             # # this version doesn't include any standard errors
                             # y %>%
                             #   select(UniqueID) %>%
                             #   bind_cols(tibble(pred_cap = predict(x,
                             #                                       newdata = y)))
                             
                           })) %>%
  arrange(Species, response)

# Sys.unsetenv("R_MAX_VSIZE")

# pull out all predictions, create data.frame with one row per reach
all_preds = model_rf_df %>%
  select(Species, type = response, 
         pred_no_champ) %>%
  unnest(cols = pred_no_champ) %>%
  # rename(resp_no_champ = pred_cap) %>%
  rename(resp_no_champ = pred_cap,
         se_no_champ = pred_se) %>%
  left_join(model_rf_df %>%
              select(Species, type = response, 
                     pred_champ) %>%
              unnest(cols = pred_champ) %>%
              # rename(resp_champ = pred_cap)) %>%
              rename(resp_champ = pred_cap,
                     se_champ = pred_se)) %>%
  # mutate_at(vars(starts_with("resp"), 
  #                starts_with("se")),
  #           list(exp)) %>%
  # add in direct QRF estimates
  left_join(pred_hab_sites %>%
              select(UniqueID, Watershed,
                     matches("per_m")) %>%
              pivot_longer(cols = matches('per_m'),
                           names_to = "type",
                           values_to = "resp_qrf") %>%
              mutate(Species = if_else(grepl('chnk', type),
                                       'Chinook',
                                       'Steelhead')) %>%
              mutate(type = str_replace(type, 'chnk', 'cap'),
                     type = str_replace(type, 'sthd', 'cap')) %>%
              group_by(UniqueID, Watershed, Species, type) %>%
              # slice(which.max(resp_qrf)) %>%
              summarise_at(vars(resp_qrf),
                           list(mean),
                           na.rm = T) %>%
              ungroup()) %>%
  # choose which response to use: if QRF is available, use that, then if CHaMP model is available use that, then go with non-CHaMP model
  mutate(model = if_else(!is.na(resp_qrf),
                         "QRF",
                         if_else(!is.na(resp_champ),
                                 "CHaMP",
                                 "non-CHaMP")),
         response = if_else(model == 'QRF',
                            resp_qrf,
                            if_else(model == 'CHaMP',
                                    resp_champ,
                                    resp_no_champ))) %>%
  mutate(SE = if_else(model == 'QRF',
                      0,
                      if_else(model == 'CHaMP',
                              se_champ,
                              # NA_real_))) %>%
                              se_no_champ))) %>%
  select(Species, type, UniqueID, model, response, SE) %>%
  # add watershed name (and HUC8 code)
  left_join(rch_200_df %>%
              select(UniqueID, HUC8_code, chnk, sthd) %>%
              left_join(pred_hab_df %>%
                          select(UniqueID, Watershed) %>%
                          left_join(rch_200_df %>%
                                      select(UniqueID, HUC8_code)) %>%
                          select(HUC8_code, Watershed) %>%
                          distinct())) %>%
  mutate(model = if_else(!is.na(Watershed) & Watershed == 'Asotin' & model != 'QRF',
                         'CHaMP',
                         model)) %>% 
  filter(!is.na(response)) %>%
  pivot_longer(cols = c(response, SE),
               names_to = "key", 
               values_to = "value") %>%
  mutate(key = if_else(key == 'response',
                       if_else(Species == 'Chinook',
                               str_replace(type, 'cap', 'chnk'),
                               str_replace(type, 'cap', 'sthd')),
                       if_else(Species == 'Chinook',
                               paste0(str_replace(type, 'cap', 'chnk'), "_se"),
                               paste0(str_replace(type, 'cap', 'sthd'), "_se")))) %>%
  select(UniqueID, Watershed, HUC8_code, model, chnk, sthd, key, value) %>%
  pivot_wider(names_from = "key", 
              values_from = "value",
              values_fill = NA_real_)

# save the results
save(extrap_covars,
     full_form,
     pred_hab_df,
     model_rf_df,
     all_preds,
     file = paste0('output/modelFits/extrap_200rch_RF_', mod_choice, '.rda'))

# # using the randomForestSRC package
# library(randomForestSRC)
# model_rfsrc_df = inner_join(pred_hab_df,
#                             rch_200_df %>%
#                               select(UniqueID, one_of(extrap_covars))) %>%
#   gather(response, qrf_cap, matches('per_m')) %>%
#   mutate(log_qrf_cap = log(qrf_cap)) %>%
#   group_by(Species, response) %>%
#   nest() %>%
#   ungroup()%>%
#   mutate(mod_no_champ = map(data,
#                             .f = function(x) {
#                               rfsrc(update(full_form, qrf_cap ~ .),
#                                     data = as.data.frame(x),
#                                     ntree = 5000)
#                             }),
#          mod_champ = map(data,
#                          .f = function(x) {
#                            rfsrc(update(full_form, qrf_cap ~ . + Watershed),
#                                  data = as.data.frame(x),
#                                  ntree = 5000)
#                          })) %>%
#   # make predictions at all possible reaches, using both models
#   mutate(pred_all_rchs = list(rch_200_df %>%
#                                 mutate(in_covar_range = ifelse(UniqueID %in% out_range_rchs, F, T)) %>%
#                                 select(UniqueID, in_covar_range, everything()))) %>%
#   # drop reaches with missing covariates
#   mutate(pred_all_rchs = map(pred_all_rchs,
#                              .f = function(x) {
#                                x %>%
#                                  select(UniqueID, one_of(extrap_covars)) %>%
#                                  na.omit() %>%
#                                  left_join(x)
#                              }),
#          # which reaches are in CHaMP watersheds? 
#          pred_champ_rchs = map2(pred_all_rchs,
#                                 mod_champ,
#                                 .f = function(x,y) {
#                                   x %>%
#                                     left_join(pred_hab_df %>%
#                                                 select(UniqueID, Watershed) %>%
#                                                 left_join(rch_200_df %>%
#                                                             select(UniqueID, HUC8_code)) %>%
#                                                 select(HUC8_code, Watershed) %>%
#                                                 distinct()) %>%
#                                     filter(!is.na(Watershed)) %>%
#                                     filter(Watershed %in% unique(pred_hab_df$Watershed)) %>%
#                                     mutate_at(vars(Watershed),
#                                               list(as.factor))
#                                 })) %>%
#   mutate(pred_no_champ = map2(mod_no_champ,
#                               pred_all_rchs,
#                               .f = function(x, y) {
#                                 preds = predict(x,
#                                                 newdata = y %>%
#                                                   select(UniqueID, 
#                                                          one_of(x$forest$xvar.names)),
#                                                 na.action = "na.impute")
#                                 y %>%
#                                   select(UniqueID) %>%
#                                   mutate(pred_cap = preds$predicted)
#                               }),
#          pred_champ = map2(mod_champ,
#                            pred_champ_rchs,
#                            .f = function(x, y) {
#                              preds = predict(x,
#                                              newdata = y %>%
#                                                select(UniqueID, 
#                                                       one_of(x$forest$xvar.names)),
#                                              na.action = "na.impute")
#                              
#                              y %>%
#                                select(UniqueID) %>%
#                                mutate(pred_cap = preds$predicted)
#                                # bind_cols(tibble(pred_cap = predict(x,
#                                #                                     newdata = y %>%
#                                #                                       select(one_of(x$forest$xvar.names)) %>%
#                                #                                       mutate_at(vars(ends_with('accu'),
#                                #                                                      ends_with('accum')),
#                                #                                                 list(as.numeric)),
#                                #                                     na.action = "na.impute")$predicted))
#                            }))
# 
# # pull out all predictions, create data.frame with one row per reach
# all_preds = model_rfsrc_df %>%
#   select(Species, type = response, 
#          pred_no_champ) %>%
#   unnest(cols = pred_no_champ) %>%
#   rename(resp_no_champ = pred_cap) %>%
#   # rename(resp_no_champ = pred_cap,
#   #        se_no_champ = pred_se) %>%
#   left_join(model_rfsrc_df %>%
#               select(Species, type = response, 
#                      pred_champ) %>%
#               unnest(cols = pred_champ) %>%
#               rename(resp_champ = pred_cap)) %>%
#   # rename(resp_champ = pred_cap,
#   #        se_champ = pred_se)) %>%
#   # mutate_at(vars(starts_with("resp"), 
#   #                starts_with("se")),
#   #           list(exp)) %>%
#   # add in direct QRF estimates
#   left_join(pred_hab_sites %>%
#               select(UniqueID, Watershed,
#                      matches("per_m")) %>%
#               pivot_longer(cols = matches('per_m'),
#                            names_to = "type",
#                            values_to = "resp_qrf") %>%
#               mutate(Species = if_else(grepl('chnk', type),
#                                        'Chinook',
#                                        'Steelhead')) %>%
#               mutate(type = str_replace(type, 'chnk', 'cap'),
#                      type = str_replace(type, 'sthd', 'cap')) %>%
#               group_by(UniqueID, Watershed, Species, type) %>%
#               # slice(which.max(resp_qrf)) %>%
#               summarise_at(vars(resp_qrf),
#                            list(mean),
#                            na.rm = T) %>%
#               ungroup()) %>%
#   # choose which response to use: if QRF is available, use that, then if CHaMP model is available use that, then go with non-CHaMP model
#   mutate(model = if_else(!is.na(resp_qrf),
#                          "QRF",
#                          if_else(!is.na(resp_champ),
#                                  "CHaMP",
#                                  "non-CHaMP")),
#          response = if_else(model == 'QRF',
#                             resp_qrf,
#                             if_else(model == 'CHaMP',
#                                     resp_champ,
#                                     resp_no_champ))) %>% #,
#   # SE = if_else(model == 'QRF',
#   #              0,
#   #              if_else(model == 'CHaMP',
#   #                      se_champ,
#   #                      se_no_champ))) %>%
#   select(Species, type, UniqueID, model, response) %>% #, SE) %>%
#   # add watershed name (and HUC8 code)
#   left_join(rch_200_df %>%
#               select(UniqueID, HUC8_code, chnk, sthd) %>%
#               left_join(mod_data_weights %>%
#                           select(UniqueID, Watershed) %>%
#                           left_join(rch_200_df %>%
#                                       select(UniqueID, HUC8_code)) %>%
#                           select(HUC8_code, Watershed) %>%
#                           distinct())) %>%
#   mutate(model = if_else(!is.na(Watershed) & Watershed == 'Asotin' & model != 'QRF',
#                          'CHaMP',
#                          model)) %>% 
#   filter(!is.na(response)) %>%
#   pivot_longer(cols = c(response),
#                names_to = "key", 
#                values_to = "value") %>%
#   mutate(key = if_else(key == 'response',
#                        if_else(Species == 'Chinook',
#                                str_replace(type, 'cap', 'chnk'),
#                                str_replace(type, 'cap', 'sthd')),
#                        if_else(Species == 'Chinook',
#                                paste0(str_replace(type, 'cap', 'chnk'), "_se"),
#                                paste0(str_replace(type, 'cap', 'sthd'), "_se")))) %>%
#   select(UniqueID, Watershed, HUC8_code, model, chnk, sthd, key, value) %>%
#   pivot_wider(names_from = "key", 
#               values_from = "value")
# 
# # save the results
# save(extrap_covars,
#      full_form,
#      mod_data_weights,
#      model_rfsrc_df,
#      all_preds,
#      file = paste0('output/modelFits/extrap_200rch_RFSRC_', mod_choice, '.rda'))


#---------------------------
# create a geopackage from random forest extrapolation
data("rch_200")
for(mod_choice in c('juv_summer',
                    'juv_summer_dash',
                    'redds',
                    'juv_winter')) {
  
  load(paste0('output/modelFits/extrap_200rch_RF_', mod_choice, '.rda'))
  
  
  rch_200_cap = rch_200 %>%
    select(UniqueID, GNIS_Name, reach_leng:HUC8_code, 
           chnk, chnk_use, chnk_ESU_DPS:chnk_NWR_NAME,
           sthd, sthd_use, sthd_ESU_DPS:sthd_NWR_NAME) %>%
    left_join(all_preds %>%
                select(-chnk, -sthd) %>%
                select(-HUC8_code)) %>%
    filter(reach_leng < 500)
  
  rm(pred_hab_df, model_rf_df, extrap_covars, all_preds,
     full_form)
  # rm(rch_200)
  
  # # try splitting it up and appending each one subsequently, to help speed it up.
  # rch_200_cap %>%
  #   mutate_at(vars(HUC6_name),
  #             list(fct_explicit_na)) %>%
  #   tabyl(HUC6_name) %>%
  #   adorn_totals()
  
  rch_200_cap_split = rch_200_cap %>%
    group_split(HUC6_name)
  for(i in 1:length(rch_200_cap_split)) {
    cat(paste("Working on group", i, "out of", length(rch_200_cap_split), "with", nrow(rch_200_cap_split[[i]]), " rows\n"))
    
    st_write(rch_200_cap_split[[i]],
             dsn = paste0('output/gpkg/Rch_Cap_RF_', mod_choice, '.gpkg'),
             driver = 'GPKG',
             append = if_else(i == 1, F, T))
  }
}


#-----------------------------------------------
# compare all the predictions
#-----------------------------------------------
preds_comp = list('Survey' = model_svy_df,
                  "lm" = model_lm_df,
                  "SRC" = model_rfsrc_df,
                  "RF" = model_rf_df) %>%
  map_df(.id = 'model',
         .f = function(x) {
           x %>%
             select(Species, type = response, mod_pred = pred_no_champ) %>%
             unnest(cols = mod_pred) %>%
             select(Species, type, UniqueID, response) %>%
             gather(key, value, response) %>%
             mutate(key = if_else(key == 'response',
                                  if_else(Species == 'Chinook',
                                          str_replace(type, 'cap', 'chnk'),
                                          str_replace(type, 'cap', 'sthd')),
                                  if_else(Species == 'Chinook',
                                          paste0(str_replace(type, 'cap', 'chnk'), "_se"),
                                          paste0(str_replace(type, 'cap', 'sthd'), "_se")))) %>%
             select(UniqueID, key, value) %>%
             spread(key, value) %>%
             mutate_at(vars(matches('per_m')),
                       list(exp))
         })

test = preds_comp %>%
  gather(key, value, matches('per_m')) %>%
  spread(model, value) %>%
  filter(key == 'chnk_per_m')

summary(test)

cor_pear = test %>%
  select(lm:Survey) %>%
  corrr::correlate(method = 'pearson')
cor_spear = test %>%
  select(lm:Survey) %>%
  corrr::correlate(method = 'spearman')
cor_ken = test %>%
  select(lm:Survey) %>%
  corrr::correlate(method = 'kendall')

cor_pear
cor_spear
cor_ken



test %>%
  filter(lm < 10,
         Survey < 10) %>%
  GGally::ggpairs(columns = 3:6,
                  lower = list("continuous" = 
                                 function(data, mapping) {
                                   ggplot(data, 
                                          mapping) + 
                                     geom_point() +
                                     geom_abline(linetype = 2,
                                                 color = 'red') +
                                     geom_smooth()
                                 }))
KevinSee/QRFcapacity documentation built on Feb. 27, 2023, 3:57 p.m.