analysis/old_fit_qrf_redds.R

# Author: Kevin See
# Purpose: Fit QRF model to redd data, using CHaMP habitat 2011-2017
# Created: 12/31/2019
# Last Modified: 3/20/2020
# Notes: 

#-----------------------------------------------------------------
# load needed libraries
library(QRFcapacity)
# library(maptools)
library(tidyverse)
library(janitor)
library(magrittr)
library(sf)
library(quantregForest)
library(survey)

# set default theme for ggplot
theme_set(theme_bw())

#-----------------------------------------------------------------
# determine which set of fish/habitat data to use
data("fh_redds_champ_2017")
fish_hab = fh_redds_champ_2017 %>%
  mutate_at(vars(Watershed),
            list(as.factor))

# alter a few metrics
fish_hab %<>%
  # scale some metrics by site length
  mutate_at(vars(starts_with('LWVol'),
                 ends_with('_Vol')),
            list(~ . / Lgth_Wet * 100)) %>%
  # add a metric showing "some" riparian canopy
  mutate(RipCovCanSome = 100 - RipCovCanNone) %>%
  # add a metric showing "some" fish cover
  mutate(FishCovSome = 100 - FishCovNone)


# and the appropriate habitat dictionrary to go with it
data("hab_dict_2017")
hab_dict = hab_dict_2017

# change some of the descriptions for large wood volume
hab_dict %<>%
  mutate(DescriptiveText = if_else(grepl("^LWVol", ShortName),
                                   paste0(str_remove(DescriptiveText, ".$"),
                                          ", scaled by site length."),
                                   DescriptiveText),
         UnitOfMeasure = if_else(grepl("^LWVol", ShortName),
                                 paste0(UnitOfMeasure,
                                        " per 100 meters"),
                                 UnitOfMeasure),
         UnitOfMeasureAbbrv = if_else(grepl("^LWVol", ShortName),
                                      paste0(UnitOfMeasureAbbrv,
                                             "/100m"),
                                      UnitOfMeasureAbbrv)) %>%
  # add description for some riparian canopy
  bind_rows(hab_dict_2017 %>%
              filter(ShortName == "RipCovCanNone") %>%
              mutate(ShortName = "RipCovCanSome",
                     Name = "Riparian Cover: Some Canopy",
                     DescriptiveText = "Percent of riparian canopy with some vegetation.")) %>%
  # add description for no riparian ground cover
  bind_rows(hab_dict_2017 %>%
              filter(ShortName == "RipCovGrnd") %>%
              mutate(ShortName = "RipCovGrndNone",
                     Name = "Riparian Cover: No Ground",
                     DescriptiveText = "Percent of groundcover with no vegetation.")) %>%
  # add description for some fish cover
  bind_rows(hab_dict_2017 %>%
              filter(ShortName == "FishCovNone") %>%
              mutate(ShortName = "FishCovSome",
                     Name = "Fish Cover: Some Cover",
                     DescriptiveText = "Percent of wetted area with some form of fish cover")) %>%
  # add description for Elevation
  bind_rows(tibble(ShortName = 'Elev_M',
                   Name = 'Elevation',
                   UnitOfMeasure = 'Meter',
                   UnitOfMeasureAbbrv = 'm'))

# all the related habitat data
data("champ_site_2011_17")
hab_data = champ_site_2011_17

data("champ_site_2011_17_avg")
hab_avg = champ_site_2011_17_avg

# alter a few metrics
hab_data %<>%
  # scale some metrics by site length
  mutate_at(vars(starts_with('LWVol'),
                 ends_with('_Vol')),
            list(~ . / Lgth_Wet * 100)) %>%
  # add a metric showing "some" riparian canopy
  mutate(RipCovCanSome = 100 - RipCovCanNone) %>%
  # add a metric showing "some" fish cover
  mutate(FishCovSome = 100 - FishCovNone)

hab_avg %<>%
  # scale some metrics by site length
  mutate_at(vars(starts_with('LWVol'),
                 ends_with('_Vol')),
            list(~ . / Lgth_Wet * 100)) %>%
  # add a metric showing "some" riparian canopy
  mutate(RipCovCanSome = 100 - RipCovCanNone) %>%
  # add a metric showing "some" fish cover
  mutate(FishCovSome = 100 - FishCovNone)


# add temperature metrics
data("champ_temps")
hab_avg %<>%
  left_join(champ_temps %>%
              as_tibble() %>%
              select(Site, avg_aug_temp = S2_02_11) %>%
              distinct())

hab_data %<>%
  left_join(hab_data %>%
              select(VisitID, Year = VisitYear) %>%
              distinct() %>%
              left_join(champ_temps %>%
                          as_tibble() %>%
                          select(VisitID, avg_aug_temp = S2_02_11))) %>%
  left_join(hab_data %>%
              select(VisitID, Year = VisitYear) %>%
              distinct() %>%
              left_join(champ_temps %>%
                          as_tibble() %>%
                          select(Site:VisitID, S1_93_11:S36_2015) %>%
                          gather(scenario, aug_temp, S1_93_11:S36_2015) %>%
                          mutate(Year = str_sub(scenario, -4)) %>%
                          mutate_at(vars(Year),
                                    list(as.numeric)) %>%
                          filter(!is.na(Year)) %>%
                          select(Site:VisitID, Year, aug_temp))) %>%
  select(-Year)

#-----------------------------------------------------------------
# clip Chinook data to Chinook domain
data("rch_200")
data("champ_site_rch")

chnk_sites = champ_site_rch %>%
  inner_join(rch_200 %>%
               select(UniqueID, chnk)) %>%
  filter(chnk) %>%
  pull(Site) %>%
  as.character()

# add Big Springs and Little Springs sites in the Lemhi
chnk_sites = c(chnk_sites,
               hab_data %>%
                 filter(grepl('Big0Springs', Site) | grepl('Little0Springs', Site)) %>%
                 pull(Site) %>%
                 unique()) %>%
  unique()


# only keep Chinook data from sites in Chinook domain
fish_hab %<>%
  filter(Species == 'Steelhead' |
           (Species == 'Chinook' & (Site %in% chnk_sites | fish_dens > 0)))

# fish_hab %>%
#   filter(!(Site %in% chnk_sites),
#          Species == 'Chinook',
#          fish_dens > 0) %>%
#   select(Watershed, Site, maxYr, starts_with('maxRedds'), fish_dens) %>%
#   left_join(champ_site_rch %>%
#               inner_join(rch_200 %>%
#                            st_drop_geometry() %>%
#                            select(UniqueID, chnk)))
#   arrange(desc(fish_dens)) %>%
#   tabyl(Watershed,
#         show_missing_levels = F)

# #-------------------------------
# data("chnk_domain")
# 
# # which sites were sampled for Chinook? 
# chnk_samps = fish_hab %>%
#   filter(Species == 'Chinook') %>%
#   select(Site, Watershed, LON_DD, LAT_DD) %>%
#   distinct() %>%
#   st_as_sf(coords = c('LON_DD', 'LAT_DD'),
#            crs = 4326) %>%
#   st_transform(st_crs(chnk_domain))
# 
# # set snap distance (in meters)
# st_crs(chnk_samps)
# snap_dist = 1000
# 
# # which of those sites are in Chinook domain?
# chnk_sites = chnk_samps %>%
#   as_Spatial() %>%
#   maptools::snapPointsToLines(chnk_domain %>%
#                                 mutate(id = 1:n()) %>%
#                                 select(id, MPG) %>%
#                                 as_Spatial(),
#                               maxDist = snap_dist,
#                               withAttrs = T,
#                               idField = 'id') %>%
#   as('sf')
# 
# # ggplot() +
# #   geom_sf(data = chnk_samps,
# #           aes(color = 'Sampled')) +
# #   geom_sf(data = chnk_sites,
# #           aes(color = 'In Chnk Range')) +
# #   theme(axis.text = element_blank())
# 
# fish_hab %<>%
#   filter(Species == 'Steelhead' |
#            (Species == 'Chinook' & Site %in% chnk_sites$Site))


#-----------------------------------------------------------------
# select which habitat metrics to use in QRF model
# based on conversation with Mike and Richie
# sel_hab_mets = crossing(Species = c('Chinook', 
#                                     'Steelhead'),
#                         Metric = c('MeanU',
#                                    'Elev_M',
#                                    'DistPrin1',
#                                    'avg_aug_temp',
#                                    'SubEstGrvl'))

# updated on 3/19/20
sel_hab_mets = crossing(Species = c('Chinook', 
                                    'Steelhead'),
                        Metric = c('SubEstGrvl',
                                   'SubLT2',
                                   'CU_Freq',
                                   "DetrendElev_SD",
                                   "FishCovSome",
                                   "UcutLgth_Pct",
                                   "RipCovCanSome",
                                   "Q",
                                   "DistPrin1",
                                   "PoolResidDpth",
                                   "avg_aug_temp",
                                   "LWFreq_Wet"))

#-----------------------------------------------------------------
# Fit QRF model
#-----------------------------------------------------------------
# impute missing data in fish / habitat dataset

# impute missing habitat metrics once, for both species
all_covars = sel_hab_mets %>%
  pull(Metric) %>%
  unique()

# are all covars present in fish/habitat data?
all_covars[!all_covars %in% names(fish_hab)]

qrf_mod_df = impute_missing_data(data = fish_hab %>%
                                   select(-(maxYr:maxReddsPerMsq)) %>%
                                   distinct(),
                                 covars = all_covars,
                                 impute_vars = c('Watershed', 'Elev_M', 'Sin', 'CUMDRAINAG'),
                                 method = 'missForest') %>%
  left_join(fish_hab %>%
              select(Species:Site, maxYr:maxReddsPerMsq)) %>%
  mutate(fish_dens = maxReddsPerMsq) %>%
  select(Species, Site, Watershed, maxYr, LON_DD, LAT_DD, fish_dens, one_of(all_covars))

rm(all_covars)

# fit the QRF model
# set the density offset (to accommodate 0s)
range(qrf_mod_df$fish_dens)
# dens_offset = 0.005
dens_offset = 0

# fit random forest models
qrf_mods = qrf_mod_df %>%
  split(list(.$Species)) %>%
  map(.f = function(z) {
    
    covars = sel_hab_mets %>%
      filter(Species == unique(z$Species)) %>%
      pull(Metric)
    
    set.seed(3)
    qrf_mod = quantregForest(x = z %>%
                               select(one_of(covars)) %>%
                               as.matrix,
                             y = z %>%
                               mutate_at(vars(fish_dens),
                                         list(~ log(. + dens_offset))) %>%
                               select(fish_dens) %>%
                               as.matrix(),
                             keep.inbag = T,
                             ntree = 1000)
    
    return(qrf_mod)
  })

# save some results
save(fish_hab, 
     sel_hab_mets,
     qrf_mod_df,
     dens_offset,
     qrf_mods,
     hab_dict,
     file = 'output/modelFits/qrf_redds.rda')

#-----------------------------------------------------------------
# create a few figures
#-----------------------------------------------------------------
load('output/modelFits/qrf_redds.rda')

# relative importance of habtiat covariates
rel_imp_p = qrf_mods %>%
  map(.f = function(x) {
    as_tibble(x$importance,
              rownames = 'Metric') %>%
      mutate(relImp = IncNodePurity / max(IncNodePurity)) %>%
      left_join(hab_dict %>%
                  select(Metric = ShortName,
                         Name) %>%
                  distinct()) %>%
      mutate_at(vars(Metric, Name),
                list(~ fct_reorder(., relImp))) %>%
      arrange(Metric) %>%
      distinct() %>%
      ggplot(aes(x = Name,
                 y = relImp)) +
      geom_col(fill = 'gray40') +
      coord_flip() +
      labs(x = 'Metric',
           y = 'Relative Importance')
    
  })
# add species name to each plot
for(i in 1:length(rel_imp_p)) {
  rel_imp_p[[i]] = rel_imp_p[[i]] +
    labs(title = names(qrf_mods)[[i]])
}

ggpubr::ggarrange(plotlist = rel_imp_p,
                  nrow = 2,
                  ncol = 1)

# for Chinook
chnk_pdp = plot_partial_dependence(qrf_mods[['Chinook']],
                                   data = qrf_mod_df %>%
                                     filter(Species == 'Chinook'),
                                   data_dict = hab_dict,
                                   log_offset = dens_offset,
                                   scales = 'free') +
  labs(title = 'Chinook')

# for steelhead
sthd_pdp = plot_partial_dependence(qrf_mods[['Steelhead']],
                                   data = qrf_mod_df %>%
                                     filter(Species == 'Steelhead'),
                                   data_dict = hab_dict,
                                   log_offset = dens_offset,
                                   scales = 'free') +
  labs(title = 'Steelhead')


#-----------------------------------------------------------------
# predict capacity at all CHaMP sites
#-----------------------------------------------------------------
load('output/modelFits/qrf_redds.rda')

# what quantile is a proxy for capacity?
pred_quant = 0.9

covars = unique(sel_hab_mets$Metric)
hab_impute = hab_avg %>%
  mutate_at(vars(Watershed, Channel_Type),
            list(fct_drop)) %>%
  impute_missing_data(data = .,
                      covars = covars,
                      impute_vars = c('Watershed', 
                                      'Elev_M', 
                                      'Channel_Type', 
                                      'CUMDRAINAG'),
                      method = 'missForest') %>%
  select(Site, Watershed, LON_DD, LAT_DD, VisitYear, Lgth_Wet, Area_Wet, one_of(covars))

pred_hab_sites = hab_impute %>%
  mutate(chnk_per_m2 = predict(qrf_mods[['Chinook']],
                              newdata = select(., one_of(unique(sel_hab_mets$Metric))),
                              what = pred_quant),
         chnk_per_m2 = exp(chnk_per_m2) - dens_offset,
         chnk_per_m = chnk_per_m2 * Area_Wet / Lgth_Wet) %>%
  mutate(sthd_per_m2 = predict(qrf_mods[['Steelhead']],
                              newdata = select(., one_of(unique(sel_hab_mets$Metric))),
                              what = pred_quant),
         sthd_per_m2 = exp(sthd_per_m2) - dens_offset,
         sthd_per_m = sthd_per_m2 * Area_Wet / Lgth_Wet)

# filter out sites outside the Chinook domain
data("chnk_domain")
snap_dist = 1000

pred_hab_sites_chnk = pred_hab_sites %>%
  filter(!is.na(LON_DD)) %>%
  st_as_sf(coords = c('LON_DD', 'LAT_DD'),
           crs = 4326) %>%
  st_transform(crs = st_crs(chnk_domain)) %>%
  as_Spatial() %>%
  maptools::snapPointsToLines(chnk_domain %>%
                                mutate(id = 1:n()) %>%
                                select(id, MPG) %>%
                                as_Spatial(),
                              maxDist = snap_dist,
                              withAttrs = T,
                              idField = 'id') %>%
  as('sf') %>%
  as_tibble()

# note if sites are in Chinook domain or not
pred_hab_sites %<>% 
  mutate(chnk_domain = if_else(Site %in% pred_hab_sites_chnk$Site, T, F)) %>%
  mutate_at(vars(Watershed),
            list(fct_drop))

# put into dataframe for extrapolation, keeping only Chinook sites in Chinook domain
pred_hab_df = pred_hab_sites %>%
  select(-starts_with('chnk')) %>%
  rename(cap_per_m = sthd_per_m,
         cap_per_m2 = sthd_per_m2) %>%
  mutate(Species = 'Steelhead') %>%
  bind_rows(pred_hab_sites %>%
              select(-starts_with('sthd')) %>%
              filter(chnk_domain) %>%
              select(-chnk_domain) %>%
              rename(cap_per_m = chnk_per_m,
                     cap_per_m2 = chnk_per_m2) %>%
              mutate(Species = 'Chinook')) %>%
  select(Species, everything())
  

#----------------------------------------
# pull in survey design related data
#----------------------------------------
# Calculate GRTS design weights.

# pull in info about what strata each CHaMP site was assigned to (using 2014 as reference year)
site_strata = pred_hab_df %>%
  select(Species, Site, Watershed) %>%
  distinct() %>%
  left_join(gaa %>%
              select(Site,
                     strata = AStrat2014)) %>%
  mutate(site_num = str_split(Site, '-', simplify = T)[,2]) %>%
  mutate(strata = if_else(Watershed == 'Asotin',
                          site_num,
                          if_else(Watershed == 'Entiat' & grepl('ENT00001', Site),
                                  paste('EntiatIMW', site_num, sep = '_'),
                                  strata))) %>%
  mutate(strata = if_else(grepl('EntiatIMW', strata),
                          str_remove(strata, '[[:digit:]]$'),
                          strata),
         strata = if_else(grepl('EntiatIMW', strata),
                          str_remove(strata, '[[:digit:]]$'),
                          strata)) %>%
  filter(!is.na(strata)) %>%
  mutate(strata = paste(Watershed, strata, sep = '_')) %>%
  select(-site_num)

# read in data from the CHaMP frame
champ_frame_df = read_csv('data/prepped/champ_frame_data.csv') %>%
  mutate(Target2014 = ifelse(is.na(AStrat2014), 'Non-Target', Target2014)) %>%
  mutate(AStrat2014 = ifelse(AStrat2014 == 'Entiat IMW', paste('EntiatIMW', GeoRchIMW, sep = '_'), AStrat2014)) %>%
  filter(Target2014 == 'Target') %>%
  rename(Watershed = CHaMPshed)

# what strata do we have?
frame_strata = champ_frame_df %>%
  mutate(strata = paste(Watershed, AStrat2014, sep='_')) %>%
  select(Watershed, 
         strata) %>%
  distinct()

# how long is each strata, by species?
chnk_strata_length = champ_frame_df %>%
  filter(!is.na(UseTypCHSP)) %>%
  mutate(strata = paste(Watershed, AStrat2014, sep='_')) %>%
  select(Watershed, matches("Strat"), FrameLeng) %>%
  group_by(Watershed, strata) %>%
  summarise(tot_length_km = sum(FrameLeng) / 1000) %>%
  ungroup() %>%
  mutate_at(vars(Watershed, strata), 
            list(as.factor)) %>%
  arrange(Watershed, strata)

sthd_strata_length = champ_frame_df %>%
  filter(!is.na(UseTypSTSU)) %>%
  mutate(strata = paste(Watershed, AStrat2014, sep='_')) %>%
  select(Watershed, matches("Strat"), FrameLeng) %>%
  group_by(Watershed, strata) %>%
  summarise(tot_length_km = sum(FrameLeng) / 1000) %>%
  bind_rows(tibble(Watershed = 'Asotin',
                   strata = paste('Asotin', c('CC', 'NF', 'SF'), sep = '_'),
                   tot_length_km = 12)) %>%
  ungroup() %>%
  mutate_at(vars(Watershed, strata), 
            list(as.factor)) %>%
  arrange(Watershed, strata)

strata_length = chnk_strata_length %>%
  mutate(Species = 'Chinook') %>%
  bind_rows(sthd_strata_length %>%
              mutate(Species = 'Steelhead')) %>%
  select(Species, everything())

# how many sites in each strata? and what is the length of each strata?
strata_tab = pred_hab_df %>%
  select(Species, Site, Watershed, matches('per_m')) %>%
  left_join(site_strata) %>%
  filter(strata != 'Entiat_Entiat IMW') %>%
  mutate_at(vars(Watershed),
            list(fct_drop)) %>%
  group_by(Species, Watershed, strata) %>%
  summarise(n_sites = n_distinct(Site)) %>%
  ungroup() %>%
  full_join(strata_length) %>%
  mutate(n_sites = if_else(is.na(n_sites),
                           as.integer(0),
                           n_sites)) %>%
  # calculate the weight of each site in each strata
  mutate(site_weight = if_else(n_sites > 0,
                               tot_length_km / n_sites,
                               as.numeric(NA)))


# test to see if we've accounted for all strata and most of each watershed
strata_test = frame_strata %>%
  full_join(strata_tab) %>%
  mutate_at(vars(Watershed),
            list(fct_drop)) %>%
  mutate(n_sites = if_else(is.na(n_sites),
                           as.integer(0),
                           n_sites)) %>%
  select(Species, everything()) %>%
  arrange(Species, Watershed, strata)

# what frame strata don't have any sites in them?
# strata_test %>%
#   filter(n_sites == 0,
#          !is.na(tot_length_km)) %>%
#   arrange(Species, Watershed, strata) %>%
#   as.data.frame()

# # what strata that we have sites for are not in the frame strata?
# strata_test %>%
#   filter(n_sites > 0,
#          (is.na(tot_length_km) |
#             tot_length_km == 0)) %>%
#   as.data.frame()

# # how much of each watershed is not accounted for with current sites / strata?
# strata_test %>%
#   group_by(Watershed) %>%
#   summarise_at(vars(tot_length_km),
#                list(sum),
#                na.rm = T) %>%
#   left_join(strata_test %>%
#               filter(n_sites == 0) %>%
#               group_by(Watershed) %>%
#               summarise_at(vars(missing_length = tot_length_km),
#                            list(sum),
#                            na.rm = T)) %>%
#   mutate_at(vars(missing_length),
#             list(~ if_else(is.na(.), 0, .))) %>%
#   mutate(perc_missing = missing_length / tot_length_km) %>%
#   mutate_at(vars(perc_missing),
#             list(~ if_else(is.na(.), 0, .))) %>%
#   arrange(desc(perc_missing))

# Prep GAAs for extrapolation
data("gaa")

gaa_all = gaa %>%
  rename(Channel_Type = ChanlType)

# which GAAs to use
gaa_covars = c('TRange', 
               # 'GDD', 
               # 'Precip',
               'Elev_M', 
               'CHaMPsheds', 
               'NatPrin1', 
               # 'NatPrin2', 
               'DistPrin1', 
               # 'BFW_M', 
               'SrtCumDrn', 
               'StrmPwr',
               'Slp_NHD_v1', 
               'Channel_Type', 
               # 'MAVELV', 
               'WIDE_BF',
               'S2_02_11')

# what type of covariate is each GAA?
gaa_class = gaa_all %>%
  select(one_of(gaa_covars)) %>%
  as.list() %>%
  map_chr(.f = class)

# which ones are numeric?
gaa_num = names(gaa_class)[gaa_class %in% c('integer', 'numeric')]
# which ones are categorical?
gaa_catg = names(gaa_class)[gaa_class %in% c('factor', 'character')]

# compare range of covariates from model dataset and prediction dataset
range_comp = bind_rows(gaa_all %>%
                         filter(!Site %in% unique(pred_hab_sites$Site)) %>%
                         select(Site,
                                one_of(gaa_num)) %>%
                         gather(Metric, value, -Site) %>%
                         mutate(Source = 'non-CHaMP Sites'),
                       gaa_all %>%
                         filter(Site %in% unique(pred_hab_sites$Site)) %>%
                         select(Site,
                                one_of(gaa_num)) %>%
                         distinct() %>%
                         gather(Metric, value, -Site) %>%
                         mutate(Source = 'CHaMP Sites')) %>%
  mutate_at(vars(Source, Metric),
            list(as.factor))

range_max = range_comp %>%
  group_by(Metric, Source) %>%
  summarise_at(vars(value),
               tibble::lst(min, max),
               na.rm = T) %>%
  filter(Source == 'CHaMP Sites') %>%
  ungroup() %>%
  gather(type, value, -Metric, -Source)

# Center the covariates
# filter out sites with covariates outside range of covariates used to fit extrapolation model
out_range_sites = gaa_all %>%
  # filter out a few areas
  filter(!HUC6NmNRCS %in% c('Upper Sacramento', 'Southern Oregon Coastal', 'Puget Sound', 'Northern California Coastal', 'Oregon Closed Basins')) %>%
  # don't use AEM sites in model
  filter(!grepl('^AEM', Site)) %>%
  select(one_of(gaa_num), Site) %>%
  gather(Metric, value, -Site) %>%
  left_join(select(range_max, -Source) %>%
              spread(type, value)) %>%
  group_by(Metric) %>%
  filter(value > max |
           value < min) %>%
  ungroup() %>%
  pull(Site) %>%
  unique()

# center covariates
gaa_summ = inner_join(pred_hab_df %>%
                        select(Site) %>%
                        distinct(),
                      gaa_all %>%
                        select(Site, one_of(gaa_num))) %>%
  gather(GAA, value, -Site) %>%
  group_by(GAA) %>%
  summarise(metric_mean = mean(value, na.rm=T),
            metric_sd = sd(value, na.rm=T)) %>%
  ungroup()

# extrapolation model data set, with normalized covariates
mod_data = inner_join(pred_hab_df %>%
                        select(Species:avg_aug_temp,
                               matches('per_m')),
                      gaa_all %>%
                        select(Site, one_of(gaa_num))) %>%
  gather(GAA, value, one_of(gaa_num)) %>%
  left_join(gaa_summ) %>%
  mutate(norm_value = (value - metric_mean) / metric_sd) %>%
  select(-(value:metric_sd)) %>%
  spread(GAA, norm_value) %>%
  left_join(gaa_all %>%
              select(Site, one_of(gaa_catg)))

mod_data %<>%
  bind_cols(mod_data %>%
              is.na() %>%
              as_tibble() %>%
              select(one_of(gaa_covars)) %>%
              transmute(n_NA = rowSums(.))) %>%
  filter(n_NA == 0) %>%
  mutate_at(vars(Watershed, CHaMPsheds, Channel_Type),
            list(fct_drop)) 

# calculate adjusted weights for all predicted QRF capacity sites
mod_data_weights = mod_data %>%
  left_join(site_strata) %>%
  left_join(strata_tab) %>%
  filter(!is.na(site_weight)) %>%
  group_by(Species, Watershed) %>%
  mutate(sum_weights = sum(site_weight)) %>%
  ungroup() %>%
  mutate(adj_weight = site_weight / sum_weights)

# where do we want to make extrapolation predictions?
gaa_pred = gaa_all %>%
  # filter out a few areas
  filter(!HUC6NmNRCS %in% c('Upper Sacramento', 'Southern Oregon Coastal', 'Puget Sound', 'Northern California Coastal', 'Oregon Closed Basins')) %>%
  # don't use AEM sites in model
  filter(!grepl('^AEM', Site)) %>%
  # don't use non-GRTS sites
  filter(SiteID_alt != 'NonGRTSSite' | is.na(SiteID_alt)) %>%
  filter(!grepl('mega', Site, ignore.case = T)) %>%
  # note which sites have GAAs outside range of CHaMP sites GAAs
  mutate(inCovarRange = ifelse(Site %in% out_range_sites, F, T)) %>%
  select(Site, one_of(gaa_covars), Lon, Lat, inCovarRange, HUC6NmNRCS, HUC8NmNRCS, HUC10NmNRC, HUC12NmNRC, chnk, steel) %>%
  gather(GAA, value, one_of(gaa_num)) %>%
  left_join(gaa_summ) %>%
  mutate(norm_value = (value - metric_mean) / metric_sd) %>%
  select(-(value:metric_sd)) %>%
  spread(GAA, norm_value)

#-------------------------------------------------------------
# Set up the survey design.

# getOption('survey.lonely.psu')
# this will prevent strata with only 1 site from contributing to the variance
# options(survey.lonely.psu = 'certainty')
# this centers strata with only 1 site to the sample grand mean; this is conservative
options(survey.lonely.psu = 'adjust')

# extrapolation model formula
full_form = as.formula(paste('log(qrf_cap) ~ -1 + (', paste(gaa_covars, collapse = ' + '), ')'))

# fit various models
model_svy_df = mod_data_weights %>%
  gather(response, qrf_cap, matches('per_m')) %>%
  select(-(n_sites:sum_weights)) %>%
  group_by(Species, response) %>%
  nest() %>%
  mutate(design = map(data,
                      .f = function(x) {
                        svydesign(id = ~ 1,
                                  data = x,
                                  strata = ~ Watershed,
                                  # strata = ~ strata,
                                  weights = ~ adj_weight)
                      })) %>%
  mutate(mod_full = map(design,
                        .f = function(x) {
                          svyglm(full_form,
                                 design = x)
                        }),
         mod_no_champ = map(design,
                            .f = function(x) {
                              svyglm(update(full_form, .~ . -CHaMPsheds),
                                     design = x)
                            })) %>%
  arrange(Species, response) %>%
  ungroup()


# predictions within CHaMP watersheds, using the model that includes CHaMPsheds as a covariate
y = gaa_pred %>%
  filter(CHaMPsheds %in% unique(model_svy_df$data[[1]]$Watershed)) %>%
  select(Site, CHaMPsheds, one_of(gaa_covars)) %>%
  na.omit() %>%
  left_join(gaa_pred) %>%
  mutate_at(vars(Channel_Type),
            list(as.factor))

per_m_preds = model_svy_df %>%
  filter(Species == 'Chinook',
         response == 'cap_per_m') %>%
  pull(mod_full) %>%
  extract2(1) %>%
  predict(.,
          newdata = y,
          se = T,
          type = 'response') %>%
  as_tibble() %>%
  rename(chnk_per_m = response,
         chnk_per_m_se = SE) %>%
  bind_cols(model_svy_df %>%
              filter(Species == 'Steelhead',
                     response == 'cap_per_m') %>%
              pull(mod_full) %>%
              extract2(1) %>%
              predict(.,
                      newdata = y,
                      se = T,
                      type = 'response') %>%
              as_tibble() %>%
              rename(sthd_per_m = response,
                     sthd_per_m_se = SE))


per_m2_preds = model_svy_df %>%
  filter(Species == 'Chinook',
         response == 'cap_per_m2') %>%
  pull(mod_full) %>%
  extract2(1) %>%
  predict(.,
          newdata = y,
          se = T,
          type = 'response') %>%
  as_tibble() %>%
  rename(chnk_per_m2 = response,
         chnk_per_m2_se = SE) %>%
  bind_cols(model_svy_df %>%
              filter(Species == 'Steelhead',
                     response == 'cap_per_m2') %>%
              pull(mod_full) %>%
              extract2(1) %>%
              predict(.,
                      newdata = y,
                      se = T,
                      type = 'response') %>%
              as_tibble() %>%
              rename(sthd_per_m2 = response,
                     sthd_per_m2_se = SE))

y %<>%
  bind_cols(per_m_preds) %>%
  bind_cols(per_m2_preds) %>%
  mutate_at(vars(matches('per_m')),
            list(exp))

rm(per_m_preds,
   per_m2_preds)

# predictions at all points, using the model without CHaMPsheds as a covariate
z = gaa_pred %>%
  select(Site, one_of(gaa_covars), -CHaMPsheds) %>%
  na.omit() %>%
  left_join(gaa_pred) %>%
  mutate_at(vars(Channel_Type),
            list(as.factor))

per_m_preds = model_svy_df %>%
  filter(Species == 'Chinook',
         response == 'cap_per_m') %>%
  pull(mod_no_champ) %>%
  extract2(1) %>%
  predict(.,
          newdata = z,
          se = T,
          type = 'response') %>%
  as_tibble() %>%
  rename(chnk_per_m = response,
         chnk_per_m_se = SE) %>%
  bind_cols(model_svy_df %>%
              filter(Species == 'Steelhead',
                     response == 'cap_per_m') %>%
              pull(mod_no_champ) %>%
              extract2(1) %>%
              predict(.,
                      newdata = z,
                      se = T,
                      type = 'response') %>%
              as_tibble() %>%
              rename(sthd_per_m = response,
                     sthd_per_m_se = SE))


per_m2_preds = model_svy_df %>%
  filter(Species == 'Chinook',
         response == 'cap_per_m2') %>%
  pull(mod_no_champ) %>%
  extract2(1) %>%
  predict(.,
          newdata = z,
          se = T,
          type = 'response') %>%
  as_tibble() %>%
  rename(chnk_per_m2 = response,
         chnk_per_m2_se = SE) %>%
  bind_cols(model_svy_df %>%
              filter(Species == 'Steelhead',
                     response == 'cap_per_m2') %>%
              pull(mod_no_champ) %>%
              extract2(1) %>%
              predict(.,
                      newdata = z,
                      se = T,
                      type = 'response') %>%
              as_tibble() %>%
              rename(sthd_per_m2 = response,
                     sthd_per_m2_se = SE))

z %<>%
  bind_cols(per_m_preds) %>%
  bind_cols(per_m2_preds) %>%
  mutate_at(vars(matches('per_m')),
            list(exp))


rm(per_m_preds, per_m2_preds)

# put all predictions together
all_preds = y %>%
  mutate(model = 'CHaMP') %>%
  bind_rows(z %>%
              mutate(model = 'non-CHaMP'))

# quick comparison of capacity predicitons with both models
comp_pred_p = all_preds %>%
  filter(Site %in% Site[duplicated(Site)]) %>%
  # select(Site, CHaMPsheds, Channel_Type, model, chnk_per_m, chnk_per_m2) %>%
  select(Site, CHaMPsheds, Channel_Type, model, chnk_per_m, chnk_per_m2, sthd_per_m, sthd_per_m2) %>%
  arrange(CHaMPsheds, Site, model) %>%
  gather(dens_type, cap, matches('_per_m')) %>%
  spread(model, cap) %>%
  ggplot(aes(x = CHaMP,
             y = `non-CHaMP`)) +
  geom_point() +
  geom_abline(linetype = 2,
              color = 'red') +
  facet_wrap(~ CHaMPsheds + dens_type,
             scales = 'free')

comp_pred_p

# for sites in CHaMP watersheds, use predicions from CHaMP extrapolation model
all_preds %<>%
  filter((Site %in% Site[duplicated(Site)] & model == 'CHaMP') |
           !Site %in% Site[duplicated(Site)])

# for CHaMP sites, use direct QRF esimates, not extrapolation ones
qrf_cap = pred_hab_sites %>%
  select(-starts_with('chnk')) %>%
  rename(cap_per_m = sthd_per_m,
         cap_per_m2 = sthd_per_m2) %>%
  mutate(Species = 'Steelhead') %>%
  bind_rows(pred_hab_sites %>%
              select(-starts_with('sthd')) %>%
              select(-chnk_domain) %>%
              rename(cap_per_m = chnk_per_m,
                     cap_per_m2 = chnk_per_m2) %>%
              mutate(Species = 'Chinook')) %>%
  select(Species, everything()) %>%
  select(Site, Species, cap_per_m, cap_per_m2)

all_preds %<>%
  left_join(qrf_cap %>%
              select(Site, 
                     Species,
                     cap_per_m) %>%
              spread(Species, cap_per_m) %>%
              rename(qrf_chnk_per_m = Chinook,
                     qrf_sthd_per_m = Steelhead) %>%
              full_join(qrf_cap %>%
                          select(Site, 
                                 Species,
                                 cap_per_m2) %>%
                          spread(Species, cap_per_m2) %>%
                          rename(qrf_chnk_per_m2 = Chinook,
                                 qrf_sthd_per_m2 = Steelhead))) %>%
  mutate(chnk_per_m = if_else(!is.na(qrf_chnk_per_m),
                              qrf_chnk_per_m,
                              chnk_per_m),
         chnk_per_m_se = if_else(!is.na(qrf_chnk_per_m),
                                 as.numeric(NA),
                                 chnk_per_m_se),
         chnk_per_m2 = if_else(!is.na(qrf_chnk_per_m2),
                               qrf_chnk_per_m2,
                               chnk_per_m2),
         chnk_per_m2_se = if_else(!is.na(qrf_chnk_per_m2),
                                  as.numeric(NA),
                                  chnk_per_m2_se)) %>%
  mutate(sthd_per_m = if_else(!is.na(qrf_sthd_per_m),
                              qrf_sthd_per_m,
                              sthd_per_m),
         sthd_per_m_se = if_else(!is.na(qrf_sthd_per_m),
                                 as.numeric(NA),
                                 sthd_per_m_se),
         sthd_per_m2 = if_else(!is.na(qrf_sthd_per_m2),
                               qrf_sthd_per_m2,
                               sthd_per_m2),
         sthd_per_m2_se = if_else(!is.na(qrf_sthd_per_m2),
                                  as.numeric(NA),
                                  sthd_per_m2_se)) %>%
  select(-starts_with('qrf'))


save(gaa_covars,
     mod_data_weights,
     model_svy_df,
     all_preds,
     file = 'output/modelFits/extrap_redds.rda')

#---------------------------
# create a shapefile
load('output/modelFits/extrap_redds.rda')
data("chnk_domain")

all_preds_sf = all_preds %>%
  select(Site, Lon:model) %>%
  st_as_sf(coords = c('Lon', 'Lat'),
           crs = 4326) %>%
  st_transform(st_crs(chnk_domain))

# save it
# as shapefile
st_write(all_preds_sf,
         dsn = 'output/shapefiles/Redds_Capacity.shp',
         driver = 'ESRI Shapefile',
         delete_layer = T)

# as GPKG
st_write(all_preds_sf,
         dsn = 'output/gpkg/Redds_Capacity.gpkg',
         driver = 'GPKG',
         delete_layer = T)
KevinSee/QRFcapacity documentation built on Feb. 27, 2023, 3:57 p.m.