R/calc-iphc-ser-area.R

Defines functions iphc_format_for_EAFM iphc_get_calc_plot_area add_in_area calc_iphc_ser_EF calc_iphc_ser_E_and_F

Documented in add_in_area calc_iphc_ser_E_and_F calc_iphc_ser_EF iphc_format_for_EAFM iphc_get_calc_plot_area

#' Calculations restricted to a user-defined area.
#'
#' Calculate Series E and F from the IPHC data (restricted to a given area)
#'
#' Calculate both Series for as many years as possible, including bootstrapped
#'  values, for the data restricted to a user-defined area and (default is)
#'  standard stations only.
#' @details The two series are:
#'
#'  Series E: first 20 hooks from each skate (like Series A but restricted to a
#'   given area)
#'
#'  Series F: all hooks from each skate (like Series B but restricted to a given area)
#'
#' @param set_counts species-specific set-level counts from [tidy_iphc_survey()]
#'  or other, but with extra column `in_area` indicating if each station is in
#'   the area being considered or not. See vignette for example.
#' @param only_standard only use the standard stations (default is TRUE).
#' @return list of class `IPHC_ser_E_and_F` containing two tibbles, one for each series (ser_E and ser_F).
#'   Each tibble has one row for each set in each year, restricted to only the
#'   sets within the defined area, with columns
#'   year, station, lat, lon, E_it (effective skate number), N_it (number of
#'   fish caught of the given species), C_it (catch rate of given species, as
#'   numbers per effective skate), E_it20, N_it20 and C_it20 are the same but
#'   considering the first 20 hooks of each skate only, usable (whether the set
#'   should be used, based on IPHC codes; note that some should not be used for
#'   geospatial analysis but are included here).
#'
#' @examples
#' \dontrun{
#' TODO:
#' yelloweye <- tidy_iphc_survey(
#'  get_iphc_hooks("yelloweye rockfish"),
#'  get_iphc_skates_info(),
#'  get_iphc_sets_info()
#' )
#' calc_iphc_ser_all(yelloweye)
#' }
#' @export
calc_iphc_ser_E_and_F <- function(set_counts,
                                  only_standard = TRUE) {
  stopifnot("in_area" %in% names(set_counts))

  set_counts_usable <- filter(set_counts,
                              usable == "Y",
                              in_area == TRUE,
                              standard == ifelse(only_standard,
                                                 "Y",
                                                 "N"))
  # Series E
  ser_E_counts <- filter(
    set_counts_usable,
    !is.na(E_it20)
  )
  ser_E_boot <- boot_iphc(select(
    ser_E_counts,
    year,
    C_it20
  ))

  ser_E <- summarise(group_by(ser_E_counts, year),
    Sets = n(),
    num_pos20 = sum(C_it20 > 0),
    I_t20SampleMean = mean(C_it20)
  ) %>%
    filter(!is.na(I_t20SampleMean)) %>% # NA's got carried through, thinking may get error if this ends up empty
    left_join(ser_E_boot, by = "year")
  names(ser_E)[ names(ser_E) == "I_tBootMean"] <- "I_t20BootMean"
  names(ser_E)[ names(ser_E) == "I_tBootLow"] <- "I_t20BootLow"
  names(ser_E)[ names(ser_E) == "I_tBootHigh"] <- "I_t20BootHigh"
  names(ser_E)[ names(ser_E) == "I_tBootCV"] <- "I_t20BootCV"


  # Series F
  ser_F_counts <- filter(
    set_counts_usable,
    !is.na(E_it)
  )

  ser_F_boot <- boot_iphc(select(
    ser_F_counts,
    year,
    C_it
  ))

  ser_F <- summarise(group_by(ser_F_counts, year),
    Sets = n(),
    num_pos = sum(C_it > 0),
    I_tSampleMean = mean(C_it)
  ) %>%
    filter(!is.na(I_tSampleMean)) %>% # NA's got carried through
    left_join(ser_F_boot, by = "year")

  structure(
    list(ser_E = ser_E,
         ser_F = ser_F),
    class = c("IPHC_ser_E_and_F", "list"))
}


##' Calculate Series EF and test if the scaled E and F are significantly different
##'
##' Calculate Series EF (Series E with 1995 and 1996 appropriately scaled from
##'  Series F)
##' @param series_all List of tibbles, one for each of Series E and F,
##'   resulting from `calc_iphc_ser_E_and_F()`
##' @param return_F_if_same_length logical to return Series F if Series E and F
##'  have the same number of years. User should check which is more suitable and
##'  then change this value if necessary (F uses all hooks but E includes 1995
##'  and 1996)
##'
##' @return List containing
##'
##'   ser_longest: the longest time series possible from
##'   Series E and F,
##'
##'   test_EF: the results from the paired t-test, NULL if Series F is longest.
##'
##'   G_E, G_F: geometric means of nonzero values in Series E and Series F
##'   (based on bootstrapped means).
##'
##'   type: which type the longest series is, either `EF`, `E`, or `F`, based on
##'   the descriptions below.
##'
##'   Longest series is either
##'
##'   (i) Series EF (Series E with 1995 and 1996 appropriately scaled from
##'   Series F) if `test_EF$p.value >= 0.05`, because the p-value means that we
##'   cannot reject the null hypothesis that the true difference
##'   in means of the rescaled (by their geometric means) Series E and F equals 0,
##'
##'   (ii) Series E, which is the longest series available if the rescaled
##'  series are significantly different.
##'
##'   (iii) Series F, if the years for which only the first 20 hooks are enumerated
##'    never caught the species (but other years do). Then Series EF becomes Series
##'    E plus some zeros, so we may as well use Series F that uses all the hooks
##'    and so is more likely to catch the species. This is a rare situation (and
##'    likely arises because not all species were specifically identified in
##'    earlier years (or 2013?), but seems to occur for China Rockfish for
##'   Series AB analysis. Return NULL
##'    for t_EF (to then use later to identify that Series F is the longest
##'    series).
##'
##'  (iv) Series F (since all hooks) if E and F contain the same number of
##'   years, but user can change `return_F_if_same_length` as Series E may make
##'   more sense (since goes back to 1995). If E and F contain the same exact
##'   years then F is always returned.
##'
##' See vignette for example.
##' @export
##' @author Andrew Edwards
calc_iphc_ser_EF <- function(series_all,
                             return_F_if_same_length = TRUE) {
  years_EF <- intersect(series_all$ser_E$year,
                        series_all$ser_F$year)
  # TODO: NOT CHECKED THIS:
  # Series B for English Sole catches none, and only returns 1995 and 1996.
  #  Series A caught one English Sole in 2001. Therefore use A as the longest.
  #  No overlapping years since never caught one when all hooks were evaluated
  #  and we have hook-by-hook data. Could have 1995 and 1996 catching one, so
  #  then may want Series B; but this is only when we catch the odd fish.
  if (length(years_EF) == 0) {
    if (nrow(series_all$ser_E) > nrow(series_all$ser_F)) {
      return(list(
        ser_longest = series_all$ser_E,
        test_EF = list(
          t_EF = NULL,
          G_E = NA,
          G_F = NA,
          type = "E"
        )
      ))
    } else {
      return(list(
        ser_longest = series_all$ser_F,
        test_EF = list(
          t_EF = NULL,
          G_E = NA,
          G_F = NA,
          type = "F"
        )
      ))
    }
  }
  # If Series E in overlapping years ends up with no catch (Bluntnose Sixgill
  #  Shark for Series A), then return Series F.
  if (nrow(filter(series_all$ser_E, year %in% years_EF, I_t20BootMean > 0)) == 0) {
    return(list(
      ser_longest = series_all$ser_F,
      test_EF = list(
        t_EF = NULL,
        G_E = NA,
        G_F = NA,
        type = "F"
      )
    ))
  }

  # Geometric means of each series for the overlapping years, excluding zeros
  G_E <- exp(mean(log(dplyr::filter(series_all$ser_E,
                                    year %in% years_EF,
                                    I_t20BootMean > 0
                                    )$I_t20BootMean)))

  G_F <- exp(mean(log(dplyr::filter(series_all$ser_F,
                                    year %in% years_EF,
                                    I_tBootMean > 0
                                    )$I_tBootMean)))

  # If Series E has same number of years as Series F then return F if years are
  # the same, or return E or F depending on `return_F_if_same_length`
  if (length(unique(series_all$ser_E$year)) ==
      length(unique(series_all$ser_F$year))) {

    if (all(unique(series_all$ser_E$year) == unique(series_all$ser_F$year))) {
      return(list(
        ser_longest = series_all$ser_F,
        test_EF = list(
          t_EF = NULL,
          G_E = G_E,
          G_F = G_F,
          type = "F"
        )))
    } else {
      if(return_F_if_same_length){
        return(list(
          ser_longest = series_all$ser_F,
          test_EF = list(
            t_EF = NULL,
            G_E = G_E,
            G_F = G_F,
            type = "F"
          )))
      } else {
        return(list(
          ser_longest = series_all$ser_E,
          test_EF = list(
            t_EF = NULL,
            G_E = G_E,
            G_F = G_F,
            type = "E"
          )))
      }
    }
  }


  # Scale by G_E, geometric mean of bootstrapped means.
  ser_E_scaled <- filter(
    series_all$ser_E,
    year %in% years_EF
  ) %>%
    mutate(
      I_t20SampleMean = I_t20SampleMean / G_E,
      I_t20BootMean = I_t20BootMean / G_E,
      I_t20BootLow = I_t20BootLow / G_E,
      I_t20BootHigh = I_t20BootHigh / G_E
    )
  # exp(mean(log(ser_A_scaled$I_t20BootMean)))  # =1

  ser_F_scaled <- filter(
    series_all$ser_F,
    year %in% years_EF
  ) %>%
    mutate(
      I_tSampleMean = I_tSampleMean / G_F,
      I_tBootMean = I_tBootMean / G_F,
      I_tBootLow = I_tBootLow / G_F,
      I_tBootHigh = I_tBootHigh / G_F
    )
  # exp(mean(log(ser_B_scaled$I_tBootMean)))  # =1

  t_EF <- stats::t.test(ser_E_scaled$I_t20BootMean,
                        ser_F_scaled$I_tBootMean,
                        paired = TRUE)

  if (t_EF$p.value >= 0.05) {
    # Can't reject null hypothesis that true difference
    #  in means equals 0

    # Multiply the ser_F years not in years_EF by G_E/G_F,
    #  naming columns with 20 since rescaling (and to combine with
    #  series_all$ser_E). Note that num_pos20 is not scaled (as
    #  we're implicitly scaling all the catch rates, but the numbers
    #  of sets won't change).
    ser_EF <- dplyr::filter(series_all$ser_F,
                            !year %in% years_EF) %>%   # years in F but not E
      mutate(
        num_pos20 = num_pos,
        I_t20SampleMean = I_tSampleMean * G_E / G_F,
        I_t20BootMean = I_tBootMean * G_E / G_F,
        I_t20BootLow = I_tBootLow * G_E / G_F,
        I_t20BootHigh = I_tBootHigh * G_E / G_F,
        I_t20BootCV = I_tBootCV
      ) %>%
      select(-c(
        "num_pos",
        "I_tSampleMean",
        "I_tBootMean",
        "I_tBootLow",
        "I_tBootHigh",
        "I_tBootCV"
      )) %>%
      rbind(series_all$ser_E)
    return(
      structure(
        list(
          ser_longest = ser_EF,
          test_EF = list(
            t_EF = t_EF,
            G_E = G_E,
            G_F = G_F,
            type = "EF")),
        class = "IPHC_ser_EF"))  # TODO - add to above ones once happy with,
                                 # though plot function didn't seem to work so
                                 # not really needed.
  } else {
    return(
      structure(
        list(
          ser_longest = series_all$ser_E,
          test_EF = list(
            t_EF = t_EF,
            G_E = G_E,
            G_F = G_F,
            type = "E")),
        class = "IPHC_ser_EF"))
  }
}


##' Take set count data for a species and add column indicating whether or not
##' it is inside a given area
##'
##' See example and the vignette.
##'
##' @param set_counts_of_sp input tibble of the species of interest, likely generated from
##'   `cache_pbs_data_iphc` (which seems to save a list with the first element
##'   the desired tibble), with column names `year`, `station`, `lat`, `lon`,
##'   `E_it`, `N_it`, `C_it`, `E_it20`, `N_it20`, `C_it20`, `usable`, and
##'   `in_area` if `indicate_in_area` is TRUE
##' @param area `data.frame` of (PBSmapping) class `PolySet` with column names
##'   `PID`, `SID`, `POS`, `X`, `Y`,
##' @return tibble of `set_counts_of_sp` with extra logical column `in_area` to
##'   say whether or not each set is within `area`. Internally a column `EID` is
##'   created to assign each set-year a unique ID, needed for
##'   `PBSmapping::findPolys`, but this is not returned.
##' @export
##' @author Andrew Edwards
##' @examples
##' \dontrun{
##' sp_set_counts_with_area <- add_in_area(yelloweye_rockfish$set_counts,
##'                                        HG_herring_pred_area)
##' # Gives a warning about hole vertices, can ignore.
##' }
add_in_area <- function(set_counts_of_sp,
                        area = HG_herring_pred_area){

  sp_set_counts_with_area <- set_counts_of_sp %>%
    dplyr::mutate(in_area = FALSE) %>%       # then adjusted below
    dplyr::bind_cols("EID" = 1:nrow(set_counts_of_sp))

  events <- sp_set_counts_with_area %>%
    dplyr::rename(X = lon, Y = lat) %>%
    dplyr::select(year, EID, Y, X)

  locData <- PBSmapping::findPolys(as.data.frame(events),
                                   area)

  EIDs_in_poly <- unique(locData$EID)

  # Give each set an `in_area` logical attribute TRUE/FALSE
  sp_set_counts_with_area$in_area <- sp_set_counts_with_area$EID %in% EIDs_in_poly

  sp_set_counts_with_area <- select(sp_set_counts_with_area,
                                    -c("EID"))

  return(sp_set_counts_with_area)
}



##' Get data, do calculations and plot longest series for the IPHC survey
##'  restricted to a specified area
##'
##' Get data, do calculations and plot longest series for the IPHC survey for
##'  a given species restricted to a given area. Will take a while if queries
##'  GFbio (for which need to be on DFO network), else can use cached data
##'  (species-name.rds is always the full data for the species, before any
##'  calculations).
##'  If data are for combined species, as cached with `get_combined_species()`,
##'  then the column `N_it_sum` etc. column names are used for `N_it` etc.
##'
##' @param sp Species name (as used in gfdata and gfplot). Or something like
##'   "skates combined" -- see vignette.
##' @param area `data.frame` of (PBSmapping) class `PolySet` with column names
##'   `PID`, `SID`, `POS`, `X`, `Y`,
##' @param cached_data if TRUE then use cached data (path_data/sp-name.rds)
##' @param cached_results if TRUE then use cached results
##'   (path_results/sp-name-results.rds), else do calculations here and save results
##' @param verbose if TRUE then print out some of the data (useful in vignette loops)
##' @param print_sp_name if TRUE then print out species name (useful in vignette
##'   loops)
##' @param path_data path to save or load the cached data
##' @param path_results path to save or load the cached data
##' @return For the given species, list containing
##'
##'   sp_set_counts_with_area: tibble returned from [add_in_area()] of set
##'   counts of the species with with extra logical column `in_area` to
##'   say whether or not each set is within `area`.
##'
##'   ser_E_and_F: list of class `IPHC_ser_E_and_F` containing two tibbles, one
##'   for each series (ser_E and ser_F), as output from [calc_iphc_ser_E_and_F()].
##'
##'   series_longest: list containing `ser_longest`, the longest possible time
##'   series from Series E and F, and other objects, as output form [calc_iphc_ser_EF()].
##' @export
##' @author Andrew Edwards
##' @examples
##' \dontrun{
##' iphc_get_calc_plot_area("yelloweye rockfish")
##' ##' # Gives a warning about hole vertices, can ignore.
##' }
iphc_get_calc_plot_area <- function(sp,
                                    area = HG_herring_pred_area,
                                    cached_data = TRUE,
                                    cached_results = FALSE,
                                    verbose = FALSE,
                                    print_sp_name = TRUE,
                                    path_data = ".",
                                    path_results = NULL){
  if(!cached_data){
    cache_pbs_data_iphc(sp,
                        path = path_data)
  }

  if(print_sp_name){
    print(paste("*****", sp, "*****"))
  }

  sp_set_counts <- readRDS(paste0(path_data,
                                  "/",
                                  sp_hyphenate(sp)))

  results_RDS_name <- paste0(file.path(path_results,
                                       sp_hyphenate(sp,
                                                    area = TRUE,
                                                    results = TRUE)))

  if(cached_results){
    cached <- readRDS(results_RDS_name)
    sp_set_counts_with_area <- cached[["sp_set_counts_with_area"]]
    ser_E_and_F <- cached[["ser_E_and_F"]]
    series_longest <- cached[["series_longest"]]
  } else {
    sp_set_counts_with_area <- add_in_area(sp_set_counts$set_counts,
                                           area = area)
    if(!("N_it" %in% names(sp_set_counts_with_area)) &
       "N_it_sum" %in% names(sp_set_counts_with_area)){
      sp_set_counts_with_area <- sp_set_counts_with_area %>%
        dplyr::rename(N_it = N_it_sum,
                      N_it20 = N_it20_sum,
                      C_it = C_it_sum,
                      C_it20 = C_it20_sum)
    }
    ser_E_and_F <- calc_iphc_ser_E_and_F(sp_set_counts_with_area)
    series_longest <- calc_iphc_ser_EF(ser_E_and_F)
  }

  # Not creating gfsynopsis formatted output since shouldn't need that for a
  #  restricted area -- see iphc_get_calc_plot() to add something here.

  if(verbose){
    # Print the first and last values:
    print(sp_set_counts)
    print(tail(sp_set_counts$set_counts))
    print(ser_E_and_F)
    print(series_longest)
    print(paste("*****", sp, "*****"))
  }

  plot_IPHC_ser_four_panels(ser_E_and_F,
                            series_longest,
                            sp = sp)
  res <- list(sp_set_counts_with_area = sp_set_counts_with_area,
              ser_E_and_F = ser_E_and_F,
              series_longest = series_longest)

  if(!cached_results){
    if(!is.null(path_results)){
      dir.create(path_results,
                 showWarnings = FALSE)
      saveRDS(res,
              file = results_RDS_name,
              compress = FALSE)
    }
  }
  return(res)
}

##' Format results from multiple species for Jennifer's EAFM HG Herring Case
##' Study and save them.
##'
##' Create and save a .csv file in the format requested by Jennifer Bolt:
##'   What would work best for me is a csv file with 3 columns: `Year`,
##'   `Indicator`, `Value`
##' Non-standardized values are best (i.e. not normalised).
##' The `Indicator` column could include:
##' 1. ArrowtoothFlounder_IPHC_SurveyCatchRate
##' 2. ArrowtoothFlounder_IPHC_UpperCI
##' 3. ArrowtoothFlounder_IPHC_LowerCI
##' 4. Rockfish_IPHC_SurveyCatchRate
##' 5. ...
##'
##' Also saves `sp_vec_list` as an .rds file.
##'
##' @param sp_vec A vector of species names. Each one *must* have a corresponding
##'   list `species_name_area` in the workspace which is an output from
##'   `iphc_get_calc_plot_area()` -- see vignette.
##' @param sp_vec_list list of output, with each element corresponding to each
##'   species (and named for that species), and being a tibble of the longest
##'   time series. See vignette, since needs to be created outside of a function.
##' @param path The folder where the cached results will be saved.
##' @param filename filename (including .csv) to save .csv file; also saves .rds
##'   of sp_vec_list.
##' @return saves a .csv file and returns the resulting tibble. Columns are:
##'  - Year
##'  - Indicator (e.g. ArrowtoothFlounder_IPHC_SurveyCatchRate,
##'   ArrowtoothFlounder_IPHC_UpperCI)
##'  - Value
##' @export
##' @author Andrew Edwards
##' @examples
##' \dontrun{
##' # See vignette.
##' }
iphc_format_for_EAFM <- function(sp_vec,
                                 sp_vec_list,
                                 path = ".",
                                 filename = "herring-HG-predators-IPHC.csv"){

  dir.create(path,
             showWarnings = FALSE)

  saveRDS(sp_vec_list,
          paste0(file.path(path,
                           gsub(".csv", ".rds", filename))))

  eafm_res <- tibble()        # Long format for EAFM Case Study

  for(sp in sp_vec){
    sp_name_one_word <- gsub(" ", "", simple_cap(sp))
    new_names <- c(paste0(sp_name_one_word,
                          "_IPHC_SurveyCatchRate"),
                   paste0(sp_name_one_word,
                          "_IPHC_LowerCI"),
                   paste0(sp_name_one_word,
                          "_IPHC_UpperCI"))

    this_sp_res <- sp_vec_list[[sp]] %>%
      dplyr::select(Year = year,
                    I_t20BootMean,
                    I_t20BootLow,
                    I_t20BootHigh) %>%
      dplyr::rename_with(.cols = I_t20BootMean,
                         .fn = function(x){new_names[1]}) %>%
      dplyr::rename_with(.cols = I_t20BootLow,
                         .fn = function(x){new_names[2]}) %>%
      dplyr::rename_with(.cols = I_t20BootHigh,
                         .fn = function(x){new_names[3]}) %>%
      tidyr::pivot_longer(!Year,
                          names_to = "Indicator",
                          values_to = "Value")

    eafm_res <- rbind(eafm_res,
                      this_sp_res)
  }

  write.csv(eafm_res,
            paste0(file.path(path,
                             filename)),
            row.names = FALSE,
            quote = FALSE)

  return(eafm_res)
}
pbs-assess/gfiphc documentation built on July 4, 2023, 1:13 p.m.