In fishvice/tidyices: Making working with DATRAS data a little smoother

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE
)

# remotes::install_github("fishvice/tidydatras", ref = "dev")
library(tidydatras) # remotes::install_github("fishvice/tidydatras")
library(tidyverse)
library(DescTools)  # e.g. winsorize function

spatialdir <- "C:/DATA/RDATA"
load(file.path(spatialdir, "rect.RData")) 

datadir <- "C:/DATA/DATRAS/raw"

source("../../r/geo_inside.R")

# suppressMessages(tidydatras:::dr_download_all(save_dir="C:/TEMP", verbose=FALSE))

Get data and tidy

Note that the dr_getdata can work with multiple surveys (a loop is inside the function)

sur <- "FR-CGFS"

hh <- 
  readr::read_rds(file = paste0(datadir, "/", tolower(sur), "_hh.rds")) %>% 
  dr_tidy() %>%                             # Make tidy with column specifications
  dr_idunite(., remove = FALSE) %>%         # Add identifier
  left_join(reco, by=c("ship"="code")) %>%  # Add vesselname
  mutate(year = as.integer(year))

hl <- 
  readr::read_rds(file = paste0(datadir, "/", tolower(sur), "_hl.rds")) %>% 
  dr_tidy() %>%                             # Make tidy with column specifications
  dr_idunite(., remove = FALSE) %>%         # Add identifier
  dr_calccpue(hh) %>%                       # Calculated CPUE per hour
  left_join(aphia_latin) %>%                # Add scientific name
  left_join(asfis) %>%                      # Add FAO species names and codes
  left_join(reco, by=c("ship"="code")) %>%  # Add vesselname
  mutate(year = as.integer(year))

Calculate index

# stations
st <-
  hl |>
  dplyr::select(survey, id, year, lon=shootlong, lat=shootlat) |>
  distinct() |>
  group_by(survey, year) |>
  mutate(n.tows = n_distinct(id)) |>
  ungroup()

# may need to generate this for each survey
year.min <- min(st$year)
year.max <- max(st$year)

# Check if we get a unique id:
if(!st |> nrow() == st |> distinct(id, .keep_all = TRUE) |> nrow()) {
  warning("This is unexpected, check the code")
}

le <-
  hl |>
  bind_rows() |>
  dplyr::select(vessel, survey, id, year, lon=shootlong, lat=shootlat, latin, length, n=cpue_number_per_hour) |>
  filter(length > 0) |>
  # CHECK upstream why one has NA in n
  mutate(n = replace_na(n, 0)) |>
  # I thought this was done upstream
  mutate(length = floor(length)) |>
  group_by(vessel, survey, id, year, lon, lat, latin, length) |>
  summarise(n = sum(n),
            .groups = "drop") |>
  # get rid of species where cpue is always zero
  group_by(survey, latin) |>
  mutate(n.sum = sum(n)) |>
  ungroup() |>
  filter(n.sum > 0) |>
  select(-n.sum)

species_table <-
  le |>
  filter(length > 0,
         n > 0) |>
  # NOTE: Should one have different species list for different surveys
  group_by(survey, latin) |>
  summarise(n.year.pos = sum(n_distinct(year)),
            .groups = "drop") |>
  filter(n.year.pos >= 5) |>
  # only proper species, not genus
  filter(str_detect(latin, " ")) |>
  select(latin) |>
  distinct() |>
  left_join(tidydatras::asfis) |>
  mutate(english_name = case_when(species == "PLA" ~ "Long rough dab",
                                  species == "MON" ~ "Monkfish",
                                  species == "WHB" ~ "Blue whiting",
                                  species == "PIL" ~ "Sardine",
                                  species == "LUM" ~ "Lumpfish",
                                  species == "BIB" ~ "Pouting",
                                  species == "POK" ~ "Saithe",
                                  species == "USK" ~ "Tusk",
                                  species == "MUR" ~ "Red mullet",
                                  species == "MAC" ~ "Mackerel",
                                  species == "HOM" ~ "Horse mackerel",
                                  .default = english_name)) |>
  arrange(english_name)

# testing:
species_table |> count(english_name) |> filter(n > 1)

LATIN <- species_table$latin
names(LATIN) <- species_table$english_name

rbyl <-
  le |>
  filter(latin %in% LATIN) |>
  group_by(vessel, survey, year, latin, length) |>
  # the new kid on the block, here summarise returns a warning
  reframe(N = sum(n),                      # total number    by length caught in the year (per 60 minute haul)
          B = sum(n * 0.00001 * length^3)) #       mass [kg]

## trim length of species, make as a "plus group" -------------------------------
length.trim <-
  rbyl |>
  group_by(latin, length) |>
  reframe(B = sum(B)) |>
  arrange(latin, length) |>
  group_by(latin) |>
  mutate(cB = cumsum(B),
         cB.trim = 0.999 * max(cB),
         length.trim = ifelse(length > 30 & cB > cB.trim, NA, length),
         length.trim = ifelse(!is.na(length.trim), length.trim, max(length.trim, na.rm = TRUE))) |>
  select(latin, length, length.trim) |>
  ungroup()

rbyl <-
  rbyl |>
  left_join(length.trim) |>
  mutate(length = length.trim) |>
  # fill in full cm lengths from min to max witin each species
  # select(vessel, survey, year, latin, length) |> # step not really needed, just added for clarity
  group_by(vessel, survey, latin) |>
  expand(year = full_seq(c(min(year), max(year)), 1),
         length = full_seq(length, 1)) |>
  # join back to get the N and B
  left_join(rbyl) |>
  mutate(N = replace_na(N, 0),
         B = replace_na(B, 0))

rbyl <-
  rbyl |>
  left_join(st |> count(survey, year, name = "n.tows")) |>

  # divide total by the number of hauls (or should it by by station?)
  mutate(n = N / n.tows,
         b = B / n.tows) |>
  select(-n.tows)

# average over the whole survey time period
# results by length, used in the length frequency plot -------------------------
rbl <-
  rbyl |>
  group_by(vessel, survey, latin, length) |>
  reframe(N = mean(N),
          B = mean(B),
          n = mean(n),
          b = mean(b))

# Throw out variables that are not used downstream to speed up the shiny loading
rbyl <-
  rbyl |>
  select(vessel, survey, latin, year, length, n, b)
rbl <-
  rbl |>
  select(vessel, survey, latin, length, n, b)

rbys <-
  le |>
  filter(latin %in% LATIN) |>
  group_by(vessel, survey, id, year, lon, lat, latin) |>
  reframe(N = sum(n),
          B = sum(n * 0.00001 * length^3))

# add zero station
rbys <-
  rbys |>
  expand(nesting(vessel, survey, id, year, lon, lat), latin) |>
  # get back N and B
  left_join(rbys) |>
  mutate(N = replace_na(N, 0),
         B = replace_na(B, 0))

my_boot = function(x, times = 100) {

  # Get column name from input object
  var = deparse(substitute(x))
  var = gsub("^\\.\\$","", var)

  # Bootstrap 95% CI
  cis = stats::quantile(replicate(times, mean(sample(x, replace=TRUE))), probs=c(0.025,0.975))

  # Return data frame of results
  data.frame(var, n=length(x), mean=mean(x), lower.ci=cis[1], upper.ci=cis[2])
}

print("Bootstrapping abundance:")

boot.N <-
  rbys |>
  dplyr::group_by(vessel, survey, latin, year) %>%
  dplyr::do(my_boot(.$N)) %>%
  dplyr::mutate(variable = "N",
                var = as.character(variable))

print("Bootstrapping biomass:")

boot.B <-
  rbys %>%
  dplyr::group_by(vessel, survey, latin, year) %>%
  dplyr::do(my_boot(.$B)) %>%
  dplyr::mutate(variable = "B",
                var = as.character(var))

boot <-
  bind_rows(boot.N,
            boot.B)

# Throw out variables that are not used downstream to speed up the shiny loading
boot <-
  boot |>
  select(-n) %>% 
  left_join(species_table)

# gplyph plot ------------------------------------------------------------------
# need to create a dummy year before and after for the glyph-plot
glyph <-
  rbys |>
  mutate(sq = geo::d2ir(lat, lon)) |>
  group_by(vessel, survey, year, sq, latin) |>
  summarise(N = mean(N),
            B = mean(B),
            .groups = "drop")

Length frequencies plots

select_species <- "HER"
select_survey  <- "NS-IBTS"
select_quarter <- 1
winsorize      <- c(0.05, 0.95)
length_step    <- 0.5

Index plots

myspecies <- c("MUR", "CTC","MAC","WHG", "HOM", "GUR", "GUG", "GUU", "PLE", "WEG", "SQR", "ANE", "JOD", "POD", "SYC")
myvar     <- "B"

boot %>%
  dplyr::filter(species %in% myspecies, var == myvar) %>%
  dplyr::filter(year != 2020) %>% 
  # dplyr::filter(var == myvar) %>%
  ggplot2::ggplot(ggplot2::aes(year, mean)) +
  ggplot2::theme_bw(base_size = 16) +
  ggplot2::theme(legend.position="bottom") +

  ggplot2::geom_pointrange(ggplot2::aes(year, mean, ymin = lower.ci, ymax = upper.ci, colour=vessel)) +
  ggplot2::geom_smooth(ggplot2::aes(year, mean, group=vessel, colour=vessel), method="lm", se=FALSE) +

  ggplot2::scale_x_continuous(breaks = seq(1985, 2025, by = 5)) +
  ggplot2::expand_limits(y = 0) +
  ggplot2::labs(x = NULL, y=myvar) +
  # ggplot2::facet_wrap(~paste(species, english_name), scales="free_y")
  ggplot2::facet_wrap(~paste(latin, english_name, species), scales="free_y", ncol=5)