In einarhjorleifsson/husky: a collection of husky functions

knitr::opts_chunk$set(eval = TRUE, echo = TRUE,
                      message = FALSE, warning = FALSE)

Biomass indices

SMB

huskyverse

NB: The next generation of indices calculation has been seeded in /net/hafkaldi/export/u2/reikn/Splus5/SurveyWork. According to Husky (December 2016) this has not "formally" been adopted.

Script location

Husky's master scripts is /net/hafkaldi/export/u2/reikn/Splus5/SMB/allarteg.sh. This script calls for each species two additional shell scripts: /net/hafkaldi/export/u2/reikn/Splus5/SMB/BIOVISIT_R.sh /net/hafkaldi/export/u2/reikn/Splus5/SMB/BIOVISIT_R/BIOVISIT_R.sh

The difference in the two scripts is:

 diff BIOVISIT_R.sh BIOVISIT.fastar_R.sh 
10c10,11
< st1 <- STODVAR$y999[STODVAR$y999$tognumer %in% 1:39,]
---
> ind <- c(31931,31932,32131,36731,37031,37131,37132,37231,41431,41531,42231,42232,47431,52331)
> st1 <- STODVAR$y999[STODVAR$y999$tognumer %in% 1:19 | !is.na(match(STODVAR$y999$index,ind)) ,]
129,130c130,131
<     base.visit <- tmp.visit1a
<     aggr.visit <- tmp.visit2a
---
>     base.visit.fastar <- tmp.visit1a
>     aggr.visit.fastar <- tmp.visit2a
133,134c134,135
<   base.visit <- rbind(base.visit,tmp.visit1a)
<   aggr.visit <- rbind(aggr.visit,tmp.visit2a) 
---
>   base.visit.fastar <- rbind(base.visit.fastar,tmp.visit1a)
>   aggr.visit.fastar <- rbind(aggr.visit.fastar,tmp.visit2a) 
145,146c146,147
< aggr.visit$bio <- aggr.visit$fj*lwcoeff[1]*aggr.visit$lengd^lwcoeff[2]/1e3
< base.visit$bio <- base.visit$fj*lwcoeff[1]*base.visit$lengd^lwcoeff[2]/1e3
---
> aggr.visit.fastar$bio <- aggr.visit.fastar$fj*lwcoeff[1]*aggr.visit.fastar$lengd^lwcoeff[2]/1e3
> base.visit.fastar$bio <- base.visit.fastar$fj*lwcoeff[1]*base.visit.fastar$lengd^lwcoeff[2]/1e3

Minimum fuzz adaptation

The functions and data objects have been moved from the various .RData spaces to the husky-package. The following is based on the script /net/hafkaldi/export/u2/reikn/Splus5/SMB/BIOVISIT_R.sh with some minimum adaptations.

library(tidyverse)
library(lubridate)
library(fjolst)
library(husky)
library(smx)

YEARS <- 1985:2016
TEG <- 1
KYN <- FALSE
lwcoeff <- LWCOEFF[[as.character(TEG)]]
lengdir <- LENGDIR[[as.character(TEG)]]

t1 <- now()
base.visit2 <- list()
aggr.visit2 <- list()
STODVAR2 <- dplyr::bind_rows(STODVAR)  # EH added

for (j in 1:length(YEARS)) {
  #print(YEARS[j])
  #st1 <- STODVAR$'y1985'[STODVAR$'y1985'$tognumer %in% 1:39,]
  st1 <- STODVAR2[STODVAR2$ar == YEARS[j] & STODVAR2$tognumer %in% 1:39,] # EH added
  tmp <- lesa.lengdir(st1$synis.id,TEG,col.names="kyn")
  tmp1 <- lesa.numer(st1$synis.id,TEG) 
  tmp <- Skala.med.toldum(tmp,tmp1)
  i <- is.na(tmp$fj.alls) 
  tmp$fj.alls[i] <- 0
  tmp$bio <- tmp$fj.alls*lwcoeff[1]*tmp$lengd^lwcoeff[2]/1e6 # tonn
  tmp$fj.alls <- tmp$fj.alls/1e3 # þúsundir


  for( i in 1:length(lengdir)) {

    #print(i) 
    tmp1 <- tmp[tmp$lengd==lengdir[i],]

    if(nrow(tmp1) > 0) {
      x <- apply.shrink(tmp1$fj.alls,tmp1$synis.id,sum) 
      names(x) <- c("synis.id","fj")
      st <- husky:::join(st1[,c("newstrata","toglengd","synis.id")],x,"synis.id",set=0)
    } else {
      st <- st1[,c("newstrata","toglengd","synis.id")]
      st$fj <- rep(0,nrow(st)) 
    }

    if(KYN) {
      tmp1 <- tmp[tmp$lengd==lengdir[i] & tmp$kyn==1 & !is.na(tmp$kyn),]
      if(nrow(tmp1) > 0) {
        x <- apply.shrink(tmp1$fj.alls,tmp1$synis.id,sum) 
        names(x) <- c("synis.id","fjhaenga")
        st <- husky:::join(st,x,"synis.id",set=0)
      } else {
        st$fjhaenga <- rep(0,nrow(st)) 
      }

      tmp1 <- tmp[tmp$lengd==lengdir[i] & tmp$kyn==2 & !is.na(tmp$kyn),]
      if(nrow(tmp1) > 0) {
        x <- apply.shrink(tmp1$fj.alls,tmp1$synis.id,sum) 
        names(x) <- c("synis.id","fjhrygna")
        st <- husky:::join(st,x,"synis.id",set=0)
      }
      else {
        st$fjhrygna <- rep(0,nrow(st)) 
      }
    }

    tmp1 <- tmp[tmp$lengd >= lengdir[i],]
    if(nrow(tmp1) > 0) {
      x <- apply.shrink(tmp1$bio,tmp1$synis.id,sum) 
      names(x) <- c("synis.id","bioge") 
      st <- husky:::join(st,x,"synis.id",set=0)
    } else {
      st$bioge <- rep(0,nrow(st)) 
    }

    tmp1 <- tmp[tmp$lengd <= lengdir[i],]
    if(nrow(tmp1) > 0) {
      x <- apply.shrink(tmp1$fj.alls,tmp1$synis.id,sum) 
      names(x) <- c("synis.id","fjle")
      st <- husky:::join(st,x,"synis.id",set=0)
    } else {
      st$fjle <- rep(0,nrow(st)) 
    }


    tmp.visit <- Calc.index(st,"fj")

    tmp.biovisit <- Calc.index(st,"bioge")
    tmp.seidavisit <- Calc.index(st,"fjle")

    if(KYN) {
      tmp.haengavisit <- Calc.index(st,"fjhaenga")
      tmp.hrygnuvisit <- Calc.index(st,"fjhrygna")
    }

    tmp.visit1 <- tmp.visit$result[,c("strata","total","cv")]
    names(tmp.visit1)[2:3] <- c("fj","cv.fj")
    tmp.visit1$bio.staerri <- tmp.biovisit$result[,"total"]
    tmp.visit1$cv.bio.staerri <- tmp.biovisit$result[,"cv"]
    tmp.visit1$fj.minni <- tmp.seidavisit$result[,"total"]
    tmp.visit1$cv.fj.minni <- tmp.seidavisit$result[,"cv"]

    if(KYN) {
      tmp.visit1$fj.haenga <- tmp.haengavisit$result[,"total"]
      tmp.visit1$cv.fj.haenga <- tmp.haengavisit$result[,"cv"]
      tmp.visit1$fj.hrygna <- tmp.hrygnuvisit$result[,"total"]
      tmp.visit1$cv.fj.hrygna <- tmp.hrygnuvisit$result[,"cv"]
    }

    tmp.visit2 <- tmp.visit$aggr.output[,c("total","cv")]
    names(tmp.visit2) <- c("fj","cv.fj") 
    tmp.visit2$bio.staerri <- tmp.biovisit$aggr.output[,"total"]
    tmp.visit2$cv.bio.staerri <- tmp.biovisit$aggr.output[,"cv"]
    tmp.visit2$fj.minni <- tmp.seidavisit$aggr.output[,"total"]
    tmp.visit2$cv.fj.minni <- tmp.seidavisit$aggr.output[,"cv"]

    if(KYN) {
      tmp.visit2$fj.haenga <- tmp.haengavisit$aggr.output[,"total"]
      tmp.visit2$cv.fj.haenga <- tmp.haengavisit$aggr.output[,"cv"]
      tmp.visit2$fj.hrygna <- tmp.hrygnuvisit$aggr.output[,"total"]
      tmp.visit2$cv.fj.hrygna <- tmp.hrygnuvisit$aggr.output[,"cv"]
    }


    tmp.visit2$svaedi <- dimnames(tmp.visit2)[[1]]
    tmp.visit2$svaedisnr <- 1:nrow(tmp.visit2)
    dimnames(tmp.visit2)[[1]] <- 1:nrow(tmp.visit2)
    tmp.visit1$lengd <- rep(lengdir[i],nrow(tmp.visit1))
    tmp.visit2$lengd <- rep(lengdir[i],nrow(tmp.visit2))
    tmp.visit1$ar <- rep(YEARS[j],nrow(tmp.visit1))
    tmp.visit2$ar <- rep(YEARS[j],nrow(tmp.visit2))
    if(i == 1 ) {
      tmp.visit1a <- tmp.visit1
      tmp.visit2a <- tmp.visit2
    } else {
      tmp.visit1a <- rbind(tmp.visit1a,tmp.visit1)
      tmp.visit2a <- rbind(tmp.visit2a,tmp.visit2)
    }
  }
  base.visit2[[j]] <- tmp.visit1a
  aggr.visit2[[j]] <- tmp.visit2a
} # End year loop

base.visit2 <- bind_rows(base.visit2)
aggr.visit2 <- bind_rows(aggr.visit2)
# tonn því fjöldi er þegar / 1000 en lengd-þyngdar gefur grömm.  
aggr.visit2$bio <-
  aggr.visit2$fj*lwcoeff[1]*aggr.visit2$lengd^lwcoeff[2]/1e3
base.visit2$bio <-
  base.visit2$fj*lwcoeff[1]*base.visit2$lengd^lwcoeff[2]/1e3

print(paste("Duration:", round((now() - t1)/dminutes(1),2), "minutes"))

TESTING

attach('/net/hafkaldi/export/u2/reikn/Splus5/SMB/TORSKUR/.RData')
#detach("file:/net/hafkaldi/export/u2/reikn/Splus5/SMB/TORSKUR/.RData")

ggplot() +
  geom_pointrange(data = aggr.visit2 %>% filter(svaedi == "Heild", lengd == min(lengd)),
                  aes(ar, bio.staerri, 
                      ymin = bio.staerri * (1 - cv.bio.staerri),
                      ymax = bio.staerri * (1 + cv.bio.staerri)),
                  lwd = 1,
                  size = 4) +
  geom_pointrange(data = aggr.visit %>% filter(svaedi == "Heild", lengd == min(lengd)),
                  aes(ar, bio.staerri, 
                      ymin = bio.staerri * (1 - cv.bio.staerri),
                      ymax = bio.staerri * (1 + cv.bio.staerri)),
                  colour = "red") +
  labs(x = NULL, y = NULL, title = "Standardized biomass index")

ERGO: The two huskyverse things are the same

tidyverse

A little demo on using only dplyr verbs and ggplot2 to achieve quite a lot of things. Topic is a candidate for the upcoming ICES TCRENV course.

The data used (Icelandic spring groundfish survey) is just a placeholder for the important part - the generic algorithm used to achieve an objective. In this specific case one has a demonstration of a calculation and visualization of stratified survey indices. Of course with the inclusion of the associated uncertainties.

Setup

Some constants:

std.towlength <- 4             # seamiles
std.towwidth  <- 17            # meters
std.area      <- 4 * 17/1852   # standard area swept square nautical miles
std.cv <- 1                    # for setting the cv as the mean if number of tows in a
                               #  strata is one

Get in some data

Data for some 9 most common species from the Icelandic spring groundfish survey can be obtained by:

t1 <- now()
#library(dplyrOracle)
#library(mar)
#db <- src_oracle("mar")
Station <- 
  #readr::read_csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Station.csv") %>%
  STODVAR %>% 
  bind_rows() %>% 
  filter(tognumer %in% 1:39) %>% 
  select(id = synis.id, year = ar, towlength = toglengd, strata = newstrata) %>% 
  mutate(towlength = trim_towlength(towlength)) %>% 
  arrange(id)
Length <- 
  #read.csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Length.csv")
  lesa.lengdir(Station$id, TEG, col.names="kyn") %>% 
  mutate(species = TEG) %>% 
  select(id = synis.id, species, length = lengd,
         n = fjoldi, sex = kyn)
Subsampling <-
  #read.csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Subsampling.csv")
  lesa.numer(Station$id, TEG) %>% 
  mutate(species = TEG) %>% 
  select(id = synis.id, species,
         n.counted = fj.talid,
         n.measured = fj.maelt) %>% 
  mutate(n.total = n.counted + n.measured)
Stratas <- 
  read.csv("http://www.hafro.is/~einarhj/data/tcrenv2016/Stratas.csv") %>% 
  select(strata, area = rall.area)

For the record we have the following table dimensions:

d <- data_frame(table = c("Station","Length","Subsampling","Stratas"),
                rows = c(nrow(Station),nrow(Length),nrow(Subsampling),nrow(Stratas)),
                columns = c(ncol(Station),ncol(Length),ncol(Subsampling),ncol(Stratas)))
d

Calculate

Specify species:

SPECIES <- 1                   # Cod

Filter the measurement data for the species in question

Get the raising factor:

ss <-
  Subsampling %>% 
  filter(species %in% SPECIES) %>%
  mutate(r = n.total/n.measured) %>% 
  select(id, r)

Get the length measurements and do the raising:

le <- 
  Length %>% 
  filter(species == SPECIES) %>%
  # a double precaution, in case length bins by sex
  group_by(id, length) %>%
  summarize(n = sum(n)) %>% 
  ungroup() %>% 
  left_join(ss, by="id") %>%
  mutate(n = n * r / 1e3) %>%     # units of thousands
  select(-r)

d <- smx::calc_indices(st = Station, le = le, stratas = Stratas)
print(paste("Duration:", round((now() - t1)/dminutes(1),2), "minutes"))

Comparison

ggplot() +
  geom_pointrange(data = d$aggr %>% filter(length == min(length)),
                  aes(year, cb, 
                      ymin = cb * (1 - cb.cv), 
                      ymax = cb * (1 + cb.cv)),
                  col = "black",
                  lwd = 1,
                  size = 4) +
  geom_pointrange(data = aggr.visit2 %>% filter(svaedi == "Heild", lengd == min(lengd)),
                  aes(ar, bio.staerri, 
                      ymin = bio.staerri * (1 - cv.bio.staerri),
                      ymax = bio.staerri * (1 + cv.bio.staerri)),
                  colour = "red") +
  labs(x = NULL, y = NULL, 
       title = "Standardized biomass index",
       subtitle = "Tidyverse (black) vs. Höskiverse (red)")

ggplot() +
  geom_pointrange(data = d$aggr %>% filter(length == max(length)),
                  aes(year, cn, 
                      ymin = cn * (1 - cn.cv), 
                      ymax = cn * (1 + cn.cv)),
                  col = "black",
                  lwd = 1,
                  size = 4) +
  geom_pointrange(data = aggr.visit2 %>% filter(svaedi == "Heild", lengd == max(lengd)),
                  aes(ar, fj.minni, 
                      ymin = fj.minni * (1 - cv.fj.minni),
                      ymax = fj.minni * (1 + cv.fj.minni)),
                  col = "red") +
  labs(x = NULL, y = NULL, title = "Standardized abundance index",
       subtitle = "Tidyverse (black) vs. Höskiverse (red)")

devtools::session_info()

# Storage
base <- 
  as_data_frame(expand.grid(length = c(5:140), id = Station$id)) %>% 
  left_join(Station, by = "id") %>%
  left_join(le, by=c("id","length")) %>% 
  arrange(id, length) %>% 
  group_by(id) %>% # The grouping here is for the cumsum calculation
  mutate(n  = ifelse(is.na(n),0,n)  * std.towlength / towlength, # standardized to per 4 miles
         cn = cumsum(n),
         b  = n * 0.01 * length^3/1e3,
         cb = sum(b) - cumsum(b) + b) %>%
  group_by(year, strata, length) %>%
  summarise(N  = n(),
            n_m  = mean(n),
            n_d  = ifelse(N == 1, n_m  * std.cv, sd(n)),
            cn_m = mean(cn),
            cn_d = ifelse(N == 1, cn_m * std.cv, sd(cn)),
            b_m  = mean(b),
            b_d  = ifelse(N == 1, b_m  * std.cv, sd(b)),
            cb_m = mean(cb),
            cb_d = ifelse(N == 1, cb_m * std.cv, sd(cb))) %>% 
  ungroup() %>% 
  left_join(Stratas, by = "strata") %>%
  mutate(area  = area/1.852^2 / std.area,
         n     = n_m  * area,
         cn    = cn_m * area,
         b     = b_m  * area,
         cb    = cb_m * area)

Combining strata:

aggr <- 
  base %>% 
  group_by(year, length) %>% 
  # A la Höski:
  summarise(n = sum(n),
            n.cv = smx:::calc_cv(n_m,n_d,area,N),
            b = sum(b),
            b.cv = smx:::calc_cv(b_m,b_d,area,N),
            cn = sum(cn),
            cn.cv = smx:::calc_cv(cn_m, cn_d, area, N),
            cb = sum(cb),
            cb.cv = smx:::calc_cv(cb_m, cb_d, area, N))