knitr::opts_chunk$set(warning=FALSE, message=FALSE)

library(tangled)
library(tidyverse)
library(ggplot2)
library(gghighlight)
library(lme4)
library(lattice)
library(forcats)
library(stringr)

The goal of this vignette is to describe health during available years by building up a data frame where the dependent variable is success of pregnancy, and the covariates are as follows:

  1. Health during available year
  2. whether the animal was entangled or not during the available year
  3. severity of entanglement
  4. time since last pregnancy
  5. decade

We begin by preparing the data.

Data Preparation

We have two different data frames that we need to work with: 1) the list of every known pregnancy event, and 2) the individual estimates of health that form the core of the model output.

Pregnancy Data

pregnant <- read_csv(here::here('data-raw', '2021-02-25_years_since_pregnancy_1980-2013.csv'))%>% 
  dplyr::select(!starts_with('X')) %>% 
  filter(Year >=1980 & Year <= 2013) %>% 
  drop_na(Pregnant)

pregnant$Pregnant[pregnant$Pregnant == 'F'] <- '1'
pregnant$Pregnant <- as.numeric(pregnant$Pregnant)

# pregnant_old <- read_csv(here::here('data-raw', '2020-10-20_pregnancy_yes-no.csv')) %>%
# select(!starts_with('X')) %>%
# filter(Year >=1980 & Year <= 2013)

Health and Entanglement data

Now that we have the dependent variables, we need to build up the information on health and entanglement status. Let's do health first

healthmean <- sumh / ng
pregnant$health <- 0
pregnant$health_min <- 0

for(i in 1:nrow(pregnant)){

  rowidx <- match(pregnant$EGNo[i], ID)
  s <- match(paste0("1-", pregnant$Year[i]), myName)
  e <- match(paste0("12-", pregnant$Year[i]), myName)
  colidx <- s:e  
  pregnant$health[i] <- mean(healthmean[rowidx, colidx], na.rm = TRUE)
  pregnant$health_min[i] <- min(healthmean[rowidx, colidx], na.rm = TRUE)
}

Then we add in the decadal information that we'll use as a factor covariate and possible as an interaction term.

pregnant <- pregnant %>% 
  mutate(decade = case_when(Year >= 1980 & Year <= 1989 ~ 1980,
                            Year >= 1990 & Year <= 1999 ~ 1990,
                            Year >= 2000 & Year <= 2009 ~ 2000,
                            Year >= 2010 & Year <= 2019 ~ 2010))

Ok, let's move on to the entanglement data. What I'm after now are whether or not the animal was entangled in a given available year, and if so what the severity was.

ent_tidy <- read_csv(here::here('data-raw','2020-09-29-entanglement_tidy.csv')) %>% 
  select(!starts_with('X')) %>% 
  select(!starts_with('Created')) %>% 
  select(!starts_with('Edited')) %>% 
  select(!starts_with('Change')) %>% 
  mutate(year = lubridate::year(lubridate::dmy(EndDate)),
         gear = str_detect(EntanglementComment, 'GEAR'),
         gear_vec = if_else(gear, 1, 0))# %>% select(-gear)

We wish to intersect these to bring the entanglement information across. In particular, we need severity.

UPDATE - should I do this with a join instead?

pregnant$severity <- 'unentangled'
# pregnant$gear <- 0

for (i in 1:nrow(pregnant)){
  # Get the Animal out and the year out for looking at the entanglement data
  egno_p <- pregnant$EGNo[i]
  year_p <- pregnant$Year[i]

  # subset the entanglement data to just pull out the 
  dsub <- ent_tidy %>% 
    filter(EGNo == egno_p & year == year_p)

  if(nrow(dsub) == 0) next()
  if(nrow(dsub) > 1) print(i) # note that while these are both cases where the status is the same, you should add a rule for error trapping if/when more data come in
  pregnant$severity[i] <- dsub$EntanglementStatus
  # pregnant$gear[i] <- dsub$gear_vec

}

Now we have to distinguish the unimpacted vs. the unentangled

pregnant <- pregnant %>% 
    mutate(sev_num = case_when(severity == '0' ~ 0,
                               severity == 'unentangled' ~ 0,
              severity == 'minor' ~ 1,
              severity == 'moderate' ~ 2,
              severity == 'severe' ~ 3))

pregnant <- pregnant %>% 
  group_by(EGNo) %>% 
    mutate(ent_group = cumsum(sev_num)) 
pregnant$severity[pregnant$ent_group == 0] <- 'unimpacted'

Covariate Creation and Standardization

Before we run the glm, we need to get the reference levels correct for the entanglement severity, and I want to make a binomial column for entanglement status. I also want to standardize the year covariate

pregnant$severity <- factor(pregnant$severity, levels = c('unimpacted', 'unentangled', 'minor', 'moderate', 'severe')) # this should have an added level to account for never yet entangled, i.e. unimpacted
pregnant <- pregnant %>% 
  mutate(after_2010 = if_else(Year < 2010, 0, 1)) %>%
  mutate(ent_status = if_else(severity == 'unentangled', 0, 1)) %>% 
  mutate(ent_status_2 = case_when(severity == 'unimpacted' ~ 'unimpacted',
                                  severity == 'unentangled' ~ 'unentangled',
                            severity == 'minor' ~ 'minor',
                            severity == 'moderate' ~ 'mod_sev',
                            severity == 'severe' ~ 'mod_sev'))

pregnant$ent_status_2 <- factor(pregnant$ent_status_2, levels = c('unimpacted', 'unentangled', 'minor', 'mod_sev')) 

mn_year <- mean(pregnant$Year)
sd_year <- sd(pregnant$Year)
pregnant$std_year <- (pregnant$Year - mn_year) / sd_year

pregnant$elapsed <- as.numeric(pregnant$elapsed)
pregnant$fctr_yr <- factor(pregnant$Year)
pregnant$EGNo <- factor(pregnant$EGNo)
pregnant$health_scl <- (pregnant$health - mean(pregnant$health, na.rm = TRUE)) / sd(pregnant$health, na.rm = TRUE)
pregnant$health_min_scl <- (pregnant$health_min - mean(pregnant$health_min, na.rm = TRUE)) / sd(pregnant$health_min, na.rm = TRUE)

Calculating elapsed times. We need two final covariates:

  1. Time since last entanglement
  2. Time since last calving (Amy made this)
# block from function to go in here

Model Fits

We'll run two binomial glms that only vary in how the entanglement covariate is constructed. In the first, it will be simply unentangled vs. entangled; in the second, we will have the 3 standard levels of entanglement

fit1 <- glm(Pregnant ~ health_scl + ent_status + std_year + factor(decade), data = pregnant, family = binomial(link = "logit"), na.action = na.exclude)
summary(fit1)

Now we'll examine a covariate with all 5 levels of entanglement (unimpacted, not-entangled, minor, moderate, severe).

fit2 <- glm(Pregnant ~ health_scl + severity + std_year + factor(decade), data = pregnant, family = binomial(link = "logit"))
summary(fit2)

Since it looks like there's an effect of entanglement, but owing to the relatively few cases for severe, I'm going to try and lump the severe and moderate to see if I can tease out a signal.

fit3 <- glm(Pregnant ~ health_scl + ent_status_2 + std_year + factor(decade), data = pregnant, family = binomial(link = "logit"))
summary(fit3)

Time Since Last Event

Goal of this last one is to look at elapsed variables, i.e. to take into account the fact that as time has elapsed, there is a stronger likelihood of an animal either getting pregnant, or perhaps not getting pregnant. I'll put two subsections here - one for time since pregnancy and one for time since entanglement. Note that each block of code is similar, and starts with the pregnant data frame

Time Since Last Pregnancy

First we prep up the data; Note, Amy calcualted this, so I don;t have to do it.

dsub <- pregnant %>% 
  # dplyr::slice_head(n = 250) %>%
  select(EGNo, Year, Pregnant) %>% 
  arrange(EGNo, Year)
egno <- unique(dsub$EGNo)
# id <- 1013 # testing
# loop over animals
df_all <- numeric(0)
for(id in egno){

  my_dsub <- dsub %>% 
    filter(EGNo == id) 

  # Expand to fill all years
  df_exp <- data.frame(EGNo = unique(my_dsub$EGNo),
                       Year = seq(my_dsub$Year[1], 
                                       my_dsub$Year[length(my_dsub$Year)],
                                       by = 1))

  df <- left_join(df_exp, my_dsub, by = c('Year', 'EGNo') ) 
  df$Pregnant <- df$Pregnant %>%  replace_na(0)
  df <- df %>% 
    mutate(preg_group = cumsum(Pregnant))

  df <- df %>% 
    filter(preg_group > 0) %>% 
    group_by(preg_group) %>% 
    mutate(elapsed = Year - first(Year)) %>% 
    ungroup(preg_group) %>% 
    select(- preg_group)

  df <- left_join(my_dsub, df, by = c('Year', 'EGNo', 'Pregnant') ) 
  # df <- inner_join(my_dsub, df, by = c('Year', 'EGNo') ) # keep rows in my_dsub

  df_all <- rbind(df_all, df)
}
elapsed_preg <- df_all # for later joining to the elapsed ent data

Ok, then we run the tests:

fit1 <- glm(Pregnant ~ health_scl + ent_status + std_year + factor(decade) + elapsed, 
            data = pregnant, 
            family = binomial(link = "logit"), 
            na.action = na.exclude)
# summary(fit1)


fit2 <- glm(Pregnant ~ health_scl + severity + std_year + factor(decade)+ elapsed, data = pregnant, family = binomial(link = "logit"))
# summary(fit2)

fit4 <- glm(Pregnant ~ health_scl * severity + std_year + factor(decade)+ elapsed, data = pregnant, family = binomial(link = "logit"))
# summary(fit4)


fit3 <- glm(Pregnant ~ health_scl + ent_status_2 + std_year + factor(decade) + elapsed, data = pregnant, family = binomial(link = "logit"))
# summary(fit3)

# Interaction with elapsed and pre-post 2010
fit5 <- glm(Pregnant ~ health_scl + ent_status + std_year + factor(decade) + elapsed*after_2010, 
            data = pregnant, 
            family = binomial(link = "logit"), 
            na.action = na.exclude)

# quadratic fit for elapsed
fit6 <- glm(Pregnant ~ health_scl + ent_status + std_year + factor(decade) + poly(elapsed, 2), 
            data = subset(pregnant, is.finite(elapsed)), 
            family = binomial(link = "logit"), 
            na.action = na.exclude)
# summary(fit6)

# interaction between elapsed and decade  - something there
fit7 <- glm(Pregnant ~ health_scl + ent_status + std_year + factor(decade) + elapsed:factor(decade), 
            data = pregnant, 
            family = binomial(link = "logit"), 
            na.action = na.exclude)
# summary(fit7)

# interaction between elapsed and year
fit8 <- glm(Pregnant ~ health_scl + ent_status + std_year + factor(decade) + elapsed:std_year, 
            data = pregnant, 
            family = binomial(link = "logit"), 
            na.action = na.exclude)
# summary(fit8)

fit9 <- glm(Pregnant ~ health_scl + severity + std_year + factor(decade) + elapsed:factor(decade), 
            data = pregnant, 
            family = binomial(link = "logit"), 
            na.action = na.exclude)

Is there a best model:

AIC(fit1, fit2, fit3, fit4, fit5, fit6, fit7, fit8, fit9)

Not really - they are all pretty much the same. I'm choosing model 9

summary(fit9)

My interpretation of that is:

Time Since Last Entanglement

Next we turn to the years since entanglement to test whether or not a similar pattern is seen with these disturbances. First, we prep the data. It's important to note that we will be missing the unimpacted from this, so it's not a perfect data comparison to the previous.

UPDATE - 2021-02-26 - we need to re think this block a little because I want to bring in the gear data after i've expanded the years.

dsub <- pregnant %>% 
  # dplyr::slice_head(n = 250) %>%
  select(EGNo, Year, Pregnant) %>% 
  arrange(EGNo, Year)
egno <- unique(dsub$EGNo)
# id <- 1013 # testing
# loop over animals
df_all <- data.frame()
for(id in egno){
  # print(id)
  my_dsub <- pregnant %>% 
    filter(EGNo == id) 

  # Expand to fill all years
  df_exp <- data.frame(EGNo = unique(my_dsub$EGNo),
                       Year = seq(my_dsub$Year[1], 
                                       my_dsub$Year[length(my_dsub$Year)],
                                       by = 1))

  df <- left_join(df_exp, my_dsub, by = c('Year', 'EGNo') ) 

  # to prep the vector for iteratively finding years that change status
  df$severity <- fct_explicit_na(df$severity, "0") 

    df <- df %>% 
    dplyr::mutate(sev_num = case_when(severity == '0' ~ 0,
                               severity == 'unimpacted' ~ 0,
                               severity == 'unentangled' ~ 1,
              severity == 'minor' ~ 2,
              severity == 'moderate' ~ 3,
              severity == 'severe' ~ 4)) %>% 
      dplyr::select(-severity)

   # df$sev_num <- as.numeric(df$sev_num)

   # df <- df %>%
     # mutate(ent_group = cumsum(sev_num))

  df <- df %>% 
    filter(ent_group > 0) %>% 
    group_by(ent_group) %>% 
    mutate(ent_elapsed = Year - first(Year)) %>% 
    mutate(last_severity = first(sev_num)) %>% 
    ungroup(ent_group) %>%
    select(EGNo, Year, ent_elapsed, last_severity)

  df <- inner_join(my_dsub, df, by = c('Year', 'EGNo')) # keep rows in my_dsub

  df_all <- rbind(df_all, df)
}
# write_csv(df_all, '../data/years_since_entanglement.csv')

df_all$health_scl <- (df_all$health - mean(df_all$health, na.rm = TRUE)) / sd(df_all$health, na.rm = TRUE)
df_all$health_min_scl <- (df_all$health_min - mean(df_all$health_min, na.rm = TRUE)) / sd(df_all$health_min, na.rm = TRUE)

Let's look at a snippet of these data to see if they make sense:

knitr::kable(df_all %>% 
  select(EGNo, Year, Pregnant, elapsed, decade, severity, last_severity, ent_elapsed) %>% 
    slice(1:10))

Ok, so the regression I set up is a binomial regression of the probability an animal gets pregnant in an available year as a function of health and an interaction between time (in years) since entanglement and the severity of the last entanglement. This is close in spirit to what you we had done before in terms of the animal entering a state and then staying in that:

fit1 <- glm(Pregnant ~ health_scl + factor(last_severity) * ent_elapsed, 
            data = df_all, family = binomial(link = "logit"))
summary(fit1)

What we see is two fold:

  1. as health improves, animals are more likely to get pregnant
  2. If the last entanglement was severe, the animal is less likely to get pregnant.

Though this second result is significant, which is what we were looking for, it's important to note that there's a large standard error on the coefficient. Both of these (marginal significance and high SE) are a) related and b) likely owing to the fact that we don't have tons of these cases. In any event, the story is consistent with the fact that severe entanglements are bad.

Over to you.

Understanding the Last Regression

Let's recap. we have a regression with three seemingly counter-intuitive results:

  1. the probability of getting pregnant goes down as more time has passed since the last entanglement
  2. if the last entanglement was severe, the probability of getting pregnant is quite a bit lower
  3. the interaction between status of last entanglement and time since that entanglement is positive when the last entanglement was severe

So what to make of these - especially the last one. The first two things to try are to: a do a likelihood ratio test for models that include this interaction and ones that don't; and b) to plot out the data in this factor combination.

Likelihood Ratio Test

Here we fit a model without that interaction and compare the results:

library(lmtest)
fit2 <- glm(Pregnant ~ health_scl + factor(last_severity) + ent_elapsed, 
            data = df_all, family = binomial(link = "logit"))
summary(fit2)

Next we compare the two models:

lrtest(fit2, fit1)
AIC(fit2, fit1)

This would suggest that parsimony says we should favor (slightly) the simpler model without the interaction, i.e. fit2. Let's plot the interactions, though, just so we can be sure of what we're looking at. I'm going to use an example from Fox's Companion to Applied Regression Book. First we calculate the mean values by category and outcome:

means <- tapply(df_all$ent_elapsed, list(df_all$last_severity, df_all$Pregnant), mean)

Then we make the plot:

plot(c(0.5, 4.5), range(df_all$ent_elapsed), 
     xlab = 'Severity of Last Entanglement', ylab = 'Years Since Last Entanglement',
     axes = F, type = 'n')
axis(1, at=1:4, c('Unentangled', 'Minor', 'Moderate', 'Severe'))
axis(2)
box()
# Unsuccessful
points(jitter(df_all$last_severity[df_all$Pregnant == 0]),
       df_all$ent_elapsed[df_all$Pregnant == 0], pch = 1,
       col = rgb(red = 0, green = 0, blue = 0, alpha = 0.5))
# Successful
points(jitter(df_all$last_severity[df_all$Pregnant == 1]),
       df_all$ent_elapsed[df_all$Pregnant == 1], pch = 16,
       col = rgb(red = 0, green = 0, blue = 0, alpha = 0.65))

lines(1:4, means[, 1], lty = 3, lwd = 3, type = 'b', pch = 1, col = '#fdbb84') # not pregnant
lines(1:4, means[, 2], lty = 1, lwd = 3, type = 'b', pch = 19, col = '#e34a33') # pregnant

legend(x = 2.75, y = 33, c('Pregnant', 'Not-Pregnant'), lty = c(1, 3), lwd = c(3, 3), pch = c(16, 1),
       col = c('#e34a33', '#fdbb84'))

What this plot shows is the years since entanglement on the y-axis, status of last entanglement on the x-axis. The points (jittered for clarity) are empty if the whale did not get pregnant in an available year, and filled if the whale did get pregnant. I also add the colored lines that show the mean of each factor combination. It's the uptick in the last, severe, column that is what the regression is picking up.

And I think this helps clarify what confused me last week. The story as I understand it (summarizing both the plot and the regression where we evaluate pregnancy events since the last entanglment):

Testing for Impact of Gear

UPDATE - 2021-02-26 - we need to re think this block a little because I want to bring in the gear data after i've expanded the years. I'm going to pare down pregnant to just the animals that had gear.

ent_gear <- ent_tidy %>% 
  filter(gear_vec == 1) 
ent_gear$EGNo <- factor(ent_gear$EGNo)

df_all_no_gear <- df_all %>% 
  filter(!(EGNo %in% unique(ent_gear$EGNo)))

# note that here we only look at animals that experienced gear at some point
dsub <- pregnant %>% 
  filter(EGNo %in% unique(ent_gear$EGNo)) %>%
  select(EGNo, Year, Pregnant) %>% 
  arrange(EGNo, Year)

egno <- unique(dsub$EGNo)
# id <- 1163 # testing for gear
# loop over animals
df_all_gear <- data.frame()
for(id in egno){
  # print(id)
  my_dsub <- pregnant %>% 
    filter(EGNo == id) 

  # Expand to fill all years
  df_exp <- data.frame(EGNo = unique(my_dsub$EGNo),
                       Year = seq(my_dsub$Year[1], 
                                       my_dsub$Year[length(my_dsub$Year)],
                                       by = 1))

  df <- left_join(df_exp, my_dsub, by = c('Year', 'EGNo') ) 
  df <- left_join(df, ent_gear, by = c('Year' = 'year', 'EGNo' = 'EGNo') ) 

  # to prep the vector for iteratively finding years that change status
  df$severity <- fct_explicit_na(df$severity, "0") 
  df$gear_vec[is.na(df$gear_vec)] <- 0

    df <- df %>% 
    dplyr::mutate(sev_num = case_when(severity == '0' ~ 0,
                               severity == 'unimpacted' ~ 0,
                               severity == 'unentangled' ~ 1,
              severity == 'minor' ~ 2,
              severity == 'moderate' ~ 3,
              severity == 'severe' ~ 4)) %>% 
      dplyr::select(-severity)

   df$sev_num <- as.numeric(df$sev_num)

   df <- df %>%
     mutate(ent_group = cumsum(sev_num))

   df <- df %>%
     mutate(gear_group = cumsum(gear_vec))

  df <- df %>% 
    filter(ent_group > 0) %>% 
    group_by(ent_group) %>% 
    mutate(ent_elapsed = Year - first(Year)) %>% 
    mutate(last_severity = first(sev_num)) %>% 
    ungroup(ent_group) %>%
    group_by(gear_group) %>% 
    mutate(last_gear = first(gear_vec)) %>% 
    ungroup(gear_group) %>% 
    select(EGNo, Year, ent_elapsed, last_severity, last_gear)

  df <- inner_join(my_dsub, df, by = c('Year', 'EGNo')) # keep rows in my_dsub

  df_all_gear <- rbind(df_all_gear, df)
}
# write_csv(df_all, '../data/years_since_entanglement.csv')

Ok, so the regression I set up is a binomial regression of the probability an animal gets pregnant in an available year as a function of health and an interaction between time (in years) since entanglement and the severity of the last entanglement. This is close in spirit to what you we had done before in terms of the animal entering a state and then staying in that:

fit_gear <- glm(Pregnant ~ health_scl + factor(decade) + last_gear*last_severity, 
            data = df_all_gear, family = binomial(link = "logit"))
summary(fit_gear)

Add back in Non-Gear Animals

So the above test is just the subset of animals that had gear, and to be complete we want to compare against all possible pregnancies

df_all_no_gear$last_gear <- 0
df_all_final <- bind_rows(df_all_gear, df_all_no_gear)
df_all_final$health_scl <- (df_all_final$health - mean(df_all_final$health, na.rm = TRUE)) / sd(df_all_final$health, na.rm = TRUE)

Run the test

fit_gear_all <- glm(Pregnant ~ health_scl + factor(decade) + last_gear*last_severity, 
            data = df_all_final, family = binomial(link = "logit"))
summary(fit_gear_all)


robschick/tangled documentation built on May 9, 2022, 4:07 p.m.