ip.R
In purgeR: Inbreeding-Purging Estimation in Pedigreed Populations

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, message=FALSE, echo=FALSE-----------------------------------------
library(caret)
library(coda)
library(e1071)
library(ggplot2)
library(glmnet)
library(grid)
library(gtable)
library(magrittr)
library(plyr)
library(purgeR)
library(purrr)
library(stringr)

## ----opportunity_of_purging, fig.align='center', fig.width=5------------------
data(arrui)
arrui <- arrui %>%
  purgeR::ip_F() %>% 
  purgeR::ip_op(Fcol = "Fi") %>%
  dplyr::mutate(species = "A. lervia") %>%
  purgeR::pop_t() %>% 
  dplyr::mutate(t = plyr::round_any(t, 1))

arrui %>%
    dplyr::group_by(species, t) %>%
    dplyr::summarise(Fi = mean(Fi),
                     Oe = mean(Oe),
                     Oe_raw = mean(Oe_raw),
                     nOe = ifelse(Fi > 0, Oe/Fi, 0),
                     nOe_raw = ifelse(Fi > 0, Oe_raw/Fi, 0)) %>%
    ggplot(aes(x = t)) +
    geom_area(aes(y = 1-nOe), fill = "blue", alpha = 0.5) +
    geom_area(aes(y = 1-nOe_raw), fill = "red", alpha = 0.5) +
    geom_line(aes(y = 1-nOe, color = "enabled"), size = 2) +
    geom_line(aes(y = 1-nOe_raw, color = "disabled"), size = 2) +
    facet_grid(. ~ species) +
    scale_y_continuous(expression(paste("1 - ", O["e"], " / F", sep = "")), limits = c(0, 1)) +
    scale_x_continuous("Equivalent to complete generations", breaks = c(0,1,2,3,4,5,6,7)) +
    scale_color_manual("Correction", values = c(enabled  = "blue", disabled = "red")) +
    theme(panel.background = element_blank(),
          strip.text = element_text(size = 12, face = "italic"),
          legend.position = "bottom")

## ----estimate_inbreeding_depression_prod--------------------------------------
data(dama)
dama %>%
  purgeR::ip_F() %>%
  dplyr::filter(prod > 0) %>%
  stats::lm(formula = log(prod) ~ Fi)

## ----estimate_inbreeding_depression_surv--------------------------------------
dama %>%
  purgeR::ip_F() %>%
  stats::glm(formula = survival15 ~ Fi, family = "binomial")

## ----estimate_with_caret, message=FALSE---------------------------------------
set.seed(1234)
vars <- c("survival15", "Fi", "Fa", "yob", "pom")
df <- dama %>%
  purgeR::ip_F() %>% 
  purgeR::ip_Fa(Fcol = "Fi") %>%
  dplyr::select(tidyselect::all_of(vars)) %>%
  stats::na.exclude()
trainIndex <- caret::createDataPartition(df$survival15, p = 0.75, list = FALSE, times = 1)
df_train <- df[trainIndex, ]
df_test <- df[-trainIndex, ]
m <- caret::train(factor(survival15) ~., data = df_test, method = "glmnet")
stats::predict(m$finalModel, type = "coefficients", s = m$bestTune$lambda)
stats::predict(m, df_test) %>%
  caret::confusionMatrix(reference = factor(df_test$survival15))

## ----estimate_purging_coefficient_linear--------------------------------------
d_values <- seq(from = 0.0, to = 0.5, by = 0.01)
models <- seq_along(d_values) %>%
  purrr::map(~ip_g(ped = dama, d = d_values[[.]], name_to = "g")) %>%
  purrr::map(~glm(formula = survival15 ~ g, family = binomial(link = "probit"), data = .))
aic_values <- models %>%
  purrr::map(summary) %>%
  purrr::map_dbl("aic")
aic_best <- aic_values %>% min()
models[[which(aic_values == aic_best)]] %>% summary

## ----estimate_purging_coefficient_nonlinear-----------------------------------
set.seed(1234)
d_values <- seq(from = 0.0, to = 0.5, by = 0.01)
start_values <- seq_along(d_values) %>%
    map(~ip_g(ped = dama, d = d_values[[.]], name_to = "g")) %>%
    map(~glm(formula = survival15 ~ g, family = binomial(link = "probit"), data = .)) %>%
    map("coefficients") %>%
    map(~set_names(x = ., nm = c("W0", "B")))
models <- seq_along(d_values) %>%
  map(~ip_g(ped = dama, d = d_values[[.]], name_to = "g")) %>%
  map2(start_values, ~nls(formula = survival15 ~ W0 * exp(B * g), start = .y, data = .x))

aic_values <- models %>% map_dbl(AIC)
aic_best <- aic_values %>% min()
models[[which(aic_values == aic_best)]] %>% summary
d_values[which(aic_values == aic_best)]

## ----significance_purging_coefficient-----------------------------------------
# AIC for the model with no purging
AIC_d0 <-models[[1]] %>% AIC()

# AIC for the best model (d=0.22)
AIC_best <- models[[which(aic_values == aic_best)]] %>% AIC()

# Chi2 statistic; note that we assume here that the two models only differ in one parameter
# as the model forcing d=0 does not estimate d, but the other does, and has one degree of freedom less
# Chi2 <- (AIC(simple model) - 2K(simple model)) - (AIC(complete model) - 2K(complete model))
Chi2 <- AIC_d0 - AIC_best + 1.0

# Get a p-value from the Chi distribution, using the critical value and one degre of freedom
pchisq(q=Chi2, df=1, lower.tail=FALSE)

## ----estimate_purging_coefficient_abc, eval=FALSE-----------------------------
#  # Initialize observed data
#  data(dama)
#  dama <- dama %>% ip_F()
#  w0 <- 0.89658 # assumed fitness for non-inbred individuals, from regression above
#  
#  # We use fitness itself as summary statistic (So)
#  # In the simulated data, fitness (Ss) is computed using Morton et al. model
#  # Distance between So and Ss [d(So, Ss)] is estimated by means of the residual sum of squares (RSS)
#  get_RSS <- function(data, par) {
#    data %>%
#      purgeR::ip_g(d = par[1], Fcol = "Fi", name_to = "g") %>%
#      dplyr::mutate(Ew = w0 * exp(-par[2] * g)) %>%
#      dplyr::filter(!is.na(survival15)) %$%
#      `-`(survival15-Ew) %>%
#      `^`(2) %>%
#      sum()
#  }
#  
#  # Acceptance rule for d(so, Ss) given a threshold
#  ABC_accept <- function(data, par, threshold){
#    if (par[1] < 0) return(FALSE)
#    if (par[1] > 0.5) return(FALSE)
#    if (par[2] < 0) return(FALSE)
#    RSS <- get_RSS(data, par)
#    if(RSS < threshold) return(TRUE) else return(FALSE)
#  }
#  
#  # Run MCMC ABC
#  MCMC_ABC <- function(data, niter, nburn, nthin, threshold) {
#    nsamples <- (niter-nburn)/nthin
#    chain <- array(dim = c(nsamples, 3)) # d, delta and RSS
#    sample <- c(0.0, 0.0, get_RSS(data, c(0.0, 0.0)))
#    sample_idx <- 1
#    for (i in 1:niter) {
#      d_test <- runif(1, min = 0, max = 0.5)
#      delta_test <- runif(1, min = 0, max = 15)
#      sample_test <- c(d_test, delta_test, get_RSS(data, c(d_test, delta_test)))
#  
#      if (ABC_accept(data, sample_test, threshold)) sample <- sample_test
#      if ((i > nburn) & (i %% nthin == 0)) {
#        print(sample_idx)
#        chain[sample_idx,] <- sample
#        sample_idx <- sample_idx + 1
#      }
#    }
#    return(coda::mcmc(chain, start = nburn, thin = nthin))
#  }
#  
#  # Get the posterior
#  set.seed(1234)
#  # We set an arbitrary threshold taking the RSS from the best fit
#  # and allowing an increase in the simulations up to an 0.5% in value
#  t <- get_RSS(dama, c(0.22, 1.11))
#  t = t*1.005
#  # Get the posterior distribution for the purging coefficient and delta
#  # A third column with the RSS values is also returned
#  posterior <- MCMC_ABC(dama, niter = 5100, nburn = 100, nthin = 50, threshold = t*1.005)

## ----read_chain, include=FALSE------------------------------------------------
posterior <- system.file("extdata", "posterior.rda", package = "purgeR")
posterior <- base::readRDS(posterior)

## ----abc_tests, eval=FALSE----------------------------------------------------
#  posterior[, 1:2] %>% base::plot()
#  posterior[, 1:2] %>% coda::heidel.diag()
#  posterior[, 1:2] %>% coda::autocorr.diag()
#  posterior[, 1:2] %>% coda::effectiveSize()

## ----abc_plot, out.width = '250px', out.height = '250px', fig.align = 'center'----
df <- data.frame(posterior)
colnames(df) <- c("d", "delta", "RSS")

# Joint posterior distribution
ggplot(data = df) +
  geom_point(aes(x = d, y = delta)) +
  geom_density_2d_filled(aes(x = d, y = delta), alpha = 0.5) +
  geom_point(x = 0.22, y = 1.11, size = 3, shape = 4) +
  scale_x_continuous(expression(paste("Purging coefficient (", italic(d), ")", sep = "")),
                     limits = c(0, 0.5)) +
  scale_y_continuous(expression(paste("Inbreeding load (", italic(delta), ")", sep = "")),
                     limits = c(min(df$delta), max(df$delta))) +
  theme(panel.background = element_blank(),
        axis.line = element_line(size = 0.1),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        legend.position = "none")

## ----exp_w, fig.align='center', fig.width=4-----------------------------------
w0 <- 1
B <- 1
data.frame(t = 0:50) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(Fi = exp_F(Ne = 50, t),
                g = exp_g(Ne = 50, t, d = 0.10),
                exp_WF = w0*exp(-B*Fi),
                exp_Wg = w0*exp(-B*g)) %>%
  tidyr::pivot_longer(cols = c(exp_WF, exp_Wg), names_to = "Model", values_to = "Ew") %>%
  ggplot(aes(x = t, y = Ew, color = Model)) +
  geom_line(size = 2) +
  scale_x_continuous("Generations (t)") +
  scale_y_continuous("Expected fitness [E(w)]", limits = c(0.5, 1)) +
  scale_color_manual(labels = c("F-based", "g-based"), values = c("black", "red")) +
  theme(legend.position = "bottom")