library("methods")
library("knitr")
basename <- "manuscript"
opts_chunk$set(fig.path = paste("components/figure/", basename, "-", sep=""),
               cache.path = paste("components/cache/", basename, "/", sep=""))
opts_chunk$set(cache = 2)
opts_chunk$set(tidy=FALSE, warning=FALSE, message=FALSE, 
               comment = NA, verbose = TRUE, echo=FALSE)

# PDF-based figures
opts_chunk$set(dev='pdf')
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", 
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

opts_chunk$set(dev='pdf', dev.args=c(version="1.3"))
library("nonparametricbayes") 
library("pdgControl")

library("ggplot2") 
library("reshape2")
library("plyr")
library("data.table")

library("R2jags")
library("emdbook") # for as.mcmc.bugs (?)
library("coda")  # for as.mcmc

library("modeest")
library("MASS")
library("pander")

library("cboettigR")

opts_chunk$set(fig.width=4, fig.height=3, echo=FALSE)
theme_set(theme_bw(base_size=12))
toggle = "hide" # results argument
library(modeest)
posterior.mode <- function(x) {
  mlv(x, method="shorth")$M
}
f <- RickerAllee
p <- c(2, 8, 5)
K <- 10  # approx, a li'l' less
allee <- 5 # approx, a li'l' less
sigma_g <- 0.05
sigma_m <- 0.0
z_g <- function() rlnorm(1, 0, sigma_g)
z_m <- function() 1
x_grid <- seq(0, 1.5 * K, length=50)
h_grid <- x_grid
profit <- function(x,h) pmin(x, h)
delta <- 0.01
OptTime <- 50  # stationarity with unstable models is tricky thing
reward <- 0
xT <- 0
Xo <-  allee+.5# observations start from
x0 <- K # simulation under policy starts from
Tobs <- 40
MaxT = 1000 # timeout for value iteration convergence
  set.seed(1234)
  #harvest <- sort(rep(seq(0, .5, length=7), 5))
  x <- numeric(Tobs)
  x[1] <- Xo
  nz <- 1
  for(t in 1:(Tobs-1))
    x[t+1] = z_g() * f(x[t], h=0, p=p)
  obs <- data.frame(x = c(rep(0,nz), 
                          pmax(rep(0,Tobs-1), x[1:(Tobs-1)])), 
                    y = c(rep(0,nz), 
                          x[2:Tobs]))
raw_plot <- ggplot(data.frame(time = 1:Tobs, x=x), aes(time,x)) + geom_line()
set.seed(12345)
estf <- function(p){ 
    mu <- f(obs$x,0,p)
    -sum(dlnorm(obs$y, log(mu), p[4]), log=TRUE)
}
par <- c(3, 
         9, 
         6, 
         0.06)
o <- optim(par, estf, method="L", lower=c(1e-5,1e-5,1e-5,1e-5))
f_alt <- f
p_alt <- c(as.numeric(o$par[1]), as.numeric(o$par[2]), as.numeric(o$par[3]))
sigma_g_alt <- as.numeric(o$par[4])

est <- list(f = f_alt, p = p_alt, sigma_g = sigma_g_alt, mloglik=o$value)
true_means <- sapply(x_grid, f, 0, p)
est_means <- sapply(x_grid, est$f, 0, est$p)
s2.p <- c(5,5)  
d.p = c(10, 1/0.1)
gp <- gp_mcmc(obs$x, y=obs$y, n=1e5, s2.p = s2.p, d.p = d.p)
gp_dat <- gp_predict(gp, x_grid, burnin=1e4, thin=300)
gp_assessment_plots <- summary_gp_mcmc(gp, burnin=1e4, thin=300)
tgp_dat <- 
    data.frame(  x = x_grid, 
                 y = gp_dat$E_Ef, 
                 ymin = gp_dat$E_Ef - 2 * sqrt(gp_dat$E_Vf), 
                 ymax = gp_dat$E_Ef + 2 * sqrt(gp_dat$E_Vf) )
y <- x 
N <- length(x);
jags.data <- list("N","y")
n.chains <- 6
n.iter <- 1e6
n.burnin <- floor(10000)
n.thin <- max(1, floor(n.chains * (n.iter - n.burnin)/1000))
n.update <- 10
stdQ_prior_p <- c(1e-6, 100)
stdR_prior_p <- c(1e-6, .1)
stdQ_prior  <- function(x) dunif(x, stdQ_prior_p[1], stdQ_prior_p[2])
stdR_prior  <- function(x) dunif(x, stdR_prior_p[1], stdR_prior_p[2])
K_prior_p <- c(0.01, 20.0)
r0_prior_p <- c(0.01, 6.0)
theta_prior_p <- c(0.01, 20.0)

bugs.model <- 
paste(sprintf(
"model{
  K     ~ dunif(%s, %s)
  r0    ~ dunif(%s, %s)
  theta ~ dunif(%s, %s)
  stdQ ~ dunif(%s, %s)", 
  K_prior_p[1], K_prior_p[2],
  r0_prior_p[1], r0_prior_p[2],
  theta_prior_p[1], theta_prior_p[2],
  stdQ_prior_p[1], stdQ_prior_p[2]),

  "
  iQ <- 1 / (stdQ * stdQ);
  y[1] ~ dunif(0, 10)
  for(t in 1:(N-1)){
    mu[t] <- log(y[t]) + r0 * (1 - y[t]/K)* (y[t] - theta) / K 
    y[t+1] ~ dlnorm(mu[t], iQ) 
  }
}")
writeLines(bugs.model, "allen_process.bugs")
K_prior     <- function(x) dunif(x, K_prior_p[1], K_prior_p[2])
r0_prior <- function(x) dunif(x, r0_prior_p[1], r0_prior_p[2])
theta_prior <- function(x) dunif(x, theta_prior_p[1], theta_prior_p[2])
par_priors  <- list(K = K_prior, deviance = function(x) 0 * x, 
                    r0 = r0_prior, theta = theta_prior,
                    stdQ = stdQ_prior)
allen_priors_xtable <- data.frame(parameter = c("$r$", "$K$", "$X_C$", "$\\sigma$"),
           "lower bound" = c(r0_prior_p[1], K_prior_p[1], theta_prior_p[1], stdQ_prior_p[1]),
           "upper bound" = c(r0_prior_p[2], K_prior_p[2], theta_prior_p[2], stdQ_prior_p[2]))
jags.params=c("K","r0","theta","stdQ") # be sensible about the order here
jags.inits <- function(){
  list("K"= 10 * rlnorm(1,0, 0.1),
       "r0"= 1 * rlnorm(1,0, 0.1) ,
       "theta"=   5 * rlnorm(1,0, 0.1) , 
       "stdQ"= abs( 0.1 * rlnorm(1,0, 0.1)),
       .RNG.name="base::Wichmann-Hill", .RNG.seed=123)
}

set.seed(1234)
# parallel refuses to take variables as arguments (e.g. n.iter = 1e5 works, but n.iter = n doesn't)
allen_jags <- do.call(jags, list(data=jags.data, inits=jags.inits, 
                                      jags.params, n.chains=n.chains, 
                                      n.iter=n.iter, n.thin=n.thin, 
                                      n.burnin=n.burnin, 
                                      model.file="allen_process.bugs"))

# Run again iteratively if we haven't met the Gelman-Rubin convergence criterion
recompile(allen_jags) # required for parallel
allen_jags <- do.call(autojags, 
                                            list(object=allen_jags, n.update=n.update, 
                           n.iter=n.iter, n.thin = n.thin))
tmp <- lapply(as.mcmc(allen_jags), as.matrix) # strip classes the hard way...
allen_posteriors <- melt(tmp, id = colnames(tmp[[1]])) 
names(allen_posteriors) = c("index", "variable", "value", "chain")
plot_allen_traces <- ggplot(allen_posteriors) + geom_line(aes(index, value)) + 
  facet_wrap(~ variable, scale="free", ncol=1)
allen_priors <- ddply(allen_posteriors, "variable", function(dd){
    grid <- seq(min(dd$value), max(dd$value), length = 100) 
    data.frame(value = grid, density = par_priors[[dd$variable[1]]](grid))
})
plot_allen_posteriors <- ggplot(allen_posteriors, aes(value)) + 
  stat_density(geom="path", position="identity") +
#  geom_line(data=allen_priors, aes(x=value, y=density), col="red") +  
  facet_wrap(~ variable, scale="free", ncol=3)
A <- allen_posteriors
A$index <- A$index + A$chain * max(A$index) # Combine samples across chains by renumbering index 
pardist <- acast(A, index ~ variable)
bayes_coef <- apply(pardist,2, posterior.mode) 
bayes_pars <- unname(c(bayes_coef["r0"], bayes_coef["K"], bayes_coef["theta"])) # parameters formatted for f
allen_f <- function(x,h,p) unname(RickerAllee(x,h, unname(p[c("r0", "K", "theta")])))
allen_means <- sapply(x_grid, f, 0, bayes_pars)
bayes_pars
head(pardist)
K_prior_p <- c(0.01, 40.0)
r0_prior_p <- c(0.01, 20.0)
bugs.model <- 
paste(sprintf(
"model{
  K    ~ dunif(%s, %s)
  r0    ~ dunif(%s, %s)
  stdQ ~ dunif(%s, %s)", 
  K_prior_p[1], K_prior_p[2],
  r0_prior_p[1], r0_prior_p[2],
  stdQ_prior_p[1], stdQ_prior_p[2]),
  "
  iQ <- 1 / (stdQ * stdQ);
  y[1] ~ dunif(0, 10)
  for(t in 1:(N-1)){
    mu[t] <- log(y[t]) + r0 * (1 - y[t]/K) 
    y[t+1] ~ dlnorm(mu[t], iQ) 
  }
}")
writeLines(bugs.model, "ricker_process.bugs")
K_prior     <- function(x) dunif(x, K_prior_p[1], K_prior_p[2])
r0_prior <- function(x) dunif(x, r0_prior_p[1], r0_prior_p[2])
par_priors <- list(K = K_prior, deviance = function(x) 0 * x, 
                   r0 = r0_prior, stdQ = stdQ_prior)
ricker_priors_xtable <- data.frame(
  parameter = c("$r$", "$K$", "$\\sigma$"),
  "lower bound" = c(r0_prior_p[1], K_prior_p[1], stdQ_prior_p[1]),
  "upper bound" = c(r0_prior_p[2], K_prior_p[2], stdQ_prior_p[2]))
jags.params=c("K","r0", "stdQ")
jags.inits <- function(){
  list("K"= 10 * rlnorm(1,0,.5),
       "r0"= rlnorm(1,0,.5),
       "stdQ"=sqrt(0.05) * rlnorm(1,0,.5),
       .RNG.name="base::Wichmann-Hill", .RNG.seed=123)
}
set.seed(12345) 
ricker_jags <- do.call(jags, 
                       list(data=jags.data, inits=jags.inits, 
                            jags.params, n.chains=n.chains, 
                            n.iter=n.iter, n.thin=n.thin, n.burnin=n.burnin,
                            model.file="ricker_process.bugs"))
recompile(ricker_jags)
ricker_jags <- do.call(autojags, 
                       list(object=ricker_jags, n.update=n.update, 
                                                        n.iter=n.iter, n.thin = n.thin, 
                                                        progress.bar="none"))
tmp <- lapply(as.mcmc(ricker_jags), as.matrix) # strip classes the hard way...
ricker_posteriors <- melt(tmp, id = colnames(tmp[[1]])) 
names(ricker_posteriors) = c("index", "variable", "value", "chain")
plot_ricker_traces <- ggplot(ricker_posteriors) + geom_line(aes(index, value)) + 
  facet_wrap(~ variable, scale="free", ncol=1)
ricker_priors <- ddply(ricker_posteriors, "variable", function(dd){
    grid <- seq(min(dd$value), max(dd$value), length = 100) 
    data.frame(value = grid, density = par_priors[[dd$variable[1]]](grid))
})
# plot posterior distributions
plot_ricker_posteriors <- ggplot(ricker_posteriors, aes(value)) + 
  stat_density(geom="path", position="identity") +
#  geom_line(data=ricker_priors, aes(x=value, y=density), col="red") +  # don't plot priors 
  facet_wrap(~ variable, scale="free", ncol=2)
A <- ricker_posteriors
A$index <- A$index + A$chain * max(A$index) # Combine samples across chains by renumbering index 
ricker_pardist <- acast(A, index ~ variable)
bayes_coef <- apply(ricker_pardist,2, posterior.mode) 
ricker_bayes_pars <- unname(c(bayes_coef["r0"], bayes_coef["K"]))
ricker_f <- function(x,h,p){
  sapply(x, function(x){ 
    x <- pmax(0, x-h) 
    pmax(0, x * exp(p["r0"] * (1 - x / p["K"] )) )
  })
}
ricker_means <- sapply(x_grid, Ricker, 0, ricker_bayes_pars[c(1,2)])
head(ricker_pardist)
ricker_bayes_pars
r0_prior_p <- c(.0001, 10.0)
theta_prior_p <- c(.0001, 10.0)
K_prior_p <- c(.0001, 40.0)
bugs.model <- 
paste(sprintf(
"model{
  r0    ~ dunif(%s, %s)
  theta    ~ dunif(%s, %s)
  K    ~ dunif(%s, %s)
  stdQ ~ dunif(%s, %s)", 
  r0_prior_p[1], r0_prior_p[2],
  theta_prior_p[1], theta_prior_p[2],
  K_prior_p[1], K_prior_p[2],
  stdQ_prior_p[1], stdQ_prior_p[2]),

  "
  iQ <- 1 / (stdQ * stdQ);

  y[1] ~ dunif(0, 10)
  for(t in 1:(N-1)){
    mu[t] <- log(r0)  + theta * log(y[t]) - log(1 + pow(abs(y[t]), theta) / K)
    y[t+1] ~ dlnorm(mu[t], iQ) 
  }
}")
writeLines(bugs.model, "myers_process.bugs")
K_prior     <- function(x) dunif(x, K_prior_p[1], K_prior_p[2])
r_prior     <- function(x) dunif(x, r0_prior_p[1], r0_prior_p[2])
theta_prior <- function(x) dunif(x, theta_prior_p[1], theta_prior_p[2])
par_priors <- list( deviance = function(x) 0 * x, K = K_prior,
                    r0 = r_prior, theta = theta_prior, 
                    stdQ = stdQ_prior)
myers_priors_xtable <- data.frame(parameter = c("$r$", "$K$", "$\\theta$", "$\\sigma$"),
           "lower bound" = c(r0_prior_p[1], K_prior_p[1], theta_prior_p[1], stdQ_prior_p[1]),
           "upper bound" = c(r0_prior_p[2], K_prior_p[2], theta_prior_p[2], stdQ_prior_p[2]))
jags.params=c("r0", "theta", "K", "stdQ")
jags.inits <- function(){
  list("r0"= 1 * rlnorm(1,0,.1), 
       "K"=    10 * rlnorm(1,0,.1),
       "theta" = 1 * rlnorm(1,0,.1),  
       "stdQ"= sqrt(0.2) * rlnorm(1,0,.1),
       .RNG.name="base::Wichmann-Hill", .RNG.seed=123)
}
set.seed(12345)
myers_jags <- do.call(jags, 
                      list(data=jags.data, inits=jags.inits, 
                                                     jags.params, n.chains=n.chains, 
                                                     n.iter=n.iter, n.thin=n.thin,
                           n.burnin=n.burnin, 
                           model.file="myers_process.bugs"))
recompile(myers_jags)
myers_jags <- do.call(autojags, 
                      list(myers_jags, n.update=n.update, 
                           n.iter=n.iter, n.thin = n.thin, 
                           progress.bar="none"))
tmp <- lapply(as.mcmc(myers_jags), as.matrix) # strip classes
myers_posteriors <- melt(tmp, id = colnames(tmp[[1]])) 
names(myers_posteriors) = c("index", "variable", "value", "chain")
plot_myers_traces <- ggplot(myers_posteriors) + 
  geom_line(aes(index, value)) + # priors, need to fix order though
  facet_wrap(~ variable, scale="free", ncol=1)
par_prior_curves <- ddply(myers_posteriors, "variable", function(dd){
    grid <- seq(min(dd$value), max(dd$value), length = 100) 
    data.frame(value = grid, density = par_priors[[dd$variable[1]]](grid))
})
plot_myers_posteriors <- ggplot(myers_posteriors, aes(value)) + 
  stat_density(geom="path", position="identity") +
#  geom_line(data=par_prior_curves, aes(x=value, y=density), col="red") +  # Whoops, these are misaligned. see table instead 
  facet_wrap(~ variable, scale="free", ncol=3)
A <- myers_posteriors
A$index <- A$index + A$chain * max(A$index) # Combine samples across chains by renumbering index 
myers_pardist <- acast(A, index ~ variable)
bayes_coef <- apply(myers_pardist,2, posterior.mode) # much better estimates
myers_bayes_pars <- unname(c(bayes_coef["r0"], bayes_coef["theta"], bayes_coef["K"]))
myers_means <- sapply(x_grid, Myer_harvest, 0, myers_bayes_pars)
myers_f <- function(x,h,p) Myer_harvest(x, h, p[c("r0", "theta", "K")])
head(myers_pardist)
myers_bayes_pars
models <- data.frame(x=x_grid, 
                                         GP=tgp_dat$y, 
                                         True=true_means, 
                     MLE=est_means, 
                                         Ricker=ricker_means, 
                     Allen = allen_means,
                     Myers = myers_means)
models <- melt(models, id="x")
# some labels
names(models) <- c("x", "method", "value")
# labels for the colorkey too
model_names = c("GP", "True", "MLE", "Ricker", "Allen", "Myers")
colorkey=cbPalette
names(colorkey) = model_names 
step_ahead <- function(x, f, p){
  h = 0
  x_predict <- sapply(x, f, h, p)
  n <- length(x_predict) - 1
  y <- c(x[1], x_predict[1:n])
  y
}
matrices_gp <- gp_transition_matrix(gp_dat$Ef_posterior, gp_dat$Vf_posterior, x_grid, h_grid) 
opt_gp <- value_iteration(matrices_gp, x_grid, h_grid, MaxT, xT, profit, delta, reward)
matrices_true <- f_transition_matrix(f, p, x_grid, h_grid, sigma_g)
opt_true <- value_iteration(matrices_true, x_grid, h_grid, OptTime=MaxT, xT, profit, delta=delta)
matrices_estimated <- f_transition_matrix(est$f, est$p, x_grid, h_grid, est$sigma_g)
opt_estimated <- value_iteration(matrices_estimated, x_grid, h_grid, OptTime=MaxT, xT, profit, delta=delta)
matrices_allen <- parameter_uncertainty_SDP(allen_f, x_grid, h_grid, pardist, 4)
opt_allen <- value_iteration(matrices_allen, x_grid, h_grid, OptTime=MaxT, xT, profit, delta=delta)
matrices_ricker <- parameter_uncertainty_SDP(ricker_f, x_grid, h_grid, as.matrix(ricker_pardist), 3)
opt_ricker <- value_iteration(matrices_ricker, x_grid, h_grid, OptTime=MaxT, xT, profit, delta=delta)
matrices_myers <- parameter_uncertainty_SDP(myers_f, x_grid, h_grid, as.matrix(myers_pardist), 4)
myers_alt <- value_iteration(matrices_myers, x_grid, h_grid, OptTime=MaxT, xT, profit, delta=delta)
OPT = data.frame(GP = opt_gp$D, True = opt_true$D, MLE = opt_estimated$D, Ricker = opt_ricker$D, Allen = opt_allen$D, Myers = myers_alt$D)
colorkey=cbPalette
names(colorkey) = names(OPT) 
sims <- lapply(OPT, function(D){
  set.seed(1)
  lapply(1:100, function(i) 
    ForwardSimulate(f, p, x_grid, h_grid, x0, D, z_g, profit=profit, OptTime=OptTime)
  )
})
# turn the list into a data.frame
keep.as.columns <- names(sims[[1]][[1]])
dat <- melt(sims, id=keep.as.columns)
sims_data <- data.table(dat)
setnames(sims_data, c("L1", "L2"), c("method", "reps")) 
# Legend in original ordering please, not alphabetical: 
sims_data$method = factor(sims_data$method, ordered=TRUE, levels=names(OPT))
Profit <- sims_data[, sum(profit), by=c("reps", "method")]
tmp <- dcast(Profit, reps ~ method)
#tmp$Allen <- tmp[,"Allen"] + rnorm(dim(tmp)[1], 0, 1) # jitter for plotting
tmp <- tmp / tmp[,"True"]
tmp <- melt(tmp[2:dim(tmp)[2]])
actual_over_optimal <-subset(tmp, variable != "True")
dic.dic <- function (x) sum(x$deviance) +  sum(x[[2]])
recompile(allen_jags)
allen_dic <- dic.dic(dic.samples(allen_jags$model, n.iter=1000, type="popt"))
recompile(ricker_jags)
ricker_dic <- dic.dic(dic.samples(ricker_jags$model, n.iter=1000, type="popt"))
recompile(myers_jags)
myers_dic <- dic.dic(dic.samples(myers_jags$model, n.iter=1000, type="popt"))
dictable <- data.frame(Allen = allen_dic + 2*length(bayes_pars), 
                       Ricker = ricker_dic + 2*length(ricker_bayes_pars),
                       Myers = myers_dic + 2*length(myers_bayes_pars), 
                       row.names = c("DIC"))
source("components/sensitivity-trends.R") # load the function for looping

sigmas <- c(0.01, 0.05, 0.1)
allee <- c(1,2,3,4,5)

vary_sigma <- lapply(sigmas, function(s) try(sensitivity_trends(4, sigma = s, seed=c(111,222,333))))
vary_allee <-  lapply(allee, function(a) try(sensitivity_trends(a, sigma = sigma_g, seed=c(111,222,333))))

vary_sigma <- melt(vary_sigma, id=names(vary_sigma[[1]]))
vary_allee <- melt(vary_allee, id=names(vary_allee[[1]]))
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", 
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
opts_chunk$set(dev='pdf', dev.args=c(version="1.4"))
opts_chunk$set(fig.width=4, fig.height=3, echo=FALSE)
theme_set(theme_bw(base_size=12))

Introduction

Decision making under uncertainty is a ubiquitous challenge in the management of human intervention in natural resources and conservation. Decision-theoretic approaches provide a framework to determine the best sequence of actions in face of uncertainty, but only when that uncertainty can be meaningfully quantified [@Fischer2009]. Over the last four decades (beginning with @Clark1976, @Clark2009 and @Walters1978) dynamic optimization methods, particularly Stochastic Dynamic Programming (SDP), have become increasingly important as a means of understanding how to manage human intervention into natural systems. Simultaneously, there has been increasing recognition of the importance of multiple steady states or 'tipping points' [@Scheffer2001; @Scheffer2009; @Polasky2011] in ecological systems.

We develop a novel approach to address these concerns in the context of fisheries; although the challenges and methods are germane to other problems of conservation or natural resource exploitation. Economic value and ecological concern have made marine fisheries the crucible for much of the founding work on management under uncertainty [@Gordon1954; @Clark1976; @Clark2009; @Reed1979; @May1979; @Ludwig1982].

Even if we know the proper deterministic description of the biological system, there is intrinsic stochasticity in biological dynamics, measurements, and implementation of policy [e.g. @Reed1979; @Clark1986; @Roughgarden1996; @Sethi2005]. We may also lack knowledge about the parameters of the biological dynamics [parametric uncertainty, e.g. @Ludwig1982; @Hilborn1997; @McAllister1998; @Schapaugh2013], or even not know which model is proper description of the system [structural uncertainty, e.g. @Williams2001; @Cressie2009; @Athanassoglou2012]. Of these, the latter is generally the hardest to quantify. Typical approaches confront the data with a collection of models, assuming that the true dynamics (or reasonable approximation) is among the collection and then use model choice or model averaging to arrive at a conclusion [@Williams2001; @Athanassoglou2012; @Cressie2009]. Even setting aside other concerns (see @Cressie2009), these approaches are unable to describe uncertainty outside the observed data range.

Structural uncertainty is particularly insidious when we try to predict outside of the range of observed data [@Mangel2001] because we are extrapolating into unknown regions. In management applications, this extrapolation uncertainty is particularly important since (a) management involves considering actions that may move the system outside the range of observed behavior, and (b) the decision tools (optimal control theory, SDP) rely on both reasonable estimates of the expected outcomes and on the weights given to those outcomes [e.g. @Weitzman2013]. Thus characterizing uncertainty is as important as characterizing the expected outcome.

Tipping points in ecological dynamics [@Scheffer2001; @Polasky2011] highlight this problem because precise models are not available and data are limited such as around high stock levels or an otherwise desirable state. With perfect information, one would know just how far a system could be pushed before crossing the tipping point, and management would be simple. But we face imperfect models and limited data and, with tipping points, even small errors can have very large consequences, as we shall illustrate later. Because intervention may be too late once a tipping point has been crossed (but see @Hughes2013), management is often concerned with avoiding tipping points before any data about them are available.

The dual concerns of model uncertainty and incomplete data create a substantial challenge to existing decision-theoretic approaches [@Brozovic2011]. We illustrate how Stochastic Dynamic Programming (SDP) [@Mangel1988; @Marescot2013] can be implemented using a Bayesian Non-Parametric (BNP) model of population dynamics [@Munch2005a]. The BNP method has two distinct advantages. First, using a BNP model sidesteps the need for an accurate model-based description of the system dynamics. Second, a BNP model better reflects uncertainty when extrapolating beyond the observed data. This is crucial to providing robust decision-making when the correct model is not known (as is almost always true). [We use robust to characterize approaches that provide nearly optimal solutions without being sensitive to the choice of the (unknown) underlying model.]

This paper is the first ecological application of the SDP without an a priori model of the underlying dynamics. Unlike parametric approaches that can only reflect uncertainty in parameter estimates, the BNP method provides a broader representation of uncertainty, including uncertainty beyond the observed data. We will show that Gaussian Process Dynamic Programming (GPDP) allows us to find robust management solutions in face of limited data without knowing the correct model structure.

For comparisons, we consider the performance of management based on GPDP against management policies derived under several alternative parametric models [@Reed1979; @Ludwig1982; @Mangel1988]. Rather than compare models in terms of best fit to data, we compare model performance in the concrete terms of the decision-maker's objectives.

Approach and Methods

We first describe the requirements of dynamic optimization for the management of human intervention in natural resource systems. After that we describe three parametric models for population dynamics and the Gaussian Process (GP)[^1] description of population dynamics. All computer code used here has been embedded in the manuscript sources (see @Xie_2013), and an implementation of the GPDP approach is provided as an accompanying R package. Source code, R package and the CSV data files corresponding each figure are archived in the supplement [@Boettiger_2014].

[^1]: We abbreviate Gaussian Process as GP, which refers to the statistical model we use to approximate the population dynamics, and we use the term Gaussian Process Dynamic Programming (GPDP), to refer to the use of a GP as the underlying process model when solving a Dynamic Programming equation. Hence we will refer to the models as: GP, Ricker, Allen, etc, and the novel method we put forward here as GPDP.

Requirements of dynamic optimization

Dynamic optimization requires characterizing the dynamics of a state variable (or variables), a control action, and a value function. For simplicity, we consider only a single state variable. This is a best-case scenario for the parametric models because we simulate underlying dynamics from one of the three parametric models, whereas in the natural world we never know the "true" model. In addition, by choosing one-dimensional models with just a few parameters, we limit the chance that poor performance will be due to inability to estimate parameters accurately, something that becomes a more severe problem for higher-dimensional parametric models. Finally, the parametric models we consider are commonly used in modeling stock-recruitment dynamics or to model sudden transitions between alternative stable states.

We let $X(t)$ denote the size (numbers or biomass) of the focal population at time $t$ and assume that in the absence of take its dynamics are:

\begin{equation} X(t+1) = Z(t) f(X(t), \mathbf{p}) \label{eq1} \end{equation}

Where Z(t) is log-normally distributed process stochasticity [@Reed1979] and $\mathbf{p}$ is a vector of parameters to be estimated from the data. We describe the three choices for $f(X(t),\mathbf{p})$ in the next section. The control action is a harvest or take, $h(t)$, measured in the same units as $X$, at time $t$. Thus, in the presence of take, the population size on the right hand side of Eqn 1 is replaced by $S(t) = X(t) - h(t)$.

To construct the value function, we consider a return when $X(t) = x(t)$ and harvest $h(t) = h$ denoted as the reward, $R(x(t), h)$. For example, if the return is the harvest at time $t$, then $R(x(t), h(t)) = \min(x(t), h(t))$. We assume that future harvests are discounted relative to current ones at a constant rate of discount $\delta$ and ask for the harvest policy that maximizes total discounted harvest between the current time $t$ and a final time $T$. That is, we seek to maximize over choices of harvest $E [ \sum_{t = 0}^{T} R(X(t), h(t), t) \delta^t]$, where the state dynamics are given by Eqn 1 and $E$ denotes the expectation over future population states.

In order to find that policy, we introduce the value function $V(x(t), t)$ representing the total discounted catch from time $t$ onwards given that $X(t) = x(t)$. This value function satisfies an equation of SDP [@Mangel1988; @Clark2000; @Clark2009 ;@Mangel2014],

\begin{equation} V(x(t), t) = \max_{h(t)} \lbrace R(h(t), x(t)) + \delta \cdot \mathbf{\mathrm{E}}_{X(t+1)} \left[ V(X(t+1), t+1) | x(t), h(t) \right] \rbrace \end{equation}

where expectation is taken over all possible values of the next state, $X(t+1)$, and maximized over all possible choices of harvest, $h(t)$. That is, at time $t$, when population size is $x(t)$ and harvest $h(t)$ is applied, the immediate return is $R(h(t), x(t))$. When the sole source of uncertainty is the process stochasticity term, $Z$, and thus the expectation in Eqn 2 is equivalent to taking expectations over $Z(t)$. That is

\begin{equation} \mathbf{\mathrm{E}}{X(t+1)} \left[ V( X(t+1),t+1) | x(t), h(t) \right] = \mathbf{\mathrm{E}}{Z(t)} \left[V( Z(t) f(x(t) - h(t))|\mathbf{p}), t+1 | x(t), h(t) \right] \end{equation}

where the population size after the take is $x(t) - h(t)$, which is then translated into $X(t+1)$ by Eqn 1 (that is, we replace $X(t+1)$ by $Z(t) f(x(t)-h(t)|\mathbf{p})$).

When the parameters governing the dynamics are also uncertain, we take the expectation over the posterior distribution for the parameters:

\begin{equation} \mathbf{\mathrm{E}}{X(t+1)} \left[ V(X(t+1), t+1) | x(t), h(t) \right] = \mathbf{\mathrm{E}}{\mathbf{p}|\mathrm{data}} { \mathbf{\mathrm{E}}_{Z(t) | \mathbf{p}, \mathrm{data}} \left[ V(Z(t) f(x(t) - h(t)|\mathbf{p}), t) \right] } \end{equation}

When the underlying population dynamics are unknown (the case of structural uncertainty), the function $f$ itself is uncertain and the expectation for the next state includes uncertainty in $f$ as well. That is

\begin{equation} \mathbf{\mathrm{E}}{X(t+1)} \left[ V(X(t+1), t+1) | x(t), h(t) \right] = \mathbf{\mathrm{E}}{\mathbf{p}|\mathrm{data}} { \mathbf{\mathrm{E}}_{f, Z(t) | \mathbf{p}, \mathrm{data}} \left[ V( Z(t) f(x(t) - h(t)| \mathbf{p}), t) \right] } \end{equation}

We consider the finite time problem with $T=$ r MaxT, which we solve using the standard value iteration algorithm [see @Mangel1988; @Clark2000].

Parametric Models ###

We consider three candidate parametric models for the population dynamics: The Ricker model, the Allen model [@Allen2005], and the Myers model [@Myers1995], Eqns \eqref{ricker}-\eqref{myers}. In all three, we let $K$ denote the carrying capacity and $r$ the maximum per capita growth rate. The Ricker model has two parameters and the right hand side of Eqn 1 is

\begin{equation} f(S(t)|r,K) = S(t) e^{r \left(1 - \frac{S(t)}{K} \right) } \label{ricker} \end{equation}

The Allen model has three parameters

\begin{equation} f(S(t)|r, K, X_c) = S(t) e^{r \left(1 - \frac{S(t)}{K}\right)\left(S(t) - X_c\right)} \label{allen} \end{equation}

where $X_c$ denotes the location of the unstable steady state (i.e., the tipping point).

The Myers model also has three parameters:

\begin{equation} f(S(t) | r, K, \theta) = \frac{r S(t)^{\theta}}{1 + \frac{S(t)^\theta}{K}} \label{myers} \end{equation}

where $\theta = 1$ corresponds to Beverton-Holt dynamics and $\theta$ > 2 leads to Allee effects and multiple stable states.

The Ricker model does not lead to multiple steady states. The Allen model resembles the Ricker dynamics with an added Allee effect parameter [@Courchamp2008], below which the population cannot persist. The Myers model also has three parameters and contains an Allee threshold, but unlike the Ricker model saturates at high population size. The multiplicative log-normal stochasticity in Eqn 1 introduces one additional parameter $\sigma$ that must be estimated.

Because of our interest in management performance in the presence of tipping points, all of our simulations are based on the Allen model. The Allen model is thus the state of nature and is expected to provide the best-case scenario. The Ricker model is a reasonable approximation of these dynamics far from the Allee threshold (but lacks threshold dynamics), while the Myers model shares the essential feature of a threshold but differs in structure from the Allen model. Throughout, we refer to the "True" model when the underlying parameters are known without error, and refer to the "Allen" model when these parameters have been estimated from the sample data.

We consider a period of r Tobs in which data are obtained to estimate the parameters or the GP. This is long enough that the estimates do not depend on the particular realization, and longer times are not likely to provide substantial improvement. Each of the models is fit to the same data (Figure 1).

We inferred posterior distributions for the parameters of each model by Gibbs sampling (@Gelman2003 implemented in R [@RTeam] using jags, [@R2jags]). We choose uniform priors for all parameters of the parametric models (See appendix Tables S1-S3; R code provided). We show one-step-ahead predictions of these model fits in Figure 1. We tested each chain for Gelman-Rubin convergence and results were robust to longer runs. For each simulation we also applied several commonly used model selection criteria (AIC, BIC, DIC, see @Burnham2002) to identify the best fitting model.

Additionally, we compute the maximum likelihood estimate (MLE, as we will refer to this model in the figures) of the parameters for the (structurally correct) Allen model. Comparing this to using the posterior distribution of parameters inferred from MCMC for the same model gives some indication of the importance of this uncertainty in the dynamic programming.

The Gaussian Process model

The core difference for our purpose between the estimated GP and the estimated parametric models is that the estimated GP model is defined explicitly in reference to the observed data. As a result, uncertainty arises in the GP model not only from uncertainty in the parameters, but is also increases in regions farther from the observed states, such as low population sizes in the example illustrated here. The estimated parametric models, by contrast, are completely specified by the parameters.

The use of GPs to characterize dynamical systems is relatively new [@Kocijan2005], and was first introduced in the context ecological modeling and fisheries management in @Munch2005. GP models have subsequently been used to test for the presence of Allee effects [@Sugeno2013a], estimate the maximum reproductive rate [@Sugeno2013b], determine temporal variation in food availability [@Sigourney2012], and provide a basis for identifying model-misspecification [@Thorson2014]. An accessible and thorough introduction to the formulation and use of GPs can be found in @Rasmussen2006.

A GP is a stochastic process for which any realization consisting of n points follows a multivariate normal distribution of dimension $n$. To characterize the GP we need a mean function and a covariance function. We proceed as follows.

As before, we assume that the data $X(t)$ are observed with process stochasticity around a mean function $g(X(t))$

\begin{equation} X(t+1) = g(X(t)) + \varepsilon, \end{equation}

where $\varepsilon$ are IID normal random variables with zero-mean and variance $\sigma^2$. Note that we have chosen to assume additive stochasticity. While we could consider log-normal stochasticity as in the parametric models, we make this choice to emphasize that the Gaussian process approach need not have structurally correct stochasticity to be effective.

In order to make predictions, we update the GP based on the observed set of transitions. To do so, we collect the time series of observed states into a vector of "current" states, $\mathbf{X}{\textrm{obs}} = {X(1), \dots, X(T-1)}$ and a vector of "next" states $\mathbf{Y}{\textrm{obs}} = {X(2),\dots,X(T)}$ where $T$ is the time of the final observation. Conditional on these observations, the predicted next state, $g(X_p)$, for any given "current" state, $X_p$ follows a normal distribution with mean $E$ and variance $C$ determined using the standard rules for conditioning in multivariate normals, i.e.

\begin{equation} E = K(X_p, \mathbf{X}{\textrm{obs}}) \left(K(\mathbf{X}{\textrm{obs}},\mathbf{X}{\textrm{obs}}) - \sigma \mathbf{I}_n \right)^{-1} \mathbf{Y}{\textrm{obs}} \end{equation}

and

\begin{equation} C = K(X_p, X_p) - K(X_p, \mathbf{X}{\textrm{obs}}) \left(K(\mathbf{X}{\textrm{obs}},\mathbf{X}{\textrm{obs}}) - \sigma \mathbf{I} \right)^{-1} K(\mathbf{X}{\textrm{obs}}, X_p) \end{equation}

Here $\mathbf{I}_n$ is the $n$ by $n$ identity matrix (i.e. a matrix with ones down the diagonal and zeros elsewhere) and $K$ is the ‘covariance kernel.’ The covariance kernel controls how much influence one observation has on another. In the present application we use the squared-exponential kernel which, when evaluated over a pair of vectors, say $\mathbf{x}$ and $\mathbf{y}$, generates a covariance matrix whose $i,j$th element is given by

\begin{equation} K_{i,j}(\mathbf{x}, \mathbf{y}) = \exp\left( \frac{ -(x_i - y_j)^2}{2 \ell^2} \right) \end{equation}

so that $\ell$ gives the characteristic length-scale over which correlation between two observations decays. See @Rasmussen2006 for other choices of covariance kernels and their properties. Note that this simple formulation assumes a prior mean of zero. For the parameters we use inverse Gamma priors on both the length-scale $\ell$ and $\sigma$, thus for example

\begin{equation} f(\ell; \alpha, \beta) = \frac{\beta^\alpha}{\Gamma(\alpha)} \ell^{-\alpha - 1}\exp\left(-\frac{\beta}{\ell}\right) \end{equation}

For the prior on $\ell$, $\alpha =$ r d.p[1] and $\beta =$ r d.p[2]. The prior on $\sigma$, $\alpha =$ r s2.p[1] and $\beta =$ r s2.p[2].

We use a Metropolis-Hastings Markov Chain Monte Carlo (@Gelman2003) to infer posterior distributions of the parameters of the GP (Figure S4, code in appendix). Since the posterior distributions differ substantially from the priors (Figure S4), most of the information in the posterior comes from the data rather than the prior belief.

The method of Gaussian Process Dynamic Programming (GPDP)

We derive the harvest policy from the estimated GP by inserting it into a SDP algorithm. Given the GP posteriors, we construct the transition matrix representing the probability of going to each state $X(t+1)$ given any current state $x(t)$ and any harvest $h(t)$ (See the function gp_transition_matrix() in the provided R package). Given this transition matrix, we use the same value iteration algorithm as in the parametric case to determine the optimal policy. In doing so, the uncertainty in the future state under the GP, $X(t+1)$, includes both process uncertainty (based on the estimation of $\sigma$) and structural uncertainty of the posterior collection of curves.

Results

Parametric and GP models for population dynamics

To ensure our results are robust to the choice of parameters, we will consider 96 different scenarios. To help better understand the process, we first describe in detail the results of a single scenario.

step_ahead_posteriors <- function(x){
  gp_f_at_obs <- gp_predict(gp, x, burnin=1e4, thin=300)
  df_post <- melt(lapply(sample(100, 30),  
  function(i){
    data.frame(time = 1:length(x), stock = x, 
                GP = mvrnorm(1, gp_f_at_obs$Ef_posterior[,i], gp_f_at_obs$Cf_posterior[[i]]),
                True = step_ahead(x,f,p),  
                MLE = step_ahead(x,f,est$p), 
                Allen = step_ahead(x, allen_f, pardist[i,]), 
                Ricker = step_ahead(x, ricker_f, ricker_pardist[i,]), 
                Myers = step_ahead(x, myers_f, myers_pardist[i,]))
  }), id=c("time", "stock"))
}
df_post <- step_ahead_posteriors(x)
library("dplyr")
df_post_means <- summarize(group_by(df_post, time,variable,stock), value = mean(value))
ggplot(df_post_means,aes(time, value)) +
  geom_point(aes(time, stock), size=2) + 
  geom_line(aes(time, value, col=variable, group=variable),lwd=1) + 
#  geom_line(aes(time, value, col=variable, group=interaction(L1,variable))) + 
  facet_wrap(~variable) + 
  scale_colour_manual(values=colorkey) +  
  theme(legend.position="none")




write.csv(df_post_means, "components/data/figure1.csv")

All of the models fit the observed data rather closely and with relatively small uncertainty. In Figure 1, we show the posterior predictive curves. The training data of stock sizes observed over time are points, overlaid with the step-ahead predictions of each estimated model using the parameters sampled from their posterior distributions. Compared to the true model most estimates appear to over-fit, predicting patterns that are actually due purely to stochasticity. Model selection criteria (Table 1) penalize more complex models and show a preference for the simpler Ricker model over the models with alternative stable states (Allen and Myers). Supplement provides details on the model estimates.

allen_deviance  <- - posterior.mode(pardist[,'deviance'])
ricker_deviance <- - posterior.mode(ricker_pardist[,'deviance'])
myers_deviance  <- - posterior.mode(myers_pardist[,'deviance'])
true_deviance   <- 2*estf(c(p, sigma_g))
mle_deviance    <- 2*estf(c(est$p, est$sigma_g))
aictable <- data.frame(Allen = allen_deviance + 2*(1+length(bayes_pars)),  # +1 for noise parameter
                       Ricker = ricker_deviance + 2*(1+length(ricker_bayes_pars)),
                       Myers = myers_deviance + 2*(1+length(myers_bayes_pars)), 
                       row.names = c("AIC"))
bictable <- data.frame(Allen = allen_deviance + log(length(x))*(1+length(bayes_pars)), 
                       Ricker = ricker_deviance + log(length(x))*(1+length(ricker_bayes_pars)),
                       Myers = myers_deviance + log(length(x))*(1+length(myers_bayes_pars)), 
                       row.names = c("BIC"))
pandoc.table(rbind(dictable, aictable, bictable))
pandoc.table(rbind(dictable, aictable, bictable), caption="Model selection scores for several common criteria (DIC: Deviance Information Criterion, AIC: Akaike Information Criterion, BIC: Bayesian Information Criterion) all select the wrong model. As the true (Allen) model is not distinguishable from the simpler (Ricker) model in the region of the observed data, this error cannot be avoided regardless of the model choice criterion. This highlights the danger of model choice when the selected model will be used outside of the observed range of the data.")
# subset data first, don't care about seeing dynamics for stock >> K
x_grid_short <- x_grid[1:max(which(x_grid<12))]
gp_short <- gp_predict(gp, x_grid_short, burnin=1e4, thin=300)

# sample from subset data
models_posteriors <- 
  melt(lapply(sample(100, 50), 
              function(i){
    sample_gp <- mvrnorm(1, 
                            gp_short$Ef_posterior[,i],         
                            gp_short$Cf_posterior[[i]])
    data.frame(stock = x_grid_short, 
               GP = sample_gp,
               y = sample_gp,
               ymin = sample_gp - 2 * sqrt(gp_short$E_Vf), 
               ymax = sample_gp + 2 * sqrt(gp_short$E_Vf), 
               True = sapply(x_grid_short,f,0, p),  
               MLE = sapply(x_grid_short,f,0, est$p), 
               Allen = sapply(x_grid_short, allen_f, 0, pardist[i,]), 
               Ricker = sapply(x_grid_short, ricker_f, 0, ricker_pardist[i,]), 
               Myers = sapply(x_grid_short, myers_f, 0, myers_pardist[i,]))
             }), 
       id=c("stock", "y", "ymin", "ymax"))
ggplot(models_posteriors) + 
#    geom_ribbon(aes(x=stock, y=y, ymin=ymin, ymax=ymax, group=L1), 
#                  fill = "gray80", 
#                  data=subset(models_posteriors, variable == "GP")) + 
    geom_line(aes(stock, value, col = variable, 
                  group=interaction(L1,variable)), alpha=0.4) + 
    geom_point(data = obs, aes(x,y), size=2) + 
    xlab(expression(X[t])) + ylab(expression(X[t+1])) +
    facet_wrap(~variable) + 
    scale_colour_manual(values=colorkey) +  
    theme(legend.position="none")

write.csv(models_posteriors, "components/data/figure2.csv")
write.csv(obs, "components/data/training_data.csv")

We show the mean inferred population dynamics of each model relative to the true model used to generate the data in Figure 2, predicting the relationship between observed population size (x-axis) to the population size after recruitment the following year. In addition to the raw data, the GP is conditioned on going through the point 0,0 without error. All parametric models also make this assumption. Conditioning on (0,0) is equivalent to making the assumption that the population is closed, so that once it hits 0 it stays at 0, despite the lack of any data in the observed sequence to justify this. This assumption illustrates how the GP can capture common-sense biology without having to assume more explicit details about the dynamics at low population numbers that have never been observed. If the population were not closed, one could repeat the entire analysis without this assumption. Unlike parametric models, the GP corresponds to a distribution of curves, of which this plot only shows the means. Uncertainty in the parameters of the GP (not shown) further widens the band of possible population sizes. In Figure S1 (see supplement), we show the performance of the models outside the observed training data.

policies <- melt(data.frame(stock=x_grid, sapply(OPT, function(x) x_grid[x])), id="stock")
names(policies) <- c("stock", "method", "value")

ggplot(policies, aes(stock, stock - value, color=method)) +
  geom_line() + 
  # stat_smooth(lwd=1.2, method="loess", degree=1, span=0.2, level=0, n=50) + 
  facet_wrap(~method) +
  xlab("stock size, x(t)") + 
  ylab("escapement, S(t)")  +
  scale_colour_manual(values=colorkey, guide=FALSE)


write.csv(policies, "components/data/figure3.csv")

Despite the similarities in model fits to the observed data, the policies inferred under each model differ widely (Figure 3). Policies are shown in terms of target escapement, $S(t) = x_t - h$. Under models such as this a constant escapement policy is expected to be optimal [@Reed1979], whereby population levels below a certain size $S$ are unharvested, while above that size the harvest strategy aims to return the population to $S$. Whenever a model predicts that the population will not persist below a certain threshold, the optimal solution is to harvest the entire population immediately, resulting in an escapement $S=0$, as seen in the true (correct form, exact parameters) model, the Allen model (correct form, estimated parameters) and the GP. Only the structurally correct model (Allen model) and the GP produce policies close to the true optimum policy.

ggplot(sims_data) + 
  geom_line(aes(time, fishstock, group=interaction(reps,method), color=method), alpha=0.1) +
  scale_colour_manual(values=colorkey, guide = FALSE) +
  xlab("stock size, x(t)") +
  facet_wrap(~method) + 
  guides(legend.position="none", color=FALSE)

write.csv(sims_data, "components/data/figure4.csv")

In Figure 4, we show the consequences of managing 100 replicate realizations of the simulated fishery under policies derived from each model. The structurally correct model under-harvests, leaving the stock to vary around its unfished optimum. The structurally incorrect Ricker model over-harvests the population past the tipping point consistently, resulting in the immediate crash of the stock and thus leads to minimal long-term catch.

The results across replicate stochastic simulations are most easily compared by using the relative differences in net present value realized by each of the model (Figure 5). Although not perfect, the GPDP consistently realizes a value close to the optimal solution, and avoids ever driving the system across the tipping point, which results in the near-zero value cases in the parametric models.

ggplot(actual_over_optimal, aes(value)) + geom_histogram(aes(fill=variable)) + 
  facet_wrap(~variable, ncol=2)  + 
  guides(legend.position = "none") +
  xlab("Net present value") + 
  scale_fill_manual(values=colorkey, guide=FALSE) # density plots fail when delta fn

write.csv(actual_over_optimal, "components/data/figure5.csv")

Sensitivity Analysis

These results are not sensitive to the modeling details of the simulation. The GPDP estimate remains very close to the optimal solution (obtained by knowing the true model) across changes to the training simulation, scale of stochasticity, parameters or structure of the underlying model. In the Supplement, we consider both a Latin hypercube approach and a more focused investigation of the effects of the relative distance to the Allee threshold and the variance of process stochasticity.

The GPDP is only weakly influenced by increasing stochasticity or increasing Allee effects over much of the range (Figure S2). Larger $\sigma$ or higher Allee levels make even the optimal solution without any model or parameter uncertainty unable to harvest the population effectively (e.g. the stochasticity is large enough to violate the self-sustaining criterion of @Reed1979).

Discussion

Simple, mechanistically motivated models offer the potential to increase our basic understanding of ecological processes [@Cuddington2013; @Geritz2012]. But such models can be both inaccurate and misleading when used in a decision making framework. In this paper we tackled two aspects of uncertainty that are common to many ecological decision-making problems and fundamentally challenging to existing approaches that largely rely on parametric models:

  1. We do not know the correct models for ecological systems.
  2. We have limited data from which to estimate the model.

We have illustrated how the use of non-parametric methods provides more reliable solutions in the sequential decision-making problem.

Traditional model-choice approaches can be positively misleading

Our results illustrate that model-choice approaches can be absolutely misleading -- by providing support to models that cannot capture tipping point dynamics because they have fewer parameters and the data are far from the tipping point. That is, when the data come from around the stable steady state, all the parametric models are approximately linear and approximately identical. Thus, it is intuitive that all model selection methods choose the simplest model. In a complex world, the result is suboptimal. But in a world that might contain tipping points, the result could be disastrous.

Many managers both in fisheries and beyond face a similar situation: they have not observed the population dynamics at all possible densities. A lack of comprehensive data at all population sizes, combined with the inability to formulate accurate population models for low population sizes in the absence of data, makes this situation the rule more than the exception. Relying on parametric models and model choice processes that favor simplicity ignores this basic reality. For a long time, Carl Walters [e.g. @Walters1978] has argued that if we began by fishing any newly exploited population down to very low levels and then let it recover, we would be much better at estimating population dynamics and thus predicting the optimal harvest levels. While certainly true, this presents a rather risky policy in the face of potential tipping points. The GPDP offers a risk-adverse alternative.

GPDP population dynamics capture larger uncertainty in regions where the data are poor

Parametric models perform most poorly when we seek a management strategy outside the range of the observed data. The GPDP, in contrast, leads to a predictive model that expresses a great deal of uncertainty about the probable dynamics outside the range of the observed data, while retaining very good predictive accuracy inside the range. The management policy based on by the GPDP balances uncertainty outside the range of the observed data against the immediate value of the harvest, and acts to stabilize the population dynamics in a region of state space in which the predictions are reliably reflected by the data.

Such problems are ubiquitous across ecological decision-making and conservation where the greatest concerns involve decisions that lead to population sizes that have never been observed and for which we do not know the response -- whether this is the collapse of a fishery, the spread of an invasive, or the loss of habitat.

The role of the prior

Outside of the observed range of the data, the GP reverts to the prior, and consequently the choice of the prior can also play a significant role in determining the optimal policy. In the examples shown here we have selected a prior that is both relatively uninformative (due to the broad priors placed on its parameters $\ell$ and $\sigma$ and simple (the choice of our covariance function, Eqns 12 and 13 ). In practice, these should be chosen to confer particular biological properties. In principle, this may allow a manager to improve the performance of the GPDP by adding detail as is justified. For instance, it would be possible to use a linear or a Ricker-shaped mean in the prior without making the much stronger assumption that the Ricker is the structurally correct model [@Sugeno2013a]. One fruitful avenue of future research is identifying criteria to ensure the prior and the reward function are chosen appropriately for the problem at hand.

Acknowledgments

This work was partially supported by NOAA-IAM grant to SM and Alec McCall and administered through the Center for Stock Assessment Research, a partnership between the University of California Santa Cruz and the Fisheries Ecology Division, Southwest Fisheries Science Center, Santa Cruz, CA and by NSF grant EF-0924195 to MM and NSF grant DBI-1306697 to CB.



cboettig/nonparametric-bayes documentation built on May 13, 2019, 2:09 p.m.