library("sarsop")
library("ggplot2")
library("tidyr")
library("dplyr")
# Public archive of precomputed POMDP solutions used below
log_dir <- "https://raw.githubusercontent.com/cboettig/pomdp-solutions-library/master/archive"

# Query the solution log for an "allen" model with these parameters
who <- data.frame(model = "allen", C = 15, n_states = 61, r = 1)
# Alternative queries:
# who <- data.frame(model = "allen", r = 1, K = 50, C = 15)
# who <- data.frame(model = "allen", r = 1, K = 30, C = 15)
# who <- data.frame(model = "allen", r = 1, K = 40, C = 10, sigma_m = 0.05)

# Metadata for all solutions in the log matching the query
meta <- meta_from_log(who, log = log_dir)
meta

Import a model solution from the library

i <- 1 # just use the first match from the log

alpha <- alphas_from_log(meta, log = log_dir)[[i]] # alpha vectors of the POMDP solution
f <- f_from_log(meta)[[i]]                         # growth function
m <- models_from_log(meta)[[i]]                    # transition, observation, and reward matrices

n_states <- meta$n_states[[i]]
discount <- meta$discount[[i]]

Find the deterministic optimal solution for this model

# The deterministic optimum is a constant-escapement rule: S_star maximizes f(x, 0) - x / discount
S_star <- round(optimize(function(x) -f(x, 0) + x / discount, c(1, n_states))$minimum)
det_policy <- function(x) if (x <= S_star) 1 else x - S_star # adjusted for index values, starting at 1
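
As a quick sanity check (not part of the original analysis; "check" is just a throwaway name), we can plot this objective and confirm that it peaks at S_star:

check <- data.frame(x = seq(1, n_states, length.out = 200))
check$objective <- vapply(check$x, function(x) f(x, 0) - x / discount, numeric(1))
ggplot(check, aes(x, objective)) + geom_line() + geom_vline(xintercept = S_star)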

Experiment 1:

If we believe the prior state was certainly s, the policy is more conservative than the deterministic rule for states above f(s), and less conservative for states below f(s):

s <- S_star
a0 <- 0 # action we took previously (zero harvest)
certain_prior <- numeric(length(m$observation[, 1, 1])) # belief vector over all states
certain_prior[s + 1] <- 1 # point-mass belief, adjusted for index values

df <- compute_policy(alpha, m$transition, m$observation, m$reward,
                     certain_prior, a0 + 1) # action passed as an index
df %>%
  rowwise() %>%
  mutate(det = det_policy(state)) %>%
  ggplot(aes(state, state - policy)) + # y-axis: escapement (state minus harvest)
  geom_line() + geom_point() +
  geom_line(aes(state, state - det)) +
  geom_vline(xintercept = f(s, a0))
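
A rough numeric check of the claim above (a sketch; "side" and "extra_escapement" are illustrative names): average the difference in escapement between the POMDP policy and the deterministic rule on either side of f(s). Positive values mean the POMDP policy is the more conservative one.

df %>%
  rowwise() %>%
  mutate(det = det_policy(state)) %>%
  ungroup() %>%
  mutate(side = ifelse(state > f(s, a0), "above f(s)", "below f(s)")) %>%
  group_by(side) %>%
  summarise(extra_escapement = mean((state - policy) - (state - det)))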

Experiment 2:

The role of different prior beliefs on the optimal policy:

# Columns of the observation matrix serve as priors centered on low, average, and high states;
# omitting the prior gives compute_policy's default (uniform) prior.
low  <- compute_policy(alpha, m$transition, m$observation, m$reward, m$observation[, 4, 1])
ave  <- compute_policy(alpha, m$transition, m$observation, m$reward, m$observation[, 20, 1])
unif <- compute_policy(alpha, m$transition, m$observation, m$reward)
high <- compute_policy(alpha, m$transition, m$observation, m$reward, m$observation[, 35, 1])
df <- dplyr::bind_rows(low = low, ave = ave, unif = unif, high = high, .id = "prior")


ggplot(df, aes(state, state - policy, col = prior, pch = prior)) + 
  geom_point(alpha = 0.5, size = 3) + 
  geom_line()
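
To make the comparison concrete, we can also plot the three informative priors themselves (a sketch; "priors" is just an illustrative name for the data frame):

priors <- data.frame(state = seq_len(n_states),
                     low = m$observation[, 4, 1],
                     ave = m$observation[, 20, 1],
                     high = m$observation[, 35, 1])
priors %>%
  gather(prior, belief, -state) %>%
  ggplot(aes(state, belief, col = prior)) + geom_line()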

Experiment 3:

Simulate dynamics under the policy:

set.seed(1234)
sim <- sim_pomdp(m$transition, m$observation, m$reward, discount,
                 x0 = 25, Tmax = 50, alpha = alpha)
sim$df %>%
  select(-value) %>%
  gather(variable, state, -time) %>%
  ggplot(aes(time, state, color = variable)) + geom_line() + geom_point()

The posterior belief is not stationary:

Tmax <- length(sim$df$time) # number of recorded time steps
data.frame(time = 0:Tmax, sim$state_posterior) %>%
  gather(state, belief, -time, factor_key = TRUE) %>%
  mutate(state = as.numeric(state)) %>% # factor levels back to numeric state indices
  filter(time > 25) %>% # show only later time steps
  ggplot(aes(state, belief, group = time, alpha = time)) + geom_line()
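
One way to summarize this drift is to track the posterior mean state over time (a sketch, assuming sim$state_posterior is a time-by-state matrix of beliefs; "post_mean" is an illustrative name):

post_mean <- as.numeric(as.matrix(sim$state_posterior) %*% seq_len(n_states))
data.frame(time = seq_along(post_mean), mean_state = post_mean) %>%
  ggplot(aes(time, mean_state)) + geom_line()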

Experiment 4:

Do replicate simulations avoid crashes?

# Run one replicate simulation, tagged with its replicate id
# (Tmax reuses the value computed above)
reps <- function(n) {
  data.frame(sim = n,
             sim_pomdp(m$transition, m$observation, m$reward, discount,
                       alpha = alpha, x0 = round(n_states / 2), Tmax = Tmax)$df)
}

set.seed(12345)
sim_df <- purrr::map_df(1:100, reps) # 100 replicate simulations

ggplot(sim_df, aes(time, state, group = sim)) + geom_line()
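
To answer the question numerically rather than by eye, count how many replicates ever hit the lowest state class (a sketch; here a "crash" is assumed to mean reaching state index 1):

sim_df %>%
  group_by(sim) %>%
  summarise(crashed = any(state == 1)) %>%
  summarise(n_crashed = sum(crashed))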

