# sim -- simulates observations from fit map/quap and map2stan and ulam models
setGeneric("sim",
function( fit , data , n=1000 , ... ) {
predict(fit)
}
)
# function at heart of all sim methods
sim_core <- function( fit , data , post , vars , n , refresh=0 , replace=list() , debug=FALSE , ll=FALSE , ... ) {
sim_vars <- list()
for ( var in vars ) {
# find a distribtional assignment with var on left
f <- fit@formula[[1]]
for ( i in 1:length(fit@formula) ) {
f <- fit@formula[[i]]
left <- as.character( f[[2]] )
if ( left==var ) {
if ( debug==TRUE ) print(f)
break
}
}
lik <- f
outcome <- var
flik <- as.character(lik[[3]][[1]])
# check whether we must convert from Stan-name distribution to R-name distribution
first_char <- substr( flik , 1 , 1 )
if ( first_char != "d" | flik=="dirichlet" ) {
# loop over templates and get matching dfoo name
for ( ii in 1:length( ulam_dists ) ) {
aStanName <- ulam_dists[[ii]]$Stan_name
if ( aStanName==flik ) {
flik <- ulam_dists[[ii]]$R_name
break
}
}#ii
}
# get simulation partner function
rlik <- flik
if ( ll==FALSE ) {
substr( rlik , 1 , 1 ) <- "r"
# hook for gampois, which needs rgampois2
if ( rlik=="rgampois" ) rlik <- "rgampois2"
}
# check for aggregated binomial, but only when ll==TRUE
aggregated_binomial <- FALSE
size_var_is_data <- FALSE
if ( flik=="dbinom" & ll==TRUE ) {
ftemp <- flist_untag(fit@formula)[[1]]
size_sym <- ftemp[[3]][[2]]
if ( class(size_sym)=="name" ) {
aggregated_binomial <- TRUE
size_var_is_data <- TRUE
}
if ( class(size_sym)=="numeric" ) {
if ( size_sym > 1 ) aggregated_binomial <- TRUE
}
}
# disabling aggregated binom code for now
aggregated_binomial <- FALSE
# pull out parameters in likelihood
pars <- vector(mode="list",length=length(lik[[3]])-1)
for ( i in 1:length(pars) ) {
pars[[i]] <- lik[[3]][[i+1]]
}
if ( aggregated_binomial==TRUE ) {
# swap 'size' for '1' and outcome for vector of ones
# will expand to correct number of cases later
pars[[1]] <- 1
data[['ones__']] <- rep( 1 , length(data[[outcome]]) )
if ( refresh > 0 )
message("Aggregated binomial counts detected. Splitting to 0/1 outcome for WAIC calculation.")
}
pars <- paste( pars , collapse=" , " )
# build expression to evaluate
n_cases <- length(data[[outcome]])
if ( n_cases==0 ) {
# no outcome in custom data?
# get number of cases from first variable in data
n_cases <- length(data[[1]])
}
if ( ll==FALSE ) {
xeval <- paste( rlik , "(" , n_cases , "," , pars , ")" , collapse="" )
} else {
use_outcome <- outcome
if ( aggregated_binomial==TRUE ) use_outcome <- 'ones__'
xeval <- paste( rlik , "(" , use_outcome , "," , pars , ",log=TRUE )" , collapse="" )
}
# simulate outcomes
sim_out <- matrix(NA,nrow=n,ncol=n_cases)
# get linear model values from link
# don't flatten result, so we end up with a named list, even if only one element
# need to insert already-simulated vars here - should have happened at bottom of this loop
pred <- link( fit , data=data , n=n , post=post , flatten=FALSE , refresh=refresh , replace=replace , debug=debug , ... )
for ( s in 1:n ) {
# build environment
# extract s-th sample case calculations
elm <- list()
for ( j in 1:length(pred) ) {
elm[[j]] <- pred[[j]][s,]
}
names(elm) <- names(pred)
init <- list() # holds one row of samples across all params
for ( j in 1:length(post) ) {
par_name <- names(post)[ j ]
dims <- dim( post[[par_name]] )
# scalar
if ( is.null(dims) ) init[[par_name]] <- post[[par_name]][s]
# 1d vector
if ( length(dims)==1 ) init[[par_name]] <- post[[par_name]][s]
# vector
if ( length(dims)==2 ) init[[par_name]] <- post[[par_name]][s,]
# matrix
if ( length(dims)==3 ) init[[par_name]] <- post[[par_name]][s,,]
}#j
e <- list( as.list(data) , as.list(init) , as.list(elm) )
e <- unlist( e , recursive=FALSE )
# include previously simulated variables in environment so they are accessible
# need only s-th simulation right now, and should replace existing symbols in e (in case variables were included in data list)
if ( length(sim_vars) > 0 ) {
for ( j in 1:length(sim_vars) ) {
e[[ names(sim_vars)[j] ]] <- sim_vars[[j]][s,]
}#j
}
# evaluate
sim_out[s,] <- eval( parse( text=xeval ) , envir=e )
}#s
if ( debug==TRUE ) print(str(e))
# check for aggregated binomial outcome
if ( aggregated_binomial==TRUE ) {
# aggregated binomial with data for 'size'
# need to split binomial counts in outcome into series of 0/1 outcomes
# (1) sum 'size' variable in order to get number of new cases
if ( size_var_is_data==TRUE ) {
size_var <- data[[ as.character(ftemp[[3]][[2]]) ]]
} else {
# size is constant numeric, but > 1
size_var <- rep( as.numeric(ftemp[[3]][[2]]) , n_cases )
}
n_newcases <- sum(size_var)
# (2) make new sim_out with expanded dimension
sim_out_new <- matrix(NA,nrow=n,ncol=n_newcases)
# (3) loop through each aggregated case and fill sim_out_new with bernoulli loglik
current_newcase <- 1
outcome_var <- data[[ outcome ]]
for ( i in 1:n_cases ) {
num_ones <- outcome_var[i]
num_zeros <- size_var[i] - num_ones
ll1 <- sim_out[,i]
ll0 <- log( 1 - exp(ll1) )
if ( num_ones > 0 ) {
for ( j in 1:num_ones ) {
sim_out_new[,current_newcase] <- ll1
current_newcase <- current_newcase + 1
}
}
if ( num_zeros > 0 ) {
for ( j in 1:num_zeros ) {
sim_out_new[,current_newcase] <- ll0
current_newcase <- current_newcase + 1
}
}
}#i
sim_out <- sim_out_new
}
sim_vars[[ var ]] <- sim_out
data[[ var ]] <- sim_out # make available to subsequent vars
}#var
return(sim_vars)
}
setMethod("sim", "map",
function( fit , data , n=1000 , post , vars , ll=FALSE , refresh=0 , replace=list() , debug=FALSE , ... ) {
# when ll=FALSE, simulates sampling, one sample for each sample in posterior
# when ll=TRUE, computes loglik of each observation, for each sample in posterior
# vars argument optional
# when missing, simulations only first observed variable in formula
# when present, should be a character vector naming variables to simulate, in order to simulate them
# order matters, because some may be needed for the others to simulate right
########################################
# check arguments
if ( class(fit)!="map" ) stop("Requires map/quap fit")
if ( missing(data) ) {
data <- fit@data
} else {
# make sure it's a list, otherwise can't hold sample matrices from sims
data <- as.list(data)
# check for vars to sim that are not in data list
if ( !missing(vars) ) {
for ( i in 1:length(vars) ) {
if ( !(vars[i] %in% names(data)) ) {
# insert dummy for var - will get simulated over later
data[[ vars[i] ]] <- rep( 0 , length(data[[1]]) )
}
}#i
}
}
if ( missing(post) ) {
post <- extract.samples(fit,n=n)
} else {
n <- dim(post[[1]])[1]
if ( is.null(n) ) n <- length(post[[1]])
}
# variables to sim
if ( missing(vars) ) {
# extract likelihood, assuming it is first element of formula
lik <- flist_untag(fit@formula)[[1]]
# discover outcome
vars <- as.character(lik[[2]])
} else {
# vars listed
if ( debug==TRUE ) print(vars)
}
# loop over vars
sim_vars <- sim_core( fit=fit , data=data , post=post , vars=vars , n=n , refresh=refresh , replace=replace , debug=debug , ll=ll , ... )
# result
if ( length(sim_vars)==1 ) sim_vars <- sim_vars[[1]]
return( sim_vars )
}
)
setMethod("sim", "map2stan",
function( fit , data , n=1000 , post , refresh=0.1 , replace=list() , ... ) {
# trap for ulam2018 method
ag <- attr( fit , "generation" )
if ( !is.null(ag) )
if ( ag=="ulam2018" )
return( sim_ulam( fit , data=data , post=post , n=n , ... ) )
########################################
# check arguments
if ( missing(data) ) {
data <- fit@data
} else {
# make sure all variables same length
# weird vectorization errors otherwise
#data <- as.data.frame(data)
}
if ( n==0 ) {
n <- stan_total_samples(fit@stanfit)
} else {
tot_samples <- stan_total_samples(fit@stanfit)
n <- min(n,tot_samples)
}
if ( missing(post) )
post <- extract.samples(fit,n=n)
# get linear model values from link
# use our posterior samples, so later parameters have right correlation structure with link values
# don't flatten result, so we end up with a named list, even if only one element
pred <- link( fit , data=data , n=n , post=post , flatten=FALSE , refresh=refresh , replace=replace , ... )
# extract likelihood, assuming it is first element of formula
lik <- flist_untag(fit@formula)[[1]]
# discover outcome
outcome <- as.character(lik[[2]])
# discover likelihood function
flik <- as.character(lik[[3]][[1]])
# check whether we must convert from Stan-name distribution to R-name distribution
first_char <- substr( flik , 1 , 1 )
if ( first_char != "d" ) {
# loop over templates and get matching dfoo name
for ( ii in 1:length(map2stan.templates) ) {
aStanName <- map2stan.templates[[ii]]$stan_name
if ( aStanName==flik ) {
flik <- map2stan.templates[[ii]]$R_name
break
}
}#ii
}
# get simulation partner function
rlik <- flik
substr( rlik , 1 , 1 ) <- "r"
# hook for gampois, which needs rgampois2
if ( rlik=="rgampois" ) rlik <- "rgampois2"
# pull out parameters in likelihood
pars <- vector(mode="list",length=length(lik[[3]])-1)
for ( i in 1:length(pars) ) {
pars[[i]] <- lik[[3]][[i+1]]
}
pars <- paste( pars , collapse=" , " )
# build expression to evaluate
n_cases <- length(data[[1]])
xeval <- paste( rlik , "(" , n_cases , "," , pars , ")" , collapse="" )
# simulate outcomes
sim_out <- matrix( NA , nrow=n , ncol=n_cases )
# need to select out s-th sample for each parameter in post
# only need top-level parameters, so can coerce to data.frame?
post2 <- as.data.frame(post)
for ( s in 1:n ) {
# build environment
#ndims <- length(dim(pred[[1]]))
#if ( ndims==2 )
# elm <- pred[[1]][s,] # extract s-th sample case calculations
#if ( ndims==3 )
# elm <- pred[[1]][s,,]
#elm <- list(elm)
#names(elm)[1] <- names(pred)[1]
elm <- list()
for ( j in 1:length(pred) ) {
ndims <- length(dim(pred[[j]]))
if ( ndims==2 )
elm[[j]] <- pred[[j]][s,]
if ( ndims==3 )
elm[[j]] <- pred[[j]][s,,]
}
names(elm) <- names(pred)
e <- list( as.list(data) , as.list(post2[s,]) , as.list(elm) )
e <- unlist( e , recursive=FALSE )
# evaluate
if ( flik=="dordlogit" ) {
n_outcome_vals <- dim( pred[[1]] )[3]
probs <- pred[[1]][s,,]
sim_out[s,] <- sapply(
1:n_cases ,
function(i)
sample( 1:n_outcome_vals , size=1 , replace=TRUE , prob=probs[i,] )
)
} else {
sim_out[s,] <- eval(parse(text=xeval),envir=e)
}
}
return(sim_out)
}
)
# EXAMPLES/TEST
if ( FALSE ) {
library(rethinking)
data(cars)
fit <- map(
alist(
dist ~ dnorm(mu,sigma),
mu <- a + b*speed
),
data=cars,
start=list(a=30,b=0,sigma=5)
)
pred <- link(fit,data=list(speed=0:30),flatten=TRUE)
sim.dist <- sim(fit,data=list(speed=0:30))
plot( dist ~ speed , cars )
mu <- apply( pred , 2 , mean )
mu.PI <- apply( pred , 2, PI )
y.PI <- apply( sim.dist , 2 , PI )
lines( 0:30 , mu )
shade( mu.PI , 0:30 )
shade( y.PI , 0:30 )
# now map2stan
cars$y <- cars$dist # Stan doesn't allow variables named 'dist'
m <- map2stan(
alist(
y ~ dnorm(mu,sigma),
mu <- a + b*speed,
a ~ dnorm(0,100),
b ~ dnorm(0,10),
sigma ~ dunif(0,100)
),
data=cars,
start=list(
a=mean(cars$y),
b=0,
sigma=1
),
warmup=500,iter=2500
)
pred <- link(m,data=list(speed=0:30),flatten=TRUE)
sim.dist <- sim(m,data=list(speed=0:30),n=0)
plot( dist ~ speed , cars )
mu <- apply( pred , 2 , mean )
mu.PI <- apply( pred , 2, PI )
y.PI <- apply( sim.dist , 2 , PI )
lines( 0:30 , mu )
shade( mu.PI , 0:30 )
shade( y.PI , 0:30 )
} #EXAMPLES
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.