inst/preprint/analysis_script.R

## -*- coding: utf-8 -*-

######################################################################
## WARNING The interface has changed since the Behavior Research
## Methods paper was accepted for publication. In the current version
## of the package every parameter (delta, gamma, theta - in uv sdt
## models, eta in ordinal models, delta_sd_scale_1, etc.) is
## represented by a matrix even when it is unidimensional in a given
## model. So, for example, in an SDT model delta fixed effects are
## represented by parameters delta_fixed[1,1], ..., delta_fixed[1,n],
## where n is the number of delta fixed effects (i.e., the number of
## columns in the fixed effects model matrix for delta).

######################################################################
## IMPORTANT GLOBAL VARIABLES
##
## Change this to FALSE once all the stanfit objects are stored to
## save time and memory
fresh_start = FALSE
## Where do you want to store all the (large) stanfit objects? (I use
## Linux so '~/path' makes sense to me)
temp_path = '~/windows/temp'

## WARNING we are not setting the random seed here because 1) R and
## Stan have separate seeds and 2) the user of this script may use a
## different number of mcmc threads.

library(rstan)
library(ggplot2)
library(Hmisc) # rMultinom for simulations
library(bhsdtr)
library(plyr)
library(gridExtra)
## Standard Stan optimizations
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
data(gabor)

######################################################################
## Simulating individual data for sdt models

sim_sdt = function(n = 1, dprim = 1.5, criteria = c(-2.1, -1.4, -.7, 0, .7, 1.4, 2.1), model = 'sdt', sd_ratio = 1){
    which_bin = function(x, thr)min(which(x <= c(-Inf, thr, Inf)) - 1)
    d = data.frame(stim = rep(1:2, each = n), e = rnorm(n * 2), r = NA)
    for(i in 1:nrow(d))
        d$r[i] = which_bin(d$e[i] + .5 * dprim * c(-1, 1)[d$stim[i]], criteria)
    attr(d, 'dprim') = dprim
    attr(d, 'criteria') = criteria
    attr(d, 'sd_ratio') = sd_ratio
    d
}

######################################################################
## Quick test of all the possible types of non-hierarchical models
## fitted to simulated data

d = sim_sdt(n = 10000)
table(d[, c('stim', 'r')])
adata = aggregate_responses(d, 'stim', 'r')

## We will try evey kind of model with every link function
res = expand.grid(par = c('dprim', paste('c', 1:length(attr(d, 'criteria')), sep = '')),
                  model = c('sdt', 'uvsdt', 'metad'),
                  link = c('softmax', 'log_distance', 'log_ratio'), true = NA, est = NA)

for(link in levels(res$link)){
    for(model in levels(res$model)){
        print(sprintf('Fitting model %s with link %s', model, link))
        fixed = list(delta = ~ 1, gamma = ~ 1)
        if(model == 'uvsdt')fixed$theta = ~ 1
        fit = stan(model_code = make_stan_model(gamma_link = link, model = model),
                   data = make_stan_data(adata, fixed, gamma_link = link, model = model),
                   pars = c('delta_fixed', 'gamma_fixed',
                            ## we need counts_new for plotting
                            'counts_new'),
                   init_r = .5,
                   iter = 4000,
                   chains = 4)
        s = as.data.frame(fit)
        res[(res$link == link) & (res$model == model), 'est'] =
            c(mean(exp(s[, grep('delta_fixed\\[1', names(s))])), apply(gamma_to_crit(s, gamma_link = link), 2, mean))
        res[(res$link == link) & (res$model == model), 'true'] = c(dprim = attr(d, 'dprim'), attr(d, 'criteria'))
        rm(s)
        rm(fit)
        gc()
    }
}

## What is the correlation between the estimates and the true values
## of the parameters?
round(cor(res$est, res$true, use = 'pairwise.complete.obs'), 2)
## 1

## A plot comparing point estimates to known true values
res$par_type = c('criteria', 'dprim')[(as.character(res$par) == 'dprim') + 1]
ggplot(res, aes(est, true, group = par_type, color = par_type)) +
    geom_abline(slope = 1, intercept = 0) +
    geom_point(size = 5) +
    facet_grid(model ~ link)
## All is good

######################################################################
## Fitting the model described in the BRM paper to real study data

## We will be using the gabor dataset provided with the package
data(gabor)
?gabor

## Combined response has to be calculated for this dataset
gabor$r = combined_response(gabor$stim, gabor$rating, gabor$acc)
## Aggregation without information loss - we keep keep the variability
## due to the participants and all the experimental conditions.
adata = aggregate_responses(gabor, 'stim', 'r', c('duration', 'order', 'id'))
## Model specification
fixed = list(delta = ~ -1 + duration:order, gamma = ~ order)
random = list(list(group = ~ id, delta = ~ -1 + duration, gamma = ~ 1))
model = make_stan_model(random)
## The model code is human-readable
cat(model)
## Required Stan data structure
sdata = make_stan_data(adata, fixed, random)

## Main model fit
if(fresh_start){
    fit = stan(model_code = model, data = sdata,
               pars = c('delta_fixed', 'gamma_fixed',
                        'delta_sd_1', 'gamma_sd_1',
                        'delta_random_1', 'gamma_random_1',
                        'Corr_delta_1', 'Corr_gamma_1',
                        ## we need counts_new if we want to use the
                        ## plot_sdt_fit function
                        'counts_new'), 
               ## init_r = .5 helps a lot with rejected initial
               ## samples, the default range of initial values in stan
               ## is (-2, 2), which is just too wide for the
               ## parameters in our models.
               init_r = .5,
               iter = 8000,
               chains = 4)
    save(fit, file = paste(temp_path, 'fit', sep = '/'))
}else{
    load(paste(temp_path, 'fit', sep = '/'))
}

## Model fit summary does not indicate any convergence issues
print(fit, probs = c(.025, .957),
      pars = c('delta_fixed', 'gamma_fixed',
               'delta_sd_1', 'gamma_sd_1',
               'Corr_delta_1', 'Corr_gamma_1'))
##                    mean se_mean   sd  2.5% 95.7% n_eff Rhat
## delta_fixed[1,1]  -0.11    0.00 0.15 -0.42  0.14  4776 1.00
## delta_fixed[1,2]   1.12    0.00 0.09  0.94  1.26  4654 1.00
## delta_fixed[1,3]  -0.39    0.00 0.20 -0.78 -0.06  5941 1.00
## delta_fixed[1,4]   1.28    0.00 0.11  1.07  1.47  5967 1.00
## gamma_fixed[1,1]  -0.14    0.00 0.06 -0.27 -0.04  3590 1.00
## gamma_fixed[1,2]  -0.22    0.00 0.11 -0.43 -0.04  8695 1.00
## gamma_fixed[2,1]  -0.70    0.00 0.18 -1.05 -0.40  4161 1.00
## gamma_fixed[2,2]   0.49    0.00 0.29 -0.09  0.99  4002 1.00
## gamma_fixed[3,1]  -0.54    0.00 0.22 -0.96 -0.16  3537 1.00
## gamma_fixed[3,2]   0.84    0.01 0.36  0.13  1.45  3130 1.00
## gamma_fixed[4,1]   0.28    0.00 0.24 -0.19  0.70  3434 1.00
## gamma_fixed[4,2]   0.42    0.01 0.40 -0.37  1.10  3091 1.00
## gamma_fixed[5,1]  -0.21    0.00 0.30 -0.78  0.30  3599 1.00
## gamma_fixed[5,2]   0.83    0.01 0.48 -0.11  1.65  3432 1.00
## gamma_fixed[6,1]  -0.78    0.00 0.24 -1.24 -0.36  3756 1.00
## gamma_fixed[6,2]   0.80    0.01 0.39  0.05  1.47  3379 1.00
## gamma_fixed[7,1]  -0.32    0.00 0.17 -0.64 -0.04  4034 1.00
## gamma_fixed[7,2]   0.43    0.00 0.28 -0.12  0.91  3585 1.00
## delta_sd_1[1,1]    0.66    0.00 0.11  0.48  0.87  5937 1.00
## delta_sd_1[1,2]    0.44    0.00 0.06  0.34  0.54  5408 1.00
## gamma_sd_1[1,1]    0.16    0.00 0.07  0.02  0.30  2874 1.00
## gamma_sd_1[2,1]    0.82    0.00 0.11  0.63  1.03  5994 1.00
## gamma_sd_1[3,1]    1.08    0.00 0.12  0.86  1.30  4923 1.00
## gamma_sd_1[4,1]    1.28    0.00 0.14  1.03  1.53  4531 1.00
## gamma_sd_1[5,1]    1.55    0.00 0.17  1.25  1.86  4851 1.00
## gamma_sd_1[6,1]    1.19    0.00 0.13  0.95  1.43  5021 1.00
## gamma_sd_1[7,1]    0.78    0.00 0.10  0.60  0.97  4822 1.00

## Model fit summary table for the paper
library(xtable)
smr = as.data.frame(round(summary(fit)$summary[,c(1, 2, 3, 4, 8, 9, 10)], 2))
smr$n_eff = as.integer(round(smr$n_eff))
smr = smr[-grep('random', rownames(smr)),]
smr = smr[-grep('counts_new', rownames(smr)),]
print(xtable(smr[-nrow(smr),]), file = 'fit_table.tex')

## Some plots
(p1 = plot_sdt_fit(fit, adata, c('order', 'duration')))
ggsave(p1, file = 'roc_fit.pdf')
ggsave(p1, file = 'roc_fit.png')
(p2 = plot_sdt_fit(fit, adata, c('order', 'duration'), type = 'response'))
ggsave(p2, file = 'response_fit.pdf')
ggsave(p2, file = 'response_fit.png')

## According to the fitted model, the criteria are placed fairly
## symmetrically
crit = gamma_to_crit(as.data.frame(fit))
round(apply(crit, 2, mean), 2)
## criteria_fixed[1,1] criteria_fixed[2,1] criteria_fixed[3,1] criteria_fixed[4,1] 
##               -2.18               -1.56               -0.99                0.11 
## criteria_fixed[5,1] criteria_fixed[6,1] criteria_fixed[7,1] 
##                0.78                1.20                2.00 

######################################################################
## Fitting the single criterion hierarchical SDT model (just for fun)

fixed = list(delta = ~ -1 + duration:order, gamma = ~ order)
random = list(list(group = ~ id, delta = ~ -1 + duration, gamma = ~ 1))
## this time we ignore the ratings
gabor$r2 = combined_response(gabor$stim, accuracy = gabor$acc)
if(fresh_start){
    fit.s = stan(model_code = make_stan_model(random),
                 data = make_stan_data(aggregate_responses(gabor, 'stim', 'r2', c('duration', 'order', 'id')),
                                       fixed, random),
                 pars = c('delta_fixed', 'gamma_fixed',
                          'delta_sd_1', 'gamma_sd_1',
                          'delta_random_1', 'gamma_random_1',
                          'Corr_delta_1', 'Corr_gamma_1',
                          'counts_new'),
                 init_r = .5,
                 iter = 8000,
                 chains = 4)
    save(fit.s, file = paste(temp_path, 'fit.s', sep = '/'))
}else{
    load(paste(temp_path, 'fit.s', sep = '/'))
}

print(fit.s, probs = c(.025, .957),
      pars = c('delta_fixed', 'gamma_fixed',
               'delta_sd_1', 'gamma_sd_1',
               'Corr_delta_1', 'Corr_gamma_1'))
## All is fine

## It does not make sense to use plot_sdt_fit here, because the
## single-criterion SDT model is untestable (i.e., it is saturated)

## Compare delta estimates
smr1 = as.data.frame(summary(fit)$summary[,c(1, 2, 3, 4, 8, 9, 10)])
smr2 = as.data.frame(summary(fit.s)$summary[,c(1, 2, 3, 4, 8, 9, 10)])
smr = rbind(smr1[grep('delta', rownames(smr1)),],
            smr2[grep('delta', rownames(smr2)),])
smr$model = as.factor(rep(c('Multiple criteria', 'Single criterion'), each = nrow(smr) / 2))
names(smr)[4:5] = c('ci.lo', 'ci.hi')
smr$par = rep(rownames(smr)[1:(nrow(smr) / 2)], 2)
(p = ggplot(smr, aes(par, mean, color = model)) +
     geom_point() +
     geom_errorbar(aes(ymin = ci.lo, ymax = ci.hi)) +
     theme(axis.text.x = element_text(angle = 90)) +
     labs(color = 'Model', y = 'Posterior point and interval estimates'))
ggsave('single_vs_multiple_c.pdf', p)
## Similar but not exactly the same

## Let's compare the main criteria in the base condition
c1 = gamma_to_crit(as.data.frame(fit))
c2 = gamma_to_crit(as.data.frame(fit.s))
round(rbind(quantile(c1[,4], c(.025, .5, .957)),
            quantile(c2, c(.025, .5, .957))), 2)
## Fairly similar
##
##       2.5%  50% 95.7%
## [1,] -0.05 0.11  0.25
## [2,] -0.13 0.06  0.21

######################################################################
## Fitting the model to data simulated from itself to test if the
## model recovers known realistic parameter values

## This function takes a fitted hierarchical model and uses the point
## estimates to simulate new responses. The sample size is the same as
## in the aggregated data object.
simulate_from_fit = function(fit, adata, fixed, random){
    ## Realistic known parameter values
    s = apply(as.data.frame(fit), 2, mean)
    sdata = make_stan_data(adata, fixed, random)
    gamma_fixed = matrix(nrow = ncol(sdata$X_gamma), ncol = sdata$K - 1)
    for(r in 1:nrow(gamma_fixed))
        gamma_fixed[r,] = s[paste('gamma_fixed[', 1:(sdata$K - 1), ',', r, ']', sep = '')]
    gamma_random = list()
    for(r in 1:sdata$group_1_size)
        gamma_random[[r]] = matrix(s[paste('gamma_random_1[', r, ',', 1:(sdata$K - 1), ',1]', sep = '')],
                                   nrow = sdata$K - 1, ncol = sdata$Z_gamma_ncol)
    delta_fixed = s[grep('delta_fixed', names(s))]
    delta_random = matrix(nrow = sdata$group_1_size, ncol = sdata$Z_delta_ncol_1)
    for(r in 1:nrow(delta_random))
        for(i in 1:ncol(delta_random))
            delta_random[r, i] = s[sprintf('delta_random_1[%d,1,%d]', r, i)]
    multinomial_p = matrix(ncol = sdata$K, nrow = sdata$N)
    thr = gamma = matrix(ncol = sdata$K - 1, nrow = sdata$N)
    dprim = rep(NA, sdata$N)
    for(n in 1:sdata$N){
        dprim[n] = exp(sdata$X_delta[n,] %*% delta_fixed + sdata$Z_delta_1[n,] %*% delta_random[sdata$group_1[n],])
        gamma[n,] = t(gamma_fixed) %*% sdata$X_gamma[n,] + gamma_random[[sdata$group_1[n]]] %*% sdata$Z_gamma_1[n,]
    }
    criteria = gamma_to_crit(gamma, NULL)
    criteria = t(apply(gamma, 1, function(x){
        exp_x_0 = exp(c(x, 0))
        sdata$criteria_scale * qnorm(cumsum(exp_x_0 / sum(exp_x_0))[-sdata$K])
    }))
    distr_shift = -sdata$stim_sign * dprim / 2
    multinomial_p[, 1] = pnorm(criteria[, 1] + distr_shift)
    if(sdata$K > 2)
        for(k in 2:(sdata$K - 1)){
            multinomial_p[, k] = pnorm(criteria[, k] + distr_shift) - pnorm(criteria[, k - 1] + distr_shift)
        }
    multinomial_p[, sdata$K] = pnorm(-(criteria[, sdata$K - 1] + distr_shift))
    data_sim = adata
    ## Now we can simulate
    for(r in 1:nrow(adata$counts))
        data_sim$counts[r,] = table(c(rMultinom(t(multinomial_p[r,]), sum(adata$counts[r,])), 1:sdata$K)) - 1
    data_sim$dprim = dprim
    data_sim$gamma = gamma
    data_sim$criteria = criteria
    data_sim
}

adata = aggregate_responses(gabor, 'stim', 'r', c('duration', 'order', 'id'))
fixed = list(delta = ~ -1 + duration:order, gamma = ~ order)
random = list(list(group = ~ id, delta = ~ -1 + duration, gamma = ~ 1))
load(paste(temp_path, 'fit', sep = '/'))
if(fresh_start){
    adata_sim = simulate_from_fit(fit, adata, fixed, random)
    save(adata_sim, file = paste(temp_path, 'data_sim', sep = '/'))
}else{
    load(paste(temp_path, 'data_sim', sep = '/'))
}

if(fresh_start){
    fit.sim = stan(model_code = make_stan_model(random),
                   pars = c('delta_fixed', 'gamma_fixed',
                            'delta_sd_1', 'gamma_sd_1',
                            'delta_random_1', 'gamma_random_1',
                            'counts_new'),
                   data = make_stan_data(adata_sim, list(delta = ~ -1 + duration:order, gamma = ~ order),
                                         list(list(group = ~ id, delta = ~ -1 + duration, gamma = ~ 1))),
                   init_r = .5,
                   chains = 4,
                   iter = 8000)
    save(fit.sim, file = paste(temp_path, 'fit.sim', sep = '/'))
}else{
    load(paste(temp_path, 'fit.sim', sep = '/'))
}

print(fit.sim, probs = c(.025, .957),
      pars = c('delta_fixed', 'gamma_fixed', 'delta_sd_1', 'gamma_sd_1'))
## All is fine

cor(apply(as.data.frame(fit)[,1:20], 2, mean), apply(as.data.frame(fit.sim)[,1:20], 2, mean))
## .99

(p1 = plot_sdt_fit(fit.sim, adata_sim, c('duration', 'order')))
ggsave('roc_sim_fit.pdf', p1)
(p2 = plot_sdt_fit(fit.sim, adata_sim, c('duration', 'order'), type = 'response'))
ggsave('response_sim_fit.pdf', p2)

######################################################################
## Comparing true values with the point estimates based on the true
## model

s2 = as.data.frame(fit.sim)
s = apply(as.data.frame(fit), 2, mean)
sdata = make_stan_data(adata, fixed, random)
gamma_fixed = matrix(nrow = ncol(sdata$X_gamma), ncol = sdata$K - 1)
for(r in 1:nrow(gamma_fixed))
    gamma_fixed[r,] = s[paste('gamma_fixed[', 1:(sdata$K - 1), ',', r, ']', sep = '')]
gamma_random = list()
for(r in 1:sdata$group_1_size)
    gamma_random[[r]] = matrix(s[paste('gamma_random_1[', r, ',', 1:(sdata$K - 1), ',1]', sep = '')],
                               nrow = sdata$K - 1, ncol = sdata$Z_gamma_ncol)
delta_fixed = s[grep('delta_fixed', names(s))]
delta_random = matrix(nrow = sdata$group_1_size, ncol = sdata$Z_delta_ncol_1)
for(r in 1:nrow(delta_random))
    for(i in 1:ncol(delta_random))
        delta_random[r, i] = s[sprintf('delta_random_1[%d,1,%d]', r, i)]

res = as.data.frame(cbind(t(apply(s2[,grep('delta_fixed', names(s2))], 2, function(x)c(quantile(x, c(.025, .975)), mean(x))[c(1,3,2)])), delta_fixed))
names(res) = c('lo', 'fit', 'hi', 'true')
res$in_ci = (res$true >= res$lo) & (res$true <= res$hi)
res
## The proportion of true delta_fixed values which are within the 95%
## credible intervals
mean(res$in_ci)
## 1

res = as.data.frame(cbind(t(apply(s2[,grep('gamma_fixed', names(s2))], 2, function(x)c(quantile(x, c(.025, .975)), mean(x))[c(1,3,2)])),
                          as.vector(t(gamma_fixed))))
names(res) = c('lo', 'fit', 'hi', 'true')
res$in_ci = (res$true >= res$lo) & (res$true <= res$hi)
res
mean(res$in_ci)
## 1

res = as.data.frame(cbind(t(apply(s2[,grep('delta_random', names(s2))], 2, function(x)c(quantile(x, c(.025, .975)), mean(x))[c(1,3,2)])),
                          as.vector(delta_random)))
names(res) = c('lo', 'fit', 'hi', 'true')
res$in_ci = (res$true >= res$lo) & (res$true <= res$hi)
res
mean(res$in_ci)
## 1

gamma_random_mat = matrix(ncol = length(gamma_random[[1]]), nrow = length(gamma_random))
for(r in 1:nrow(gamma_random_mat))
  for(col in 1:ncol(gamma_random_mat))
    gamma_random_mat[r, col] = gamma_random[[r]][col]
res = as.data.frame(cbind(t(apply(s2[,grep('gamma_random', names(s2))], 2, function(x)c(quantile(x, c(.025, .975)), mean(x))[c(1,3,2)])),
                          as.vector(gamma_random_mat)))
names(res) = c('lo', 'fit', 'hi', 'true')
res$in_ci = (res$true >= res$lo) & (res$true <= res$hi)
res
mean(res$in_ci)
## 0.99
c(sum(!res$in_ci), nrow(res))
## 2 / 329

######################################################################
## Fitting the simplified, non-hierarchical (i.e., false in this case)
## model to simulated data aggregated over participants (i.e., too
## much aggregation). This is how SDT models are often (mis)used.

load(paste(temp_path, 'data_sim', sep = '/'))
df = adata_sim$data
df$stimulus = adata_sim$stimulus
df$i = 1:nrow(df)
df = ddply(df, c('order', 'duration', 'stimulus'), function(x)apply(adata_sim$counts[x$i,], 2, sum))
data_sim_2 = list(data = df[,c('order', 'duration')], stimulus = df$stimulus, counts = df[, paste(1:8)])
rbind(data_sim_2$counts[data_sim_2$data$order == 'DECISION-RATING' & data_sim_2$data$duration == '32 ms' & data_sim_2$stimulus == 1,],
      apply(adata_sim$counts[adata_sim$data$order == 'DECISION-RATING' & adata_sim$data$duration == '32 ms' & adata_sim$stimulus == 1,], 2, sum))
## Ok

if(fresh_start){
    fit.aggr = stan(model_code = make_stan_model(),
                    pars = c('delta_fixed', 'gamma_fixed', 'counts_new'),
                    data = make_stan_data(data_sim_2, list(delta = ~ -1 + duration:order, gamma = ~ order)),
                    init_r = .5,
                    chains = 4,
                    iter = 8000)
    save(fit.aggr, file = paste(temp_path, 'fit_aggr', sep = '/'))
}else{
    load(paste(temp_path, 'fit_aggr', sep = '/'))
}

## Excellent sampling
print(fit.aggr, pars = c('delta_fixed', 'gamma_fixed'),
      probs = c(.025, .975))

## Based on the ROC curve plots the false simplified model seems to
## fit the data very
(p1 = plot_sdt_fit(fit.aggr, data_sim_2, c('order', 'duration')))
ggsave('roc_sim_aggr_fit.pdf', p1)
(p2 = plot_sdt_fit(fit.aggr, data_sim_2, c('order', 'duration'), type = 'response'))
ggsave('response_sim_aggr_fit.pdf', p2)

######################################################################
## Comparison between the true and the simplified models fitted to the
## simulated data

ss = list('True hierarchical model' = as.data.frame(fit.sim),
          'Non-hierarchical model' = as.data.frame(fit.aggr))
## Here we are calculating the posterior samples for gamma for the
## second condition by adding the effect of the condition on gamma and
## than using the appropriate transformation.
ss_ = ss
for(i in 1:2){
    ss_[[i]][grep('delta_fixed', names(ss_[[i]]))] = apply(ss_[[i]][grep('delta_fixed', names(ss_[[i]]))], 2, exp)
    ss_[[i]][grep('gamma_fixed', names(ss_[[i]]))][,8:14] = ss_[[i]][grep('gamma_fixed', names(ss_[[i]]))][,1:7] +
        ss_[[i]][grep('gamma_fixed', names(ss_[[i]]))][,8:14]
    ss_[[i]][grep('gamma_fixed', names(ss_[[i]]))][,1:7] = gamma_to_crit(ss_[[i]])
    ss_[[i]][grep('gamma_fixed', names(ss_[[i]]))][,8:14] = gamma_to_crit(ss_[[i]], 2)
    names(ss_[[i]])= gsub('delta_fixed', 'dprim_fixed', names(ss_[[i]]))
    names(ss_[[i]])= gsub('gamma_fixed', 'criteria_fixed', names(ss_[[i]]))
}
round(apply(ss_[[1]][, grep('criteria_fixed', names(ss_[[1]]))], 2, mean), 2)
round(apply(ss_[[2]][, grep('criteria_fixed', names(ss_[[2]]))], 2, mean), 2)
## Ok

bias_plot = function(true_values, ss){
    df = data.frame(model = rep(names(ss), each = length(grep('fixed', names(ss[[1]])))),
                    par = c(names(ss[[1]])[grep('fixed', names(ss[[1]]))],
                            names(ss[[2]])[grep('fixed', names(ss[[2]]))]))
    if(length(grep('delta', levels(df$par))) > 0){
        df$par = factor(df$par, c(grep('delta', levels(df$par), value = T),
                                  grep('gamma', levels(df$par), value = T)))
    }else{
        df$par = factor(df$par, c(grep('dprim', levels(df$par), value = T),
                                  grep('criteria', levels(df$par), value = T)))
    }
    for(m in levels(df$model)){
        df$point.est[df$model == m] = apply(ss[[m]][,as.character(df$par[df$model == m])], 2, mean)
        df$ci.lo[df$model == m] = apply(ss[[m]][,as.character(df$par[df$model == m])], 2, function(x)quantile(x, .025))
        df$ci.hi[df$model == m] = apply(ss[[m]][,as.character(df$par[df$model == m])], 2, function(x)quantile(x, .975))
    }
    df$true = true_values[as.character(df$par)]
    ggplot(df, aes(par, point.est - true)) +
        geom_abline(intercept = 0, slope = 0, lty = 2, alpha = .5) +
        geom_point() +
        geom_errorbar(aes(ymin = ci.lo - true, ymax = ci.hi - true)) +
        facet_grid(~ model) +
        theme(axis.text.x = element_text(angle = 90)) +
        labs(y = 'Point and interval estimates centered on true values',
             x = 'Model parameter') +
        theme_minimalist()
}

s = as.data.frame(fit)
true_values = apply(s[, grep('fixed', names(s))], 2, mean)
(p1 = bias_plot(true_values, ss))
ggsave('true_vs_nonhier.pdf')
## The estimates based on overly aggregated data are COMPLETELY
## USELESS

true_values_ = apply(apply(s[, grep('delta_fixed', names(s))], 2, exp), 2, mean)
names(true_values_) = gsub('delta', 'dprim', names(true_values_))
## Translating between gamma and criteria for the true model that was
## used to simulate the data
s_ = s
s_[grep('gamma_fixed', names(s_))][,8:14] = s_[grep('gamma_fixed', names(s_))][,1:7] +
    s_[grep('gamma_fixed', names(s_))][,8:14]
true_values_ = c(true_values_, apply(gamma_to_crit(s_), 2, mean))
true_values_ = c(true_values_, apply(gamma_to_crit(s_, 2), 2, mean))
(p2 = bias_plot(true_values_, ss_))
ggsave('true_vs_nonhier_sdt.pdf')

p3 = grid.arrange(p1, p2, ncol = 2)
ggsave('true_vs_nonhier_both.pdf', p3, width = 13, height = 7)
ggsave('true_vs_nonhier_both.png')
ggsave('true_vs_nonhier_both.tiff')

res = as.data.frame(cbind(t(apply(ss[[2]][,grep('delta_fixed', names(s2))], 2, function(x)c(quantile(x, c(.025, .975)), mean(x))[c(1,3,2)])), delta_fixed))
names(res) = c('lo', 'fit', 'hi', 'true')
res$in_ci = (res$true >= res$lo) & (res$true <= res$hi)
res
mean(res$in_ci)
## .5

res = as.data.frame(cbind(t(apply(ss[[2]][,grep('gamma_fixed', names(s2))], 2, function(x)c(quantile(x, c(.025, .975)), mean(x))[c(1,3,2)])),
                          as.vector(t(gamma_fixed))))
names(res) = c('lo', 'fit', 'hi', 'true')
res$in_ci = (res$true >= res$lo) & (res$true <= res$hi)
res
mean(res$in_ci)
## .21
cbind(sum(res$in_ci), nrow(res))
## 3 14

## Let's compare the posterior SDs
aggr.posterior.sd = apply(as.data.frame(fit.aggr), 2, sd)
true.posterior.sd = apply(as.data.frame(fit.sim), 2, sd)
sd.ratios = true.posterior.sd[grep('fixed', names(true.posterior.sd))] /
  aggr.posterior.sd[grep('fixed', names(aggr.posterior.sd))]
mean(sd.ratios)
## 3.1 - 3.2 is the ratio of the correct posterior SD to posterior SD
## based on the simplified model

######################################################################
## Demonstration of approximately normal distribution of delta and
## gamma random effects

(p = ggplot(data.frame(delta = as.vector(t(delta_random)), i = c('32 ms', '64 ms')), aes(sample = delta)) +
     stat_qq() +
     facet_wrap(~i) +
     ylab('delta sample quantile') +
     xlab('Theoretical normal quantile') + theme_minimalist())
ggsave('delta_qq_plot.pdf', p)
## this is perfectly acceptable

gamma_rnd = gamma_random_mat[,1]
for(i in 2:ncol(gamma_random_mat))
  gamma_rnd = c(gamma_rnd, gamma_random_mat[,i])
k = rep(1:ncol(gamma_random_mat), each = nrow(gamma_random_mat))
(p = ggplot(data.frame(gamma = gamma_rnd, k = k), aes(sample = gamma)) + stat_qq() + facet_wrap(~k) +
     ylab('gamma sample quantile') +
     xlab('Theoretical normal quantile') +
     theme_minimalist())
ggsave('gamma_qq_plot.pdf', p)
## this is also perfectly acceptable
boryspaulewicz/bhsdtr documentation built on March 8, 2020, 8:24 a.m.