R/main.R

#--------------------------------------------#
# Example of hierarchical SDT model for 2AFC #
#--------------------------------------------#

# Clear workspace
rm(list = ls())


# set up directories
# setwd("D:/documents/UMass/summer/2016/cfs_bayes")
dir.home <- getwd()
dir.R = file.path(dir.home, 'R')
dir.stan = file.path(dir.home, 'stan')
dir.data = file.path(dir.home, 'data')


# Load useful packages
library(ggplot2)
library(mvtnorm) # For the multivariate normal distribution
# install.packages(rstan)
# For more installation notes -> http://mc-stan.org/interfaces/rstan
library(rstan)
options(mc.cores = parallel::detectCores())
library(loo)

# load helpful functions
source(file.path(dir.R, 'fns.R'))

# Create a logical vector indicating which code segments should be run
runCode = c( F, T )


###
### Model simulation and parameter recovery
###

if ( runCode[1] ) {

  Ns = 30 # Number of subjects
  Ni = 128 # Number of items

  # Parameters for subject effects
  rho = 0.25 # Correlation
  sigma_s = c( 0.25, 0.25 ) # Variance of effects
  Omega = diag(2)
  Omega[1,2] = rho; Omega[2,1] = rho;
  Sigma = diag(2);
  diag(Sigma) = sigma_s
  Sigma = Sigma^2
  Sigma = Sigma %*% Omega %*% Sigma

  # Generate subject effects
  eta = rmvnorm( Ns, c( 0, 0 ), Sigma )

  # Parameters for item effects
  sigma_i = .25
  zeta = rnorm( Ni, 0, sigma_i )

  # Population parameters for d'
  Beta_dp = rbind(1.5,-.2,.1,0.05)

  # Population parameters for kappa
  Beta_k = rbind(0.0,.25)

  # Create covariates
  intercept = rep(1,Ns*Ni)
  factor_A = rep( rep( c(-1,1), each = Ni/2 ), Ns )
  factor_B = rep( rep( rep( c(-1,1), each = Ni/4 ), 2 ), Ns )
  # Design matrices
  X_dp = cbind( intercept, factor_A, factor_B, factor_A*factor_B )
  X_k = cbind( intercept, factor_B )
  # Indices for subject and items
  index_i = matrix( 1:Ni, Ni, Ns )
  index_i = as.matrix( apply( index_i, 2, sample ) )
  index_i = as.vector( index_i )
  index_s = rep( 1:Ns, each = Ni )
  # Position of correct choice (0 = left, 1 = right)
  Correct = matrix( rep( c(0,1), each = Ni/2 ), Ni, Ns )
  Correct = as.matrix( apply( Correct, 2, sample ) )
  Correct = as.vector( Correct )

  # Calculate SDT parameters
  dprime = X_dp %*% Beta_dp + eta[ index_s, 1 ] + zeta[ index_i ]
  kappa = X_k %*% Beta_k + eta[ index_s, 2 ]

  Theta = SDTlikelihood( 1, 1, kappa, dprime, prob = T )

  Y = rbinom( nrow(Theta), 1, Correct*Theta[,1] + (1-Correct)*Theta[,2] )

  # Clean up workspace
  rm( Omega, Sigma )

  ### Parameter recovery ###

  # Define mildly informative priors (i.e. d'/criterion centered at 0
  # with 95% credible mass between -2 and 2, a correlation matrix with
  # a greater peak around the identity matrix, and scale parameters
  # centered at 0.5 with 95% credible mass between .01 and 1.2)
  Priors = rbind(
    c(0,1), # beta_dp
    c(0,1),
    c(0,1),
    c(0,1),
    c(0,1), # beta_k
    c(0,1),
    c(2,0), # Omega (Correlation matrix)
    c(2,4), # tau (Scale parameters)
    c(2,4),
    c(2,4) # sigma_i
  )

  # Create list of data as input to Stan
  stan_dat = list(
    Ns = Ns,
    Ni = Ni,
    No = length( Y ),
    K_dp = ncol( X_dp ),
    K_k = ncol( X_k ),
    Y = Y,
    index_s = index_s,
    index_i = index_i,
    Correct = Correct,
    X_dp = X_dp,
    X_k = X_k,
    Priors = Priors
  )

  burn = 250 # Burn-in
  niter = 2500 # Number of samples used to approximate posterior for each chain

  startTime = Sys.time() # To assess run time

  # It is good practice to define your stan script in a
  # separate file with the extension '.stan'.
  fit = stan(file= file.path(dir.stan, 'Hierarchical_SDT_for_2AFC.stan'), data = stan_dat,
             warmup = burn, iter = burn+niter,
             chains = 4)
  post = extract(fit) # Extract posterior samples

  # Total run time of code
  runTime = Sys.time() - startTime
  print( runTime); rm( startTime )

  # Create data frame with posterior values (by column)
  df = as.data.frame( cbind( post = as.vector( post$Beta_dp ),
                             par = rep( 1:4, each = nrow( post$Beta_dp ) ) ) )
  df$par = as.factor( df$par )

  # Violin plots provide a nice summary of the marginal posteriors for parameters
  x11()
  theme_set( theme_gray(base_size = 18) )
  plt = ggplot( df, aes( x = par, y = post, fill = par ) ) +
    geom_violin() + labs( list( x = 'Parameters', y = 'Posterior values' ) )
  # Define condition labels
  x_axis_labels = c(
    'Intercept',
    'Factor A',
    'Factor B',
    'Interaction' )
  # Change the x-axis values
  plt + scale_x_discrete( labels = x_axis_labels ) + guides( fill = FALSE )

}


###
### Fits to real data
###

if ( runCode[2] ) {

  # Load in an example data set
  load( file.path(dir.data, 'Ex_data.RData')  )

  ### Set up input for Stan ###

  # Extract relevant variables to fit
  AD = OriginalAllData[ OriginalAllData$Cond == 6, ]
  # Remove missing data
  AD = AD[ AD$Response != 0, ]
  d = as.data.frame( AD$Response - 1 ) # Subject's choice ( 0 = Left, 1 = Right )
  colnames(d) = 'Y'
  d$Correct = AD$Correct - 1 # Position of correct choice
  # Image type (0 = target, 1 = competitor)
  d$IT = AD$ImageType - 1
  # Selective retrieval condition ( 1 = yes, 0 = no )
  d$SR = 1 - AD$Baseline
  # Subject factors
  d$S = AD$Subject
  # The items are not in incremental order. We need to create a
  tmp = numeric( nrow( AD ) )
  inc = 1;
  for ( i in sort( unique( AD$ImageNum ) ) ) {
    tmp[ AD$ImageNum == i ] = inc
    inc = inc + 1
  }
  # Item factors
  d$I = tmp
  # Clean up workspace
  rm( OriginalAllData, AD, tmp, inc, i )

  # First, we'll create the design matrices for the group-level
  # d' and criterion parameters, since our priors will reflect this.

  # d' design matrix
  X_dp = cbind( 1, d$IT, d$SR )
  # We'll use effects coding
  X_dp[ d$IT == 0, 2 ] = -1
  X_dp[ d$SR == 0, 3 ] = -1
  # We'll include an interaction term as well
  X_dp = cbind( X_dp, X_dp[,2]*X_dp[,3] )

  # Criterion design matrix
  X_k = cbind( as.numeric( d$IT == 1 ),
               as.numeric( d$IT == 0 ) ) # Here, we'll use intercept coding instead

  ### Tools for determining priors ###
  # Impact of d' and the criterion on P(Right,Left)
  # round( SDTlikelihood( 1, 1, 0, 1.6835, prob = T ), 3 )
  # Impact of parameters for gamma distribution on standard deviations
  # x11(); curve( dgamma( x, 4, 8 ), from = 0, to = 2 )

  # I define mildly informative priors here
  Priors = rbind(
    c(2.2,.5), # beta_dp
    c(0,.5),
    c(0,.5),
    c(0,.5),
    c(0,.5), # beta_k
    c(0,.5),
    c(1,0), # Omega (Correlation matrix)
    c(4,8), # tau (Scale parameters)
    c(4,8),
    c(4,8) # sigma_i
  )

  # Create list of data as input to Stan
  stan_dat = list(
    Ns = length( unique( d$S ) ),
    Ni = length( unique( d$I ) ),
    No = length( d$Y ),
    K_dp = ncol( X_dp ),
    K_k = ncol( X_k ),
    Y = d$Y,
    index_s = d$S,
    index_i = d$I,
    Correct = d$Correct,
    X_dp = X_dp,
    X_k = X_k,
    Priors = Priors
  )

  ### Prior predictive check ###

  priorPred = F # Change to true to run a prior predictive check
  # priorPred = T
  if ( priorPred ) {

    startTime = Sys.time() # To assess run time

    fit = stan( file = 'Prior_pred_check.stan',
                data = stan_dat,
                algorithm = 'Fixed_param',
                warmup = 0,
                iter = 2500,
                chains = 4,
                seed = 832
    )

    ppc = extract(fit) # Extract posterior samples

    runTime = Sys.time() - startTime
    print( runTime); rm( startTime )

    Cond = rep(1,nrow(d)); Cond[ d$IT == 1 & d$SR == 0 ] = 2;
    Cond[ d$IT == 0 & d$SR == 1 ] = 3; Cond[ d$IT == 0 & d$SR == 0 ] = 4;

    tmp = matrix( NA, nrow( ppc$Ysim ), 4 )

    # Create a progress bar using a base R function
    pb = txtProgressBar( min = 1, max = 4, style = 3 )
    for ( i in 1:4 ) {
      tmp[,i] = apply( ppc$Ysim, 1, function(x) {
        sel = Cond == 1;
        mean( x[sel] == d$Correct[sel] )
      } )
      # Update the progress bar
      setTxtProgressBar(pb,i)
    }
    close(pb)

    x11();
    plot( c(1,4), c(0,1), type = 'n', bty = 'l',
          xlab = 'Condition', ylab = 'P(Correct)',
          xaxt = 'n', main = 'Prior Predictive Check' )
    abline( h = seq( .5, .9, .1 ), col = 'grey' )
    axis( 1, 1:4, c('T-SR','T-B','C-SR','C-B'), tick = F )
    segments( 1:4, apply( tmp, 2, min ),
              1:4, apply( tmp, 2, max ) )
    segments( 1:4, apply( tmp, 2, quantile, prob = .025 ),
              1:4, apply( tmp, 2, quantile, prob = .975 ), lwd = 3 )
    segments( 1:4, apply( tmp, 2, quantile, prob = .25 ),
              1:4, apply( tmp, 2, quantile, prob = .75 ), lwd = 5 )
    points( 1:4, colMeans( tmp ), pch = 21, bg = 'white', cex = 1.5 )

    # Clean up workspace
    rm( Cond, tmp, i )

  }

  ### Fit the model in Stan ###

  burn = 250 # Burn-in
  niter = 2500 # Number of samples used to approximate posterior for each chain

  startTime = Sys.time() # To assess run time

  # Fit the model using Stan
  fit = stan( file= file.path(dir.stan, 'Hierarchical_SDT_for_2AFC.stan'),
              data = stan_dat,
              warmup = burn,
              iter = burn+niter,
              chains = 4,
              seed = 1903, # For reproducibility
              control = list( adapt_delta = .9 ) # For stabiliity
  )
  post = extract(fit) # Extract posterior samples

  runTime = Sys.time() - startTime
  print( runTime); rm( startTime )

  ### Plots of the posterior estimates ###

  # Plots of d'

  # Create data frame with posterior values (by column)
  df = as.data.frame( cbind( post = as.vector( post$Beta_dp ),
                             par = rep( 1:4, each = nrow( post$Beta_dp ) ) ) )
  df$par = as.factor( df$par )

  x11()
  theme_set( theme_gray(base_size = 18) )
  plt = ggplot( df, aes( x = par, y = post, fill = par ) ) +
    geom_violin() + labs( list( x = 'Parameters', y = 'Posterior values' ) )
  plt
  # Define condition labels
  x_axis_labels = c(
    'Intercept',
    'Image Type',
    'Selective\nRetrieval',
    'Interaction' )
  # Change the x-axis values
  plt + scale_x_discrete( labels = x_axis_labels ) + guides( fill = FALSE )

  # Plots of criterion

  # Create data frame with posterior values (by column)
  df = as.data.frame( cbind( post = as.vector( post$Beta_k ),
                             par = rep( 1:2, each = nrow( post$Beta_k ) ) ) )
  df$par = as.factor( df$par )

  x11()
  theme_set( theme_gray(base_size = 18) )
  plt = ggplot( df, aes( x = par, y = post, fill = par ) ) +
    geom_violin() + labs( list( x = 'Parameters', y = 'Posterior values' ) )
  plt
  # Define condition labels
  x_axis_labels = c(
    'Targets',
    'Competitors' )
  # Change the x-axis values
  plt + scale_x_discrete( labels = x_axis_labels ) + guides( fill = FALSE )

  # Calculate the LOO for the model
  logLik = t( apply( post$theta, 1, function(x) dbinom( d$Y, 1, prob = x ) ) )
  model_1 = loo( logLik )

  ### Posterior retrodictive check ###

  postPred = F
  # postPred = T
  # Note this code runs very slowly
  if ( postPred == T ) {

    # Predicted fixed effects by posterior sample
    dprime_p = apply( post$Beta_dp, 1, function(x) stan_dat$X_dp %*% x )
    kappa_p = apply( post$Beta_k, 1, function(x) stan_dat$X_k %*% x )

    # Generate random subject and item level effects per posterior sample
    # and then calculate the probability of picking right, generate a set of data
    # collapse over conditions
    pred_ac = matrix( NA, 4, nrow( post$Beta_dp ) )
    # Create a progress bar using a base R function
    pb = txtProgressBar( min = 1, max = nrow( post$Beta_dp), style = 3 )
    for ( i in 1:nrow( post$Beta_dp ) ) {
      zeta_p = rnorm( stan_dat$Ni, 0, post$sigma_i[i] )
      Sigma_p = diag( post$tau[i,] ) %*% post$Omega[i,,] %*% diag( post$tau[i,] )
      eta_p = rmvnorm( stan_dat$Ns, c(0,0), Sigma_p )

      dprime_p[,i] = dprime_p[,i] + eta_p[ stan_dat$index_s,1] + zeta_p[stan_dat$index_i]
      kappa_p[,i] = kappa_p[,i] + eta_p[ stan_dat$index_s,2]
      theta = SDTlikelihood( 1, 1, kappa_p[,i], dprime_p[,i], prob = T)
      theta = stan_dat$Correct*theta[,1] + (1-stan_dat$Correct)*theta[,2]
      Y = rbinom( nrow( dprime_p ), 1, theta )

      sel = d$IT == 0 & d$SR == 0
      pred_ac[1,i] = mean( Y[ sel ] == d$Correct[ sel ] )
      sel = d$IT == 1 & d$SR == 0
      pred_ac[2,i] = mean( Y[ sel ] == d$Correct[ sel ] )
      sel = d$IT == 0 & d$SR == 1
      pred_ac[3,i] = mean( Y[ sel ] == d$Correct[ sel ] )
      sel = d$IT == 1 & d$SR == 1
      pred_ac[4,i] = mean( Y[ sel ] == d$Correct[ sel ] )

      # Update the progress bar
      setTxtProgressBar(pb,i)
    }
    close(pb)

    # Clean up workspace
    rm( dprime_p, kappa_p, zeta_p, Sigma_p, eta_p, theta, Y )

    x11();
    plot( c(1,4), c(.6,1), type = 'n', bty = 'l', xlab = 'Condition', xaxt = 'n',
          ylab = 'P(Correct)' )
    obs = aggregate( d$Y == d$Correct, list( d$IT, d$SR ), mean )$x

    pred = apply( pred_ac, 1, quantile, prob = c( .025, .25, .5, .75, .975 ) )
    segments( 1:4, pred[1,], 1:4, pred[5,] )
    segments( 1:4, pred[2,], 1:4, pred[4,],lwd = 3 )
    points( 1:4, pred[3,], pch = 21, bg = 'white', cex=1.5 )
    points( 1:4, obs, pch = 19, col = 'blue' )
  }

}


#-------------------------------------------------------------------------
# using data
#-------------------------------------------------------------------------
# NOTE: not sure if this is currently relevant, given the stuff that kevin added (Above)

runYes = F
if (runYes) {

  source(file=file.path(dir.R, 'loadData.R'))

  # Define priors
  Priors = rbind(
    c(0,1), # beta_dp
    c(0,1),
    c(0,1),
    c(0,1),
    c(0,1), # beta_k
    c(2,0), # Omega (Correlation matrix)
    c(2,4), # tau (Scale parameters)
    c(2,4),
    c(2,4) # sigma_i
  )

  # define design matrices
  X_k =  cbind(rep(1,nrow(d))) # criterion design matrix
  X_dp = matrix(0,nrow(d),4)  # condition d' design
  for( cond in 1:4 ) {
    X_dp[ data$condition.y==cond, cond] = 1
  }

  # Create list of data as input to Stan
  stan_dat = list(
    Ns = nS$subject,
    Ni = nS$whichItem,
    K_dp = ncol(X_dp),
    K_k = ncol(X_k),
    Y = d$whichResp,
    index_s = d$subject,
    index_i = d$whichItem,
    Correct = d$whichSide,
    X_dp = X_dp,
    X_k = X_k,
    Priors = Priors
  )



  burn = 500 # Burn-in
  niter = 2000 # Number of samples to approximate posterior

  startTime = Sys.time() # To assess run time

  # It is good practice to define your stan script in a
  # separate file with the extension '.stan'.
  fit = stan(file = file.path(dir.stan, 'Hierarchical_SDT_for_2AFC.stan'), data = stan_dat,
             warmup = burn, iter = burn+niter,
             chains = 4)
  post = extract(fit) # Extract posterior samples

  # Total run time of code
  runTime = Sys.time() - startTime
  print( runTime); rm( startTime )

}


# super simple plot of d' values
runYes = F
if (runYes) {
  library(dplyr) # note: dplyr has function 'extract', which masks rstan function'extract'
  dp <- as.data.frame(post$Beta_dp) %>%
    tidyr:::gather(., key = condition, value = dPrime, V1:V4) %>%
    mutate(., condition = plyr::mapvalues(condition, from=c('V1','V2','V3','V4'), to=c('foil','word','cfs','binoc')))

  dp$condition <- factor(dp$condition, levels=c("foil", "word", "cfs","binoc"))


  ggplot(dp, mapping=aes(x = condition, y = dPrime)) +
    geom_point()

}
psadil/cfsBayes documentation built on May 24, 2017, 12:17 p.m.