# R/lpme_DoOneRun.R
#
# Defines functions: lpmec_onerun
#
# Documented in: lpmec_onerun

#' lpmec_onerun
#'
#' Implements analysis for latent variable models with measurement error correction
#'
#' @param Y A vector of observed outcome variables
#' @param observables A matrix of observable indicators used to estimate the latent variable
#' @param observables_groupings A vector specifying groupings for the observable indicators. Default is column names of observables.
#' @param make_observables_groupings Logical. If TRUE, creates dummy variables for each level of the observable indicators. Default is FALSE.
#' @param estimation_method Character specifying the estimation approach. Options include:
#' \itemize{
#' \item "em" (default): Uses expectation-maximization via \code{emIRT} package. Supports both binary (via \code{emIRT::binIRT}) and ordinal (via \code{emIRT::ordIRT}) indicators.
#' \item "pca": First principal component of observables.
#' \item "averaging": Uses feature averaging.
#' \item "mcmc": Markov Chain Monte Carlo estimation using either \code{pscl::ideal} (R backend) or \code{numpyro} (Python backend)
#' \item "mcmc_joint": Joint Bayesian model that simultaneously estimates latent variables and outcome relationship using \code{numpyro}
#' \item "mcmc_overimputation": Two-stage MCMC approach with measurement error correction via over-imputation
#' \item "custom": In this case, latent estimation performed using \code{latent_estimation_fn}.
#' }
#' @param latent_estimation_fn Custom function for estimating latent trait from \code{observables} if \code{estimation_method="custom"} (optional). The function should accept a matrix of observables (rows are observations) and return a numeric vector of length equal to the number of observations.
#' @param mcmc_control A list of parameter specifications used when an MCMC-based estimation method is selected.
#' \describe{
#'   \item{\code{backend}}{Character string indicating the MCMC engine to use.
#'     Valid options are \code{"pscl"} (default, uses the R-based \code{pscl::ideal} function)
#'     or \code{"numpyro"} (uses the Python numpyro package via reticulate).}
#'   \item{\code{n_samples_warmup}}{Integer specifying the number of warm-up (burn-in)
#'     iterations before samples are collected. Default is \code{500}.}
#'   \item{\code{n_samples_mcmc}}{Integer specifying the number of post-warmup MCMC
#'     iterations to retain. Default is \code{1000}.}
#'   \item{\code{chain_method}}{Character string passed to numpyro specifying how to run
#'     multiple chains. Options: \code{"parallel"} (default), \code{"sequential"},
#'     or \code{"vectorized"}.}
#'   \item{\code{n_thin_by}}{Integer indicating the thinning factor for MCMC samples.
#'     Default is \code{1}.}
#'   \item{\code{n_chains}}{Integer specifying the number of parallel MCMC chains to run.
#'     Default is \code{2}.}
#'   \item{\code{batch_size}}{Integer specifying the subsample (batch) size used when the
#'     \code{numpyro} backend subsamples rows of the observables. Default is \code{512}.}
#'   \item{\code{subsample_method}}{Character string controlling the subsampling strategy
#'     for the \code{numpyro} backend. Default is \code{"full"}.}
#' }
#' @param conda_env A character string specifying the name of the conda environment to use
#'   via \code{reticulate}. Default is \code{"lpmec"}.
#' @param conda_env_required A logical indicating whether the specified conda environment
#'   must be strictly used. If \code{TRUE}, an error is thrown if the environment is not found.
#'   Default is \code{FALSE}.
#' @param ordinal Logical indicating whether the observable indicators are ordinal (TRUE) or binary (FALSE).
#'
#' @return A list containing various estimates and statistics:
#' \itemize{
#'   \item \code{ols_coef}: Coefficient from naive OLS regression
#'   \item \code{ols_se}: Standard error of naive OLS coefficient
#'   \item \code{ols_tstat}: T-statistic of naive OLS coefficient
#'   \item \code{iv_coef_a}: IV coefficient using first split as instrument
#'   \item \code{iv_coef_b}: IV coefficient using second split as instrument
#'   \item \code{iv_coef}: Averaged IV coefficient from both splits
#'   \item \code{iv_se}: Standard error of IV regression coefficient
#'   \item \code{iv_tstat}: T-statistic of IV regression coefficient
#'   \item \code{corrected_iv_coef_a}: Corrected IV coefficient using first split as instrument
#'   \item \code{corrected_iv_coef_b}: Corrected IV coefficient using second split as instrument
#'   \item \code{corrected_iv_coef}: Averaged corrected IV coefficient from both splits
#'   \item \code{corrected_iv_se}: Standard error of corrected IV coefficient
#'   \item \code{corrected_iv_tstat}: T-statistic of corrected IV coefficient
#'   \item \code{corrected_ols_coef_a}: Corrected OLS coefficient using first split
#'   \item \code{corrected_ols_coef_b}: Corrected OLS coefficient using second split
#'   \item \code{corrected_ols_coef}: Averaged corrected OLS coefficient from both splits
#'   \item \code{corrected_ols_se}: Standard error of corrected OLS coefficient (currently NA)
#'   \item \code{corrected_ols_tstat}: T-statistic of corrected OLS coefficient (currently NA)
#'   \item \code{corrected_ols_coef_alt}: Alternative corrected OLS coefficient (currently NA)
#'   \item \code{var_est_split}: Estimated variance of the measurement error
#'   \item \code{x_est1}: First set of latent variable estimates
#'   \item \code{x_est2}: Second set of latent variable estimates
#' }
#'
#' @details
#' This function implements a latent variable analysis with measurement error correction.
#' It splits the observable indicators into two sets, estimates latent variables using each set,
#' and then applies various correction methods including OLS correction and instrumental variable approaches.
#'
#' @section Standard Errors:
#' The following standard errors and t-statistics are currently returned as \code{NA} because
#' their analytical derivation is not yet implemented:
#' \itemize{
#'   \item \code{corrected_ols_se}: Standard error for the corrected OLS coefficient
#'   \item \code{corrected_ols_tstat}: T-statistic for the corrected OLS coefficient
#'   \item \code{corrected_ols_coef_alt}: Alternative corrected OLS coefficient
#' }
#' For inference on these quantities, use the bootstrap approach via \code{\link{lpmec}}, which
#' provides valid confidence intervals and standard errors through resampling.
#'
#'
#' @examples
#' \donttest{
#' # Generate some example data
#' set.seed(123)
#' Y <- rnorm(1000)
#' observables <- as.data.frame(matrix(sample(c(0,1), 1000*10, replace = TRUE), ncol = 10))
#'
#' # Run the analysis
#' results <- lpmec_onerun(Y = Y,
#'                         observables = observables)
#'
#' # View the corrected estimates
#' print(results)
#' }
#'
#' @export
#' @importFrom stats lm cor var rnorm coef model.matrix na.omit runif density
#' @importFrom graphics abline
#' @importFrom utils capture.output modifyList
#' @importFrom reticulate "%as%"
#' @importFrom AER ivreg
#' @importFrom pscl rollcall
#' @importFrom sandwich vcovHC
#' @importFrom mvtnorm rmvnorm
#' @importFrom Amelia amelia
#' @importFrom gtools quantcut
#' @importFrom emIRT binIRT ordIRT
#' @importFrom sensemakr extreme_robustness_value

lpmec_onerun <- function(Y,
                          observables,
                          observables_groupings = colnames(observables),
                          make_observables_groupings = FALSE,
                          estimation_method = "em",
                          latent_estimation_fn = NULL,
                          mcmc_control = list(
                            backend = "pscl",  # will override to use NumPyro-based MCMC
                            n_samples_warmup = 500L,
                            n_samples_mcmc   = 1000L,
                            batch_size = 512L,
                            chain_method = "parallel",
                            subsample_method = "full",
                            n_thin_by = 1L,
                            n_chains = 2L),
                          ordinal = FALSE,
                          conda_env = "lpmec",
                          conda_env_required = FALSE){
  # Merge user-provided mcmc_control with defaults
  # This ensures partial user specifications don't cause missing parameter errors
  default_mcmc_control <- list(
    backend = "pscl",
    n_samples_warmup = 500L,
    n_samples_mcmc = 1000L,
    batch_size = 512L,
    chain_method = "parallel",
    subsample_method = "full",
    n_thin_by = 1L,
    n_chains = 2L
  )

  # Warn user if partial mcmc_control was provided
  user_params <- names(mcmc_control)
  default_params <- names(default_mcmc_control)
  if (length(user_params) > 0 && length(user_params) < length(default_params)) {
    missing_params <- setdiff(default_params, user_params)
    message("Note: Partial mcmc_control provided. Using defaults for: ",
            paste(missing_params, collapse = ", "))
  }

  mcmc_control <- modifyList(default_mcmc_control, mcmc_control)

  # ============================================================================

  # INPUT VALIDATION
  # ============================================================================

  # Validate Y
  if (missing(Y) || is.null(Y)) {
    stop("'Y' is required and cannot be NULL.")
  }
  if (!is.numeric(Y)) {
    stop("'Y' must be a numeric vector.")
  }
  if (length(Y) < 10) {
    stop("'Y' must have at least 10 observations. Received: ", length(Y))
  }
  if (all(is.na(Y))) {
    stop("'Y' cannot be all NA values.")
  }

  # Validate observables
  if (missing(observables) || is.null(observables)) {
    stop("'observables' is required and cannot be NULL.")
  }
  if (!is.data.frame(observables) && !is.matrix(observables)) {
    stop("'observables' must be a data.frame or matrix.")
  }
  if (nrow(observables) != length(Y)) {
    stop("Number of rows in 'observables' (", nrow(observables),
         ") must match length of 'Y' (", length(Y), ").")
  }
  if (ncol(observables) < 4) {
    stop("'observables' must have at least 4 columns to allow split-half estimation. Received: ",
         ncol(observables))
  }

  # Validate estimation_method
  valid_methods <- c("em", "pca", "averaging", "mcmc", "mcmc_joint", "mcmc_overimputation", "custom")
  if (!estimation_method %in% valid_methods) {
    stop("'estimation_method' must be one of: ", paste(valid_methods, collapse = ", "),
         ". Received: '", estimation_method, "'")
  }

  # Validate custom estimation function when method is "custom"
  if (estimation_method == "custom") {
    if (is.null(latent_estimation_fn)) {
      stop("'latent_estimation_fn' is required when estimation_method = 'custom'.")
    }
    if (!is.function(latent_estimation_fn)) {
      stop("'latent_estimation_fn' must be a function.")
    }
  }

  # Validate ordinal parameter
  if (!is.logical(ordinal) || length(ordinal) != 1) {
    stop("'ordinal' must be a single logical value (TRUE or FALSE).")
  }

  # Validate mcmc_control parameters
  if (!is.list(mcmc_control)) {
    stop("'mcmc_control' must be a list.")
  }
  if (!mcmc_control$backend %in% c("pscl", "numpyro")) {
    stop("mcmc_control$backend must be either 'pscl' or 'numpyro'. Received: '",
         mcmc_control$backend, "'")
  }
  if (!is.numeric(mcmc_control$n_samples_warmup) || mcmc_control$n_samples_warmup < 1) {
    stop("mcmc_control$n_samples_warmup must be a positive integer.")
  }
  if (!is.numeric(mcmc_control$n_samples_mcmc) || mcmc_control$n_samples_mcmc < 1) {
    stop("mcmc_control$n_samples_mcmc must be a positive integer.")
  }
  if (!is.numeric(mcmc_control$n_chains) || mcmc_control$n_chains < 1) {
    stop("mcmc_control$n_chains must be a positive integer.")
  }

  # Warn about potential issues
  n_unique_obs <- length(unique(observables_groupings))
  if (n_unique_obs < 4) {
    warning("Only ", n_unique_obs, " unique observable groupings found. ",
            "Split-half estimation may be unreliable with fewer than 4 groupings.")
  }

  # Check for excessive missing data
  na_prop <- mean(is.na(as.matrix(observables)))
  if (na_prop > 0.5) {
    warning("More than 50% of values in 'observables' are NA (",
            round(na_prop * 100, 1), "%). Results may be unreliable.")
  }

  # ============================================================================

  # coerce to data.frame
  observables <- as.data.frame( observables )
  
  t0 <- Sys.time()
  INIT_SCALER <- 1/10
  Bayesian_OLSSE_InnerNormed <- Bayesian_OLSCoef_InnerNormed <- NA; 
  Bayesian_OLSSE_OuterNormed <- Bayesian_OLSCoef_OuterNormed <- NA;
  FullBayesianSlope_mean <- FullBayesianSlope_std <- NA
  items.split1_names <- sample(unique(observables_groupings), 
                               size = floor(length(unique(observables_groupings))/2), replace=FALSE)
  items.split2_names <- unique(observables_groupings)[! (observables_groupings %in% items.split1_names)]
  for(split_ in c("", "1", "2")){
    if(split_ == ""){ items.split_ <- 1:length(observables_groupings) }
    if(split_ == "1"){ items.split_ <- (1:length(observables_groupings))[observables_groupings %in% items.split1_names] }
    if(split_ == "2"){ items.split_ <- (1:length(observables_groupings))[observables_groupings %in% items.split2_names] }
    
    # estimating ideal points
    if(make_observables_groupings == FALSE){
      observables_ <- observables[,items.split_]
    }
    if(make_observables_groupings == TRUE){
      observables_ <- do.call(cbind,unlist(apply(observables[,items.split_],2,function(zer){
        list( model.matrix(~0+as.factor(zer))[,-1] )}),recursive = FALSE))
    }
    
    if(estimation_method == "pca"){
        # For PCA, we typically want numeric inputs only
        x_init <- scale(apply( observables_, 1, function(x){ mean(f2n(x), na.rm=TRUE)})) * INIT_SCALER
        observables__ <- apply(as.matrix(observables_), 2, f2n)
        
        # do not zero impute
        #observables__[is.na(observables__)] <- 0
        
        # Run PCA on centered + scaled columns:
        pca_out <- prcomp(observables__, center = TRUE, scale. = TRUE)
        
        # Extract the first principal component
        x.est_ <- pca_out$x[, 1]
        
        # Scale so that it has mean 0, sd 1
        x.est_ <- scale(x.est_)
        
        # (Optional) Flip sign to match an anchor — 
        # compare with whichever initial reference you want, e.g. x_init
        if (x.est_[which.max(x_init)] < 0) {
          x.est_ <- -1 * x.est_
        }
    }
    
    if (estimation_method == "averaging") {
      # Final estimate is just this averaged measure (split-by-split)
      x.est_ <- scale(apply(observables_, 1, function(x) mean(f2n(x), na.rm = TRUE)))
    }
    
    if (estimation_method == "custom") {
      # Final estimate is just this averaged measure (split-by-split)
      x.est_ <- scale( latent_estimation_fn(observables_) )
    }
    
    # first, do emIRT
    if(estimation_method == "em"){
      x_init <- scale(apply( observables_, 1, function(x){ mean(f2n(x), na.rm=TRUE)})) * INIT_SCALER

      if(ordinal){
        observables__ <- apply(as.matrix(observables_),2,f2n)
        if( !all(c(observables__) %in% 1:3) ){
          observables__ <- apply(observables__, 2, function(x){ 
            r <- rank(f2n(x), ties.method = "average")
            group <- ceiling(r / (length(x) / 3))
            return(group)
          })
        }
        if( !all(apply(observables__,2,function(x)length(unique(x))) == 3) ){
          warning("Some observables mapped to binary, not ordinal values -- dropping those.")
          observables__ <- observables__[,which(apply(observables__,2,function(x){length(unique(x))})==3)]
          # apply(observables__,2,table)
          # colSums(apply(observables__,2,table))
          # sum(observables__)
          # summary(c(cor(observables__)[lower.tri(cor(observables__))]))
        }
        observables__[is.na(observables__)] <- 0 # map missing to zero
        
        ## Generate starts and priors for ordered model
        JJ <- ncol(observables__)
        NN <- nrow(observables__)
        
        # starts
        starts <- vector(mode = "list")
        starts$DD <- matrix(rep(0.5,JJ), ncol=1)
        starts$tau <- matrix(rep(-0.5,JJ), ncol=1)
        starts$beta <- matrix(runif(JJ,-1,1), ncol=1) 
        starts$x <- as.matrix(c(x_init))
        
        priors <- list("x" = list(mu = matrix(0,1,1), sigma = matrix(1,1,1) ), 
                      "beta" = list(mu = matrix(0,2,1), sigma = matrix(diag(25,2),2,2)))
        
        capture.output(
          out_emIRT <- try(emIRT::ordIRT(.rc = observables__,
                                  .starts = starts, 
                                  .priors = priors, 
                                  .control = list(
                                    threads = 1,
                                    verbose = FALSE,
                                    thresh = 1e-6,
                                    maxit=3000,
                                    checkfreq=50)
                        ),TRUE)
        )
        if("try-error" %in% class(out_emIRT)){ 
          stop(out_emIRT)
        } 
        
        # Scale the final ideal points and store
        x.est_ <- scale(out_emIRT$means$x)
        if(x.est_[which.max(x_init)] < 0){ x.est_ <- -1*x.est_ }  # anchor - no .anchor_subject arg 
        
        # FORCING FORCING SANITY SANITY
        #x.est_ <- scale(x_init)
        
        x.est_EM <- x.est_
      }
    
      if( !ordinal ){ 
        # informative initialization 
        rc_ <- emIRT::convertRC( pscl::rollcall(observables_) )
        capture.output( out_emIRT <- emIRT::binIRT(.rc = rc_, 
                            .starts = list("alpha" = matrix(rnorm(ncol(observables_), sd = .1)),
                                           "beta" = matrix(rnorm(ncol(observables_), sd = 1)),
                                           "x" = matrix(x_init)), 
                            .priors = emIRT::makePriors(.N= rc_$n, .J = rc_$m, .D = 1), 
                            .control= list(threads=1, verbose=FALSE, thresh=1e-6,verbose=FALSE),
                            .anchor_subject = which.max(x_init)
                            ) ) 
        x.est_EM <- x.est_ <- scale(out_emIRT$means$x); 
      }
    }
    
    if( grepl(estimation_method, pattern = "mcmc") & mcmc_control$backend == "pscl" ){
      if(estimation_method == "mcmc_joint"){stop('Must use numpyro with option: estimation_method="mcmc_joint"')}
      if(estimation_method == "mcmc" ){
        t0_ <- Sys.time()
        startval_ <- rowMeans( observables_ )
        maxiter_ <- mcmc_control$n_samples_mcmc + mcmc_control$n_samples_warmup
        burnin_ <- mcmc_control$n_samples_warmup
        thin_ <- mcmc_control$n_thin_by
        pscl_input_ <- pscl::rollcall(observables_)
        capture.output( 
          pscl_ideal <- pscl::ideal( pscl_input_, 
                            normalize = TRUE, 
                            store.item = TRUE, 
                            startvals = list("x" = startval_ ),
                            maxiter = maxiter_, 
                            burnin = burnin_,
                            thin = thin_)
        )
        message(sprintf("\n MCMC Runtime: %.3f min",  tdiff_ <- as.numeric(difftime(Sys.time(),  t0_, units = "secs"))/60))
        # mean(pscl_ideal$xbar); sd(pscl_ideal$xbar) # confirm 0 and 1 
        x.est_MCMC <- x.est_ <- pscl_ideal$xbar; s_past <- 1 # summary(lm(Y~x.est_))
        if( split_ == "" ){
          RescaledAbilities_OuterNormed  <- t(pscl_ideal$x[,,1])
          #Bayesian_OLSCoef_OuterNormed <- apply(RescaledAbilities_OuterNormed, 2, function(x_){ coef(lm(Y~x_))[2]}) # simple MOC 
          Bayesian_OLSCoef_OuterNormed <- apply(RescaledAbilities_OuterNormed, 2, function(x_){ 
            VCovHat <- sandwich::vcovHC( myModel <- lm(Y~x_), type = "HC3" ) 
            coef_ <- mvtnorm::rmvnorm(n = 1, mean = coef(myModel), sigma = VCovHat)[1,2]
          } ) # complicated MOC 
          
          # summary( apply(RescaledAbilities_outer, 2, sd) ) 
          # sd(rowMeans(RescaledAbilities_outer));mean(rowMeans(RescaledAbilities_outer)) # confirm sanity value of 1, 0
          # hist( Bayesian_OLSCoef_OuterNormed ); summary( Bayesian_OLSCoef_OuterNormed)
          Bayesian_OLSSE_OuterNormed <- sd( Bayesian_OLSCoef_OuterNormed ) 
          Bayesian_OLSCoef_OuterNormed <- mean( Bayesian_OLSCoef_OuterNormed )
          
          # InnerNormed
          RescaledAbilities_InnerNormed  <- ( apply(t(pscl_ideal$x[,,1]), 2, function(x_){scale(x_)})  ) 
          Bayesian_OLSCoef_InnerNormed <- apply(RescaledAbilities_InnerNormed, 2, function(x_){ 
            VCovHat <- sandwich::vcovHC( myModel <- lm(Y~x_), type = "HC3" ) 
            coef_ <- mvtnorm::rmvnorm(n = 1, mean = coef(myModel), sigma = VCovHat)[1,2]
          } ) # complicated MOC 
          # apply(RescaledAbilities, 2, sd)
          # sd(rowMeans(RescaledAbilities));mean(rowMeans(RescaledAbilities)) # confirm sanity value of 1, 0
          # hist( Bayesian_OLSCoef_InnerNormed ); summary( Bayesian_OLSCoef_InnerNormed )
          Bayesian_OLSSE_InnerNormed <- sd( Bayesian_OLSCoef_InnerNormed ) 
          Bayesian_OLSCoef_InnerNormed <- mean( Bayesian_OLSCoef_InnerNormed )
        }
      }
      if(estimation_method == "mcmc_overimputation" & split_ == ""){
        t0_ <- Sys.time()
        startval_ <- rowMeans( observables_ )
        maxiter_ <- mcmc_control$n_samples_mcmc + mcmc_control$n_samples_warmup
        burnin_ <- mcmc_control$n_samples_warmup
        thin_ <- mcmc_control$n_thin_by
        pscl_input_ <- pscl::rollcall(observables_)
        capture.output( 
          pscl_ideal <- pscl::ideal( pscl_input_, 
                                     normalize = TRUE, 
                                     store.item = TRUE, 
                                     startvals = list("x" = startval_ ),
                                     maxiter = maxiter_, 
                                     burnin = burnin_,
                                     thin = thin_)
        )
        # mean(pscl_ideal$xbar); sd(pscl_ideal$xbar) # confirm 0 and 1 
        x.est_MCMC <- x.est_ <- pscl_ideal$xbar; s_past <- 1 
        if( split_ == "" ){
            for(outType_ in c("Outer","Inner")){ 
              # file:///Users/cjerzak/Dropbox/LatentMeasures/literature/CAUGHEY-ps8-solution.html
              if(outType_ == "Outer"){
                RescaledAbilities  <- t(pscl_ideal$x[,,1])
              }
              if(outType_ == "Inner"){
                RescaledAbilities  <- t(pscl_ideal$x[,,1])
                RescaledAbilities  <- ( apply(RescaledAbilities, 2, function(x_){scale(x_)})  ) 
              }
              Xobs_mean <- apply(RescaledAbilities, 1, function(x_){ mean(x_) } ) 
              Xobs_SE <- apply(RescaledAbilities, 1, function(x_){ sd(x_) } ) 
              
              dat_ <- cbind(Y, x.est_MCMC)
              
              # Specify (over-)imputation model
              # priors: #a numeric matrix with four columns 
              # (row, column, mean, standard deviation) 
              # indicating noisy estimates of the values to be imputed.
              outcome_priors <- cbind(
                1:nrow(dat_), 1,              
                Y, # mean 
                1 # sd 
              )
              policy_priors <- cbind( 
                1:nrow(dat_), 2,              
                Xobs_mean,
                Xobs_SE
              )
              #priors <- rbind(policy_priors, outcome_priors)
              priors <- policy_priors
              
              #a numeric matrix where each row indicates a row and column of x to be overimputed.
              # overimp <- rbind(cbind(1:nrow(dat_), 1), cbind(1:nrow(dat_), 2))
              overimp <- cbind(1:nrow(dat_), 2)
              
              # perform overimputation 
              overimputed_data <- Amelia::amelia( 
                x = dat_, 
                m = (nOverImpute <- 5), # default 
                p2s = 0,
                priors = priors, 
                overimp = overimp, 
                parallel = "no")
              
              # Perform multiple overimputation
              overimputed_Y <- do.call(cbind,lapply(overimputed_data$imputations,function(l_){l_[,1]}))
              overimputed_x.est_MCMC <- do.call(cbind,lapply(overimputed_data$imputations,function(l_){l_[,2]}))
              if(outType_ == "Outer"){
                overimputed_x.est_MCMC  <- (overimputed_x.est_MCMC-mean(rowMeans(overimputed_x.est_MCMC)))/sd(rowMeans(overimputed_x.est_MCMC))
                #sd(rowMeans(overimputed_x.est_MCMC))
              }
              if(outType_ == "Inner"){
                overimputed_x.est_MCMC  <- ( apply(overimputed_x.est_MCMC, 2, function(x_){scale(x_)})  ) 
                #apply(overimputed_x.est_MCMC,2,sd)
              }
              # cor(cbind(x.est_MCMC, overimputed_x.est_MCMC))

              # Analyze overimputed datasets
              overimputed_coefs <- unlist(unlist( sapply(1:nOverImpute, function(s_){
                coef(lm(overimputed_Y[,s_] ~ overimputed_x.est_MCMC[,s_]))[2]
              })))
              if(outType_ == "Outer"){ 
                Bayesian_OLSSE_OuterNormed <- sd( overimputed_coefs )
                Bayesian_OLSCoef_OuterNormed <- mean( overimputed_coefs )
              }
              if(outType_ == "Inner"){ 
                Bayesian_OLSSE_InnerNormed <- sd( overimputed_coefs )
                Bayesian_OLSCoef_InnerNormed <- mean( overimputed_coefs )
              }
            }
        }
        message(sprintf("\n Overimputation Runtime: %.3f min",  tdiff_ <- as.numeric(difftime(Sys.time(),  t0_, units = "secs"))/60))
      }
      if(estimation_method == "mcmc_joint" & split_ == ""){
        ## ?? 
      }
    }
    if( grepl(estimation_method, pattern = "mcmc") & mcmc_control$backend == "numpyro" ){
        if(!"jax" %in% ls(envir = lpmec_env)){ initialize_jax(conda_env, conda_env_required) }
        
        # Construct for annotating conditionally independent variables.
        # Within a plate context manager, sample sites will be automatically broadcasted to the size of the plate. 
        # Additionally, a scale factor might be applied by certain inference algorithms
        # if subsample_size is specified.
        
        # Set up MCMC
        lpmec_env$numpyro$set_host_device_count( mcmc_control$n_chains )
        N <- ai(nrow(observables_))
        K <- ai(ncol(observables_))
        
        # Define the two-parameter IRT model using Matt's trick + subsampling
        # Note: IRTModel_batch is deprecated 
        IRTModel_batch <- function(X, Y) {
            # Number of observations (rows) and items (columns)
            N <- X$shape[[1]]
            K <- X$shape[[2]]
            
            # Global hyperpriors for ability (non-centered)
            mu_ability <- lpmec_env$numpyro$sample("mu_ability",
                                                  lpmec_env$dist$Normal(0, 1))
            sigma_ability <- lpmec_env$numpyro$sample("sigma_ability",
                                                     lpmec_env$dist$HalfNormal(1))
            with(lpmec_env$numpyro$plate("rows", N), {
              eps_ability <- lpmec_env$numpyro$sample("eps_ability",
                                                     lpmec_env$dist$Normal(0, 1))
              ability <- lpmec_env$numpyro$deterministic(
                "ability", (mu_ability + sigma_ability * eps_ability)[,NULL] )
            })
            
            # Hyperpriors for item parameters
            mu_difficulty <- lpmec_env$numpyro$sample("mu_difficulty",
                                                     lpmec_env$dist$Normal(0, 3))
            sigma_difficulty <- lpmec_env$numpyro$sample("sigma_difficulty",
                                                        lpmec_env$dist$HalfNormal(3))
            mu_log_discrimination <- lpmec_env$numpyro$sample("mu_log_discrimination",
                                                             lpmec_env$dist$Normal(0.5, 1))
            sigma_log_discrimination <- lpmec_env$numpyro$sample("sigma_log_discrimination",
                                                                lpmec_env$dist$HalfNormal(0.5))
            with(lpmec_env$numpyro$plate("columns", K), {
              eps_difficulty <- lpmec_env$numpyro$sample("eps_difficulty",
                                                        lpmec_env$dist$Normal(0, 3))
              difficulty <- lpmec_env$numpyro$deterministic(
                "difficulty", (mu_difficulty + sigma_difficulty * eps_difficulty)[NULL,] )
              
              eps_log_discrimination <- lpmec_env$numpyro$sample("eps_log_discrimination",
                                                      lpmec_env$dist$Normal(0, 1))
              log_discrimination <- mu_log_discrimination + sigma_log_discrimination * eps_log_discrimination
              discrimination <- lpmec_env$numpyro$deterministic(
                "discrimination", lpmec_env$jax$nn$softplus(log_discrimination)[NULL,] )
            })
            
            # Define a local likelihood function for the *subset* of rows
            local_lik_fn <- function(){
              with(lpmec_env$numpyro$plate(
                "rows_subsample",
                size = N,
                subsample_size = ai(mcmc_control$batch_size),
                dim = -2L
              ) %as% "idx", {
                # Subset 
                ability_sub <- lpmec_env$jnp$take(ability, idx, axis = 0L)
                
                # Observed subset of X
                X_sub <- lpmec_env$jnp$take(X, indices = idx, axis = 0L)
                
                # Construct logit for subset
                logits_sub <- (ability_sub - difficulty) * discrimination
                
                # Add columns plate around the sample statement
                with(lpmec_env$numpyro$plate("columns", K, dim = -1L), {
                  lpmec_env$numpyro$sample(
                    "Xlik_sub",
                    lpmec_env$dist$Bernoulli(logits = logits_sub),
                    obs = X_sub )
                })
                
                # If doing a full regression on Y:
                if (estimation_method == "mcmc_joint") {
                  Y_intercept <- lpmec_env$numpyro$sample("YModel_intercept",
                                                         lpmec_env$dist$Normal(0, 1))
                  Y_slope <- lpmec_env$numpyro$sample("YModel_slope",
                                                     lpmec_env$dist$Normal(0, 1))
                  Y_sigma <- lpmec_env$numpyro$sample("YModel_sigma",
                                                     lpmec_env$dist$HalfNormal(1))
                  
                  Y_sub <- lpmec_env$jnp$take(Y, idx)
                  Y_mu_sub <- Y_intercept + Y_slope * ability_sub
                  lpmec_env$numpyro$sample(
                    "Ylik_sub",
                    lpmec_env$dist$Normal(Y_mu_sub, Y_sigma),
                    obs = Y_sub
                  )
                }
              })
            }
            
            # Scale the local likelihood by (N / batch_size) for unbiased gradient
            scaled_lik <- lpmec_env$numpyro$handlers$scale(
              scale = as.numeric(N / mcmc_control$batch_size))(local_lik_fn)
            
            # Execute the scaled likelihood
            #scaled_lik() # documentation doesn't seem to scale?
            local_lik_fn()
          }
        
        # Define the two-parameter IRT model using Matt's trick 
        # ("Matt's trick" = non-centered parameterization: sample a standard
        # normal eps and form mu + sigma * eps, improving HMC geometry).
        # This NumPyro model (built through reticulate via `lpmec_env`) defines:
        #   - row plate ("rows", dim -2): per-observation latent ability
        #   - column plate ("columns", dim -1): per-item difficulty and
        #     (positivity-constrained) discrimination
        #   - Bernoulli-logit likelihood for the binary indicator matrix X
        #   - optionally (estimation_method == "mcmc_joint") a Gaussian
        #     outcome model Y ~ Normal(intercept + slope * ability, sigma)
        # X: binary indicator matrix (N x D jnp array); Y: outcome vector.
        IRTModel_full <- (function(X, # binary indicators 
                                   Y  # outcome 
        ){
            
            # Global hyperpriors for ability
            mu_ability <- lpmec_env$numpyro$sample("mu_ability",
                                                  lpmec_env$dist$Normal(0, 1))
            sigma_ability <- lpmec_env$numpyro$sample("sigma_ability",
                                                     lpmec_env$dist$HalfNormal(0.5))
            
            # Non-centered parameterization for ability
            with(lpmec_env$numpyro$plate("rows", X$shape[[1]], dim = -2L), { # N
              eps_ability <- lpmec_env$numpyro$sample("eps_ability",
                                                     lpmec_env$dist$Normal(0, 1))
              # Registered as deterministic so posterior draws of "ability"
              # are available from sampler$get_samples().
              ability <- lpmec_env$numpyro$deterministic("ability", 
                                 (mu_ability + sigma_ability * eps_ability) )
            })
            
            # If the user gave us an anchor_parameter_id, we convert it to 0-based
            # for Python indexing. We will forcibly ensure difficulty[anchor_id] is > 0.
            anchor_id <- NULL
            if (!is.null(mcmc_control$anchor_parameter_id)) {
              anchor_id <- as.integer(mcmc_control$anchor_parameter_id - 1L)  # R is 1-based, NumPyro is 0-based
            }
            
            # For difficulty, you might keep it simple or also do a non-centered param:
            # NOTE(review): the four hyperpriors below (mu_difficulty,
            # sigma_difficulty, mu_log_discrimination, sigma_log_discrimination)
            # are sampled but only used by the commented-out "centered
            # parameterization" lines further down; with the active code they
            # add sample sites without affecting difficulty/discrimination.
            mu_difficulty <- lpmec_env$numpyro$sample("mu_difficulty",
                                                     lpmec_env$dist$Normal(0, 2))
            sigma_difficulty <- lpmec_env$numpyro$sample("sigma_difficulty",
                                                        lpmec_env$dist$HalfNormal(1))
            mu_log_discrimination <- lpmec_env$numpyro$sample("mu_log_discrimination",
                                                             lpmec_env$dist$Normal(0.5, 1))
            sigma_log_discrimination <- lpmec_env$numpyro$sample("sigma_log_discrimination",
                                                                lpmec_env$dist$HalfNormal(0.5))
            with(lpmec_env$numpyro$plate("columns", X$shape[[2]], dim = -1L), {  # D
              difficulty_raw <- lpmec_env$numpyro$sample("eps_difficulty",
                                                        lpmec_env$dist$Normal(0, 1))
              
              # If anchor_id was specified, force difficulty for that item to be positive
              difficulty_adjusted <- difficulty_raw
              if( !is.null(anchor_id) ){
                # Use at update to ensure the anchor difficulty is strictly positive
                # (JAX arrays are immutable; $at[i]$set(...) is the functional update).
                difficulty_adjusted <- difficulty_raw$at[anchor_id]$set(
                  lpmec_env$jax$nn$softplus(difficulty_raw[anchor_id])
                )
              }
              # [NULL,] adds a leading axis so difficulty broadcasts against
              # the (N, 1)-shaped ability in the logit expression below.
              difficulty <- lpmec_env$numpyro$deterministic("difficulty", 
                                         (difficulty_adjusted)[NULL,]) # if NOT using centered parameterization
                                        #(mu_difficulty + sigma_difficulty * difficulty_adjusted)[NULL,]) # if using centered parameterization

              # discrimination <- lpmec_env$numpyro$sample("discrimination", lpmec_env$dist$HalfNormal(2))
              eps_log_discrimination <- lpmec_env$numpyro$sample("eps_log_discrimination", 
                                                                lpmec_env$dist$Normal(0.5, 2))
              # softplus keeps discrimination strictly positive (sign identification).
              discrimination <- lpmec_env$numpyro$deterministic("discrimination", 
                                        lpmec_env$jax$nn$softplus(eps_log_discrimination)[NULL,]) # if NOT using centered parameterization
                                        #lpmec_env$jax$nn$softplus(mu_log_discrimination + sigma_log_discrimination * eps_log_discrimination)[NULL,]) # if using centered parameterization
            })
            
            # Construct logits using the new 'ability' & 'difficulty'
            # note: discrimination constrained to be positive 
            # Broadcasts (N, 1) ability against (1, D) difficulty/discrimination
            # to give an N x D logit matrix.
            Xprobs <- ( ability - difficulty ) * discrimination
            
            # Likelihood for X using a Bernoulli logit, wrapped in plates
            #lpmec_env$numpyro$sample("Xlik", lpmec_env$dist$Bernoulli(logits=Xprobs), obs=X)
            with(lpmec_env$numpyro$plate("rows", X$shape[[1]], dim = -2L), {
              with(lpmec_env$numpyro$plate("columns", X$shape[[2]], dim = -1L), {
                lpmec_env$numpyro$sample("Xlik", lpmec_env$dist$Bernoulli(logits = Xprobs), obs = X)
              }) })
            
            # If you are using the outcome portion in "mcmc_joint" mode:
            # (joint model: latent ability and the Y regression are estimated
            # simultaneously, so outcome uncertainty feeds back into ability).
            if(estimation_method == "mcmc_joint"){
              Y_intercept <- lpmec_env$numpyro$sample("YModel_intercept",
                                                     lpmec_env$dist$Normal(0, 1))
              Y_slope <- lpmec_env$numpyro$sample("YModel_slope",
                                                 lpmec_env$dist$Normal(0, 1))
              Y_sigma <- lpmec_env$numpyro$sample("YModel_sigma",
                                                 lpmec_env$dist$HalfNormal(1))
              
              Y_mu <- Y_intercept + Y_slope * ability
              lpmec_env$numpyro$sample("Ylik",
                                      lpmec_env$dist$Normal(Y_mu, Y_sigma),
                                      obs=Y)
            }
          })
        
      # setup & run a MCMC run
      # Kernel choice depends on mcmc_control$subsample_method:
      #   "batch" -> energy-conserving subsampling HMC (HMCECS) around NUTS,
      #   "full"  -> plain NUTS on the full-data model.
      if(mcmc_control$subsample_method == "batch"){ 
        message("Enlisting HMCECS kernels...")
        # https://num.pyro.ai/en/stable/mcmc.html#numpyro.infer.hmc_gibbs.HMCECS
        kernel <- lpmec_env$numpyro$infer$HMCECS(lpmec_env$numpyro$infer$NUTS(IRTModel_batch), 
                                                num_blocks = 4L)
                                                #num_blocks = ai(ceiling(N/mcmc_control$batch_size)))
        #Batch size controls computational cost and the variance of the log-likelihood estimate.
        #Block size (via num_blocks) controls the proposal variance for subsample updates (smaller blocks → gentler changes → higher acceptance rates).
      }
      if(mcmc_control$subsample_method == "full"){ 
        message("Enlisting NUTS kernels...")
        kernel <- lpmec_env$numpyro$infer$NUTS(IRTModel_full, 
                                              max_tree_depth = ai(8), # 10 is default, slows performance down considerably
                                              target_accept_prob = 0.85 ) 
      }
      # NOTE(review): if subsample_method is neither "batch" nor "full",
      # `kernel` is never assigned and the MCMC() call below will error —
      # presumably validated upstream; confirm.
      sampler <- lpmec_env$numpyro$infer$MCMC(
        kernel,
        num_warmup = mcmc_control$n_samples_warmup,
        num_samples = mcmc_control$n_samples_mcmc,
        thinning = mcmc_control$n_thin_by, # Positive integer that controls the fraction of post-warmup samples that are retained. For example if thinning is 2 then every other sample is retained. Defaults to 1, i.e. no thinning.
        chain_method = mcmc_control$chain_method, # 'parallel' (default), 'sequential', 'vectorized'. 
        num_chains = mcmc_control$n_chains,
        jit_model_args = TRUE, 
        progress_bar = TRUE # set to TRUE for progress 
      )
      
      # run sampler with initialized abilities as COLMEANS of X (ASSUMPTION!)
      # pdtype_/ddtype_ fix the parameter/data dtypes; x64 is disabled to
      # match float32 (the float64/float16 alternatives are kept commented).
      pdtype_ <- ddtype_ <- lpmec_env$jnp$float32; lpmec_env$jax$config$update("jax_enable_x64", FALSE) 
      #pdtype_ <- ddtype_ <- lpmec_env$jnp$float64; lpmec_env$jax$config$update("jax_enable_x64", TRUE) 
      #pdtype_ <- ddtype_ <- lpmec_env$jnp$float16; lpmec_env$jax$config$update("jax_enable_x64", FALSE) 
      
      # set initial parameters 
      # IMPORTANT: the inline assignment `x_init <- ...` below is a side effect
      # that ExtractAbil() later uses to pick the sign-anchoring observation —
      # do not remove it even though init_params is currently commented out.
      UseEMInits <- FALSE
      ability_init <- lpmec_env$jnp$broadcast_to(lpmec_env$jnp$array(x_init<- (scale(rowMeans(observables_))*INIT_SCALER))$astype(pdtype_), list(mcmc_control$n_chains, N, 1L))
      if(UseEMInits){ ability_init <- lpmec_env$jnp$broadcast_to(lpmec_env$jnp$array( x_init<- scale(rowMeans(observables_))*INIT_SCALER )$astype(pdtype_), list(mcmc_control$n_chains, N, 1L)) } 
      
      # Random difficulty starts, one row per chain; EM-based starts optional.
      difficulty_init <- lpmec_env$jnp$array(matrix(rnorm(K*mcmc_control$n_chains,mean=0,sd=1/sqrt(K)),nrow=mcmc_control$n_chains) )$astype(pdtype_)
      if(UseEMInits){ difficulty_init <-  lpmec_env$jnp$broadcast_to(lpmec_env$jnp$array( out_emIRT$means$beta[,1]*INIT_SCALER )$astype(pdtype_), list(mcmc_control$n_chains, K)) }
      
      discrimination_init <- lpmec_env$jnp$array(  matrix( (rnorm(K*mcmc_control$n_chains,mean=1,sd=1/sqrt(K))),nrow=mcmc_control$n_chains))$astype(pdtype_)
      if(UseEMInits){ discrimination_init <-  lpmec_env$jnp$broadcast_to(lpmec_env$jnp$array( out_emIRT$means$beta[,2] *INIT_SCALER )$astype(pdtype_),list(mcmc_control$n_chains, K)) }
      
      # run sampler 
      # PRNG key is drawn from R's RNG so runs respect set.seed() upstream.
      t0_ <- Sys.time()
      sampler$run(lpmec_env$jax$random$PRNGKey( ai(runif(1,0,10000)) ), 
                  X = lpmec_env$jnp$array(as.matrix(observables_))$astype( ddtype_ ),  # note: lpmec_env$jnp$int16 here causes error (expects floats not ints)
                  Y = lpmec_env$jnp$array(as.matrix(Y))$astype( ddtype_ ))
                  #init_params = list(
                  #  "ability" = ability_init,
                  #  "difficulty" = difficulty_init,
                  #  "discrimination" =  discrimination_init ) ) 
      # group_by_chain = TRUE keeps a leading chain dimension on every site.
      PosteriorDraws <- sampler$get_samples(group_by_chain = TRUE)
      # PosteriorDraws$ability$shape; PosteriorDraws$ability[1,,1]
      # Flatten a posterior ability array into an N x (chains * draws) matrix
      # and align the sign of every draw.
      #
      # @param abil A NumPyro posterior sample object (converted via
      #   lpmec_env$np$array); assumed shape (n_chains, n_draws, N, 1) —
      #   TODO confirm against sampler output.
      # @return An N x (n_chains * n_draws) numeric matrix whose columns
      #   (posterior draws) are sign-flipped so the anchor observation —
      #   the unit with the largest initialization value, which.max(x_init) —
      #   is non-negative in every draw. This resolves the sign invariance
      #   of the IRT likelihood across chains/draws.
      ExtractAbil <- function(abil){
        # One (draws x N) slice per chain, transposed so rows index
        # observations; lapply() + do.call(cbind, ...) replaces the original
        # sapply()-wrapping-list() construct, which depended on sapply's
        # fragile simplification rules. seq_len() is safe for any count.
        abil <- do.call(cbind, lapply(seq_len(mcmc_control$n_chains), function(c_){
          abil_c <- as.matrix(lpmec_env$np$array(abil)[c_,,,])
          t(abil_c)
        }))
        # Anchor on the unit with the largest initial ability estimate
        # (x_init is set as a side effect earlier in lpmec_onerun).
        theAnchor <- which.max(x_init)
        # Vectorized sign flip: multiply each column by -1 iff the anchor
        # entry of that draw is negative (sweep() avoids a per-column loop).
        flip <- ifelse(abil[theAnchor, ] < 0, -1, 1)
        sweep(abil, 2L, flip, `*`)
      }

      # summary( lm(Y~scale(AbilityMean) ) )
      # plot( as.matrix( lpmec_env$np$array( PosteriorDraws$discrimination_oriented ) )[1,,1])
      # plot( as.matrix( lpmec_env$np$array( PosteriorDraws$discrimination ) )[1,,1])
      # plot( as.matrix( lpmec_env$np$array( PosteriorDraws$discrimination ) )[1,,2])
      
      # Calculate posterior means
      # ExtractAbil(PosteriorDraws$ability)
      # plot(ExtractAbil(PosteriorDraws$ability)[1,],main="i=1")
      # plot(ExtractAbil(PosteriorDraws$ability)[2,],main="i=1")
      # Posterior mean ability per observation, after sign alignment.
      AbilityMean <- rowMeans(  ExtractAbil(PosteriorDraws$ability) )
      # Mean over chain and draw axes (0 and 1) of the difficulty site.
      DifficultyMean <- as.matrix(lpmec_env$np$array(lpmec_env$jnp$mean(PosteriorDraws$difficulty,0L:1L))) #  colMeans( as.matrix(lpmec_env$np$array(PosteriorDraws$difficulty)) )
      
      # plot(scale(x.true[i_sampled]),scale(AbilityMean))
      # cor(x.true[i_sampled],AbilityMean)
      # plot(as.array(lpmec_env$np$array(PosteriorDraws$ability))[1,721,])
      # Runtime and effective-sample-size diagnostics (reported as a % of
      # the total retained draws).
      message(sprintf("\n MCMC Runtime: %.3f min",  tdiff_ <- as.numeric(difftime(Sys.time(),  t0_, units = "secs"))/60))
      message(sprintf("Mean(N-eff of nMCMC %%): %.2f%% \n", 100*mean(
              lpmec_env$numpyro$diagnostics$effective_sample_size(# Computes effective sample size of input x, where the first dimension of x is chain dimension and the second dimension of x is draw dimension.
                lpmec_env$jnp$reshape( PosteriorDraws$ability, list(mcmc_control$n_chains, ai(mcmc_control$n_samples_mcmc/mcmc_control$n_thin_by), N))
                ), na.rm=T)/(ai(mcmc_control$n_chains*mcmc_control$n_samples_mcmc/mcmc_control$n_thin_by) ) ))
      # NOTE(review): unconditional plot() side effect; consider gating
      # behind a verbose/diagnostics flag.
      plot(lpmec_env$np$array(PosteriorDraws$ability[1,,1,1]))
      # NOTE(review): scalar conditions below use elementwise `&`; `&&`
      # would be the idiomatic scalar operator here and in the two
      # analogous branches further down.
      if(estimation_method == "mcmc" & split_ == ""){ # method of compositions 
        # OuterNormed - this is what ideal does 
        # "Outer": normalize all draws by the mean/sd of the posterior-mean
        # abilities; "Inner" (below): standardize each draw separately.
        RescaledAbilities_OuterNormed  <- (ExtractAbil(PosteriorDraws$ability)-mean(AbilityMean))/sd(AbilityMean)
        #Bayesian_OLSCoef_OuterNormed <- apply(RescaledAbilities, 2, function(x_){ coef(lm(Y~x_))[2]}) # simple MOC 
        # Method of compositions: per posterior draw, fit OLS with HC3
        # robust vcov and draw one coefficient vector from its sampling
        # distribution; [1,2] takes the slope from the single drawn row.
        Bayesian_OLSCoef_OuterNormed <- apply(RescaledAbilities_OuterNormed, 2, function(x_){ 
          VCovHat <- sandwich::vcovHC( myModel <- lm(Y~x_), type = "HC3" ) 
          coef_ <- mvtnorm::rmvnorm(n = 1, mean = coef(myModel), sigma = VCovHat)[1,2]
          } ) # complicated MOC 
        
        # summary( apply(RescaledAbilities_OuterNormed, 2, sd) ) 
        # sd(rowMeans(RescaledAbilities_OuterNormed));mean(rowMeans(RescaledAbilities_OuterNormed)) # confirm sanity value of 1, 0
        # hist( Bayesian_OLSCoef_OuterNormed ); summary( Bayesian_OLSCoef_OuterNormed)
        # Summarize draws: SE first (before the coefficient vector is
        # overwritten by its mean on the next line).
        Bayesian_OLSSE_OuterNormed <- sd( Bayesian_OLSCoef_OuterNormed ) 
        Bayesian_OLSCoef_OuterNormed <- mean( Bayesian_OLSCoef_OuterNormed )
        
        # InnerNormed
        RescaledAbilities_InnerNormed  <- ( apply(ExtractAbil(PosteriorDraws$ability), 2, function(x_){scale(x_)})  ) 
        #Bayesian_OLSCoef_InnerNormed <- apply(RescaledAbilities_InnerNormed, 2, function(x_){ coef(lm(Y~x_))[2]}) # simple MOC 
        Bayesian_OLSCoef_InnerNormed <- apply(RescaledAbilities_InnerNormed, 2, function(x_){ 
          VCovHat <- sandwich::vcovHC( myModel <- lm(Y~x_), type = "HC3" ) 
          coef_ <- mvtnorm::rmvnorm(n = 1, mean = coef(myModel), sigma = VCovHat)[1,2]
        } ) # complicated MOC 
        # apply(RescaledAbilities_InnerNormed, 2, sd)
        # sd(rowMeans(RescaledAbilities_InnerNormed));mean(rowMeans(RescaledAbilities_InnerNormed)) # confirm sanity value of 1, 0
        # hist( Bayesian_OLSCoef_InnerNormed ); summary( Bayesian_OLSCoef_InnerNormed )
        Bayesian_OLSSE_InnerNormed <- sd( Bayesian_OLSCoef_InnerNormed ) 
        Bayesian_OLSCoef_InnerNormed <- mean( Bayesian_OLSCoef_InnerNormed )
      }
      if(estimation_method == "mcmc_joint" & split_ == ""){ # full bayesian model 
        #RescaledAbilities <- (ExtractAbil(PosteriorDraws$ability)-mean(AbilityMean))/sd(AbilityMean)

        # note: multiplication of coefficient by sd generates interpretation of coeff 
        # as representing change in outcome associated with 1 unit deviation
        # Joint model: slope draws come directly from the "YModel_slope"
        # posterior site; rescale so the slope refers to a 1-SD ability shift.
        Bayesian_OLSCoef_OuterNormed <- c(as.matrix(lpmec_env$np$array(PosteriorDraws$YModel_slope))) * sd(AbilityMean)
        # sd(rowMeans(RescaledAbilities));mean(rowMeans(RescaledAbilities)) # confirm sanity values of 1,0 
        # hist(Bayesian_OLSCoef_OuterNormed);abline(v=0.4); summary( Bayesian_OLSCoef_OuterNormed )
        #Bayesian_OLSCoef_OuterNormed[Bayesian_OLSCoef_OuterNormed<0] <- -1*Bayesian_OLSCoef_OuterNormed[Bayesian_OLSCoef_OuterNormed<0]
        Bayesian_OLSSE_OuterNormed <- sd( Bayesian_OLSCoef_OuterNormed ) 
        Bayesian_OLSCoef_OuterNormed <- mean( Bayesian_OLSCoef_OuterNormed )
        
        # Inner variant: scale each slope draw by its own draw's ability SD.
        InnerSDs <- apply(ExtractAbil(PosteriorDraws$ability), 2, sd)
        Bayesian_OLSCoef_InnerNormed <- c(as.matrix(lpmec_env$np$array(PosteriorDraws$YModel_slope))) * InnerSDs
        # sd(rowMeans(RescaledAbilities));mean(rowMeans(RescaledAbilities)) # confirm sanity values of 1,0 
        # hist(Bayesian_OLSCoef_InnerNormed); abline(v=0.4,lwd=2); summary( Bayesian_OLSCoef_InnerNormed )
        Bayesian_OLSSE_InnerNormed <- sd( Bayesian_OLSCoef_InnerNormed ) 
        Bayesian_OLSCoef_InnerNormed <- mean( Bayesian_OLSCoef_InnerNormed )
      }
      
      # rescale 
      # x.est_ is the per-split point estimate consumed after this if-block;
      # x.est_MCMC additionally feeds the over-imputation data matrix below.
      x.est_MCMC <- x.est_ <- as.matrix(scale(AbilityMean)); s_past <- 1
      if(estimation_method == "mcmc_overimputation" & split_ == ""){ 
        for(outType_ in c("Outer","Inner")){ 
          # file:///Users/cjerzak/Dropbox/LatentMeasures/literature/CAUGHEY-ps8-solution.html
          if(outType_ == "Outer"){
            RescaledAbilities  <- (ExtractAbil(PosteriorDraws$ability)-mean(AbilityMean))/sd(AbilityMean)
          }
          if(outType_ == "Inner"){
            RescaledAbilities  <- ( apply(ExtractAbil(PosteriorDraws$ability), 2, function(x_){scale(x_)})  ) 
          }
          # Per-observation posterior mean and SD of the rescaled abilities;
          # these become the measurement-error priors for Amelia.
          Xobs_mean <- apply(RescaledAbilities, 1, function(x_){ mean(x_) } ) 
          Xobs_SE <- apply(RescaledAbilities, 1, function(x_){ sd(x_) } ) 
          
          # Column 1 = outcome, column 2 = latent estimate to over-impute.
          dat_ <- cbind(Y, x.est_MCMC)
          
          # Specify (over-)imputation model
          # priors: #a numeric matrix with four columns 
          # (row, column, mean, standard deviation) 
          # indicating noisy estimates of the values to be imputed.
          # NOTE(review): outcome_priors is constructed but excluded from
          # `priors` below (only policy_priors is used).
          outcome_priors <- cbind(
            1:nrow(dat_), 1,              
            Y, # mean 
            1 # sd 
          )
          policy_priors <- cbind( 
            1:nrow(dat_), 2,              
            Xobs_mean,
            Xobs_SE
          )
          #priors <- rbind(policy_priors, outcome_priors)
          priors <- policy_priors
          
          #a numeric matrix where each row indicates a row and column of x to be overimputed.
          # overimp <- rbind(cbind(1:nrow(dat_), 1), cbind(1:nrow(dat_), 2))
          overimp <- cbind(1:nrow(dat_), 2)
          
          # perform overimputation 
          # m = 5 imputed datasets; nOverImpute captured by inline assignment.
          overimputed_data <- Amelia::amelia( 
            x = dat_, m = (nOverImpute <- 5L),
            p2s = 0,
            priors = priors, 
            overimp = overimp 
          )
          
          # Perform multiple overimputation
          # Collect each imputation's outcome and latent columns side by side.
          overimputed_Y <- do.call(cbind,lapply(overimputed_data$imputations,function(l_){l_[,1]}))
          overimputed_x.est_MCMC <- do.call(cbind,lapply(overimputed_data$imputations,function(l_){l_[,2]}))
          if(outType_ == "Outer"){
            overimputed_x.est_MCMC  <- (overimputed_x.est_MCMC-mean(rowMeans(overimputed_x.est_MCMC)))/sd(rowMeans(overimputed_x.est_MCMC))
            #sd(rowMeans(overimputed_x.est_MCMC))
          }
          if(outType_ == "Inner"){
            overimputed_x.est_MCMC  <- ( apply(overimputed_x.est_MCMC, 2, function(x_){scale(x_)})  ) 
            #apply(overimputed_x.est_MCMC,2,sd)
          }
          
          # cor(cbind(x.est_MCMC, overimputed_x.est_MCMC))
          # plot(x.est_MCMC,overimputed_x.est_MCMC[,1])
          
          # Analyze overimputed datasets
          # One slope per imputed dataset; combined by Rubin-style mean/SD
          # below. NOTE(review): unlist(unlist(sapply(...))) is redundant —
          # vapply(..., numeric(1)) would express the intent directly.
          overimputed_coefs <- unlist(unlist( sapply(1:nOverImpute, function(s_){
            coef(lm(overimputed_Y[,s_] ~ overimputed_x.est_MCMC[,s_]))[2]
          })))
          if(outType_ == "Outer"){ 
            Bayesian_OLSSE_OuterNormed <- sd( overimputed_coefs )
            Bayesian_OLSCoef_OuterNormed <- mean( overimputed_coefs )
          }
          if(outType_ == "Inner"){ 
            Bayesian_OLSSE_InnerNormed <- sd( overimputed_coefs )
            Bayesian_OLSCoef_InnerNormed <- mean( overimputed_coefs )
          }
        }
      }
    }
    
    # Sign-align each split's estimate with the previous split's estimate
    # so the split-half IV regressions below are not flipped arbitrarily.
    if(split_ %in% c("1","2")){
      if(cor(x.est_, x_est_past) < 0){
        x.est_  <- -1*x.est_
      }
    }
    x_est_past <- x.est_
    # Route the current estimate to the slot matching this split:
    # "" = full sample, "1"/"2" = the two split halves.
    if (split_ == "") {
      x.est <- x.est_
    } else if (split_ == "1") {
      x.est1 <- x.est_
    } else if (split_ == "2") {
      x.est2 <- x.est_
    }
  }
  # c(mean(x.est),sd(x.est))
  
  # simple linear reg 
  # Naive OLS of the outcome on the full-sample latent estimate (no
  # measurement-error correction); reported as the baseline.
  theOLS <- lm(Y ~ x.est)
  
  # stage 1 results
  # First-stage regression of one split-half estimate on the other.
  # NOTE(review): IVStage1 is not referenced again; the equivalent model is
  # refit as `mstage1` in the ERV section below.
  IVStage1 <- lm(x.est2 ~ x.est1)
  
  # corrected IV
  # cor(x.est1,x.est2)
  # Split-half instrumental-variables estimates: each half instruments the
  # other; averaging the two directions symmetrizes the estimator.
  IVStage2_a <- AER::ivreg(Y ~ x.est2 | x.est1)
  IVStage2_b <- AER::ivreg(Y ~ x.est1 | x.est2)
  IVRegCoef_a <- coef(IVStage2_a)[2]
  IVRegCoef_b <- coef(IVStage2_b)[2]
  IVRegCoef <- (IVRegCoef_a + IVRegCoef_b) / 2
  # Attenuation adjustment by sqrt(cor(x.est1, x.est2)), floored at
  # ep_ = 0.01 to avoid degenerate/negative reliability. NOTE(review):
  # `ep_` is defined by an inline side-effect assignment here and reused
  # two statements later — fragile if these lines are reordered.
  Corrected_IVRegCoef_a <- (coef(IVStage2_a)[2] * sqrt( max(c((ep_ <- 0.01), 
                                                              cor(x.est1, x.est2) ))) )
  Corrected_IVRegCoef_b <- (coef(IVStage2_b)[2] * sqrt( max(c(ep_,
                                                              cor(x.est1, x.est2) ))) )
  Corrected_IVRegCoef <- ( Corrected_IVRegCoef_a + Corrected_IVRegCoef_b )/2
  
  # corrected OLS 
  # Classic disattenuation: divide the naive slope by the square root of
  # the split-half reliability (again floored at ep_).
  Corrected_OLSCoef_a <- coef(lm(Y ~ x.est1))[2] * (CorrectionFactor <- 1/sqrt( max(c(ep_, cor(x.est1, x.est2) ))))
  Corrected_OLSCoef_b <- coef(lm(Y ~ x.est2))[2] * CorrectionFactor
  Corrected_OLSCoef <- (Corrected_OLSCoef_a + Corrected_OLSCoef_b)/2
  # Total wall-clock time for this run (t0 set earlier in lpmec_onerun).
  t_OneRun <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
  
  # ERV analysis 
  # Extreme robustness values (sensemakr) for the first-stage and
  # reduced-form regressions; [[2]] extracts the second reported component.
  mstage1 <- lm(x.est2 ~ x.est1)
  mreduced <- lm(Y ~ x.est1)
  mstage1ERV <- sensemakr::extreme_robustness_value( mstage1 )[[2]]
  mreducedERV <- sensemakr::extreme_robustness_value( mreduced )[[2]]
  
  # save results 
  # NOTE(review): the bayesian_* entries assume the Bayesian_OLS* objects
  # were created earlier (MCMC estimation paths); presumably they are
  # initialized for non-MCMC methods upstream — confirm.
  results <- list(
    "ols_coef" = coef(summary(theOLS))[2, 1],
    "ols_se" = coef(summary(theOLS))[2, 2],
    "ols_tstat" = coef(summary(theOLS))[2, 3],
    
    "iv_coef_a" = IVRegCoef_a, 
    "iv_coef_b" = IVRegCoef_b, 
    "iv_coef" = IVRegCoef, 
    "iv_se" = coef(summary(IVStage2_a))[2, 2],
    "iv_tstat" = coef(summary(IVStage2_a))[2, 3],
    
    "corrected_iv_coef_a" = Corrected_IVRegCoef_a,
    "corrected_iv_coef_b" = Corrected_IVRegCoef_b,
    "corrected_iv_coef" = Corrected_IVRegCoef,
    "corrected_iv_se" = NA,
    "corrected_iv_tstat" = Corrected_IVRegCoef / coef(summary(IVStage2_a))[2, 2],
    "var_est_split" = var(x.est1 - x.est2) / 2,  # var_est_split
    
    "corrected_ols_coef_a" = Corrected_OLSCoef_a, 
    "corrected_ols_coef_b" = Corrected_OLSCoef_b, 
    "corrected_ols_coef" = Corrected_OLSCoef, 
    "corrected_ols_se" = NA,
    "corrected_ols_tstat" = NA,
    
    "corrected_ols_coef_alt" = NA, 
    "corrected_ols_se_alt" = NA,
    "corrected_ols_tstat_alt" = NA,
    
    "bayesian_ols_coef_outer_normed" = Bayesian_OLSCoef_OuterNormed, 
    "bayesian_ols_se_outer_normed" = Bayesian_OLSSE_OuterNormed,
    "bayesian_ols_tstat_outer_normed" = Bayesian_OLSCoef_OuterNormed / Bayesian_OLSSE_OuterNormed,
    
    "bayesian_ols_coef_inner_normed" = Bayesian_OLSCoef_InnerNormed, 
    "bayesian_ols_se_inner_normed" = Bayesian_OLSSE_InnerNormed,
    "bayesian_ols_tstat_inner_normed" = Bayesian_OLSCoef_InnerNormed / Bayesian_OLSSE_InnerNormed,
    
    "m_stage_1_erv" = mstage1ERV, 
    "m_reduced_erv" = mreducedERV, 
    
    "x_est1" = x.est1,
    "x_est2" = x.est2
  )
    # S3 class enables print/summary methods dispatched on "lpmec_onerun".
    class(results) <- "lpmec_onerun"
    return( results ) 
}

# Try the lpmec package in your browser
#
# Any scripts or data that you put into this service are public.
#
# lpmec documentation built on Feb. 9, 2026, 5:07 p.m.