R/enorm.R

Defines functions print.pmf_func print.sim_func print.exp_func print.inf_func theta_function p_score r_score expected_score information coef.prms print.prms plot.prms fit_enorm_ fit_enorm

Documented in coef.prms expected_score fit_enorm information plot.prms p_score r_score

#' Fit the extended nominal response model
#'
#' Fits an Extended NOminal Response Model (ENORM) using conditional maximum likelihood (CML)
#' or a Gibbs sampler for Bayesian estimation.
#'
#'
#' @param dataSrc a connection to a dexter database, a matrix, or a data.frame with columns: person_id, item_id, item_score
#' @param predicate An optional expression to subset data, if NULL all data is used
#' @param fixed_params Optionally, a prms object from a previous analysis or 
#' a data.frame with parameters, see details.
#' @param method If CML, the estimation method will be Conditional Maximum Likelihood;
#' otherwise, a Gibbs sampler will be used to produce a sample from the posterior
#' @param nDraws Number of Gibbs samples when estimation method is Bayes. 
#' @param merge_within_persons whether to merge different booklets administered to the same person, enabling linking over persons as well as booklets.
#' @return An object of type \code{prms}. The prms object can be cast to a data.frame of item parameters 
#' using function \code{coef} or used directly as input for other Dexter functions.
#' @details
#' To support some flexibility in fixing parameters, fixed_params can be a dexter prms object or a data.frame.
#' If a data.frame, it should contain the columns item_id, item_score and a difficulty parameter. Three types of parameters are supported:
#' \describe{
#' \item{delta/beta}{ thresholds between subsequent item categories }
#' \item{eta}{item-category parameters} 
#' \item{b}{exp(-eta)}
#' }
#' Each type corresponds to a different parametrization of the model. 
#' 
#' @references 
#' Maris, G., Bechger, T.M. and San-Martin, E. (2015) A Gibbs sampler for the (extended) marginal Rasch model. 
#' Psychometrika. 80(4), 859-879. 
#' 
#' Koops, J. and Bechger, T.M. and Maris, G. (in press); Bayesian inference for multistage and other 
#' incomplete designs. In Research for Practical Issues and Solutions in Computerized Multistage Testing.
#' Routledge, London. 
#' 
#' @seealso functions that accept a prms object as input: \code{\link{ability}}, \code{\link{plausible_values}}, 
#' \code{\link{plot.prms}}, and \code{\link{plausible_scores}}
#'
fit_enorm = function(dataSrc, predicate = NULL, fixed_params = NULL, method=c("CML", "Bayes"), 
                     nDraws=1000, merge_within_persons=FALSE)
{
  method = match.arg(method)
  check_dataSrc(dataSrc)
  check_num(nDraws, 'integer', .length=1, .min=1)
  qtpredicate = eval(substitute(quote(predicate)))
  env = caller_env()

  fit_enorm_(dataSrc, qtpredicate = qtpredicate, fixed_params = fixed_params, 
             method=method, nDraws=nDraws, env=env, merge_within_persons=merge_within_persons)
}


fit_enorm_ = function(dataSrc, qtpredicate = NULL, fixed_params = NULL, method=c("CML", "Bayes"), 
                      nDraws=1000, env=NULL, merge_within_persons=FALSE) 
{
  method = match.arg(method)
  if(is.null(env)) env = caller_env()

  pb = get_prog_bar(retrieve_data = is_db(dataSrc), nsteps = if.else(method=='Bayes',nDraws,NULL))
  on.exit({pb$close()})

  respData = get_resp_data(dataSrc, qtpredicate, summarised=FALSE, env=env, retain_person_id=FALSE,
                           merge_within_persons = merge_within_persons)
  
  pb$tick()
  ss = get_sufStats_nrm(respData)

  ssI = ss$ssI
  ssIS = ss$ssIS
  design = ss$design
  plt = ss$plt
  scoretab = ss$scoretab
  
  fixed_b = NULL
  has_fixed_parms = !is.null(fixed_params)

  ## deal with fixed parameters
  if(has_fixed_parms)
  {
    if(inherits(fixed_params,'prms'))
    {
      if (fixed_params$inputs$method!="CML")
        message("Posterior means are taken as values for fixed parameters")
      
      fixed_params = fixed_params$inputs$ssIS |>
        add_column(b = if.else(fixed_params$inputs$method=="CML", fixed_params$est$b, colMeans(fixed_params$est$b)))
      
    } else
    {
      # transform the fixed params to the b parametrization dexter uses internally
      fixed_params = transform.df.parms(fixed_params, out.format = 'b', include.zero = TRUE) 
    }  
    # avoid factor warnings and reduce
    fixed_params$item_id = ffactor(as.character(fixed_params$item_id), levels=levels(design$item_id))
    fixed_params = filter(fixed_params,!is.na(.data$item_id)) 
    
    # check for missing categories in fixed_params, necessary?
    missing_cat = ssIS |> 
      semi_join(fixed_params, by='item_id') |>
      left_join(fixed_params, by=c('item_id','item_score')) |>
      filter(is.na(.data$b) & .data$item_score != 0) 
      
    if(nrow(missing_cat) > 0)
    {
      cat(paste('Some score categories are fixed while some are not, for the same item.',
                'Dexter does not know how to deal with that.\nThe following score categories are missing:\n'))
      missing_cat |> 
        select('item_id', 'item_score') |>
        arrange(.data$item_id, .data$item_score) |>
        as.data.frame() |>
        print()
      stop('missing score categories for fixed items, see output')
    }
      
    fixed_b = fixed_params |>
      right_join(ssIS, by=c('item_id','item_score')) |>
      arrange(.data$item_id,.data$item_score) |>
      pull(.data$b)
    
    if(!anyNA(fixed_b)) stop('nothing to calibrate, all parameters are fixed')
  }
  
  if (method=="CML"){
    result = calibrate_CML(scoretab=scoretab, design=design, sufI=ssIS$sufI, a=ssIS$item_score, 
                               first=ssI$first, last=ssI$last, 
                               fixed_b=fixed_b)

  } else 
  {
    result = calibrate_Bayes(scoretab=scoretab, design=design, sufI=ssIS$sufI, a=ssIS$item_score,
                              first=ssI$first, last=ssI$last, nIter=nDraws, fixed_b=fixed_b)
    
    
  }

  mle = design |> 
    group_by(.data$booklet_id) |>
    do({
      est = theta_MLE(if.else(is.matrix(result$b),colMeans(result$b), result$b), 
                      a=ssIS$item_score, .$first, .$last, se=FALSE)
      theta = est$theta[2:(length(est$theta)-1)]
      tibble(booklet_score=1:length(theta), theta = theta)
    }) |>
    ungroup() 
  
  outpt = list(est=result, 
               inputs=list(scoretab=scoretab, design=design, ssIS=ssIS, ssI=ssI, design=design,plt=plt,
                            method=method, has_fixed_parms = has_fixed_parms), 
               abl_tables = list(mle = mle),
               xpr=deparse(qtpredicate))
  class(outpt) = append('prms', class(outpt)) 
  outpt
}



# to~do: is there a better plot for calibrate bayes?

#' Plot for the extended nominal Response model
#' 
#' The plot shows 'fit' by comparing the expected score based on the model (grey line)
#' with the average scores based on the data (black line with dots) for groups of students
#' with similar estimated ability.
#' 
#' @param x object produced by fit_enorm
#' @param item_id which item to plot, if NULL, one plot for each item is made
#' @param dataSrc data source, see details
#' @param predicate an expression to subset data in dataSrc
#' @param nbins number of ability groups
#' @param ci confidence interval for the error bars, between 0 and 1. Use 0 to suppress the error bars.
#' Default = 0.95 for a 95\% confidence interval
#' @param add logical; if TRUE add to an already existing plot
#' @param col color for the observed score average
#' @param col.model color for the expected score based on the model
#' @param ... further arguments to plot
#' @return 
#' Silently, a data.frame with observed and expected values possibly useful to create a numerical fit measure.
#' @details
#' The standard plot shows the fit against the sample on which the parameters were fitted. If
#' dataSrc is provided, the fit is shown against the observed data in dataSrc. This may be useful 
#' for plotting the fit in different subgroups as a visual test for item level DIF. The confidence 
#' intervals denote the uncertainty about the predicted pvalues within the ability groups for the 
#' sample size in dataSrc (if not NULL) or the original data on which the model was fit.
#' 
#' @method plot prms
#' 
plot.prms = function(x, item_id=NULL, dataSrc=NULL, predicate=NULL, nbins=5, ci = .95, 
                     add=FALSE, col = 'black', col.model='grey80', ...)
{
  check_num(nbins,'integer',.length=1, .min=2)
  dots = list(...)
  
  if(is.null(item_id))
  {
    if('items' %in% names(dots))
    {
      # common typo
      item_id = dots$items
      dots$items = NULL
    } else
    {
      item_id = x$inputs$ssI$item_id
    }
  }
  if(length(setdiff(item_id,x$inputs$ssI$item_id))>0)
  {
    message('The following items were not found in your fit object')
    print(setdiff(item_id,x$inputs$ssI$item_id))
    stop('unknown item',call.=FALSE)
  }

  if(!is.null(dataSrc))
  {
    check_dataSrc(dataSrc)
    qtpredicate = eval(substitute(quote(predicate)))
    env = caller_env()
    respData = get_resp_data(dataSrc, qtpredicate, env=env, retain_person_id=FALSE,
                             parms_check=filter(x$inputs$ssIS, .data$item_id %in% local(item_id)))
    
    if(length(setdiff(as.character(item_id), levels(respData$design$item_id)))>0)
    {
      message('The following items were not found in dataSrc')
      print(setdiff(as.character(item_id), levels(x$design$item_id)))
      stop('unknown item',call.=FALSE)
    }
    
    x$abl_tables = list()
    
	  x$abl_tables$mle =  suppressWarnings({inner_join(respData$design, x$inputs$ssI,by='item_id')}) |>
      group_by(.data$booklet_id) |>
      do({
        est = theta_MLE(b=x$est$b, a=x$inputs$ssIS$item_score, first=.$first, last=.$last, se=FALSE)
        theta = est$theta[2:(length(est$theta)-1)]
        tibble(booklet_score=1:length(theta), theta = theta)
      }) |>
      ungroup() 
    
    x$inputs$plt = get_sufStats_nrm(respData, check_sanity=FALSE)$plt
  }
  

  #many plots
  if(length(item_id) > 1)
  {
    out = lapply(item_id, function(itm) do.call(plot, append(list(x=x, item_id=itm, nbins=nbins, ci=ci), dots)))
    names(out) = as.character(item_id)
    
    return(invisible(out))
  }
  # for dplyr
  item_id_ = as.character(item_id)
  
  expf = expected_score(x, items = item_id)

  max_score = x$inputs$ssIS |>
    filter(.data$item_id == item_id_) |>
    pull(.data$item_score) |>
    max()
  
  plt = x$inputs$plt |>
    filter(.data$item_id==item_id_) |>
    inner_join(x$abl_tables$mle, by=c('booklet_id','booklet_score')) |>
    mutate(abgroup = weighted_ntile(.data$theta, .data$N, n = nbins)) |>
    group_by(.data$abgroup) |>
    summarize(gr_theta = weighted.mean(.data$theta,.data$N), avg_score = weighted.mean(.data$meanScore,.data$N), n=sum(.data$N)) |>
    ungroup() |>
    mutate(expected_score = expf(.data$gr_theta))
  
  rng = max(plt$gr_theta) - min(plt$gr_theta)
  rng = c(min(plt$gr_theta)-.5*rng/nbins,
          max(plt$gr_theta)+.5*rng/nbins)
  
  plot.args = merge_arglists(dots,
                             default=list(bty='l',xlab = expression(theta), ylab='score',main=item_id,
                                          lwd=par('lwd')),
                             override=list(x = rng,y = c(0,max_score), type="n"))
  
  plot.args$main = fstr(plot.args$main, list(item_id=item_id))
  plot.args$sub = fstr(plot.args$sub, list(item_id=item_id))
  
  if(!add) do.call(plot, plot.args)

  plot(expf,from = rng[1], to=rng[2], col=col.model, add=TRUE,lwd=plot.args$lwd)
  
  plt$outlier = FALSE
  
  if(!is.null(ci) && !is.na(ci) && ci !=0)
  {
    if(ci>1 && ci<100)
      ci = ci/100
    
    if(ci<0 || ci >= 1)
      stop('confidence interval must be between 0 and 1')
    
    qnt = abs(qnorm((1-ci)/2))
    
    I=information(x, items = item_id)
    plt = plt |>
      mutate(se = sqrt(I(.data$gr_theta)/.data$n),
             conf_min = pmax(.data$expected_score - qnt*.data$se,0),
             conf_max = pmin(.data$expected_score + qnt*.data$se,max_score)) |>
      mutate(outlier = .data$avg_score < .data$conf_min | .data$avg_score > .data$conf_max)
    
    suppressWarnings({
      arrows(plt$gr_theta, plt$conf_min, 
             plt$gr_theta, plt$conf_max, 
             lwd=plot.args$lwd,
             length=0.05, angle=90, code=3, col=col.model)})
  } 
  
  lines(plt$gr_theta,plt$avg_score,col=col,lwd=plot.args$lwd)  
  points(plt$gr_theta, plt$avg_score, 
         bg = if_else(plt$outlier, qcolors(1), coalesce(plot.args$bg,'transparent')), 
         pch = coalesce(dots$pch,21), col=col)
  invisible(df_format(plt))
}



print.prms = function(x, ...){
  p = paste0( 'Parameters for the Extended Nominal Response Model\n\n',
              'Method: ', x$inputs$method, ', ',
              ifelse(x$inputs$method == 'CML',
                     paste0('converged in ',x$est$n_iter, ' iterations'),
                     paste0('number of Gibbs samples: ',nrow(x$est$beta))),
              '\nitems: ', nrow(x$inputs$ssI), 
              '\nresponses: ', sum(x$inputs$ssIS$sufI),'\n\n',
              'Use coef() or coefficients() to extract the item parameters.\n')
    
  cat(p)
  invisible(x)
}



#' extract enorm item parameters
#' 
#' @param object an enorm parameters object, generated by the function \code{\link{fit_enorm}}
#' @param hpd width of Bayesian highest posterior density interval around mean_beta, 
#'  value must be between 0 and 1, default is 0.95 
#' @param what which coefficients to return. Defaults to \code{items} (the item parameters). Can also be \code{var} for the 
#' variance-covariance matrix (CML only) or \code{posterior} for all draws of the item parameters (Bayes only)  
#' @param ... further arguments to coef are ignored
#'  
#' @return 
#' Depends on the calibration method and the value of 'what'. For 'items' 
#' 
#' \describe{
#' \item{CML}{a data.frame with columns: item_id, item_score, beta, SE_beta}
#' \item{Bayes}{a data.frame with columns: item_id, item_score, mean_beta, SD_beta, <hpd_b_left>, <hpd_b_right>}
#' }
#' 
#' The posterior distribution and variance covariance matrix are returned as matrices.
#' 
#' 
#' 
coef.prms = function(object, hpd = 0.95, what=c('items','var','posterior'), ...)
{
  x = object
  what = match.arg(what)
  
  if(what=='items')
  {
    if (x$inputs$method=="CML")
    {
      atab=data.frame(item_id=x$inputs$ssIS$item_id[-x$inputs$ssI$first],
                      item_score=as.integer(x$inputs$ssIS$item_score[-x$inputs$ssI$first]),
                      beta=x$est$beta,
                      SE_beta=sqrt(diag(x$est$acov.beta)),stringsAsFactors=FALSE)
    } else
    {
      if(hpd <= 0 ||  hpd >= 1)
        stop('hpd must be between 0 and 1')
      
      hh = t(apply(x$est$beta,2,hpdens, conf=hpd))
      atab=data.frame(item_id = x$inputs$ssIS$item_id[-x$inputs$ssI$first],
                      a = x$inputs$ssIS$item_score[-x$inputs$ssI$first],
                      mb = colMeans(x$est$beta),
                      sdb = apply(x$est$beta, 2, sd),
                      hpdl = hh[,1], hpdr=hh[,2],stringsAsFactors=FALSE)
      colnames(atab)=c("item_id" ,"item_score", "mean_beta", "SD_beta", 
                       sprintf("%i_hpd_b_left", round(100 * hpd)),
                       sprintf("%i_hpd_b_right", round(100 * hpd)))
    }
    atab$item_id = as.character(atab$item_id)
    rownames(atab) = NULL
    return(df_format(atab))
  } else if(what=='var')
  {
    if(x$inputs$method!="CML")
      stop('Variance-covariance matrix is only available for CML extimation')
    m = x$est$acov.beta
    colnames(m) = rownames(m) = paste(x$inputs$ssIS$item_id, x$inputs$ssIS$item_score)[-x$inputs$ssI$first]
    return(m)
  } else if(what=='posterior')
  {
    if(x$inputs$method!="Bayes")
      stop('The posterior of item parameters is only available for Bayesian estimation')
    m=x$est$beta
    colnames(m) = paste(x$inputs$ssIS$item_id, x$inputs$ssIS$item_score)[-x$inputs$ssI$first]
    return(m)
  }
}


#' Functions of theta
#' 
#' returns information function, expected score function, score simulation function, or score distribution 
#' for a single item, an arbitrary group of items or all items
#' 
#' @param parms object produced by \code{\link{fit_enorm}} or a data.frame with columns item_id, item_score and, 
#' depending on parametrization, a column named either beta/delta, eta or b
#' @param items vector of one or more item_id's. If NULL and booklet_id is also NULL, all items in parms are used
#' @param booklet_id id of a single booklet (e.g. the test information function), if items is not NULL this is ignored
#' @param which.draw the number of the random draw (only applicable if calibration method was Bayes). If NULL, the mean 
#' beta parameter will be used
#' 
#' @return Each function returns a new function which accepts a vector of theta's. These return the following values: 
#' \describe{
#' \item{information}{an equal length vector with the information estimate at each value of theta.}
#' \item{expected_score}{an equal length vector with the expected score at each value of theta}
#' \item{r_score}{a matrix with length(theta) rows and one column for each item containing simulated scores based on theta. 
#' To obtain test scores, use rowSums on this matrix}
#' \item{p_score}{a matrix with length(theta) rows and one column for each possible sumscore containing the probability of 
#' the score given theta}
#' }
#' 
#' @examples
#' 
#' \dontshow{ RcppArmadillo::armadillo_throttle_cores(1)}
#' 
#' db = start_new_project(verbAggrRules,':memory:')
#' add_booklet(db,verbAggrData, "agg")
#' p = fit_enorm(db)
#' 
#' # plot information function for single item
#' 
#' ifun = information(p, "S1DoScold")
#' 
#' plot(ifun,from=-4,to=4)
#' 
#' # compare test information function to the population ability distribution
#' 
#' ifun = information(p, booklet="agg")
#' 
#' pv = plausible_values(db,p)
#' 
#' op = par(no.readonly=TRUE)
#' par(mar = c(5,4,2,4))
#' 
#' plot(ifun,from=-4,to=4, xlab='theta', ylab='test information')
#' 
#' par(new=TRUE)
#' 
#' plot(density(pv$PV1), col='green', axes=FALSE, xlab=NA, ylab=NA, main=NA)
#' axis(side=4)
#' mtext(side = 4, line = 2.5, 'population density (green)')
#' 
#' par(op)
#' close_project(db)
#' 
#' \dontshow{ RcppArmadillo::armadillo_reset_cores()}
#' 
information = function(parms, items=NULL, booklet_id=NULL, which.draw=NULL)
{
  theta_function(parms, items=items, booklet=booklet_id, which.draw=which.draw, what='information')
}

#' @rdname information
expected_score = function(parms, items=NULL, booklet_id=NULL, which.draw=NULL)
{
  theta_function(parms, items=items, booklet=booklet_id, which.draw=which.draw, what='expected')
}

#' @rdname information
r_score = function(parms, items=NULL, booklet_id=NULL, which.draw=NULL)
{
  theta_function(parms, items=items, booklet=booklet_id, which.draw=which.draw, what='sim')
}

#' @rdname information
p_score = function(parms, items=NULL, booklet_id=NULL, which.draw=NULL)
{
  theta_function(parms, items=items, booklet=booklet_id, which.draw=which.draw, what='pmf')
}


theta_function = function(parms, items=NULL, booklet=NULL, which.draw=NULL, 
                          what=c('information','expected','sim','pmf'))
{
  what = match.arg(what)
  if(is.factor(items)) items = as.character(items)
  check_character(items,nullable=TRUE)
  check_string(booklet,name='booklet_id',nullable=TRUE)
  check_num(which.draw,nullable=TRUE)
  
  # data preparation
  # create fl(item_id,first,last), a, b
  
  if(inherits(parms,'data.frame'))
  {
    out = transform.df.parms(parms,'b',include.zero=TRUE)
    a = out$item_score
    b = out$b
    
    fl = out |>
      mutate(rn=row_number()) |>
      group_by(.data$item_id) |>
      summarize(first=as.integer(min(.data$rn)), last=as.integer(max(.data$rn))) |>
      ungroup()
    
    if(!is.null(items))
      fl = filter(fl, fl$item_id %in% items)
    
  } else if(inherits(parms,'prms'))
  {
    a = parms$inputs$ssIS$item_score
    b = parms$est$b
    if(is.matrix(b))
    {
      if(is.null(which.draw))
      {
        b = colMeans(b)
      } else
      {
        if(which.draw<1 || which.draw > nrow(b))
          stop('argument `which.draw` out of range')
        b = as.vector(b[which.draw,])
      }
    }
    
    fl = parms$inputs$ssI
    fl$item_id = as.character(fl$item_id)
    
    if(!is.null(items))
    {
      items = unique(items)
      suppressWarnings({fl = semi_join(fl, tibble(item_id=items),by='item_id')})
      if(nrow(fl) != length(items))
      {
        message('unknown items:')
        print(sort(setdiff(items,fl$item_id)))
        stop('Some items were not found, see output')
      }
    } else if(!is.null(booklet))
    {
      booklet = unique(booklet)
      design = parms$inputs$design
      if(length(intersect(booklet,design$booklet_id))<length(booklet))
      {
        stop('unknown booklet')
      }
      
      fl = design |>
        filter(.data$booklet_id %in% booklet) |>
        distinct(.data$item_id, .keep_all=TRUE)
    }  
    fl = arrange(fl,.data$first)
  } else stop_('parms must be a data.frame or an object of type `prms`') 
  rm(parms)  
  #output
  
  if(what=='information')
  {
    out = function(theta)
    {
      check_num(theta)
      if(any(is.na(theta) | is.nan(theta)))
        stop('theta may not contain nan/NA values')  
      
      res = rep(0,length(theta))
      res[is.finite(theta)] = 
        IJ_(b,a,fl$first, fl$last, theta[is.finite(theta)])$I
      # extremely large values overflow to NaN, recover as 0
      res[is.nan(res)] = 0
      res
    }
    class(out) = append('inf_func',class(out))
    
  } else if(what == 'expected')
  {
    max_score = sum(a[fl$last])
    out = function(theta)
    {
      check_num(theta)
      if(any(is.na(theta) | is.nan(theta))) 
        stop('theta may not contain nan/NA values') 
      
      res = rep(0,length(theta))
      
      res[is.finite(theta)] = E_score(theta[is.finite(theta)],  
                                      b=b, a=a, 
                                      first=fl$first, last=fl$last)
      
      res[is.infinite(theta) & theta > 0] = max_score
      # extremely large values of theta overflow to NaN (small values undeflow to zero, which is fine)
      res[is.nan(res)] = max_score
      res
    }
    class(out) = append('exp_func',class(out))
  } else if(what=='sim')
  {
    out = function(theta)
    {
      res = rscore_item(theta,b=b,a=a,first = fl$first, last = fl$last)
      colnames(res) = fl$item_id
      res
    }
    class(out) = append('sim_func',class(out))
  } else if(what=='pmf')
  {
    out = function(theta)
    {
      res = pscore(theta,b=b,a=a,first = fl$first, last = fl$last)
      t(res)
    }
    class(out) = append('pmf_func',class(out))
  }
  
  out
}

print.inf_func = function(x,...) cat('Information function: I(theta)\n')
print.exp_func = function(x,...) cat('Conditional expected score function: E(X_i|theta)\n')
print.sim_func = function(x,...) cat('function to simulate item scores: (x_i1, ..., x_ip) ~ ENORM(theta)\n')
print.pmf_func = function(x,...) cat('Conditional score distribution function: P(x_+|theta)\n')

Try the dexter package in your browser

Any scripts or data that you put into this service are public.

dexter documentation built on May 29, 2024, 8:21 a.m.