R/fit.R

#' Normalize data according to a canonical order
#'
#' @template args-df
#' @template args-dots-barrier
#' @param .palist a character vector giving an order to use instead of the default
#' @param .sortRows logical. Using the same order, sort rows in addition to vertex pairs.
#'
#' @description
#'
#' The encoding of pairwise comparison data is not commutative:
#' swapping the two vertices of a comparison flips the sign of the
#' response, yet Alice beating Bob in chess carries the same
#' information as Bob losing to Alice. \code{normalizeData} assigns
#' an arbitrary canonical order to all vertices and reorders each
#' vertex pair column-wise to match, flipping response signs as needed.
#'
#' @examples
#' df <- data.frame(pa1=NA, pa2=NA, i1=c(1, -1))
#' df[1,paste0('pa',1:2)] <- c('a','b')
#' df[2,paste0('pa',1:2)] <- c('b','a')
#' normalizeData(df)
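#' # A custom canonical order can be supplied; here the order is reversed:
#' normalizeData(df, .palist=c('b','a'))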
#' @export
normalizeData <- function(df, ..., .palist=NULL, .sortRows=TRUE) {
  if (length(list(...)) > 0) stop("Values passed via the '...' argument are not accepted")

  palist <- verifyIsData(df)
  if (!is.null(.palist)) {
    if (length(palist) != length(.palist)) {
      stop(paste(".palist must be length", length(palist)))
    }
    if (any(is.na(match(palist, .palist)))) {
      stop(".palist must contain the names of all vertices")
    }
    palist <- .palist
  }
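  # Flip any row whose vertices are out of canonical order: negate all
  # response columns and swap the vertex columns.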
  flip <- match(df$pa1, palist) > match(df$pa2, palist)
  dataCols <- -match(paste0('pa',1:2), colnames(df))
  df[flip, dataCols] <- -df[flip, dataCols]
  tmp <- df[flip, 'pa1']
  df[flip, 'pa1'] <- df[flip, 'pa2']
  df[flip, 'pa2'] <- tmp
  if (.sortRows) {
    df <- df[order(df$pa1, df$pa2),]
  }
  df
}

#' Transforms data into a form tailored for efficient evaluation by Stan
#'
#' Vertex names, if not already factors, are converted to
#' factors.  The number of thresholds per item is determined by the
#' largest absolute response value.  Missing responses are filtered
#' out.  Responses on the same pair of vertices on the same item are
#' grouped together.  Within a vertex pair and item, responses
#' are ordered from negative to positive.
#'
#' @details
#' Note: Reordering of responses is likely unless something like
#' \code{\link{normalizeData}} has been used with \code{.sortRows=TRUE}.
#'
#' @template args-df
#'
#' @template return-datalist
#' @family data preppers
#' @examples
#' df <- prepCleanData(phyActFlowPropensity)
#' str(df)
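#' # NCMP counts the grouped (vertex pair, item, response) combinations;
#' # N is the total number of non-missing responses:
#' c(df$NCMP, df$N)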
#' @importFrom reshape2 melt
#' @export
prepCleanData <- function(df) {
  palist <- verifyIsData(df)

  if (!is.factor(df$pa1)) {
    df$pa1 <- factor(df$pa1, levels=palist)
    df$pa2 <- factor(df$pa2, levels=palist)
  }

  dataCols <- -match(paste0('pa',1:2), colnames(df))
  nthr <- apply(df[,dataCols,drop=FALSE], 2, function(x) max(abs(x), na.rm=TRUE))
  itemNames <- names(nthr)
  names(nthr) <- NULL

  tall <- melt(df, id.vars = paste0('pa',1:2),
               variable.name = "item", value.name="pick")

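  # Encode each (pa1, pa2, item) triple as a unique integer key so that
  # split() groups responses on the same vertex pair and item together.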
  base <- 1L+max(length(nthr), length(palist))
  likeId <- (base * base * unclass(tall$pa1) +
             base * unclass(tall$pa2) + unclass(tall$item))

  l <- split(tall, likeId)
  pa1 <- c()
  pa2 <- c()
  item <- c()
  weight <- c()
  pick <- c()
  refresh <- c()
  numOutcome <- c()
  numRefresh <- 0L
  for (lx in 1:length(l)) {
    x <- l[[lx]]
    pt <- table(x$pick, useNA="no")
    if (length(pt) == 0) next
    numRefresh <- numRefresh + 1L
    pa1 <- c(pa1, unclass(x$pa1)[1])
    pa2 <- c(pa2, unclass(x$pa2)[1])
    item <- c(item, unclass(x$item)[1])
    weight <- c(weight, pt, use.names=FALSE)
    pick <- c(pick, as.integer(names(pt)))
    refresh <- c(refresh, length(pt))
    numOutcome <- c(numOutcome, sum(pt))
  }

  dl <- list(
    nameInfo=list(pa=palist, item=itemNames),
    # all models
    NPA=length(palist),
    NCMP=length(pick),
    N=sum(weight),
    # multivariate models
    NITEMS=length(nthr),
    NTHRESH=nthr,
    TOFFSET=c(1L, 1L + cumsum(nthr)[-length(nthr)]),
    # data (all models)
    pa1=pa1,
    pa2=pa2,
    weight=weight,
    pick=pick,
    numRefresh=numRefresh,
    refresh=refresh,
    numOutcome=numOutcome,
    # multivariate data
    item=item,
    # priors
    alphaScalePrior = 0.2,
    # correlation model
    corLKJPrior = 2.5,  # 2.0 not quite enough
    # factor model
    propShape = 3.0   # 2.0 gives divergence, 4.0 okay, but try to reduce?
  )
  if (any(is.na(match(preppedDataFields, names(dl))))) {
    stop("Bug in prepCleanData(); contact developers") # nocov
  }
  dl
}

#' Transforms data into a form tailored for efficient evaluation by Stan
#'
#' @template args-df
#'
#' @description
#' Invokes \code{\link{filterGraph}} and \code{\link{normalizeData}}.
#' Vertex names, if not already factors, are converted to
#' factors.  The number of thresholds per item is determined by the
#' largest absolute response value.  Missing responses are filtered
#' out.  Responses on the same pair of vertices on the same item are
#' grouped together.  Within a vertex pair and item, responses
#' are ordered from negative to positive.
#'
#' @template return-datalist
#' @family data preppers
#' @examples
#' df <- prepData(phyActFlowPropensity)
#' str(df)
#'
#' @export
prepData <- function(df) {
  df <- filterGraph(df)
  df <- normalizeData(df)
  prepCleanData(df)
}

preppedDataFields <- c('nameInfo','NPA','NCMP','N','pa1','pa2','weight',
                       'pick','refresh')

verifyIsPreppedData <- function(data) {
  if (is.data.frame(data)) stop("Data must be processed by prepData. Try prepData(data)")
  if (is.list(data) && all(!is.na(match(preppedDataFields, names(data))))) return()
  stop("Is data an object returned by prepData()? Seems not")
}

#' Given a model name, return stanmodel object
#'
#' @template args-locate
#' @description
#'
#' This is a convenience function to help you look up the path to an
#' appropriate model for your data.
#'
#' @details
#'
#' There are essentially three models: \sQuote{unidim}, \sQuote{covariance}, and \sQuote{factor}.
#' \sQuote{unidim} analyzes a single item. \sQuote{covariance} is suitable for two or more items.
#' Once you have vetted your items with the \sQuote{unidim} and \sQuote{covariance} models,
#' then you can try the \sQuote{factor} model.
#' For each model, there is a \sQuote{_ll} variation. This model
#' includes row-wise log likelihoods suitable for feeding to \pkg{loo}
#' for efficient approximate leave-one-out cross-validation (Vehtari, Gelman, & Gabry, 2017).
#'
#' There is also a special model \sQuote{unidim_adapt}.  All models
#' except \sQuote{unidim_adapt} require a scaling constant.  To find
#' an appropriate scaling constant, we recommend fitting
#' \sQuote{unidim_adapt} to each item separately and then taking the
#' median of the median point estimates to set the scale.
#' \sQuote{unidim_adapt} requires a varCorrection constant. In general,
#' a varCorrection of 2.0 or 3.0 should provide optimal results.
#'
#' Since version 1.1.0, the factor model permits an arbitrary number
#' of factors and arbitrary factor-to-item paths. If you were using
#' the old factor model, you'll need to update your code to call
#' \link{prepSingleFactorModel}. Arbitrary factor model structure
#' should be specified using \link{prepFactorModel}. The single factor model
#' is called \sQuote{factor1} and the general factor model is called \sQuote{factor}.
#'
#' @return An instance of S4 class \code{\link[rstan:stanmodel-class]{stanmodel}} that can be passed to \code{\link{pcStan}}.
#' @seealso \code{\link{toLoo}}
#' @template ref-vehtari2017
#' @examples
#' findModel()  # shows available models
#' findModel('unidim')
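#' findModel('unidim_ll')  # with row-wise log likelihoods for loo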
#' @export
findModel <- function(model=NULL) {
  avail <- names(stanmodels)

  if (is.null(model)) {
    message(paste("Models available:", paste0(deparse(avail), collapse="")))
    return(invisible())
  }

  obj <- stanmodels[[model]]
  if(is.null(obj)) {
    stop(paste("Stan model not found:", model))
  }

  obj
}

#' Specify a single factor model
#'
#' Specify a single latent factor with a path to each item.
#'
#' @template args-factorScalePrior
#' @template args-data
#' @template return-datalist
#' @examples
#' dl <- prepData(phyActFlowPropensity)
#' dl <- prepSingleFactorModel(dl)
#' str(dl)
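#' # factorItemPath: row 1 holds the factor index (all 1s for the single
#' # factor), row 2 the item index of each factor-to-item path
#' dl$factorItemPath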
#' @family factor model
#' @family data preppers
#' @export
#' @importFrom lifecycle deprecate_warn
#' @importFrom lifecycle deprecated
prepSingleFactorModel <- function(data, factorScalePrior=deprecated()) {
  if (lifecycle::is_present(factorScalePrior)) {
    deprecate_warn("1.5.0", "prepSingleFactorModel(factorScalePrior = )")
  }
  verifyIsPreppedData(data)
  ni <- data$NITEMS
  data$factorItemPath <- matrix(c(rep(1,ni), 1:ni), nrow=2, byrow=TRUE)
  data$NFACTORS <- 1L
  data$NPATHS <- ni
  data <- setFactorScalePrior(data)
  data
}

setFactorScalePrior <- function(data) {
  data$factorScalePrior <- as.array(rep(1.2, data$NFACTORS))
  data
}

#' Specify a factor model
#'
#' Specify a factor model with an arbitrary number of factors and
#' arbitrary factor-to-item structure.
#'
#' @template detail-factorspec
#' @template args-path
#' @template args-factorScalePrior
#' @template args-data
#' @param psiScalePrior matrix of priors for factor correlations (deprecated)
#' @template return-datalist
#' @examples
#' pa <- phyActFlowPropensity[,setdiff(colnames(phyActFlowPropensity),
#'                                     c('goal1','feedback1'))]
#' dl <- prepData(pa)
#' dl <- prepFactorModel(dl,
#'                       list(flow=c('complex','skill','predict',
#'                                   'creative', 'novelty', 'stakes',
#'                                   'present', 'reward', 'chatter',
#'                                   'body'),
#'                            f2=c('waiting','control','evaluated','spont'),
#'                            rc=c('novelty', 'waiting')))
#' str(dl)
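#' # Factor names are recorded in nameInfo for later use:
#' dl$nameInfo$factor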
#' @family factor model
#' @family data preppers
#' @seealso To simulate data from a factor model: \link{generateFactorItems}
#' @export
prepFactorModel <- function(data, path, factorScalePrior=deprecated(),
                            psiScalePrior=deprecated()) {
  if (lifecycle::is_present(factorScalePrior)) {
    deprecate_warn("1.5.0", "prepFactorModel(factorScalePrior = )")
  }
  if (lifecycle::is_present(psiScalePrior)) {
    deprecate_warn("1.5.0", "prepFactorModel(psiScalePrior = )")
  }
  verifyIsPreppedData(data)
  items <- data$nameInfo$item
  validateFactorModel(items, path)
  itemsPerFactor <- sapply(path, length)
  data$factorItemPath <- matrix(c(rep(1:length(itemsPerFactor), itemsPerFactor),
                                  unlist(lapply(path, function(x) match(x, items)))),
                                nrow=2, byrow=TRUE)
  data$NFACTORS <- length(names(path))
  data$NPATHS <- sum(itemsPerFactor)
  data$nameInfo[['factor']] <- names(path)
  data <- setFactorScalePrior(data)
  data
}

#' Fit a paired comparison Stan model
#' @template args-locate
#' @template args-data
#' @template args-stan
#' @description Uses \code{\link{findModel}} to find the appropriate
#'   model and then invokes \link[rstan:sampling]{sampling}.
#' @return An object of S4 class \code{\link[rstan:stanfit-class]{stanfit}}.
#' @seealso See \code{\link[rstan:sampling]{sampling}}, for which this function is
#'   a wrapper, for additional options. See \code{\link{prepData}} to
#'   create a suitable data list.  See
#'   \code{\link[rstan:print.stanfit]{print.stanfit}} for ways of getting tables
#'   summarizing parameter posteriors. See also \code{\link{calibrateItems}}
#'   and \code{\link{outlierTable}}.
#' @examples
#' dl <- prepData(phyActFlowPropensity[,c(1,2,3)])
#' dl$varCorrection <- 5.0
#' \donttest{pcStan('unidim_adapt', data=dl)}  # takes more than 5 seconds
#' @importFrom rstan sampling
#' @export
pcStan <- function(model, data, ...) {
  verifyIsPreppedData(data)
  obj <- findModel(model)
  rstan::sampling(obj, data = data, ...)
}

#' Determine the optimal scale constant for a set of items
#' @template args-df
#' @template args-stan
#' @param iter A positive integer specifying the number of iterations for each chain (including warmup).
#' @param chains A positive integer specifying the number of Markov chains.
#' @param varCorrection A correction factor greater than or equal to 1.0
#' @param maxAttempts How many times to try re-running a model with more iterations.
#'
#' @description
#'
#' Data are passed through \code{\link{filterGraph}} and \code{\link{normalizeData}}.
#' Then the \sQuote{unidim_adapt} model is fit to each item individually.
#' A larger \code{varCorrection} will yield a more accurate
#' \code{scale}, but is also more likely to produce an intractable
#' model. A good compromise is between 5.0 and 9.0.
#'
#' @return
#' A data.frame (one row per item) with the following columns:
#' \describe{
#'  \item{item}{Name of the item}
#'  \item{iter}{Number of iterations per chain}
#'  \item{divergent}{Number of divergent transitions observed after warmup}
#'  \item{treedepth}{Number of times the treedepth was exceeded}
#'  \item{low_bfmi}{Number of chains with low E-BFMI}
#'  \item{n_eff}{Minimum effective number of samples across all parameters}
#'  \item{Rhat}{Maximum Rhat across all parameters}
#'  \item{scale}{Median marginal posterior of \code{scale}}
#'  \item{thetaVar}{Median variance of theta (latent scores)}
#'  }
#' @seealso \code{\link[rstan:check_hmc_diagnostics]{check_hmc_diagnostics}}
#' @template ref-vehtari2019
#' @examples
#' \donttest{
#' result <- calibrateItems(phyActFlowPropensity)  # takes more than 5 seconds
#' print(result)
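#' # findModel() advises taking the median of the per-item median scale
#' # estimates as the scaling constant for the other models:
#' median(result$scale)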
#' }
#' @importMethodsFrom rstan summary
#' @importFrom rstan get_num_divergent get_max_treedepth_iterations get_low_bfmi_chains
#' @export
calibrateItems <- function(df, iter=2000L, chains=4L, varCorrection=5.0, maxAttempts=5L, ...) {
  df <- filterGraph(df)
  df <- normalizeData(df)
  vCol <- match(paste0('pa',1:2), colnames(df))
  result <- expand.grid(item=colnames(df[,-vCol]),
                        iter=NA,
                        divergent=NA, treedepth=NA, low_bfmi=NA, n_eff=NA, Rhat=NA,
                        scale=NA, thetaVar=NA)
  for (attempt in 1:maxAttempts) {
    for (rx in 1:nrow(result)) {
      if (!is.na(result[rx,'divergent']) &&
          (result[rx,'divergent'] || result[rx,'treedepth'] || result[rx,'low_bfmi'])) next
      if (!is.na(result[rx, 'Rhat']) &&
          result[rx, 'Rhat'] < 1.015 && result[rx, 'n_eff'] > 100 * chains) next

      itemCol <- match(result[rx,'item'], colnames(df))
      dl <- prepCleanData(df[,c(vCol, itemCol)])
      dl$varCorrection <- varCorrection
      result[rx,'iter'] <- ifelse(is.na(result[rx,'iter']), iter, result[rx,'iter'] * 1.5)
      fit1 <- suppressWarnings(pcStan("unidim_adapt", data=dl, chains=chains, iter=result[rx,'iter'], ...))
      result[rx,'divergent'] <- get_num_divergent(fit1)
      result[rx,'treedepth'] <- sum(get_max_treedepth_iterations(fit1))
      result[rx,'low_bfmi'] <- length(get_low_bfmi_chains(fit1))
      allPars <- summary(fit1, probs=0.5)$summary
      result[rx,'n_eff'] <- min(allPars[,'n_eff'])
      result[rx,'Rhat'] <- max(allPars[,'Rhat'])
      result[rx,'scale'] <- allPars['scale','50%']
      result[rx,'thetaVar'] <- allPars['thetaVar','50%']
    }
  }
  result
}

#' Compute approximate leave-one-out (LOO) cross-validation for Bayesian
#' models using Pareto smoothed importance sampling (PSIS)
#'
#' @template args-fit
#' @param ... Additional options passed to \code{\link[loo:loo]{loo}}.
#'
#' @description
#'
#' You must use an \sQuote{_ll} model variation (see \code{\link{findModel}}).
#'
#' @return a loo object
#' @seealso \code{\link{outlierTable}}, \code{\link[loo:loo]{loo}}
#' @importFrom loo loo extract_log_lik relative_eff
#' @export
#' @examples
#' palist <- letters[1:10]
#' df <- twoLevelGraph(palist, 300)
#' theta <- rnorm(length(palist))
#' names(theta) <- palist
#' df <- generateItem(df, theta, th=rep(0.5, 4))
#'
#' df <- filterGraph(df)
#' df <- normalizeData(df)
#' dl <- prepCleanData(df)
#' dl$scale <- 1.5
#'
#' \donttest{
#' m1 <- pcStan("unidim_ll", dl)
#'
#' loo1 <- toLoo(m1, cores=1)
#' print(loo1)
#' }
toLoo <- function(fit, ...) {
  ll <- extract_log_lik(fit, merge_chains = FALSE)
  loo(ll, r_eff=relative_eff(exp(ll)), ...)
}

#' List observations with Pareto values larger than a given threshold
#'
#' @template args-data
#' @param x An object created by \code{\link[loo:loo]{loo}}
#' @param threshold the minimum Pareto k value for an observation to be included
#'
#' @description
#'
#' The function \code{\link{prepCleanData}} compresses observations
#' into the most efficient format for evaluation by Stan. This function
#' maps indices of observations back to the actual observations,
#' filtering by the largest Pareto k values. It is assumed that
#' \code{data} was processed by \code{\link{normalizeData}} or is in
#' the same order as seen by \code{\link{prepCleanData}}.
#'
#' @return
#' A data.frame (one row per observation) with the following columns:
#' \describe{
#' \item{pa1}{Name of object 1}
#' \item{pa2}{Name of object 2}
#' \item{item}{Name of item}
#' \item{pick}{Observed response}
#' \item{k}{Associated Pareto k value}
#' }
#' @seealso \code{\link{toLoo}}, \code{\link[loo:pareto-k-diagnostic]{pareto_k_ids}}
#' @importFrom loo pareto_k_ids pareto_k_values
#' @export
#' @examples
#' palist <- letters[1:10]
#' df <- twoLevelGraph(palist, 300)
#' theta <- rnorm(length(palist))
#' names(theta) <- palist
#' df <- generateItem(df, theta, th=rep(0.5, 4))
#'
#' df <- filterGraph(df)
#' df <- normalizeData(df)
#' dl <- prepCleanData(df)
#' dl$scale <- 1.5
#'
#' \donttest{
#' m1 <- pcStan("unidim_ll", dl)
#'
#' loo1 <- toLoo(m1, cores=1)
#' ot <- outlierTable(dl, loo1, threshold=.2)
#' df[df$pa1==ot[1,'pa1'] & df$pa2==ot[1,'pa2'], 'i1']
#' }
outlierTable <- function(data, x, threshold=0.5) {
  verifyIsPreppedData(data)
  ids <- pareto_k_ids(x, threshold)
  xx <- data.frame(rx=rep(NA, data$N),
                   px=rep(NA, data$N))
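  # Expand the compressed data back to one row per original observation:
  # rx indexes the (pa1, pa2, item) group and px indexes the distinct
  # response within that group.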
  cmpStart <- 1L
  cur <- 1L
  for (rx in 1:data$numRefresh) {
    len <- data$refresh[rx]
    for (ox in cmpStart:(cmpStart + len - 1)) {
      for (wx in 1:data$weight[ox]) {
        xx[cur,'rx']=rx
        xx[cur,'px']=ox
        cur <- cur + 1L
      }
    }
    cmpStart <- cmpStart + data$refresh[rx]
  }
  RX <- xx[ids,'rx']
  PX <- xx[ids,'px']
  palist <- data$nameInfo$pa
  itemName <- data$nameInfo$item
  df <- data.frame(pa1=data$pa1[RX],
             pa2=data$pa2[RX],
             item=data$item[RX],
             pick=data$pick[PX],
             k=pareto_k_values(x)[ids])
  df <- df[!duplicated(PX),]
  for (k in paste0('pa',1:2)) {
    levels(df[[k]]) <- palist
    class(df[[k]]) <- 'factor'
  }
  levels(df[['item']]) <- itemName
  class(df[['item']]) <- 'factor'
  df <- df[order(-df$k),]
  rownames(df) <- c()  # original order is meaningless
  df
}

assertDataFitCompat <- function(dl, fit) {
  if (!is(dl, 'list')) stop("dl must be a list of data")
  if (!is(fit, 'stanfit')) stop("fit must be a stanfit object")
  pd <- fit@par_dims
  if (pd$theta[1] != dl$NPA) {
    stop(paste0("dl has ",dl$NPA," objects but fit has ",pd$theta[1]," objects"))
  }
  fitItems <- ifelse(length(pd$theta)==1, 1, pd$theta[2])
  if (fitItems != dl$NITEMS) {
    stop(paste0("dl has ",dl$NITEMS," items but fit has ",fitItems," items"))
  }
  if (pd$threshold != sum(dl$NTHRESH)) {
    stop(paste0("dl has a total of ",dl$NTHRESH," thresholds across all items but fit has ",
                pd$threshold," thresholds"))
  }
}

#' Produce data suitable for plotting item response curves
#'
#' @template args-dl
#' @template args-fit
#' @param item a vector of item names
#' @param responseNames a vector of labels for the possible responses
#' @template args-samples
#' @param from the starting latent difference value
#' @param to the ending latent difference value
#' @param by the grid increment
#'
#' @description
#' Selects \code{samples} random draws from the posterior and evaluates the item
#' response curve on the grid given by \code{seq(from,to,by)}.
#' All items use the same \code{responseNames}. If you have some items
#' with a different number of thresholds or different response names
#' then you can call \code{responseCurve} for each item separately
#' and \code{rbind} the results together.
#'
#' @template detail-response
#' @return
#' A data.frame with the following columns:
#' \describe{
#' \item{response}{Which response}
#' \item{worthDiff}{Difference in worth}
#' \item{item}{Which item}
#' \item{sample}{Which sample}
#' \item{prob}{Associated probability}
#' \item{responseSample}{A grouping index for independent item response samples}
#' }
#' @export
#' @importFrom rstan extract
#' @importFrom stats qnorm
#' @family data extractor
#' @examples
#' \donttest{ vignette('manual', 'pcFactorStan') }
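#' # A hedged sketch (not run), assuming `dl` and `fit` come from one of
#' # the multivariate models and item 'i1' has 2 thresholds
#' # (item and response names are illustrative):
#' # rc <- responseCurve(dl, fit, item='i1',
#' #                     responseNames=c("much less", "less", "equal",
#' #                                     "more", "much more"))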
responseCurve <- function(dl, fit, responseNames, item=dl$nameInfo$item,
                          samples=100, from=qnorm(.1), to=-from, by=.02) {
  assertDataFitCompat(dl, fit)
  pd <- fit@par_dims
  itemIndex <- match(item, dl$nameInfo$item)
  if (any(is.na(itemIndex))) {
    stop(paste0("Item not found: ",
                paste(item[is.na(itemIndex)], collapse=', ')))
  }
  mismatch <- (1 + 2 * dl$NTHRESH[itemIndex]) != length(responseNames)
  if (any(mismatch)) {
    stop(paste0("Item with different number of responseNames: ",
                paste(item[mismatch], collapse=', ')))
  }
  grid <- seq(from,to,by)
  df <- expand.grid(response=responseNames, worthDiff=1:length(grid),
                    item=item, sample=1:samples, prob=NA)
  pick <- c()
  for (i1 in item) {
    ii <- match(i1, dl$nameInfo$item)
    grid <- seq(from,to,by) / dl$scale[ii]
    thrInd <- dl$TOFFSET[ii] + 1:dl$NTHRESH[ii] - 1L
    thrData <- extract(fit, pars=paste0("threshold[",thrInd,"]"))
    if ('alpha' %in% names(pd)) {
      if (length(pd$alpha) == 0) {
        alphaData <- extract(fit, pars="alpha")[[1]]
      } else {
        alphaData <- extract(fit, pars=paste0("alpha[",ii,"]"))[[1]]
      }
    } else {
      alphaData <- dl$alpha[ii]
    }
    if (length(pick)==0) {
      samples <- min(length(thrData[[1]]), samples)
      pick <- sample.int(length(thrData[[1]]), samples)
    }
    for (sx in 1:samples) {
      mask <- df$item == i1 & df$sample == sx
      p1 <- sapply(grid, function(gx) {
        if (length(alphaData) > 1) {
          alpha <- alphaData[pick[sx]]
        } else {
          alpha <- alphaData
        }
        cmp_probs(dl$scale[ii], alpha, 0, gx,
                  sapply(thrData, function(x) x[pick[sx]]))
      })
      df[mask, 'prob'] <- c(p1)
      df[mask, 'worthDiff'] <- kronecker(grid, rep(1, nrow(p1)))
    }
  }
  df$responseSample <-
    (match(df$item, dl$nameInfo$item) * length(responseNames) * samples +
     df$sample * length(responseNames) +
     unfactor(df$response)-1L)
  df
}

#' Remove the array indexing from a parameter name
#' @param name a parameter name
#' @return the name without the square bracket parameter indexing
#' @examples
#' withoutIndex("foo[1,2]")
#' @export
withoutIndex <- function(name) {
  sub("\\[[\\d,]+\\]$", "", name, perl=TRUE)
}

#' Produce data suitable for plotting parameter estimates
#'
#' @template args-fit
#' @param pars a vector of parameter names
#' @param label column name for \code{nameVec}
#' @param nameVec a vector of explanatory parameter names
#' @param width a width in probability units for the uncertainty interval
#'
#' @return
#' A data.frame with the following columns:
#' \describe{
#' \item{L}{Lower quantile}
#' \item{M}{Median}
#' \item{U}{Upper quantile}
#' \item{\emph{label}}{\emph{nameVec}}
#' }
#' @export
#' @importFrom rstan summary
#' @family data extractor
#' @examples
#' \donttest{ vignette('manual', 'pcFactorStan') }
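#' # A minimal sketch (not run), assuming `fit` is a stanfit from
#' # \code{pcStan} and `dl` is its data list: an 80% uncertainty
#' # interval per object for the 'theta' parameter.
#' # pi <- parInterval(fit, 'theta', dl$nameInfo$pa, label='object')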
parInterval <- function(fit, pars, nameVec,
                        label=withoutIndex(pars[1]), width=0.8) {
  if (!is(fit, "stanfit")) stop("fit must be a stanfit object")
  if (width <= 0 || width >= 1) stop("width must be in the interval (0,1)")
  probs <- 0.5 + c(-width/2, 0, width/2)
  interval <- summary(fit, pars=pars, probs=probs)$summary[,4:6,drop=FALSE]
  colnames(interval) <- c("L","M","U")
  interval <- as.data.frame(interval)
  if (nrow(interval) != length(nameVec)) {
    stop(paste("pars and nameVec must be the same length. Currently",
               "pars is length", nrow(interval),
               "but nameVec is length", length(nameVec)))
  }
  interval[[label]] <- factor(nameVec, levels=nameVec)
  interval
}

#' Produce data suitable for plotting parameter distributions
#'
#' @template args-fit
#' @param pars a vector of parameter names
#' @param label column name for \code{nameVec}
#' @param nameVec a vector of explanatory parameter names
#' @template args-samples
#' @return
#' A data.frame with the following columns:
#' \describe{
#' \item{sample}{Sample index}
#' \item{\emph{label}}{A name from \emph{nameVec}}
#' \item{value}{A single sample of the associated parameter}
#' }
#' @export
#' @importFrom rstan extract
#' @importFrom reshape2 melt
#' @family data extractor
#' @examples
#' \donttest{ vignette('manual', 'pcFactorStan') }
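#' # A minimal sketch (not run), under the same assumptions as the
#' # parInterval() sketch: sample the parameters tabulated by parInterval().
#' # pi <- parInterval(fit, 'theta', dl$nameInfo$pa, label='object')
#' # tall <- parDistributionFor(fit, pi)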
parDistributionCustom <- function(fit, pars, nameVec,
                                  label=withoutIndex(pars[1]), samples=500) {
  if (!is(fit, "stanfit")) stop("fit must be a stanfit object")
  colSet <- extract(fit, pars)
  if (length(colSet) != length(nameVec)) {
    stop(paste("pars and nameVec must be the same length. Currently",
               "pars is length", length(colSet),
               "but nameVec is length", length(nameVec)))
  }
  nextCol <- 1L
  pick <- c()
  tall <- NULL
  for (c1 in colSet) {
    if (length(dim(c1)) == 1) c1 <- as.matrix(c1)
    colnames(c1) <- nameVec[nextCol:(nextCol + ncol(c1) - 1L)]
    nextCol <- nextCol + ncol(c1)
    if (length(pick) == 0) {
      samples <- min(nrow(c1), samples)
      pick <- sample.int(nrow(c1), samples)
    }
    c1 <- c1[pick,,drop=FALSE]
    tall1 <- melt(c1)
    colnames(tall1)[1:2] <- c('sample',label)
    tall <- rbind(tall, tall1)
  }
  tall
}

#' @param pi a data.frame returned by \code{\link{parInterval}}
#' @rdname parDistributionCustom
#' @export
parDistributionFor <- function(fit, pi, samples=500) {
  parDistributionCustom(fit, rownames(pi), nameVec=pi[[4]],
                        label=colnames(pi)[4], samples=samples)
}