R/fitting_functions.R

#' Stan for item response theory
#'
#' \pkg{edstan} attempts to make easy the fitting of standard item response
#' theory models using \pkg{rstan}.
#'
#' A user will generally want to use the following functions (in order) to fit
#' a model:
#'
#' \enumerate{
#'   \item \code{\link{irt_data}} to format the data,
#'   \item \code{\link{irt_stan}} to fit a model, and
#'   \item \code{\link{print_irt_stan}} to view some results.
#' }
#'
#' Additionally, \code{\link{labelled_integer}} is some times helpful for data
#' formatting and \code{\link{stan_columns_plot}} creates a plots of convergence
#' and other statistics by parameter vector. The package also includes six Stan
#' models (see \code{\link{irt_stan}} for a list) and two example datasets
#' (\code{\link{aggression}} and \code{\link{spelling}}).
#'
#' It is expected that once a user is comfortable fitting pre-defined
#' \pkg{edstan} models, they will write their own Stan models and fit them with
#' \code{\link[rstan]{stan}}, for which \code{\link{irt_stan}} is a wrapper.
#'
#' @seealso Case studies for each of the \pkg{edstan} models have been published.
#'
#' Rasch and two-parameter logistic models
#'
#' \url{http://mc-stan.org/documentation/case-studies/rasch_and_2pl.html}
#'
#' (Generalized) partial credit model
#'
#' \url{http://mc-stan.org/documentation/case-studies/pcm_and_gpcm.html}
#'
#' (Generalized) rating scale model
#'
#' \url{http://mc-stan.org/documentation/case-studies/rsm_and_grsm.html}
#' @import rstan
"_PACKAGE"
#> [1] "_PACKAGE"


#' Create a Stan data list from an item response matrix or from long-form data.
#'
#' @param response_matrix An item response matrix.
#'   Columns represent items and rows represent persons.
#'   NA may be supplied for missing responses.
#'   The lowest score for each item should be 0, with exception to rating scale
#'   models.
#'   \code{y}, \code{ii}, and \code{jj} should not be supplied if a response
#'   matrix is given.
#' @param y A vector of scored responses for long-form data.
#'   The lowest score for each item should be 0, with exception to rating scale
#'   models.
#'   NAs are not permitted, but missing responses may simply be ommitted
#'   instead.
#'   Required if \code{response_matrix} is not supplied.
#' @param ii A vector indexing the items in \code{y}.
#'   This must consist of consecutive integers starting at 1.
#'   \code{\link{labelled_integer}} may be used to create a suitable vector.
#'   Required if \code{response_matrix} is not supplied.
#' @param jj A vector indexing the persons in \code{y}.
#'   This must consist of consecutive integers starting at 1.
#'   \code{\link{labelled_integer}} may be used to create a suitable vector.
#'   Required if \code{response_matrix} is not supplied.
#' @param covariates An optional data frame containing (only) person-covariates.
#'   It must contain one row per person or be of the same length as \code{y},
#'   \code{ii}, and \code{jj}. If it contains one row per person, it must be in
#'   the same order as the response matrix (or \code{unique(jj)}). If it has a
#'   number of columns equal to the length of \code{y},
#'   \code{ii}, and \code{jj}, it must be in the same order as \code{jj} (for
#'   example, it may be a subset of columns from the same data frame that contains
#'   \code{y}, \code{ii}, and \code{jj}).
#' @param formula An optional formula for the latent regression that is applied
#'   to \code{covariates}. The left side  should be blank (for example,
#'   \code{~ v1 + v2}). By default it includes only a model intercept,
#'   interpretable as the mean of the person distribution. If set to
#'   \code{NULL}, then \code{covariates} is used directly  as the design matrix
#'   for the latent regression.
#' @return A data list suitable for \code{\link{irt_stan}}.
#' @seealso See \code{\link{labelled_integer}} for a means of creating
#' appropriate inputs for \code{ii} and \code{jj}.
#' See \code{\link{irt_stan}} to fit a model to the data list.
#' @examples
#' # For a response matrix ("wide-form" data) with person covariates:
#' spelling_list <- irt_data(response_matrix = spelling[, 2:5],
#'                           covariates = spelling[, "male", drop = FALSE],
#'                           formula = ~ 1 + male)
#'
#' # Alternatively, the same may be created by:
#' W <- cbind(intercept = 1, spelling[, "male"])
#' spelling_list <- irt_data(response_matrix = spelling[, 2:5],
#'                           covariates = W,
#'                           formula = NULL)
#'
#' # For long-form data (one row per item-person pair):
#' agg_list_1 <- irt_data(y = aggression$poly,
#'                        ii = aggression$item,
#'                        jj = aggression$person)
#'
#' # Add a latent regression and use labelled_integer() with the items
#' agg_list_2 <- irt_data(y = aggression$poly,
#'                        ii = labelled_integer(aggression$description),
#'                        jj = aggression$person,
#'                        covariates = aggression[, c("male", "anger")],
#'                        formula = ~ 1 + male*anger)
#' @export
irt_data <- function(response_matrix = matrix(), y = integer(), ii = integer(),
                     jj = integer(), covariates = data.frame(), formula = ~1) {

  # If response matrix is provided...
  if(!identical(response_matrix, matrix())) {

    if(!all(y == integer(), ii == integer(), jj == integer())) {
      warning("Options 'y', 'ii', and 'jj' are ignored when response_matrix is provided.")
    }

    # Assemble y into a vector of non-missing values
    not_missing <- as.vector(t(!is.na(response_matrix)))
    y <- as.vector(t(response_matrix))[not_missing]

    # Assemble ii into a vector of non-missing values
    ii_unique <- 1:ncol(response_matrix)
    names(ii_unique) <- colnames(response_matrix)
    ii_expand <- rep(ii_unique, times = nrow(response_matrix))
    ii <- ii_expand[not_missing]

    # Assemble jj into a vector of non-missing values
    jj_unique <- 1:nrow(response_matrix)
    names(jj_unique) <- rownames(response_matrix)
    jj_expand <- rep(jj_unique, each = ncol(response_matrix))
    jj <- jj_expand[not_missing]


    # If response matrix is NOT provided...
  } else {

    # Check that all long-form options are provided
    if(any(identical(y, integer()), identical(ii, integer()),
           identical(jj, integer()))) {
      stop("Options 'y', 'ii', and 'jj' are all required when response_matrix is not provided.")
    }

    # Check that all vectors have same length
    vec_lengths <- lengths(list(y, ii, jj))
    if(min(vec_lengths) != max(vec_lengths)) {
      stop("Vectors 'y', 'ii', and 'jj' must have the same length.")
    }

    # Check that there are no NAs
    if(sum(is.na(c(y, ii, jj))) > 0) {
      stop("'y', 'ii', and 'jj' may not include NA.")
    }

    # Check that ii and jj are consecutive integers
    is_consec_integer <- function(x) {
      is_integer <- all(x == as.integer(x))
      is_start_at_one <- min(x) == 1
      is_consecutive <- all(1:max(x) %in% unique(x))
      return(all(is_integer, is_start_at_one, is_consecutive))
    }
    if(!is_consec_integer(ii)) {
      stop("'ii' must consist of consecutive integers with lowest value of one.")
    }
    if(!is_consec_integer(jj)) {
      stop("'jj' must consist of consecutive integers with lowest value of one.")
    }

  }

  # Check suitability of y
  min_y <- tapply(y, ii, min)
  max_y <- tapply(y, ii, max)
  lu_y <- tapply(y, ii, function(x) length(unique(x)))
  if(any(min_y != 0)) {
    warning("One or more items have a minimum score not equal to zero. This ",
            "is may be okay for rating scale models.")
  }
  if(any(max_y + 1 != lu_y)) {
    warning("One or more items have missing response categories. This is may ",
            "be okay for rating scale models.")
  }

  # If 'formula' set to NULL, use 'covariates' as is. If 'covariates' not
  # provided, set to be constant only. Otherwise, if nrow(covariates) equals
  # number of persons, apply formula to it. Or if nrow(covariates) equals
  # number of total responses, shorten it and then apply formula.
  if(is.null(formula)) {
    W <- covariates
  } else if(identical(covariates, data.frame())) {
    W <- matrix(1, ncol = 1, nrow = max(jj))
  } else {
    if(nrow(covariates) == max(jj)) {
      W <- stats::model.matrix(formula, covariates)
    } else if(nrow(covariates) == length(jj)) {
      W <- stats::model.matrix(formula, covariates[!duplicated(jj),])
    } else {
      stop("The 'covariates' must have a number of rows equal to the ",
           "number of persons or to the length of 'jj'. If 'covariates' has ",
           "multiple rows per person (as with long-form data), all of them ",
           "must be identical.")
    }
  }
  if(!all(W[,1] == 1)) {
    stop("The person covariate matrix must have a first column with all ",
         "elements set to one. If you supplied a formula, the intercept term ",
         "cannot be omitted. If you did not supply a formula, the first ",
         "column of 'covariates' must have all elements set to one.")
  }

  data_list <- list(N = length(y), I = max(ii), J = max(jj), ii = ii, jj = jj,
                    y = y, K = ncol(W), W = W)

  return(data_list)

}


#' Estimate an item response model with Stan
#'
#' @param data_list A Stan data list created with \code{\link{irt_data}}.
#' @param model The file name for one of the provided .stan files, or
#'   alternatively, a user-created .stan file that accepts \code{data_list} as
#'   input data.
#'   The ".stan" file extension may be omitted.
#'   Defaults to either "rasch_latent_reg.stan" or "pcm_latent_reg.stan".
#' @param ... Additional options passed to \code{\link[rstan]{stan}}. The
#'   usual choices are \code{iter} for the number of iterations and
#'   \code{chains} for the number of chains.
#'
#' @details
#' The following table lists the models inlcuded in \pkg{edstan} along with the
#' associated \emph{.stan} files. The file names are given as the \code{model}
#' argument.
#'
#' \tabular{ll}{
#'    \strong{Model}             \tab \strong{File}                \cr
#'    Rasch                      \tab \emph{rasch_latent_reg.stan} \cr
#'    Partial credit             \tab \emph{pcm_latent_reg.stan}   \cr
#'    Rating Scale               \tab \emph{rsm_latent_reg.stan}   \cr
#'    Two-parameter logistic     \tab \emph{2pl_latent_reg.stan}   \cr
#'    Generalized partial credit \tab \emph{gpcm_latent_reg.stan}  \cr
#'    Generalized rating Scale   \tab \emph{grsm_latent_reg.stan}
#' }
#'
#' Three simplified models are also available: \emph{rasch_simple.stan},
#' \emph{pcm_simple.stan}, \emph{rsm_simple.stan}. These are (respectively) the
#' Rasch, partial credit, and rating scale models omitting the latent
#' regression. There is no reason to use these instead of the models listed
#' above, given that the above models allow for rather than require the
#' inclusion of covariates for a latent regression. Instead, the purpose of
#' the simplified models is to provide a straightforward starting point
#' researchers who wish to craft their own Stan models.
#'
#' @return A \code{\link[rstan]{stanfit-class}} object.
#' @seealso See \code{\link[rstan]{stan}}, for which this function is a wrapper,
#'   for additional options.
#'   See \code{\link{irt_data}} and \code{\link{labelled_integer}} for functions
#'   that facilitate creating a suitable \code{data_list}.
#'   See \code{\link{print_irt_stan}} and \code{\link[rstan]{print.stanfit}} for ways of
#'   getting tables summarizing parameter posteriors.
#' @examples
#' # List the Stan models included in edstan
#' folder <- system.file("extdata", package = "edstan")
#' dir(folder, "\\.stan$")
#'
#' # List the contents of one of the .stan files
#' rasch_file <- system.file("extdata/rasch_latent_reg.stan",
#'                           package = "edstan")
#' cat(readLines(rasch_file), sep = "\n")
#'
#'\dontrun{
#' # Fit the Rasch and 2PL models on wide-form data with a latent regression
#'
#' spelling_list <- irt_data(response_matrix = spelling[, 2:5],
#'                           covariates = spelling[, "male", drop = FALSE],
#'                           formula = ~ 1 + male)
#'
#' rasch_fit <- irt_stan(spelling_list, iter = 300, chains = 4)
#' print_irt_stan(rasch_fit, spelling_list)
#'
#' twopl_fit <- irt_stan(spelling_list, model = "2pl_latent_reg.stan",
#'                       iter = 300, chains = 4)
#' print_irt_stan(twopl_fit, spelling_list)
#'
#'
#' # Fit the rating scale and partial credit models without a latent regression
#'
#' agg_list_1 <- irt_data(y = aggression$poly,
#'                        ii = labelled_integer(aggression$description),
#'                        jj = aggression$person)
#'
#' fit_rsm <- irt_stan(agg_list_1, model = "rsm_latent_reg.stan",
#'                     iter = 300, chains = 4)
#' print_irt_stan(fit_rsm, agg_list_1)
#'
#' fit_pcm <- irt_stan(agg_list_1, model = "pcm_latent_reg.stan",
#'                     iter = 300, chains = 4)
#' print_irt_stan(fit_pcm, agg_list_1)
#'
#'
#' # Fit the generalized rating scale and partial credit models including
#' # a latent regression
#'
#' agg_list_2 <- irt_data(y = aggression$poly,
#'                        ii = labelled_integer(aggression$description),
#'                        jj = aggression$person,
#'                        covariates = aggression[, c("male", "anger")],
#'                        formula = ~ 1 + male*anger)
#'
#' fit_grsm <- irt_stan(agg_list_2, model = "grsm_latent_reg.stan",
#'                      iter = 300, chains = 4)
#' print_irt_stan(fit_grsm, agg_list_2)
#'
#' fit_gpcm <- irt_stan(agg_list_2, model = "gpcm_latent_reg.stan",
#'                      iter = 300, chains = 4)
#' print_irt_stan(fit_grsm, agg_list_2)
#' }
#' @export
irt_stan <- function(data_list, model = "", ... ) {

  max_y <- tapply(data_list$y, data_list$ii, max)
  is_polytomous <- any(max_y > 1)

  # Choose Stan model file if one not provided. If provided, add ".stan" to Stan
  # file if needed. Look for file first in working directory/given file path. If
  # not found, look up in package install folder.
  if(model == "") {
    stub <- ifelse(is_polytomous, "pcm_latent_reg.stan", "rasch_latent_reg.stan")
    stan_file <- file.path(system.file("extdata", package = "edstan"), stub)
  } else {
    stan_file <- ifelse(grepl("\\.stan$", model), model, paste0(model, ".stan"))
    if(!file.exists(stan_file)) {
      alt_file <- file.path(system.file("extdata", package = "edstan"), stan_file)
      if(file.exists(alt_file)) {
        stan_file <- alt_file
      } else {
        stop("Stan model file not found.")
      }
    }
  }

  message("Using ", file.path(stan_file), ".")
  fit <- rstan::stan(stan_file, data = data_list, ...)
  return(fit)

}


#' Transform a vector into consecutive integers
#'
#' @param x A vector, which may be numeric, string, or factor.
#' @return A vector of integers corresponding to entries in \code{x}.
#'   The lowest value will be 1, and the greatest value will equal the number of
#'   unique elements in \code{x}.
#'   The elements of the recoded vector are named according to the original
#'   values of \code{x}.
#'   The result is suitable for the \code{ii} and \code{jj} options for
#'   \code{\link{irt_data}}.
#' @examples
#' x <- c("owl", "cat", "pony", "cat")
#' labelled_integer(x)
#'
#' y <- as.factor(x)
#' labelled_integer(y)
#'
#' z <- rep(c(22, 57, 13), times = 2)
#' labelled_integer(z)
#' @export
labelled_integer <- function(x = vector()) {
  label_vector <- as.character(x)
  unique_labels <- unique(label_vector)
  unique_ids <- 1:length(unique_labels)
  names(unique_ids) <- unique_labels
  new_id_vector <- unique_ids[label_vector]
  return(new_id_vector)
}


#' View a table of selected parameter posteriors after using \code{irt_stan}
#'
#' @param fit A \code{stanfit-class} object created by \code{\link{irt_stan}}.
#' @param data_list An optional Stan data list created with
#'   \code{\link{irt_data}}. If provided, the printed posterior summaries for
#'   selected parameters are grouped by item. Otherwise, ungrouped results are
#'   provided, which may be preferred, for example, for the Rasch or rating
#'   scale models.
#' @param ... Additional options passed to \code{\link[base]{print}}.
#' @examples
#' # Make a suitable data list:
#' spelling_list <- irt_data(response_matrix = spelling[, 2:5],
#'                           covariates = spelling[, "male", drop = FALSE],
#'                           formula = ~ 1 + male)
#'
#'\dontrun{
#' # Fit a latent regression  2PL
#' twopl_fit <- irt_stan(spelling_list, model = "2pl_latent_reg.stan",
#'                       iter = 300, chains = 4)
#'
#' # Get a table of parameter posteriors
#' print_irt_stan(twopl_fit, spelling_list)
#' # Or
#' print_irt_stan(twopl_fit)
#' }
#' @export
print_irt_stan <- function(fit, data_list = NULL, ...) {

  possible_pars <- c("alpha", "beta", "kappa", "lambda", "sigma")
  available <- possible_pars %in% fit@model_pars
  names(available) <- possible_pars

  if(is.null(data_list)) {

    capture <- utils::capture.output(
      print(fit, pars = possible_pars[available], ...)
    )

  } else {

    # Get number of beta parameters per item
    y <- data_list$y
    ii <- data_list$ii
    if(available["kappa"]) {
      m <- rep(1, times = max(ii))  # For rating scale models
    } else {
      m <- tapply(y, ii, max)       # For binary/partial credit models
    }

    # Make a list of groups of item parameter by item
    out_list <- list(paste0("beta[", 1:m[1], "]"))
    for(i in 2:max(ii)) {
      out_list[[i]] <- paste0("beta[", (sum(m[1:(i-1)])+1):sum(m[1:i]), "]")
    }
    if(available["alpha"]) {
      for(i in 1:max(ii)) {
        out_list[[i]] <- c(paste0("alpha[", i, "]"), out_list[[i]])
      }
    }

    # Make labels for the items
    if(is.null(names(data_list$ii))) {
      out_labels <- paste("Item", unique(data_list$ii))
    } else {
      out_labels <- paste0("Item ", unique(data_list$ii), ": ",
                           unique(names(data_list$ii)))
    }

    # Add kappas to the list and add a label, if needed
    if(available["kappa"]) {
      out_list[[length(out_list) + 1]] <- "kappa"
      out_labels[length(out_labels) + 1] <- "Rating scale step parameters"
    }

    # Add ability distribution parameters to the list and add a label
    if(available["sigma"]) {
      out_list[[length(out_list) + 1]] <- c("lambda", "sigma")
    } else {
      out_list[[length(out_list) + 1]] <- "lambda"
    }
    out_labels[length(out_labels) + 1] <- "Ability distribution"

    # Get print() output and reformat
    capture <- utils::capture.output(print(fit, pars = unlist(out_list), ...))
    blanks <- grep("^$", capture)
    capture[blanks[1]:blanks[2]] <- paste0("  ", capture[blanks[1]:blanks[2]])
    for(i in 1:length(out_list)) {
      search_str <- gsub("\\[", "\\\\[", out_list[[i]][1])
      idx <- min(grep(paste0("^  ", search_str), capture))
      capture <- c(capture[1:(idx-1)], out_labels[i],
                   capture[idx:length(capture)])
    }

  }

  cat(capture, sep = "\n")

}


#' View a plot of summary statistics after using \code{irt_stan}
#'
#' @param fit A \code{stanfit-class} object created by \code{\link{irt_stan}}
#'   or \code{\link[rstan]{stan}}.
#' @param stat A string for the statistic from the \code{summary} method for a
#'   \code{stanfit} object to plot. The default is "Rhat" but could, for
#'   example, be "mean" or "n_eff".
#' @param ... Additional options (such as \code{pars} or \code{use_cache}),
#'   passed to the \code{summary} method for a \code{stanfit} object. Not
#'   required.
#' @return A \code{ggplot} object.
#' @seealso See \code{\link[rstan]{stan_rhat}}, which provides a histogram of
#'   Rhat statistics.
#' @examples
#' # Make a suitable data list:
#' spelling_list <- irt_data(response_matrix = spelling[, 2:5],
#'                           covariates = spelling[, "male", drop = FALSE],
#'                           formula = ~ 1 + male)
#'
#'\dontrun{
#' # Fit a latent regression  2PL
#' twopl_fit <- irt_stan(spelling_list, model = "2pl_latent_reg.stan",
#'                       iter = 300, chains = 4)
#'
#' # Get a plot showing Rhat statistics
#' rhat_columns(twopl_fit)
#'
#' # Get a plot showing number of effective draws
#' rhat_columns(twopl_fit, stat = "n_eff")
#' }
#' @export
stan_columns_plot <- function(fit, stat = "Rhat", ...) {

  fit_summary <- as.data.frame(rstan::summary(fit, ...)[["summary"]])

  # Creating vector before adding to data.frame helps pass CRAN checks.
  Parameter <- as.factor(gsub("\\[.*]", "", rownames(fit_summary)))
  value_to_plot <- fit_summary[, stat]
  fit_summary$Parameter <- Parameter
  fit_summary$value_to_plot <- value_to_plot

  ggplot2::ggplot(fit_summary) +
    ggplot2::aes(x = Parameter, y = value_to_plot, color = Parameter) +
    ggplot2::geom_jitter(height = 0, width = 0.25, show.legend = FALSE) +
    ggplot2::ylab(stat)

}
danielcfurr/edstan documentation built on May 14, 2019, 4:03 p.m.