R/prepare.R

Defines functions .validate_regression_model .validate_continuous .validate_binary .long_format_covariates rescale_binary rescale_continuous labelled_integer irt_data

Documented in irt_data labelled_integer .long_format_covariates rescale_binary rescale_continuous .validate_binary .validate_continuous .validate_regression_model

#' Create a Stan data list from an item response matrix or from long-form data.
#'
#' This function prepares item response data, creating a data list that may be
#' passed to \code{\link{irt_stan}}.
#'
#' @param response_matrix An item response matrix.
#'   Columns represent items and rows represent persons.
#'   NA may be supplied for missing responses.
#'   The lowest score for each item should be 0, with exception to rating scale
#'   models.
#'   \code{y}, \code{ii}, and \code{jj} should not be supplied if a response
#'   matrix is given.
#' @param y A vector of scored responses for long-form data.
#'   The lowest score for each item should be 0, with exception to rating scale
#'   models.
#'   NAs are not permitted, but missing responses may simply be omitted
#'   instead.
#'   Required if \code{response_matrix} is not supplied.
#' @param ii A vector indexing the items in \code{y}.
#'   This must consist of consecutive integers starting at 1.
#'   \code{\link{labelled_integer}} may be used to create a suitable vector.
#'   Required if \code{response_matrix} is not supplied.
#' @param jj A vector indexing the persons in \code{y}.
#'   This must consist of consecutive integers starting at 1.
#'   \code{\link{labelled_integer}} may be used to create a suitable vector.
#'   Required if \code{response_matrix} is not supplied.
#' @param covariates An optional data frame containing (only) person-covariates.
#'   It must contain one row per person or be of the same length as \code{y},
#'   \code{ii}, and \code{jj}. If it contains one row per person, it must be in
#'   the same order as the response matrix (or \code{unique(jj)}). If it has a
#'   number of columns equal to the length of \code{y},
#'   \code{ii}, and \code{jj}, it must be in the same order as \code{jj} (for
#'   example, it may be a subset of columns from the same data frame that contains
#'   \code{y}, \code{ii}, and \code{jj}).
#' @param formula An optional formula for the latent regression that is applied
#'   to \code{covariates}. The left side  should be blank (for example,
#'   \code{~ v1 + v2}). By default it includes only a model intercept,
#'   which then represents the mean of the person distribution. If set to
#'   \code{NULL} (default), then \code{covariates} is used directly as the
#'   design matrix for the latent regression.
#' @param integerize Whether to apply \code{\link{labelled_integer}} to
#'   \code{ii} and \code{jj}. Defaults to \code{TRUE}, which should be the case
#'   unless the inputs are already consecutive integers.
#' @param validate_regression Whether to check the latent regression
#'   equation and covariates for compatibility with the prior distributions
#'   for the coefficients. Defaults to \code{TRUE} and throws a warning
#'   if problems are identified.
#' @return A data list suitable for \code{\link{irt_stan}}.
#' @seealso See \code{\link{labelled_integer}} for a means of creating
#' appropriate inputs for \code{ii} and \code{jj}.
#' See \code{\link{irt_stan}} to fit a model to the data list.
#' @examples
#' # For a response matrix ("wide-form" data) with person covariates:
#' spelling_list <- irt_data(response_matrix = spelling[, 2:5],
#'                           covariates = spelling[, "male", drop = FALSE],
#'                           formula = ~ rescale_binary(male))
#'
#' # For long-form data (one row per item-person pair):
#' agg_list_1 <- irt_data(y = aggression$poly,
#'                        ii = aggression$item,
#'                        jj = aggression$person)
#'
#' # Add a latent regression and use labelled_integer() with the items
#' agg_list_2 <- irt_data(y = aggression$poly,
#'                        ii = labelled_integer(aggression$description),
#'                        jj = aggression$person,
#'                        covariates = aggression[, c("male", "anger")],
#'                        formula = ~ 1 + rescale_continuous(male)*rescale_continuous(anger))
#' @export
irt_data <- function(response_matrix = matrix(), y = integer(), ii = integer(),
                     jj = integer(), covariates = data.frame(), formula = NULL,
                     integerize = TRUE, validate_regression = TRUE) {

  long_format <- identical(response_matrix, matrix())

  if(!long_format) {
    # If response matrix is provided...

    if(!all(y == integer(), ii == integer(), jj == integer())) {
      warning("Options 'y', 'ii', and 'jj' are ignored when response_matrix is provided.")
    }

    # Assemble y into a vector of non-missing values
    not_missing <- as.vector(t(!is.na(response_matrix)))
    y <- as.vector(t(response_matrix))[not_missing]

    # Assemble ii into a vector of non-missing values
    ii_unique <- 1:ncol(response_matrix)
    names(ii_unique) <- colnames(response_matrix)
    ii_expand <- rep(ii_unique, times = nrow(response_matrix))
    ii <- ii_expand[not_missing]

    # Assemble jj into a vector of non-missing values
    jj_unique <- 1:nrow(response_matrix)
    names(jj_unique) <- rownames(response_matrix)
    jj_expand <- rep(jj_unique, each = ncol(response_matrix))
    jj <- jj_expand[not_missing]

  } else {
    # If response matrix is NOT provided...

    # Check that all long-form options are provided
    if(any(identical(y, integer()), identical(ii, integer()),
           identical(jj, integer()))) {
      stop("Options 'y', 'ii', and 'jj' are all required when response_matrix ",
           "is not provided.")
    }

    # Check that all vectors have same length
    vec_lengths <- lengths(list(y, ii, jj))
    if(min(vec_lengths) != max(vec_lengths)) {
      stop("Vectors 'y', 'ii', and 'jj' must have the same length.")
    }

    # Check that there are no NAs
    if(sum(is.na(c(y, ii, jj))) > 0) {
      stop("'y', 'ii', and 'jj' may not include NA.")
    }

    # Apply labelled_integer() if specified
    if (integerize) {
      ii <- labelled_integer(ii)
      jj <- labelled_integer(jj)
    }

    # Check that ii and jj are consecutive integers
    is_consec_integer <- function(x) {
      is_integer <- all(x == as.integer(x))
      is_start_at_one <- min(x) == 1
      is_consecutive <- all(1:max(x) %in% unique(x))
      return(all(is_integer, is_start_at_one, is_consecutive))
    }
    if(!is_consec_integer(ii)) {
      stop("'ii' must consist of consecutive integers with lowest value of one.")
    }
    if(!is_consec_integer(jj)) {
      stop("'jj' must consist of consecutive integers with lowest value of one.")
    }

  }

  # Check suitability of y
  min_y <- tapply(y, ii, min)
  max_y <- tapply(y, ii, max)
  lu_y <- tapply(y, ii, function(x) length(unique(x)))
  if(any(min_y != 0)) {
    warning("One or more items have a minimum score not equal to zero.")
  }
  if(any(max_y + 1 != lu_y)) {
    warning("One or more items have unused response categories.")
  }

  # Construct a formula and/or covariate data frame if not provided
  if(!identical(covariates, data.frame()) & !is.null(formula)) {
    # If covariates and formula are both provided
    # Pass
  } else if (!identical(covariates, data.frame())) {
    # If only covariates
    formula <- ~ .
  } else if (!is.null(formula)) {
    # If only formula
    stop("Error: 'covariates' must be specified when formula is provided.")
  } else if (long_format) {
    # If neither provided and using long format data
    covariates <- data.frame(meh = rep(1, times = length(jj)))
    formula <- ~ 1
  } else {
    # If neither provided and using wide format data
    covariates <- data.frame(meh = rep(1, times = max(jj)))
    formula <- ~ 1
  }

  # If long form data, reduce covariates to one row per person
  if (long_format) {
    covariates <- .long_format_covariates(covariates, jj, formula)
  }

  # Check that covariates have expected number of rows
  if (nrow(covariates) != max(jj)) {
    stop("The 'covariates' must have a number of rows equal to the ",
         "number of persons for wide format data. For long format data, ",
         "the covariates should have a row count equal the length of 'y'")
  }

  # Apply the formula to the covariates, validating if requested
  if (validate_regression) {
    W <- .validate_regression_model(formula, covariates)
  } else {
    W <- stats::model.matrix(formula, covariates)
  }

  data_list <- list(N = length(y), I = max(ii), J = max(jj), ii = ii, jj = jj,
                    y = y, K = ncol(W), W = W)

  return(data_list)

}


#' Transform a vector into consecutive integers
#'
#' This takes vector and transforms it into a vector of consecutive integers,
#' which has a lowest value of one, a maximum value equal to the number of
#' unique values, and no gaps.
#'
#' @param x A vector, which may be numeric, string, or factor.
#' @return A vector of integers corresponding to entries in \code{x}.
#'   The lowest value will be 1, and the greatest value will equal the number of
#'   unique elements in \code{x}.
#'   The elements of the recoded vector are named according to the original
#'   values of \code{x}.
#'   The result is suitable for the \code{ii} and \code{jj} options for
#'   \code{\link{irt_data}}.
#' @examples
#' x <- c("owl", "cat", "pony", "cat")
#' labelled_integer(x)
#'
#' y <- as.factor(x)
#' labelled_integer(y)
#'
#' z <- rep(c(22, 57, 13), times = 2)
#' labelled_integer(z)
#' @export
labelled_integer <- function(x = vector()) {
  x_integer <- match(x, unique(x))
  names(x_integer) <- as.character(x)
  return(x_integer)
}


#' Rescale continuous covariates as appropriate for edstan models
#'
#' This function scales a covariate to have a mean of zero and standard
#' deviation of 0.5.
#'
#' @param x A numeric vector, matrix, or data frame
#' @return A numeric vector, matrix, or data frame with rescaled covariates
#'   having mean of zero and standard deviation of 0.5.
#' @examples
#' vec <- rnorm(5, 100, 20)
#' rescale_continuous(vec)
#'
#' mat <- matrix(rnorm(5*5, 100, 20), ncol = 5)
#' rescale_continuous(mat)
#' @export
rescale_continuous <- function(x) {
  f <- function(y) (y - mean(y)) / stats::sd(y) / 2
  d <- length(dim(x))
  if (is.vector(x)) {
    return(f(x))
  } else if (length(dim(x)) == 2) {
    return(apply(x, 2, f))
  } else {
    stop("Error: Input must be a vector, matrix, or data frame.")
  }
}


#' Rescale binary covariates as appropriate for edstan models
#'
#' This function rescales a covariate to have a mean of zero and range
#' (maximum - minimum) of one
#'
#' @param x A numeric vector, matrix, or data frame
#' @return A numeric vector, matrix, or data frame with rescaled covariates
#'   having mean of zero and range (maximum - minimum) of one.
#' @examples
#' vec <- c(1, 3, 1, 3, 1)
#' rescale_binary(vec)
#'
#' mat <- matrix(c(1, 3, 1, 3, 1), nrow = 5, ncol = 5)
#' rescale_binary(mat)
#' @export
rescale_binary <- function(x) {
  f <- function(y)  (y - mean(y)) / (max(y) - min(y))
  if (is.vector(x)) {
    return(f(x))
  } else if (length(dim(x)) == 2) {
    return(apply(x, 2, f))
  } else {
    stop("Error: Input must be a vector, matrix, or data frame.")
  }
}


#' Convert covariate data frame for long format data
#'
#' Intended for internal use only.
#'
#' @param covariates A data frame containing covariates.
#' @param jj Index for person associated with each row.
#' @return A data frame with one row per person.
#' @keywords internal
.long_format_covariates <- function(covariates, jj, formula) {

  mm <- stats::model.matrix(formula, covariates)
  split_df <- split(as.data.frame(mm), jj)
  unique_rows_by_person <- sapply(split_df, function(x) nrow(unique(x)))

  if (any(unique_rows_by_person > 1)) {
    stop("Error: For long format data, all 'covariate' rows for each person ",
         "must be identical.")
  }

  covariates <- covariates[!duplicated(jj), , drop=FALSE]

  return(covariates)

}


#' Validate a boolean covariate
#'
#' Intended for internal use only.
#'
#' @param x A vector of covariate values.
#' @param nm Name for the covariate.
#' @return A character vector of identified issues.
#' @keywords internal
.validate_binary <- function(x, nm) {
  x_mean <- mean(x)
  x_range <- max(x) - min(x)
  issues <- c()

  if (all.equal(0, x_mean, tolerance = .01) != TRUE) {
    text <- paste(nm, ": binary covariate has mean of",
                  sprintf("%0.3f", x_mean), "rather than 0")
    issues <- append(issues, c(text))
  }

  if (all.equal(1, x_range, tolerance = .01) != TRUE) {
    text <- paste(nm, ": binary covariate has range of",
                  sprintf("%0.3f", x_range), "rather than 1")
    issues <- append(issues, c(text))
  }

  return(issues)

}


#' Validate a continuous covariate
#'
#' Intended for internal use only.
#'
#' @param x A vector of covariate values.
#' @param nm Name for the covariate.
#' @return A character vector of identified issues.
#' @keywords internal
.validate_continuous <- function(x, nm) {
  x_mean <- mean(x)
  x_sd <- stats::sd(x)
  issues <- c()

  if (all.equal(0, x_mean, tolerance = .01) != TRUE) {
    text <- paste(nm, ": continuous covariate has mean of",
                  sprintf("%0.3f", x_mean), "rather than 0")
    issues <- append(issues, c(text))
  }

  if (all.equal(.5, x_sd, tolerance = .01) != TRUE) {
    text <- paste(nm, ": continuous covariate has SD of",
                  sprintf("%0.3f", x_sd), "rather than .5")
    issues <- append(issues, c(text))
  }

  return(issues)

}


#' Validate formula and covariate data
#'
#' Intended for internal use only.
#'
#' @param formula A formula for the latent regression.
#' @param data A data frame of covariates.
#' @return A character vector of identified issues.
#' @keywords internal
.validate_regression_model <- function(formula, data) {
  mm <- stats::model.matrix(formula, data)
  model_terms <- stats::terms(formula, data = data)

  # Check for missing values
  if (any(is.na(mm))) {
    stop("Error: The 'covariates' must not contain NA values.")
  }

  issues <- c()

  if (attr(model_terms, "intercept") == 0) {
    issues <- append(issues, c("formula does not include an intercept term"))
  }

  not_interactions <- which(attr(model_terms, "order")  == 1)
  columns_to_inspect <- which(attr(mm, "assign") %in% not_interactions)

  for (i in columns_to_inspect) {
    cname <- colnames(mm)[i]
    x <- mm[, i]
    u <- length(unique(x))

    if (u == 1) {
      issues <- append(issues, c(paste0(cname, "is constant")))
    } else if (u == 2) {
      issues <- append(issues, .validate_binary(x, cname))
    } else {
      issues <- append(issues, .validate_continuous(x, cname))
    }

  }

  if (length(issues)) {
    message("Potential scaling problems were observed for the covariates:")

    message(paste(
      paste("  -", issues),
      collapse = "\n"
    ))

    message(paste(
      "Recommendations for scaling appropriately with the coefficent priors:",
      "  - An intercept term should be included in the regression",
      "  - Continuous covariates should have a mean of 0 and SD of .5",
      "  - Binary covariates should have a mean of 0 and range (max minus min) of 1",
      "  - Other scalings may be sensible but cannot be validated with this function",
      sep = "\n"
    ))

    warning("Rescaling the regression covariates may be appropriate")

  }

  return(mm)

}

Try the edstan package in your browser

Any scripts or data that you put into this service are public.

edstan documentation built on April 4, 2025, 12:07 a.m.