R/opsr.R
In OPSR: Ordinal Probit Switching Regression

Documented in opsr

#' Fitting Ordinal Probit Switching Regression Models
#'
#' High-level formula interface to the workhorse [`opsr.fit`].
#'
#' @param formula an object of class `"Formula" "formula"`: A symbolic description
#'   of the model to be fitted. The details of model specification are given under
#'   'Details'.
#' @param data an optional data frame, list or environment (or object coercible by
#'   [`as.data.frame`] to a data frame) containing the variables in the model. If
#'   not found in `data`, the variables are taken from `environment(formula)`,
#'   typically the environment from which `opsr` is called.
#' @param subset an optional vector specifying a subset of observations to be used
#'   in the fitting process. (See additional details in the 'Details' section of
#'   the [`model.frame`] documentation.).
#' @param weights an optional vector of weights to be used in the fitting process.
#'   Should be `NULL` or a numeric vector. If non-NULL, then observation-specific
#'   log-likelihood contributions are multiplied by their corresponding weight
#'   before summing.
#' @param na.action a function which indicates what should happen when the data
#'   contain `NA`s. The default is set by the `na.action` setting of [`options`],
#'   and is [`na.fail`] if that is unset. The 'factory-fresh' default is [`na.omit`].
#'   Another possible value is `NULL`, no action. Value [`na.exclude`] can be useful.
#' @param start a numeric vector with the starting values (passed to [`maxLik::maxLik`]).
#'   If no starting values are provided, reasonable values are auto-generated via
#'   the Heckman 2-step procedure [`opsr_2step`]. The structure of `start` has to
#'   conform with `opsr`'s expectations. See [`opsr_check_start`] for further details.
#' @param fixed parameters to be treated as constants at their `start` values. If
#'   present, it is treated as an index vector of `start` parameters (passed to
#'   [`maxLik::maxLik`]).
#' @param method maximzation method (passed to [`maxLik::maxLik`]).
#' @param iterlim maximum number of iterations (passed to [`maxLik::maxLik`]).
#' @param printLevel larger number prints more working information (passed to [`maxLik::maxLik`]).
#' @param nThreads number of threads to be used. Do not pass higher number than
#'   number of ordinal outcomes. See also [`opsr_check_omp`] and [`opsr_max_threads`].
#' @param .get2step if `TRUE`, returns starting values as generated by [`opsr_2step`]. Will
#'   not proceed with the maximum likelihood estimation.
#' @param .useR if `TRUE` usese [`loglik_R`]. Go grab a coffe.
#' @param .censorRho if `TRUE`, rho starting values are censored to lie in the
#'   interval \[-0.85, 0.85\].
#' @param ... further arguments passed to [`maxLik::maxLik`].
#'
#' @return An object of class `"opsr" "maxLik" "maxim"`.
#'
#' @details
#' Models for `opsr` are specified symbolically. A typical model has the form
#' `ys | yo ~ terms_s | terms_o1 | terms_o2 | ...`. `ys` is the ordered (numeric)
#' response vector (starting from 1, in integer-increasing fashion). For the `terms`
#' specification the rules of the regular formula interface apply (see also [stats::lm]).
#' The intercept in the `terms_s` (selection process) is excluded automatically
#' (no need to specify `-1`). If the user wants to specify the same process for
#' all continuous outcomes, two processes are enough (`ys | yo ~ terms_s | terms_o`).
#' Note that the model is poorly identifiable if `terms_s == terms_o` (same regressors
#' are used in selection and outcome processes).
#'
#' @example R/examples/ex-basic_workflow.R
#'
#' @export
opsr <- function(formula, data, subset, weights, na.action, start = NULL,
                 fixed = NULL, method = "BFGS", iterlim = 1000, printLevel = 2,
                 nThreads = 1, .get2step = FALSE, .useR = FALSE, .censorRho = TRUE, ...) {
  start_time <- Sys.time()

  mf <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data", "subset", "weights", "na.action"), names(mf), 0)
  mf <- mf[c(1, m)]

  f <- Formula::Formula(formula)
  mf[[1]] <- as.name("model.frame")
  mf$formula <- f
  mf <- eval(mf, parent.frame())

  ## prep args for opsr.fit()
  l <- length(f)
  nRes <- l[1]

  if (nRes != 2) {
    stop("formula accepts two responses (selection and continuous outcome).",
         " However, ", nRes, " were specified.")
  }

  nParts <- l[2]
  Z <- Formula::model.part(f, data = mf, lhs = 1, drop = TRUE)

  if (is.factor(Z)) {
    stop("Selection outcome has to be numeric (and not a 'factor').")
  }

  Y <- Formula::model.part(f, data = mf, lhs = 2, drop = TRUE)
  nReg <- length(unique(Z))
  nObs <- length(Y)

  if (any(sort(unique(Z)) != 1:max(Z))) {
    stop("Selection outcome must be ordered starting from 1 in increasing fashion",
         " without any gaps. However, unique levels are ", unique(Z))
  }

  if (nParts != 2 && nParts != nReg + 1) {  # +1 for W (selection)
    stop("formula parts must be of length ", nReg + 1, " or 2 (if the same",
         " specification is used for all continuous outcomes. However, ", nParts,
         " were specified.")
  }

  w <- as.vector(model.weights(mf))
  if (!is.null(w) && !is.numeric(w)) {
    stop("'weights' must be a numeric vector")
  }
  if (is.null(w)) {
    w <- rep(1, length(Y))
  }

  ## reorder weights to match with shuffling in opsr.fit() where we compute
  ## likelihood values for all elements Z == 1, then Z == 2, etc. and then
  ## stack them
  weights <- w  # keep a copy to attach to output
  w <- w[order(Z)]

  W <- model.matrix(update(f, ~ . -1), mf, rhs = 1)  # no intercept (identification threshold)!
  Ws <- lapply(seq_len(nReg), function(i) {
    as.matrix(W[Z == i, ])
  })

  Xs <- lapply(seq_len(nReg), function(i) {
    ## if the same outcome equation applies
    rhs <- ifelse(nParts == 2, 2, i + 1)  # first is for selection process
    X <- model.matrix(f, mf, rhs = rhs)
    x_mat <- as.matrix(X[Z == i, ])
    intercept_only <- length(attr(stats::terms(f, rhs = rhs), "term.labels")) == 2
    if (intercept_only) colnames(x_mat) <- "(Intercept)"
    x_mat
  })

  Ys <- lapply(seq_len(nReg), function(i) {
    Y[Z == i]
  })

  if (.get2step) {
    return(opsr_2step(W, Xs, Z, Ys))
  }

  ## check or generate starting values (theta)
  if (!is.null(start)) {
    start <- opsr_check_start(start, W, Xs)
  } else {
    start <- opsr_2step(W, Xs, Z, Ys)
    ## censor rho to [-0.85, 0.85]
    if (.censorRho) {
      rho <- grepl("^rho", names(start))
      start[rho] <- censor(start[rho], lower = -0.85, upper = 0.85)
    }
  }

  fit <- opsr.fit(Ws, Xs, Ys, start, fixed, w,
                  method, iterlim, printLevel, nThreads, .useR, ...)

  runtime <- Sys.time() - start_time

  ## return also some other useful information
  fit$call <- match.call()
  fit$formula <- f
  fit$loglik <- function(theta) fit$objectiveFn(theta)[order(Z)] ## ll func with correct order
  fit$runtime <- runtime
  fit$start <- start
  fit$nReg <- nReg
  fit$nObs <- c(Total = nObs, stats::setNames(c(table(Z)), paste0("o", seq_len(nReg))))
  fit$nParams <- length(fit$estimate) - sum(fit$fixed)
  fit$df <- fit$nObs[["Total"]] - fit$nParams
  fit$nParts <- nParts
  fit$weights <- weights

  class(fit) <- c("opsr", class(fit))

  fit
}