#' @title Gap Analysis
#'
#' @description Compares the average levels of a variable between two groups
#' that potentially share members.
#'
#' @param variable a character indicating the variable to be compared,
#' potentially with a subject scale or subscale
#' @param data an \code{edsurvey.data.frame}, a \code{light.edsurvey.data.frame}, or an \code{edsurvey.data.frame.list}
#' @param groupA an expression or character expression that defines a condition for the subset.
#' This subset will be compared to \code{groupB}. If not specified, the
#' whole sample in \code{data} is used.
#' @param groupB an expression or character expression that defines a condition for the subset.
#' This subset will be compared to \code{groupA}. If not specified, the
#' whole sample in \code{data} is used. If set to \code{NULL}, estimates for the second group
#' are dropped.
#' @param percentiles a numeric vector. The \code{gap} function calculates the
#' mean when this
#' argument is omitted or set to \code{NULL}. Otherwise,
#' the gap at the percentile given is calculated.
#' @param achievementLevel the achievement level(s) at which percentages
#' should be calculated
#' @param achievementDiscrete a logical indicating if the achievement level
#' specified in the \code{achievementLevel} argument should be interpreted
#' as discrete, so that just the percentage in that particular achievement
#' level will be included. Defaults to \code{FALSE}, so that the percentage
#' at or above that achievement level will be included.
#' @param stDev a logical, set to \code{TRUE} to calculate the gap in standard deviations.
#' @param targetLevel a character string. When specified, calculates the gap in
#' the percentage of students at
#' \code{targetLevel} in the \code{variable} argument. This is useful for
#' comparing the gap in the percentage of students at a
#' survey response level.
#' @param weightVar a character indicating the weight variable to use.
#' See Details.
#' @param jrrIMax a numeric value; when using the jackknife variance estimation method, the default estimation option, \code{jrrIMax=1}, uses the
#' sampling variance from the first plausible value as the component for sampling variance estimation. The \code{Vjrr}
#' term, or sampling variance term, can be estimated with any number of plausible values, and values larger than the number of
#' plausible values on the survey (including \code{Inf}) will result in all plausible values being used.
#' Higher values of \code{jrrIMax} lead to longer computing times and more accurate variance estimates.
#' @param varMethod deprecated parameter, \code{gap} always uses the jackknife variance estimation
#' @param dropOmittedLevels a logical value. When set to the default value of
#' \code{TRUE}, drops the omitted levels of
#' all factor variables.
#' Use \code{print} on an \code{edsurvey.data.frame}
#' to see the omitted levels.
#' @param defaultConditions a logical value. When set to the default value
#' of \code{TRUE}, uses the default
#' conditions stored in \code{edsurvey.data.frame}
#' to subset the data.
#' Use \code{print} on an \code{edsurvey.data.frame}
#' to see the default conditions.
#' @param recode a list of lists to recode variables. Defaults to \code{NULL}.
#' Can be set as
#' \code{recode} \code{=} \code{list(var1} \code{=}
#' \code{list(from} \code{=} \code{c("a",} \code{"b",}
#' \code{"c"),} \code{to} \code{=} \code{"d"))}.
#' @param referenceDataIndex a numeric used only when the \code{data} argument is an
#' \code{edsurvey.data.frame.list},
#' indicating which dataset is the reference
#' dataset that other datasets are compared with.
#' Defaults to 1.
#' @param returnVarEstInputs a logical value; set to \code{TRUE} to return the
#' inputs to the jackknife and imputation variance
#' estimates, which allows for the computation
#' of covariances between estimates.
#' @param returnSimpleDoF a logical value set to \code{TRUE} to return the degrees
#' of freedom for some statistics (see Value
#' section) that do not have a
#' \emph{t}-test; useful primarily for further computation
#' @param returnSimpleN a logical value set to \code{TRUE} to add the count
#' (\emph{n}-size) of observations included in groups A and B
#' in the percentage object
#' @param returnNumberOfPSU a logical value set to \code{TRUE} to return the number of
#' PSUs used in the calculation
#' @param noCov a logical value; set to \code{TRUE} to set the covariances to zero in the result
#' @param pctMethod a character that is one of \code{unbiased}, \code{symmetric}, or \code{simple}.
#' See the help for \code{\link{percentile}} for more information.
#' @param includeLinkingError a logical value set to \code{TRUE} to include the
#' linking error in variance estimation.
#' Standard errors (e.g., \code{diffAAse}, \code{diffBBse},
#' and \code{diffABABse}) and \emph{p}-values (e.g., \code{diffAApValue},
#' \code{diffBBpValue}, and \code{diffABABpValue}) would be adjusted for
#' comparisons between digitally based assessments (DBA) and
#' paper-based assessments (PBA) data.
#' This option is currently supported only for NAEP and PISA data.
#'
#' @param omittedLevels this argument is deprecated. Use \code{dropOmittedLevels}.
#'
#' @details This function calculates the gap between \code{groupA} and \code{groupB} (which
#' may be omitted to indicate the full sample). The gap is
#' calculated for one of four statistics:
#' \describe{
#' \item{the gap in means}{The mean score gap (in the score
#' variable) identified in the \code{variable} argument.
#' This is the default. The means and their standard errors are
#' calculated using the methods
#' described in the \code{\link{lm.sdf}} function documentation.}
#' \item{the gap in percentiles}{The gap between respondents at
#' the percentiles specified in the \code{percentiles} argument.
#' This is returned when the \code{percentiles} argument is
#' defined. The mean and standard error are computed as described in the
#' \code{\link{percentile}} function documentation.}
#' \item{the gap in achievement levels}{The gap in the percentage of
#' students at (when \code{achievementDiscrete} is \code{TRUE}) or at
#' or above (when \code{achievementDiscrete} is \code{FALSE}) a
#' particular achievement level. This is used when the
#' \code{achievementLevel} argument is defined. The mean and standard error
#' are calculated as described in the \code{\link{achievementLevels}}
#' function documentation.}
#' \item{the gap in a survey response}{The gap in the percentage of
#' respondents responding at \code{targetLevel} to
#' \code{variable}. This is used when \code{targetLevel} is
#' defined. The mean and standard error are calculated as described in
#' the \code{\link{edsurveyTable}} function documentation.}
#' }
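#'
#' As an illustrative sketch of these four call patterns (not run; the dataset
#' \code{sdf} and the variables \code{composite}, \code{dsex}, and \code{b017451},
#' along with the level names, are placeholders in the style of the NAEP Primer
#' examples rather than values supplied by this function):
#' \preformatted{
#' # gap in means (the default statistic)
#' gap("composite", sdf, dsex == "Male", dsex == "Female")
#' # gap at the 50th percentile
#' gap("composite", sdf, dsex == "Male", dsex == "Female", percentiles = 50)
#' # gap in the percentage at or above an achievement level
#' gap("composite", sdf, dsex == "Male", dsex == "Female",
#'     achievementLevel = "Proficient")
#' # gap in the percentage giving a particular survey response
#' gap("b017451", sdf, dsex == "Male", dsex == "Female",
#'     targetLevel = "Every day")
#' }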
#'
#' @return
#' The return type depends on whether the class of the \code{data} argument is an
#' \code{edsurvey.data.frame} or an \code{edsurvey.data.frame.list}. Both
#' include the call (called \code{call}), a list called \code{labels},
#' an object named \code{percentage}
#' that shows the percentage in \code{groupA} and \code{groupB}, and an object
#' that shows the gap called \code{results}.
#'
#' The labels include the following elements:
#' \item{definition}{the definitions of the groups}
#' \item{nFullData}{the \emph{n}-size for the full dataset (before applying the definition)}
#' \item{nUsed}{the \emph{n}-size for the data after the group is subsetted and other
#' restrictions (such as omitted values) are applied}
#' \item{nPSU}{the number of PSUs used in calculation--only returned when
#' \code{returnNumberOfPSU} \code{=} \code{TRUE}}
#'
#' The percentages are computed according to the vignette titled
#' \href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{\emph{Statistical Methods Used in EdSurvey}}
#' in the section
#' \dQuote{Estimation of Weighted Percentages When Plausible Values Are Not Present.}
#' The standard errors are calculated according to
#' \dQuote{Estimation of the Standard Error of Weighted Percentages When Plausible Values Are Not Present, Using the Jackknife Method.}
#' Standard errors of differences are calculated as the square root of the typical
#' variance formula
#' \deqn{Var(A-B) = Var(A) + Var(B) - 2 Cov(A,B)}
#' where the covariance term is calculated as described in the vignette titled
#' \href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{\emph{Statistical Methods Used in EdSurvey}}
#' in the section
#' \dQuote{Estimation of Covariances.} These degrees of freedom are available only
#' with the jackknife variance estimation. The degrees of freedom used for hypothesis testing
#' are always set to the number of jackknife replicates in the data.
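#'
#' As a worked example of the variance formula above, with hypothetical values
#' \code{se(A) = 1.2}, \code{se(B) = 0.9}, and \code{Cov(A,B) = 0.3}, the standard
#' error of the difference is \code{sqrt(1.2^2 + 0.9^2 - 2 * 0.3) = sqrt(1.65)},
#' or approximately 1.28.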
#'
#' \strong{the data argument is an edsurvey.data.frame}
#' When the \code{data} argument is an \code{edsurvey.data.frame},
#' \code{gap} returns an S3 object of class \code{gap}.
#'
#' The \code{percentage} object is a numeric vector with the following elements:
#' \item{pctA}{the percentage of respondents in \code{groupA} compared with the whole sample in \code{data}}
#' \item{pctAse}{the standard error on the percentage of respondents in
#' \code{groupA}}
#' \item{dofA}{degrees of freedom appropriate for a \emph{t}-test involving \code{pctA}.
#' This value is returned only if
#' \code{returnSimpleDoF}\code{=}\code{TRUE}.}
#' \item{pctB}{the percentage of respondents in \code{groupB}.}
#' \item{pctBse}{the standard error on the percentage of respondents in
#' \code{groupB}}
#' \item{dofB}{degrees of freedom appropriate for a \emph{t}-test involving \code{pctB}.
#' This value is returned only if
#' \code{returnSimpleDoF}\code{=}\code{TRUE}.}
#' \item{diffAB}{the value of \code{pctA} minus \code{pctB}}
#' \item{covAB}{the covariance of \code{pctA} and \code{pctB}; used in
#' calculating \code{diffABse}.}
#' \item{diffABse}{the standard error of \code{pctA}
#' minus \code{pctB}}
#' \item{diffABpValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffAB}
#' is zero}
#' \item{dofAB}{degrees of freedom used in calculating
#' \code{diffABpValue}}
#'
#' The \code{results} object is a numeric data frame with the following elements:
#' \item{estimateA}{the mean estimate of \code{groupA} (or the percentage estimate
#' if \code{achievementLevel} or \code{targetLevel} is specified)}
#' \item{estimateAse}{the standard error of \code{estimateA}}
#' \item{dofA}{degrees of freedom appropriate for a \emph{t}-test involving \code{estimateA}.
#' This value is returned only if
#' \code{returnSimpleDoF}\code{=}\code{TRUE}.}
#' \item{estimateB}{the mean estimate of \code{groupB} (or the percentage estimate
#' if \code{achievementLevel} or \code{targetLevel} is specified)}
#' \item{estimateBse}{the standard error of \code{estimateB}}
#' \item{dofB}{degrees of freedom appropriate for a \emph{t}-test involving \code{estimateB}.
#' This value is returned only if
#' \code{returnSimpleDoF}\code{=}\code{TRUE}.}
#' \item{diffAB}{the value of \code{estimateA} minus \code{estimateB}}
#' \item{covAB}{the covariance of \code{estimateA} and \code{estimateB}. Used in
#' calculating \code{diffABse}.}
#' \item{diffABse}{the standard error of \code{diffAB}}
#' \item{diffABpValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffAB}
#' is zero.}
#' \item{dofAB}{degrees of freedom used for the \emph{t}-test on \code{diffAB}}
#'
#' If the gap was in achievement levels or percentiles and more
#' than one percentile or achievement level is requested,
#' then an additional column
#' labeled \code{percentiles} or \code{achievementLevel} is included
#' in the \code{results} object.
#'
#' When \code{results} has a single row and when \code{returnVarEstInputs}
#' is \code{TRUE}, the additional elements \code{varEstInputs} and
#' \code{pctVarEstInputs} also are returned. These can be used for calculating
#' covariances with \code{\link{varEstToCov}}.
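#'
#' A minimal sketch of that use, assuming \code{sdf} is a NAEP
#' \code{edsurvey.data.frame} (whose jackknife sum multiplier is 1) and that
#' \code{composite} and \code{dsex} are placeholder variable names:
#' \preformatted{
#' gapRes <- gap("composite", sdf, dsex == "Male", dsex == "Female",
#'               returnVarEstInputs = TRUE)
#' # covariance between the group A and group B estimates, which feeds into
#' # the Var(A-B) formula above
#' varEstToCov(gapRes$varEstInputs, varA = "A", varB = "B", jkSumMultiplier = 1)
#' }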
#'
#' \strong{the data argument is an edsurvey.data.frame.list}
#' When the \code{data} argument is an \code{edsurvey.data.frame.list},
#' \code{gap} returns an S3 object of class \code{gapList}.
#'
#' The \code{results} object in the \code{gapList} is
#' a \code{data.frame}. Each row corresponds to a particular dataset in the
#' \code{edsurvey.data.frame.list}, and the reference dataset is dictated by
#' the \code{referenceDataIndex} argument.
#'
#' The \code{percentage} object is a \code{data.frame} with the following elements:
#' \item{covs}{a data frame with a column for each column in the \code{covs} of the
#' \code{edsurvey.data.frame.list}. See the previous
#' section for more details.}
#' \item{...}{all elements in the \code{percentage} object in the
#' previous section}
#' \item{diffAA}{the difference in \code{pctA} between the reference data
#' and this dataset. Set to \code{NA} for the
#' reference dataset.}
#' \item{covAA}{the covariance of \code{pctA} in the reference data and
#' \code{pctA} on this row. Used in
#' calculating \code{diffAAse}.}
#' \item{diffAAse}{the standard error for \code{diffAA}}
#' \item{diffAApValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffAA}
#' is zero}
#' \item{diffBB}{the difference in \code{pctB} between the reference data
#' and this dataset. Set to \code{NA} for the
#' reference dataset.}
#' \item{covBB}{the covariance of \code{pctB} in the reference data and
#' \code{pctB} on this row. Used in
#' calculating \code{diffBBse}.}
#' \item{diffBBse}{the standard error for \code{diffBB}}
#' \item{diffBBpValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffBB}
#' is zero}
#' \item{diffABAB}{the value of \code{diffAB} in the reference dataset
#' minus the value of \code{diffAB} in this dataset. Set
#' to \code{NA} for the reference dataset.}
#' \item{covABAB}{the covariance of \code{diffAB} in the reference data and
#' \code{diffAB} on this row. Used in
#' calculating \code{diffABABse}.}
#' \item{diffABABse}{the standard error for \code{diffABAB}}
#' \item{diffABABpValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffABAB}
#' is zero}
#'
#' The \code{results} object is a \code{data.frame} with the following elements:
#' \item{...}{all elements in the \code{results} object in the
#' previous section}
#' \item{diffAA}{the value of \code{estimateA} in the reference dataset minus
#' the value in this dataset. Set to \code{NA} for the
#' reference dataset.}
#' \item{covAA}{the covariance of \code{estimateA} in the reference data and
#' \code{estimateA} on this row. Used in
#' calculating \code{diffAAse}.}
#' \item{diffAAse}{the standard error for \code{diffAA}}
#' \item{diffAApValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffAA}
#' is zero}
#' \item{diffBB}{the value of \code{estimateB} in the reference dataset minus
#' the value in this dataset. Set to \code{NA} for the
#' reference dataset.}
#' \item{covBB}{the covariance of \code{estimateB} in the reference data and
#' \code{estimateB} on this row. Used in
#' calculating \code{diffBBse}.}
#' \item{diffBBse}{the standard error for \code{diffBB}}
#' \item{diffBBpValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffBB}
#' is zero}
#' \item{diffABAB}{the value of \code{diffAB} in the reference dataset
#' minus the value of \code{diffAB}
#' in this dataset. Set
#' to \code{NA} for the reference dataset.}
#' \item{covABAB}{the covariance of \code{diffAB} in the reference data and
#' \code{diffAB} on this row. Used in
#' calculating \code{diffABABse}.}
#' \item{diffABABse}{the standard error for \code{diffABAB}}
#' \item{diffABABpValue}{the \emph{p}-value associated with the \emph{t}-test used
#' for the hypothesis test that \code{diffABAB}
#' is zero}
#' \item{sameSurvey}{a logical value indicating if this line uses the same
#' survey as the reference line. Set to \code{NA} for the
#' reference line.}
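#'
#' As a sketch of reading those columns, assuming \code{esdfl} is a two-dataset
#' \code{edsurvey.data.frame.list} and \code{composite} and \code{dsex} are
#' placeholder variable names:
#' \preformatted{
#' gapRes <- gap("composite", esdfl, dsex == "Male", dsex == "Female")
#' gapRes$results$diffAB      # the A-B gap within each dataset
#' gapRes$results$diffABAB    # change in that gap relative to the reference dataset
#' gapRes$results$diffABABse  # its standard error
#' }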
#'
#' @author Paul Bailey, Trang Nguyen, and Huade Huo
#' @importFrom stats formula
#' @example man/examples/gap.R
#' @export
gap <- function(variable, data, groupA = "default", groupB = "default",
percentiles = NULL,
achievementLevel = NULL,
achievementDiscrete = FALSE,
stDev = FALSE,
targetLevel = NULL,
weightVar = NULL, jrrIMax = 1,
varMethod = c("jackknife"),
dropOmittedLevels = TRUE,
defaultConditions = TRUE,
recode = NULL,
referenceDataIndex = 1,
returnVarEstInputs = FALSE,
returnSimpleDoF = FALSE,
returnSimpleN = FALSE,
returnNumberOfPSU = FALSE,
noCov = FALSE,
pctMethod = c("unbiased", "symmetric", "simple"),
includeLinkingError = FALSE,
omittedLevels = deprecated()) {
if (is.character(substitute(groupA))) {
groupA <- parse(text = groupA)[[1]]
}
if (all(as.character(substitute(groupA)) == "default")) {
groupB <- NULL
}
if (!missing(groupB)) {
if (is.character(substitute(groupB))) {
groupB <- parse(text = groupB)[[1]]
}
}
if (!missing(varMethod)) {
warning(paste0("Argument ", dQuote("varMethod"), " deprecated. The ", dQuote("gap"), " function always uses jackknife variance estimation."))
}
if (lifecycle::is_present(omittedLevels)) {
lifecycle::deprecate_soft("4.0.0", "gap(omittedLevels)", "gap(dropOmittedLevels)")
dropOmittedLevels <- omittedLevels
}
if (!is.logical(dropOmittedLevels)) stop("The ", sQuote("dropOmittedLevels"), " argument must be logical.")
# check incoming variables
call <- match.call()
# get valid pctMethod
pctMethod <- match.arg(pctMethod)
checkDataClass(data, c("edsurvey.data.frame", "light.edsurvey.data.frame", "edsurvey.data.frame.list"))
# if a recode list is provided, it is added to userConditions first (before subsetting to groupA and groupB)
if (!is.null(recode)) {
data <- recode.sdf(data, recode)
}
if (inherits(data, "edsurvey.data.frame") | inherits(data, "light.edsurvey.data.frame")) {
survey <- getAttributes(data, "survey")
} else {
# edsurvey.data.frame.list
robustSurvey <- function(dat) {
res <- tryCatch(getAttributes(dat, "survey"),
error = function(e) {
NULL
}
)
return(res)
}
survey <- unique(unlist(lapply(data$data, robustSurvey)))
}
# check if linking error is supported for this survey
if (!missing(includeLinkingError) && includeLinkingError) {
checkDataClass(data, c("edsurvey.data.frame.list"))
if (!any(c("NAEP", "PISA") %in% survey)) {
stop("the argument ", dQuote("includeLinkingError"), " can only be set to ", dQuote("TRUE"), " when EdSurvey suports linking error for the survey. Currently EdSurvey supports linking error for NAEP and PISA.")
}
}
# Set dbaLabels as NULL for linking error with a non-ESDFL object or
# includeLinkingError set to FALSE
dbaLabels <- NULL
# gapStat required for PISA regardless of linking error
if ("PISA" %in% survey) {
if (!is.null(percentiles)) {
gapStat <- "percentile"
} else if (!is.null(achievementLevel)) {
gapStat <- "AL"
} else if (stDev) {
gapStat <- "SD"
} else {
gapStat <- "Mean"
}
}
# Get parameters for linking error
if (!missing(includeLinkingError) && includeLinkingError) {
# the following is only for NAEP which has linking error within survey
if ("NAEP" %in% survey) {
if (inherits(data, "edsurvey.data.frame.list")) {
uniqueSubject <- unique(unlist(lapply(data$data, getAttributes, "subject")))
if (length(uniqueSubject) != 1) {
stop(paste0("Linking error only support ", dQuote("edsurvey.data.frame.list"), " with a single subject. Found subjects: ", pasteItems(uniqueSubject), "."))
}
uniqueGrade <- unique(unlist(lapply(data$data, getAttributes, "gradeLevel")))
if (length(uniqueGrade) != 1) {
stop(paste0("Linking error only support ", dQuote("edsurvey.data.frame.list"), " with a single grade level. Found grades: ", pasteItems(uniqueGrade), "."))
}
}
# For achievementDiscrete=TRUE, return "At basic" (not "Below basic") if achievementLevel = "basic"
alLinkingErrorhelper <- function(alstrings, discrete) {
matchedAL <- NULL
for (alstring in alstrings) {
if (discrete) {
if (grepl("^basic", alstring, ignore.case = TRUE)) {
matchedAL <- c(matchedAL, "At basic")
} else {
matchedAL <- c(
matchedAL,
c(
"Below basic", "At basic", "At proficient",
"At advanced"
)[grepl(alstring, c(
"Below basic", "At basic",
"At proficient", "At advanced"
),
ignore.case = TRUE
)]
)
}
} else {
matchedAL <- c(
matchedAL,
c(
"At or above basic",
"At or above proficient",
"At advanced"
)[
grepl(alstring, c("At or above basic", "At or above proficient", "At advanced"),
ignore.case = TRUE
)
]
)
}
}
if (length(matchedAL) != length(alstrings)) {
stop("Linking error for provided achievement level ", pasteItems(alstrings[!alstrings %in% matchedAL]), "is not available.")
}
return(matchedAL)
}
# Get DBA labels from DBA years
# 2017 is the first NAEP DBA year
if (inherits(data, "edsurvey.data.frame.list")) {
allYears <- unlist(itterateESDFL(call("getAttributes", attribute = "year"), data))
dbaLabels <- data[[2]]$labels[allYears >= 2017]
if (!all(dbaLabels %in% data[[2]]$labels)) {
stop(paste0(
pasteItems(dbaLabels[!dbaLabels %in% data[[2]]$labels]),
" is not in the ", dQuote("edsurvey.data.frame.list"), " labels."
))
}
} else {
allYears <- getAttributes(data, attribute = "year")
dbaLabels <- NULL
}
if (!is.null(percentiles) && !all(percentiles %in% c(10, 25, 50, 75, 90))) {
stop(
"NAEP linking error only supported for 10th, 25th, 50th, 75th, 90th percentiles.",
"Set ", dQuote("includeLinkingError"), " to ", dQuote("FALSE"),
"to calculate any percentile without linking error."
)
}
if (!is.null(percentiles)) {
gapStat <- percentiles
} else if (!is.null(achievementLevel)) {
gapStat <- alLinkingErrorhelper(achievementLevel, achievementDiscrete)
} else if (stDev) {
gapStat <- "SD"
} else {
gapStat <- "Mean"
}
}
}
# deal with the possibility that there is more than one line of results for various possible reasons
if (inherits(data, "edsurvey.data.frame.list") | length(percentiles) > 1 | length(achievementLevel) > 1) {
# a flag that is TRUE when there is no information in data$covs. Default to false
nocovs <- FALSE
if (!inherits(data, "edsurvey.data.frame.list")) {
# this is a situation where data is not an esdfl. The following code
# required it to be one, so make it one. But note that covs
# does not have real information in it.
nocovs <- TRUE
data <- edsurvey.data.frame.list(list(data), labels = "data")
}
ll <- length(data$datalist)
lpct <- ifelse(is.null(percentiles), 1, length(percentiles))
lal <- ifelse(is.null(achievementLevel), 1, length(achievementLevel))
# check variable specific to edsurvey.data.frame.list
suppressWarnings(refi <- as.integer(referenceDataIndex))
if (!refi %in% 1:ll) {
stop(paste0("The argument ", sQuote("referenceDataIndex"), " must be an integer between 1 and ", ll, "."))
}
res <- list(
results = list(),
labels = list(A = substitute(groupA), B = substitute(groupB))
)
pctdf0 <- data$covs
if (!nocovs) {
pctdf0$covAA <- 0
pctdf0$dofAA <- Inf
pctdf0$covBB <- 0
pctdf0$dofBB <- Inf
pctdf0$covABAB <- 0
pctdf0$dofABAB <- Inf
}
# make sure refi gets calculated first
llvec <- unique(c(refi, 1:ll))
resilist <- list()
# call gap for each edsurvey.data.frame
for (i in llvec) {
# for each element of the edsurvey.data.frame.list
calli <- call
# adjust the call per the needs of this call
calli$data <- data$datalist[[i]]
# we need these inputs to calculate covariances across edsurvey.data.frames
# (but only if they come from the same survey), an issue
# that is resolved later.
calli$returnVarEstInputs <- TRUE
# this returns the degrees of freedom for meanA and meanB
# used for diffAA and diffBB, respectively
calli$returnSimpleDoF <- TRUE
# if these are NULL then setting them changes nothing
calli$percentiles <- percentiles
calli$achievementLevel <- achievementLevel
calli <- as.list(calli)
# use the same list of arguments but change the function to gapHelper
calli[[1]] <- as.name("gapHelper")
resilist[[i]] <- tryCatch(resi <- eval(as.call(calli)),
error = function(cond) {
message(paste0(
"An error occurred while working on a dataset ",
pasteItems(dQuote(data$covs[i, ])),
". The results from that dataset will be excluded. Error message: ",
cond
))
return(0)
}
)
# if there was not an error
if (!inherits(resilist[[i]], "gap")) {
# there was an error
if (i %in% referenceDataIndex) {
stop("An error prevented processing of the reference dataset. This must be fixed before comparisons to the reference dataset can be made.")
}
} else {
resi$call <- NULL
# the following code is to clean up the results (adding some columns to make the code to extract results from gapHelper more consistent)
# after this code, we should expect that each gap result will have:
# 1. resi$results will have 'statisticsLevel' column: indicate percentile or achievementLevel, or 'data' if none is specified
# 2. resi$varEstInputs: PV and JK data frame will have 'Level' column similar to 'statisticsLevel' column in resi$results
if (is.null(percentiles) && is.null(achievementLevel)) {
resi$results$statisticsLevel <- "data"
}
# if no or only 1 percentile/achievementLevel is specified
if (length(percentiles) <= 1 && length(achievementLevel) <= 1) {
resi$varEstInputs$JK$Level <- "data"
if (!is.null(resi$varEstInputs$PV)) { # PV is null for type = "eq"
resi$varEstInputs$PV$Level <- "data"
}
}
# in gapHelper, type = 'pct' will return 'percentiles' column and type = 'al' will return 'achievementLevel' column
# this replacement is to use 1 consistent name for the result extraction code below
# it will be renamed back to original names after the extraction
colnames(resi$results) <- gsub("achievementLevel|percentiles", "statisticsLevel", colnames(resi$results))
resilist[[i]] <- resi
}
} # end for(i in llvec)
# Extract results and compute AA or BB statistics
lstats <- max(lpct, lal)
for (j in 1:lstats) {
resdf <- data$covs
if (!nocovs) {
# only generate these columns if there are multiple datasets
resdf$covAA <- 0
resdf$dofAA <- Inf
resdf$covBB <- 0
resdf$dofBB <- Inf
resdf$covABAB <- 0
resdf$dofABAB <- Inf
}
# to check whether the two samples are from the same survey or not
# we need to use DOFCorrection and compute covariance if the two samples are from the same survey
resdf$sameSurvey <- ifelse(1:nrow(resdf) == refi,
rep(NA, nrow(resdf)),
rep(FALSE, nrow(resdf))
)
if (lpct > 1) {
resdf$percentiles <- percentiles[[j]]
}
for (i in llvec) {
if (!inherits(resilist[[i]], "gap")) {
# there was an error
if (i %in% referenceDataIndex) {
stop("An error prevented processing of the reference dataset. This must be fixed before comparisons to the reference dataset can be made.")
}
# this is to make sure that all achievementLevels or percentiles are created to match the reference edsurvey.data.frame
# Missing values will be NA for statistics
if (!is.null(achievementLevel)) {
resdf$achievementLevel[i] <- resilist[[refi]]$results$statisticsLevel[j]
} else if (!is.null(percentiles)) {
resdf$percentiles[i] <- resilist[[refi]]$results$statisticsLevel[j]
}
} else { # there was not an error
# dataframes do not like vectorwise assignment by column name,
# so assign one at a time
resi <- resilist[[i]]
resi$results <- resi$results[j, ]
resi$varEstInputs$JK <- resi$varEstInputs$JK[tolower(resi$varEstInputs$JK$Level) %in% tolower(c(resi$results$statisticsLevel, "data")), ]
if (!is.null(resi$varEstInputs$PV)) {
resi$varEstInputs$PV <- resi$varEstInputs$PV[tolower(resi$varEstInputs$PV$Level) %in% tolower(c(resi$results$statisticsLevel, "data")), ]
}
if (!is.null(achievementLevel)) {
colnames(resi$results) <- gsub("statisticsLevel", "achievementLevel", colnames(resi$results))
} else if (!is.null(percentiles)) {
colnames(resi$results) <- gsub("statisticsLevel", "percentiles", colnames(resi$results))
} else {
resi$results$statisticsLevel <- NULL
}
for (ii in seq_along(resi$results)) {
resdf[i, names(resi$results)[ii]] <- resi$results[[ii]]
}
skipB <- is.null(resi$labels$B) # a boolean to indicate whether groupB estimates are available in the results of gapHelper
if (i == refi) {
refVarEstInputs <- resi$varEstInputs
refPctVarEstInputs <- resi$pctVarEstInputs
}
# refi is the index of reference dataset
# AA and BB statistics of reference dataset will be set to NA
if (i != refi) {
resdf$dofAA[i] <- dofCompute((resi$results)$estimateAse, resdf[refi, "estimateAse"], (resi$results)[["dofA"]], resdf[refi, "dofA"])
if (!skipB) {
resdf$dofBB[i] <- dofCompute((resi$results)$estimateBse, resdf[refi, "estimateBse"], (resi$results)[["dofB"]], resdf[refi, "dofB"])
resdf$dofABAB[i] <- dofCompute((resi$results)$diffABse, resdf[refi, "diffABse"], (resi$results)[["dofAB"]], resdf[refi, "dofAB"])
}
# if the datasets are from the same survey, add covariances and do DoFCorrection
if (sameSurvey(data$datalist[[i]], data$datalist[[refi]])) {
if (nrow(resi$varEstInputs$JK) != 0) { # some Levels (percentile or achievementLevel) do not have varEstInput data, so the subset
# above can return an empty data.frame
resdf$dofAA[i] <- DoFCorrection(resi$varEstInputs, refVarEstInputs, "A", method = "JR")
}
resdf$covAA[i] <- varEstToCov(resi$varEstInputs, refVarEstInputs, "A", jkSumMultiplier = getAttributes(data$datalist[[i]], "jkSumMultiplier"))
if (!skipB & nrow(resi$varEstInputs$JK) != 0) {
resdf$dofBB[i] <- DoFCorrection(resi$varEstInputs, refVarEstInputs, "B", method = "JR")
resdf$dofABAB[i] <- DoFCorrection(resi$varEstInputs, refVarEstInputs, "A-B", method = "JR")
resdf$covBB[i] <- varEstToCov(resi$varEstInputs, refVarEstInputs, "B", jkSumMultiplier = getAttributes(data$datalist[[i]], "jkSumMultiplier"))
resdf$covABAB[i] <- varEstToCov(resi$varEstInputs, refVarEstInputs, "A-B", jkSumMultiplier = getAttributes(data$datalist[[i]], "jkSumMultiplier"))
}
resdf$sameSurvey[i] <- TRUE
} else {
if ("PISA" %in% survey) {
linke <- 0
if (includeLinkingError & gapStat %in% c("Mean", "percentile", "SD")) {
if (variable %in% c("read", "math", "scie")) {
yearref <- getAttributes(data$datalist[[refi]], "year")
yeari <- getAttributes(data$datalist[[i]], "year")
if (yeari != yearref) {
linke <- calLinkingErrorPISA(subject = variable, years = c(yearref, yeari))
}
} else {
warning("No published linking error for variable ", dQuote(variable), "\n")
}
}
covvars <- c("covAA", "covBB", "covABAB")
for (cvi in seq_along(covvars)) {
if (covvars[cvi] %in% colnames(resdf)) {
resdf[i, covvars[cvi]] <- -1 / 2 * linke^2
}
}
}
}
} else { # end if(i != refi)
# if this is refi, set these to NA
resdf$covAA[i] <- resdf$covBB[i] <- resdf$covABAB[i] <- NA
resdf$dofAA[i] <- resdf$dofBB[i] <- resdf$dofABAB[i] <- NA
}
# for the first j value only, grab the percent
if (j == 1) {
for (ii in 1:ncol(resi$percentage)) {
pctdf0[i, colnames(resi$percentage)[ii]] <- resi$percentage[ , ii]
if (i != refi) {
pctdf0$dofAA[i] <- dofCompute((resi$percentage)$pctAse, pctdf0[refi, "pctAse"], (resi$percentage)[["dofA"]], pctdf0[refi, "dofA"])
if (!skipB) {
pctdf0$dofBB[i] <- dofCompute((resi$percentage)$pctBse, pctdf0[refi, "pctBse"], (resi$percentage)[["dofB"]], pctdf0[refi, "dofB"])
pctdf0$dofABAB[i] <- dofCompute((resi$percentage)$diffABse, pctdf0[refi, "diffABse"], (resi$percentage)[["dofAB"]], pctdf0[refi, "dofAB"])
}
if (sameSurvey(data$datalist[[i]], data$datalist[[refi]])) {
pctdf0$dofAA[i] <- DoFCorrection(resi$pctVarEstInputs, refPctVarEstInputs, "A", method = "JR")
pctdf0$covAA[i] <- varEstToCov(resi$pctVarEstInputs, refPctVarEstInputs, "A", jkSumMultiplier = getAttributes(data$datalist[[i]], "jkSumMultiplier"))
if (!skipB) {
pctdf0$dofBB[i] <- DoFCorrection(resi$pctVarEstInputs, refPctVarEstInputs, "B", method = "JR")
pctdf0$dofABAB[i] <- DoFCorrection(resi$pctVarEstInputs, refPctVarEstInputs, "A-B", method = "JR")
pctdf0$covBB[i] <- varEstToCov(resi$pctVarEstInputs, refPctVarEstInputs, "B", jkSumMultiplier = getAttributes(data$datalist[[i]], "jkSumMultiplier"))
pctdf0$covABAB[i] <- varEstToCov(resi$pctVarEstInputs, refPctVarEstInputs, "A-B", jkSumMultiplier = getAttributes(data$datalist[[i]], "jkSumMultiplier"))
}
}
} else { # end if(i != refi)
# if this is refi, set these to NA
pctdf0$covAA[i] <- pctdf0$covBB[i] <- pctdf0$covABAB[i] <- NA
pctdf0$dofAA[i] <- pctdf0$dofBB[i] <- pctdf0$dofABAB[i] <- NA
} # end else for (i != refi)
} # end for(ii in 1:ncol(resi$percentage))
if (returnSimpleN) {
# return the n counts
pctdf0$nA[i] <- resi$labels$nUsedA
pctdf0$nB[i] <- resi$labels$nUsedB
}
if (returnNumberOfPSU) {
pctdf0$nPSUA[i] <- ifelse(is.null(resi$labels$nPSUA), NA, resi$labels$nPSUA)
pctdf0$nPSUB[i] <- ifelse(is.null(resi$labels$nPSUB), NA, resi$labels$nPSUB)
}
} # end if(j == 1)
} # end if(inherits(temp, "gap"))
} # End of for(i in 1:i) loop
# Compute diff statistics (across different edsurvey.data.frame)
resdf$diffAA <- ifelse(1:nrow(resdf) == refi, NA, resdf$estimateA[refi] - resdf$estimateA)
if (!is.null(dbaLabels)) {
# get linking error
leAA <- calLinkingError(
X = resdf$estimateA[refi],
subjectI = uniqueSubject,
gradeI = uniqueGrade,
dependentVarI = variable,
statElementI = gapStat,
gapI = FALSE
)[j]
resdf$diffAAse <- ifelse(1:nrow(resdf) == refi, NA,
ifelse(resdf$labels %in% dbaLabels,
sqrt(resdf$estimateAse[refi]^2 + resdf$estimateAse^2 - 2 * resdf$covAA),
sqrt(resdf$estimateAse[refi]^2 + resdf$estimateAse^2 - 2 * resdf$covAA + leAA)
)
)
} else {
resdf$diffAAse <- ifelse(1:nrow(resdf) == refi, NA, sqrt(resdf$estimateAse[refi]^2 + resdf$estimateAse^2 - 2 * resdf$covAA))
}
resdf$diffAApValue <- ifelse(1:nrow(resdf) == refi, NA, 2 * (1 - pt2(abs(resdf$diffAA / resdf$diffAAse), df = resdf$dofAA)))
if (!skipB) {
resdf$diffBB <- ifelse(1:nrow(resdf) == refi, NA, resdf$estimateB[refi] - resdf$estimateB)
if (!is.null(dbaLabels)) {
leBB <- calLinkingError(
X = resdf$estimateB[refi],
subjectI = uniqueSubject,
gradeI = uniqueGrade,
dependentVarI = variable,
statElementI = gapStat,
gapI = FALSE
)[j]
resdf$diffBBse <- ifelse(1:nrow(resdf) == refi, NA,
ifelse(resdf$labels %in% dbaLabels,
sqrt(resdf$estimateBse[refi]^2 + resdf$estimateBse^2 - 2 * resdf$covBB),
sqrt(resdf$estimateBse[refi]^2 + resdf$estimateBse^2 - 2 * resdf$covBB + leBB)
)
)
} else {
resdf$diffBBse <- ifelse(1:nrow(resdf) == refi, NA, sqrt(resdf$estimateBse[refi]^2 + resdf$estimateBse^2 - 2 * resdf$covBB))
}
resdf$diffBBpValue <- ifelse(1:nrow(resdf) == refi, NA, 2 * (1 - pt2(abs(resdf$diffBB / resdf$diffBBse), df = resdf$dofBB)))
resdf$diffABAB <- ifelse(1:nrow(resdf) == refi, NA, resdf$diffAB[refi] - resdf$diffAB)
if (!is.null(dbaLabels)) {
leABAB <- calLinkingError(
X = resdf$diffAB[refi],
subjectI = uniqueSubject,
gradeI = uniqueGrade,
dependentVarI = variable,
statElementI = gapStat,
gapI = TRUE
)[j]
resdf$diffABABse <- ifelse(1:nrow(resdf) == refi, NA,
ifelse(resdf$labels %in% dbaLabels,
sqrt(resdf$diffABse[refi]^2 + resdf$diffABse^2 - 2 * resdf$covABAB),
sqrt(resdf$diffABse[refi]^2 + resdf$diffABse^2 - 2 * resdf$covABAB + leABAB)
)
)
} else {
resdf$diffABABse <- ifelse(1:nrow(resdf) == refi, NA, sqrt(resdf$diffABse[refi]^2 + resdf$diffABse^2 - 2 * resdf$covABAB))
}
resdf$diffABABpValue <- ifelse(1:nrow(resdf) == refi, NA, 2 * (1 - pt2(abs(resdf$diffABAB / resdf$diffABABse), df = resdf$dofABAB)))
}
# only generate these columns if there are multiple datasets
if (!nocovs) {
pctdf0$diffAA <- ifelse(1:nrow(pctdf0) == refi, NA, pctdf0$pctA[refi] - pctdf0$pctA)
pctdf0$diffAAse <- ifelse(1:nrow(pctdf0) == refi, NA, sqrt(pctdf0$pctAse[refi]^2 + pctdf0$pctAse^2 - 2 * pctdf0$covAA))
pctdf0$diffAApValue <- ifelse(1:nrow(pctdf0) == refi, NA, 2 * (1 - pt2(abs(pctdf0$diffAA / pctdf0$diffAAse), df = pctdf0$dofAA)))
if (!skipB) {
pctdf0$diffBB <- ifelse(1:nrow(pctdf0) == refi, NA, pctdf0$pctB[refi] - pctdf0$pctB)
pctdf0$diffBBse <- ifelse(1:nrow(pctdf0) == refi, NA, sqrt(pctdf0$pctBse[refi]^2 + pctdf0$pctBse^2 - 2 * pctdf0$covBB))
pctdf0$diffBBpValue <- ifelse(1:nrow(pctdf0) == refi, NA, 2 * (1 - pt2(abs(pctdf0$diffBB / pctdf0$diffBBse), df = pctdf0$dofBB)))
pctdf0$diffABAB <- ifelse(1:nrow(pctdf0) == refi, NA, pctdf0$diffAB[refi] - pctdf0$diffAB)
pctdf0$diffABABse <- ifelse(1:nrow(pctdf0) == refi, NA, sqrt(pctdf0$diffABse[refi]^2 + pctdf0$diffABse^2 - 2 * pctdf0$covABAB))
pctdf0$diffABABpValue <- ifelse(1:nrow(pctdf0) == refi, NA, 2 * (1 - pt2(abs(pctdf0$diffABAB / pctdf0$diffABABse), df = pctdf0$dofABAB)))
}
}
if (j == 1) {
resdf0 <- resdf
} else {
resdf0 <- rbind(resdf0, resdf)
}
} # end for(j in 1:lstats) loop
if (nocovs) {
# remove the labels columns which are just populated
# with the unhelpful text "data"
pctdf0$labels <- NULL
resdf0$labels <- NULL
# reorder columns when there are no covariates
if (returnSimpleDoF) {
if (!skipB) {
resvar0 <- c(
"estimateA", "estimateAse", "dofA",
"estimateB", "estimateBse", "dofB"
)
} else {
resvar0 <- c("estimateA", "estimateAse", "dofA")
}
} else {
if (!skipB) {
resvar0 <- c(
"estimateA", "estimateAse",
"estimateB", "estimateBse"
)
} else {
resvar0 <- c("estimateA", "estimateAse")
}
}
if (!skipB) {
resvar0 <- c(
resvar0,
"diffAB", "covAB",
"diffABse", "diffABpValue", "dofAB"
)
}
addons <- c("achievementLevel", "percentiles")
resvar <- c(addons[addons %in% names(resdf0)], resvar0)
if ("sameSurvey" %in% names(resdf0) && sum(!is.na(resdf0$sameSurvey)) > 0) {
resvar <- c(resvar, "sameSurvey")
}
resdf0 <- resdf0[ , resvar]
# remove "estimate" with "pct" where "estimate" must start the variable name
resvar <- gsub("^estimate", "pct", resvar0)
if (returnSimpleN) {
if (!skipB) {
resvar <- c(resvar, "nA", "nB")
} else {
resvar <- c(resvar, "nA")
}
}
if (returnNumberOfPSU) {
if (!skipB) {
resvar <- c(resvar, "nPSUA", "nPSUB")
} else {
resvar <- c(resvar, "nPSUA")
}
}
pctdf0 <- pctdf0[ , resvar]
} else { # end if(nocovs)
# reorder columns when there are covs
if (returnSimpleDoF) {
if (!skipB) {
resvar0 <- c(
names(data$covs), "estimateA", "estimateAse", "dofA",
"estimateB", "estimateBse", "dofB"
)
} else {
resvar0 <- c(names(data$covs), "estimateA", "estimateAse", "dofA")
}
} else {
if (!skipB) {
resvar0 <- c(
names(data$covs), "estimateA", "estimateAse",
"estimateB", "estimateBse"
)
} else {
resvar0 <- c(names(data$covs), "estimateA", "estimateAse")
}
}
if (!skipB) {
resvar0 <- c(
resvar0,
"diffAB", "covAB", "diffABse",
"diffABpValue", "dofAB",
"diffAA", "covAA", "diffAAse",
"diffAApValue", "dofAA",
"diffBB", "covBB", "diffBBse",
"diffBBpValue", "dofBB",
"diffABAB", "covABAB", "diffABABse",
"diffABABpValue", "dofABAB"
)
} else {
resvar0 <- c(
resvar0,
"diffAA", "covAA", "diffAAse",
"diffAApValue", "dofAA"
)
}
addons <- c("achievementLevel", "percentiles")
resvar <- c(addons[addons %in% names(resdf0)], resvar0)
if ("sameSurvey" %in% names(resdf0) && sum(!is.na(resdf0$sameSurvey)) > 0) {
resvar <- c(resvar, "sameSurvey")
}
resdf0 <- resdf0[ , resvar]
resvar <- gsub("^estimate", "pct", resvar0)
missing <- resvar[!resvar %in% names(pctdf0)]
if (returnSimpleN) {
if (!skipB) {
resvar <- c(resvar, "nA", "nB")
} else {
resvar <- c(resvar, "nA")
}
}
pctdf0 <- pctdf0[ , resvar]
} # end else if(nocovs)
res[["results"]] <- resdf0
res[["percentage"]] <- pctdf0
res[["call"]] <- call
class(res) <- "gapList"
return(res)
} else { # end if(inherits(data, "edsurvey.data.frame.list"))
calli <- as.list(call)
calli[[1]] <- as.name("gapHelper")
calli$data <- data
resi <- eval(as.call(calli))
resi$call <- call
return(resi)
} # end else for if(inherits(data, "edsurvey.data.frame.list")) {
}
# this is called when data is edsurvey.data.frame or light.edsurvey.data.frame
gapHelper <- function(variable, data, groupA = "default", groupB = "default",
percentiles = NULL,
achievementLevel = NULL,
achievementDiscrete = FALSE,
stDev = FALSE,
targetLevel = NULL,
weightVar = NULL, jrrIMax = 1,
dropOmittedLevels = TRUE,
defaultConditions = TRUE,
recode = NULL,
referenceDataIndex = 1,
returnVarEstInputs = FALSE,
returnSimpleDoF = FALSE,
returnSimpleN = FALSE,
returnNumberOfPSU = FALSE,
noCov = FALSE,
pctMethod = c("unbiased", "symmetric", "simple"),
varMethod = NULL,
includeLinkingError = NULL) {
if (is.character(substitute(groupA))) {
groupA <- parse(text = groupA)[[1]]
}
if (all(as.character(substitute(groupA)) == "default")) {
groupB <- NULL
}
if (!missing(groupB)) {
if (is.character(substitute(groupB))) {
groupB <- parse(text = groupB)[[1]]
}
}
pctMethod <- match.arg(pctMethod)
# get the weight var
# if the weight var is not set, use the default
wgt <- checkWeightVar(data, weightVar)
varEstInputs <- NULL
type <- "mu" # mean is the default
if (!is.null(percentiles)) {
type <- "pct"
}
if (!is.null(achievementLevel)) {
if (type != "mu") {
stop(paste0(
"Only one of ", sQuote("percentiles"), ", ", sQuote("achievementLevel"),
", ", sQuote("SD"), ", and ", sQuote("targetLevel"), " can be defined."
))
}
type <- "AL"
achievementLevel <- gmatchAttr(achievementLevel, getALNames(data, variable))
}
if (!is.null(targetLevel)) {
if (type != "mu") {
stop(paste0(
"Only one of ", sQuote("percentiles"), ", ", sQuote("achievementLevel"),
", ", sQuote("SD"), ", and ", sQuote("targetLevel"), " can be defined."
))
}
type <- "eq"
}
if (stDev) {
if (type != "mu") {
stop(paste0(
"Only one of ", sQuote("percentiles"), ", ", sQuote("achievementLevel"),
", ", sQuote("SD"), ", and ", sQuote("targetLevel"), " can be defined."
))
}
type <- "stdev"
}
# make the data with just groupA
if (all(as.character(substitute(groupA)) == "default")) {
dataA <- data
} else {
dataA <- subset(data, substitute(groupA), inside = TRUE)
}
# skipB is a boolean to indicate whether we need to compute statistics for group B
# skipB = TRUE happens when (1) groupB is set to NULL or (2) groupA = 'default' (full sample)
skipB <- FALSE
# if groupB is null, just use data as the reference.
# NOTE: the call to is.null requires "substitute" or groupB
# will be evaluated right here
if (is.null(substitute(groupB))) {
skipB <- TRUE
} else if (all(as.character(substitute(groupB)) == "default")) {
dataB <- data
dataAB <- dataA
} else {
# otherwise filter on groupB
dataB <- subset(data, substitute(groupB), inside = TRUE)
dataAB <- subset(dataA, substitute(groupB), inside = TRUE)
}
callv <- list(
weightVar = wgt,
jrrIMax = jrrIMax,
dropOmittedLevels = dropOmittedLevels,
recode = NULL
) # recode.sdf has already been applied above
if (!missing(defaultConditions)) {
callv <- c(callv, list(defaultConditions = defaultConditions))
}
if (type == "mu") {
# gap in mean
calllm <- c(
list(formula(paste0(variable, " ~ 1"))),
callv,
list(returnVarEstInputs = TRUE)
) # necessary for covariance est
calllmA <- c(calllm, list(data = dataA))
meanA <- do.call(lm.sdf, calllmA)
coefA <- meanA$coefmat
resA <- coefA[1, 1]
if (!skipB) {
calllmB <- c(calllm, list(data = dataB))
meanB <- do.call(lm.sdf, calllmB)
diff <- unname(meanA$coef[1] - meanB$coef[1]) # unname removes the name (Intercept) that otherwise ends up in the returned value
coefB <- meanB$coefmat
resB <- coefB[1, 1]
SEs <- list(estimateAse = coefA[1, 2], estimateBse = coefB[1, 2])
} else {
SEs <- list(estimateAse = coefA[1, 2])
}
# prepare varEstInputs
# make varEstInputs$JK
maJK <- subset(meanA$varEstInputs$JK, variable == "(Intercept)")
maJK$variable <- "A"
if (!skipB) {
mbJK <- subset(meanB$varEstInputs$JK, variable == "(Intercept)")
mbJK$variable <- "B"
# calculate the difference
mdJK <- merge(maJK, mbJK, by = c("PV", "JKreplicate"), suffixes = c(".A", ".B"))
mdJK$variable <- "A-B"
mdJK$value <- mdJK$value.A - mdJK$value.B
mdJK$value.A <- mdJK$value.B <- NULL
mdJK$variable.A <- mdJK$variable.B <- NULL
varEstJK <- rbind(maJK, mbJK, mdJK)
} else {
varEstJK <- maJK
}
# make varEstInputs$PV
if (!is.null(meanA$varEstInput$PV)) {
maPV <- subset(meanA$varEstInputs$PV, variable == "(Intercept)")
maPV$variable <- "A"
if (!skipB) {
mbPV <- subset(meanB$varEstInputs$PV, variable == "(Intercept)")
mbPV$variable <- "B"
# calculate the difference
mdPV <- merge(maPV, mbPV, by = c("PV"), suffixes = c(".A", ".B"))
mdPV$variable <- "A-B"
mdPV$value <- mdPV$value.A - mdPV$value.B
mdPV$value.A <- mdPV$value.B <- NULL
mdPV$variable.A <- mdPV$variable.B <- NULL
varEstPV <- rbind(maPV, mbPV, mdPV)
} else {
# only group A
varEstPV <- maPV
}
} else { # end if(!is.null(meanA$varEstInput$PV))
# no PVs, so no error from them
varEstPV <- NULL
} # end else for if(!is.null(meanA$varEstInput$PV))
# make varEstInputs
row.names(varEstJK) <- NULL
row.names(varEstPV) <- NULL
varEstInputs <- list(JK = varEstJK, PV = varEstPV)
} # end if(type == "mu")
if (type == "pct") {
# gap at a percentile
# here the length of percentiles is always one
callpct <- c(
list(
variable = variable,
percentiles = percentiles,
confInt = FALSE, # do not share CI, so do not calculate it
pctMethod = pctMethod
),
callv,
returnVarEstInputs = TRUE
)
callpctA <- c(callpct, list(data = dataA))
meanA <- do.call(percentile, callpctA)
resA <- meanA$estimate
statisticsLevelOut <- paste0("P", percentiles)
if (!skipB) {
callpctB <- c(callpct, list(data = dataB))
meanB <- do.call(percentile, callpctB)
resB <- meanB$estimate
SEs <- list(estimateAse = meanA$se, estimateBse = meanB$se)
} else {
SEs <- list(estimateAse = meanA$se)
}
maJK <- attributes(meanA)$varEstInputs$JK
maJK$Level <- maJK$variable
maJK$variable <- "A"
if (!skipB) {
mbJK <- attributes(meanB)$varEstInputs$JK
mbJK$Level <- mbJK$variable
mbJK$variable <- "B"
# calculate the difference
mdJK <- merge(maJK, mbJK, by = c("PV", "JKreplicate", "Level"), suffixes = c(".A", ".B"))
mdJK$variable <- "A-B"
mdJK$value <- mdJK$value.A - mdJK$value.B
mdJK$value.A <- mdJK$value.B <- NULL
mdJK$variable.A <- mdJK$variable.B <- NULL
varEstJK <- rbind(maJK, mbJK, mdJK)
} else {
varEstJK <- maJK
}
# make varEstInputs$PV
maPV <- attributes(meanA)$varEstInputs$PV
maPV$Level <- maPV$variable
maPV$variable <- "A"
if (!skipB) {
mbPV <- attributes(meanB)$varEstInputs$PV
mbPV$Level <- mbPV$variable
mbPV$variable <- "B"
# calculate the difference
mdPV <- merge(maPV, mbPV, by = c("PV", "Level"), suffixes = c(".A", ".B"))
mdPV$variable <- "A-B"
mdPV$value <- mdPV$value.A - mdPV$value.B
mdPV$value.A <- mdPV$value.B <- NULL
mdPV$variable.A <- mdPV$variable.B <- NULL
varEstPV <- rbind(maPV, mbPV, mdPV)
} else {
varEstPV <- maPV
}
# make varEstInputs
row.names(varEstJK) <- NULL
row.names(varEstPV) <- NULL
varEstInputs <- list(JK = varEstJK, PV = varEstPV)
} # end if (type == 'pct')
if (type == "AL") {
# gap in percent at or above an achievement level
callal <- c(
list(
achievementVars = variable,
returnVarEstInputs = TRUE,
returnCumulative = !achievementDiscrete
),
callv
)
als <- getALNames(data, variable)
callalA <- c(callal, list(data = dataA))
meanA <- do.call(achievementLevels, callalA)
if (!is.null(meanA$discrete)) {
colnames(meanA$discrete)[grepl("Level$", colnames(meanA$discrete))] <- "Level"
}
if (!is.null(meanA$cumulative)) {
colnames(meanA$cumulative)[grepl("Level$", colnames(meanA$cumulative))] <- "Level"
}
if (!skipB) {
callalB <- c(callal, list(data = dataB))
meanB <- do.call(achievementLevels, callalB)
if (!is.null(meanB$discrete)) {
colnames(meanB$discrete)[grepl("Level$", colnames(meanB$discrete))] <- "Level"
}
if (!is.null(meanB$cumulative)) {
colnames(meanB$cumulative)[grepl("Level$", colnames(meanB$cumulative))] <- "Level"
}
}
# achievementDiscrete indicates whether we should extract cumulative or discrete achievement level results
if (achievementDiscrete) {
# get index of desired achievement levels from the results
if (typeof(als) == "character") {
lA <- sapply(achievementLevel, function(al) {
lal <- addALPrefix(al = al, als = als, discrete = TRUE)
grep(lal, meanA$discrete$Level, ignore.case = TRUE)
})
} else {
lA <- rep(TRUE, length(als[[achievementLevel]]))
}
if (length(lA) == 0) {
stop("Achievement level cannot be found in the results of the call of the function ", sQuote("achievementLevel"), ".")
}
if (!skipB) {
if (typeof(als) == "character") {
lB <- sapply(achievementLevel, function(al) {
lal <- addALPrefix(al = al, als = als, discrete = TRUE)
grep(lal, meanB$discrete$Level, ignore.case = TRUE)
})
} else {
lB <- rep(TRUE, length(als[[achievementLevel]]))
}
if (length(lB) == 0) {
stop("Achievement level cannot be found in the results of the call of the function ", sQuote("achievementLevel"), ".")
}
}
resA <- meanA$discrete$Percent[unlist(lA)]
if (!skipB) {
resB <- meanB$discrete$Percent[unlist(lB)]
SEs <- list(
estimateAse = meanA$discrete$StandardError[unlist(lA)],
estimateBse = meanB$discrete$StandardError[unlist(lB)]
)
} else {
SEs <- list(estimateAse = meanA$discrete$StandardError[unlist(lA)])
}
} else {
# get index of desired achievement levels from the results
if (typeof(als) == "character") {
lA <- sapply(achievementLevel, function(al) {
lal <- addALPrefix(al = al, als = als, discrete = FALSE)
grep(lal, meanA$cumulative$Level, ignore.case = TRUE)
})
} else {
lA <- rep(TRUE, length(als[[achievementLevel]]))
}
if (length(lA) == 0) {
stop("Achievement level cannot be found in the results of the call of the function ", sQuote("achievementLevel"), ".")
}
if (!skipB) {
if (typeof(als) == "character") {
lB <- sapply(achievementLevel, function(al) {
lal <- addALPrefix(al = al, als = als, discrete = FALSE)
grep(lal, meanB$cumulative$Level, ignore.case = TRUE)
})
} else {
lB <- rep(TRUE, length(als[[achievementLevel]]))
}
}
resA <- meanA$cumulative$Percent[unlist(lA)]
if (!skipB) {
resB <- meanB$cumulative$Percent[unlist(lB)]
SEs <- list(
estimateAse = meanA$cumulative$StandardError[unlist(lA)],
estimateBse = meanB$cumulative$StandardError[unlist(lB)]
)
} else {
SEs <- list(estimateAse = meanA$cumulative$StandardError[unlist(lA)])
}
}
# used inside next if, but also outside
statisticsLevelOut <- meanA$discrete$Level[unlist(lA)]
# get Cov
if (achievementDiscrete) {
maJK <- subset(meanA$discVarEstInputs$JK, variable %in% meanA$discrete$Level[unlist(lA)])
maJK$Level <- maJK$variable
maJK$variable <- "A"
if (!skipB) {
mbJK <- subset(meanB$discVarEstInputs$JK, variable %in% meanB$discrete$Level[unlist(lB)])
mbJK$Level <- mbJK$variable
mbJK$variable <- "B"
}
} # end if(achievementDiscrete)
else {
maJK <- subset(meanA$cumVarEstInputs$JK, variable %in% meanA$cumulative$Level[unlist(lA)])
maJK$Level <- maJK$variable
maJK$variable <- "A"
if (!skipB) {
mbJK <- subset(meanB$cumVarEstInputs$JK, variable %in% meanB$cumulative$Level[unlist(lB)])
mbJK$Level <- mbJK$variable
mbJK$variable <- "B"
}
} # end else if(achievementDiscrete)
if (!skipB) {
mdJK <- merge(maJK, mbJK, by = c("PV", "JKreplicate", "Level"), suffixes = c(".A", ".B"))
mdJK$variable <- "A-B"
mdJK$value <- mdJK$value.A - mdJK$value.B
mdJK$value.A <- mdJK$value.B <- NULL
mdJK$variable.A <- mdJK$variable.B <- NULL
varEstJK <- rbind(maJK, mbJK, mdJK)
} else {
varEstJK <- maJK
}
# make varEstInputs$PV
if (achievementDiscrete) {
maPV <- subset(meanA$discVarEstInputs$PV, variable %in% meanA$discrete$Level[unlist(lA)])
maPV$Level <- maPV$variable
maPV$variable <- "A"
if (!skipB) {
mbPV <- subset(meanB$discVarEstInputs$PV, variable %in% meanB$discrete$Level[unlist(lB)])
mbPV$Level <- mbPV$variable
mbPV$variable <- "B"
}
} # end if(achievementDiscrete)
else {
statisticsLevelOut <- meanA$cumulative$Level[unlist(lA)]
maPV <- subset(meanA$cumVarEstInputs$PV, variable %in% meanA$cumulative$Level[unlist(lA)])
maPV$Level <- maPV$variable
maPV$variable <- "A"
if (!skipB) {
mbPV <- subset(meanB$cumVarEstInputs$PV, variable %in% meanB$cumulative$Level[unlist(lB)])
mbPV$Level <- mbPV$variable
mbPV$variable <- "B"
}
} # end else if(achievementDiscrete)
# calculate the difference
if (!skipB) {
mdPV <- merge(maPV, mbPV, by = c("PV", "Level"), suffixes = c(".A", ".B"))
mdPV$variable <- "A-B"
mdPV$value <- mdPV$value.A - mdPV$value.B
mdPV$value.A <- mdPV$value.B <- NULL
mdPV$variable.A <- mdPV$variable.B <- NULL
varEstPV <- rbind(maPV, mbPV, mdPV)
} else {
varEstPV <- maPV
}
# make varEstInputs
row.names(varEstJK) <- NULL
row.names(varEstPV) <- NULL
varEstInputs <- list(JK = varEstJK, PV = varEstPV)
} # end if (type == "AL")
if (type == "stdev") {
# gap at a standard deviation
# here the length of standard deviation is always one
callStDev <- c(list(variable = variable),
callv,
returnVarEstInputs = TRUE
)
if ("weightVar" %in% names(callStDev) && is.null(callStDev[["weightVar"]])) {
# default is used only if weightVar is missing.
callStDev[["weightVar"]] <- NULL
}
callStDevA <- c(callStDev, list(data = dataA))
meanA <- do.call(SD, callStDevA)
resA <- meanA$std
statisticsLevelOut <- "SD"
if (!skipB) {
callStDevB <- c(callStDev, list(data = dataB))
meanB <- do.call(SD, callStDevB)
resB <- meanB$std
SEs <- list(estimateAse = meanA$stdSE, estimateBse = meanB$stdSE)
} else {
SEs <- list(estimateAse = meanA$stdSE)
}
maJK <- meanA$varEstInputs$JK
maJK$Level <- maJK$variable
maJK$variable <- "A"
if (!skipB) {
mbJK <- meanB$varEstInputs$JK
mbJK$Level <- mbJK$variable
mbJK$variable <- "B"
# calculate the difference
mdJK <- merge(maJK, mbJK, by = c("PV", "JKreplicate", "Level"), suffixes = c(".A", ".B"))
mdJK$variable <- "A-B"
mdJK$value <- mdJK$value.A - mdJK$value.B
mdJK$value.A <- mdJK$value.B <- NULL
mdJK$variable.A <- mdJK$variable.B <- NULL
varEstJK <- rbind(maJK, mbJK, mdJK)
} else {
varEstJK <- maJK
}
# make varEstInputs$PV
if (!is.null(meanA$varEstInput$PV)) {
maPV <- meanA$varEstInputs$PV
maPV$Level <- maPV$variable
maPV$variable <- "A"
if (!skipB) {
mbPV <- meanB$varEstInputs$PV
mbPV$Level <- mbPV$variable
mbPV$variable <- "B"
# calculate the difference
mdPV <- merge(maPV, mbPV, by = c("PV", "Level"), suffixes = c(".A", ".B"))
mdPV$variable <- "A-B"
mdPV$value <- mdPV$value.A - mdPV$value.B
mdPV$value.A <- mdPV$value.B <- NULL
mdPV$variable.A <- mdPV$variable.B <- NULL
varEstPV <- rbind(maPV, mbPV, mdPV)
} else {
varEstPV <- maPV
}
} else { # end if(!is.null(meanA$varEstInput$PV))
# no PVs, so no error from them
varEstPV <- NULL
} # end else for if(!is.null(meanA$varEstInput$PV))
# make varEstInputs
row.names(varEstJK) <- NULL
row.names(varEstPV) <- NULL
varEstInputs <- list(JK = varEstJK, PV = varEstPV)
} # end if (type == 'stDev')
groupA_0 <- FALSE
groupB_0 <- FALSE
if (type == "eq") {
# gap in percent in some covariate
calleq <- c(callv, list(
formula = formula(paste0(" ~ ", variable)),
returnMeans = FALSE,
pctAggregationLevel = NULL,
returnSepct = TRUE,
drop = FALSE,
returnVarEstInputs = TRUE
))
calleqA <- c(calleq, list(data = dataA))
meanA <- do.call(edsurveyTable, calleqA)
if (!targetLevel %in% meanA$data[[variable]]) {
groupA_0 <- TRUE
}
resA <- ifelse(groupA_0, 0, meanA$data[meanA$data[variable] == targetLevel, 4])
if (!skipB) {
calleqB <- c(calleq, list(data = dataB))
meanB <- do.call(edsurveyTable, calleqB)
if (!targetLevel %in% meanB$data[[variable]]) {
groupB_0 <- TRUE
}
resB <- ifelse(groupB_0, 0, meanB$data[meanB$data[variable] == targetLevel, 4])
SEs <- list(
estimateAse = ifelse(groupA_0, NA, meanA$data[meanA$data[variable] == targetLevel, 5]),
estimateBse = ifelse(groupB_0, NA, meanB$data[meanB$data[variable] == targetLevel, 5])
)
} else {
SEs <- list(estimateAse = ifelse(groupA_0, NA, meanA$data[meanA$data[variable] == targetLevel, 5]))
}
# make varEstInputs$JK
if (!groupA_0) {
maJK <- meanA$pctVarEstInputs$JK[meanA$pctVarEstInputs$JK$variable == paste0(variable, "=", targetLevel), ]
maJK$variable <- "A"
} else {
maJK <- NULL
}
if (!skipB) {
if (!groupB_0) {
mbJK <- meanB$pctVarEstInputs$JK[meanB$pctVarEstInputs$JK$variable == paste0(variable, "=", targetLevel), ]
mbJK$variable <- "B"
} else {
mbJK <- NULL
}
# calculate the difference
if (!groupA_0 & !groupB_0) {
mdJK <- merge(maJK, mbJK, by = c("PV", "JKreplicate"), suffixes = c(".A", ".B"))
mdJK$variable <- "A-B"
mdJK$value <- mdJK$value.A - mdJK$value.B
mdJK$value.A <- mdJK$value.B <- NULL
mdJK$variable.A <- mdJK$variable.B <- NULL
} else {
mdJK <- NULL
}
varEstJK <- rbind(maJK, mbJK, mdJK)
} else {
varEstJK <- maJK
}
# make varEstInputs
if (!is.null(varEstJK)) {
row.names(varEstJK) <- NULL
varEstInputs <- list(JK = varEstJK, PV = NULL)
} else {
varEstInputs <- NULL
}
} # end if (type == "eq")
# Compute difference statistics (diffAB, covAB, pooledse, pooleddf)
if (!skipB) {
diff <- unname(resA - resB) # unname removes the name (Intercept) that otherwise ends up in the returned value
nSize <- c(nrow(dataA), nrow(dataB))
total <- which.max(nSize) # total is 1 if groupA is the total and 2 if groupB is the total
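# approximate Cov(A, B) from the overlap: p is the share of the larger group
# that is also in the smaller group (dataAB), so Cov is roughly p times the
# larger group's variance; this value is replaced by the jackknife covariance
# below whenever varEstInputs are available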
p <- nrow(dataAB) / nSize[total]
cov <- p * (SEs[[total]])^2
wgtl <- getAttributes(data, "weights")[[wgt]]
JRdfA <- JRdfB <- JRdf <- length(wgtl$jksuffixes)
if (!is.null(varEstInputs)) {
if (type %in% c("pct", "AL")) {
cov <- c()
JRdfA <- c()
JRdfB <- c()
JRdf <- c()
# we need to calculate cov by each Level
for (si in statisticsLevelOut) {
varEstInputsTemp <- list(
JK = subset(varEstInputs$JK, Level %in% si),
PV = subset(varEstInputs$PV, Level %in% si)
)
if (nrow(varEstInputsTemp$JK) == 0) {
varEstInputsTemp$JK <- NULL
}
if (nrow(varEstInputsTemp$PV) == 0) {
varEstInputsTemp$PV <- NULL
}
if (!is.null(varEstInputsTemp$JK)) {
cov <- c(cov, varEstToCov(varEstInputsTemp,
varA = "A", varB = "B", jkSumMultiplier = getAttributes(data, "jkSumMultiplier")
))
JRdfA <- c(JRdfA, DoFCorrection(varEstInputsTemp, varA = "A", method = c("JR")))
JRdfB <- c(JRdfB, DoFCorrection(varEstInputsTemp, varA = "B", method = c("JR")))
JRdf <- c(JRdf, DoFCorrection(varEstInputsTemp, varA = "A", varB = "B", method = c("JR")))
} # end if (!is.null(varEstInputsTemp$JK))
else {
cov <- c(cov, NA)
JRdfA <- c(JRdfA, NA)
JRdfB <- c(JRdfB, NA)
JRdf <- c(JRdf, NA)
}
}
} # end if (type %in% c("pct","AL"))
else {
cov <- ifelse(groupA_0 | groupB_0, cov, varEstToCov(varEstInputs, varA = "A", varB = "B", jkSumMultiplier = getAttributes(data, "jkSumMultiplier")))
JRdfA <- ifelse(groupA_0, NA, DoFCorrection(varEstInputs, varA = "A", method = c("JR")))
JRdfB <- ifelse(groupB_0, NA, DoFCorrection(varEstInputs, varA = "B", method = c("JR")))
JRdf <- ifelse(groupA_0 | groupB_0, NA, DoFCorrection(varEstInputs, varA = "A", varB = "B", method = c("JR")))
}
} # end if (!is.null(varEstInputs))
if (noCov) {
cov <- 0 * cov
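# With the covariance forced to zero (independent groups), approximate the df of the
# difference using the Welch-Satterthwaite formula:
#   df = (seA^2 + seB^2)^2 / (seA^4 / dfA + seB^4 / dfB)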
JRdf <- ifelse(groupA_0 | groupB_0, NA, (SEs[[1]]^2 + SEs[[2]]^2)^2 / (SEs[[1]]^4 / JRdfA + SEs[[2]]^4 / JRdfB))
}
diffSE <- mapply(function(a, b, c) {
unname(sqrt(a^2 + b^2 - 2 * c))
}, SEs[[1]], SEs[[2]], cov)
diffSE <- unlist(diffSE)
# get percent in each group
# get variables used in conditions for groupA and groupB
vn <- c(parseVars(substitute(groupA), data), parseVars(substitute(groupB), data))
# add weights and variable itself
vn <- c(vn, wgt, variable)
tryCatch(vn <- unique(c(vn, PSUStratumNeeded(returnNumberOfPSU, data))),
error = function(e) {
warning(paste0("Stratum and PSU variables are required for this call and are not on the incoming data. Ignoring ", dQuote("returnNumberOfPSU=TRUE"), "."))
returnNumberOfPSU <<- FALSE
}
)
# setup non-data call variables
callv <- list(
varnames = vn,
drop = FALSE,
dropOmittedLevels = dropOmittedLevels,
recode = NULL
) # recode is already done in line 335
if (!missing(defaultConditions)) {
callv <- c(callv, list(defaultConditions = defaultConditions))
}
# add weight call variables and do calls
wgtcall0 <- c(callv, list(data = data))
d0 <- do.call(getData, wgtcall0)
wgtcallA <- c(callv, list(data = dataA))
dA <- do.call(getData, wgtcallA)
wgtcallB <- c(callv, list(data = dataB))
dB <- do.call(getData, wgtcallB)
pctA <- sum(dA[ , wgt]) / sum(d0[ , wgt])
pctB <- sum(dB[ , wgt]) / sum(d0[ , wgt])
# calculate covAB using JK method
wgtl <- getAttributes(data, "weights")[[wgt]]
pctJK <- t(sapply(wgtl$jksuffixes, function(suffix) {
w0 <- sum(d0[ , paste0(wgtl$jkbase, suffix)])
wA <- sum(dA[ , paste0(wgtl$jkbase, suffix)])
wB <- sum(dB[ , paste0(wgtl$jkbase, suffix)])
c(pctA = wA / w0, pctB = wB / w0)
}))
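# jackknife SE of each weighted percentage (on the 0-100 scale):
#   se = 100 * sqrt(jkSumMultiplier * sum((pct_replicate - pct)^2))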
seA <- 100 * sqrt(getAttributes(data, "jkSumMultiplier") * sum((pctJK[ , 1] - pctA)^2))
seB <- 100 * sqrt(getAttributes(data, "jkSumMultiplier") * sum((pctJK[ , 2] - pctB)^2))
varJK <- data.frame(
stringsAsFactors = FALSE,
PV = rep(0, 3 * nrow(pctJK)),
JKreplicate = rep(1:nrow(pctJK), 3),
variable = rep(c("A", "B", "A-B"), each = nrow(pctJK)),
value = c(
pctJK[ , 1] - pctA,
pctJK[ , 2] - pctB,
(pctJK[ , 1] - pctA) - (pctJK[ , 2] - pctB)
)
)
# there is no PV here because weights are not imputed
pctVarEstInputs <- list(JK = varJK, PV = NULL)
if (noCov) {
covAB <- rep(0, length(seA))
pctDoFA <- DoFCorrection(pctVarEstInputs, varA = "A", method = "JR")
pctDoFB <- DoFCorrection(pctVarEstInputs, varA = "B", method = "JR")
# Satterthwaite approximation
pctJRdf <- (seA^2 + seB^2)^2 / (seA^4 / pctDoFA + seB^4 / pctDoFB)
} else {
covAB <- varEstToCov(pctVarEstInputs, varA = "A", varB = "B", jkSumMultiplier = getAttributes(data, "jkSumMultiplier"))
pctDoFA <- DoFCorrection(pctVarEstInputs, varA = "A", method = "JR")
pctDoFB <- DoFCorrection(pctVarEstInputs, varA = "B", method = "JR")
pctJRdf <- DoFCorrection(pctVarEstInputs, varA = "A-B", method = "JR")
}
# end covAB calculation
pctA <- 100 * pctA
pctB <- 100 * pctB
seAB <- unname(sqrt(seA^2 + seB^2 - 2 * 100 * 100 * covAB))
# start return vector, first name achievement level
vec <- switch(type,
pct = data.frame(stringsAsFactors = FALSE, percentiles = as.numeric(percentiles)),
AL = data.frame(stringsAsFactors = FALSE, achievementLevel = gsub("Below", "below", as.character(statisticsLevelOut))),
data.frame()
)
if (returnSimpleDoF) {
# add dofA and dofB if requested
if (nrow(vec) > 0) {
vec <- cbind(
vec,
data.frame(
stringsAsFactors = FALSE,
estimateA = resA, estimateAse = SEs[[1]], dofA = JRdfA,
estimateB = resB, estimateBse = SEs[[2]], dofB = JRdfB
)
)
} else { # this is equivalent to if (type %in% c("mu","eq"))
vec <- data.frame(
stringsAsFactors = FALSE,
estimateA = resA, estimateAse = SEs[[1]], dofA = JRdfA,
estimateB = resB, estimateBse = SEs[[2]], dofB = JRdfB
)
}
} # end if(returnSimpleDoF)
else {
# otherwise, no dofA or dofB
if (nrow(vec) > 0) {
vec <- cbind(
vec,
data.frame(
stringsAsFactors = FALSE,
estimateA = resA, estimateAse = SEs[[1]],
estimateB = resB, estimateBse = SEs[[2]]
)
)
} else { # this is equivalent to if (type %in% c("mu","eq"))
vec <- data.frame(
stringsAsFactors = FALSE,
estimateA = resA, estimateAse = SEs[[1]],
estimateB = resB, estimateBse = SEs[[2]]
)
}
}
diffp <- 2 * (1 - pt2(abs(diff / diffSE), df = JRdf))
vec <- cbind(
vec,
data.frame(
stringsAsFactors = FALSE,
diffAB = diff, covAB = unname(cov),
diffABse = diffSE,
diffABpValue = diffp,
dofAB = JRdf
)
)
row.names(vec) <- NULL
if (returnSimpleDoF) {
percentage <- data.frame(
stringsAsFactors = FALSE,
pctA = pctA,
pctAse = seA,
dofA = pctDoFA,
pctB = pctB,
pctBse = seB,
dofB = pctDoFB
)
} else {
percentage <- data.frame(
stringsAsFactors = FALSE,
pctA = pctA,
pctAse = seA,
pctB = pctB,
pctBse = seB
)
}
pdiffp <- 2 * (1 - pt2(abs(pctA - pctB) / seAB, df = pctJRdf))
percentage <- cbind(
percentage,
data.frame(
stringsAsFactors = FALSE,
diffAB = pctA - pctB,
covAB = 100 * 100 * covAB,
diffABse = seAB,
diffABpValue = pdiffp,
dofAB = pctJRdf
)
)
lst <- list(
results = vec,
labels = list(
A = substitute(groupA), B = substitute(groupB),
n0A = nrow2.edsurvey.data.frame(dataA),
n0B = nrow2.edsurvey.data.frame(dataB),
nUsedA = nrow(dA),
nUsedB = nrow(dB)
),
percentage = percentage
)
if (returnNumberOfPSU) {
lst$labels$nPSUA <- nrow(unique(dA[ , c(getAttributes(dataA, "stratumVar"), getAttributes(dataA, "psuVar"))]))
lst$labels$nPSUB <- nrow(unique(dB[ , c(getAttributes(dataB, "stratumVar"), getAttributes(dataB, "psuVar"))]))
}
} else { # end if(!skipB)
wgtl <- getAttributes(data, "weights")[[wgt]]
JRdfA <- ifelse(groupA_0, NA, length(wgtl$jksuffixes))
if (!is.null(varEstInputs)) {
if (type %in% c("pct", "AL")) {
JRdfA <- c()
for (si in statisticsLevelOut) {
varEstInputsTemp <- list(
JK = subset(varEstInputs$JK, Level %in% si),
PV = subset(varEstInputs$PV, Level %in% si)
)
if (nrow(varEstInputsTemp$JK) == 0) {
varEstInputsTemp$JK <- NULL
}
if (nrow(varEstInputsTemp$PV) == 0) {
varEstInputsTemp$PV <- NULL
}
if (!is.null(varEstInputsTemp$JK)) {
JRdfA <- c(JRdfA, DoFCorrection(varEstInputsTemp, varA = "A", method = c("JR")))
} else {
JRdfA <- c(JRdfA, NA)
}
}
} # end if (type %in% c("pct","AL"))
else {
JRdfA <- ifelse(groupA_0, NA, DoFCorrection(varEstInputs, varA = "A", method = c("JR")))
}
}
# get variables used in conditions for groupA and groupB
vn <- c(parseVars(substitute(groupA), data), parseVars(substitute(groupB), data))
# add weights and variable itself
vn <- c(vn, wgt, variable)
tryCatch(vn <- unique(c(vn, PSUStratumNeeded(returnNumberOfPSU, data))),
error = function(e) {
warning(paste0("Stratum and PSU variables are required for this call and are not on the incoming data. Ignoring ", dQuote("returnNumberOfPSU=TRUE"), "."))
returnNumberOfPSU <<- FALSE
}
)
# setup non-data call variables
callv <- list(
varnames = vn,
drop = FALSE,
dropOmittedLevels = dropOmittedLevels,
recode = NULL,
addAttributes = TRUE
) # recode is already done in line 335
if (!missing(defaultConditions)) {
callv <- c(callv, list(defaultConditions = defaultConditions))
}
# add weight call variables and do calls
wgtcall0 <- c(callv, list(data = data))
d0 <- do.call(getData, wgtcall0)
wgtcallA <- c(callv, list(data = dataA))
dA <- do.call(getData, wgtcallA)
pctA <- sum(dA[ , wgt]) / sum(d0[ , wgt])
pctJK <- t(sapply(wgtl$jksuffixes, function(suffix) {
w0 <- sum(d0[ , paste0(wgtl$jkbase, suffix)])
wA <- sum(dA[ , paste0(wgtl$jkbase, suffix)])
c(pctA = wA / w0, pctB = 0)
}))
seA <- 100 * sqrt(getAttributes(data, "jkSumMultiplier") * sum((pctJK[ , 1] - pctA)^2))
varJK <- data.frame(
stringsAsFactors = FALSE,
PV = rep(0, nrow(pctJK)),
JKreplicate = 1:nrow(pctJK),
variable = "A",
value = pctJK[ , 1] - pctA
)
# there is no PV here because weights are not imputed
pctVarEstInputs <- list(JK = varJK, PV = NULL)
pctDoFA <- DoFCorrection(pctVarEstInputs, varA = "A", method = "JR")
pctA <- 100 * pctA
# start return vector, first name achievement level
vec <- switch(type,
pct = data.frame(stringsAsFactors = FALSE, percentiles = as.numeric(percentiles)),
AL = data.frame(stringsAsFactors = FALSE, achievementLevel = gsub("Below", "below", as.character(statisticsLevelOut))),
data.frame()
)
if (returnSimpleDoF) {
# add dofA and dofB if requested
if (nrow(vec) > 0) {
vec <- cbind(
vec,
data.frame(stringsAsFactors = FALSE, estimateA = resA, estimateAse = SEs[[1]], dofA = JRdfA)
)
} else {
vec <- data.frame(stringsAsFactors = FALSE, estimateA = resA, estimateAse = SEs[[1]], dofA = JRdfA)
}
} else {
# otherwise, no dofA or dofB
if (nrow(vec) > 0) {
vec <- cbind(
vec,
data.frame(stringsAsFactors = FALSE, estimateA = resA, estimateAse = SEs[[1]])
)
} else {
vec <- data.frame(stringsAsFactors = FALSE, estimateA = resA, estimateAse = SEs[[1]])
}
}
row.names(vec) <- NULL
if (returnSimpleDoF) {
percentage <- data.frame(
stringsAsFactors = FALSE,
pctA = pctA,
pctAse = seA,
dofA = pctDoFA
)
} else {
percentage <- data.frame(
stringsAsFactors = FALSE,
pctA = pctA,
pctAse = seA
)
}
lst <- list(
results = vec,
labels = list(
A = substitute(groupA),
n0A = nrow2.edsurvey.data.frame(dataA),
n0B = NA,
nUsedA = nrow(dA),
nUsedB = NA
),
percentage = percentage
)
if (returnNumberOfPSU) {
if (getAttributes(dA, "stratumVar") %in% "JK1") {
lst$labels$nPSUA <- nrow(dA)
} else {
lst$labels$nPSUA <- nrow(unique(dA[ , c(getAttributes(dA, "stratumVar"), getAttributes(dA, "psuVar"))]))
}
lst$labels$nPSUB <- NA
}
}
if (returnVarEstInputs) {
if ("Level" %in% colnames(varEstInputs$JK)) { # equivalent to type = 'pct' or 'AL'
if (type == "pct") {
varEstInputs$JK$Level <- as.numeric(gsub("^P", "", varEstInputs$JK$Level))
varEstInputs$PV$Level <- as.numeric(gsub("^P", "", varEstInputs$PV$Level))
}
if (length(percentiles) <= 1 & length(achievementLevel) <= 1) {
varEstInputs$JK$Level <- NULL
varEstInputs$PV$Level <- NULL
}
}
# assign values close to 0 to 0
varEstInputs$JK$value[which(abs(varEstInputs$JK$value) < (sqrt(.Machine$double.eps) * sqrt(nrow(varEstInputs$JK))))] <- 0
pctVarEstInputs$JK$value[which(abs(pctVarEstInputs$JK$value) < (sqrt(.Machine$double.eps) * sqrt(nrow(pctVarEstInputs$JK))))] <- 0
lst <- c(lst, list(
varEstInputs = varEstInputs,
pctVarEstInputs = pctVarEstInputs
))
}
class(lst) <- "gap"
return(lst)
}
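# Illustrative (not run) sketch of a typical call; `sdf`, `composite`, and `dsex` are
# assumptions standing in for an edsurvey.data.frame and its variables:
#   gap(variable = "composite", data = sdf,
#       groupA = dsex == "Male", groupB = dsex == "Female")
# returns a "gap" object whose results and percentage tables are printed by print.gap below.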
# @title Linking error for PISA 2015 and later (computer-based) versus prior (paper-based) assessments
# @param years a vector of the two years involved, drawn from 2000, 2003, 2006, 2009, 2012, 2015, or 2018
# @param subject one of "read", "math", or "scie"
# The PISA linking error is entirely cross-year: a single value per subject and pair of years.
# @author Paul Bailey
calLinkingErrorPISA <- function(subject = c("read", "math", "scie"),
years = c(2000, 2003, 2006, 2009, 2012, 2015, 2018)) {
years <- as.numeric(years)
linkingErrorsFinance <- data.frame(
subject = rep("fin", 3),
yearSmall = c(2012, 2015, 2012),
yearBig = c(2018, 2018, 2015),
error = c(5.55, 9.37, 5.3309),
stringsAsFactors = FALSE
)
linkingErrorsRead <- data.frame(
subject = rep("read", 11),
yearSmall = c(2000, 2003, 2006, 2009, 2012, 2000, 2003, 2006, 2009, 2012, 2015),
yearBig = c(2015, 2015, 2015, 2015, 2015, 2018, 2018, 2018, 2018, 2018, 2018),
error = c(6.8044, 5.3907, 6.6064, 3.4301, 5.2535, 4.04, 7.77, 5.24, 3.52, 3.74, 3.93),
stringsAsFactors = FALSE
)
linkingErrorsMath <- data.frame(
subject = rep("math", 9),
yearSmall = c(2003, 2006, 2009, 2012, 2003, 2006, 2009, 2012, 2015),
yearBig = c(2015, 2015, 2015, 2015, 2018, 2018, 2018, 2018, 2018),
error = c(5.6080, 3.511, 3.7853, 3.5462, 2.80, 3.18, 3.54, 3.34, 2.33),
stringsAsFactors = FALSE
)
linkingErrorsScei <- data.frame(
subject = rep("scie", 7),
yearSmall = c(2006, 2009, 2012, 2006, 2009, 2012, 2015),
yearBig = c(2015, 2015, 2015, 2018, 2018, 2018, 2018),
error = c(4.4821, 4.5016, 3.9228, 3.47, 3.59, 4.01, 1.51),
stringsAsFactors = FALSE
)
linkingErrorsPISA <- rbind(linkingErrorsFinance, linkingErrorsRead, linkingErrorsMath, linkingErrorsScei)
sbj <- match.arg(subject)
years <- years[years %in% c(2000, 2003, 2006, 2009, 2012, 2015, 2018)]
if (length(years) != 2) {
stop("Argument ", dQuote("years"), " must contain exactly two valid PISA years to contrast.")
}
lep <- linkingErrorsPISA[linkingErrorsPISA$subject == sbj, ]
res <- expand.grid(yearSmall = years, yearBig = years)
res <- res[res$yearSmall < res$yearBig, ]
res <- merge(lep, res, by = c("yearSmall", "yearBig"), all = FALSE)
if (nrow(res) == 0) {
return(0)
}
if (nrow(res) > 1) {
stop("Multiple linking errors matched the requested subject and years.")
}
return(res$error)
}
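# Examples, grounded in the lookup tables above:
#   calLinkingErrorPISA("math", c(2012, 2018))  # 3.34, the 2012-2018 math linking error
#   calLinkingErrorPISA("read", c(2000, 2003))  # 0; no cross-mode link between these two years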
# @title Linking error for NAEP DBA/PBA gap analysis.
# @param X a numeric value estimated from digitally based assessments
# @param subjectI a character value set to "Mathematics" or "Reading" for subject information
# @param gradeI a character value set to "Grade 4" or "Grade 8" as grade level
# @param gapI a logical indicating if the linking error is a gap
# @param dependentVarI a character value set to one of NAEP math or reading subscales
# @param statElementI a character vector selected from "Mean", "Row Percent", "10", "25", "50", "75", "90",
# "At or above basic", "At or above proficient", "Below basic", "At basic", "At proficient", or "At advanced" indicating the relevant statistic
# @author Huade Huo
calLinkingError <- function(X,
subjectI = NULL,
gradeI = NULL,
dependentVarI = NULL,
statElementI = NULL,
gapI = FALSE) {
linkingErrorsRDS <- readRDS(system.file("linkingErrors", "linkingErrors.rds", package = "EdSurvey"))
linkingErrorRDSi <- linkingErrorsRDS[linkingErrorsRDS$subject %in% subjectI &
linkingErrorsRDS$dependentVar %in% dependentVarI &
linkingErrorsRDS$grade %in% gradeI &
linkingErrorsRDS$statElement %in% statElementI &
linkingErrorsRDS$gap %in% gapI, ]
linkingErrorRDSi$statElement <- factor(linkingErrorRDSi$statElement, levels = statElementI)
linkingErrorRDSi <- linkingErrorRDSi[order(linkingErrorRDSi$statElement), ]
if (nrow(linkingErrorRDSi) == 0) {
stop("Linking error is not available.")
}
linkingErrori <- linkingErrorRDSi$Aterm * X^2 + linkingErrorRDSi$Bterm * X + linkingErrorRDSi$Cterm
return(linkingErrori)
}
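# Illustrative (not run) sketch; the valid argument values depend on the contents of
# linkingErrors.rds, so the specific strings below are assumptions:
#   calLinkingError(X = 250,
#                   subjectI = "Mathematics", gradeI = "Grade 4",
#                   dependentVarI = "composite", statElementI = "Mean", gapI = FALSE)
# evaluates the stored quadratic Aterm * X^2 + Bterm * X + Cterm for the matching row(s).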
#' @rdname printGap
#' @title Gap Analysis Printing
#'
#' @description Prints labels and a results vector of a gap analysis.
#' @param x an \code{R} object representing a \code{gap} of class \code{gap} or \code{gapList}
#' @param printPercentage a logical value set to \code{TRUE} to request printing
#' of the percentage in the groups. Defaults to \code{TRUE}.
#' @param ... further arguments passed to \code{print}
#' @param use_es_round a logical; when \code{TRUE}, round the printed output using the EdSurvey rounding methods. Defaults to \code{getOption("EdSurvey_round_output")}.
#' @method print gap
#' @author Paul Bailey
#' @aliases print.gapList
#' @export
print.gap <- function(x, ..., printPercentage = TRUE, use_es_round = getOption("EdSurvey_round_output")) {
if (use_es_round) {
x <- es_round(x)
}
cat("Call: ")
print(x$call)
cat("\nLabels:\n")
lab <- data.frame(
stringsAsFactors = FALSE,
group = c("A", "B"),
definition = c(deparse(x$labels$A), deparse(x$labels$B)),
nFullData = c(x$labels$n0A, x$labels$n0B),
nUsed = c(x$labels$nUsedA, x$labels$nUsedB)
)
if (!is.null(x$labels$nPSUA)) {
lab$nPSU <- c(x$labels$nPSUA, x$labels$nPSUB)
}
print(lab, row.names = FALSE)
cl_print <- match.call()
if (printPercentage) {
cat("\nPercentage:\n")
pct <- data.frame(x$percentage, stringsAsFactors = FALSE)
res <- prepGapDf(pct)
res$df <- roundLikePrintCoefmat(res$df, cs=res$cs, tst=res$tst, cl_print)
print(res$df, row.names = FALSE, cs.ind = res$cs, tst.ind = res$tst, ...)
}
cat("\nResults:\n")
attributes(x$results)$row.names <- rep("", nrow(x$results))
inds <- seq_len(ncol(x$results))
tst <- inds[grepl("pValue", colnames(x$results))]
cs <- inds[!grepl("pValue", colnames(x$results))]
ind0 <- -1 + which.max(colnames(x$results) == "estimateA")
tst <- tst[!tst %in% seq_len(ind0)]
cs <- cs[!cs %in% seq_len(ind0)]
cl_print <- match.call()
x$results <- roundLikePrintCoefmat(x$results, cs=cs, tst=tst, cl_print)
print(x$results)
}
roundLikePrintCoefmat <- function(x, cs, tst, cl_print) {
# round like printCoefmat
absvec <- abs(x[ , cs])
absvec <- absvec[absvec > 0 & !is.na(absvec)]
if ("digits" %in% names(cl_print)) {
digits <- cl_print[["digits"]]
} else {
digits <- max(3, getOption("digits") - 2)
}
if ("eps.Pvalue" %in% names(cl_print)) {
eps.Pvalue <- cl_print[["eps.Pvalue"]]
} else {
eps.Pvalue <- .Machine$double.eps
}
if ("dig.tst" %in% names(cl_print)) {
dig.tst <- cl_print[["dig.tst"]]
} else {
dig.tst <- max(1, min(5, digits-1))
}
new_digits <- max(1, digits - 1 - floor(log10(max(absvec))))
x[,cs] <- format(round(x[ , cs], new_digits), digits=digits)
x[,tst] <- format.pval(x[ , tst], digits= dig.tst, eps= eps.Pvalue)
return(x)
}
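# Illustrative sketch of how the printers above call this helper: `cs` indexes the numeric
# estimate/SE columns and `tst` indexes the p-value columns, e.g.
#   roundLikePrintCoefmat(res$df, cs = res$cs, tst = res$tst, cl_print = match.call())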
#' @rdname printGap
#' @method print gapList
#' @export
print.gapList <- function(x, ..., printPercentage = TRUE) {
cat("gapList\n")
cat(paste0("Call: "))
print(x$call)
cat("\nLabels:\n")
lab <- data.frame(
stringsAsFactors = FALSE,
group = c("A", "B"),
definition = c(deparse(x$labels$A), deparse(x$labels$B))
)
print(lab, row.names = FALSE)
cl_print <- match.call()
if (printPercentage) {
cat("\nPercentage:\n")
pct <- data.frame(x$percentage, stringsAsFactors = FALSE)
res <- prepGapDf(pct)
res$df <- roundLikePrintCoefmat(res$df, cs=res$cs, tst=res$tst, cl_print)
print(res$df, row.names = FALSE, cs.ind = res$cs, tst.ind = res$tst, ...)
}
cat("\nResults:\n")
res <- prepGapDf(x$results)
res$df <- roundLikePrintCoefmat(res$df, cs=res$cs, tst=res$tst, cl_print)
print(res$df)
}
prepGapDf <- function(df) {
attributes(df)$row.names <- rep("", nrow(df))
inds <- seq_len(ncol(df))
tst <- inds[grepl("pValue", colnames(df))]
cs <- inds[!grepl("pValue", colnames(df))]
ind0 <- -1 + which.max(colnames(df) %in% c("estimateA", "pctA"))
tst <- tst[!tst %in% seq_len(ind0)]
cs <- cs[!cs %in% seq_len(ind0)]
return(list(df=df, tst=tst, cs=cs))
}
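# Example: for a percentage data frame with columns
#   pctA, pctAse, pctB, pctBse, diffAB, covAB, diffABse, diffABpValue, dofAB
# prepGapDf returns tst = 8 (the p-value column) and cs = the remaining column indices,
# which the printers pass on to roundLikePrintCoefmat and print.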
# helper that identifies variables in a call
# ccall is the call to parse
# x is an edsurvey.data.frame or light.edsurvey.data.frame
# @author Paul Bailey
parseVars <- function(ccall, x) {
vars <- c()
# for each element
if (typeof(ccall) == "symbol") {
return(vars)
}
for (i in seq_along(ccall)) {
# if it is a name
if (inherits(ccall[[i]], "name")) {
ccall_c <- as.character(ccall[[i]])
# only record names that are columns on the data; a name not in the data may be an
# R object in the parent frame rather than a survey variable
if (ccall_c %in% colnames(x)) { # originally: if(ccall_c %in% names(x$data)) { TOM FINK::dataList update
if (ccall[[i]] == "%in%" || is.function(ccall[[i]])) {
# do nothing
} else {
vars <- c(vars, ccall_c)
}
} # end of if statement: if (ccall_c %in% colnames(x))
} # end of if statement: if (inherits(ccall[[i]], "name"))
if (inherits(ccall[[i]], "call")) {
# if this is a call, recursively parse it
vars <- c(vars, parseVars(ccall[[i]], x))
} # end of if statement: if (inherits(ccall[[i]], "call"))
} # end of for loop: i in seq_along(ccall)
vars
} # end of function: parseVars
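# Example (illustrative; `dsex` and `sdracem` stand in for columns on the data `x`):
#   parseVars(quote(dsex == "Male" & sdracem %in% c("White", "Black")), x)
#   # c("dsex", "sdracem")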
# helper function that calculates "pooled" dof, used for two distinct samples
# @author Trang Nguyen and Paul Bailey
dofCompute <- function(seA, seB, dofA, dofB) {
return(mapply(function(seA, seB, dofA, dofB) {
ifelse(seA + seB > 0, (seA^2 + seB^2)^2 / (seA^4 / dofA + seB^4 / dofB), 0)
}, seA, seB, dofA, dofB))
}
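# Worked example of the Welch-Satterthwaite pooling:
#   dofCompute(seA = 1.2, seB = 0.9, dofA = 40, dofB = 35)
#   # (1.2^2 + 0.9^2)^2 / (1.2^4 / 40 + 0.9^4 / 35), approximately 71.7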
# returns NA when df is 0, but passes Inf to pt to avoid a warning
# @author Paul Bailey
pt2 <- function(q, df) {
df2 <- ifelse(df == 0, Inf, df)
return(ifelse(df == 0, NA, pt(q, df2)))
}
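# Examples:
#   pt2(2, df = 10)  # same as pt(2, 10), approximately 0.963
#   pt2(2, df = 0)   # NA, with no warning from pt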
# match strings to attributes in a case insensitive way
# throwing an error if the string matches more than one achievement level
# or matches none
gmatchAttr <- function(strings, achievementLevelNames) {
s0 <- strings
alN0 <- achievementLevelNames
# lower case everything to make the function case insensitive
strings <- tolower(trimws(strings))
alN <- tolower(achievementLevelNames)
# find and store the prefix, then remove it from strings
prefix <- ifelse(grepl("^at or above", strings), substr(s0, 0, nchar("at or above") + 1), "")
prefix <- ifelse(nchar(prefix) == 0 & grepl("^at ", strings), substr(s0, 0, 3), prefix)
prefix <- ifelse(grepl("^below", strings), substr(s0, 0, nchar("below") + 1), prefix)
strings <- trimws(substr(strings, nchar(prefix) + 1, nchar(strings)))
out <- rep("", length(strings))
error <- rep("", length(strings))
for (i in seq_along(out)) {
for (j in seq_along(alN)) {
update <- grepl(strings[i], alN[j], fixed = TRUE)
if (update) {
if (nchar(out[i]) == 0) {
# base case, store the output
out[i] <- ifelse(update, alN0[j], out[i])
} else {
# append this level to previously found levels in an error
# we can then replay all the levels to the user at the end.
# first...
# if this is the second match, store the previous
# one as an error
if (nchar(error[i]) == 0) {
error[i] <- sQuote(out[i])
}
error[i] <- paste0(c(error[i], sQuote(alN0[j])), collapse = ", ")
out[i] <- "Error"
}
}
}
}
# finalize formatting for errors
error <- ifelse(nchar(error) > 0, paste0(" matched to ", error), "")
error <- ifelse(nchar(out) == 0, paste0(" is not one of the achievement levels for the current survey. Choose one from ", pasteItems(achievementLevelNames)), error)
# if there is an error, format it and throw it
if (any(nchar(error) > nchar(out))) {
evars <- nchar(error) > 0
ferror <- paste0(sQuote(s0[evars]), error[evars], collapse = "; ")
stop(paste0(ferror, "."))
}
out <- trimws(paste(trimws(prefix), trimws(out), sep = " "))
return(out)
}
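# Examples (with illustrative achievement level names):
#   gmatchAttr("at or above prof", c("Below Basic", "Basic", "Proficient", "Advanced"))
#   # "at or above Proficient" (the typed prefix is kept; the level name is completed)
#   gmatchAttr("b", c("Below Basic", "Basic", "Proficient", "Advanced"))
#   # error: ambiguous, 'b' matches both 'Below Basic' and 'Basic'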
getALNames <- function(data, variable) {
al <- getAttributes(data, "achievementLevels")
if (is.list(al)) {
if (variable %in% names(al)) {
al <- al[[variable]]
} else {
# assumes all achievement levels are the same
al <- al[[1]]
}
}
names(al)
}
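# Illustrative sketch (assumes `sdf` is an edsurvey.data.frame whose achievementLevels
# attribute is a named vector, or a list of them keyed by subject scale):
#   getALNames(sdf, "composite")
#   # e.g., c("Basic", "Proficient", "Advanced") for a NAEP composite scale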
addALPrefix <- function(al, als, discrete) {
if (discrete) {
if (grepl("below", al, ignore.case = TRUE)) {
lal <- al
} else {
if (!grepl("^at", al, ignore.case = TRUE)) {
lal <- paste0("At ", al)
} else {
lal <- al
}
}
} else {
if (grepl("^below", al, ignore.case = TRUE)) { # cumulative should not have Below achievementLevel
return(NULL)
} else {
# if it doesn't have the at prefix, add it
if (!grepl("^at", al, ignore.case = TRUE)) {
lal <- paste0(
ifelse(tolower(al) == tolower(als[length(als)]),
"At ",
"At or Above "
),
al
)
} else {
lal <- al
}
}
}
return(lal)
}
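# Examples (with als = c("Basic", "Proficient", "Advanced")):
#   addALPrefix("Proficient", als, discrete = TRUE)   # "At Proficient"
#   addALPrefix("Proficient", als, discrete = FALSE)  # "At or Above Proficient"
#   addALPrefix("Advanced", als, discrete = FALSE)    # "At Advanced" (highest level)
#   addALPrefix("Below Basic", als, discrete = FALSE) # NULL; cumulative levels drop Below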