R/varEstToCov.R
In EdSurvey: Analysis of NCES Education Survey and Assessment Data

Documented in varEstToCov

#' @title Covariance Estimation
#'
#' @description When the variance of a derived statistic (e.g., a difference) is
#'              required, the covariance between the two statistics must be
#'              calculated. This function uses results generated by various
#'              functions (e.g., a \code{\link{lm.sdf}}) to find the covariance
#'              between two statistics.
#' @param varEstA a list of two \code{data.frame}s returned by a function after
#'                the \code{returnVarEstInputs} argument was turned on.
#'                The statistic named in the \code{varA} argument must be
#'                present in each \code{data.frame}.
#' @param varEstB a list of two \code{data.frame}s returned by a function after
#'                the \code{returnVarEstInputs} argument was turned on.
#'                The statistic named in the \code{varA} argument must be
#'                present in each \code{data.frame}. When the same as
#'                \code{varEstA}, the covariance is within one result.
#' @param varA a character that names the statistic in the \code{varEstA}
#'             argument for which a covariance is required
#' @param varB a character that names the statistic in the \code{varEstB}
#'             argument for which a covariance is required
#' @param jkSumMultiplier when the jackknife variance estimation method---or
#'                        balanced repeated replication (BRR)
#'                        method---multiplies the final jackknife variance estimate by a value,
#'                        set \code{jkSumMultiplier} to that value.
#'                        For an \code{edsurvey.data.frame} or
#'                        a \code{light.edsurvey.data.frame},
#'                        the recommended value can be recovered with
#'                        \code{EdSurvey::getAttributes(}\code{myData,} \code{"jkSumMultiplier")}.
#' @param returnComponents set to \code{TRUE} to return the imputation variance seperate from the sampling variance
#'
#' @details
#' These functions are not vectorized, so \code{varA} and
#' \code{varB} must contain exactly one variable name.
#'
#' The method used to compute the covariance is in
#' the vignette titled
#' \href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{\emph{Statistical Methods Used in EdSurvey}}
#'
#' The method used to compute the degrees of freedom is in the vignette titled
#' \href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{\emph{Statistical Methods Used in EdSurvey}}
#' in the section \dQuote{Estimation of Degrees of Freedom.}
#'
#' @return
#' a numeric value; the jackknife covariance estimate. If \code{returnComponents} is \code{TRUE}, returns a vector of
#' length three, \code{V} is the variance estimate, \code{Vsamp} is the sampling component of the variance, and \code{Vimp} is the imputation component of the variance
#'
#' @example man/examples/varEstToCov.R
#' @author Paul Bailey
#' @export
varEstToCov <- function(varEstA, varEstB = varEstA, varA, varB = varA, jkSumMultiplier, returnComponents = FALSE) {
  if (!varA %in% unique(varEstA$JK$variable)) {
    stop(paste0("Could not find ", sQuote("varA"), " value of ", dQuote(varA), " in varEstA$JK$variable. Existing values include ", pasteItems(unique(varEstA$JK$variable)), "."))
  }
  if (!varB %in% unique(varEstB$JK$variable)) {
    stop(paste0("Could not find ", sQuote("varB"), " value of ", dQuote(varB), " in varEstB$JK$variable. Existing values include ", pasteItems(unique(varEstB$JK$variable)), "."))
  }
  if (!is.null(varEstA$PV) & !is.null(varEstB$PV)) {
    if (!varA %in% unique(varEstA$PV$variable)) {
      stop(paste0("Could not find ", sQuote("varA"), " value of ", dQuote(varA), " in varEstA$PV$variable. Existing values include ", pasteItems(unique(varEstA$PV$variable)), "."))
    }
    if (!varB %in% unique(varEstB$PV$variable)) {
      stop(paste0("Could not find ", sQuote("varB"), " value of ", dQuote(varB), " in varEstB$PV$variable. Existing values include ", pasteItems(unique(varEstB$PV$variable)), "."))
    }
  }
  mergeVar <- c("PV", "JKreplicate")
  if (any(!mergeVar %in% names(varEstA$JK))) {
    stop(paste0("Merge vars missing from varEstA$JK ", pasteItems(mergeVar[!mergeVar %in% names(varEstA$JK)]), "."))
  }
  if (any(!mergeVar %in% names(varEstB$JK))) {
    stop(paste0("Merge vars missing from varEstB$JK ", pasteItems(mergeVar[!mergeVar %in% names(varEstB$JK)]), "."))
  }
  JK <- merge(subset(varEstA$JK, variable == varA),
    subset(varEstB$JK, variable == varB),
    by = mergeVar, all.x = FALSE, all.y = FALSE,
    suffixes = c(".A", ".B")
  )
  if (nrow(JK) < 1) {
    stop("Could not find appropriate values in JK results to calculate covariance.")
  }
  JK$cov <- jkSumMultiplier * JK$value.A * JK$value.B
  JK <- JK[!is.na(JK$cov), ]
  CovJK <- sum(JK$cov) / length(unique(JK$PV))
  # now get Imputation Covariance
  if (!is.null(varEstA$PV) & !is.null(varEstB$PV)) {
    mergeVar <- c("PV")
    if (sum(!mergeVar %in% names(varEstA$PV)) > 0) {
      stop(paste0("Merge vars missing from varEstA$PV ", pasteItems(mergeVar[!mergeVar %in% names(varEstA$PV)]), "."))
    }
    if (sum(!mergeVar %in% names(varEstB$PV)) > 0) {
      stop(paste0("Merge vars missing from varEstB$PV ", pasteItems(mergeVar[!mergeVar %in% names(varEstB$PV)]), "."))
    }
    PV <- merge(subset(varEstA$PV, variable == varA),
      subset(varEstB$PV, variable == varB),
      by = mergeVar, all.x = FALSE, all.y = FALSE,
      suffixes = c(".A", ".B")
    )
    if (nrow(PV) < 1) {
      stop("Could not find appropriate values in PV results to calculate covariance.")
    }
    PV$cov <- PV$value.A * PV$value.B
    M <- length(unique(PV$PV))
    if (M == 1) {
      stop("Insufficient number of plausible values in PV results to calculate imputation variance.")
    }
    # see statistics vignette for source of the scaling factor
    CovImp <- (M + 1) / (M * (M - 1)) * sum(PV$cov)
  } else { # ends  if(!is.null(varEstA$PV) & !is.null(varEstB$PV))
    CovImp <- 0
  }
  if (returnComponents) {
    V <- CovJK + CovImp
    return(list(V = V, Vsamp = CovJK, Vimp = CovImp))
  }
  return(CovJK + CovImp)
}