R/variance_function.R
In gustave: A User-Oriented Statistical Toolkit for Analytical Variance Estimation

Documented in res_cal varDT var_pois var_srs

#' Linear Regression Residuals Calculation
#'
#' @description \code{res_cal} calculates linear regression residuals in an 
#' efficient way : handling several dependent variables at a time, using 
#' Matrix::TsparseMatrix capabilities and allowing for pre-calculation of 
#' the matrix inverse.
#'
#' @param y A (sparse) numerical matrix of dependent variable(s).
#' @param x A (sparse) numerical matrix of independent variable(s).
#' @param w An optional numerical vector of row weights.
#' @param by An optional categorical vector (factor or character)
#' when residuals calculation is to be conducted within by-groups 
#' (see Details).
#' @param precalc A list of pre-calculated results (see Details).
#' @param id A vector of identifiers of the units used in the calculation.
#'   Useful when \code{precalc = TRUE} in order to assess whether the ordering of the
#'   \code{y} data matrix matches the one used at the precalculation step.
#'
#' @details In the context of the \code{gustave} package, linear 
#' regression residual calculation is solely used to take into account 
#' the effect of calibration on variance estimation. Independent variables 
#' are therefore most likely to be the same from one variance estimation 
#' to another, hence the inversion of the matrix 
#' \code{t(x) \%*\% Diagonal(x = w) \%*\% x} can be done once and for all 
#' at a pre-calculation step.
#'
#' The parameters \code{y} and \code{precalc} determine whether a list of 
#' pre-calculated data should be used in order to speed up the regression 
#' residuals computation at execution time:
#' \itemize{
#'  \item if \code{y} not \code{NULL} and \code{precalc} \code{NULL} : 
#'  on-the-fly calculation of the matrix inverse and the regression residuals 
#'  (no pre-calculation).
#'  \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} : 
#'  pre-calculation of the matrix inverse which is stored in a list of 
#'  pre-calculated data.
#'  \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} : 
#'  calculation of the regression residuals using the list of pre-calculated 
#'  data.
#' }
#'
#' The \code{by} parameter allows for calculation within by-groups : all 
#' calculation are made separately for each by-group (when calibration was 
#' conducted separately on several subsamples), but in an efficient way using 
#' Matrix::TsparseMatrix capabilities (especially when the matrix inverse is 
#' pre-calculated).
#' 
#'
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a
#'   numerical matrix with same structure (regular base::matrix or
#'   Matrix::TsparseMatrix) and dimensions as \code{y}. \item if \code{y} is
#'   \code{NULL} (pre-calculation step) : a list containing pre-calculated data.}
#'   
#' @author Martin Chevalier
#'
#' @examples # Generating random data
#' set.seed(1)
#' n <- 100
#' H <- 5
#' y <- matrix(rnorm(2*n), nrow = n)
#' x <- matrix(rnorm(10*n), nrow = n)
#' by <- letters[sample(1:H, n, replace = TRUE)]
#'
#' # Direct calculation
#' res_cal(y, x)
#'
#' # Calculation with pre-calculated data
#' precalc <- res_cal(y = NULL, x)
#' res_cal(y, precalc = precalc)
#' identical(res_cal(y, x), res_cal(y, precalc = precalc))
#'
#' # Matrix::TsparseMatrix capability
#' require(Matrix)
#' X <- as(x, "TsparseMatrix")
#' Y <- as(y, "TsparseMatrix")
#' identical(res_cal(y, x), as.matrix(res_cal(Y, X)))
#'
#' # by parameter for within by-groups calculation
#' res_cal(Y, X, by = by)
#' all.equal(
#'  res_cal(Y, X, by = by)[by == "a", ],
#'   res_cal(Y[by == "a", ], X[by == "a", ])
#' )
#'
#' @export

res_cal <- function(y = NULL, x, w = NULL, by = NULL, precalc = NULL, id = NULL){

  # by <- NULL; w <- NULL
  
  if(is.null(precalc)){

    if(is.null(w)) w <- rep(1, NROW(x))

    # Taking the by into account
    x <- coerce_to_TsparseMatrix(x)
    if(!is.null(by)) x <- detect_block(x, by) %||% make_block(x, by)

    # Determine the inverse of the t(x) %*% x matrix while removing colinear variables
    while(TRUE){
      u <- crossprod(x,  Matrix::Diagonal(x = w) %*% x)
      if(Matrix::rankMatrix(u, method = "qr") != NROW(u)){
        is_colinear <- as.vector(is.na(stats::lm.fit(x = as.matrix(u), y = rep(1, NROW(u)))$coef))
        if(any(is_colinear)) note("Some variables in x were discarded due to collinearity.")
        x <- x[, !is_colinear, drop = FALSE]
      }else break
    }
    inv <- solve(u)

  }else list2env(precalc, envir = environment())


  if(is.null(y)) return(list(x = x, w = w, inv = inv, id = id)) else {
    is_sparse_y <- inherits(y, c("Matrix"))
    is_matrix_y <- !is_sparse_y && inherits(y, c("matrix"))
    dimnames_y <- dimnames(y)
    if(!is.null(precalc) && !is.null(id) && !is.null(rownames(y)) && !identical(as.character(id), rownames(y))) stop(
      "The names of the data matrix (y argument) do not match the reference id (id argument)."
    )
    e <- y - x %*% ( inv  %*% crossprod(x,  Matrix::Diagonal(x = w) %*% y) )
    if(!is_sparse_y) e <- if(is_matrix_y) as.matrix(e) else as.vector(e)
    dimnames(e) <- dimnames_y
    e
  }

}

# n <- 70000; p <- 10; q <- 150; H <- 6; y <- matrix(rnorm(n*p),ncol=p); x <- Matrix(rnorm(n*q)*(runif(n*q) > 0.98),ncol=q); w <- runif(n); by <- rep(1:H,n %/% H + 1)[1:n][sample.int(n)];
# precalc <- res_cal(y = NULL, x = x, w = w, by = by)
# microbenchmark(times = 10, res_cal(y, precalc = precalc), res_cal(y, x = x, w = w, by = by))
# inv <- ginv(as.matrix(t(x * w) %*% x))
# t2 <- res_calib(y,x,w,inv)
# identical(t,t2)
# microbenchmark(res_cal(y,x,w),res_cal(y,x,w,inv),times = 10)

#' Variance approximation with Deville-Tillé (2005) formula
#' 
#' @aliases varDT var_srs
#' 
#' @description \code{varDT} estimates the variance of the estimator of a total
#'   in the case of a balanced sampling design with equal or unequal probabilities 
#'   using Deville-Tillé (2005) formula. Without balancing variables, it falls back 
#'   to Deville's (1993) classical approximation. Without balancing variables and 
#'   with equal probabilities, it falls back to the classical Horvitz-Thompson 
#'   variance estimator for the total in the case of simple random sampling. 
#'   Stratification is natively supported.
#'   
#'   \code{var_srs} is a convenience wrapper for the (stratified) simple random
#'   sampling case.
#'   
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#'   is to be estimated. 
#' @param pik A numerical vector of first-order inclusion probabilities.
#' @param x An optional (sparse) numerical matrix of balancing variable(s).
#' @param strata An optional categorical vector (factor or character) when
#'   variance estimation is to be conducted within strata.
#' @param w An optional numerical vector of row weights (see Details).
#' @param precalc A list of pre-calculated results (see Details).
#' @param id A vector of identifiers of the units used in the calculation.
#'   Useful when \code{precalc = TRUE} in order to assess whether the ordering of the
#'   \code{y} data matrix matches the one used at the pre-calculation step.
#'   
#' @details \code{varDT} aims at being the workhorse of most variance estimation conducted
#'   with the \code{gustave} package. It may be used to estimate the variance
#'   of the estimator of a total in the case of (stratified) simple random sampling, 
#'   (stratified) unequal probability sampling and (stratified) balanced sampling. 
#'   The native integration of stratification based on Matrix::TsparseMatrix allows 
#'   for significant performance gains compared to higher level vectorizations
#'   (\code{*apply} especially).
#'   
#'   Several time-consuming operations (e.g. collinearity-check, matrix
#'   inversion) can be pre-calculated in order to speed up the estimation at
#'   execution time. This is determined by the value of the parameters \code{y}
#'   and \code{precalc}: \itemize{ \item if \code{y} not \code{NULL} and
#'   \code{precalc} \code{NULL} : on-the-fly calculation (no pre-calculation). 
#'   \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} :
#'   pre-calculation whose results are stored in a list of pre-calculated data. 
#'   \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} :
#'   calculation using the list of pre-calculated data. }
#'   
#'   \code{w} is a row weight used at the final summation step. It is useful
#'   when \code{varDT} or \code{var_srs} are used on the second stage of a 
#'   two-stage sampling design applying the Rao (1975) formula.
#'   
#' @section Difference with \code{varest} from package \code{sampling}:
#'   
#'   \code{varDT} differs from \code{sampling::varest} in several ways: 
#'   \itemize{ \item The formula implemented in \code{varDT} is more general and
#'   encompasses balanced sampling. \item Even in its reduced
#'   form (without balancing variables), the formula implemented in \code{varDT}
#'   slightly differs from the one implemented in \code{sampling::varest}.
#'   Caron (1998, pp. 178-179) compares the two estimators
#'   (\code{sampling::varest} implements V_2, \code{varDT} implements V_1). 
#'   \item \code{varDT} introduces several optimizations: \itemize{ \item
#'   matrixwise operations allow to estimate variance on several interest
#'   variables at once \item Matrix::TsparseMatrix capability and the native
#'   integration of stratification yield significant performance gains. \item
#'   the ability to pre-calculate some time-consuming operations speeds up the
#'   estimation at execution time. } \item \code{varDT} does not natively
#'   implements the calibration estimator (i.e. the sampling variance estimator
#'   that takes into account the effect of calibration). In the context of the
#'   \code{gustave} package, \code{\link{res_cal}} should be called before 
#'   \code{varDT} in order to achieve the same result.}
#'   
#' @seealso \code{\link{res_cal}}
#' 
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : 
#'   the estimated variances as a numerical vector of size the number of 
#'   columns of y. \item if \code{y} is \code{NULL} (pre-calculation step) : a list 
#'   containing pre-calculated data.}
#'   
#' @author Martin Chevalier
#'   
#' @references Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes 
#'   des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/}
#'   Deville, J.-C. (1993), \emph{Estimation de la variance pour les enquêtes en
#'   deux phases}, Manuscript, INSEE, Paris.
#'   
#'   Deville, J.-C., Tillé, Y. (2005), "Variance approximation under balanced
#'   sampling", \emph{Journal of Statistical Planning and Inference}, 128, issue
#'   2 569-591
#'   
#'   Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs",
#'   \emph{Sankhya}, C n°37
#'
#' @examples library(sampling)
#' set.seed(1)
#'
#' # Simple random sampling case
#' N <- 1000
#' n <- 100
#' y <- rnorm(N)[as.logical(srswor(n, N))]
#' pik <- rep(n/N, n)
#' varDT(y, pik)
#' sampling::varest(y, pik = pik)
#' N^2 * (1 - n/N) * var(y) / n
#'
#' # Unequal probability sampling case
#' N <- 1000
#' n <- 100
#' pik <- runif(N)
#' s <- as.logical(UPsystematic(pik))
#' y <- rnorm(N)[s]
#' pik <- pik[s]
#' varDT(y, pik)
#' varest(y, pik = pik)
#' # The small difference is expected (see Details).
#'
#' # Balanced sampling case
#' N <- 1000
#' n <- 100
#' pik <- runif(N)
#' x <- matrix(rnorm(N*3), ncol = 3)
#' s <- as.logical(samplecube(x, pik))
#' y <- rnorm(N)[s]
#' pik <- pik[s]
#' x <- x[s, ]
#' varDT(y, pik, x)
#'
#' # Balanced sampling case (variable of interest
#' # among the balancing variables)
#' N <- 1000
#' n <- 100
#' pik <- runif(N)
#' y <- rnorm(N)
#' x <- cbind(matrix(rnorm(N*3), ncol = 3), y)
#' s <- as.logical(samplecube(x, pik))
#' y <- y[s]
#' pik <- pik[s]
#' x <- x[s, ]
#' varDT(y, pik, x)
#' # As expected, the total of the variable of interest is perfectly estimated.
#'
#' # strata argument
#' n <- 100
#' H <- 2
#' pik <- runif(n)
#' y <- rnorm(n)
#' strata <- letters[sample.int(H, n, replace = TRUE)]
#' all.equal(
#'  varDT(y, pik, strata = strata),
#'  varDT(y[strata == "a"], pik[strata == "a"]) + varDT(y[strata == "b"], pik[strata == "b"])
#' )
#'
#' # precalc argument
#' n <- 1000
#' H <- 50
#' pik <- runif(n)
#' y <- rnorm(n)
#' strata <- sample.int(H, n, replace = TRUE)
#' precalc <- varDT(y = NULL, pik, strata = strata)
#' identical(
#'  varDT(y, precalc = precalc),
#'  varDT(y, pik, strata = strata)
#' )
#'
#' @export

varDT <- function(y = NULL, pik, x = NULL, strata = NULL, w = NULL, precalc = NULL, id = NULL){

  if(is.null(precalc)){

    if(any(pik <= 0 | pik > 1)) stop("All pik must be in ]0;1]")
    
    exh <- pik == 1
    if(any(exh)) note(
      sum(exh), " units are exhaustive (pik = 1). They are discarded from the variance estimation process."
    )
    pik <- pik[!exh]
    
    x <- if(!is.null(x)) coerce_to_TsparseMatrix(x)[!exh, , drop = FALSE] else pik

    # Stratification
    if(!is.null(strata)){
      strata <- droplevels(as.factor(strata[!exh]))
      if(any(tapply(strata, strata, length) == 1, na.rm = TRUE))
        stop("Some strata contain less than 2 samples units.")
      x <- detect_block(x, strata) %||% make_block(x, strata)
      colby <- attr(x, "colby")
      rowby <- attr(x, "rowby")
    }else{
      colby <- rep("1", NCOL(x))
      rowby <- rep("1", NROW(x))
    }

    # Determine A, ck and inv terms while removing colinear variables
    n <- as.vector(tapply(rowby, rowby, length)[rowby])
    A <- t(x) %*% Diagonal(x = 1 / pik)
    while(TRUE){
      p <- as.vector(tapply(colby, colby, length)[rowby])
      ck <- (1 - pik) * n / pmax(n - p, 1)
      u <- tcrossprod(A %*% Matrix::Diagonal(x = ck), A)
      if(Matrix::rankMatrix(u, method = "qr") != NROW(u)){
        # TODO: See if tol = 1e-12 in rankMatrix does not solve some issues
        is_colinear <- as.vector(is.na(stats::lm.fit(x = as.matrix(u), y = rep(1, NROW(u)))$coef))
        if(any(is_colinear)) note("Some variables in x were discarded due to collinearity.")
        A <- A[!is_colinear, , drop = FALSE]
        colby <- colby[!is_colinear]
      }else break
    }
    inv <- solve(u)

  }else list2env(precalc, envir = environment())

  if(is.null(y)){
    # Diagonal term of the variance estimator
    diago <- ck * (1 - colSums(A * (inv %*% A)) * ck)/pik^2
    names(diago) <- names(pik)
    if(!is.null(w)) stop("w is not to be included in the precalculated data.")
    return(list(id = id, pik = pik, exh = exh, A = A, ck = ck, inv = inv, diago = diago))
  }else{
    y <- coerce_to_TsparseMatrix(y)
    if(!is.null(precalc) && !is.null(id) && !is.null(rownames(y)) && !identical(as.character(id), rownames(y))) stop(
      "The names of the data matrix (y argument) do not match the reference id (id argument)."
    )
    if(is.null(w)) w <- rep(1, length(pik))
    z <- Diagonal(x = 1 / pik) %*% y[!exh, , drop = FALSE]
    zhat <- t(A) %*% inv %*% (A %*% Matrix::Diagonal(x = ck) %*% z)
    return(Matrix::colSums(ck * w * (z - zhat)^2))
  }

}



#' @rdname varDT
#' @export
var_srs <- function(y, pik, strata = NULL, w = NULL, precalc = NULL){
  if(is.null(precalc) && !is.null(strata) && any(tapply(pik, strata, stats::sd) > 1e-6, na.rm = TRUE))
    stop("First-order inclusion probabilities are not equal (within strata if any).")
  varDT(y = y, pik = pik, x = NULL, strata = strata, w = w, precalc = precalc)
}



#' Variance estimator for a Poisson sampling design
#' 
#' @description \code{var_pois} estimates the variance of the estimator 
#' of a total for a Poisson sampling design.
#' 
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#'   is to be estimated.
#' @param pik A numerical vector of first-order inclusion probabilities.
#' @param w An optional numerical vector of row weights (see Details).
#' 
#' @details \code{w} is a row weight used at the final summation step. It is useful
#'   when \code{var_pois} is used on the second stage of a two-stage sampling
#'   design applying the Rao (1975) formula.
#' 
#' @return The estimated variances as a numerical vector of size the number of 
#'   columns of \code{y}. 
#'    
#' @author Martin Chevalier
#'   
#' @references Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs",
#'   \emph{Sankhya}, C n°37

#' @export
var_pois <- function(y, pik, w = rep(1, length(pik))){
  colSums(w * (1 - pik) * (y / pik)^2)
}


#' Sen-Yates-Grundy variance estimator
#' 
#' @description \code{varSYG} computes the Sen-Yates-Grundy 
#' variance estimator.
#' 
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#'   is to be estimated.
#' @param pikl A numerical matrix of second-order inclusion probabilities.
#' @param precalc A list of pre-calculated results (analogous to the one used by 
#'   \code{\link{varDT}}).
#' 
#' @details \code{varSYG} aims at being an efficient implementation of the 
#'   Sen-Yates-Grundy variance estimator for sampling designs with fixed sample 
#'   size. It should be especially useful when several variance estimations are
#'   to be conducted, as it relies on (sparse) matrix linear algebra.
#' 
#'   Moreover, in order to be consistent with \code{\link{varDT}}, \code{varSYG}
#'   has a \code{precalc} argument allowing for the re-use of intermediary
#'   results calculated once and for all in a pre-calculation step (see 
#'   \code{\link{varDT}} for details).
#'   
#' @section Difference with \code{varHT} from package \code{sampling}:
#'   
#'   \code{varSYG} differs from \code{sampling::varHT} in several ways: 
#'   \itemize{ \item The formula implemented in \code{varSYG} is solely
#'   the Sen-Yates-Grundy estimator, which is the one calculated 
#'   by \code{varHT} when method = 2.
#'   \item \code{varSYG} introduces several optimizations: \itemize{ \item
#'   matrixwise operations allow to estimate variance on several interest
#'   variables at once \item Matrix::TsparseMatrix capability yields significant 
#'   performance gains.}}
#' 
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a
#'   numerical vector of size the number of columns of y. \item if \code{y} is
#'   \code{NULL} (pre-calculation step) : a list containing pre-calculated data 
#'   (analogous to the one used by \code{\link{varDT}}).}
#' 
#' @author Martin Chevalier
#' 
#' @examples library(sampling)
#' set.seed(1)
#' 
#' # Simple random sampling case
#' N <- 1000
#' n <- 100
#' y <- rnorm(N)[as.logical(srswor(n, N))]
#' pikl <- matrix(rep((n*(n-1))/(N*(N-1)), n*n), nrow = n)
#' diag(pikl) <- rep(n/N, n)
#' varSYG(y, pikl)
#' sampling::varHT(y = y, pikl = pikl, method = 2)

#' @export

varSYG <- function (y = NULL, pikl, precalc = NULL){
  if(is.null(precalc)){
    pik = diag(pikl)
    delta <- 1 - pik %*% t(pik)/pikl
  }else list2env(precalc, envir = environment())
  if(is.null(y)){
    diago <- -(1/pik^2) * rowSums(delta - diag(x = diag(delta)))
    names(diago) <- row.names(pikl)
    return(list(pikl = pikl, pik = pik, delta = delta, diago = diago))
  }else{
    var <- colSums((y/pik) * (delta %*% (y/pik)) - delta %*% (y/pik)^2)
    return(var)
  }
}

# TODO: Add a varHT() estimator