Nothing
#' Linear Regression Residuals Calculation
#'
#' @description \code{res_cal} calculates linear regression residuals in an
#' efficient way : handling several dependent variables at a time, using
#' Matrix::TsparseMatrix capabilities and allowing for pre-calculation of
#' the matrix inverse.
#'
#' @param y A (sparse) numerical matrix of dependent variable(s).
#' @param x A (sparse) numerical matrix of independent variable(s).
#' @param w An optional numerical vector of row weights.
#' @param by An optional categorical vector (factor or character)
#' when residuals calculation is to be conducted within by-groups
#' (see Details).
#' @param precalc A list of pre-calculated results (see Details).
#' @param id A vector of identifiers of the units used in the calculation.
#' Useful when \code{precalc = TRUE} in order to assess whether the ordering of the
#' \code{y} data matrix matches the one used at the precalculation step.
#'
#' @details In the context of the \code{gustave} package, linear
#' regression residual calculation is solely used to take into account
#' the effect of calibration on variance estimation. Independent variables
#' are therefore most likely to be the same from one variance estimation
#' to another, hence the inversion of the matrix
#' \code{t(x) \%*\% Diagonal(x = w) \%*\% x} can be done once and for all
#' at a pre-calculation step.
#'
#' The parameters \code{y} and \code{precalc} determine whether a list of
#' pre-calculated data should be used in order to speed up the regression
#' residuals computation at execution time:
#' \itemize{
#' \item if \code{y} not \code{NULL} and \code{precalc} \code{NULL} :
#' on-the-fly calculation of the matrix inverse and the regression residuals
#' (no pre-calculation).
#' \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} :
#' pre-calculation of the matrix inverse which is stored in a list of
#' pre-calculated data.
#' \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} :
#' calculation of the regression residuals using the list of pre-calculated
#' data.
#' }
#'
#' The \code{by} parameter allows for calculation within by-groups : all
#' calculation are made separately for each by-group (when calibration was
#' conducted separately on several subsamples), but in an efficient way using
#' Matrix::TsparseMatrix capabilities (especially when the matrix inverse is
#' pre-calculated).
#'
#'
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a
#' numerical matrix with same structure (regular base::matrix or
#' Matrix::TsparseMatrix) and dimensions as \code{y}. \item if \code{y} is
#' \code{NULL} (pre-calculation step) : a list containing pre-calculated data.}
#'
#' @author Martin Chevalier
#'
#' @examples # Generating random data
#' set.seed(1)
#' n <- 100
#' H <- 5
#' y <- matrix(rnorm(2*n), nrow = n)
#' x <- matrix(rnorm(10*n), nrow = n)
#' by <- letters[sample(1:H, n, replace = TRUE)]
#'
#' # Direct calculation
#' res_cal(y, x)
#'
#' # Calculation with pre-calculated data
#' precalc <- res_cal(y = NULL, x)
#' res_cal(y, precalc = precalc)
#' identical(res_cal(y, x), res_cal(y, precalc = precalc))
#'
#' # Matrix::TsparseMatrix capability
#' require(Matrix)
#' X <- as(x, "TsparseMatrix")
#' Y <- as(y, "TsparseMatrix")
#' identical(res_cal(y, x), as.matrix(res_cal(Y, X)))
#'
#' # by parameter for within by-groups calculation
#' res_cal(Y, X, by = by)
#' all.equal(
#' res_cal(Y, X, by = by)[by == "a", ],
#' res_cal(Y[by == "a", ], X[by == "a", ])
#' )
#'
#' @export
res_cal <- function(y = NULL, x, w = NULL, by = NULL, precalc = NULL, id = NULL){
# by <- NULL; w <- NULL
if(is.null(precalc)){
if(is.null(w)) w <- rep(1, NROW(x))
# Taking the by into account
x <- coerce_to_TsparseMatrix(x)
if(!is.null(by)) x <- detect_block(x, by) %||% make_block(x, by)
# Determine the inverse of the t(x) %*% x matrix while removing colinear variables
while(TRUE){
u <- crossprod(x, Matrix::Diagonal(x = w) %*% x)
if(Matrix::rankMatrix(u, method = "qr") != NROW(u)){
is_colinear <- as.vector(is.na(stats::lm.fit(x = as.matrix(u), y = rep(1, NROW(u)))$coef))
if(any(is_colinear)) note("Some variables in x were discarded due to collinearity.")
x <- x[, !is_colinear, drop = FALSE]
}else break
}
inv <- solve(u)
}else list2env(precalc, envir = environment())
if(is.null(y)) return(list(x = x, w = w, inv = inv)) else {
is_sparse_y <- inherits(y, c("Matrix"))
is_matrix_y <- !is_sparse_y && inherits(y, c("matrix"))
dimnames_y <- dimnames(y)
if(!is.null(precalc) && !is.null(id) && !is.null(rownames(y)) && !identical(as.character(id), rownames(y))) stop(
"The names of the data matrix (y argument) do not match the reference id (id argument)."
)
e <- y - x %*% ( inv %*% crossprod(x, Matrix::Diagonal(x = w) %*% y) )
if(!is_sparse_y) e <- if(is_matrix_y) as.matrix(e) else as.vector(e)
dimnames(e) <- dimnames_y
e
}
}
# n <- 70000; p <- 10; q <- 150; H <- 6; y <- matrix(rnorm(n*p),ncol=p); x <- Matrix(rnorm(n*q)*(runif(n*q) > 0.98),ncol=q); w <- runif(n); by <- rep(1:H,n %/% H + 1)[1:n][sample.int(n)];
# precalc <- res_cal(y = NULL, x = x, w = w, by = by)
# microbenchmark(times = 10, res_cal(y, precalc = precalc), res_cal(y, x = x, w = w, by = by))
# inv <- ginv(as.matrix(t(x * w) %*% x))
# t2 <- res_calib(y,x,w,inv)
# identical(t,t2)
# microbenchmark(res_cal(y,x,w),res_cal(y,x,w,inv),times = 10)
#' Variance approximation with Deville-Tillé (2005) formula
#'
#' @aliases varDT var_srs
#'
#' @description \code{varDT} estimates the variance of the estimator of a total
#' in the case of a balanced sampling design with equal or unequal probabilities
#' using Deville-Tillé (2005) formula. Without balancing variables, it falls back
#' to Deville's (1993) classical approximation. Without balancing variables and
#' with equal probabilities, it falls back to the classical Horvitz-Thompson
#' variance estimator for the total in the case of simple random sampling.
#' Stratification is natively supported.
#'
#' \code{var_srs} is a convenience wrapper for the (stratified) simple random
#' sampling case.
#'
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#' is to be estimated.
#' @param pik A numerical vector of first-order inclusion probabilities.
#' @param x An optional (sparse) numerical matrix of balancing variable(s).
#' @param strata An optional categorical vector (factor or character) when
#' variance estimation is to be conducted within strata.
#' @param w An optional numerical vector of row weights (see Details).
#' @param precalc A list of pre-calculated results (see Details).
#' @param id A vector of identifiers of the units used in the calculation.
#' Useful when \code{precalc = TRUE} in order to assess whether the ordering of the
#' \code{y} data matrix matches the one used at the pre-calculation step.
#'
#' @details \code{varDT} aims at being the workhorse of most variance estimation conducted
#' with the \code{gustave} package. It may be used to estimate the variance
#' of the estimator of a total in the case of (stratified) simple random sampling,
#' (stratified) unequal probability sampling and (stratified) balanced sampling.
#' The native integration of stratification based on Matrix::TsparseMatrix allows
#' for significant performance gains compared to higher level vectorizations
#' (\code{*apply} especially).
#'
#' Several time-consuming operations (e.g. collinearity-check, matrix
#' inversion) can be pre-calculated in order to speed up the estimation at
#' execution time. This is determined by the value of the parameters \code{y}
#' and \code{precalc}: \itemize{ \item if \code{y} not \code{NULL} and
#' \code{precalc} \code{NULL} : on-the-fly calculation (no pre-calculation).
#' \item if \code{y} \code{NULL} and \code{precalc} \code{NULL} :
#' pre-calculation whose results are stored in a list of pre-calculated data.
#' \item if \code{y} not \code{NULL} and \code{precalc} not \code{NULL} :
#' calculation using the list of pre-calculated data. }
#'
#' \code{w} is a row weight used at the final summation step. It is useful
#' when \code{varDT} or \code{var_srs} are used on the second stage of a
#' two-stage sampling design applying the Rao (1975) formula.
#'
#' @section Difference with \code{varest} from package \code{sampling}:
#'
#' \code{varDT} differs from \code{sampling::varest} in several ways:
#' \itemize{ \item The formula implemented in \code{varDT} is more general and
#' encompasses balanced sampling. \item Even in its reduced
#' form (without balancing variables), the formula implemented in \code{varDT}
#' slightly differs from the one implemented in \code{sampling::varest}.
#' Caron (1998, pp. 178-179) compares the two estimators
#' (\code{sampling::varest} implements V_2, \code{varDT} implements V_1).
#' \item \code{varDT} introduces several optimizations: \itemize{ \item
#' matrixwise operations allow to estimate variance on several interest
#' variables at once \item Matrix::TsparseMatrix capability and the native
#' integration of stratification yield significant performance gains. \item
#' the ability to pre-calculate some time-consuming operations speeds up the
#' estimation at execution time. } \item \code{varDT} does not natively
#' implements the calibration estimator (i.e. the sampling variance estimator
#' that takes into account the effect of calibration). In the context of the
#' \code{gustave} package, \code{\link{res_cal}} should be called before
#' \code{varDT} in order to achieve the same result.}
#'
#' @seealso \code{\link{res_cal}}
#'
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) :
#' the estimated variances as a numerical vector of size the number of
#' columns of y. \item if \code{y} is \code{NULL} (pre-calculation step) : a list
#' containing pre-calculated data.}
#'
#' @author Martin Chevalier
#'
#' @references Caron N. (1998), "Le logiciel Poulpe : aspects méthodologiques", \emph{Actes
#' des Journées de méthodologie statistique} \url{http://jms-insee.fr/jms1998s03_1/}
#' Deville, J.-C. (1993), \emph{Estimation de la variance pour les enquêtes en
#' deux phases}, Manuscript, INSEE, Paris.
#'
#' Deville, J.-C., Tillé, Y. (2005), "Variance approximation under balanced
#' sampling", \emph{Journal of Statistical Planning and Inference}, 128, issue
#' 2 569-591
#'
#' Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs",
#' \emph{Sankhya}, C n°37
#'
#' @examples library(sampling)
#' set.seed(1)
#'
#' # Simple random sampling case
#' N <- 1000
#' n <- 100
#' y <- rnorm(N)[as.logical(srswor(n, N))]
#' pik <- rep(n/N, n)
#' varDT(y, pik)
#' sampling::varest(y, pik = pik)
#' N^2 * (1 - n/N) * var(y) / n
#'
#' # Unequal probability sampling case
#' N <- 1000
#' n <- 100
#' pik <- runif(N)
#' s <- as.logical(UPsystematic(pik))
#' y <- rnorm(N)[s]
#' pik <- pik[s]
#' varDT(y, pik)
#' varest(y, pik = pik)
#' # The small difference is expected (see Details).
#'
#' # Balanced sampling case
#' N <- 1000
#' n <- 100
#' pik <- runif(N)
#' x <- matrix(rnorm(N*3), ncol = 3)
#' s <- as.logical(samplecube(x, pik))
#' y <- rnorm(N)[s]
#' pik <- pik[s]
#' x <- x[s, ]
#' varDT(y, pik, x)
#'
#' # Balanced sampling case (variable of interest
#' # among the balancing variables)
#' N <- 1000
#' n <- 100
#' pik <- runif(N)
#' y <- rnorm(N)
#' x <- cbind(matrix(rnorm(N*3), ncol = 3), y)
#' s <- as.logical(samplecube(x, pik))
#' y <- y[s]
#' pik <- pik[s]
#' x <- x[s, ]
#' varDT(y, pik, x)
#' # As expected, the total of the variable of interest is perfectly estimated.
#'
#' # strata argument
#' n <- 100
#' H <- 2
#' pik <- runif(n)
#' y <- rnorm(n)
#' strata <- letters[sample.int(H, n, replace = TRUE)]
#' all.equal(
#' varDT(y, pik, strata = strata),
#' varDT(y[strata == "a"], pik[strata == "a"]) + varDT(y[strata == "b"], pik[strata == "b"])
#' )
#'
#' # precalc argument
#' n <- 1000
#' H <- 50
#' pik <- runif(n)
#' y <- rnorm(n)
#' strata <- sample.int(H, n, replace = TRUE)
#' precalc <- varDT(y = NULL, pik, strata = strata)
#' identical(
#' varDT(y, precalc = precalc),
#' varDT(y, pik, strata = strata)
#' )
#'
#' @export
varDT <- function(y = NULL, pik, x = NULL, strata = NULL, w = NULL, precalc = NULL, id = NULL){
if(is.null(precalc)){
if(any(pik <= 0 | pik > 1)) stop("All pik must be in ]0;1]")
exh <- pik == 1
if(any(exh)) note(
sum(exh), " units are exhaustive (pik = 1). They are discarded from the variance estimation process."
)
pik <- pik[!exh]
x <- if(!is.null(x)) coerce_to_TsparseMatrix(x)[!exh, , drop = FALSE] else pik
# Stratification
if(!is.null(strata)){
strata <- droplevels(as.factor(strata[!exh]))
if(any(tapply(strata, strata, length) == 1, na.rm = TRUE))
stop("Some strata contain less than 2 samples units.")
x <- detect_block(x, strata) %||% make_block(x, strata)
colby <- attr(x, "colby")
rowby <- attr(x, "rowby")
}else{
colby <- rep("1", NCOL(x))
rowby <- rep("1", NROW(x))
}
# Determine A, ck and inv terms while removing colinear variables
n <- as.vector(tapply(rowby, rowby, length)[rowby])
A <- t(x) %*% Diagonal(x = 1 / pik)
while(TRUE){
p <- as.vector(tapply(colby, colby, length)[rowby])
ck <- (1 - pik) * n / pmax(n - p, 1)
u <- tcrossprod(A %*% Matrix::Diagonal(x = ck), A)
if(Matrix::rankMatrix(u, method = "qr") != NROW(u)){
# TODO: See if tol = 1e-12 in rankMatrix does not solve some issues
is_colinear <- as.vector(is.na(stats::lm.fit(x = as.matrix(u), y = rep(1, NROW(u)))$coef))
if(any(is_colinear)) note("Some variables in x were discarded due to collinearity.")
A <- A[!is_colinear, , drop = FALSE]
colby <- colby[!is_colinear]
}else break
}
inv <- solve(u)
}else list2env(precalc, envir = environment())
if(is.null(y)){
# Diagonal term of the variance estimator
diago <- ck * (1 - colSums(A * (inv %*% A)) * ck)/pik^2
names(diago) <- names(pik)
if(!is.null(w)) stop("w is not to be included in the precalculated data.")
return(list(id = id, pik = pik, exh = exh, A = A, ck = ck, inv = inv, diago = diago))
}else{
y <- coerce_to_TsparseMatrix(y)
if(!is.null(precalc) && !is.null(id) && !is.null(rownames(y)) && !identical(as.character(id), rownames(y))) stop(
"The names of the data matrix (y argument) do not match the reference id (id argument)."
)
if(is.null(w)) w <- rep(1, length(pik))
z <- Diagonal(x = 1 / pik) %*% y[!exh, , drop = FALSE]
zhat <- t(A) %*% inv %*% (A %*% Matrix::Diagonal(x = ck) %*% z)
return(Matrix::colSums(ck * w * (z - zhat)^2))
}
}
#' @rdname varDT
#' @export
var_srs <- function(y, pik, strata = NULL, w = NULL, precalc = NULL){
if(is.null(precalc) && !is.null(strata) && any(tapply(pik, strata, stats::sd) > 1e-6, na.rm = TRUE))
stop("First-order inclusion probabilities are not equal (within strata if any).")
varDT(y = y, pik = pik, x = NULL, strata = strata, w = w, precalc = precalc)
}
#' Variance estimator for a Poisson sampling design
#'
#' @description \code{var_pois} estimates the variance of the estimator
#' of a total for a Poisson sampling design.
#'
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#' is to be estimated.
#' @param pik A numerical vector of first-order inclusion probabilities.
#' @param w An optional numerical vector of row weights (see Details).
#'
#' @details \code{w} is a row weight used at the final summation step. It is useful
#' when \code{var_pois} is used on the second stage of a two-stage sampling
#' design applying the Rao (1975) formula.
#'
#' @return The estimated variances as a numerical vector of size the number of
#' columns of \code{y}.
#'
#' @author Martin Chevalier
#'
#' @references Rao, J.N.K (1975), "Unbiased variance estimation for multistage designs",
#' \emph{Sankhya}, C n°37
#' @export
var_pois <- function(y, pik, w = rep(1, length(pik))){
colSums(w * (1 - pik) * (y / pik)^2)
}
#' Sen-Yates-Grundy variance estimator
#'
#' @description \code{varSYG} computes the Sen-Yates-Grundy
#' variance estimator.
#'
#' @param y A (sparse) numerical matrix of the variable(s) whose variance of their total
#' is to be estimated.
#' @param pikl A numerical matrix of second-order inclusion probabilities.
#' @param precalc A list of pre-calculated results (analogous to the one used by
#' \code{\link{varDT}}).
#'
#' @details \code{varSYG} aims at being an efficient implementation of the
#' Sen-Yates-Grundy variance estimator for sampling designs with fixed sample
#' size. It should be especially useful when several variance estimations are
#' to be conducted, as it relies on (sparse) matrix linear algebra.
#'
#' Moreover, in order to be consistent with \code{\link{varDT}}, \code{varSYG}
#' has a \code{precalc} argument allowing for the re-use of intermediary
#' results calculated once and for all in a pre-calculation step (see
#' \code{\link{varDT}} for details).
#'
#' @section Difference with \code{varHT} from package \code{sampling}:
#'
#' \code{varSYG} differs from \code{sampling::varHT} in several ways:
#' \itemize{ \item The formula implemented in \code{varSYG} is solely
#' the Sen-Yates-Grundy estimator, which is the one calculated
#' by \code{varHT} when method = 2.
#' \item \code{varSYG} introduces several optimizations: \itemize{ \item
#' matrixwise operations allow to estimate variance on several interest
#' variables at once \item Matrix::TsparseMatrix capability yields significant
#' performance gains.}}
#'
#' @return \itemize{ \item if \code{y} is not \code{NULL} (calculation step) : a
#' numerical vector of size the number of columns of y. \item if \code{y} is
#' \code{NULL} (pre-calculation step) : a list containing pre-calculated data
#' (analogous to the one used by \code{\link{varDT}}).}
#'
#' @author Martin Chevalier
#'
#' @examples library(sampling)
#' set.seed(1)
#'
#' # Simple random sampling case
#' N <- 1000
#' n <- 100
#' y <- rnorm(N)[as.logical(srswor(n, N))]
#' pikl <- matrix(rep((n*(n-1))/(N*(N-1)), n*n), nrow = n)
#' diag(pikl) <- rep(n/N, n)
#' varSYG(y, pikl)
#' sampling::varHT(y = y, pikl = pikl, method = 2)
#' @export
varSYG <- function (y = NULL, pikl, precalc = NULL){
if(is.null(precalc)){
pik = diag(pikl)
delta <- 1 - pik %*% t(pik)/pikl
}else list2env(precalc, envir = environment())
if(is.null(y)){
diago <- -(1/pik^2) * rowSums(delta - diag(x = diag(delta)))
names(diago) <- row.names(pikl)
return(list(pikl = pikl, pik = pik, delta = delta, diago = diago))
}else{
var <- colSums((y/pik) * (delta %*% (y/pik)) - delta %*% (y/pik)^2)
return(var)
}
}
# TODO: Add a varHT() estimator
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.