R/approximated_variance_estimators.R
In UPSvarApprox: Approximate the Variance of the Horvitz-Thompson Total Estimator

Documented in approx_var_est

#' Approximated Variance Estimators
#'
#' Approximated variance estimators which use only first-order inclusion probabilities
#'
#' @param y numeric vector of sample observations
#' @param pik numeric vector of first-order inclusion probabilities of length N,
#'     the population size,  or n, the sample size
#' depending on the chosen method (see Details for more information)
#' @param method string indicating the desired approximate variance estimator.
#' One of "Deville1", "Deville2", "Deville3", "Hajek", "Rosen", "FixedPoint",
#' "Brewer1", "HartleyRao", "Berger", "Tille", "MateiTille1", "MateiTille2",
#' "MateiTille3", "MateiTille4", "MateiTille5", "Brewer2", "Brewer3", "Brewer4".
#' @param sample Either a numeric vector of length equal to the sample size with
#' the indices of sample units, or a boolean vector of the same length of \code{pik}, indicating which
#' units belong to the sample (\code{TRUE} if the unit is in the sample,
#' \code{FALSE} otherwise.
#' Only used with estimators of the third class (see Details for more information).
#' @param ... two optional parameters can be modified to control the iterative
#' procedures in methods \code{"MateiTille5"}, \code{"Tille"} and
#' \code{"FixedPoint"}: \code{maxIter} sets the maximum number
#' of iterations to perform and \code{eps} controls the convergence error
#'
#' @details
#' The choice of the estimator to be used is made through the argument \code{method},
#' the list of methods and their respective equations is presented below.
#'
#' Matei and Tillé (2005) divides the approximated variance estimators into
#' three classes, depending on the quantities they require:
#'
#' \enumerate{
#' \item First and second-order inclusion probabilities:
#' The first class is composed of the Horvitz-Thompson estimator (Horvitz and Thompson 1952)
#' and the Sen-Yates-Grundy estimator (Yates and Grundy 1953; Sen 1953),
#' which are available through function \code{varHT} in package \code{sampling};
#'
#' \item Only first-order inclusion probabilities and only for sample units;
#'
#' \item Only first-order inclusion probabilities, for the entire population.
#' }
#'
#'
#' Haziza, Mecatti and Rao (2008) provide a common form to express most of the
#' estimators in class 2 and 3:
#'
#' \deqn{\widehat{var}(\hat{t}_{HT}) = \sum_{i \in s}c_i e_i^2 }{ var = \sum_s c*e^2}
#'
#' where \eqn{ e_i = \frac{y_i}{\pi_i} - \hat{B} }{ e =  y/\pi - B}, with
#'
#' \deqn{ \hat{B} = \frac{\sum_{i\in s} a_i (y_i/\pi_i) }{\sum_{i\in s} a_i} }{
#' \sum a*(y/\pi) / \sum (a)}
#'
#' and \eqn{a_i}{a} and \eqn{c_i}{c} are parameters that define the different
#' estimators:
#'
#' \itemize{
#' \item \code{method="Hajek"} [Class 2]
#'     \deqn{c_i = \frac{n}{n-1}(1-\pi_i) ; \quad a_i= c_i}{c = (n/(n-1)) (1-\pi);   a = c}
#'
#' \item \code{method="Deville2"} [Class 2]
#'     \deqn{c_i = (1-\pi_i)\Biggl\{ 1 - \sum_{j\in s}\Bigl[ \frac{1-\pi_j}{\sum_{k\in s} (1-\pi_k)} \Bigr]^2 \Biggr\}^{-1} ; \quad a_i= c_i}{
#'     c = (1-\pi) [1 - \sum ( (1-pi)/\sum (1-pi))^2)]^(-1); a=c }
#'
#' \item \code{method="Deville3"} [Class 2]
#'     \deqn{c_i = (1-\pi_i)\Biggl\{ 1 - \sum_{j\in s}\Bigl[ \frac{1-\pi_j}{\sum_{k\in s} (1-\pi_k)} \Bigr]^2 \Biggr\}^{-1}; \quad a_i= 1 }{
#'     c = (1-\pi)[1 - \sum ( (1-pi)/\sum (1-pi))^2)]^(-1); a=1 }
#'
#' \item \code{method="Rosen"} [Class 2]
#'     \deqn{c_i = \frac{n}{n-1} (1-\pi_i); \quad a_i= (1-\pi_i)log(1-\pi_i) / \pi_i}{ c = (n/(n-1)) (1-\pi); a= (1-\pi)log(1-\pi) / \pi }
#'
#' \item \code{method="Brewer1"} [Class 2]
#'     \deqn{c_i = \frac{n}{n-1}(1-\pi_i); \quad a_i= 1}{ c = (n/(n-1)) (1-\pi), a=1}
#'
#' \item \code{method="Brewer2"} [Class 3]
#'     \deqn{c_i = \frac{n}{n-1} \Bigl(1-\pi_i+ \frac{\pi_i}{n} - n^{-2}\sum_{j \in U} \pi_j^2 \Bigr) ; \quad a_i=1}{
#'     c = (n/(n-1)) (1-\pi+\pi/n - \sum_U \pi/(n^2)); a=1}
#'
#' \item \code{method="Brewer3"} [Class 3]
#'     \deqn{c_i = \frac{n}{n-1} \Bigl(1-\pi_i - \frac{\pi_i}{n} - n^{-2}\sum_{j \in U} \pi_j^2 \Bigr); \quad a_i = 1}{
#'     c = (n/(n-1)) (1-\pi - \pi/n - \sum_U \pi/(n^2)); a=1}
#'
#' \item \code{method="Brewer4"} [Class 3]
#'     \deqn{c_i = \frac{n}{n-1} \Bigl(1-\pi_i - \frac{\pi_i}{n-1} + n^{-1}(n-1)^{-1}\sum_{j \in U} \pi_j^2 \Bigr); \quad a_i=1}{
#'     c= (n/(n-1)) (1-\pi - \pi/(n-1) + \sum_U \pi^2 / (n*(n-1))); a=1}
#'
#' \item \code{method="Berger"} [Class 3]
#'     \deqn{c_i = \frac{n}{n-1} (1-\pi_i) \Biggl[ \frac{\sum_{j\in s} (1-\pi_j)}{\sum_{j\in U} (1-\pi_j)} \Biggr] ; \quad a_i=c_i}{
#'     c = (n/(n-1)) (1-\pi) (\sum_s (1-\pi) / \sum_U (1-pi)); a=c }
#'
#' \item \code{method="HartleyRao"} [Class 3]
#'     \deqn{c_i = \frac{n}{n-1} \Bigl(1-\pi_i - n^{-1}\sum_{j \in s}\pi_i + n^{-1}\sum_{j\in U} \pi_j^2 \Bigr) ; \quad a_i=1}{
#'     c= (n/(n-1)) (1-\pi - \sum_s \pi/n + \sum_U \pi^2 / n); a=1 }
#'}
#'
#'
#' Some additional estimators are defined in Matei and Tillé (2005):
#'
#' \itemize{
#'     \item \code{method="Deville1"} [Class 2]
#'     \deqn{\widehat{var}(\hat{t}_{HT}) = \sum_{i \in s} \frac{c_i}{ \pi_i^2} (y_i - y_i^*)^2 }{
#'     \sum (c/\pi^2) (y-y*)^2 }
#'
#'     where
#'     \deqn{ y_i^* = \pi_i \frac{\sum_{j \in s} c_j y_j / \pi_j}{\sum_{j \in s} c_j} }{
#'     y* = \pi (\sum c*y/\pi) / (\sum c)}
#'     and \eqn{c_i = (1-\pi_i)\frac{n}{n-1} }{c = (n/(n-1)) (1-\pi)}
#'
#'
#'  \item \code{method="Tille"} [Class 3]
#'      \deqn{
#'      \widehat{var}(\hat{t}_{HT}) = \biggl( \sum_{i \in s} \omega_i \biggr)
#'      \sum_{i\in s} \omega_i (\tilde{y}_i - \bar{\tilde{y}}_\omega )^2
#'      - n \sum_{i\in s}\biggl( \tilde{y}_i - \frac{\hat{t}_{HT}}{n} \biggr)^2
#'      }{
#'      var = ( \sum \omega ) \sum \omega( y* - \gamma )^2 - n\sum (y* - \sum(y/\pi)/n )^2}
#'
#'      where  \eqn{\tilde{y}_i = y_i / \pi_i }{ y* = y/\pi},
#'      \eqn{\omega_i = \pi_i / \beta_i}{ \omega = \pi/\beta}
#'      and \eqn{\bar{\tilde{y}}_\omega =
#'                   \biggl( \sum_{i \in s} \omega_i \biggr)^{-1} \sum_{i \in s} \omega_i \tilde{y}_i }{
#'               \gamma = \sum(\omega y*)/\sum(\omega) }
#'
#'    The coefficients \eqn{\beta_i}{\beta} are computed iteratively through the
#'    following procedure:
#'     \enumerate{
#'         \item \eqn{\beta_i^{(0)} = \pi_i, \,\, \forall i\in U}{\beta(0) = \pi, i = 1, ..., N}
#'         \item \eqn{ \beta_i^{(2k-1)} = \frac{(n-1)\pi_i}{\beta^{(2k-2)} - \beta_i^{(2k-2)}}  }{
#'                     \beta(2k-1) = ( (n-1)\pi )/(\sum\beta(2k-2) - \beta(2k-2)) }
#'         \item \eqn{\beta_i^{2k} = \beta_i^{(2k-1)}
#'         \Biggl( \frac{n(n-1)}{(\beta^(2k-1))^2 - \sum_{i\in U} (\beta_k^{(2k-1)})^2 } \Biggr)^{(1/2)} }{
#'         \beta(2k) = \beta(2k-1) ( n(n-1) / ( (\sum\beta(2k-1))^2 - \sum( \beta(2k-1)^2 ) ) )^(1/2) }
#'     }
#'     with \eqn{\beta^{(k)} = \sum_{i\in U} \beta_i^{i}, \,\, k=1,2,3, \dots }
#'
#'  \item \code{method="MateiTille1"} [Class 3]
#'      \deqn{\widehat{var}(\hat{t}_{HT}) = \frac{n(N-1)}{N(n-1)} \sum_{i\in s} \frac{b_i}{\pi_i^3} (y_i - \hat{y}_i^*)^2 }{
#'            var = n(N-1)/(N(n-1)) \sum (b/\pi^3) (y-y*)^2 }
#'      where \deqn{\hat{y}_i^* = \pi_i \frac{\sum_{i\in s} b_i y_i/\pi_i^2}{\sum_{i\in s} b_i/\pi_i} }{
#'      y* = \pi (\sum b*y/(\pi^2)) / (\sum b/\pi )}
#'
#'      and the coefficients \eqn{b_i}{b} are computed iteratively by the algorithm:
#'      \enumerate{
#'          \item \deqn{b_i^{(0)} = \pi_i (1-\pi_i) \frac{N}{N-1}, \,\, \forall i \in U }{ b0 = \pi(1-\pi)(N/(N-1))}
#'          \item \deqn{ b_i^{(k)} = \frac{(b_i^{(k-1)})^2 }{\sum_{j\in U} b_j^{(k-1)} } + \pi_i(1-\pi_i) }{
#'          b(k) = [ b(i-1) ]^2 / [ \sum b(i-1) ] + \pi(1-\pi) }
#'       }
#'       a necessary condition for convergence is checked and, if not satisfied,
#'       the function returns an alternative solution that uses only one iteration:
#'       \deqn{b_i = \pi_i(1-\pi_i)\Biggl( \frac{N\pi_i(1-\pi_i)}{ (N-1)\sum_{j\in U}\pi_j(1-\pi_j) } + 1 \Biggr) }{
#'       b = \pi(1-\pi)( 1 + (N\pi(1-\pi)) / ( (N-1) \sum \pi(1-\pi) ) ) }
#'
#'  \item \code{method="MateiTille2"} [Class 3]
#'      \deqn{ \widehat{var}(\hat{t}_{HT}) = \frac{1}{1 - \sum_{i\in U} \frac{d_i^2}{\pi_i} }
#'      \sum_{i\in s} (1-\pi_i) \Biggl( \frac{y_i}{\pi_i} - \frac{\hat{t}_{HT}}{n} \Biggr)^2 }{
#'      var = ( \sum (1-\pi)( y/\pi - \sum(y/\pi)/n)^2  ) / (1-\sum (d^2/\pi)) }
#'
#'      where
#'      \deqn{ d_i = \frac{\pi_i(1-\pi_i)}{\sum_{j\in U} \pi_j(1-\pi_j) } }{
#'      d = (\pi(1-\pi)) / (\sum \pi(1-\pi)) }
#'
#'  \item \code{method="MateiTille3"} [Class 3]
#'      \deqn{ \widehat{var}(\hat{t}_{HT}) = \frac{1}{1 - \sum_{i\in U} \frac{d_i^2}{\pi_i} }
#'      \sum_{i\in s} (1-\pi_i) \Biggl( \frac{y_i}{\pi_i} -
#'      \frac{ \sum_{j\in s} (1-\pi_j)\frac{y_j}{\pi_j} }{ \sum_{j\in s} (1-\pi_j)  } \Biggr)^2 }{
#'      var = ( \sum (1-\pi)( y/\pi - (\sum(1-\pi)(y/\pi))/(\sum(1-\pi))  )^2  ) / (1-\sum (d^2/\pi)) }
#'
#'      where \eqn{d_i}{d} is defined as in \code{method="MateiTille2"}.
#'
#'  \item \code{method="MateiTille4"} [Class 3]
#'      \deqn{ \widehat{var}(\hat{t}_{HT}) = \frac{1}{1 - \sum_{i\in U} b_i/n^2 }
#'      \sum_{i\in s} \frac{b_i}{\pi_i^3} (y_i - y_i^* )^2  }{
#'      var =  \sum ( (b/(\pi^3)) (y-y*)^2 )
#'      }
#'      where
#'      \deqn{  y_i^* = \pi_i \frac{ \sum_{j\in s} b_j y_j/\pi_j^2 }{  \sum_{j\in s} b_j/\pi_j } }{
#'       y* = \pi (\sum b*y/(\pi^2)) / (\sum b/\pi )}
#'      and
#'      \deqn{ b_i = \frac{ \pi_i(1-\pi_i)N }{ N-1 } }{ b = ( \pi(1-\pi)N ) / (N-1)}
#'
#'  \item \code{method="MateiTille5"} [Class 3]
#'  This estimator is defined as in \code{method="MateiTille4"}, and the \eqn{b_i}{b}
#'  values are defined as in \code{method="MateiTille1"}
#' }
#'
#'
#' @return a scalar, the estimated variance
#'
#' @examples
#'
#' ### Generate population data ---
#' N <- 500; n <- 50
#'
#' set.seed(0)
#' x <- rgamma(500, scale=10, shape=5)
#' y <- abs( 2*x + 3.7*sqrt(x) * rnorm(N) )
#'
#' pik <- n * x/sum(x)
#' s   <- sample(N, n)
#'
#' ys <- y[s]
#' piks <- pik[s]
#'
#' ### Estimators of class 2 ---
#' approx_var_est(ys, piks, method="Deville1")
#' approx_var_est(ys, piks, method="Deville2")
#' approx_var_est(ys, piks, method="Deville3")
#' approx_var_est(ys, piks, method="Hajek")
#' approx_var_est(ys, piks, method="Rosen")
#' approx_var_est(ys, piks, method="FixedPoint")
#' approx_var_est(ys, piks, method="Brewer1")
#'
#' ### Estimators of class 3 ---
#' approx_var_est(ys, pik, method="HartleyRao", sample=s)
#' approx_var_est(ys, pik, method="Berger", sample=s)
#' approx_var_est(ys, pik, method="Tille", sample=s)
#' approx_var_est(ys, pik, method="MateiTille1", sample=s)
#' approx_var_est(ys, pik, method="MateiTille2", sample=s)
#' approx_var_est(ys, pik, method="MateiTille3", sample=s)
#' approx_var_est(ys, pik, method="MateiTille4", sample=s)
#' approx_var_est(ys, pik, method="MateiTille5", sample=s)
#' approx_var_est(ys, pik, method="Brewer2", sample=s)
#' approx_var_est(ys, pik, method="Brewer3", sample=s)
#' approx_var_est(ys, pik, method="Brewer4", sample=s)
#'
#'
#' @references
#' Matei, A.; Tillé, Y., 2005. Evaluation of variance approximations and estimators
#' in maximum entropy sampling with unequal probability and fixed sample size.
#' Journal of Official Statistics 21 (4), 543-570.
#'
#' Haziza, D.; Mecatti, F.; Rao, J.N.K. 2008.
#' Evaluation of some approximate variance estimators under the Rao-Sampford
#' unequal probability sampling design. Metron LXVI (1), 91-108.
#'
#'
#' @export
#'
#'

approx_var_est <- function(y, pik, method, sample=NULL, ...){

    ### Check input ---
    method <- match.arg( method,
                         c(  #second class
                             "Deville1",
                             "Deville2",
                             "Deville3",
                             "Hajek",
                             "Rosen",
                             "FixedPoint",
                             "Brewer1",
                             #third class
                             "HartleyRao",
                             "Berger",
                             "Tille",
                             "MateiTille1",
                             "MateiTille2",
                             "MateiTille3",
                             "MateiTille4",
                             "MateiTille5",
                             "Brewer2",
                             "Brewer3",
                             "Brewer4" )
    )

    argList <- list(...)
    ifelse( is.null(argList$eps), eps <- 1e-05, eps <- argList$eps )
    ifelse( is.null(argList$maxIter),  maxIter <- 1000, maxIter <- argList$maxIter )


    ly <- length(y)
    lp <- length(pik)
    ls <- length(sample)

    if( !identical( class(pik), "numeric" ) ){
        stop( "The argument 'pik' should be a numeric vector!")
    }else if( lp < 2 ){
        stop( "The 'pik' vector is too short!" )
    }else if( any(pik<0)  | any(pik>1) ){
        stop( "Some 'pik' values are outside the interval [0, 1]")
    }else if( any(pik %in% c(NA, NaN, Inf)) ){
        stop( "The 'pik' vector contains invalid values (NA, NaN, Inf)" )
    }

    if( !(class(y) %in% c("numeric", "integer")) ){
        stop( "The argument 'y' should be a numeric vector!")
    }else if( ly < 2 ){
        stop( "The 'y' vector is too short!" )
    }else if( any(y %in% c(NA, NaN, Inf)) ){
        stop( "The 'y' vector contains invalid values (NA, NaN, Inf)" )
    }


    if( any(y<0) ){
        message( "Some 'y' values are negative, continuing anyway...")
    }


    #second class of estimators
    if( method %in% c( "Deville1", "Deville2", "Deville3", "Rosen", "Hajek",
                       "FixedPoint", "Brewer1") ){
        if( !identical( length(y), length(pik) ) )
            stop( "Method ", "'", method, "' ",
                  "requires argument 'y' and 'pik' to have the same length, ",
                  "which should be equal to the sample size!"
            )

    }

    #third class of estimators
    if( method %in% c( "HartleyRao", "Berger", "Tille", "MateiTille1",
                       "MateiTille2", "MateiTille3", "MateiTille4", "MateiTille5",
                       "Brewer2", "Brewer3", "Brewer4" ) ){

        if( !is.wholenumber( sum(pik) ) )
            stop( "The sum of pik values is not an integer!")

        if( ly >= lp )
            stop("With method ", "'", method, "'",
                 ", pik values for all population units should be provided. ",
                 "However, the length of pik is <= the length of y, ",
                 "which should be equal to the sample size. ",
                 "Please, check again your input or check the help page for more details!"
            )

        if( missing(sample) )
            stop("The argument 'sample' is required for method=", "'", method, "'")
        if( !is.numeric(sample) & !is.logical(sample) )
            stop( "The argument 'sample' should be either a numeric or logical vector")
        if( is.logical(sample) & !identical( length(pik), length(sample) ) )
            stop( "The arguments 'pik' and 'sample' should have the same length when 'sample' is a logical vector!" )
        if( is.numeric(sample) & !identical( length(y), length(sample) ) )
            stop( "The argument 'sample' should have length equal to the sample size!" )


    }



    ### Call method ---
    v <- switch(method,
                #second class
                "Deville1"    = var_Deville(y, pik, method),
                "Deville2"    = var_Deville(y, pik, method),
                "Deville3"    = var_Deville(y, pik, method),
                "Hajek"       = var_Hajek(y, pik),
                "Rosen"       = var_Rosen(y, pik),
                "FixedPoint"  = var_FixedPoint(y, pik, eps=1e-5, maxIter=1000),
                "Brewer1"     = var_Brewer_class2(y, pik),
                #third class
                "HartleyRao"  = var_HartleyRao(y, pik, sample),
                "Berger"      = var_Berger(y, pik, sample),
                "Tille"       = var_Tille(y, pik, sample, eps=1e-5, maxIter=1000),
                "MateiTille1" = var_MateiTille(y, pik, method, sample) ,
                "MateiTille2" = var_MateiTille(y, pik, method, sample),
                "MateiTille3" = var_MateiTille(y, pik, method, sample),
                "MateiTille4" = var_MateiTille(y, pik, method, sample),
                "MateiTille5" = var_MateiTille(y, pik, method, sample),
                "Brewer2"     = var_Brewer_class3(y, pik, method, sample),
                "Brewer3"     = var_Brewer_class3(y, pik, method, sample),
                "Brewer4"     = var_Brewer_class3(y, pik, method, sample)
    )

    # do.call( FUN, list(y, pik))

    ### Return result ---
    return( v )
}