R/survey.R

Defines functions jk_lin_comb_se lin_comb std jk_wtd_mean_se jk_wtd_means wtd_mean

Documented in jk_lin_comb_se jk_wtd_means jk_wtd_mean_se lin_comb std wtd_mean

#' Functions for post-stratification adjustments in surveys
#' 
#' @family post-stratification functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{link{lin_comb}} for linear
#'      combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#'      and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @name survey
NULL
## NULL
#' Weighted mean 
#' 
#' @param x numerical vector
#' @param w vector of weights replicated to have same length as \code{x}
#' @family Post-stratification survey functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{link{lin_comb}} for linear
#'      combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#'      and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @export
wtd_mean <- function(x, w) {
  w <- rep(w, length.out = length(x))
  sum(x*w) / sum(w)
}
#' Post-stratification jacknife estimates of standard error
#' 
#' Jacknife estimates of the weighted mean with a stratified sample. 
#' Weights are adjusted to take the change in n into account when obtaining the
#' jacknife estimate dropping an observation within a stratum.
#' @param x numerical vector
#' @param by stratification variable or list of stratification variable
#' @param w vector of weights, e.g. Horvitz-Thomson weights or renormed version
#' @param check (default: TRUE) check that 'w' is constant within levels of 'by'
#' @param FUN (default: wtd_mean) function used to obtain estimate
#' @family Post-stratification survey functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{link{lin_comb}} for linear
#'      combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#'      and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @export
jk_wtd_means <- function(x, by, w, check = TRUE, FUN = wtd_mean){
  # Value: vector of 'drop-one' estimates for jacknife estimator of SE
  #
  # x: response variable, a numeric vector
  # by: vector or list of vectors defining strata
  # w: numeric vector of sampling weights constant within each stratum
  # Check that weights are constant within levels of 'by'
  # library(spida2)  # devtools::install_github('gmonette/spida2')
  if(check) {
    constant <- function(z) length(unique(z)) <= 1 
    constant_by <- function(z, by) all(sapply(split(z, by), constant))
    if(!constant_by(w, by)) stop("w must constant within strata")
  }
  ns <- capply(x, by, length)
  force(w)
  by <- interaction(by)
  wtd_mean_drop_i <- function(i) {
    ns_drop <- capply(x[-i], by[-i], length)
    w_drop <- w[-i] * ns[-i] / ns_drop
    FUN(x[-i], w_drop)
  }
  sapply(seq_along(x), wtd_mean_drop_i)
}
#' Post-stratification jacknife estimate of SE of weighted mean
#' 
#' Jacknife estimates of the weighted mean with a stratified sample. 
#' Weights are adjusted to take the change in n into account when obtaining the
#' jacknife estimate dropping an observation within a stratum.
#' @param x numerical vector
#' @param by stratification variable or list of stratification variable. Default: single level for unstratified samples
#' @param w vector of weights, e.g. Horvitz-Thomson weights or renormed version. Default: 1
#' @param FUN (default: wtd_mean) function used to obtain estimate
#' @examples
#' \dontrun{
#' #
#' # This is a made-up survey in 3 states along the eastern edge of the Rockies.
#' #
#' ds <- read.table(header = TRUE, text = "
#'                  Gender   State    Income
#'                  Male     Utah        10
#'                  Male     Utah        11
#'                  Male     Utah        12
#'                  Male     Utah        13
#'                  Male     Utah        14
#'                  Male     Utah        15
#'                  Female   Utah        16
#'                  Female   Utah        17
#'                  Female   Utah        18
#'                  Female   Utah        19
#'                  Male     Idaho       20
#'                  Male     Idaho       21
#'                  Male     Idaho       22
#'                  Female   Idaho       23
#'                  Female   Idaho       24
#'                  Female   Idaho       25
#'                  Female   Idaho       26
#'                  Female   Idaho       27
#'                  Female   Idaho       28
#'                  Female   Idaho       29
#'                  Male     Arizona     30
#'                  Male     Arizona     31
#'                  Male     Arizona     32
#'                  Male     Arizona     33
#'                  Male     Arizona     34
#'                  Male     Arizona     35
#'                  Male     Arizona     36
#'                  Female   Arizona     37
#'                  Female   Arizona     38
#'                  Female   Arizona     39
#'                  ")
#' #
#' # and the population within each stratum (imaginary data):
#' # 
#' dpop <- read.table(header = TRUE, text = "
#'                    Gender   State          N
#'                    Male     Utah     1500000
#'                    Female   Utah     1500000
#'                    Male     Idaho     700000
#'                    Female   Idaho     900000
#'                    Male     Arizona  4000000
#'                    Female   Arizona  3000000
#'                    ")
#' # 
#' # First we create the 'overall' $n$ and $N$ variables.
#' # 
#' ds$n <- with(ds, capply(State, list(State, Gender), length))  
#' # 
#' # Note that first argument of 'capply', 'State', could be any variable in the dataset
#' # since it is used only for the length of the chunks within each 'State' by 'Gender' stratum.
#' # 
#' ds <- merge(ds, dpop)
#' head(ds)
#' #
#' # ### Overall Horvitz-Thompson weights
#' # 
#' ds$w_o_ht <- with(ds, N/n)
#' head(ds)
#' sum(ds$w_o_ht)
#' 
#' with(ds, wtd_mean(Income, w_o_ht))
#' with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_ht))
#' with(ds, jk_wtd_mean_se(Income))
#' sd(ds$Income)/sqrt(length(ds$Income))
#' #
#' # ### Weighted mean within Utah
#' #
#' with(subset(ds, State == "Utah"), wtd_mean(Income, w_o_ht))
#' with(subset(ds, State == "Utah"), jk_wtd_mean_se(Income, Gender, w_o_ht))
#' # alternatively:
#' with(ds, wtd_mean(Income, w_o_ht * (State == "Utah")))
#' with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (State == "Utah")))
#' #
#' # ### Weighted mean within Utah and Idaho
#' #
#' with(subset(ds, State %in% c("Utah","Idaho")), wtd_mean(Income, w_o_ht))
#' with(subset(ds, State %in% c("Utah","Idaho")), jk_wtd_mean_se(Income, list(Gender, State), w_o_ht))
#' # alternatively:
#' with(ds, wtd_mean(Income, w_o_ht * (State %in% c("Utah","Idaho"))))
#' with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (State %in% c("Utah","Idaho"))))
#' #
#' # ### Weighted female income
#' #
#' with(subset(ds, Gender == "Female"), wtd_mean(Income, w_o_ht))
#' with(subset(ds, Gender == "Female"), jk_wtd_mean_se(Income, list(Gender, State), w_o_ht))
#' # alternatively:
#' with(ds, wtd_mean(Income, w_o_ht * (Gender == "Female")))
#' with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (Gender == "Female")))
#' #
#' # We would get the same results with weights normed differently.
#' # 
#' # ### 'Sum to $n$' weights
#' # 
#' ds$w_o_sn <- with(ds, (N/n)/mean(N/n))
#' sum(ds$w_o_sn)
#' 
#' with(ds, wtd_mean(Income, w_o_sn))
#' with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_sn))
#' # 
#' # ### 'Sum to 1' weights:
#' # 
#' ds$w_o_s1 <- with(ds, (N/n)/sum(N/n))
#' sum(ds$w_o_s1)
#' 
#' with(ds, wtd_mean(Income, w_o_s1))
#' with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_s1))
#' 
#' #### Gender gap in mean salary in Utah
#' 
#' # With linear combinations, we can estimate population parameters that can't be
#' # estimated as weighted means. For example, the Gender gap in Utah:
#' 
#' coeffs <- with(ds, 
#'                std((State == "Utah") * (Gender == "Male") * w_o_ht)
#'                - std((State == "Utah") * (Gender == "Female") * w_o_ht) )
#' names(coeffs) <- with(ds, interaction(State,Gender))
#' cbind(coeffs)
#' with(ds, lin_comb(Income, coeffs))
#' with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#' #'
#' #' #### Comparing two states
#' #' 
#' #' To compare Utah with Idaho using the population proportions of Gender:
#' #' 
#' coeffs <- with(ds, 
#'                std((State == "Utah") * w_o_ht)
#'                - std((State == "Idaho") *  w_o_ht) )
#' coeffs
#' names(coeffs) <- with(ds, interaction(State,Gender))
#' cbind(coeffs)
#' with(ds, lin_comb(Income, coeffs))
#' with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#' #'
#' #' #### Unweighted average of two states
#' #' 
#' #' The unweighted average of Utah and Idaho using the Gender proportion in each state:
#' #' 
#' coeffs <- with(ds, 
#'                .5 * std((State == "Utah") * w_o_ht)
#'                + .5 * std((State == "Idaho") *  w_o_ht) )
#' coeffs
#' with(ds, lin_comb(Income, coeffs))
#' with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))        
#' #'
#' #' #### The Gender gap within each state
#' #' 
#' states <- c("Idaho", "Utah", "Arizona")
#' names(states) <- states
#' coefs_gap <- lapply( states,
#'                      function(state)
#'                        with(ds, std((State == state) * (Gender == "Male") * w_o_ht) -
#'                               std((State == state) * (Gender == "Female") * w_o_ht) ))
#' mat <- do.call(cbind,coefs_gap)
#' rownames(mat) <- with(ds, interaction(State,Gender))
#' MASS::fractions(mat)
#' with(ds, lapply(coefs_gap, function(w) lin_comb(Income,w)))
#' with(ds, lapply(coefs_gap, function(w) jk_lin_comb_se(Income,list(Gender, State), w)))
#' #' 
#' #' ## Hierarchical weights
#' #' 
#' #' The following illustrates how to create within-state weights and then how to modify them to form overall 
#' #' weights.
#' #' 
#' #' We can start with the convenient fact that Horvitz-Thompson weights serve as both within-state and
#' #' overall weights.  To get within-state 'sum to $n$' weights, we need to norm the Horvitz-Thompson weights
#' #' within each state. We need to divide the Horvitz-Thompson weights by their within-state means:
#' #' 
#' ds$ht_ws_mean <- with(ds, capply(w_o_ht, State, mean))
#' ds$w_ws_sn <- with(ds, w_o_ht / ht_ws_mean)   # within-state 'sum to n' mean
#' 
#' with(ds, tapply(w_ws_sn, State, sum)) # check that they sum to within-state n
#' tab(ds, ~ State)
#' #'
#' #' As noted earlier, these weights need to be multiplied by $\frac{N_s/N}{n_s/n}$ to transform them into
#' #' overall 'sum to $n$' weights. The easiest way to generate $N_s$ is to use the Horvitz-Thomson weights:
#' #' 
#' ds$N_s <- with(ds, capply(w_o_ht, State, sum))
#' ds$n_s <- with(ds, capply(w_o_ht, State, length))
#' N_total <- sum(ds$w_o_ht)
#' n_total <- nrow(ds)
#' 
#' ds$w_o_sn2 <- with(ds, w_ws_sn * (N_s / N_total) / (n_s/n_total))
#' ds
#' #' Comparing these weighs with previous ones:
#' with(ds, max(abs(w_o_sn2 - w_o_sn)))
#' #'
#' #' Estimating means per stratum
#' #'
#' ds$mean_sg <- capply(ds, ds[c('State','Gender')], with, wtd_mean(Income, w_o_ht))
#' ds$mean_sg_se <- capply(ds, ds[c('State','Gender')], with, jk_wtd_mean_se(Income, w = w_o_ht))
#' ds$mean_s <- capply(ds, ds$State, with, wtd_mean(Income, w_o_ht))
#' ds$mean_s_se <- capply(ds, ds$State, with, jk_wtd_mean_se(Income, Gender, w_o_ht))
#' ds$mean_g <- capply(ds, ds$Gender, with, wtd_mean(Income, w_o_ht))
#' ds$mean_g_se <- capply(ds, ds$Gender, with, jk_wtd_mean_se(Income, State, w_o_ht))
#' # Define a round method for data frames so round will work on a data frame with non-numeric variables
#' round.data.frame <- function(x, digits = 0) as.data.frame(lapply(x, function(x) if(is.numeric(x)) round(x, digits = digits) else x))  
#' round(ds, 3)
#' dssum <- up(ds, ~ State/Gender) # keep all State x Gender invariant variables
#' round(dssum, 3)
#' }
#' @family Post-stratification survey functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{link{lin_comb}} for linear
#'      combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#'      and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @export
jk_wtd_mean_se <- function(x, by = rep(1, length(x)), w = rep(1, length(x)), FUN = wtd_mean) {
  # Value: jacknife estimator of the standard error of a weighted mean
  # using post-stratification
  #
  # x: response variable, a numeric vector
  # by: vector or list of vectors defining strata
  # w: numeric vector of sampling weights constant within each stratum
  #
  library(spida2)
  theta_hat <- FUN(x, w)
  theta_hat_i <- jk_wtd_means(x, by, w, FUN = FUN)
  ns <- capply(x, by, length)
  sqrt(sum((theta_hat_i - theta_hat)^2 * (ns-1) / ns))
}
#' 
#'
#' ## Linear combinations
#' 
#' ### Functions for linear combinations
#' 
#' Standardize a vector of weights
#' 
#' @param x vector of weights
#' @param div divisor (default: sum(x))
#' @rdname survey
#' @export
std <- function(x, div = sum(x)) x/div
#' 
#' Linear combination of x
#' 
#' @param x numerical vector
#' @param w vector of weights replicated to have same length as 'x'
#' @family Post-stratification survey functions
#' @export
lin_comb <- function(x, w) {
  w <- rep(w, length.out = length(x))
  sum(x*w)
}
#' Post-stratification Jacknife estimate of SE for a linear combination
#' 
#' @inheritParams jk_wtd_mean_se
#' @rdname survey
#' @export
jk_lin_comb_se <- function(x, by, w, check = TRUE) jk_wtd_mean_se(x, by, w, FUN = lin_comb)
#' 
gmonette/spida2 documentation built on July 14, 2024, 12:45 p.m.