jk_wtd_mean_se: Post-stratification jacknife estimate of SE of weighted mean
In gmonette/WWCa: Tools for poststratification

Description Usage Arguments See Also Examples

Jacknife estimates of the weighted mean with a stratified sample. Weights are adjusted to take the change in n into account when obtaining the jacknife estimate dropping an observation within a stratum.

1 2	jk_wtd_mean_se(x, by = rep(1, length(x)), w = rep(1, length(x)), FUN = wtd_mean, na.rm = FALSE)

`x`	numerical vector
`by`	stratification variable or list of stratification variable. Default: single level for unstratified samples
`w`	vector of weights, e.g. Horvitz-Thomson weights or renormed version. Default: 1
`FUN`	(default: wtd_mean) function used to obtain estimate

wtd_mean for weighted means, link{lin_comb} for linear combinations, jk_wtd_mean_se for a jacknife estimate of the SE of a weighted mean and jk_lin_comb_se for a jacknife estimate of the SE of a linear combination.

Other Post-stratification survey functions: jk_wtd_means, lin_comb, wtd_mean

## Not run: 
#
# This is a made-up survey in 3 states along the eastern edge of the Rockies.
#
ds <- read.table(header = TRUE, text = "
                 Gender   State    Income
                 Male     Utah        10
                 Male     Utah        11
                 Male     Utah        12
                 Male     Utah        13
                 Male     Utah        14
                 Male     Utah        15
                 Female   Utah        16
                 Female   Utah        17
                 Female   Utah        18
                 Female   Utah        19
                 Male     Idaho       20
                 Male     Idaho       21
                 Male     Idaho       22
                 Female   Idaho       23
                 Female   Idaho       24
                 Female   Idaho       25
                 Female   Idaho       26
                 Female   Idaho       27
                 Female   Idaho       28
                 Female   Idaho       29
                 Male     Arizona     30
                 Male     Arizona     31
                 Male     Arizona     32
                 Male     Arizona     33
                 Male     Arizona     34
                 Male     Arizona     35
                 Male     Arizona     36
                 Female   Arizona     37
                 Female   Arizona     38
                 Female   Arizona     39
                 ")
#
# and the population within each stratum (imaginary data):
# 
dpop <- read.table(header = TRUE, text = "
                   Gender   State          N
                   Male     Utah     1500000
                   Female   Utah     1500000
                   Male     Idaho     700000
                   Female   Idaho     900000
                   Male     Arizona  4000000
                   Female   Arizona  3000000
                   ")
# 
# First we create the 'overall' $n$ and $N$ variables.
# 
ds$n <- with(ds, capply(State, list(State, Gender), length))  
# 
# Note that first argument of 'capply', 'State', could be any variable in the dataset
# since it is used only for the length of the chunks within each 'State' by 'Gender' stratum.
# 
ds <- merge(ds, dpop)
head(ds)
#
# ### Overall Horvitz-Thompson weights
# 
ds$w_o_ht <- with(ds, N/n)
head(ds)
sum(ds$w_o_ht)

with(ds, wtd_mean(Income, w_o_ht))
with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_ht))
with(ds, jk_wtd_mean_se(Income))
sd(ds$Income)/sqrt(length(ds$Income))
#
# ### Weighted mean within Utah
#
with(subset(ds, State == "Utah"), wtd_mean(Income, w_o_ht))
with(subset(ds, State == "Utah"), jk_wtd_mean_se(Income, Gender, w_o_ht))
# alternatively:
with(ds, wtd_mean(Income, w_o_ht * (State == "Utah")))
with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (State == "Utah")))
#
# ### Weighted mean within Utah and Idaho
#
with(subset(ds, State %in% c("Utah","Idaho")), wtd_mean(Income, w_o_ht))
with(subset(ds, State %in% c("Utah","Idaho")), jk_wtd_mean_se(Income, list(Gender, State), w_o_ht))
# alternatively:
with(ds, wtd_mean(Income, w_o_ht * (State %in% c("Utah","Idaho"))))
with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (State %in% c("Utah","Idaho"))))
#
# ### Weighted female income
#
with(subset(ds, Gender == "Female"), wtd_mean(Income, w_o_ht))
with(subset(ds, Gender == "Female"), jk_wtd_mean_se(Income, list(Gender, State), w_o_ht))
# alternatively:
with(ds, wtd_mean(Income, w_o_ht * (Gender == "Female")))
with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (Gender == "Female")))
#
# We would get the same results with weights normed differently.
# 
# ### 'Sum to $n$' weights
# 
ds$w_o_sn <- with(ds, (N/n)/mean(N/n))
sum(ds$w_o_sn)

with(ds, wtd_mean(Income, w_o_sn))
with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_sn))
# 
# ### 'Sum to 1' weights:
# 
ds$w_o_s1 <- with(ds, (N/n)/sum(N/n))
sum(ds$w_o_s1)

with(ds, wtd_mean(Income, w_o_s1))
with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_s1))

#### Gender gap in mean salary in Utah

# With linear combinations, we can estimate population parameters that can't be
# estimated as weighted means. For example, the Gender gap in Utah:

coeffs <- with(ds, 
               std((State == "Utah") * (Gender == "Male") * w_o_ht)
               - std((State == "Utah") * (Gender == "Female") * w_o_ht) )
names(coeffs) <- with(ds, interaction(State,Gender))
cbind(coeffs)
with(ds, lin_comb(Income, coeffs))
with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#'
#' #### Comparing two states
#' 
#' To compare Utah with Idaho using the population proportions of Gender:
#' 
coeffs <- with(ds, 
               std((State == "Utah") * w_o_ht)
               - std((State == "Idaho") *  w_o_ht) )
coeffs
names(coeffs) <- with(ds, interaction(State,Gender))
cbind(coeffs)
with(ds, lin_comb(Income, coeffs))
with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#'
#' #### Unweighted average of two states
#' 
#' The unweighted average of Utah and Idaho using the Gender proportion in each state:
#' 
coeffs <- with(ds, 
               .5 * std((State == "Utah") * w_o_ht)
               + .5 * std((State == "Idaho") *  w_o_ht) )
coeffs
with(ds, lin_comb(Income, coeffs))
with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))        
#'
#' #### The Gender gap within each state
#' 
states <- c("Idaho", "Utah", "Arizona")
names(states) <- states
coefs_gap <- lapply( states,
                     function(state)
                       with(ds, std((State == state) * (Gender == "Male") * w_o_ht) -
                              std((State == state) * (Gender == "Female") * w_o_ht) ))
mat <- do.call(cbind,coefs_gap)
rownames(mat) <- with(ds, interaction(State,Gender))
MASS::fractions(mat)
with(ds, lapply(coefs_gap, function(w) lin_comb(Income,w)))
with(ds, lapply(coefs_gap, function(w) jk_lin_comb_se(Income,list(Gender, State), w)))
#' 
#' ## Hierarchical weights
#' 
#' The following illustrates how to create within-state weights and then how to modify them to form overall 
#' weights.
#' 
#' We can start with the convenient fact that Horvitz-Thompson weights serve as both within-state and
#' overall weights.  To get within-state 'sum to $n$' weights, we need to norm the Horvitz-Thompson weights
#' within each state. We need to divide the Horvitz-Thompson weights by their within-state means:
#' 
ds$ht_ws_mean <- with(ds, capply(w_o_ht, State, mean))
ds$w_ws_sn <- with(ds, w_o_ht / ht_ws_mean)   # within-state 'sum to n' mean

with(ds, tapply(w_ws_sn, State, sum)) # check that they sum to within-state n
tab(ds, ~ State)
#'
#' As noted earlier, these weights need to be multiplied by $\frac{N_s/N}{n_s/n}$ to transform them into
#' overall 'sum to $n$' weights. The easiest way to generate $N_s$ is to use the Horvitz-Thomson weights:
#' 
ds$N_s <- with(ds, capply(w_o_ht, State, sum))
ds$n_s <- with(ds, capply(w_o_ht, State, length))
N_total <- sum(ds$w_o_ht)
n_total <- nrow(ds)

ds$w_o_sn2 <- with(ds, w_ws_sn * (N_s / N_total) / (n_s/n_total))
ds
#' Comparing these weighs with previous ones:
with(ds, max(abs(w_o_sn2 - w_o_sn)))
#'
#' Estimating means per stratum
#'
ds$mean_sg <- capply(ds, ds[c('State','Gender')], with, wtd_mean(Income, w_o_ht))
ds$mean_sg_se <- capply(ds, ds[c('State','Gender')], with, jk_wtd_mean_se(Income, w = w_o_ht))
ds$mean_s <- capply(ds, ds$State, with, wtd_mean(Income, w_o_ht))
ds$mean_s_se <- capply(ds, ds$State, with, jk_wtd_mean_se(Income, Gender, w_o_ht))
ds$mean_g <- capply(ds, ds$Gender, with, wtd_mean(Income, w_o_ht))
ds$mean_g_se <- capply(ds, ds$Gender, with, jk_wtd_mean_se(Income, State, w_o_ht))
# Define a round method for data frames so round will work on a data frame with non-numeric variables
round.data.frame <- function(x, digits = 0) as.data.frame(lapply(x, function(x) if(is.numeric(x)) round(x, digits = digits) else x))  
round(ds, 3)
dssum <- up(ds, ~ State/Gender) # keep all State x Gender invariant variables
round(dssum, 3)

## End(Not run)