#' Population frequencies
#'
#' This data frame should be used only for testing
#'
#' This data frame contains the number of individuals within each county in the
#' US and Puerto Rico (PR) in clusters formed by combining categories of
#' age (in 14 categories), raceethnicity (in 5 categories) and sex (Male or Female).
#' Each county is also identified by its 5-digit county code,
#' its state (50 states using 2-letter abbreviations plus DC and PR),
#' and its country (US or PR).
#'
#' @format A data frame with 450,800 rows and 8 variables.
#' \describe{
#' \item{state.code}{first 2 digits of county code identifying state}
#' \item{sex}{Male or Female}
#' \item{age}{an ordered factor with 14 age categories}
#' \item{raceethnicity}{in 6 categories}
#' \item{county}{5-digit county code}
#' \item{state}{2-letter state abbreviation}
#' \item{country}{US or PR for Puerto Rico}
#' \item{date}{date at which population count is valid}
#' \item{population}{count of people in a particular combination (Note that this number has been imputed through some process and should be used only for testing, not for production)}
#' \item{population_neg}{count of people in a particular combination including negative values to be explained}
#' }
#' @source Datassist plus some arbitrary adjustments for population
#' @name popagetable
NULL
#' Functions for post-stratification adjustments in surveys
#'
#' # Main utilities here:
#'
#' capply(x, by, FUN, ...): applies FUN to each 'chunk' of x defined by
#' levels of 'by'. "..." are additional arguments to FUN.
#' 'by' can be a variable or a list, often a set of columns
#' in the data set in which 'x' lives. The result has the
#' same shape as 'x' and can thus be added as a variable to
#' the data frame for 'x'. If FUN returns a single value in
#' each chunk, the value is recycled to fill the elements
#' corresponding to the chunk.
#'
#' For example, in a data frame, data, with variables:
#' state, county, sex, population
#' where population is the population size within each
#' county (within state) by sex combination:
#' > data$pop.state <- with(data,
#' capply(population, list(sex,state), sum))
#' creates a variable that is equal to the total population
#' within each state x sex combination repeated, of course, for
#' each county.
#'
#' up(data, by) keeps rows corresponding to unique values of the strata
#' defined by the variable or list of variables in 'by'.
#' 'by' can also be a formula, evaluated in 'data'. For example,
#' 'by = ~ a + b', is equivalent to 'by = data[,c('a','b')]'.
#' For example, with 'data' used above:
#'
#' > data.state <- up(data, ~ state + sex)
#'
#' creates a data frame with one row per state x sex combination
#' and the variable 'pop.state' contains the total population
#' in each combination. Variables, such as 'county' and 'population'
#' that are not invariant within levels of 'by' are dropped.
#'
#' tab(data, fmla):
#'
#' creates a frequency table showing the number of rows in 'data' for
#' each stratum formed by evaluating the formula 'fmla' in 'data'.
#'
#' For example 'tab(data, ~ a + b + c)' will return an array of frequencies
#' for variables 'a', 'b', and 'c' in 'data'.
#' If the formula has a variable on the left side, that variable is
#' summed to get the entries of the table.
#'
#' For example, if 'population' contains the population in each row
#' where each row is a 'sex x state x county' combination:
#'
#' > tab(data, population ~ sex + age)
#'
#' will produce a table of total population by sex and age, and
#'
#' > tab(data, population ~ age)
#'
#' will show overall totals by age group.
#'
#' These tables can be transformed into data frames with, e.g.,
#'
#' > as.data.frame(tab(data, population ~ age))
#'
#' @family post-stratification functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{\link{lin_comb}} for linear
#' combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#' and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @name survey
NULL
## NULL
#' Weighted mean
#'
#' @param x numerical vector
#' @param w vector of weights replicated to have same length as \code{x}
#' @param na.rm if TRUE, drop units for which x is NA. Default: FALSE.
#' @family Post-stratification survey functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{\link{lin_comb}} for linear
#' combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#' and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @export
wtd_mean <- function(x, w, na.rm = FALSE) {
  # Weighted mean of x; w is recycled to the length of x.
  # When na.rm is TRUE, units with missing x are dropped (together with
  # their weights) before computing the mean.
  wts <- rep_len(w, length(x))
  if (na.rm) {
    keep <- !is.na(x)
    x <- x[keep]
    wts <- wts[keep]
  }
  sum(x * wts) / sum(wts)
}
#' Post-stratification jacknife estimates of standard error
#'
#' Jacknife estimates of the weighted mean with a stratified sample.
#' Weights are adjusted to take the change in n into account when obtaining the
#' jacknife estimate dropping an observation within a stratum.
#' @param x numerical vector
#' @param by stratification variable or list of stratification variable
#' @param w vector of weights, e.g. Horvitz-Thomson weights or renormed version
#' @param check (default: TRUE) check that 'w' is constant within levels of 'by'
#' @param FUN (default: wtd_mean) function used to obtain estimate
#' @param \dots additional arguments passed to \code{FUN}
#' @family Post-stratification survey functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{\link{lin_comb}} for linear
#' combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#' and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @export
jk_wtd_means <- function(x, by, w, check = TRUE, FUN = wtd_mean, ...){
  # Value: vector of 'drop-one' estimates for the jackknife estimator of SE
  #
  # x:   response variable, a numeric vector
  # by:  vector or list of vectors defining strata
  # w:   numeric vector of sampling weights, constant within each stratum
  # ...: additional arguments passed to FUN
  #
  # Relies on capply() (see 'spida2': devtools::install_github('gmonette/spida2'))
  if (check) {
    # Verify that the weight is constant within every stratum of 'by'
    constant <- function(z) length(unique(z)) <= 1
    constant_by <- function(z, by) all(sapply(split(z, by), constant))
    if (!constant_by(w, by)) stop("w must be constant within strata")
  }
  ns <- capply(x, by, length)  # stratum size attached to each observation
  force(w)                     # ensure 'w' is evaluated before 'by' is replaced below
  by <- interaction(by)
  # Estimate with observation i dropped; weights in the affected stratum are
  # rescaled by ns / ns_drop so each stratum's total weight is preserved.
  wtd_mean_drop_i <- function(i) {
    ns_drop <- capply(x[-i], by[-i], length)
    w_drop <- w[-i] * ns[-i] / ns_drop
    FUN(x[-i], w_drop, ...)
  }
  sapply(seq_along(x), wtd_mean_drop_i)
}
#' Post-stratification jacknife estimate of SE of weighted mean
#'
#' Jacknife estimates of the weighted mean with a stratified sample.
#' Weights are adjusted to take the change in n into account when obtaining the
#' jacknife estimate dropping an observation within a stratum.
#' @param x numerical vector
#' @param by stratification variable or list of stratification variable. Default: single level for unstratified samples
#' @param w vector of weights, e.g. Horvitz-Thomson weights or renormed version. Default: 1
#' @param FUN (default: wtd_mean) function used to obtain estimate
#' @examples
#' \dontrun{
#' #
#' # This is a made-up survey in 3 states along the eastern edge of the Rockies.
#' #
#' ds <- read.table(header = TRUE, text = "
#' Gender State Income
#' Male Utah 10
#' Male Utah 11
#' Male Utah 12
#' Male Utah 13
#' Male Utah 14
#' Male Utah 15
#' Female Utah 16
#' Female Utah 17
#' Female Utah 18
#' Female Utah 19
#' Male Idaho 20
#' Male Idaho 21
#' Male Idaho 22
#' Female Idaho 23
#' Female Idaho 24
#' Female Idaho 25
#' Female Idaho 26
#' Female Idaho 27
#' Female Idaho 28
#' Female Idaho 29
#' Male Arizona 30
#' Male Arizona 31
#' Male Arizona 32
#' Male Arizona 33
#' Male Arizona 34
#' Male Arizona 35
#' Male Arizona 36
#' Female Arizona 37
#' Female Arizona 38
#' Female Arizona 39
#' ")
#' #
#' # and the population within each stratum (imaginary data):
#' #
#' dpop <- read.table(header = TRUE, text = "
#' Gender State N
#' Male Utah 1500000
#' Female Utah 1500000
#' Male Idaho 700000
#' Female Idaho 900000
#' Male Arizona 4000000
#' Female Arizona 3000000
#' ")
#' #
#' # First we create the 'overall' $n$ and $N$ variables.
#' #
#' ds$n <- with(ds, capply(State, list(State, Gender), length))
#' #
#' # Note that first argument of 'capply', 'State', could be any variable in the dataset
#' # since it is used only for the length of the chunks within each 'State' by 'Gender' stratum.
#' #
#' ds <- merge(ds, dpop)
#' head(ds)
#' #
#' # ### Overall Horvitz-Thompson weights
#' #
#' ds$w_o_ht <- with(ds, N/n)
#' head(ds)
#' sum(ds$w_o_ht)
#'
#' with(ds, wtd_mean(Income, w_o_ht))
#' with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_ht))
#' with(ds, jk_wtd_mean_se(Income))
#' sd(ds$Income)/sqrt(length(ds$Income))
#' #
#' # ### Weighted mean within Utah
#' #
#' with(subset(ds, State == "Utah"), wtd_mean(Income, w_o_ht))
#' with(subset(ds, State == "Utah"), jk_wtd_mean_se(Income, Gender, w_o_ht))
#' # alternatively:
#' with(ds, wtd_mean(Income, w_o_ht * (State == "Utah")))
#' with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (State == "Utah")))
#' #
#' # ### Weighted mean within Utah and Idaho
#' #
#' with(subset(ds, State %in% c("Utah","Idaho")), wtd_mean(Income, w_o_ht))
#' with(subset(ds, State %in% c("Utah","Idaho")), jk_wtd_mean_se(Income, list(Gender, State), w_o_ht))
#' # alternatively:
#' with(ds, wtd_mean(Income, w_o_ht * (State %in% c("Utah","Idaho"))))
#' with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (State %in% c("Utah","Idaho"))))
#' #
#' # ### Weighted female income
#' #
#' with(subset(ds, Gender == "Female"), wtd_mean(Income, w_o_ht))
#' with(subset(ds, Gender == "Female"), jk_wtd_mean_se(Income, list(Gender, State), w_o_ht))
#' # alternatively:
#' with(ds, wtd_mean(Income, w_o_ht * (Gender == "Female")))
#' with(ds, jk_wtd_mean_se(Income, list(Gender, State), w_o_ht * (Gender == "Female")))
#' #
#' # We would get the same results with weights normed differently.
#' #
#' # ### 'Sum to $n$' weights
#' #
#' ds$w_o_sn <- with(ds, (N/n)/mean(N/n))
#' sum(ds$w_o_sn)
#'
#' with(ds, wtd_mean(Income, w_o_sn))
#' with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_sn))
#' #
#' # ### 'Sum to 1' weights:
#' #
#' ds$w_o_s1 <- with(ds, (N/n)/sum(N/n))
#' sum(ds$w_o_s1)
#'
#' with(ds, wtd_mean(Income, w_o_s1))
#' with(ds, jk_wtd_mean_se(Income, list(Gender,State), w_o_s1))
#'
#' #### Gender gap in mean salary in Utah
#'
#' # With linear combinations, we can estimate population parameters that can't be
#' # estimated as weighted means. For example, the Gender gap in Utah:
#'
#' coeffs <- with(ds,
#' std((State == "Utah") * (Gender == "Male") * w_o_ht)
#' - std((State == "Utah") * (Gender == "Female") * w_o_ht) )
#' names(coeffs) <- with(ds, interaction(State,Gender))
#' cbind(coeffs)
#' with(ds, lin_comb(Income, coeffs))
#' with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#' #'
#' #' #### Comparing two states
#' #'
#' #' To compare Utah with Idaho using the population proportions of Gender:
#' #'
#' coeffs <- with(ds,
#' std((State == "Utah") * w_o_ht)
#' - std((State == "Idaho") * w_o_ht) )
#' coeffs
#' names(coeffs) <- with(ds, interaction(State,Gender))
#' cbind(coeffs)
#' with(ds, lin_comb(Income, coeffs))
#' with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#' #'
#' #' #### Unweighted average of two states
#' #'
#' #' The unweighted average of Utah and Idaho using the Gender proportion in each state:
#' #'
#' coeffs <- with(ds,
#' .5 * std((State == "Utah") * w_o_ht)
#' + .5 * std((State == "Idaho") * w_o_ht) )
#' coeffs
#' with(ds, lin_comb(Income, coeffs))
#' with(ds, jk_lin_comb_se(Income, list(State, Gender), coeffs))
#' #'
#' #' #### The Gender gap within each state
#' #'
#' states <- c("Idaho", "Utah", "Arizona")
#' names(states) <- states
#' coefs_gap <- lapply( states,
#' function(state)
#' with(ds, std((State == state) * (Gender == "Male") * w_o_ht) -
#' std((State == state) * (Gender == "Female") * w_o_ht) ))
#' mat <- do.call(cbind,coefs_gap)
#' rownames(mat) <- with(ds, interaction(State,Gender))
#' MASS::fractions(mat)
#' with(ds, lapply(coefs_gap, function(w) lin_comb(Income,w)))
#' with(ds, lapply(coefs_gap, function(w) jk_lin_comb_se(Income,list(Gender, State), w)))
#' #'
#' #' ## Hierarchical weights
#' #'
#' #' The following illustrates how to create within-state weights and then how to modify them to form overall
#' #' weights.
#' #'
#' #' We can start with the convenient fact that Horvitz-Thompson weights serve as both within-state and
#' #' overall weights. To get within-state 'sum to $n$' weights, we need to norm the Horvitz-Thompson weights
#' #' within each state. We need to divide the Horvitz-Thompson weights by their within-state means:
#' #'
#' ds$ht_ws_mean <- with(ds, capply(w_o_ht, State, mean))
#' ds$w_ws_sn <- with(ds, w_o_ht / ht_ws_mean) # within-state 'sum to n' mean
#'
#' with(ds, tapply(w_ws_sn, State, sum)) # check that they sum to within-state n
#' tab(ds, ~ State)
#' #'
#' #' As noted earlier, these weights need to be multiplied by $\frac{N_s/N}{n_s/n}$ to transform them into
#' #' overall 'sum to $n$' weights. The easiest way to generate $N_s$ is to use the Horvitz-Thomson weights:
#' #'
#' ds$N_s <- with(ds, capply(w_o_ht, State, sum))
#' ds$n_s <- with(ds, capply(w_o_ht, State, length))
#' N_total <- sum(ds$w_o_ht)
#' n_total <- nrow(ds)
#'
#' ds$w_o_sn2 <- with(ds, w_ws_sn * (N_s / N_total) / (n_s/n_total))
#' ds
#' #' Comparing these weights with previous ones:
#' with(ds, max(abs(w_o_sn2 - w_o_sn)))
#' #'
#' #' Estimating means per stratum
#' #'
#' ds$mean_sg <- capply(ds, ds[c('State','Gender')], with, wtd_mean(Income, w_o_ht))
#' ds$mean_sg_se <- capply(ds, ds[c('State','Gender')], with, jk_wtd_mean_se(Income, w = w_o_ht))
#' ds$mean_s <- capply(ds, ds$State, with, wtd_mean(Income, w_o_ht))
#' ds$mean_s_se <- capply(ds, ds$State, with, jk_wtd_mean_se(Income, Gender, w_o_ht))
#' ds$mean_g <- capply(ds, ds$Gender, with, wtd_mean(Income, w_o_ht))
#' ds$mean_g_se <- capply(ds, ds$Gender, with, jk_wtd_mean_se(Income, State, w_o_ht))
#' # Define a round method for data frames so round will work on a data frame with non-numeric variables
#' round.data.frame <- function(x, digits = 0) as.data.frame(lapply(x, function(x) if(is.numeric(x)) round(x, digits = digits) else x))
#' round(ds, 3)
#' dssum <- up(ds, ~ State/Gender) # keep all State x Gender invariant variables
#' round(dssum, 3)
#' }
#' @family Post-stratification survey functions
#' @seealso \code{\link{wtd_mean}} for weighted means, \code{\link{lin_comb}} for linear
#' combinations, \code{\link{jk_wtd_mean_se}} for a jacknife estimate of the SE of a weighted mean
#' and \code{\link{jk_lin_comb_se}} for a jacknife estimate of the SE of a linear combination.
#' @export
jk_wtd_mean_se <- function(x, by = rep(1, length(x)), w = rep(1, length(x)), FUN = wtd_mean, na.rm = FALSE) {
# Value: jacknife estimator of the standard error of a weighted mean
# using post-stratification
#
# x: response variable, a numeric vector
# by: vector or list of vectors defining strata
#     (default: a single stratum, i.e. an unstratified sample)
# w: numeric vector of sampling weights constant within each stratum
#    (default: equal weights of 1)
# FUN: estimator applied to (x, w); default wtd_mean
# na.rm: passed through to FUN and used below to exclude NA units from the SE sum
#
theta_hat <- FUN(x, w, na.rm = na.rm)  # full-sample estimate
theta_hat_i <- jk_wtd_means(x, by, w, FUN = FUN, na.rm = na.rm)  # drop-one estimates
ns <- capply(x, by, length)  # stratum size attached to each observation
drop <- is.na(x)  # units with missing x contribute nothing to the SE sum
# Stratified jackknife SE: sum of squared deviations of the drop-one
# estimates from the full estimate, scaled by (n_s - 1)/n_s per stratum
sqrt(sum((theta_hat_i[!drop] - theta_hat)^2 * (ns[!drop]-1) / ns[!drop]))
}
#'
#'
#' ## Linear combinations
#'
#' ### Functions for linear combinations
#'
#' Standardize a vector of weights
#'
#' @param x vector of weights
#' @param div divisor (default: sum(x))
#' @rdname survey
#' @export
std <- function(x, div = sum(x)) x/div
#'
#' Linear combination of x
#'
#' @param x numerical vector
#' @param w vector of weights replicated to have same length as 'x'
#' @family Post-stratification survey functions
#' @export
lin_comb <- function(x, w) {
  # Linear combination of x with coefficients w; w is recycled to the
  # length of x before the inner product is formed.
  sum(x * rep_len(w, length(x)))
}
#' Post-stratification Jacknife estimate of SE for a linear combination
#'
#' @inheritParams jk_wtd_mean_se
#' @rdname survey
#' @export
jk_lin_comb_se <- function(x, by, w, check = TRUE) jk_wtd_mean_se(x, by, w, FUN = lin_comb)
#'
# NOTE(review): the following unnamed function appears to be the printed
# source of magrittr's pipe operator (`%>%`), pasted into this file by
# accident. It is never assigned to a name, so evaluating it has no effect,
# and it references magrittr internals (split_chain, wrap_function, freduce,
# is_placeholder, is_compound_pipe) that are not defined or imported here.
# Consider deleting it; if a pipe is needed, import `%>%` from magrittr
# instead of embedding its source.
function (lhs, rhs)
{
parent <- parent.frame()
env <- new.env(parent = parent)
chain_parts <- split_chain(match.call(), env = env)
pipes <- chain_parts[["pipes"]]
rhss <- chain_parts[["rhss"]]
lhs <- chain_parts[["lhs"]]
env[["_function_list"]] <- lapply(1:length(rhss), function(i) wrap_function(rhss[[i]],
pipes[[i]], parent))
env[["_fseq"]] <- `class<-`(eval(quote(function(value) freduce(value,
`_function_list`)), env, env), c("fseq", "function"))
env[["freduce"]] <- freduce
if (is_placeholder(lhs)) {
env[["_fseq"]]
}
else {
env[["_lhs"]] <- eval(lhs, parent, parent)
result <- withVisible(eval(quote(`_fseq`(`_lhs`)), env,
env))
if (is_compound_pipe(pipes[[1L]])) {
eval(call("<-", lhs, result[["value"]]), parent,
parent)
}
else {
if (result[["visible"]])
result[["value"]]
else invisible(result[["value"]])
}
}
}
#' Sample from a sampling frame
#'
#' Generate a probability sample from a sampling frame
#'
#' \code{sam} takes a sample size, a 'population' data frame, a formula specifying the count variable and
#' stratification variable(s), an optional expression to select a subset of the
#' population, relative sampling factor and the probability parameter for
#' a binomial response. It generates a sample based on these parameters.
#'
#' @param N sample size (default: 1)
#' @param pop a data frame giving frequencies in each stratum defined by a combination of
#' variables. The stratification can be finer than the stratification given in \code{fmla} (Default: \code{popagetable})
#' @param fmla a formula of the form \code{count ~ a + ... + c} where \code{count} is a variable in
#' \code{pop} giving the population count in each row of \code{pop} and the right hand side
#' of the formula identifies the variables used to aggregate \code{pop} to form a sampling frame
#' (Default: \code{population ~ sex + age + raceethnicity + state}). The default aggregates over
#' counties in the default value of \code{pop}
#' @param subset an expression that selects a subset, e.g. state == 'AK' & age > '5 to 9 years' (Default: no subsetting)
#' @param fac a data frame or a list of data frames whose variables include some subset of variables in the
#' right hand side of \code{fmla} and a variable named \code{fac} giving relative sampling factors
#' for each combination of values of the stratification variables. Omitted combinations of variables get
#' a default relative sampling factor of 1. If \code{fac} is a list, the relative sampling factors
#' from each element are multiplied.
#' @param prob a data frame or a list of data frames similar to \code{fac}, except that a variable named
#' \code{prob} is used to specify the binomial probability of a '1' in each stratum of respondents.
#' If \code{prob} is a list, the probabilities from each data frame are combined by adding their
#' logits.
#' @export
sam <- function(N = 1,
                pop = popagetable, # population frequency table
                fmla = population ~ sex + age + raceethnicity + state, # aggregation formula
                subset, # an expression that selects a subset, e.g. state == 'AK' & age > '5 to 9 years'
                fac,    # a data frame or list of data frames assigning relative sampling
                        # factors; variables are RHS names in 'fmla' plus 'fac', a numeric
                        # sampling factor. If 'fac' is a list, the factors are applied
                        # multiplicatively.
                prob)   # same as 'fac' except that the probability of a 'yes' is given in
                        # variable 'prob'; multiple data frames combine by summing log odds
  # TO BE GENERALIZED!!
{
  # Note: reserved names: fac, prob, y
  # local functions
  last <- function(x) x[length(x)]
  na2 <- function(x, rep = 0) {
    # replace NAs with 'rep'
    x[is.na(x)] <- rep
    x
  }
  # sampling frame: aggregate 'pop' counts over the RHS of 'fmla'
  sampling_frame <- tab_df(pop, fmla)
  # Stratification variable names: take the RHS of 'fmla' as text, strip
  # blanks, and split on the formula operators *, +, / and :.
  # (Base-R replacement for the previous magrittr pipeline; calling
  # library(magrittr) inside a function attached a package as a side effect.)
  rhs_text <- gsub(" ", "", last(as.character(fmla)))
  strat_vars <- unlist(strsplit(rhs_text, "[*+/:]"))
  if(!missing(subset)) { # code from 'subset.data.frame'
    e <- substitute(subset)
    r <- eval(e, sampling_frame, parent.frame())
    if(!is.logical(r)) stop("'subset' must be logical")
    sampling_frame <- sampling_frame[r & !is.na(r),]
  }
  # relative sampling factor: default 1, multiplied by each 'fac' data frame
  sampling_frame$fac <- 1
  if(!missing(fac)) {
    if(is.data.frame(fac)) fac <- list(fac)
    fac <- lapply(fac, function(d) d[,c(intersect(names(d),strat_vars),'fac')])
    for(dd in fac){
      sampling_frame <- merge(sampling_frame, dd, all.x = TRUE, by = intersect(strat_vars, names(dd)))
      # combinations omitted from 'dd' get a default relative factor of 1
      sampling_frame$fac <- sampling_frame$fac.x * na2(sampling_frame$fac.y, 1)
      sampling_frame$fac.y <- NULL
      sampling_frame$fac.x <- NULL
    }
  }
  # probability of a 'yes': combine the 'prob' data frames on the logit scale
  logit <- function(p) log(p/(1-p))
  sampling_frame$logit <- 0
  if(!missing(prob)) {
    if(is.data.frame(prob)) prob <- list(prob)
    prob <- lapply(prob, function(d) d[,c(intersect(names(d),strat_vars),'prob')])
    for(dd in prob){
      dd$logit <- logit(dd$prob)
      dd$prob <- NULL
      sampling_frame <- merge(sampling_frame, dd, all.x = TRUE, by = intersect(strat_vars, names(dd)))
      # combinations omitted from 'dd' contribute 0 to the log odds
      sampling_frame$logit <- sampling_frame$logit.x + na2(sampling_frame$logit.y, 0)
      sampling_frame$logit.y <- NULL
      sampling_frame$logit.x <- NULL
    }
  }
  sampling_frame$prob <- 1/(1+exp(-sampling_frame$logit))  # inverse logit
  # select respondents with replacement, with probability proportional to
  # population count times the relative sampling factor
  rows <- sample(nrow(sampling_frame), N, replace = TRUE,
                 prob = with(sampling_frame, population * fac))
  sample <- sampling_frame[rows,]
  # Generate binary response
  sample$y <- rbinom(nrow(sample), 1, prob = sample$prob)
  attr(sample,'fmla') <- fmla
  attr(sample,'strat_vars') <- strat_vars
  sample
}
#' Sample and population strata counts
#'
#' Calculate counts for the stratum for each sampled unit
#'
#' @param sample a data frame including stratification variables
#' @param target_pop a data frame with stratification variables and a variable named 'population'
#' that contains population counts for each row. The data frame can include other stratification
#' variables than those in \code{fmla} and counts will be aggregated over these variables
#' @param formula a one-sided formula of the form \code{~a + b + c ...} where 'a', 'b', 'c', ... are
#' stratification variables in both \code{sample} and in \code{target_pop}
#'
#' @return the \code{sample} data frame with two additional variables 'n', 'N' giving the stratum
#' counts in the sample and in the population, respectively. If some strata combinations are
#' missing in the sample, a row for each missing combination is added with n equal to 0 and
#' N equal to the count for that stratum in the population
#'
#' @export
ns <- function(sample, target_pop, formula) {
  # Return 'sample' with two added variables:
  #   n: sample count within each stratum defined by 'formula' (e.g. ~ sex + age)
  #   N: population count in the same stratum, aggregated from
  #      'target_pop$population'
  # Strata combinations missing from the sample are added as rows with n = 0
  # and N equal to the population count for that stratum.
  #
  # Both 'sample' and 'target_pop' must contain the variables named in
  # 'formula'; 'target_pop' must also contain 'population'.
  ret <- sample
  by <- model.frame(formula, sample)
  # sample counts: the first column of 'ret' is used only for its length
  # (capply applies a function within each stratum; see spida2)
  ret$n <- capply(ret[[1]], by, length)
  # population counts, aggregated over any extra stratification in 'target_pop'
  pop <- tab_df(target_pop, formula, weight = target_pop$population)
  pop$N <- pop$Freq
  pop$Freq <- NULL
  # all = TRUE keeps strata present only in the population (was 'all = T';
  # T is reassignable and should not be used for TRUE)
  ret <- merge(ret, pop, all = TRUE)
  # restore factor levels (and orderedness) that merge() may have disturbed
  fac.names <- names(pop)[sapply(pop, is.factor)]
  for (nn in fac.names) {
    ret[[nn]] <- factor(ret[[nn]], levels = levels(pop[[nn]]))
    if (is.ordered(pop[[nn]])) ret[[nn]] <- ordered(ret[[nn]])
  }
  ret$n[is.na(ret$n)] <- 0  # strata with no sampled units
  ret
}
#' Horvitz-Thompson and related weights
#'
#' Calculates sample weights based on the sample and population count variables calculated by
#' \code{\link{ns}}.
#'
#' @param sample a sample data frame with sample and population stratum counts given by the
#' variables 'n' and 'N' respectively.
#' @param cap (default 3) a maximum for truncated median normed Horvitz-Thompson weights.
#' @param \dots other arguments for future weight functions
#' @export
HTwts <- function(sample, cap = 3, ...) {
  # Append Horvitz-Thompson weights and several renormed/capped variants to
  # 'sample', which must contain stratum counts 'n' (sample) and 'N'
  # (population). 'cap' truncates the capped variants; '...' is reserved for
  # future weight functions.
  out <- sample
  ht <- out$N / out$n                      # Horvitz-Thompson weight N/n
  out$HT <- ht
  out$HTcap <- pmin(ht, cap)               # HT truncated at 'cap'
  out$HT_mean <- ht / mean(ht)             # mean-normed HT
  out$HT_med <- ht / median(ht)            # median-normed HT
  out$HT_mean_cap <- pmin(out$HT_mean, cap)
  out$HT_med_cap <- pmin(out$HT_med, cap)
  out$none <- 1                            # unit weight for unweighted estimates
  out
}
#' Estimates and SEs from a sample
#'
#' Use \code{\link{ns}}, \code{\link{HTwts}} and other functions to compute estimated parameters
#' and SEs with different algorithms
#'
#' @param s a sample data frame
#' @param pop a population data frame
#' @param fmla a formula of the form \code{population ~ a + b + c} giving a variable with population
#' counts in \code{pop} and stratification variables.
#' @param \dots other arguments passed to \code{\link{HTwts}}
#' @return a list with parameter estimates and standard errors
#' @export
est <- function(s, pop, fmla, ...) {
# Attach stratum counts (ns) and Horvitz-Thompson weights (HTwts) to the
# sample, then estimate the mean of the binary response 'y' and its
# post-stratification jackknife SE under three weighting schemes:
#   raw: unweighted ('none' weight of 1)
#   HT:  Horvitz-Thompson weights N/n
#   HTc: median-normed HT weights capped (see HTwts 'cap', forwarded via ...)
s <- HTwts(ns(s, pop, fmla),...)
# strata for the jackknife: the variables of 'fmla' evaluated in the sample
clist <- model.frame(fmla,s)
with(s, list(
est_raw = wtd_mean(y, none, na.rm = TRUE),
se_raw = jk_wtd_mean_se(y, clist, none, na.rm = TRUE),
est_HT = wtd_mean(y, HT, na.rm = TRUE),
se_HT = jk_wtd_mean_se(y, clist, HT, na.rm = TRUE),
est_HTc = wtd_mean(y, HT_med_cap, na.rm = TRUE),
se_HTc = jk_wtd_mean_se(y, clist, HT_med_cap, na.rm = TRUE)))
}
# NOTE(review): the two lines below were web-scrape boilerplate
# ("Add the following code to your website. / For more information on
# customizing the embed code, read Embedding Snippets.") -- not R code;
# left as comments so the file parses.