Nothing
#' @title Calibrate weights
#'
#' @description
#' Calibrate weights for bootstrap replicates by using iterative proportional
#' updating to match population totals on various household and personal levels.
#'
#'
#' @details
#' `recalib` takes survey data (`dat`) containing the bootstrap replicates
#' generated by [draw.bootstrap] and calibrates weights for each bootstrap
#' replication according to population totals for person- or household-specific
#' variables. \cr
#' `dat` must be household data where household members correspond to multiple
#' rows with the same household identifier. The data should at least containt
#' the following columns:
#'
#' * Column indicating the sample period;
#' * Column indicating the household ID;
#' * Column containing the household sample weights;
#' * Columns which contain the bootstrap replicates (see output of
#' [draw.bootstrap]);
#' * Columns indicating person- or household-specific variables for which sample
#' weight should be adjusted.
#'
#' For each period and each variable in `conP.var` and/or `conH.var` contingency
#' tables are estimated to get margin totals on personal- and/or
#' household-specific variables in the population.\cr
#' Afterwards the bootstrap replicates are multiplied with the original sample
#' weight and the resulting product ist then adjusted using [ipf()] to match the
#' previously calcualted contingency tables. In this process the columns of the
#' bootstrap replicates are overwritten by the calibrated weights.\cr
#'
#' @param dat either data.frame or data.table containing the sample survey for
#' various periods.
#' @param hid character specifying the name of the column in `dat` containing
#' the household ID.
#' @param weights character specifying the name of the column in `dat`
#' containing the sample weights.
#' @param b.rep character specifying the names of the columns in `dat`
#' containing bootstrap weights which should be recalibratet
#' @param period character specifying the name of the column in `dat` containing
#' the sample period.
#' @param conP.var character vector containig person-specific variables to which
#' weights should be calibrated or a list of such character vectors.
#' Contingency tables for the population are calculated per `period` using
#' `weights`.
#' @param conH.var character vector containig household-specific variables to
#' which weights should be calibrated or a list of such character vectors.
#' Contingency tables for the population are calculated per `period` using
#' `weights`.
#' @param conP list or (partly) named list defining the constraints on person
#' level. The list elements are contingency tables in array representation
#' with dimnames corresponding to the names of the relevant calibration
#' variables in `dat`. If a numerical variable is to be calibrated, the
#' respective list element has to be named with the name of that numerical
#' variable. Otherwise the list element shoud NOT be named.
#' @param conH list or (partly) named list defining the constraints on
#' household level. The list elements are contingency tables in array
#' representation with dimnames corresponding to the names of the relevant
#' calibration variables in `dat`. If a numerical variable is to be
#' calibrated, the respective list element has to be named with the name of
#' that numerical variable. Otherwise the list element shoud NOT be named.
#' @param epsP numeric value specifying the convergence limit for `conP.var`
#' or `conP`, see [ipf()].
#' @param epsH numeric value specifying the convergence limit for `conH.var`
#' or `conH`, see [ipf()].
#' @param ... additional arguments passed on to function [ipf()] from this
#' package.
#'
#' @return Returns a data.table containing the survey data as well as the
#' calibrated weights for the bootstrap replicates. The original bootstrap
#' replicates are overwritten by the calibrated weights. If calibration of a
#' bootstrap replicate does not converge the bootsrap weight is not returned
#' and numeration of the returned bootstrap weights is reduced by one.
#'
#'
#' @seealso [ipf()] for more information on iterative
#' proportional fitting.
#'
#' @author Johannes Gussenbauer, Alexander Kowarik, Statistics Austria
#'
#' @examples
#' \dontrun{
#'
#' eusilc <- demo.eusilc(prettyNames = TRUE)
#'
#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid",
#' weights = "pWeight",
#' strata = "region", period = "year")
#'
#' # calibrate weight for bootstrap replicates
#' dat_boot_calib <- recalib(dat_boot, conP.var = "gender", conH.var = "region",
#' verbose = TRUE)
#'
#'
#' # calibrate on other variables
#' dat_boot_calib <- recalib(dat_boot, conP.var = c("gender", "age"),
#' conH.var = c("region", "hsize"), verbose = TRUE)
#'
#' # supply contingency tables directly
#' conP <- xtabs(pWeight ~ age + gender + year, data = eusilc)
#' conH <- xtabs(pWeight ~ hsize + region + year,
#' data = eusilc[!duplicated(paste(db030,year))])
#' dat_boot_calib <- recalib(dat_boot, conP.var = NULL,
#' conH.var = NULL, conP = list(conP),
#' conH = list(conH), verbose = TRUE)
#' }
#'
#' @export recalib
#'
recalib <- function(
dat, hid = attr(dat, "hid"), weights = attr(dat, "weights"), b.rep =
attr(dat, "b.rep"), period = attr(dat, "period"), conP.var = NULL,
conH.var = NULL, conP = NULL, conH = NULL, epsP = 1e-2, epsH = 2e-2, ...) {
hidfactor <- calibWeight <- FirstPersonInHousehold_ <- verbose <-
bound <- maxiter <- meanHH <- check_hh_vars <- allPthenH <-
returnNA <- conversion_messages <- maxIter <- NULL
removeCols <- c()
##########################################################
# define default values for ipf
ipfDefaults <- formals(ipf)
ipfDefaults <- ipfDefaults[!names(ipfDefaults) %in% names(formals(recalib))]
ellipsis <- list(...)
# set these to FALSE by default
ellipsis[["check_hh_vars"]] <- getEllipsis2("check_hh_vars",
FALSE, ellipsis)
ellipsis[["conversion_messages"]] <- getEllipsis2("conversion_messages",
FALSE, ellipsis)
ellipsis[["verbose"]] <- getEllipsis2("verbose", TRUE, ellipsis)
ellipsis <- lapply(names(ipfDefaults), function(z) {
getEllipsis2(z, ipfDefaults[[z]], ellipsis)
})
names(ellipsis) <- names(ipfDefaults)
ellipsisNames <- names(ellipsis)
ellipsisContent <- paste0("ellipsis[['", ellipsisNames, "']]")
eval(parse(text = paste(
ellipsisNames,
ellipsisContent, sep = "<-"
)))
##########################################################
# INPUT CHECKING
if (is.data.frame(dat)) {
dat <- as.data.table(dat)
} else if (!is.data.table(dat)) {
stop("dat must be a data.frame or data.table")
}
dat <- copy(dat)
c.names <- colnames(dat)
# check hid
hidNULL <- is.null(hid)
if (hidNULL) {
hid <- generateRandomName(20, colnames(dat))
dat[, c(hid) := 1:.N]
removeCols <- c(removeCols, hid)
}
if (length(hid) != 1) {
stop("hid must have length 1")
}
if (!hid %in% c.names) {
stop(paste0(hid, " is not a column in dat"))
}
# check weights
if (length(weights) != 1) {
stop("weights must have length 1")
}
if (!weights %in% c.names) {
stop(paste0(weights, " is not a column in dat"))
}
if (!is.numeric(dt.eval("dat[,", weights, "]"))) {
stop(paste0(weights, " must be a numeric column"))
}
# check b.rep
if (!all(b.rep %in% c.names)) {
stop("Not all elements in b.rep are column names in dat")
}
if (any(!grepl("^[[:alpha:]]", b.rep))) {
stop("Column names of bootstrap replicates must start with ",
"alphabetic character")
}
if (any(!unlist(lapply(dat[, mget(b.rep)], is.numeric)))) {
stop("Column containing bootstrap replicates must be numeric")
}
# check conP.var
if (!all(unique(unlist(conP.var)) %in% c.names)) {
stop("Not all elements in conP.var are column names in dat")
}
# check conH.var
if (!all(unique(unlist(conH.var)) %in% c.names)) {
print(all(unique(unlist(conH.var)) %in% c.names))
print(unique(unlist(conH.var)))
print(c.names[1:30])
stop("Not all elements in conH.var are column names in dat")
}
# check period
periodNULL <- is.null(period)
if (periodNULL) {
period <- generateRandomName(20, colnames(dat))
dat[, c(period) := 1]
removeCols <- c(removeCols, period)
}
if (length(period) != 1) {
stop(paste0(period, " must have length 1"))
}
if (!period %in% c.names) {
stop(paste0(period, " is not a column in dat"))
}
##########################################################
# check conP and conH
conPnames <- lapply(conP, function(z) {
z <- names(dimnames(z))
z[z != period]
})
conHnamesNumeric <- conPnamesNumeric <- NULL
if(!is.null(names(conP))){
conPnamesNumeric <- unique(names(conP))
conPnamesNumeric <- conPnamesNumeric[conPnamesNumeric!=""]
}
conHnames <- lapply(conH, function(z) {
z <- names(dimnames(z))
z[z != period]
})
if(!is.null(names(conH))){
conHnamesNumeric <- unique(names(conH))
conHnamesNumeric <- conHnamesNumeric[conHnamesNumeric!=""]
}
if (!all(unlist(conPnames) %in% c.names)) {
stop("Not all dimnames in conP are column names in dat")
}
if (!all(unlist(conHnames) %in% c.names)) {
stop("Not all dimnames in conH are column names in dat")
}
if (!is.null(conH.var) | !is.null(conP.var) |
!is.null(conP) | !is.null(conH)) {
var.miss <- unlist(
dat[, lapply(
.SD,
function(z) {
sum(is.na(z))
}
), .SDcols = c(unique(unlist(c(conH.var, conP.var,
conPnames, conHnames))))])
var.miss <- var.miss[var.miss > 0]
if (length(var.miss) > 0) {
stop("Missing values detected in column(s)", names(var.miss))
}
} else {
message("recalib: conP.var, conH.var, conP and conH are all missing. ",
"Only calibrating for the population totals")
}
# recode household and person variables to factor
# improves runtime for ipf
#
vars <- c(period, unique(unlist(c(conP.var, conH.var, conPnames, conHnames))))
vars.class <- unlist(lapply(dat[, mget(vars)], function(z) {
z.class <- class(z)
if (z.class[1] %in% c("labelled", "haven_labelled")) {
z.class <- "factor"
}
return(z.class)
}))
# convert to factor
for (i in seq_along(vars)) {
if (vars.class[[vars[i]]] != "factor") {
dt.eval("dat[,", vars[i], ":=as.factor(", vars[i], ")]")
}
}
# calculate contingency tables
for (p in seq_along(conP.var)) {
existTab <- sapply(conPnames, setequal, y = conP.var[[p]])
if (all(!existTab)) {
formTab <- paste(weights, "~",
paste(c(period, conP.var[[p]]), collapse = "+"))
conP <- c(conP, list(xtabs(formTab, data = dat)))
}else{
stop("contingency table for ", paste(conP.var[p], collapse = ", "),
" was supplied through parameter conP AND conP.var")
}
}
dat[, FirstPersonInHousehold_ := c(1L, rep(0, .N - 1)), by = c(hid, period)]
for (h in seq_along(conH.var)) {
existTab <- sapply(conHnames, setequal, y = conH.var[[h]])
if (all(!existTab)) {
formTab <- paste(weights, "~", paste(c(period, conH.var[[h]]),
collapse = "+"))
conH <- c(
conH, list(xtabs(formTab, data = dat[FirstPersonInHousehold_ == 1])))
} else {
stop("contingency table for ", paste(conH.var[[h]], collapse = ", "),
" was supplied through parameter conH AND conH.var")
}
}
# define new Index
dat[, hidfactor := .GRP, by = c(hid, period)]
dat[, hidfactor := factor(hidfactor)]
# calibrate weights to conP and conH
select.var <- unique(c("hidfactor", weights, period,
unlist(c(conP.var, conH.var, conPnames, conHnames,
conPnamesNumeric, conHnamesNumeric))))
calib.fail <- c()
for (g in b.rep) {
set(dat, j = g, value = dt.eval("dat[,", g, "*", weights, "]"))
# check if margins for bootstrap weights are always positive
check.conP <- lapply(conP, function(z) {
check.z <- dt.eval("dat[,sum(", g, "),by=list(",
paste(names(dimnames(z)), collapse = ","), ")][V1==0]")
nrow(check.z) > 0
})
check.conH <- lapply(conH, function(z) {
check.z <- dt.eval("dat[,sum(", g, "),by=list(",
paste(names(dimnames(z)), collapse = ","), ")][V1==0]")
nrow(check.z) > 0
})
if (!is.null(conP) | !is.null(conH)) {
if (any(unlist(c(check.conH, check.conP)))) {
calib.fail <- c(calib.fail, g)
set(dat, j = g, value = NA_real_)
} else {
set(dat, j = g, value = ipf(
dat = copy(dat[, mget(c(g, select.var))]), conP = conP,
conH = conH, verbose = verbose, epsP = epsP, epsH = epsH,
w = g, bound = bound, maxIter = maxIter, meanHH = meanHH,
hid = "hidfactor", check_hh_vars = check_hh_vars,
allPthenH = allPthenH, returnNA = returnNA,
conversion_messages = conversion_messages
)[, calibWeight])
if (dat[, any(is.na(get(g)))]) {
calib.fail <- c(calib.fail, g)
}
}
}
}
# paste warnings if calibration failed in some instances
if (length(calib.fail) > 0) {
dat[, c(calib.fail) := NULL]
b.rep <- b.rep[!b.rep %in% calib.fail]
if (length(b.rep) == 0) {
cat("Calibration failed for all bootstrap replicates\n")
cat("Returning no bootstrap weights\n")
} else {
cat("Calibration failed for bootstrap replicates", calib.fail, "\n")
cat("Corresponding bootstrap replicates will be discarded\n")
lead.char <- sub("[[:digit:]].*", "", b.rep[1])
b.rep_new <- paste0(lead.char, seq_along(b.rep))
setnames(dat, b.rep, b.rep_new)
cat("Returning", length(b.rep), "calibrated bootstrap weights\n")
b.rep <- b.rep_new
}
}
dat[, c("hidfactor", "FirstPersonInHousehold_") := NULL]
# recode vars back to either integer of character
for (i in seq_along(vars.class)) {
if (vars.class[i] %in% c("integer", "numeric")) {
dt.eval("dat[,", vars[i], ":=as.numeric(as.character(", vars[i], "))]")
} else if (vars.class[i] == "character") {
dt.eval("dat[,", vars[i], ":=as.character(", vars[i], ")]")
}
}
if (periodNULL) {
period <- NULL
}
setattr(dat, "weights", weights)
setattr(dat, "period", period)
setattr(dat, "b.rep", b.rep)
return(dat)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.