R/recalib.R

Defines functions recalib

Documented in recalib

#' @title Calibrate weights
#'
#' @description
#' Calibrate weights for bootstrap replicates by using iterative proportional
#' updating to match population totals on various household and personal levels.
#'
#'
#' @details
#' `recalib` takes survey data (`dat`) containing the bootstrap replicates
#' generated by [draw.bootstrap] and calibrates weights for each bootstrap
#' replication according to population totals for person- or household-specific
#' variables. \cr
#' `dat` must be household data where household members correspond to multiple
#' rows with the same household identifier. The data should at least containt
#' the following columns:
#'
#' * Column indicating the sample period;
#' * Column indicating the household ID;
#' * Column containing the household sample weights;
#' * Columns which contain the bootstrap replicates (see output of
#'   [draw.bootstrap]);
#' * Columns indicating person- or household-specific variables for which sample
#'   weight should be adjusted.
#'
#' For each period and each variable in `conP.var` and/or `conH.var` contingency
#' tables are estimated to get margin totals on personal- and/or
#' household-specific variables in the population.\cr
#' Afterwards the bootstrap replicates are multiplied with the original sample
#' weight and the resulting product ist then adjusted using [ipf()] to match the
#' previously calcualted contingency tables. In this process the columns of the
#' bootstrap replicates are overwritten by the calibrated weights.\cr
#'
#' @param dat either data.frame or data.table containing the sample survey for
#'   various periods.
#' @param hid character specifying the name of the column in `dat` containing
#'   the household ID.
#' @param weights character specifying the name of the column in `dat`
#'   containing the sample weights.
#' @param b.rep character specifying the names of the columns in `dat`
#'   containing bootstrap weights which should be recalibratet
#' @param period character specifying the name of the column in `dat` containing
#'   the sample period.
#' @param conP.var character vector containig person-specific variables to which
#'   weights should be calibrated or a list of such character vectors.
#'   Contingency tables for the population are calculated per `period` using
#'   `weights`.
#' @param conH.var character vector containig household-specific variables to
#'   which weights should be calibrated or a list of such character vectors.
#'   Contingency tables for the population are calculated per `period` using
#'   `weights`.
#' @param conP list or (partly) named list defining the constraints on person
#'   level.  The list elements are contingency tables in array representation
#'   with dimnames corresponding to the names of the relevant calibration
#'   variables in `dat`. If a numerical variable is to be calibrated, the
#'   respective list element has to be named with the name of that numerical
#'   variable. Otherwise the list element shoud NOT be named.
#' @param conH list or (partly) named list defining the constraints on
#'   household level.  The list elements are contingency tables in array
#'   representation with dimnames corresponding to the names of the relevant
#'   calibration variables in `dat`. If a numerical variable is to be
#'   calibrated, the respective list element has to be named with the name of
#'   that numerical variable. Otherwise the list element shoud NOT be named.

#' @param epsP numeric value specifying the convergence limit for `conP.var`
#'   or `conP`, see [ipf()].
#' @param epsH numeric value specifying the convergence limit for `conH.var`
#'   or `conH`, see [ipf()].
#' @param ... additional arguments passed on to function [ipf()] from this
#'   package.
#'
#' @return Returns a data.table containing the survey data as well as the
#'   calibrated weights for the bootstrap replicates. The original bootstrap
#'   replicates are overwritten by the calibrated weights. If calibration of a
#'   bootstrap replicate does not converge the bootsrap weight is not returned
#'   and numeration of the returned bootstrap weights is reduced by one.
#'
#'
#' @seealso [ipf()] for more information on iterative
#'   proportional fitting.
#'
#' @author Johannes Gussenbauer, Alexander Kowarik, Statistics Austria
#'
#' @examples
#' \dontrun{
#'
#' eusilc <- demo.eusilc(prettyNames = TRUE)
#'
#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid",
#'                            weights = "pWeight",
#'                            strata = "region", period = "year")
#'
#' # calibrate weight for bootstrap replicates
#' dat_boot_calib <- recalib(dat_boot, conP.var = "gender", conH.var = "region",
#'                           verbose = TRUE)
#'
#'
#' # calibrate on other variables
#' dat_boot_calib <- recalib(dat_boot, conP.var = c("gender", "age"),
#'                           conH.var = c("region", "hsize"), verbose = TRUE)
#'
#' # supply contingency tables directly
#' conP <- xtabs(pWeight ~ age + gender + year, data = eusilc)
#' conH <- xtabs(pWeight ~ hsize + region + year,
#'               data = eusilc[!duplicated(paste(db030,year))])
#' dat_boot_calib <- recalib(dat_boot, conP.var = NULL,
#'                           conH.var = NULL, conP = list(conP),
#'                           conH = list(conH), verbose = TRUE)
#' }
#'
#' @export recalib
#'

recalib <- function(
  dat, hid = attr(dat, "hid"), weights = attr(dat, "weights"), b.rep =
    attr(dat, "b.rep"), period = attr(dat, "period"), conP.var = NULL,
  conH.var = NULL, conP = NULL, conH = NULL, epsP = 1e-2, epsH = 2e-2, ...) {

  hidfactor <- calibWeight <- FirstPersonInHousehold_ <- verbose <-
    bound <- maxiter <- meanHH <- check_hh_vars <- allPthenH <-
    returnNA <- conversion_messages <- maxIter <- NULL

  removeCols <- c()
  
  ##########################################################
  # define default values for ipf
  ipfDefaults <- formals(ipf)
  ipfDefaults <- ipfDefaults[!names(ipfDefaults) %in% names(formals(recalib))]
  ellipsis <- list(...)
  # set these to FALSE by default
  ellipsis[["check_hh_vars"]] <- getEllipsis2("check_hh_vars",
                                              FALSE, ellipsis)
  ellipsis[["conversion_messages"]] <- getEllipsis2("conversion_messages",
                                                    FALSE, ellipsis)
  ellipsis[["verbose"]] <- getEllipsis2("verbose", TRUE, ellipsis)
  ellipsis <- lapply(names(ipfDefaults), function(z) {
    getEllipsis2(z, ipfDefaults[[z]], ellipsis)
  })
  names(ellipsis) <- names(ipfDefaults)
  
  ellipsisNames <- names(ellipsis)
  ellipsisContent <- paste0("ellipsis[['", ellipsisNames, "']]")
  eval(parse(text = paste(
    ellipsisNames,
    ellipsisContent, sep = "<-"
  )))
  
  
  ##########################################################
  # INPUT CHECKING
  if (is.data.frame(dat)) {
    dat <- as.data.table(dat)
  } else if (!is.data.table(dat)) {
    stop("dat must be a data.frame or data.table")
  }
  dat <- copy(dat)

  c.names <- colnames(dat)

  # check hid
  hidNULL <- is.null(hid)
  if (hidNULL) {
    hid <- generateRandomName(20, colnames(dat))
    dat[, c(hid) := 1:.N]
    removeCols <- c(removeCols, hid)
  }

  if (length(hid) != 1) {
    stop("hid must have length 1")
  }
  if (!hid %in% c.names) {
    stop(paste0(hid, " is not a column in dat"))
  }

  # check weights
  if (length(weights) != 1) {
    stop("weights must have length 1")
  }
  if (!weights %in% c.names) {
    stop(paste0(weights, " is not a column in dat"))
  }
  if (!is.numeric(dt.eval("dat[,", weights, "]"))) {
    stop(paste0(weights, " must be a numeric column"))
  }

  # check b.rep
  if (!all(b.rep %in% c.names)) {
    stop("Not all elements in b.rep are column names in dat")
  }
  if (any(!grepl("^[[:alpha:]]", b.rep))) {
    stop("Column names of bootstrap replicates must start with ",
         "alphabetic character")
  }
  if (any(!unlist(lapply(dat[, mget(b.rep)], is.numeric)))) {
    stop("Column containing bootstrap replicates must be numeric")
  }

  # check conP.var
  if (!all(unique(unlist(conP.var)) %in% c.names)) {
    stop("Not all elements in conP.var are column names in dat")
  }

  # check conH.var
  if (!all(unique(unlist(conH.var)) %in% c.names)) {
    print(all(unique(unlist(conH.var)) %in% c.names))
    print(unique(unlist(conH.var)))
    print(c.names[1:30])
    stop("Not all elements in conH.var are column names in dat")
  }

  # check period
  periodNULL <- is.null(period)
  if (periodNULL) {
    period <- generateRandomName(20, colnames(dat))
    dat[, c(period) := 1]
    removeCols <- c(removeCols, period)
  }
  if (length(period) != 1) {
    stop(paste0(period, " must have length 1"))
  }
  if (!period %in% c.names) {
    stop(paste0(period, " is not a column in dat"))
  }


  ##########################################################

  # check conP and conH
  conPnames <- lapply(conP, function(z) {
    z <- names(dimnames(z))
    z[z != period]
  })
  conHnamesNumeric <- conPnamesNumeric <- NULL
  if(!is.null(names(conP))){
    conPnamesNumeric <- unique(names(conP))
    conPnamesNumeric <- conPnamesNumeric[conPnamesNumeric!=""]
  }
  conHnames <- lapply(conH, function(z) {
    z <- names(dimnames(z))
    z[z != period]
  })
    if(!is.null(names(conH))){
      conHnamesNumeric <- unique(names(conH))
      conHnamesNumeric <- conHnamesNumeric[conHnamesNumeric!=""]
  }

  if (!all(unlist(conPnames) %in% c.names)) {
    stop("Not all dimnames in conP are column names in dat")
  }
  if (!all(unlist(conHnames) %in% c.names)) {
    stop("Not all dimnames in conH are column names in dat")
  }


  if (!is.null(conH.var) | !is.null(conP.var) |
      !is.null(conP) | !is.null(conH)) {
    var.miss <- unlist(
      dat[, lapply(
        .SD,
        function(z) {
          sum(is.na(z))
        }
      ), .SDcols = c(unique(unlist(c(conH.var, conP.var,
                                     conPnames, conHnames))))])
    var.miss <- var.miss[var.miss > 0]
    if (length(var.miss) > 0) {
      stop("Missing values detected in column(s)", names(var.miss))
    }
  } else {
    message("recalib: conP.var, conH.var, conP and conH are all missing. ",
            "Only calibrating for the population totals")
  }



  # recode household and person variables to factor
  # improves runtime for ipf
  #
  vars <- c(period, unique(unlist(c(conP.var, conH.var, conPnames, conHnames))))
  vars.class <- unlist(lapply(dat[, mget(vars)], function(z) {
    z.class <- class(z)
    if (z.class[1] %in% c("labelled", "haven_labelled")) {
      z.class <- "factor"
    }
    return(z.class)
  }))
  # convert to factor
  for (i in seq_along(vars)) {
    if (vars.class[[vars[i]]] != "factor") {
      dt.eval("dat[,", vars[i], ":=as.factor(", vars[i], ")]")
    }
  }


  # calculate contingency tables
  for (p in seq_along(conP.var)) {
    existTab <- sapply(conPnames, setequal, y = conP.var[[p]])
    if (all(!existTab)) {
      formTab <- paste(weights, "~",
                       paste(c(period, conP.var[[p]]), collapse = "+"))
      conP <- c(conP, list(xtabs(formTab, data = dat)))
    }else{
      stop("contingency table for ", paste(conP.var[p], collapse = ", "),
           " was supplied through parameter conP AND conP.var")
    }
  }

  dat[, FirstPersonInHousehold_ := c(1L, rep(0, .N - 1)), by = c(hid, period)]
  for (h in seq_along(conH.var)) {
    existTab <- sapply(conHnames, setequal, y = conH.var[[h]])
    if (all(!existTab)) {
      formTab <- paste(weights, "~", paste(c(period, conH.var[[h]]),
                                           collapse = "+"))
      conH <- c(
        conH, list(xtabs(formTab, data = dat[FirstPersonInHousehold_ == 1])))
    } else {
      stop("contingency table for ", paste(conH.var[[h]], collapse = ", "),
           " was supplied through parameter conH AND conH.var")
    }
  }
  # define new Index
  dat[, hidfactor := .GRP, by = c(hid, period)]
  dat[, hidfactor := factor(hidfactor)]
  
  # calibrate weights to conP and conH
  select.var <- unique(c("hidfactor", weights, period,
                         unlist(c(conP.var, conH.var, conPnames, conHnames,
                                  conPnamesNumeric, conHnamesNumeric))))
  calib.fail <- c()

  for (g in b.rep) {
    set(dat, j = g, value = dt.eval("dat[,", g, "*", weights, "]"))

    # check if margins for bootstrap weights are always positive
    check.conP <- lapply(conP, function(z) {
      check.z <- dt.eval("dat[,sum(", g, "),by=list(",
                         paste(names(dimnames(z)), collapse = ","), ")][V1==0]")
      nrow(check.z) > 0
    })
    check.conH <- lapply(conH, function(z) {
      check.z <- dt.eval("dat[,sum(", g, "),by=list(",
                         paste(names(dimnames(z)), collapse = ","), ")][V1==0]")
      nrow(check.z) > 0
    })
    if (!is.null(conP) | !is.null(conH)) {
      if (any(unlist(c(check.conH, check.conP)))) {
        calib.fail <- c(calib.fail, g)
        set(dat, j = g, value = NA_real_)
      } else {
        set(dat, j = g, value = ipf(
          dat = copy(dat[, mget(c(g, select.var))]), conP = conP,
          conH = conH, verbose = verbose, epsP = epsP, epsH = epsH,
          w = g, bound = bound, maxIter = maxIter, meanHH = meanHH,
          hid = "hidfactor", check_hh_vars = check_hh_vars,
          allPthenH = allPthenH, returnNA = returnNA,
          conversion_messages = conversion_messages
        )[, calibWeight])
        if (dat[, any(is.na(get(g)))]) {
          calib.fail <- c(calib.fail, g)
        }
      }
    }
  }


  # paste warnings if calibration failed in some instances
  if (length(calib.fail) > 0) {

    dat[, c(calib.fail) := NULL]
    b.rep <- b.rep[!b.rep %in% calib.fail]

    if (length(b.rep) == 0) {
      cat("Calibration failed for all bootstrap replicates\n")
      cat("Returning no bootstrap weights\n")
    } else {
      cat("Calibration failed for bootstrap replicates", calib.fail, "\n")
      cat("Corresponding bootstrap replicates will be discarded\n")
      lead.char <- sub("[[:digit:]].*", "", b.rep[1])
      b.rep_new <- paste0(lead.char, seq_along(b.rep))
      setnames(dat, b.rep, b.rep_new)
      cat("Returning", length(b.rep), "calibrated bootstrap weights\n")
      b.rep <- b.rep_new
    }
  }


  dat[, c("hidfactor", "FirstPersonInHousehold_") := NULL]

  # recode vars back to either integer of character
  for (i in seq_along(vars.class)) {
    if (vars.class[i] %in% c("integer", "numeric")) {
      dt.eval("dat[,", vars[i], ":=as.numeric(as.character(", vars[i], "))]")
    } else if (vars.class[i] == "character") {
      dt.eval("dat[,", vars[i], ":=as.character(", vars[i], ")]")
    }
  }

  if (periodNULL) {
    period <- NULL
  }

  setattr(dat, "weights", weights)
  setattr(dat, "period", period)
  setattr(dat, "b.rep", b.rep)

  return(dat)
}

Try the surveysd package in your browser

Any scripts or data that you put into this service are public.

surveysd documentation built on Dec. 28, 2022, 2:15 a.m.