
#' @rdname ipf_step
#' @name ipf_step
#' @md
#'
#' @param dat a `data.frame` containing the factor variables to be combined.
#'
#' @export
combine_factors <- function(dat, targets) {

  x <- as.data.frame(targets)
  x$ID_ipu <- seq_len(nrow(x))
  x <- merge(dat, x, by = names(dimnames(targets)), sort = FALSE, all.x = TRUE)
  factor(x$ID_ipu, levels = seq_along(targets))
}
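
# Illustrative sketch (not run as part of the package): for a target built with
# xtabs, e.g.
#   tgt <- xtabs(pWeight ~ gender + region, data = dat)  # hypothetical data
#   cf  <- combine_factors(dat, tgt)
# `cf` has one factor level per cell of `tgt`, so `tgt[cf]` looks up the target
# value for every row of `dat`.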

getMeanFun <- function(meanHH) {
  if (isTRUE(meanHH))
    meanHH <- "arithmetic"
  if (identical(meanHH, FALSE))
    meanHH <- "none"
  meanfun <- switch(meanHH,
                    arithmetic = arithmetic_mean,
                    geometric = geometric_mean,
                    none = function(x, w) {
                      x
                    }
  )
  if (is.null(meanfun))
    stop("invalid value for meanHH")
  meanfun
}
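
# Usage sketch for getMeanFun (illustrative, not run): it maps the meanHH
# argument to the household-averaging function applied between calibration
# steps, e.g.
#   getMeanFun(TRUE)         # arithmetic_mean (the default)
#   getMeanFun("geometric")  # geometric_mean
#   getMeanFun(FALSE)        # identity, weights are left unchanged
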
boundsFak <- function(g1, g0, f, bound = 4, minMaxTrim = NULL) {
  # compute the new weights, restricting the change rate g1/g0 to
  #   [1/bound, bound] (by default between .25 and 4)
  g1 <- g1 * f
  if (!is.null(bound)) {
    TF <- (g1 / g0) > bound
    TF[is.na(TF)] <- FALSE
    g1[TF] <- bound * g0[TF]
    TF <- (g1 / g0) < (1 / bound)
    TF[is.na(TF)] <- FALSE
    g1[TF] <- (1 / bound) * g0[TF]
  }
  if (!is.null(minMaxTrim)) {
    # additionally trim the weights to a fixed [min, max] interval
    g1[g1 < minMaxTrim[1]] <- minMaxTrim[1]
    g1[g1 > minMaxTrim[2]] <- minMaxTrim[2]
  }
  return(g1)
}
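
# Quick sketch of the bounding (illustrative, not run): with g0 = 1, f = 10 and
# the default bound = 4, boundsFak(1, 1, 10) returns 4 because the change rate
# g1/g0 is capped at `bound`; with f = 0.1 it returns 0.25 (= 1/bound).
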
boundsFakHH <- function(g1, g0, eps, orig, p, bound = 4, minMaxTrim = NULL) {
  # compute the new weights against the lower and upper thresholds
  #   orig * (1 - eps) and orig * (1 + eps), restricting the change rate g1/g0
  #   to [1/bound, bound] (by default between .25 and 4)
  u <- orig * (1 - eps)
  o <- orig * (1 + eps)

  pbo <- which(p > o)
  psu <- which(p < u)
  g1[pbo] <- g1[pbo] * o[pbo] / p[pbo]
  g1[psu] <- g1[psu] * u[psu] / p[psu]
  if(!is.null(bound)){
    TF <- which((g1 / g0) > bound)
    g1[TF] <- bound * g0[TF]
    TF <- which((g1 / g0) < (1 / bound))
    g1[TF] <- (1 / bound) * g0[TF]
  }
  if(!is.null(minMaxTrim)){
    g1[g1<minMaxTrim[1]] <- minMaxTrim[1]
    g1[g1>minMaxTrim[2]] <- minMaxTrim[2]
  }
  return(g1)
}
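
# Sketch (illustrative): boundsFakHH only rescales weights of groups whose
# current margin p falls outside [orig*(1-eps), orig*(1+eps)], pulling them
# back to the nearest threshold, e.g. boundsFakHH(1, 1, eps = 0.1, orig = 100,
# p = 120) gives 110/120, before applying the same bound/minMaxTrim trimming
# as boundsFak.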

check_eps <- function(con, eps, type) {
  l <- length(con)
  if (is.list(eps)) {
    if (length(eps) != l) {
      stop(paste("Provided", type, "eps argument does not fit", type,
                 "constraints."))
    }
    for (i in seq_along(eps)) {
      if (is.array(eps[[i]])) {
        if (!identical(dim(eps[[i]]), dim(con[[i]]))) {
          stop(paste0("Provided ", type, " eps argument ", i,
                      " does not match the dimension of ", type,
                      " constraint ", i, "."))
        }
      }
    }
  } else if (length(eps) > 1) {
    stop("Individual eps arguments for each constraint must be provided as a list.")
  }
}
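
# Valid eps specifications (sketch): a single numeric value (applied to every
# constraint), or a list with one element per constraint where each element is
# either a numeric value or an array with the same dimensions as the
# corresponding constraint, e.g. epsP = list(0.01, epsArray).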

check_population_totals <- function(con, dat, type = "personal") {
  # do not apply this check for numerical calibration
  if (is.null(names(con))) {
    ind <- seq_along(con)
  } else {
    ind <- which(names(con) == "")
  }

  # do not apply this check for constraints that only cover the population
  #   partially
  ind <- ind[vapply(
    ind,
    function(i) {
      constraint <- con[[i]]
      for (variable in names(dimnames(constraint))) {
        if (!(variable %in% names(dat)))
          stop("variable ", variable, " appears in a constraint but not ",
               "in the dataset")
        if (!identical(sort(unique(as.character(dat[[variable]]))),
                       sort(dimnames(constraint)[[variable]]))) {
          message(type, " constraint ", i, " only covers a subset of the ",
                  "population")
          return(FALSE)
        }
      }
      return(TRUE)
    },
    TRUE)]

  if (length(ind) == 0)
    return(NULL)

  pop_totals <- vapply(
    ind,
    function(index) {
      sum(con[[index]])
    },
    0)
  rel_errors <- abs(pop_totals - pop_totals[1]) / pop_totals[1]

  # use a 1% tolerance. Maybe it would be better to make the tolerance
  #   dependent on conH?
  if (any(rel_errors > 1e-2))
    stop("population totals for different constraints do not match")
}
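
# Sketch: unnamed (categorical) constraints that cover the whole population
# must imply roughly the same total, e.g. sum(conP1) and sum(conP2) may differ
# by at most 1 percent; otherwise ipf() stops with an error before iterating.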

## perform one IPF update step for person-level constraint i: if the constraint
## is not yet met within its tolerance (epsP_i), the calibration weight column
## in `dat` is updated by reference and the returned convergence flag `error`
## is set to TRUE
calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter,
                   numericalWeighting, numericalWeightingVar, w, cw, minMaxTrim) {
  selectGroupNotConverged <- epsPcur <- maxFac <- OriginalSortingVariable <- V1 <-
    epsvalue <- fVariableForCalibrationIPF <- NULL
  temporary_hvar <- value <-
    wValue <- representativeHouseholdForCalibration <- NULL
  variableKeepingTheBaseWeight <- w
  variableKeepingTheCalibWeight <- cw
  combined_factors <- dat[[paste0("combined_factors_", i)]]
  setnames(dat, valueP[i], "value")
  setnames(dat, paste0("epsP_", i), "epsPcur")
  tmp <- data.table(x = factor(levels(combined_factors)))
  setnames(tmp, "x", paste0("combined_factors_", i))
  con_current <- dat[tmp, on = paste0("combined_factors_", i),
                     mult = "first", value]

  if (!is.null(numericalWeightingVar)) {
    ## numerical variable to be calibrated
    ## use name of conP list element to define numerical variable
    set(dat, j = "fVariableForCalibrationIPF",
        value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] *
                             dat[[numericalWeightingVar]],
                           combined_factors, con_current))
    set(dat, j = "wValue", value = dat[["value"]] /
          dat[["fVariableForCalibrationIPF"]])

    # try to divide the weight between units with larger/smaller value in the
    #   numerical variable linear
    dat[, fVariableForCalibrationIPF := numericalWeighting(
      head(wValue, 1), head(value, 1), get(numericalWeightingVar),
      get(variableKeepingTheCalibWeight)),
      by = eval(paste0("combined_factors_", i))]
  } else {
    # categorical variable to be calibrated
    set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(
      dat[[variableKeepingTheCalibWeight]], combined_factors, con_current))
  }
  if (dat[!is.na(fVariableForCalibrationIPF),
          any(abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)]) {
    ## to be safe, should this use abs(epsPcur)? But presumably nobody passes
    ##   negative eps values.
    if (verbose && calIter %% 10 == 0) {
      message(calIter, ": not yet converged for P-constraint ", i, "\n")
      if (calIter %% 100 == 0) {

        dat[, selectGroupNotConverged := any(!is.na(fVariableForCalibrationIPF) &
                                               (abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)), by= c(pColNames[[i]])]
        tmp <- dat[
          selectGroupNotConverged == TRUE,
          list(
            maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N,
            epsP = head(epsPcur, 1),
            CalibMargin = {
              if (!is.null(numericalWeightingVar)) {
                sum(get(variableKeepingTheCalibWeight) *
                      get(numericalWeightingVar))
              }else{
                sum(get(variableKeepingTheCalibWeight))
              }
            },
            PopMargin = head(value, 1)),
          by = eval(pColNames[[i]])]
        dat[, selectGroupNotConverged := NULL]

        print(tmp[order(maxFac, decreasing = TRUE), ])
        message("-----------------------------------------\n")
      }
    }
    if (!is.null(bound) || !is.null(minMaxTrim)) {
      dat[!is.na(fVariableForCalibrationIPF),
          c(variableKeepingTheCalibWeight) :=
            boundsFak(
              get(variableKeepingTheCalibWeight),
              get(variableKeepingTheBaseWeight), fVariableForCalibrationIPF,
              bound = bound, minMaxTrim = minMaxTrim)]
      #,by=eval(pColNames[[i]])]
    } else {
      dat[!is.na(fVariableForCalibrationIPF),
          c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF *
            get(variableKeepingTheCalibWeight),
          by = eval(paste0("combined_factors_", i))]
    }
    error <- TRUE
  }
  setnames(dat, "value", valueP[i])
  setnames(dat, "epsPcur", paste0("epsP_", i))
  return(error)
}

## perform one IPF update step for household-level constraint i: if the
## constraint is not yet met within its tolerance (epsH_i), the calibration
## weight column in `dat` is updated by reference and the returned convergence
## flag `error` is set to TRUE
calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter,
                   looseH, numericalWeighting, numericalWeightingVar, w, cw, minMaxTrim) {
  variableKeepingTheBaseWeight <- w
  variableKeepingTheCalibWeight <- cw
  epsHcur <- OriginalSortingVariable <- V1 <-
    epsvalue <- fVariableForCalibrationIPF <- NULL
  maxFac <- temporary_hvar <-
    value <- wValue <- representativeHouseholdForCalibration <- NULL

  setnames(dat, valueH[i], "value")
  setnames(dat, paste0("epsH_", i), "epsHcur")

  combined_factors <- dat[[paste0("combined_factors_h_", i)]]
  tmp <- data.table(x = factor(levels(combined_factors)))
  setnames(tmp, "x", paste0("combined_factors_h_", i))
  con_current <- dat[tmp, on = paste0("combined_factors_h_", i),
                     mult = "first", value]
  if (!is.null(numericalWeightingVar)) {
    ## numerical variable to be calibrated
    ## use name of conH list element to define numerical variable
    set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(
      dat[[variableKeepingTheCalibWeight]] *
        dat[["representativeHouseholdForCalibration"]] *
      dat[[numericalWeightingVar]], combined_factors, con_current))
    set(dat, j = "wValue", value = dat[["value"]] /
          dat[["fVariableForCalibrationIPF"]])

    # try to divide the weight between units with larger/smaller value in the
    #   numerical variable linear
    dat[, fVariableForCalibrationIPF := numericalWeighting(
      head(wValue, 1), head(value, 1), get(numericalWeightingVar),
      get(variableKeepingTheCalibWeight)),
      by = eval(paste0("combined_factors_h_", i))]

  } else {
    # categorical variable to be calibrated
    set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(
      dat[[variableKeepingTheCalibWeight]] *
        dat[["representativeHouseholdForCalibration"]],
      combined_factors, con_current))
  }

  set(dat, j = "wValue", value = dat[["value"]] /
        dat[["fVariableForCalibrationIPF"]])

  if (dat[!is.na(fVariableForCalibrationIPF),
          any(abs(1 / fVariableForCalibrationIPF - 1) > epsHcur)]) {
    if (verbose && calIter %% 10 == 0) {
      message(calIter, ": not yet converged for H-constraint ", i, "\n")
      if (calIter %% 100 == 0) {
        tmp <- dat[
          !is.na(fVariableForCalibrationIPF) &
            (abs(1 / fVariableForCalibrationIPF - 1) > epsHcur),
          list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N,
               epsH = head(epsHcur, 1),
               sumCalibWeight = sum(get(variableKeepingTheCalibWeight) *
                                      representativeHouseholdForCalibration),
               PopMargin = head(value, 1)),
          by = eval(hColNames[[i]])]
        print(tmp[order(maxFac, decreasing = TRUE), ])

        message("-----------------------------------------\n")
      }
    }
    if (!is.null(bound) || !is.null(minMaxTrim)) {
      if (!looseH) {
        set(dat, j = variableKeepingTheCalibWeight, value = boundsFak(
          g1 = dat[[variableKeepingTheCalibWeight]],
          g0 = dat[[variableKeepingTheBaseWeight]],
          f = dat[["fVariableForCalibrationIPF"]],
          bound = bound, minMaxTrim = minMaxTrim))
      }else{
        set(dat, j = variableKeepingTheCalibWeight, value = boundsFakHH(
          g1 = dat[[variableKeepingTheCalibWeight]],
          g0 = dat[[variableKeepingTheBaseWeight]],
          eps = dat[["epsHcur"]], orig = dat[["value"]],
          p = dat[["wValue"]], bound = bound, minMaxTrim = minMaxTrim)
        )
      }
    } else {
      dat[, c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF *
            get(variableKeepingTheCalibWeight),
          by = eval(paste0("combined_factors_h_", i))]
    }
    error <- TRUE
  }

  setnames(dat, "value", valueH[i])
  setnames(dat, "epsHcur", paste0("epsH_", i))
  return(error)
}

## recreate the formula argument to xtabs based on conP, conH
getFormulas <- function(con, w) {
  formOut <- NULL
  for (i in seq_along(con)) {
    lhs <- names(con)[i]
    if (is.null(lhs) || lhs == "") {
      lhs <- w
    } else {
      lhs <- paste(lhs, "*", w)
    }
    rhs <- paste(names(dimnames(con[[i]])), collapse = "+")
    formOut[[i]] <- formula(paste(lhs, "~", rhs), env = .GlobalEnv)
  }
  formOut
}
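
# Sketch of the resulting formulas (illustrative): for
#   conP = list(conP2, eqIncome = conP3)   # constraints as in the examples below
# and w = "calibWeight", getFormulas() returns
#   calibWeight ~ gender + region
#   eqIncome * calibWeight ~ gender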

## enrich dat_original with the calibrated weights and assign attributes

addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original,
                                    maxIter, calIter, returnNA, cw, verbose, looseH) {
  variableKeepingTheCalibWeight <- cw
  representativeHouseholdForCalibration <- OriginalSortingVariable <- NULL
  outTable <- copy(dat_original)

  formP <- getFormulas(conP, w = variableKeepingTheCalibWeight)
  formH <- getFormulas(conH, w = variableKeepingTheCalibWeight)

  # general information
  setattr(outTable, "iterations", min(maxIter, calIter))

  # input constraints
  setattr(outTable, "conP", conP)
  setattr(outTable, "conH", conH)

  # adjusted constraints (conP, conH according to the calibrated weights)
  conP_adj <- lapply(formP, xtabs, dat)
  conH_adj <- lapply(
    formH, xtabs, dat[representativeHouseholdForCalibration == 1])
  setattr(outTable, "conP_adj", conP_adj)
  setattr(outTable, "conH_adj", conH_adj)

  # tolerances
  setattr(outTable, "epsP", epsP)
  setattr(outTable, "epsH", epsH)

  setkey(dat, OriginalSortingVariable)


  # convergence
  conP_converged <- sapply(seq_along(conP), function(i) {
    epsP_current <- switch(is.list(epsP) + 1, epsP, epsP[[i]])
    all(abs(conP[[i]] - conP_adj[[i]]) <= epsP_current * conP[[i]])
  })
  if (looseH) {

    # additional tolerance granted when looseH = TRUE: 9.99 times the power of
    #   ten one order below epsH, e.g. inc(0.01) == 0.00999
    inc <- function(x) {
      10^(floor(log10(x)) - 1) * 9.99
    }

    conH_converged <- sapply(seq_along(conH), function(i) {
      epsH_current <- switch(is.list(epsH) + 1, epsH, epsH[[i]])
      if (verbose) {
        message("For looseH = TRUE, a tolerance of epsH + ", inc(epsH_current),
                " is allowed for convergence.")
      }
      all(abs(conH[[i]] - conH_adj[[i]]) / conH[[i]] <=
            (epsH_current + inc(epsH_current)))
    })
  }else{
    conH_converged <- sapply(seq_along(conH), function(i) {
      epsH_current <- switch(is.list(epsH) + 1, epsH, epsH[[i]])
      all(abs(conH[[i]] - conH_adj[[i]]) <= epsH_current * conH[[i]])
    })
  }
  converged <- all(conP_converged) && all(conH_converged)
  setattr(outTable, "converged", converged)
  if (verbose) {
    if (converged)
      message("Convergence reached")
    else
      message("No convergence reached")
  }

  # add calibrated weights. Use setkey to make sure the indexes match
  setkey(dat, OriginalSortingVariable)

  if (!converged & returnNA) {
    outTable[, c(variableKeepingTheCalibWeight) := NA]
  } else {
    outTable[, c(variableKeepingTheCalibWeight) :=
               dat[[variableKeepingTheCalibWeight]]]
  }

  # formulas
  setattr(outTable, "formP", formP)
  setattr(outTable, "formH", formH)

  # not used yet
  #class(outTable) <- c("ipf", class(outTable))

  invisible(outTable)
}
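
# Sketch: the attributes set above can be read off the result of ipf(), e.g.
# with out <- ipf(...):
#   attr(out, "converged")  # did the algorithm converge?
#   attr(out, "conP_adj")   # constraints re-evaluated with the calibrated weights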


#' Iterative Proportional Fitting
#'
#' Adjust sampling weights to given totals based on household-level and/or
#' individual level constraints.
#'
#' This function implements the weighting procedure described
#' here: \doi{10.17713/ajs.v45i3.120}.
#' Usage examples can be found in the corresponding vignette
#' (`vignette("ipf")`).
#'
#' `conP` and `conH` are contingency tables, which can be created with `xtabs`.
#' The `dimnames` of those tables should match the names and levels of the
#' corresponding columns in `dat`.
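#'
#' For example (a minimal sketch, assuming `dat` already carries a base weight
#' column called `pWeight`), a person-level constraint on an `age` column can
#' be built as
#'
#' ```r
#' conP_age <- xtabs(pWeight ~ age, data = dat)
#' ```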
#'
#' `maxIter`, `epsP` and `epsH` are the stopping criteria. `epsP` and `epsH`
#' describe relative tolerances in the sense that
#' \deqn{1-epsP < \frac{w_{i+1}}{w_i} < 1+epsP}{1-epsP < w(i+1)/w(i) < 1+epsP}
#' will be used as convergence criterion. Here `i` is the iteration step and
#' `w_i` is the weight of a specific person at step `i`.
#'
#' The algorithm
#' performs best if all variables occurring in the constraints (`conP` and
#' `conH`) as well as the household variable are coded as `factor`-columns in
#' `dat`. Otherwise, conversions will be necessary which can be monitored with
#' the `conversion_messages` argument. Setting `check_hh_vars` to `FALSE` can
#' also increase the performance of the scheme.
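#'
#' A minimal sketch (assuming a person constraint `conP1` with a dimension
#' named `age`) of pre-converting a column to a matching factor:
#'
#' ```r
#' dat$age <- factor(dat$age, levels = dimnames(conP1)$age)
#' ```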
#'
#' @name ipf
#' @md
#' @aliases ipf
#' @param dat a `data.table` containing household ids (optionally), base
#'   weights (optionally), household and/or personal level variables (numerical
#'   or categorical) that should be fitted.
#' @param hid name of the column containing the household-ids within `dat` or
#'   NULL if such a variable does not exist.
#' @param w name of the column containing the base weights within `dat` or
#'   NULL if such a variable does not exist. In the latter case, every
#'   observation in `dat` is assigned a starting weight of 1.
#' @param conP list or (partly) named list defining the constraints on person
#'   level.  The list elements are contingency tables in array representation
#'   with dimnames corresponding to the names of the relevant calibration
#'   variables in `dat`. If a numerical variable is to be calibrated, the
#'   respective list element has to be named with the name of that numerical
#'   variable. Otherwise the list element should NOT be named.
#' @param conH list or (partly) named list defining the constraints on
#'   household level.  The list elements are contingency tables in array
#'   representation with dimnames corresponding to the names of the relevant
#'   calibration variables in `dat`. If a numerical variable is to be
#'   calibrated, the respective list element has to be named with the name of
#'   that numerical variable. Otherwise the list element should NOT be named.
#' @param epsP numeric value or list (of numeric values and/or arrays)
#'   specifying the convergence limit(s) for `conP`. The list can contain
#'   numeric values and/or arrays which must appear in the same order as the
#'   corresponding constraints in `conP`. Also, an array must have the same
#'   dimensions and dimnames as the corresponding constraint in `conP`.
#' @param epsH numeric value or list (of numeric values and/or arrays)
#'   specifying the convergence limit(s) for `conH`. The list can contain
#'   numeric values and/or arrays which must appear in the same order as the
#'   corresponding constraints in `conH`. Also, an array must have the same
#'   dimensions and dimnames as the corresponding constraint in `conH`.
#' @param verbose if TRUE, some progress information will be printed.
#' @param bound numeric value specifying the multiplier for determining the
#'   weight trimming boundary if the change of the base weights should be
#'   restricted, i.e. if the weights should stay between `w`/`bound`
#'   and `w`*`bound`.
#' @param minMaxTrim numeric vector of length 2; the first element is a minimum
#'   value and the second element a maximum value to which the weights are
#'   trimmed.
#' @param maxIter numeric value specifying the maximum number of iterations
#' that should be performed.
#' @param meanHH if TRUE, every person in a household is assigned the mean of
#'   the person weights corresponding to the household. If `"geometric"`, the
#'   geometric mean is used rather than the arithmetic mean.
#' @param allPthenH if TRUE, all the person level calibration steps are
#'   performed before the household level calibration steps (and `meanHH`, if
#'   specified). If FALSE, the household level calibration steps (and `meanHH`,
#'   if specified) are performed after every person level calibration step.
#'   This can lead to better convergence properties in certain cases but also
#'   means that the total number of calibration steps is increased.
#' @param returnNA if TRUE, the calibrated weight will be set to NA in case of
#'   no convergence.
#' @param looseH if FALSE, the actual constraints `conH` are used for
#'   calibrating all the hh weights. If TRUE, only the weights for which the
#'   lower and upper thresholds defined by `conH` and `epsH` are exceeded are
#'   calibrated. They are however not calibrated against the actual constraints
#'   `conH` but against these lower and upper thresholds, i.e.
#'   `conH`-`conH`*`epsH` and `conH`+`conH`*`epsH`.
#' @param numericalWeighting See [numericalWeighting]
#' @param check_hh_vars If `TRUE` check for non-unique values inside of a
#'   household for variables in household constraints
#' @param conversion_messages show a message, if inputs need to be reformatted.
#'   This can be useful for speed optimizations if ipf is called several times
#'   with similar inputs (for example bootstrapping)
#' @param nameCalibWeight character defining the name of the variable for the
#'   newly generated calibrated weight.
#' @return The function will return the input data `dat` with the calibrated
#'   weights `calibWeight` as an additional column as well as attributes. If no
#'   convergence has been reached in `maxIter` steps, and `returnNA` is `TRUE`
#'   (the default), the column `calibWeight` will only consist of `NA`s. The
#'   attributes of the table are attributes derived from the `data.table` class
#'   as well as the following.
#' \tabular{ll}{
#'   `converged` \tab Did the algorithm converge in `maxIter` steps? \cr
#'   `iterations` \tab The number of iterations performed. \cr
#'   `conP`, `conH`, `epsP`, `epsH` \tab See Arguments. \cr
#'   `conP_adj`, `conH_adj` \tab Adjusted versions of `conP` and `conH` \cr
#'   `formP`, `formH` \tab Formulas that were used to calculate `conP_adj` and
#'   `conH_adj` based on the output table.
#' }
#' @export ipf
#' @author Alexander Kowarik, Gregor de Cillia
#' @examples
#' \dontrun{
#'
#' # load data
#' eusilc <- demo.eusilc(n = 1, prettyNames = TRUE)
#'
#' # personal constraints
#' conP1 <- xtabs(pWeight ~ age, data = eusilc)
#' conP2 <- xtabs(pWeight ~ gender + region, data = eusilc)
#' conP3 <- xtabs(pWeight*eqIncome ~ gender, data = eusilc)
#'
#' # household constraints
#' conH1 <- xtabs(pWeight ~ hsize + region, data = eusilc)
#'
#' # simple usage ------------------------------------------
#'
#' calibweights1 <- ipf(
#'   eusilc,
#'   conP = list(conP1, conP2, eqIncome = conP3),
#'   bound = NULL,
#'   verbose = TRUE
#' )
#'
#' # compare personal weight with the calibweigth
#' calibweights1[, .(hid, pWeight, calibWeight)]
#'
#' # advanced usage ----------------------------------------
#'
#' # use an array of tolerances
#' epsH1 <- conH1
#' epsH1[1:4, ] <- 0.005
#' epsH1[5, ] <- 0.2
#'
#' # create an initial weight for the calibration
#' eusilc[, regSamp := .N, by = region]
#' eusilc[, regPop := sum(pWeight), by = region]
#' eusilc[, baseWeight := regPop/regSamp]
#'
#' # calibrate
#' calibweights2 <- ipf(
#'   eusilc,
#'   conP = list(conP1, conP2),
#'   conH = list(conH1),
#'   epsP = 1e-6,
#'   epsH = list(epsH1),
#'   bound = 4,
#'   w = "baseWeight",
#'   verbose = TRUE
#' )
#'
#' # show an adjusted version of conP and the original
#' attr(calibweights2, "conP_adj")
#' attr(calibweights2, "conP")
#' }
ipf <- function(
  dat, hid = NULL, conP = NULL, conH = NULL, epsP = 1e-6, epsH = 1e-2,
  verbose = FALSE, w = NULL, bound = 4, maxIter = 200, meanHH = TRUE,
  allPthenH = TRUE, returnNA = TRUE, looseH = FALSE, numericalWeighting =
    computeLinear, check_hh_vars = TRUE, conversion_messages = FALSE,
  nameCalibWeight = "calibWeight", minMaxTrim = NULL) {

  check_population_totals(conP, dat, "personal")
  check_population_totals(conH, dat, "household")
  check_eps(conP, epsP, type = "personal")
  check_eps(conH, epsH, type = "household")
  variableKeepingTheBaseWeight <- w
  variableKeepingTheCalibWeight <- nameCalibWeight
  if ("variableKeepingTheBaseWeight" %in% names(dat))
    stop("The provided dataset must not have a column called",
         " 'variableKeepingTheBaseWeight'")
  if(!is.null(minMaxTrim)){
    if(length(minMaxTrim)!=2)
      stop("minMaxTrim must have exactly 2 elements, a minimum and a maximum.")
    if(!is.numeric(minMaxTrim)){
      stop("minMaxTrim must be a numeric vector of length two.")
    }
    if(minMaxTrim[2]<minMaxTrim[1]){
      stop("minMaxTrim must have a minimum as the first element and a maximum ",
           "as the second element, but the second element provided is smaller ",
           "than the first.")
    }
  }
  OriginalSortingVariable <- V1 <- epsvalue <-
    f <- temporary_hvar <-
    value <- wValue <- representativeHouseholdForCalibration <- ..hid <- NULL
  dat_original <- dat
  dat <- copy(dat)
  ## the original sorting would be lost without this
  dat[, OriginalSortingVariable := .I]
  meanfun <- getMeanFun(meanHH)

  # dat should be a data.table
  # w is the name of a base weight column, or NULL
  valueP <- paste0("valueP", seq_along(conP))
  ### fixed target values, should not be changed in iterations
  valueH <- paste0("valueH", seq_along(conH))
  ### housekeeping of the variable names used internally
  usedVarNames <- c(valueP, valueH, "value",
                    "representativeHouseholdForCalibration", "wValue")

  if (any(names(dat) %in% usedVarNames)) {
    renameVars <- names(dat)[names(dat) %in% usedVarNames]
    setnames(dat, renameVars, paste0(renameVars, "_safekeeping"))
  }
  ### treatment of hid: create a 0/1 indicator for being the first household member
  #delVars <- c()
  if (is.null(hid)) {
    #delVars <- c("hid")
    hid <- "hid"
    dat[, hid := as.factor(seq_len(nrow(dat)))]
    dat[, representativeHouseholdForCalibration := 1]
  } else {

    if(!hid%in%colnames(dat)){
      stop("dat does not contain column ",hid)
    }

    if(any(is.na(dat[[hid]]))){
      stop("hid contains missing values")
    }

    if (!is.factor(dat[[hid]]))
      data.table::set(dat, NULL, hid, as.factor(dat[[hid]]))
    dat[, representativeHouseholdForCalibration :=
          as.numeric(!duplicated(get(..hid)))]
  }

  ## Names of the calibration variables for Person and household dimension
  pColNames <- lapply(conP, function(x) names(dimnames(x)))
  hColNames <- lapply(conH, function(x) names(dimnames(x)))

  for (i in seq_along(conP)) {
    current_colnames <- pColNames[[i]]

    for (colname in current_colnames) {
      if (!inherits(dat[[colname]], "factor")) {
        if (conversion_messages)
          message("converting column ", colname, " to factor")
        set(
          dat, j = colname,
          value = factor(dat[[colname]],
                         levels = dimnames(conP[[i]])[[colname]])
        )
      }
      else if (!identical(levels(dat[[colname]]),
                          dimnames(conP[[i]])[[colname]])) {
        if (conversion_messages)
          message("correct levels of column ", colname)
        set(
          dat, j = colname, value = factor(
            dat[[colname]], levels = dimnames(conP[[i]])[[colname]])
        )
      }
    }
    combined_factors <- combine_factors(dat, conP[[i]])
    set(dat, j = paste0("combined_factors_", i), value = combined_factors)
    set(dat, j = paste0("valueP", i),
        value = as.vector(conP[[i]][combined_factors]))
  }
  for (i in seq_along(conH)) {
    colnames <- hColNames[[i]]

    ## make sure the columns mentioned in the contingency table are in fact
    ##   factors
    for (colname in colnames) {
      if (!inherits(dat[[colname]], "factor")) {
        if (conversion_messages)
          message("converting column ", colname, " to factor")
        set(
          dat, j = colname, value = factor(
            dat[[colname]], levels = dimnames(conH[[i]])[[colname]])
        )
      }
      else if (!identical(levels(dat[[colname]]),
                          dimnames(conH[[i]])[[colname]])) {
        if (conversion_messages)
          message("correct levels of column ", colname)
        set(
          dat, j = colname, value = factor(
            dat[[colname]], levels = dimnames(conH[[i]])[[colname]])
        )
      }
    }

    combined_factors <- combine_factors(dat, conH[[i]])

    set(dat, j = paste0("combined_factors_h_", i), value = combined_factors)
    set(dat, j = paste0("valueH", i),
        value = as.vector(conH[[i]][combined_factors]))
  }

  if (is.null(variableKeepingTheBaseWeight)) {
    if (!is.null(bound) && is.null(w))
      stop("Bounds are only reasonable if base weights are provided")
    set(dat, j = variableKeepingTheCalibWeight, value = 1)
  } else {
    set(dat, j = variableKeepingTheCalibWeight,
        value = dat[[variableKeepingTheBaseWeight]])
  }

  if (check_hh_vars) {
    ## check for non-unique values inside a household for variables used
    ##   in household constraints
    for (hh in hColNames) {
      for (h in hh) {
        setnames(dat, h, "temporary_hvar")
        if (dat[, length(unique(temporary_hvar)),
                by = c(hid)][, any(V1 != 1)]) {
          stop(paste(h, "has different values inside a household"))
        }
        setnames(dat, "temporary_hvar", h)
      }
    }
  }

  if (is.list(epsP)) {
    for (i in seq_along(epsP)) {
      if (is.array(epsP[[i]])) {
        combined_factors <- dat[[paste0("combined_factors_", i)]]
        set(dat, j = paste0("epsP_", i),
            value = as.vector(epsP[[i]][combined_factors]))
      } else {
        set(dat, j = paste0("epsP_", i), value = epsP[[i]])
      }
    }
  } else {
    for (i in seq_along(conP)) {
      set(dat, j = paste0("epsP_", i), value = epsP)
    }
  }
  if (is.list(epsH)) {
    for (i in seq_along(epsH)) {
      if (is.array(epsH[[i]])) {
        combined_factors <- dat[[paste0("combined_factors_h_", i)]]
        set(dat, j = paste0("epsH_", i),
            value = as.vector(epsH[[i]][combined_factors]))
      } else {
        set(dat, j = paste0("epsH_", i), value = epsH[[i]])
      }
    }
  } else {
    for (i in seq_along(conH)) {
      set(dat, j = paste0("epsH_", i), value = epsH)
    }
  }
  ###Calib
  error <- TRUE
  calIter <- 1
  while (error && calIter <= maxIter) {
    error <- FALSE

    if (allPthenH) {
      ### Person calib
      for (i in seq_along(conP)) {
        numericalWeightingTmp <- NULL
        if (isTRUE(names(conP)[i] != "")) {
          numericalWeightingTmp <- names(conP)[i]
        }
        error <- calibP(
          i = i, dat = dat, error = error, valueP = valueP,
          pColNames = pColNames, bound = bound, verbose = verbose,
          calIter = calIter, numericalWeighting = numericalWeighting,
          numericalWeightingVar = numericalWeightingTmp,
          w = variableKeepingTheBaseWeight,
          cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
      }
      ## replace person weight with household average
      set(dat, j = variableKeepingTheCalibWeight,
          value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]]))

      ### Household calib
      for (i in seq_along(conH)) {
        numericalWeightingTmp <- NULL
        if (isTRUE(names(conH)[i] != "")) {
          numericalWeightingTmp <- names(conH)[i]
        }
        error <- calibH(
          i = i, dat = dat, error = error, valueH = valueH,
          hColNames = hColNames, bound = bound, verbose = verbose,
          calIter = calIter, looseH = looseH,
          numericalWeighting = numericalWeighting,
          numericalWeightingVar = numericalWeightingTmp,
          w = variableKeepingTheBaseWeight,
          cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
      }
    } else {
      ### Person calib
      for (i in seq_along(conP)) {
        numericalWeightingTmp <- NULL
        if (isTRUE(names(conP)[i] != "")) {
          numericalWeightingTmp <- names(conP)[i]
        }
        error <- calibP(
          i = i, dat = dat, error = error, valueP = valueP,
          pColNames = pColNames, bound = bound, verbose = verbose,
          calIter = calIter, numericalWeighting = numericalWeighting,
          numericalWeightingVar = numericalWeightingTmp,
          w = variableKeepingTheBaseWeight,
          cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)

        ## replace person weight with household average
        set(dat, j = variableKeepingTheCalibWeight,
            value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]]))

        ### Household calib
        for (i in seq_along(conH)) {
          numericalWeightingTmp <- NULL
          if (isTRUE(names(conH)[i] != "")) {
            numericalWeightingTmp <- names(conH)[i]
          }
          error <- calibH(
            i = i, dat = dat, error = error, valueH = valueH,
            hColNames = hColNames, bound = bound, verbose = verbose,
            calIter = calIter, numericalWeighting = numericalWeighting,
            numericalWeightingVar = numericalWeightingTmp, looseH = looseH,
            w = variableKeepingTheBaseWeight,
            cw = variableKeepingTheCalibWeight, minMaxTrim = minMaxTrim)
        }
      }
    }

    if (verbose && !error) {
      message("Iteration stopped after ", calIter, " steps")
    } else if (error && maxIter == calIter) {
      warning("Not converged in ", maxIter, " steps")
    }
    calIter <- calIter + 1
  }
  # Remove Help Variables
  fVariableForCalibrationIPF <- NULL
  dat[, fVariableForCalibrationIPF := NULL]
  addWeightsAndAttributes(dat, conP, conH, epsP, epsH, dat_original, maxIter,
                          calIter, returnNA, variableKeepingTheCalibWeight,
                          verbose, looseH)
}
