R/calibrate.R

Defines functions checkSolverStatus diffCalculation trimmedCalibrate trimWeight createFormula computeCalibrate createMargins numericTargets categoricalTargets convertToDataFrame Calibrate

Documented in Calibrate

#' \code{Calibrate}
#' @description Creates a sampling weight based on raking/calibration. Raking is used if it can be used (i.e.,
#' no numeric adjustment variables and \code{always.calibrate} not \code{FALSE})
#' @param categorical.variables An optional list or data frame of categorical adjustment variables.
#' @param categorical.targets The target probabilities for each category listed in \code{categorical.variables}
#' @param numeric.variables An optional list or data frame of categorical adjustment variables.
#' @param numeric.targets the target mean for each numeric variable in numeric.variables
#' @param lower A lower bound weight value (not guaranteed to be achieved).
#' @param upper An upper bound weight value (not guaranteed to be achieved).
#' @param trim.iterations The number of times to run the trim loop over the final weightings
#' @param always.calibrate If \code{FALSE}, which is the default,
#' problems with only categorical adjustment variables are solved via
#' iterative-proportional fitting (raking). Otherwise, they are solved via calibration.
#' @param package The R package used to calibrate the model when raking is not conducted.
#' Defaults to \code{CVXR} (see https://cvxr.rbind.io/cvxr_examples/cvxr_survey_calibration/). Other options
#' are \code{icarus} and \code{survey}. .
#' Use icarus with care, as sometimes target categories can be switched around
#' @param subset A logical vector indicating which subset of cases should be used to create the weight
#' @param input.weight An optional weight variable; if supplied, the created weight is created to be as close
#' to this input.weight as possible
#' @return numeric A vector of weights
#' @importFrom icarus calibration
#' @export
Calibrate <- function(categorical.variables = NULL,
                      categorical.targets = NULL,
                      numeric.variables = NULL,
                      numeric.targets = NULL,
                      lower = NA,
                      upper = NA,
                      trim.iterations = 20,
                      package = c("CVXR", "survey", "icarus")[1],
                      always.calibrate = FALSE,
                      subset = NULL,
                      input.weight = NULL)
{

    # Preparing inputs
    adjustment.variables = NULL
    targets = list()

    if ((is.null(categorical.variables) || length(categorical.variables) == 0) && (is.null(numeric.variables) || length(numeric.variables) == 0)) {
        stop("Nothing to do! At least one categorical OR numeric variable required.")
    }

    # Categorical inputs
    if (!is.null(categorical.variables))
    {
        adjustment.variables = convertToDataFrame(categorical.variables)
        categorical.targets = if (is.null(categorical.targets) || is.list(categorical.targets)) categorical.targets else list(categorical.targets)
        targets = categoricalTargets(adjustment.variables, categorical.targets, subset)
    }
    n.categorical = length(targets)

    # Numeric inputs
    if (!is.null(numeric.variables))
    {
        num.adjustment.variables = convertToDataFrame(numeric.variables)
        adjustment.variables = if (is.null(categorical.variables)) num.adjustment.variables else cbind(adjustment.variables, num.adjustment.variables)
        targets = numericTargets(targets, adjustment.variables, numeric.targets, subset)
    }

    # Bounds
    if (is.na(lower))
        lower = 0
    if (is.na(upper))
        upper = Inf

    # Adding the weight variable or a proxy (and normalizing to a mean of 1)
    n = NROW(adjustment.variables)
    weight = if (is.null(input.weight)) rep(1, n) else input.weight / mean(input.weight)

    # Filtering data if required
    if (!is.null(subset) & length(subset) > 1)
    {
        filter.lookup = which(subset)
        adjustment.variables = adjustment.variables[filter.lookup, , drop = FALSE]
        weight = weight[filter.lookup]
    }

    # Removing empty factor levels
    if (n.categorical > 0)
        for (i in 1:n.categorical) # ordered = FALSE needed for weirdness in survey package
            adjustment.variables[[i]] = factor(adjustment.variables[[i]], ordered = FALSE)

    # Working out if to rake
    raking = n.categorical == length(adjustment.variables) & !always.calibrate
    if (raking & min(sapply(targets, min)) == 0)
        stop("One of your targets is set to 0. To achieve this, remove this category from the target and instead apply a filter to the weight to remove the category (or, in the R CODE set always.calibrate = TRUE)")

    # Creating the table of margins/targets in the desired format
    marg = createMargins(targets, adjustment.variables, n.categorical, raking, package)

    # Calculating/updating the weight
    result <- trimmedCalibrate(adjustment.variables, marg, weight, lower, upper, trim.iterations, raking, package)

    if (length(result) < n)
    {
        out = rep(NA, length(subset))
        out[filter.lookup] = result
        result = out
    }
    class(result) <- "Calibrate"
    result
}

# Converts lists to data fames, checking the data for errors along the way.
#' @importFrom verbs SumEachColumn
convertToDataFrame <- function(x)
{
    x = as.data.frame(x, stringsAsFactors = TRUE)
    # Check that data frame is complete
    var.missing = SumEachColumn(is.na(x), remove.missing = FALSE) > 0
    if (any(var.missing))
    {
        var.missing = paste(names(var.missing)[var.missing], collapse = ", ")
        stop("One or more of the adjustment variables contains missing values; you cannot have missing values in adjustment variables: ", var.missing)
    }
    x
}

## Checks and tidies categorical targets
#' @importFrom stringr str_trim
#' @importFrom verbs Sum
categoricalTargets <- function(adjustment.variables, categorical.targets, subset)
{
    targets = list()
    if (missing(subset))
        subset <- rep(TRUE, nrow(adjustment.variables))
    n.categorical = length(adjustment.variables)
    if (n.categorical != length(categorical.targets)) {
        stop("The number of categorical adjustment variables needs to be the same as the number of sets of targets (it isn't)")
    }
    for (i in 1 : n.categorical)
    {
        tgt = categorical.targets[[i]]
        targets[[i]] = suppressWarnings(as.numeric(str_trim(tgt[, 2])))
        names(targets[[i]]) = tgt[, 1]
        adj.variable = if(is.null(subset)) droplevels(adjustment.variables[[i]]) else droplevels(adjustment.variables[[i]][subset])
        adj.unique = levels(adj.variable)
        missing.targets = ! adj.unique %in% tgt[, 1]
        varname = names(adjustment.variables)[i]
        if(any(idx <- is.na(targets[[i]])))
        {
            bad.inputs <- paste(paste(tgt[idx, 1], tgt[idx, 2], sep = " - "), collapse = ", ")
            stop("Invalid target values for ", varname, ": ", bad.inputs, ".")
        }

        if (any(missing.targets))
        {
            missing.cats = paste0(adj.unique[missing.targets], collapse = ", ")
            if (nchar(missing.cats) > 0)
                stop("No targets have been provided for ", varname, ": ", missing.cats)
        }
        excess.targets = ! tgt[, 1] %in% adj.unique
        if (any(excess.targets))
        {
            excess.cats = paste0(tgt[excess.targets, 1], collapse = ", ")
            if (nchar(excess.cats) > 0)
                stop("Target category that does not appear in variable ", varname, ": ",
                     excess.cats, (if(is.null(subset)) "" else " (after applying filter/subset)"))
        }
        if ((sm = Sum(targets[[i]], remove.missing = FALSE)) != 1)
        {
            if (round(sm, 6) == 1)# Forcing to add up to 1 if difference likely due to rounding errors.
                targets[[i]] = prop.table(targets[[i]])
            else
                stop("Targets for ", varname, " add up to ", sm, "; they need to add up to exactly 1.")
        }
        # Reordering targets to match order of levels
        targets[[i]] = targets[[i]][adj.unique]
    }
    targets
}

# Extract and checks the targets for numeric  adjustment variables
numericTargets <- function(targets, adjustment.variables, numeric.targets, subset)
{
    n.categorical = length(targets)
    n = NROW(adjustment.variables)
    if (missing(subset))
        subset <- rep(TRUE, n)
    if (length(adjustment.variables) - n.categorical != length(numeric.targets))
    {
        stop("The number of numeric adjustment variables needs to be the same as the number of sets of targets (it isn't)")
    }
    n.numeric = length(adjustment.variables) - n.categorical
    for (i in 1:n.numeric)
    {
        i.adjustment.variables = i + n.categorical
        numvar = adjustment.variables[[i.adjustment.variables]]
        if(!is.null(subset))
            numvar = numvar[subset]
        varname = names(adjustment.variables)[i.adjustment.variables]
        targets[[i.adjustment.variables]] = tgt = numeric.targets[i]
        if (is.na(tgt))
            stop("The target for ", varname, " is invalid.")
        if (!is.numeric(numvar))
            stop(varname, " is not a numeric variable; numeric adjustment variables must be numeric.")
        if (max(numvar) < tgt)
            stop("The target for ", varname, " of ", tgt, " is higher than the maximum value of the variable", (if(is.null(subset)) "" else " (after applying filter/subset)"))
        if (min(numvar) > tgt)
            stop("The target for ", varname, " of ", tgt, " is lower than the minimum value of the variable", (if(is.null(subset)) "" else " (after applying filter/subset)"))
    }
    targets
}

# Formats table of margins, as required by the various functions and packages
#' @importFrom stats terms.formula
createMargins <- function(targets, adjustment.variables, n.categorical, raking, package)
{
    # Creating the targets in the required format
    if (raking) # Iterative post-stratification
    {
        margins = list()
        for (i in seq_along(adjustment.variables))
        {
            tgt = targets[[i]]
            out = data.frame(x = names(tgt), Freq = unname(tgt))
            names(out)[1] = names(adjustment.variables)[i]
            margins[[i]] = out
        }
    } else {
        n = NROW(adjustment.variables)
        if (package == "icarus")
        {
            n.cats = sapply(targets, length)
            nms = names(adjustment.variables)
            max.cats = max(n.cats)
            n.vars = length(adjustment.variables)
            tgts = matrix("0", n.vars, max.cats)
            for (i in 1 : n.vars)
            {
                if (i <= n.categorical) {
                    tgts[i, 1 : n.cats[i]] = targets[[i]]
                } else {
                    tgts[i, 1] = targets[[i]] * n # Sums rather than averages in icarus
                    n.cats[i] = 0 # icarus assumes 0 in this field for numeric variables
                }
            }
            margins = cbind(nms, n.cats, tgts)
        }
        else { # survey
            margins = c(1)
            for (i in seq_along(adjustment.variables))
            {
                new.margin = if (i <= n.categorical) targets[[i]][-1] else targets[[i]]
                margins = c(margins, new.margin)
            }
            margins = margins * n
            formula = createFormula(adjustment.variables)
            names(margins) = colnames(model.matrix(terms.formula(formula), data = adjustment.variables))
        }
    }
    margins
}
# Calibration function
#' @importFrom icarus calibration
#' @importFrom survey calibrate rake
#' @importFrom stats model.matrix weights terms.formula
#' @importFrom CVXR Variable Minimize Problem entr solve
#' @importFrom verbs Sum
computeCalibrate <- function(adjustment.variables, margins, input.weight, raking, package)
{
    if (package == "survey" | raking)
        design = svydesign(ids = ~1, weights = ~input.weight, data = adjustment.variables)
    if (raking)
    {
        sample.margins = lapply(names(adjustment.variables), function(x) as.formula(paste("~", x)))
        weights(rake(design,
                      sample.margins,
                      margins,
                      control = list(maxit = 1000, epsilon = 1e-10)))
    } else switch(package,
                  icarus = calibration(data = data.frame(adjustment.variables,
                                                         q_pop_wgt = input.weight),
                                       marginMatrix = margins,
                                       colWeights = "q_pop_wgt",
                                       pct = TRUE,
                                       popTotal = NROW(adjustment.variables),
                                       method = "raking",
                                       description = FALSE),
                  survey = weights(calibrate(design,
                                             createFormula(adjustment.variables),
                                             maxit = 1000,
                                             epsilon = 1e-8,
                                             population = margins,
                                             calfun = "raking")),
                  CVXR = {
                      formula = createFormula(adjustment.variables)
                      # wrapping formula with terms.formula fixes the
                      # "argument "frml" is missing, with no default"
                      # bug on the R server
                      X <- model.matrix(object = terms.formula(formula),
                                        data = adjustment.variables)
                      A <- input.weight * X
                      n <- NROW(X)
                      g <- Variable(n)
                      constraints = list(t(A) %*% g == margins)
                      Phi_R = Minimize(sum(input.weight * (-entr(g) - g + 1)))
                      p = Problem(Phi_R, constraints)
                      res = solve(p)
                      checkSolverStatus(res)
                      as.numeric(input.weight * res$getValue(g))
                      }
                  )
}

# Creating formula as required by survey package
createFormula = function(adjustment.variables)
{
    as.formula(paste("~", paste(names(adjustment.variables), collapse = "+")))
}

# Trims the weights (not really trimming, but this is the language in the literature
trimWeight = function(weight, lower, upper)
{
    weight[weight < lower] = lower
    weight[weight > upper] = upper
    weight / mean(weight)
}

# Trimming of weight (not really trimming, but this is the language in the literature
trimmedCalibrate <- function(adjustment.variables, margins, input.weight, lower, upper, trim.iterations, raking, package)
{
    weight = computeCalibrate(adjustment.variables, margins, input.weight, raking, package)
    # DS-3682: computeCalibrate produces weights which are not normalized to a mean of 1.
    # As a result, the calculations below which compare the weight to upper and lower,
    # which are user-specified bounds for a weight with a mean value of 1, do not produce
    # the desired effect (typically the weight is either not trimmed, or trimming produces
    # a weight where all values are identical because the original values were all below
    # the lower bound.
    weight = weight / mean(weight)
    trims = 0
    prev_diff = Inf
    dif = diffCalculation(weight, lower, upper)
    while (trims < trim.iterations &
           dif > 0.000001 &
           dif < prev_diff)
    {
        #cat("Difference between bounds and weight: ", dif, "\n")
        trims = trims + 1
        weight = trimWeight(weight, lower, upper)
        weight = computeCalibrate(adjustment.variables, margins, weight, raking, package)
        # DS-3682 see above
        weight = weight / mean(weight)
        prev_diff = dif
        dif = diffCalculation(weight, lower, upper)
    }
    trimWeight(weight, lower, upper)
}

# Sum of maximum differences between bounds and weight
diffCalculation = function(weight, lower, upper)
{
    rng = range(weight)
    max(rng[2] - upper, 0) - max(lower - rng[1], 0)
}

#' \code{print.Calibrate}
#' @description Print method for calibrate weights
#' @param x A n object of class 'Calibrate'.
#' @param ... Other ignored parameters
#' @return character string
#' @method print Calibrate
#' @importFrom flipFormat FormatAsReal FormatAsPercent
#' @export
print.Calibrate <- function (x, ...)
{
    x = x[!is.na(x)]
    instruction.for.getting.variable = ""
    product = get0("productName")
    if (!is.null(product))
        instruction.for.getting.variable <- "\n\nTo save the variable, click SAVE VARIABLE(S) > Save Weight Variable from Configuration"
    ess = EffectiveSampleSize(x)
    ess.percent = round(ess / length(x) * 100, 1)
    n = length(x)
    rng = range(x)


    cat(paste0("The weight has been computed for ", FormatAsReal(n, decimals = 0), " observations.\n",
              "The weight has an effective sample size of ",
              FormatAsReal(ess, decimals = 0),
              " (",  ess.percent, "%)\n",
              "Smallest weight is ", FormatAsReal(rng[1], decimals = 3), "\n",
              "Largest weight is ", FormatAsReal(rng[2], decimals = 3), " (", FormatAsReal(rng[2] / rng[1], decimals = 3), " times the smallest weight)",
              instruction.for.getting.variable))
}

#' Check for errors from running CVXR::solve
#' @noRd
checkSolverStatus <- function(solve.output)
{
    if (is.list(solve.output) && is.character(solve.output[["status"]]))
    {
        status <- solve.output[["status"]]
        if (status == "solver_error" || status == "infeasible")
            stop("Calibration could not be performed for the given input data. ",
                 "Please check that the supplied targets are appropriate for your data.")
    }
    return(invisible())
}
NumbersInternational/flipData documentation built on March 2, 2024, 10:52 a.m.