R/reweight.r


#' Reweight a microdata file.
#'
#' Calculate new weights for each household in a microdata file
#' so that (1) selected variables, weighted with the new weights and summed, hit
#' or come close to desired targets, and (2) a measure of distortion based on
#' how much the new weights differ from an initial set of weights is minimized.
#'
#' @param iweights Initial weights, 1 per household, numeric vector length h.
#' @param xmat Data for households. Matrix with 1 row per household and 1 column
#'   per target (h x k matrix). Columns must be named. Each cell is the amount
#'   that a household, when weighted, will add to the corresponding target. For
#'   example, if the column corresponds to the target for total income, the cell
#'   value would be the income for the household. If the column corresponds to
#'   the target for number of married households, then the cell value would be 1
#'   if the household is married, and 0 otherwise.
#' @param targets Named numeric vector of length k. Each element must correspond
#'   to a column of `xmat`.
#' @param tol Additive tolerances, 1 per target. Numeric vector length k.
#'   `reweight` will seek to hit the targets, plus or minus tol.
#' @param xlb Lower bounds for the ratio of new weights to initial weights.
#'   Either a vector of 1 value per household or a scalar that will be used for
#'   all households. Default is 0.
#' @param xub Upper bounds for the ratio of new weights to initial weights.
#'   Either a vector of 1 value per household or a scalar that will be used for
#'   all households. Default is 50.
#' @param method Either "auglag" (the default) or "ipopt". "ipopt" requires
#'   installation of `ipoptr`, which can be difficult. See details.
#' @param optlist Named list of solver options.
#'   If method = "auglag" (the default), see [nloptr::nloptr()] or run
#'   `nloptr::nloptr.print.options()` for the available `nloptr` options.
#'   If method = "ipopt", see [ipoptr::ipoptr()] and the
#'   \href{https://coin-or.github.io/Ipopt/OPTIONS.html}{IPOPT options} documentation.
#' @param quiet TRUE (the default) suppresses solver progress output; FALSE
#'   prints solver progress.
#'
#' @returns A list with the following elements:
#'
#' \describe{
#' \item{solver_message}{The message produced by the solver. For
#' method = "ipopt", see
#' \href{https://coin-or.github.io/Ipopt/OUTPUT.html}{IPOPT output}.}
#' \item{etime}{Elapsed time.}
#' \item{objective_unscaled}{The unscaled objective function value at the solution.}
#' \item{weights}{Numeric vector of new weights.}
#' \item{xfinal}{Numeric vector of the solution: the ratio of each new weight
#' to its initial weight.}
#' \item{targets_df}{Data frame with target names and values, values at the
#' starting point, values at the solution, tolerances, differences, and percent
#' differences. The suffix diff indicates the difference from a target and the
#' suffix pdiff indicates the percent difference from a target.}
#' \item{result}{List with the full output from the solver that was used.}
#' \item{opts}{List of the solver options that were used.}
#' \item{inputs}{List of inputs passed to the solver; useful for debugging.}
#' }
#'
#' @details
#' `reweight` constructs a nonlinear program from the provided inputs. It
#' minimizes a distortion function that is based upon how much weights change
#' from the initial weights, while seeking to satisfy constraints that are the
#' targets +/- tolerances. The default distortion function is given below, where
#' `w_i` is an initial weight and `w_n` is a new weight:
#'
#' \deqn{\sum w_i * (w_n / w_i - 1)^2}
#'
#' Thus, for each household we take the ratio of its new weight to its initial
#' weight, subtract 1, and square the result; we minimize the sum of these
#' squared terms, each weighted by the household's initial weight. Future
#' versions will allow alternative distortion functions.
#'
#' The default is to use an augmented Lagrangian approach implemented with the
#' `nloptr` function in the `nloptr` package, with L-BFGS as the local
#' algorithm. This usually works well on reasonably sized problems but may
#' have difficulty with very large or very difficult problems.
#'
#' If you have \code{\link[ipoptr]{ipoptr}} installed, which uses the
#' \href{https://coin-or.github.io/Ipopt/}{IPOPT solver}, you will be able to
#' solve much larger problems much more quickly by specifying method = "ipopt"
#' in the `reweight` call.
#'
#' @examples
#' # Determine new weights for a small problem using ACS data
#' library(tidyverse)
#' data(acs)
#'
#' # let's focus on income group 5 and create and then try to hit targets for:
#' #    number of records (nrecs -- to be created based on the weight, pwgtp)
#' #    personal income (pincp)
#' #    wages (wagp)
#' #    number of people with wages (wagp_nnz -- to be created)
#' #    supplemental security income (ssip)
#' #    number of people with supplemental security income (ssip_nnz -- to
#' #       be created)
#' # we also need to get pwgtp - the person weight for each record, which
#' #       will be our initial weight
#' # for each "number of" variable we need to create an indicator variable that
#' # defines whether it is true for that record
#'
#' # get the data and prepare it
#' data_df <- acs %>%
#'   filter(incgroup == 5) %>%
#'   select(pwgtp, pincp, wagp, ssip) %>%
#'   # create the indicator variables
#'   mutate(nrecs = 1, # indicator used for number of records
#'          wagp_nnz = (wagp != 0) * 1.,
#'          ssip_nnz = (ssip != 0) * 1.)
#' data_df # 1,000 records
#'
#' iweights <- data_df$pwgtp # initial weights
#'
#' # prepare targets: in practice we would get them from an external source but
#' # in this case we'll get actual sums on the file and perturb them randomly
#' # so that we will need new weights to hit these targets.
#' set.seed(1234)
#' targets_df <- data_df %>%
#'   pivot_longer(-pwgtp) %>%
#'   mutate(wtd_value = value * pwgtp) %>%
#'   group_by(name) %>%
#'   summarise(wtd_value = sum(wtd_value), .groups = "drop") %>%
#'   mutate(target = wtd_value * (1 + rnorm(n(), mean = 0, sd = .02)))
#' # in practice we'd make sure that targets make sense (e.g., not negative)
#' targets_df
#'
#' targets <- targets_df$target
#' names(targets) <- targets_df$name
#' targets
#'
#' tol <- .005 * abs(targets) # use 0.5% as our tolerance
#'
#' # Prepare the matrix of characteristics of each household. These
#' # characteristics must correspond to the targets. Columns must either be in
#' # the same order as the targets, or must have the same names.
#' xmat <- data_df %>% # important that columns be in the same order as the targets
#'   select(all_of(names(targets))) %>% as.matrix
#'
#' res <- reweight(iweights = iweights,
#'                 xmat = xmat,
#'                 targets = targets,
#'                 tol = tol)
#' names(res)
#' res$etime
#' res$objective_unscaled
#' res$targets_df
#' quantile(res$weights)
#' quantile(iweights)
#' quantile(res$weights / iweights)
#'
#' \dontrun{
#' # This example is not run because it uses `ipoptr`, which is not a
#' # requirement for `microweight`.
#'
#' # You can write ipopt output to a text file and even monitor results of a
#' # long-running optimization by opening the file with a text editor, as long
#' # as the editor does not lock the file for writing. Normally you would specify
#' # the path to the output file in a location on your system but for this
#' # example we'll use a temporary file, which we'll call tfile.
#'
#' tfile <- tempfile("tfile",fileext=".txt")
#'
#' opts <- list(output_file = tfile,
#'              file_print_level = 5,
#'              linear_solver = "ma27")
#'
#' res2 <- reweight(iweights = iweights,
#'                 xmat = xmat,
#'                 targets = targets,
#'                 tol = tol,
#'                 method = "ipopt",
#'                 optlist = opts,
#'                 quiet = TRUE) # write progress to tfile, not console
#' names(res2)
#' res2$solver_message
#' res2$etime
#' res2$objective_unscaled
#' res2$targets_df
#'
#' # Normally you would examine the optimization output file in a text editor
#' # For this example, we display it in the console:
#' writeLines(readLines(tfile))
#' unlink(tfile) # delete the temporary file in this example
#' } # end dontrun
#'
#' @export
reweight <- function(iweights,
                     xmat,
                     targets,
                     tol,
                     xlb=0,
                     xub=50,
                     method = 'auglag',
                     optlist = NULL,
                     quiet = TRUE){

  args <- as.list(environment()) # gets explicit and default arguments
  stopifnot(method %in% c('auglag', 'ipopt'))

  check_args(method, args)
  target_names <- names(targets)

  t1 <- proc.time()
  cc_sparse <- get_cc_sparse(xmat, target_names, iweights)

  inputs <- get_inputs(iweights,
                       targets, target_names, tol,
                       cc_sparse,
                       xlb, xub)

  if(method == "auglag") optfn <- call_auglag else
    if(method == "ipopt") optfn <- call_ipopt

  output <- optfn(iweights,
                  targets,
                  target_names,
                  tol,
                  xmat,
                  xlb,
                  xub,
                  optlist,
                  quiet,
                  inputs)
  t2 <- proc.time()

  # define additional values to be returned
  # solver_message <- result$message
  etime <- t2 - t1

  # use if statements below, even though not needed, in case result names change
  if(method == "auglag"){
    solver_message <- output$result$message
    objective_unscaled <- output$result$objective * output$inputs$objscale
    weights <- iweights * output$result$solution
    xfinal <- output$result$solution

  } else if(method == "ipopt"){
    solver_message <- output$result$message
    objective_unscaled <- output$result$objective * output$inputs$objscale
    weights <- iweights * output$result$solution
    xfinal <- output$result$solution
  }

  targets_df <- tibble::tibble(targnum = 1:length(targets),
                       targname = target_names,
                       target = targets) %>%
    dplyr::mutate(targinit = eval_g(inputs$x0, inputs),
           targcalc = eval_g(xfinal, inputs),
           targinit_diff = .data$targinit - .data$target,
           targtol = tol,
           targcalc_diff = .data$targcalc - .data$target,
           targinit_pdiff = .data$targinit_diff / .data$target * 100,
           targtol_pdiff = abs(.data$targtol / .data$target) * 100,
           targcalc_pdiff = .data$targcalc_diff / .data$target * 100)

  keepnames <- c("solver_message",
                 "etime",
                 "objective_unscaled",
                 "weights",
                 "xfinal",
                 "targets_df")
  # output <- list()
  for(var in keepnames) output[[var]] <- get(var)
  # reorder the list
  ordered_names <- c(keepnames, "result", "opts", "inputs")
  output <- output[ordered_names]

  output
}
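
# Illustrative sketch only: the distortion function eval_f_xm1sq and its
# gradient eval_grad_f_xm1sq are defined elsewhere in the package. Assuming x
# is the vector of ratios of new weights to initial weights, the objective
# documented above and its gradient amount to the following (the sketch_ names
# are hypothetical, not part of the package API):
sketch_f_xm1sq <- function(x, inputs){
  # sum of iweight * (x - 1)^2, scaled by objscale
  sum(inputs$iweight * (x - 1)^2) / inputs$objscale
}

sketch_grad_f_xm1sq <- function(x, inputs){
  # derivative of each term of the objective with respect to x
  2 * inputs$iweight * (x - 1) / inputs$objscale
}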


check_args <- function(method, args){
  stopifnot(is.matrix(args$xmat),
            length(args$targets) == length(args$tol),
            length(args$targets) == ncol(args$xmat),
            length(args$iweights) == nrow(args$xmat))

  if(method == "auglag"){
    # print("auglag check arguments")

  } else if(method == "ipopt"){

    if(!is.null(args$optlist)){ # check options list

      if(!is.null(args$optlist$file_print_level)) {
        stopifnot("valid output_file name needed if file_print_level > 0" = !is.null(args$optlist$output_file))
      }

    }
  }
}

call_auglag <- function(iweights,
                        targets,
                        target_names,
                        tol,
                        xmat,
                        xlb,
                        xub,
                        optlist,
                        quiet,
                        inputs){
  # check inputs

  # add inputs needed for auglag
  inputs$objscale <- length(iweights)
  inputs$gscale <- targets / 10
  # inputs$objscale <- 1
  # inputs$gscale <- rep(1, length(targets))

  jac <- matrix(0, nrow=inputs$n_constraints, ncol=inputs$n_variables)
  f <- function(i){
    rep(i, length(inputs$eval_jac_g_structure[[i]]))
  }
  iidx <- lapply(1:length(inputs$eval_jac_g_structure), f) %>% unlist
  jidx <- unlist(inputs$eval_jac_g_structure)
  indexes <- cbind(iidx, jidx)
  jac[indexes] <- inputs$cc_sparse$nzcc

  jac_scaled <- sweep(jac, MARGIN=1, FUN="/", STATS=inputs$gscale)
  jac <- jac_scaled

  inputs$jac_heq <- jac[inputs$i_heq, ]
  inputs$jac_hin <- rbind(-jac[inputs$i_hin, ], jac[inputs$i_hin, ])

  # define auglag options
  local_opts <- list(algorithm = "NLOPT_LD_LBFGS",
                     xtol_rel = 1.0e-4)

  opts <- list(algorithm = "NLOPT_LD_AUGLAG",
               ftol_rel = 1.0e-4,
               maxeval =  5000,
               print_level = 0,
               local_opts = local_opts)

  # update the default option list just defined with any options specified in optlist
  opts <- purrr::list_modify(opts, !!!optlist)
  if(!is.null(opts$maxiter)) opts$maxeval <- opts$maxiter # give priority to a directly passed maxiter
  if(!quiet) opts$print_level <- 1

  result <- nloptr::nloptr(x0 = inputs$x0,
                           eval_f = eval_f_xm1sq,
                           eval_grad_f = eval_grad_f_xm1sq,
                           lb = inputs$xlb,
                           ub = inputs$xub,
                           eval_g_ineq = hin_fn,
                           eval_jac_g_ineq = hin.jac_fn,
                           eval_g_eq = heq_fn,
                           eval_jac_g_eq = heq.jac_fn,
                           opts = opts,
                           inputs = inputs)

  output <- list()
  output$inputs <- inputs
  output$opts <- opts
  output$result <- result
  output
}
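
# Toy illustration (hypothetical helper, not part of the package API) of how
# call_auglag fills the dense Jacobian from the sparse structure:
# eval_jac_g_structure lists, for each constraint i, the column indexes j with
# nonzero coefficients, and cc_sparse$nzcc holds the values in the same
# i-then-j order. With 2 constraints and 3 variables:
sketch_dense_jacobian_demo <- function(){
  demo_structure <- list(c(1L, 3L), c(2L, 3L)) # constraint 1 uses x1, x3; constraint 2 uses x2, x3
  demo_nzcc <- c(10, 30, 22, 33)               # nonzero coefficients, ordered by i then j
  jac <- matrix(0, nrow = 2, ncol = 3)
  iidx <- rep(seq_along(demo_structure), lengths(demo_structure))
  jidx <- unlist(demo_structure)
  jac[cbind(iidx, jidx)] <- demo_nzcc
  jac
  # returns:
  #      [,1] [,2] [,3]
  # [1,]   10    0   30
  # [2,]    0   22   33
}
# call_auglag then scales each row by gscale and splits the rows into jac_heq
# (equality constraints) and jac_hin (stacked lower/upper inequality constraints).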


call_ipopt <- function(iweights,
                       targets,
                       target_names,
                       tol,
                       xmat,
                       xlb,
                       xub,
                       optlist,
                       quiet,
                       inputs){
  # check inputs

  # define defaults for ipopt options
  opts <- list(print_level = 0,
               file_print_level = 0, # integer
               max_iter = 50,
               linear_solver = "ma27", # alternatives: mumps, pardiso, ma57, ma77, ma86, ma97
               jac_c_constant = "yes", # equality constraints are linear in x, so their Jacobian is constant
               jac_d_constant = "yes", # inequality constraints are linear in x, so their Jacobian is constant
               hessian_constant = "yes") # Hessian of the Lagrangian is constant; Ipopt computes it once and reuses it (Ipopt default is "no")

  # update the default option list just defined with any options specified in optlist
  opts <- purrr::list_modify(opts, !!!optlist)
  if(!quiet) opts$print_level <- 5

  result <- ipoptr::ipoptr(x0 = inputs$x0,
                           lb = inputs$xlb,
                           ub = inputs$xub,
                           eval_f = eval_f_xm1sq, # arguments: x, inputs; eval_f_xtop eval_f_xm1sq
                           eval_grad_f = eval_grad_f_xm1sq, # eval_grad_f_xtop eval_grad_f_xm1sq
                           eval_g = eval_g, # constraints LHS - a vector of values
                           eval_jac_g = eval_jac_g,
                           eval_jac_g_structure = inputs$eval_jac_g_structure,
                           eval_h = eval_h_xm1sq, # the hessian is essential for this problem eval_h_xtop eval_h_xm1sq
                           eval_h_structure = inputs$eval_h_structure,
                           constraint_lb = inputs$clb,
                           constraint_ub = inputs$cub,
                           opts = opts,
                           inputs = inputs)

  output <- list()
  output$inputs <- inputs
  output$opts <- opts
  output$result <- result
  output
}
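
# Illustrative sketch (the real eval_jac_g used above is defined elsewhere in
# the package; sketch_eval_jac_g is a hypothetical name). Each constraint is a
# weighted sum of data values, linear in x, so the Jacobian does not depend on
# x: it is just the vector of nonzero constraint coefficients in the order
# given by eval_jac_g_structure. That linearity is why jac_c_constant and
# jac_d_constant can be set to "yes" in the ipopt options above.
sketch_eval_jac_g <- function(x, inputs){
  inputs$cc_sparse$nzcc
}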



#' Create Sparse Constraints Coefficients Data Frame.
#'
#' Construct a data frame that represents a sparse matrix with nonzero
#' constraint coefficients. A constraint coefficient is the amount by which a
#' constraint changes if an element of the vector `x` changes by one unit. This
#' stores only those coefficients that are nonzero, in triplet form, where `i`
#' is an index identifying which constraint is represented, `j` is an index
#' identifying which element of the vector `x` is represented, and `nzcc` is the
#' value of the nonzero constraint coefficient.
#'
#' The data value in `xmat` multiplied by the initial weight determines how much
#' a constraint will change with a unit change in `x`, the ratio of the new
#' weight to the initial weight. For example, suppose household number 7 has
#' income of $10,000. Suppose that this household is in row 7 of `xmat` and
#' that column 3 of `xmat` corresponds to income. If this household has an
#' initial weight of 20, then the constraint coefficient for this cell - how
#' much changing the `x` value for this household will change the constraint for
#' total income - is 200,000 (20 x 10,000). The constraint coefficients data
#' frame will have a row where `i` (the index for the constraint, not the household)
#' is 3, `j` (the index for the household) is 7, and `nzcc` is 200000.
#'
#' @param xmat Numeric matrix of unweighted data values in dense
#'   form where each row is a household, each column corresponds to a
#'   constraint, and each cell is a data value.
#' @param target_names Character vector with names for each constraint. One element per
#' column in `xmat`.
#' @param iweights Numeric vector of initial weights in the microdata file, one per row in `xmat`.
#'
#' @returns A data frame representing a sparse matrix of constraint
#'   coefficients, with columns:
#'   \describe{
#'   \item{i}{the constraint number}
#'   \item{j}{an index into x}
#'   \item{cname}{constraint name, from `target_names`}
#'   \item{nzcc}{the nonzero constraint coefficient - the initial weight
#'   multiplied by value}
#'   \item{iweight}{initial weight for this cell}
#'   \item{value}{data value}
#'   }
#'
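#' @examples
#' # Small worked example: two targets, three households. Household 2 has zero
#' # wages, so its wagp coefficient drops out of the sparse representation
#' # (5 rows remain: 3 for nrecs, 2 for wagp). The toy_ objects are made up
#' # for illustration.
#' toy_xmat <- matrix(c(1, 1, 1,       # nrecs indicator
#'                      50, 0, 200),   # wagp
#'                    ncol = 2,
#'                    dimnames = list(NULL, c("nrecs", "wagp")))
#' toy_iweights <- c(10, 20, 30)
#' get_cc_sparse(toy_xmat, c("nrecs", "wagp"), toy_iweights)
#' # nzcc is 10, 20, 30 for nrecs (i = 1) and 500, 6000 for wagp (i = 2)
#'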
#' @export
get_cc_sparse <- function(xmat, target_names, iweights) {
  #.. build the sparse constraint coefficients from the dense data matrix
  cc_sparse <- tibble::as_tibble(xmat) %>%
    dplyr::select(dplyr::all_of(target_names)) %>%
    dplyr::mutate(iweight=iweights,
           j=dplyr::row_number()) %>%
    tidyr::pivot_longer(cols = dplyr::all_of(target_names),
                 names_to="cname",
                 values_to = "value") %>%
    dplyr::mutate(nzcc = .data$iweight * .data$value) %>%
    dplyr::filter(.data$nzcc != 0) %>%
    dplyr::mutate(i=match(.data$cname, target_names)) %>%
    dplyr::select(.data$i, .data$j, .data$cname, .data$nzcc,
                  .data$iweight, .data$value) %>%
    dplyr::arrange(.data$i, .data$j) # this ordering is crucial for the Jacobian

  return(cc_sparse)
}

#' Create List of Inputs for Optimization Functions.
#'
#' Most users will never use this function. Can be useful for debugging.
#'
#' @param iweights Numeric vector of initial weights in the microdata file.
#' @param targets Numeric vector of targets.
#' @param target_names Character vector of names for targets.
#' @param tol Numeric vector of tolerances, one per target.
#' @param cc_sparse Data frame with nonzero constraint coefficients in sparse format.
#' @param xlb Numeric vector of lower bounds for the `x` vector.
#' @param xub Numeric vector of upper bounds for the `x` vector.
#'
#' @export
get_inputs <- function(iweights,
                       targets,
                       target_names,
                       tol,
                       cc_sparse,
                       xlb,
                       xub){

  inputs <- list()
  inputs$iweight <- iweights
  inputs$cc_sparse <- cc_sparse
  inputs$constraints <- targets
  inputs$constraint_names <- target_names
  inputs$n_variables <- length(inputs$iweight)
  inputs$n_constraints <- length(inputs$constraints)
  inputs$clb <- targets - tol
  inputs$cub <- targets + tol
  inputs$i_heq <- which(tol==0)
  inputs$i_hin <- which(tol!=0)

  # finally, add xlb, xub, x0, and the relevant structures
  # xlb and xub may be scalars or full-length vectors; length.out handles both
  inputs$xlb <- rep(xlb, length.out = inputs$n_variables)
  inputs$xub <- rep(xub, length.out = inputs$n_variables)
  inputs$x0 <- rep(1, inputs$n_variables)

  # for now:
  inputs$objscale <- 1 # divisor to scale objective function

  inputs$eval_jac_g_structure <- define_jac_g_structure_sparse(inputs$cc_sparse, ivar="i", jvar="j")
  inputs$eval_h_structure <- lapply(1:inputs$n_variables, function(x) x) # diagonal elements of our Hessian

  inputs
}
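
# Illustrative sketch (the real define_jac_g_structure_sparse used above is
# defined elsewhere in the package). Judging from how call_auglag and ipoptr
# use it, it returns a list with one element per constraint i, each a vector
# of the variable indexes j that have nonzero coefficients. A minimal version,
# assuming cc_sparse is already arranged by i then j and every constraint has
# at least one nonzero coefficient, could be:
sketch_jac_g_structure <- function(cc_sparse, ivar = "i", jvar = "j"){
  unname(split(cc_sparse[[jvar]], cc_sparse[[ivar]]))
}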


# functions for nloptr
evg_scaled <- function(x, inputs){
  eval_g(x, inputs) / inputs$gscale
}


heq_fn <- function(x, inputs){
  # equality constraints (targets with tol == 0); targets are stored in inputs$constraints
  evg_scaled(x, inputs)[inputs$i_heq] -
    inputs$constraints[inputs$i_heq] / inputs$gscale[inputs$i_heq]
}

hin_fn <- function(x, inputs){
  # NLopt expects inequality constraints in the form myconstraint(x) <= 0, so
  # the two-sided bounds clb <= g(x) <= cub become two stacked inequalities
  g <- evg_scaled(x, inputs)[inputs$i_hin]

  c(inputs$clb[inputs$i_hin] / inputs$gscale[inputs$i_hin] - g,
    g - inputs$cub[inputs$i_hin] / inputs$gscale[inputs$i_hin])
}


heq.jac_fn <- function(x, inputs){
  inputs$jac_heq
}


hin.jac_fn <- function(x, inputs){
  inputs$jac_hin
}
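
# Illustrative sketch (the real eval_g used by heq_fn and hin_fn via
# evg_scaled is defined elsewhere in the package; sketch_eval_g is a
# hypothetical name). Each constraint value is the sum of its nonzero
# coefficients times the corresponding elements of x. hin_fn above then
# recasts the two-sided bounds clb <= g(x) <= cub as the pair of <= 0
# inequalities that NLopt expects: clb - g(x) <= 0 and g(x) - cub <= 0
# (after scaling by gscale).
sketch_eval_g <- function(x, inputs){
  cc <- inputs$cc_sparse
  # sum nzcc * x[j] within each constraint i
  # (assumes every constraint has at least one nonzero coefficient)
  as.numeric(tapply(cc$nzcc * x[cc$j], cc$i, sum))
}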


# Note: the main package file has @import dplyr so that dplyr functions are available.
# Throughout this file, the use of dplyr generates "no visible binding for global
# variable" notes in R CMD check. These are not errors; they are suppressed by
# prefixing the offending variables with .data$, which requires
# @importFrom rlang .data.