fusionModel: Data fusion and analysis of synthetic data in R

Documented in monotonic

#' Create a monotonic relationship between two variables
#'
#' @description
#' \code{monotonic()} returns modified values of input vector \code{y} that are smoothed, monotonic, and consistent across all values of input \code{x}. It was designed to be used post-fusion when one wants to ensure a plausible relationship between consumption (\code{x}) and expenditure (\code{y}), under the assumption that all consumers face an identical, monotonic pricing structure. By default, the mean of the returned values is forced to equal the original mean of \code{y} (\code{preserve = TRUE}). The direction of monotonicity (increasing or decreasing) is detected automatically, so use cases are not limited to consumption and expenditure variables.
#' @param x Numeric.
#' @param y Numeric.
#' @param w Numeric. Optional observation weights.
#' @param preserve Logical. Preserve the original mean of the \code{y} values in the returned values?
#' @param expend Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x > 0}.
#' @param fast Logical. If \code{TRUE}, only \code{\link[scam]{supsmu}} is used with coercion of result to monotone.
#' @param nmax Integer. Maximum number of observations to use for smoothing. Set lower for faster computation. \code{nmax = Inf} eliminates sampling.
#' @param plot Logical. Plot the (sampled) data points and derived monotonic relationship?
#' @details The initial smoothing is accomplished via \code{\link[scam]{supsmu}} with the result coerced to monotone. If \code{fast = FALSE} and the coercion step modifies the values too much, a second smooth is attempted via a \code{\link[scam]{scam}} model with either a monotone increasing or decreasing constraint. If the SCAM fails to fit, the function falls back to \code{\link[stats]{lm}} with simple linear predictions. If \code{y = 0} when \code{x = 0} (as typical for consumption-expenditure variables), then that outcome is enforced in the result. The input data are randomly sampled to no more than \code{nmax} observations, if necessary, for speed.
#' @return A numeric vector of modified \code{y} values. Optionally, a plot showing the returned monotonic relationship.
#' @examples
#' y <- monotonic(x = recs$propane_btu, y = recs$propane_expend, plot = TRUE)
#' mean(recs$propane_expend)
#' mean(y)
#' @export

#---------

# TEST
# library(tidyverse)
# library(data.table)
#
# d <- fusionModel::read_fsd("~/Downloads/RECS_2020_2019_fused_UP.fsd")
# acs <- fst::read_fst("~/Documents/Projects/fusionData/survey-processed/ACS/2019/ACS_2019_H_processed.fst", columns = c('weight', 'state', 'puma10'))
# d <- cbind(d, acs)
# system.time(
#   d[, `:=`(dollarel_z =  monotonic(x = btuel, y = dollarel, w = weight),
#            dollarng_z =  monotonic(x = btung, y = dollarng, w = weight),
#            dollarlp_z =  monotonic(x = btulp, y = dollarlp, w = weight),
#            dollarfo_z =  monotonic(x = btufo, y = dollarfo, w = weight)),
#     by = .(state, puma10)]
# )

#---------

monotonic <- function(x,
                      y,
                      w = NULL,
                      preserve = TRUE,
                      expend = TRUE,
                      fast = TRUE,
                      nmax = 5000,
                      plot = FALSE) {

  stopifnot(exprs = {
    length(x) == length(y)
    is.numeric(x) & !anyNA(x)
    is.numeric(y) & !anyNA(y)
    is.null(w) | length(w) == length(x)
    is.logical(preserve)
    is.logical(expend)
    is.logical(fast)
    nmax > 1
    is.logical(plot)
  })

  if (is.null(w)) w <- rep.int(1L, length(x))
  ymean <- weighted.mean(y, w, na.rm = TRUE)
  yint <- is.integer(y)
  ymin <- if (all(y == 0)) 0 else min(y[y != 0])
  x0 <- x
  w0 <- w

  # If 'expend = TRUE', check for violations
  # If any issues are detected, a helpful warning is issued
  if (expend) {
    if (any(x < 0)) stop("'expend = TRUE' but detected negative values in 'x'")
    if (any(y < 0)) stop("'expend = TRUE' but detected negative values in 'y'")
    i <- x == 0 & y != 0
    if (any(i)) {
      y[i] <- 0L
      warning("Set ", sum(i), " non-zero y-value(s) (", paste0(round(100 * sum(i) / length(y), 2), "%"), ") to zero where x == 0 because 'expend = TRUE'")
    }
    i <- x > 0 & y == 0
    if (any(i)) {
      y[i] <- ymin
      warning("Set ", sum(i), " zero y-value(s) (", paste0(round(100 * sum(i) / length(y), 2), "%"), ") to observed non-zero minimum where x > 0 because 'expend = TRUE'")
    }
  }

  # If 'expend = TRUE' OR zeros in 'x' (almost) always produce zeros in 'y', restrict to non-zero observations in 'x'
  force.zero <- FALSE
  if (expend | (any(x == 0) & sum(y[x == 0] == 0) / sum(x == 0) > 0.995)) {
    force.zero <- TRUE
    i <- c(match(0, x), which(x != 0))  # Retains first instance of zero in 'x'
    x <- x[i]
    y <- y[i]
    w <- w[i]
  }

  # If necessary, sample the data for speed
  n <- length(x)
  if (n > nmax) {
    i <- match(range(x), x)  # Retains first instance of min and max 'x'
    i <- c(i, sample.int(n = n, size = nmax - 2))  # Downsample to 'nmax' observations
    x <- x[i]
    y <- y[i]
    w <- w[i]
  }

  # Initial smooth via stats::supsmu()
  # 'span' set based on recommendation in ?supsmu
  m <- stats::supsmu(x, y, wt = w, span = ifelse(length(x) < 40, 0.3, "cv"))
  xu <- m$x
  inc <- suppressWarnings(cor(m$x, m$y) >= 0)
  if (is.na(inc)) inc <- TRUE
  p <- sort(m$y, decreasing = !inc) # Force monotonic predictions

  # Check if supsmu() smoothed and monotonic output is sufficient
  # Ideally, coercion to monotonic via sort() does not cause significant difference between 'p' and m$y
  fail <- sum(abs((p - m$y) / m$y) > 0.05) / length(p)  # Percent of observations with more than 5% absolute error
  if (is.na(fail)) fail <- Inf
  if (!fast & fail > 0.05 & length(p) >= 100) {
    # Attempt to fit SCAM model with monotonic constraint
    m <- try(scam::scam(y ~ s(x, bs = ifelse(inc, "mpi", "mpd")), data = data.frame(x, y), weights = w), silent = TRUE)
    if (inherits(m, "scam")) {
      p <- as.vector(predict(m, newdata = data.frame(x = xu), type = "response", newdata.guaranteed = TRUE))
    } else {
      # If SCAM model fails, fall back to OLS model and make simple linear predictions
      m <- stats::lm(y ~ x, weights = w)
      p <- as.vector(suppressWarnings(predict(m, newdata = data.frame(x = xu))))
    }
  }

  # First time: If expend = TRUE, ensure that the output values meet some minimum positive value when x > 0
  if (expend) p[xu > 0] <- pmax(p[xu > 0], ymin)

  # If necessary, set values to zero when 'x' is zero
  if (force.zero) p[xu == 0] = 0

  # Make 'y' predictions for all original 'x'
  yout <- if (length(xu) == 1) rep(p, length(x0)) else approx(xu, p, xout = x0, rule = 2)$y

  # If requested, adjustment factor to ensure mean of transformed 'y' matches original mean value
  yadj <- 1  # Defined for use in plotting code, below, if 'preserve = FALSE'
  if (preserve) {
    yadj <- ymean / weighted.mean(yout, w0)
    if (!is.finite(yadj)) yadj <- 1  # Catch divide by zero case (mean not strictly preserved in this case)
    yout <- yout * yadj
  }

  # If 'y' input is integer, force output to integer
  # May cause input mean of 'y' to not be strictly preserved in output
  if (yint) yout <- as.integer(round(yout))

  # May cause input mean of 'y' to NOT be strictly preserved in output
  if (expend) yout[x0 > 0] <- pmax(yout[x0 > 0], ymin)

  # Optional plot of transformation
  if (plot) {
    plot(x, y, col = "#00000033", ylim = range(c(y, p)), xlab = "x", ylab = "y")
    lines(xu, p * yadj, col = "red")
  }

  # Safety check
  if (anyNA(yout)) stop("NA values in result vector")

  return(yout)

}