#' Analyze fusion output
#'
#' @description
#' Calculates point estimates and associated margins of error for analyses of fused/synthetic microdata. Supports means, proportions, sums, counts, and medians, optionally across population subgroups.
#'
#' @param x List. Named list specifying the desired analysis type(s) and the associated target variable(s). Example: \code{x = list(mean = c("v1", "v2"), median = "v3")} translates as: "Return the mean value of variables v1 and v2 and the median of v3". Supported analysis types include \code{mean}, \code{sum}, and \code{median}. Mean and sum automatically return proportions and counts, respectively, if the target variable is a factor. Target variables must be in \code{implicates}, \code{static}, or a data.frame returned by a custom \code{fun}.
#' @param implicates Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M".
#' @param static Data frame. Optional static (non-synthetic) variables that do not vary across implicates. Note that \code{nrow(static) = nrow(implicates) / max(implicates$M)} and the row-ordering is assumed to be consistent between \code{static} and \code{implicates}.
#' @param weight Character. Name of the observation weights column in \code{static}. If NULL (default), uniform weights are assumed.
#' @param rep_weights Character. Optional vector of replicate weight columns in \code{static}. If provided, the returned margins of error reflect the additional variance due to uncertainty in the sample weights.
#' @param by Character. Optional column name(s) in \code{implicates} or \code{static} (typically factors) that collectively define the set of population subgroups for which each analysis is executed. If \code{NULL}, analysis is done for the whole sample.
#' @param fun Function. Optional function applied to the input data prior to executing the analyses. Can be used to perform custom or non-standard analyses; see Examples.
#' @param var_scale Scalar. Factor by which to scale the unadjusted replicate weight variance. This is determined by the survey design. The default (\code{var_scale = 4}) is appropriate for ACS and RECS.
#' @param cores Integer. Number of cores used. Only applicable on Unix systems.
#'
#' @details At a minimum, the user must supply synthetic implicates (typically generated by \link{fuse}). Inputs are checked for consistent dimensions.
#' @details If \code{implicates} contains only a single implicate, the between-implicate variance cannot be estimated; \code{moe} is returned as \code{NA} and a console message makes the user aware of the situation.
#' @details Estimates and standard errors for the requested analysis are calculated separately for each implicate. The final point estimate is the mean estimate across implicates. The final standard error is the pooled SE across implicates, calculated using Rubin's pooling rules (1987).
#' @details When replicate weights are provided, the standard errors of each implicate are calculated via the variance of estimates across replicates. Calculations leverage \code{\link[data.table]{data.table}} operations for speed and memory efficiency. The within-implicate variance is calculated around the point estimate (rather than around the mean of the replicates). This is equivalent to \code{mse = TRUE} in \code{\link[survey]{svrepdesign}}. This seems to be the appropriate method for most surveys.
#' @details If replicate weights are NOT provided, the standard errors of each implicate are calculated using variance within the implicate. For means, the ratio variance approximation of Cochran (1977) is used, as this is known to be a good approximation of bootstrapped SE's for weighted means (Gatz and Smith 1995). For proportions, a generalization of the unweighted SE formula is used (\href{https://stats.stackexchange.com/questions/159204/how-to-calculate-the-standard-error-of-a-proportion-using-weighted-data}{see here}).
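#' @details As an illustrative sketch (simplified relative to the internal code), the variance of a weighted proportion in the absence of replicate weights is computed as:
#' \preformatted{
#' w <- w / sum(w)              # normalize the observation weights
#' p <- sum(w * x)              # weighted proportion (x is 0/1)
#' v <- p * (1 - p) * sum(w^2)  # approximate sampling variance of p
#' }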
#'
#' @return A data.table reporting analysis results, possibly across subgroups defined in \code{by}. The returned quantities include:
#' @return \describe{
#' \item{N}{Number of observations used for the analysis.}
#' \item{y}{Target variable.}
#' \item{level}{Levels of factor target variables.}
#' \item{type}{Type of estimate returned: mean, proportion, sum, count, or median.}
#' \item{est}{Point estimate.}
#' \item{moe}{Margin of error associated with the 90% confidence interval.}
#' }
#'
#' @references Cochran, W. G. (1977). \emph{Sampling Techniques} (3rd Edition). Wiley, New York.
#' @references Gatz, D.F., and Smith, L. (1995). The Standard Error of a Weighted Mean Concentration — I. Bootstrapping vs Other Methods. \emph{Atmospheric Environment}, vol. 29, no. 11, 1185–1193.
#' @references Rubin, D.B. (1987). \emph{Multiple imputation for nonresponse in surveys}. Hoboken, NJ: Wiley.
#'
#' @examples
#' # Build a fusion model using RECS microdata
#' fusion.vars <- c("electricity", "natural_gas", "aircon")
#' predictor.vars <- names(recs)[2:12]
#' fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars)
#'
#' # Generate 30 implicates of the 'fusion.vars' using original RECS as the recipient
#' sim <- fuse(data = recs, fsn = fsn.path, M = 30)
#' head(sim)
#'
#' #---------
#'
#' # Multiple types of analyses can be done at once
#' # This calculates estimates using the full sample
#' result <- analyze(x = list(mean = c("natural_gas", "aircon"),
#'                            median = "electricity",
#'                            sum = c("electricity", "aircon")),
#'                   implicates = sim,
#'                   static = recs,
#'                   weight = "weight")
#'
#' View(result)
#'
#' #-----
#'
#' # Mean electricity consumption, by climate zone and urban/rural status
#' result1 <- analyze(x = list(mean = "electricity"),
#'                    implicates = sim,
#'                    static = recs,
#'                    weight = "weight",
#'                    by = c("climate", "urban_rural"))
#'
#' # Same as above but including sample weight uncertainty
#' # Note that only the first 30 replicate weights are used internally
#' result2 <- analyze(x = list(mean = "electricity"),
#'                    implicates = sim,
#'                    static = recs,
#'                    weight = "weight",
#'                    rep_weights = paste0("rep_", 1:96),
#'                    by = c("climate", "urban_rural"))
#'
#' # Helper function for comparison plots
#' pfun <- function(x, y) {plot(x, y); abline(0, 1, lty = 2)}
#'
#' # Inclusion of replicate weights does not affect estimates, but it does
#' # increase margin of error due to uncertainty in RECS sample weights
#' pfun(result1$est, result2$est)
#' pfun(result1$moe, result2$moe)
#'
#' # Notice that relative uncertainty declines with subset size
#' plot(result1$N, result1$moe / result1$est)
#'
#' #-----
#'
#' # Use a custom function to perform more complex analyses
#' # The custom function should return a data frame of the non-standard target variables
#'
#' my_fun <- function(data) {
#'
#'   # Manipulate 'data' as desired
#'   # All variables in 'implicates' and 'static' are available
#'
#'   # Construct electricity consumption per square foot
#'   kwh_per_ft2 <- data$electricity / data$square_feet
#'
#'   # Binary (T/F) indicator if household uses natural gas
#'   use_natural_gas <- data$natural_gas > 0
#'
#'   # Return data.frame of custom variables to be analyzed
#'   data.frame(kwh_per_ft2, use_natural_gas)
#' }
#'
#' # Do analysis using variables produced by the custom function
#' # Non-custom target variables can be included as well
#' result <- analyze(x = list(mean = c("kwh_per_ft2", "use_natural_gas", "electricity")),
#'                   implicates = sim,
#'                   static = recs,
#'                   weight = "weight",
#'                   fun = my_fun)
#'
#' @export
# NOT USED
# @param donor.N Number of (un-weighted) observations in the original "donor" data used to generate the implicates.
#-----
# library(fusionModel)
# library(data.table)
# library(tidyverse)
# source("R/utils.R")
#
# fusion.vars <- c("electricity", "natural_gas", "aircon")
# predictor.vars <- names(recs)[2:12]
#
# # Train model
# fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars, fsn = "test_model.fsn")
#
# # Generate multiple implicates of synthetic 'fusion.vars'
# recipient <- recs[predictor.vars]
# #sim.fuse <- fuse(data = recipient, fsn = fsn.path, M = 30)
# sim.fuse <- fuse(data = recipient, fsn = fsn.path, M = 30, fsd = "test.fsd")
#
# # Example working with no variance across # of implicates equal to number of replicate weights
# #sim.fuse <- cbind(M = rep(1:96, each = nrow(recs)), recs[rep(1:nrow(recs), 96), fusion.vars])
#
# # Inputs
# implicates <- sim.fuse
# x <- list(mean = names(implicates)[-1],
#           sum = names(implicates)[-1],
#           median = "electricity")
# #glm = electricity ~ natural_gas + aircon)
# static <- recs
# weight <- "weight"
# rep_weights <- paste0("rep_", 1:96)
# by <- c("urban_rural", "division")
# by = "race"
# by = NULL
# var_scale = 4
# cores <- 1
# fun = NULL
#
# # Custom function from user
# x <- list(mean = c("kwh_per_ft2", "use_natural_gas"),
#           sum = "use_natural_gas",
#           median = "electricity")
# fun <- function(data) {
#
# # Manipulate 'data' as desired
# # All variables in 'implicates' and 'static' are available
# kwh_per_ft2 <- data$electricity / data$square_feet
#
# # Binary (T/F) indicator if household uses natural gas
# use_natural_gas <- data$natural_gas > 0
#
# data.frame(kwh_per_ft2, use_natural_gas)
#
# }
#
#-----
analyze <- function(x,
                    implicates,
                    static = NULL,
                    weight = NULL,
                    rep_weights = NULL,
                    by = NULL,
                    fun = NULL,
                    var_scale = 4,
                    cores = 1) {
t0 <- Sys.time()
stopifnot(exprs = {
  is.data.frame(implicates)
  is.null(static) | is.data.frame(static)
  is.null(weight) | weight %in% names(static)
  is.null(rep_weights) | all(rep_weights %in% names(static))
  is.null(by) | (all(by %in% c(names(implicates), names(static))))
  var_scale > 0
  cores > 0 & cores %% 1 == 0
})
sim <- data.table(implicates)
rm(implicates)
# Check validity of 'x' argument
stopifnot(all(names(x) %in% c("mean", "sum", "median")))
# Variables for which estimates will be computed
fx <- sapply(x, class) == "formula"
fvars <- if (is.null(fun)) {
  unique(unlist(c(x[!fx], lapply(x, all.vars))))
} else {
  setdiff(c(names(sim), names(static)), c(by, weight, rep_weights)) # If custom function supplied, retain all variables for the moment
}
stopifnot(all(fvars %in% c(names(sim), names(static))))
if (any(by %in% fvars)) stop("Analysis variables and 'by' variables must be distinct")
#---
# Check that input dimensions are consistent with one another
Mimp <- max(sim$M)
N <- nrow(sim) / Mimp
nM <- sim[, .N, by = M]
stopifnot(all(nM$M %in% seq_len(Mimp)))
stopifnot(all(nM$N == N))
if (!is.null(static)) {
  if (nrow(static) != N) stop("The number of rows in 'static' should equal the number of rows in 'implicates' per implicate (", N, ")")
}
if (Mimp > 1) {
  cat("Using", Mimp, "implicates\n")
} else {
  cat("Only 1 implicate; returning NA for 'moe'\n")
}
#---
# Prepare 'static' data.table
if (is.null(static)) {
  static <- data.table(W = rep(1, N))
  cat("Assuming uniform sample weights\n")
} else {
  static <- as.data.table(static)
  if (is.null(weight) & is.null(rep_weights)) {
    static[, W := 1]
    cat("Assuming uniform sample weights\n")
  } else {
    setnames(static, weight, "W")
    static[, W := as.double(W)]
    rm(weight)
  }
}
#---
# SHOULD replicate weights always supersede sample weights?
# Extract replicate weights
WR <- NULL
if (!is.null(rep_weights)) {
  if (length(rep_weights) < Mimp) stop("Number of replicate weights must be >= number of implicates\n")
  wkeep <- rep_weights[1:Mimp]
  cat("Using", Mimp, "of", length(rep_weights), "replicate weights\n")
  WR <- static[, ..wkeep]
  WR <- as.double(unlist(WR)) # Make double to avoid integer overflow
}
#---
# Subset and (optionally) one-hot encode 'sim'
vsim <- c("M", intersect(names(sim), c(by, fvars))) # Can include 'by' variables? Why not?
sim <- sim[, ..vsim]
# Subset and (optionally) one-hot encode 'static'
vstatic <- c("W", intersect(names(static), c(by, fvars)))
vstatic <- setdiff(vstatic, vsim) # Prioritizes variables in 'sim' if names are duplicated
static <- static[, ..vstatic]
# Combine static and simulated data
sim <- cbind(sim, static[rep(1:N, Mimp)])
# If replicate weight exist, add them to 'sim'; different set of weights for each implicate
if (is.null(WR)) sim[, WR := W] else sim[, WR := WR]
rm(static, WR)
#---
# If a custom function is supplied, apply it to 'sim' before proceeding
# 'fun' returns a data frame that is then cbind'd to original 'sim'
if (!is.null(fun)) {
  cat("Applying custom 'fun' to data\n")
  fout <- fun(sim)
  if (!is.data.frame(fout)) stop("Custom 'fun' must return a data.frame")
  drop <- intersect(names(fout), names(sim))
  if (length(drop)) sim[, c(drop) := NULL]
  sim <- cbind(sim, fout)
}
#---
# Variables to retain prior to potential one-hot encoding
# This ensures that only the variables specified in 'x' are retained
fvars <- unique(unlist(c(x[!fx], lapply(x, all.vars))))
stopifnot(all(fvars %in% names(sim)))
keep <- c("M", "W", "WR", by, fvars)
sim <- sim[, ..keep]
# Convert any logical 'fvars' to factor so they are treated as such by code
# This ensures the logical variables get into 'flink' and are assigned correct typing in output
flgl <- names(which(sapply(sim[, ..fvars], is.logical)))
for (v in flgl) set(sim, j = v, value = as.factor(sim[[v]]))
#---
# Check that 'fvars' for median analysis are all numeric
check <- x$median
if (length(check)) {
  if (!all(sapply(sim[, ..check], is.numeric))) stop("Variables for median analyses must be numeric")
}
#---
# Create one-hot encoded dummies of factor variables for which "mean" or "sum" are requested
hotf <- names(x) %in% c("mean", "sum")
othx <- unique(unlist(lapply(x[!hotf], function(x) if (is.vector(x)) x else all.vars(x))))
hotx <- unique(unlist(x[hotf]))
hotx <- names(which(sapply(sim[, ..hotx], is.factor)))
if (length(hotx) > 0) {
  temp <- one_hot(sim[, ..hotx], dropOriginal = TRUE)
  sim <- cbind(sim, temp)
  attr(sim, "one_hot_link") <- attr(temp, "one_hot_link")
  rm(temp)
}
# Look up table for variables that are one-hot encoded
flink <- attr(sim, "one_hot_link")
# Which 'hotx' original columns can be dropped?
# This occurs if a 'hotx' variable is not requested by any other analysis
drop <- setdiff(hotx, othx)
if (length(drop) > 0) sim[, c(drop) := NULL]
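# Illustrative sketch (not run) of what the one-hot step produces: each factor
# requested for a mean/sum analysis is expanded into 0/1 dummy columns, and
# 'flink' maps each dummy back to its original variable and level. Conceptually,
# for a hypothetical 3-level factor 'aircon':
# toy <- data.frame(aircon = factor(c("Central", "None", "Window")))
# dummies <- model.matrix(~ aircon - 1, toy)  # one 0/1 column per level
# link <- data.frame(original = "aircon", dummy = colnames(dummies), level = levels(toy$aircon))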
# TO DO: Print status update to console about which variables being used...
# Key the data.tables for speed in 'by' operations below
setkeyv(sim, c("M", by))
#---
# Function to compute confidence interval bounds
# p = 0.95 entails a 90% confidence interval
# calcCI <- function(d, p = 0.95) {
# d %>%
# mutate(
# lwr = est - qt(p, df) * se, # CI lower bound
# upr = est + qt(p, df) * se # CI upper bound
# )
# }
# Function to compute margin of error (MOE)
# p = 0.95 entails a 90% confidence interval
calcMOE <- function(d, p = 0.95) {
  d %>%
    mutate(
      moe = se * qt(p, df)
    )
}
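# For reference: qt(p, df) with p = 0.95 is the two-sided 90% confidence multiplier;
# e.g. qt(0.95, Inf) is about 1.645, so moe is roughly 1.645 * se for large df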
#-----
# Process a particular combination of subsetting variables
#processSubset <- function(iset) {
# svar <- scomb[[iset]]
#
# # Calculate the share of observations within each subset, using the observed data
# subshr <- obs[, .(share = .N / N), by = svar]
# subshr <- subshr[share >= min_size / N]
# subshr$id <- paste(iset, 1:nrow(subshr), sep = "_")
#
# # Restrict 'obs' and 'sim' to subsets with at least 'min_size' observations
# obs <- obs[subshr, on = svar]
# sim <- sim[subshr, on = svar]
#
#---
# MEAN
y <- x$mean
out.mean <- if (length(y)) {
  yf <- intersect(y, hotx)
  if (length(yf)) {
    y <- setdiff(y, yf)
    y <- c(y, filter(flink, original %in% yf)$dummy)
  }
  out <- sim[, lapply(.SD, FUN = weighted_mean, w1 = W, w2 = WR), by = c("M", by), .SDcols = y]
  out[, type := "mean"]
  out
} else {
  NULL
}
# SUM
y <- x$sum
out.sum <- if (length(y)) {
  yf <- intersect(y, hotx)
  if (length(yf)) {
    y <- setdiff(y, yf)
    y <- c(y, filter(flink, original %in% yf)$dummy)
  }
  out <- sim[, lapply(.SD, FUN = weighted_sum, w1 = W, w2 = WR), by = c("M", by), .SDcols = y]
  out[, type := "sum"]
  out
} else {
  NULL
}
# MEDIAN
y <- x$median
out.median <- if (length(y)) {
  out <- sim[, lapply(.SD, FUN = weighted_median, w1 = W, w2 = WR), by = c("M", by), .SDcols = y]
  out[, type := "median"]
  out
} else {
  NULL
}
# GLM (TO DO)
# Combine analysis results
d <- rbindlist(list(out.mean, out.sum, out.median), fill = TRUE)
#----
d$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(d) / 3)
d <- melt(d, id.vars = c("M", "type", "metric", by), variable.name = "y", na.rm = TRUE)
d <- dcast(d, ... ~ metric, value.var = "value")
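# At this point (illustrative description), 'd' is in wide format with one row per
# implicate x subgroup x target variable and columns:
#   M, type, <'by' variables>, y,
#   estimate1 (estimate using the primary weights),
#   estimate2 (estimate using the replicate weights, identical to estimate1 if none supplied),
#   variance  (within-implicate variance of estimate1)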
# Calculate the necessary quantities
# est: Point estimates (mean of estimates across implicates)
# ubar: Mean of the within-implicate variances
# b1: Variance of estimates across implicates (approximated by variance of mixture of normal distributions)
d <- d[, .(est = mean(estimate1),
           ubar = mean(variance),
           b1 = var(estimate1),
           #b1 = (sum(estimate1 ^ 2 + variance) / Mimp) - mean(estimate1) ^ 2, # Conservative approximation of var(estimate1); does not risk b1 < ubar
           b2 = pmax(0, var(estimate2) - var(estimate1)) * var_scale * (Mimp - 1) / Mimp), # Approximate additional variance due to uncertainty in sample weights
       by = c(by, "type", "y")]
# Calculate standard error, degrees of freedom, and margin of error
# Note that if ubar = 0, then b = 0 necessarily; the maxr adjustment results in b = NA; ifelse() below forces b = 0 in this case
# 'df' is NA when se = 0, so it is forced to 1 so CI calculation does not produce error
#maxr <- maxr_fun(Mimp)
d <- mutate(d,
            b = b1 + b2,
            # b = ifelse(ubar / b > maxr, ubar / maxr, b),
            # b = ifelse(ubar == 0, 0, b),
            # Rubin 1987
            se = sqrt(ubar + (1 + Mimp^(-1)) * b),
            r = (1+Mimp^(-1))*b/ubar,
            df = (Mimp-1)*(1+r^(-1))^2,
            r = NULL
            # Reiter and Raghunathan (2007)
            # se = sqrt(b * (1 + 1 / Mimp) - ubar),
            # df = (Mimp - 1) * (1 - Mimp * ubar / ((Mimp + 1) * b)) ^ 2,
            # df = ifelse(se == 0, 1, df)
) %>%
  calcMOE()
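# Toy illustration of the pooling above (not run), for a single analysis cell with
# per-implicate estimates 'q' and within-implicate variances 'u', ignoring the
# replicate-weight variance term 'b2':
# q <- c(10.2, 9.8, 10.5); u <- c(0.30, 0.28, 0.33); M <- length(q)
# est <- mean(q)                      # pooled point estimate
# ubar <- mean(u)                     # within-implicate variance
# b <- var(q)                         # between-implicate variance
# se <- sqrt(ubar + (1 + 1/M) * b)    # Rubin (1987) pooled standard error
# r <- (1 + 1/M) * b / ubar
# df <- (M - 1) * (1 + 1/r)^2         # degrees of freedom
# moe <- qt(0.95, df) * se            # 90% CI margin of error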
#---
# Process each subset combination in 'scomb'
# cat("Processing validation analyses for", yN, "fusion variables\n")
# comp <- parallel::mclapply(seq_along(scomb), processSubset, mc.cores = cores) %>%
# rbindlist()
# Number of observations in each group
# The total number of observations must sum to nrow(static)
out.N <- sim[, list(N = as.integer(.N / Mimp)), by = by]
# Add the number of observations in each group
if (is.null(by)) {
  d$N <- as.integer(out.N)
} else {
  d <- d[out.N, on = by]
}
#---
# Add original factor variable levels, if necessary
if (is.data.frame(flink)) {
  d <- merge(x = d, y = flink, by.x = "y", by.y = "dummy", all.x = TRUE, sort = FALSE)
  d <- d %>%
    mutate(y = ifelse(is.na(level), y, original),
           type = ifelse(is.na(level), type, ifelse(type == "mean", "proportion", "count"))) %>%
    select(-original)
} else {
  d$level <- NA
}
# # Enforce lower and upper CI bounds on nominal variables that report proportions
# if (fun == "mean") {
# d <- d %>%
# mutate(lwr = ifelse(is.na(level), lwr, pmax(0, lwr)),
# upr = ifelse(is.na(level), upr, pmin(1, upr)))
# }
# Enforce lower and upper CI bounds on nominal variables that report sums
# if (fun == "sum") {
# d <- d %>%
# mutate(lwr = ifelse(is.na(level), lwr, pmax(0, lwr)),
# upr = ifelse(is.na(level), upr, pmin(wtotal, upr)))
# }
# Output results
keep <- c(by, "N", "y", "level", "type", "est", "moe")
result <- d[, ..keep]
result$y <- as.character(result$y)
setorderv(result, c(by, "y", "type"))
# class(result) <- c("validate", class(result))
# Report processing time
tout <- difftime(Sys.time(), t0)
cat("Total processing time:", signif(as.numeric(tout), 3), attr(tout, "units"), "\n", sep = " ")
return(result)
}
#------------------
#------------------
# Below are functions originally stored in 'analyze_funs.R'
weighted_mean <- function(x, w1, w2) {
  if (!(is.numeric(x) | is.logical(x)) | length(x) != length(w1) | length(x) != length(w2)) stop("Input vectors are invalid")
  out <- if (length(x) == 0) {
    rep(NA, 3)
  } else {
# If we need mean and variance for BOTH sets of weights:
# Combine the weights
# w <- cbind(w1 / sum(w1), w2 / sum(w2))
#
# # Continuous case (ONLY)
# # Code: https://stats.stackexchange.com/questions/25895/computing-standard-error-in-weighted-mean-estimation
# # Computes the variance of a weighted mean following Cochran 1977: https://www.sciencedirect.com/science/article/abs/pii/135223109400210C
# n <- nrow(w)
# xWbar <- colSums(x * w) # Weighted mean, since the weights are already scaled
# wbar <- colMeans(w)
#
# # Single weights case - confirms vectorized version
# # W <- w[, 1]
# # n/((n-1)*sum(W)^2)*(sum((W*x-wbar[1]*xWbar[1])^2)-2*xWbar[1]*sum((W-wbar[1])*(W*x-wbar[1]*xWbar[1]))+xWbar[1]^2*sum((W-wbar[1])^2))
# # W <- w[, 2]
# # n/((n-1)*sum(W)^2)*(sum((W*x-wbar[2]*xWbar[2])^2)-2*xWbar[2]*sum((W-wbar[2])*(W*x-wbar[2]*xWbar[2]))+xWbar[2]^2*sum((W-wbar[2])^2))
#
# a <- sweep(x * w, 2, wbar * xWbar) # Equiv. to: w*x-wbar*xWbar
# b <- sweep(w, 2, wbar) # Equiv. to: w-wbar
#
# wvar <- n/((n-1)*colSums(w)^2) * (colSums(a ^ 2) - 2*xWbar*colSums(b*a) + xWbar^2*colSums(b^2))
#---
    W <- w1
    if (any(!x %in% 0:1)) {
      # Continuous case
      # Code: https://stats.stackexchange.com/questions/25895/computing-standard-error-in-weighted-mean-estimation
      # Computes the variance of a weighted mean following Cochran 1977: https://www.sciencedirect.com/science/article/abs/pii/135223109400210C
      n <- length(x)
      xWbar <- wmean(x, W)
      wbar <- mean(W)
      wvar <- n/((n-1)*sum(W)^2)*(sum((W*x-wbar*xWbar)^2)-2*xWbar*sum((W-wbar)*(W*x-wbar*xWbar))+xWbar^2*sum((W-wbar)^2))
    } else {
      # Binary case
      # https://stats.stackexchange.com/questions/159204/how-to-calculate-the-standard-error-of-a-proportion-using-weighted-data
      w <- W / sum(W)
      xWbar <- sum(w * x)
      sw2 <- sum(w ^ 2)
      wvar <- xWbar * (1 - xWbar) * sw2
    }
    # Secondary (replicate) weighted mean
    xWbar2 <- wmean(x, w2)
    # Return the primary weighted mean, secondary (replicate) weighted mean, and primary weights within-sample variance
    c(xWbar, xWbar2, wvar)
  }
  return(out)
}
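# Usage sketch (not run): weighted_mean() returns c(primary-weight mean,
# replicate-weight mean, within-sample variance of the primary-weight mean).
# A quick bootstrap check of the Cochran (1977) variance approximation,
# resampling (x, w) pairs:
# set.seed(1)
# x <- rlnorm(1000); w <- runif(1000, 0.5, 2)
# weighted_mean(x, w1 = w, w2 = w)
# boot <- replicate(500, {i <- sample.int(1000, replace = TRUE); weighted.mean(x[i], w[i])})
# var(boot)  # expected to be roughly comparable to the third element returned above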
#------------------------
weighted_sum <- function(x, w1, w2) {
  if (!(is.numeric(x) | is.logical(x)) | length(x) != length(w1) | length(x) != length(w2)) stop("Input vectors are invalid")
  out <- weighted_mean(x, w1, w2)
  s <- sum(w1)
  out[1] <- out[1] * s
  out[2] <- out[2] * sum(w2)
  out[3] <- out[3] * s ^ 2
  return(out)
}
#------------------------
# nboot: number of bootstrap samples for small N variance estimation
weighted_median <- function(x, w1, w2, nboot = 200) {
  n <- length(x)
  out <- if (n > 1 & var(x) > 0) {
    # Median estimate using 'w1'
    med1 <- matrixStats::weightedMedian(x, w1)
    # Median estimate using 'w2'
    med2 <- matrixStats::weightedMedian(x, w2)
    # If 'x' is large, use approximation; otherwise, do bootstrap estimate of the sampling variance
    wvar <- if (n > 1000 & med1 != 0) {
      # Approximate variance of sample median for large N
      # https://stats.stackexchange.com/questions/45124/central-limit-theorem-for-sample-medians
      pdf <- density(x, weights = w1 / sum(w1))
      fu <- approx(pdf$x, pdf$y, xout = med1)$y
      1 / (4 * n * fu ^ 2)
    } else {
      # Draw bootstrap samples
      bdraw <- replicate(n = nboot, sample(x, size = n, replace = TRUE))
      # Bootstrap weighted median estimates using 'w1'
      bq <- matrixStats::colWeightedMedians(bdraw, w = w1)
      # Bootstrapped variance estimate
      var(bq)
    }
    c(med1, med2, wvar)
  } else {
    c(x[1], x[1], 0)
  }
  return(out)
}
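# Sketch (not run) of the large-n approximation used above: the sampling variance
# of a median m is approximately 1 / (4 * n * f(m)^2), where f is the density at m.
# For a standard normal sample (median near 0, f(0) = dnorm(0)):
# n <- 5000; x <- rnorm(n)
# 1 / (4 * n * dnorm(0)^2)                                   # analytic approximation
# var(replicate(500, median(sample(x, n, replace = TRUE))))  # bootstrap check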
# REGRESSION - TO DO
# f_glm <- function(data, formula, w1, w2) {
#
# fit1 <- stats::glm(formula = formula, data = data, weights = w1)
# cf1 <- coef(summary(fit1))
#
# fit2 <- stats::glm(formula = formula, data = data, weights = w2)
# cf2 <- coef(summary(fit2))
#
# }
# Other cases to do....
# regression (y ~ x1 + x2) -- inputs a formula
# bivariate smooth (y ~ x) -- inputs a formula
# density -- inputs a length 1 character
# Custom function -- inputs a character of any length or NULL (?)