fusionModel: Data fusion and analysis of synthetic data in R

Documented in validate

#' Validate fusion output
#'
#' @description
#' Performs internal validation analyses on fused microdata to estimate how well the simulated variables reflect patterns in the dataset used to train the underlying fusion model (i.e. observed/donor data). This provides a standard approach to validating fusion output and associated models. See Examples for recommended usage.
#'
#' @param observed Data frame. Observed data against which to validate the \code{simulated} variables. Typically the same dataset used to \code{\link{train}} the fusion model used to generate \code{simulated}.
#' @param implicates Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M".
#' @param subset_vars Character. Vector of columns in \code{observed} used to define the population subsets across which the fusion variables are validated. The levels of each \code{subset_vars} (including all two-way interactions of \code{subset_vars}) define the population subsets. Continuous \code{subset_vars} are converted to a five-level ordered factor based on a univariate k-means clustering.
#' @param weight Character. Name of the observation weights column in \code{observed}. If NULL (default), uniform weights are assumed.
#' @param min_size Integer. Subsets with less than \code{min_size} observations are excluded. Since subsets with few observations are unlikely to give reliable estimates, it doesn't make sense to consider them for validation purposes.
#' @param plot Logical. If TRUE (default), \code{\link{plot_valid}} is called internally and summary plots are returned along with complete validation results. Requires the \code{\link{ggplot2}} package.
#' @param cores Integer. Number of cores used. Only applicable on Unix systems.

#' @details The objective of \code{\link{validate}} is to confirm that the fusion output is sensible and help establish the utility of the synthetic data across myriad analyses. Utility here is based on comparison of point estimates and confidence intervals derived using multiple-implicate synthetic data with those derived using the original donor data.
#'
#' The specific analyses tested include variable levels (means and proportions) across population subsets of varying size. This allows estimates of how each of the synthetic variables perform in analyses with real-world relevance, at varying levels of complexity. In effect, \code{validate()} performs a large number of analyses of the kind that the \code{\link{analyze}} function is designed to do on a one-by-one basis.
#'
#' Most users will want to use the default setting \code{plot = TRUE} to simultaneously return visualization (plots) of the validation results. Plot creation is detailed in \code{\link{plot_valid}}.
#'
#' @return If \code{plot = FALSE}, a data frame containing complete validation results. If If \code{plot = FALSE}, a list containing full results as well as additional lot objects as described in \code{\link{plot_valid}}.
#'
#' @examples
#' # Build a fusion model using RECS microdata
#' # Note that "fusion_model.fsn" will be written to working directory
#' fusion.vars <- c("electricity", "natural_gas", "aircon")
#' predictor.vars <- names(recs)[2:12]
#' fsn.path <- train(data = recs,
#'                   y = fusion.vars,
#'                   x = predictor.vars,
#'                   weight = "weight")
#'
#' # Fuse back onto the donor data (multiple implicates)
#' sim <- fuse(data = recs,
#'             fsn = fsn.path,
#'             M = 20)
#'
#' # Calculate validation results
#' valid <- validate(observed = recs,
#'                   implicates = sim,
#'                   subset_vars = c("income", "education", "race", "urban_rural"))
#'
#' @export

#-----

# library(fusionModel)
# library(dplyr)
# library(data.table)
# library(purrr)
# library(ggplot2)
# library(fst)
# source("R/utils.R")
# source("R/analyze_funs.R")
# observed <- read_fst("~/Documents/Projects/fusionData/fusion/RECS/2015/2015/input/RECS_2015_2015_train.fst")
# implicates <- read_fsd("~/Documents/Projects/fusionData/fusion/RECS/2015/2015/output/RECS_2015_2015_valid.fsd")
# subset_vars = c("moneypy__hincp", "hhage__agep", "householder_race__rac1p", "education__schl", "nhsldmem__np", "kownrent__ten", "loc..recs_division")
# weight = "weight"
# min_size = 30
# plot = TRUE
# cores = 1

#-----

validate <- function(observed,
                     implicates,
                     subset_vars,
                     weight = NULL,
                     min_size = 30,
                     plot = TRUE,
                     cores = 1) {

  t0 <- Sys.time()

  obs <- as.data.table(observed)
  sim <- as.data.table(implicates)
  rm(observed, implicates)

  # Check inputs
  stopifnot({
    all(setdiff(names(sim), "M") %in% names(obs))
    is.character(subset_vars) & all(subset_vars %in% names(obs))
    is.null(weight) | (length(weight) == 1 & weight %in% names(obs))
    min_size > 0
    is.logical(plot)
    cores > 0 & cores %% 1 == 0
  })

  # Check that input dimensions are consistent with one another
  N <- nrow(obs)
  Mimp <- max(sim$M)
  if (Mimp == 1) stop("Validation requires 'implicates' to contain more than one implicate")
  nM <- sim[, .N, by = M]
  stopifnot(N == nrow(sim) / Mimp)
  stopifnot(all(nM$M %in% seq_len(Mimp)))
  stopifnot(all(nM$N == N))

  if (is.null(weight)) {
    cat("Assuming uniform sample weights\n")
    obs$W <- 1L
  } else {
    setnames(obs, weight, "W")
    obs[, W := W / mean(W)]
  }

  # Variables for which estimates will be computed
  y <- setdiff(names(sim), "M")
  yN <- length(y)  # Reported to console later
  if (any(y %in% subset_vars)) stop("Fused variabes are not allowed as 'subset_vars'")

  # Generate list of combinations of subset variables to test
  # Restricted to no more than 2 subset variables per combination (m = 2)
  scomb <- as.list(subset_vars)
  if (length(subset_vars) > 1) scomb <- c(scomb, combn(subset_vars, m = 2, simplify = FALSE))

  # Restrict 'obs' to only the necessary variables
  v <- c("W", y, subset_vars)
  obs <- obs[, ..v]

  #---

  # Which subset_vars are numeric or ordered factor?
  sclus <- names(which(sapply(obs[, ..subset_vars], function(x) is.ordered(x) | is.numeric(x))))

  # Convert numeric and ordered factor subset variables into clustered, ordered factors with k levels via uniCluster()
  for (v in sclus) set(obs, j = v, value = uniCluster(x = obs[[v]], k = 5))

  #---

  # Convert logical 'y' variables to factor so they are handled appropriately
  ylog <- names(which(sapply(obs[, ..y], is.logical)))
  for (v in ylog) {
    set(obs, j = v, value = as.factor(obs[[v]]))
    set(sim, j = v, value = as.factor(sim[[v]]))
  }

  # One-hot encode the y factor variables
  yfct <- names(which(sapply(obs[, ..y], is.factor)))
  if (length(yfct) > 0) {
    cat("One-hot encoding categorical fusion variables\n")
    obs <- one_hot(obs, yfct)
    sim <- one_hot(sim, yfct)
    y <- setdiff(names(sim), "M")
  }

  #---

  # Detect and impute any missing values in 'observed'
  # Note that 'simulated' should not have any NA's by design
  na.cols <- names(which(sapply(obs, anyNA)))
  if (length(na.cols) > 0) {
    cat("Missing values imputed for the following variable(s):\n", paste(na.cols, collapse = ", "), "\n")
    for (j in na.cols) {
      x <- observed[[j]]
      ind <- is.na(x)
      observed[ind, j] <- imputationValue(x, ind)
    }
  }

  #---

  # Combine observed and simulated data
  v <- c("W", subset_vars)
  sim <- cbind(sim, obs[rep(1:N, Mimp), ..v])

  # Key the data.tables for speed in 'by' operations below
  setkeyv(sim, c("M", subset_vars))
  setkeyv(obs, subset_vars)

  #---

  # Report the overall correlation between observed and simulated fusion variables
  # This is a quick check if the simulated data are simply replicating the original values (over-fitting)
  ycor <- sapply(y, function(v) suppressWarnings(cor(rep(obs[[v]], Mimp), sim[, ..v])))
  cat("Correlation between observed and fused values:\n")
  print(summary(ycor), digits = 2)

  #---

  # TEST -- generate random versions of the 'y' variables in 'sim'
  # for (v in y) {
  #   set(sim, j = paste0(v, "__RND"), value = sample(obs[[v]], size = nrow(sim), replace = TRUE))
  # }
  # ysim <- c(y, paste0(y, "__RND"))

  #---

  # !!! TO DO: Potentially return correlation coefficients for each subset
  # x <- filter(sim, M == 1)
  # test <- cor(x[, ..y])
  # f <- function(x) cor(x[, ..y])  # Restrict to desired entries...
  # test <- sim[, f(.SD), by = M, .SDcols = y]

  #---

  # Function to compute confidence interval bounds
  # p = 0.95 entails a 90% confidence interval
  # calcCI <- function(d, p = 0.95) {
  #   d %>%
  #     mutate(
  #       lwr = est - qt(p, df) * se, # CI lower bound
  #       upr = est + qt(p, df) * se  # CI upper bound
  #     )
  # }

  # Function to compute margin of error (MOE)
  # p = 0.95 entails a 90% confidence interval
  calcMOE <- function(d, p = 0.95) {
    d %>%
      mutate(
        moe = se * qt(p, df)
      )
  }

  #---

  # Process a particular combination of subsetting variables

  processSubset <- function(iset) {

    svar <- scomb[[iset]]

    # Calculate the share of observations within each subset, using the observed data
    subshr <- obs[, .(share = .N / N), by = svar]
    subshr <- subshr[share >= min_size / N]
    subshr$id <- paste(iset, 1:nrow(subshr), sep = "_")

    # Restrict 'obs' and 'sim' to subsets with at least 'min_size' observations
    obs <- obs[subshr, on = svar]
    sim <- sim[subshr, on = svar]

    #---

    # Calculate outcomes for the observed data
    g <- obs[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = svar, .SDcols = y]
    g$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(g) / 3)
    g <- melt(g, measure.vars = y, variable.name = "y")
    g <- dcast(g, ... ~ metric, value.var = "value")
    g <- g[subshr, on = svar]  # Adds "share" and "id" variables
    g <- mutate(g,
                est = estimate1,
                se = sqrt(variance),
                df = as.integer(share * N - 1)) %>%
      select(-estimate1, -estimate2, -variance) %>%
      calcMOE()

    #---

    # Calculate outcomes for the simulated data

    # Weighted mean and variance of the mean for each implicate
    d <- sim[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = c("M", svar), .SDcols = y]
    #d <- sim[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = c("M", svar), .SDcols = ysim]
    d$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(d) / 3)
    d <- melt(d, id.vars = c("M", "metric", svar), variable.name = "y")
    d <- dcast(d, ... ~ metric, value.var = "value")

    # Calculate the necessary quantities
    # est: Point estimates
    # ubar: Mean of the within-implicate variances
    # b: Variance of estimates, across the implicates (approximated by variance of mixture of normal distributions)
    d <- d[, .(est = mean(estimate1),
               ubar = mean(variance),
               b = var(estimate1)),
               #b = (sum(estimate1 ^ 2 + variance) / Mimp) - mean(estimate1) ^ 2),  # Conservative approximation of var(estimate1); does not risk b1 < ubar
           by = c(svar, "y")]

    # Adds "share" and "id" variables
    d <- d[subshr, on = svar]

    # Calculate margin of error
    #maxr <- maxr_fun(Mimp)
    d <- mutate(d,
                # b = ifelse(ubar / b > maxr, ubar / maxr, b),
                # b = ifelse(ubar == 0, 0, b),

                # Rubin 1987
                se = sqrt(ubar + (1 + Mimp^(-1)) * b),
                r = (1 + Mimp^(-1)) * b / ubar,
                df = (Mimp - 1) * (1 + r^(-1)) ^ 2,
                df = ifelse(is.infinite(df), as.integer(share * N - 1), df), # Set 'df' to standard value if Inf is returned by Rubin's formula (i.e. when b = 0; no variance across implicates)
                r = NULL

                # Reiter and Raghunathan (2007)
                # se = sqrt(b * (1 + 1 / Mimp) - ubar),
                # df = (Mimp - 1) * (1 - Mimp * ubar / ((Mimp + 1) * b)) ^ 2,
                # df = ifelse(se == 0, 1, df)
    ) %>%
      select(-ubar, -b, -share, -id) %>%
      #select(-ubar, -b) %>%
      calcMOE()

    # Merge observed and simulated results
    comp <- merge(g, d, by = c(svar, "y"), suffixes = c(".obs", ".sim")) %>%
      select(-all_of(svar))
    return(comp)

  }

  #---

  # Process each subset combination in 'scomb'
  cat("Processing validation analyses for", yN, "fusion variables\n")
  comp <- parallel::mclapply(seq_along(scomb), processSubset, mc.cores = cores) %>%
    rbindlist()

  #---

  # Add the observed mean estimates (est.mean)
  avg <- matrixStats::colWeightedMeans(x = as.matrix(obs[, ..y]), w = obs$W)
  set(comp, j = "est.mean", value = avg[match(comp$y, names(avg))])

  #---

  # Add original factor variable levels, if necessary
  if (length(yfct) > 0) {
    comp <- merge(x = comp, y = attr(obs, "one_hot_link"), by.x = "y", by.y = "dummy", all.x = TRUE, sort = FALSE)
    comp <- comp %>%
      mutate(y = ifelse(is.na(level), y, original)) %>%
      select(-original)
  } else {
    comp$level <- NA
  }

  #---

  # Report total unique subsets analyzed
  cat("Performed", nrow(comp), "analyses across", uniqueN(comp$id), "subsets\n")

  # Data frame with comparison results
  keep <- c("id", "share", "y", "level", "est.obs", "moe.obs", "est.sim", "moe.sim", "est.mean")
  result <- comp[, ..keep]
  result$y <- as.character(result$y)
  class(result) <- c("validate", class(comp))

  # Clean up
  rm(obs, sim, comp)

  #---

  # Check if ggplot2 is installed; if so, call plot_valid() prior to returning result
  if (plot) {
    suppressMessages(ok <- require(ggplot2, quietly = TRUE, warn.conflicts = FALSE))
    if (ok) {
      result <- plot_valid(result, cores = cores)
      class(result) <- c("validate", class(result))
    } else {
      cat("Skipping plot generation because 'ggplot2' package is not installed\n")
    }
  }

  # Report processing time
  tout <- difftime(Sys.time(), t0)
  cat("Total processing time:", signif(as.numeric(tout), 3), attr(tout, "units"), "\n", sep = " ")

  return(result)

}