tests/testthat/helpers_dgp.R

make_sim_data <- function(n_samp = 1000, dgp_type) {
  # Simulation helper for generating data under a variety of data-generating
  # processes. In each case, the treatment mechanism is normally distributed
  # with the mean (and, sometimes, variance) being functions of the baseline
  # covariates, while the outcome mechanism is a nontrivial function of the
  # exposure and baseline covariates.

  # Scenario definitions
  # - number indicates _same_ outcome mechanism, similar treatment mechanisms
  # - usually, treatment mechanism complexity differs within scenarios

  # SCENARIO 1a: exposure density is N(mu = f(W), sigma2 = c)
  if (dgp_type == "1a") {
    # define treatment mechanism
    g0 <- function(W1, W2, W3) {
      mu <- 2 * W1 + W2 - 2 * (1 - W1) * W2
      sigma2 <- 2
      return(list(mu = mu, sigma2 = sigma2))
    }

    # define outcome mechanism
    Q0 <- function(A, W1, W2, W3) {
      plogis(3 * (A - 1) + W1 + 1.5 * W2^2 - W3 - 3 * (1 - W1) * W2)
    }

    # simulate data
    W1 <- rbinom(n_samp, 1, 0.6)
    W2 <- runif(n_samp, 0.5, 1.5)
    W3 <- rpois(n_samp, 2)
    g_obs <- g0(W1, W2, W3)
    A <- rnorm(n_samp, g_obs$mu, sqrt(g_obs$sigma2))
    Y <- rbinom(n_samp, 1, Q0(A, W1, W2, W3))
    data_obs <- as_tibble(
      list(W1 = W1, W2 = W2, W3 = W3, A = A, Y = Y)
    )
  }

  # SCENARIO 1b: exposure density is N(mu = f_1(W), sigma2 = f_2(W))
  if (dgp_type == "1b") {
    # define treatment mechanism
    g0 <- function(W1, W2, W3) {
      mu <- W1 + 2 * W2 - 2 * (1 - W1) * W2
      sigma2 <- (W1 * 2) + ((1 - W1) * 0.5) + W2
      return(list(mu = mu, sigma2 = sigma2))
    }

    # define outcome mechanism
    Q0 <- function(A, W1, W2, W3) {
      plogis(5 * (A - 2) + W1 + 3 * W2^4 - 2 * W3 - 4 * (1 - W1) * W2)
    }

    # simulate data
    W1 <- rbinom(n_samp, 1, 0.6)
    W2 <- runif(n_samp, 0.5, 1.5)
    W3 <- rpois(n_samp, 2)
    g_obs <- g0(W1, W2, W3)
    A <- rnorm(n_samp, g_obs$mu, sqrt(g_obs$sigma2))
    Y <- rbinom(n_samp, 1, Q0(A, W1, W2, W3))
    data_obs <- as_tibble(
      list(W1 = W1, W2 = W2, W3 = W3, A = A, Y = Y)
    )
  }

  # SCENARIO 2a: exposure density is Poisson(lambda = f(W))
  if (dgp_type == "2a") {
    # define treatment mechanism
    g0 <- function(W1, W2, W3) {
      lambda <- (1 - W1) + 0.25 * W2^3 + 2 * W1 * W2 + 4
      return(list(rate = lambda))
    }

    # define outcome mechanism
    Q0 <- function(A, W1, W2, W3) {
      plogis(A + 2 * (1 - W1) + 0.5 * W2 + 0.5 * W3 + 2 * W1 * W2 - 7)
    }

    # simulate data
    W1 <- rbinom(n_samp, 1, 0.6)
    W2 <- runif(n_samp, 0.5, 1.5)
    W3 <- rpois(n_samp, 2)
    g_obs <- g0(W1, W2, W3)
    A <- rpois(n_samp, lambda = g_obs$rate)
    Y <- rbinom(n_samp, 1, Q0(A, W1, W2, W3))
    data_obs <- as_tibble(
      list(W1 = W1, W2 = W2, W3 = W3, A = A, Y = Y)
    )
  }

  # output
  dgp_funs <- list(g0 = g0, Q0 = Q0)
  return(list(data_obs = data_obs, dgp_funs = dgp_funs))
}

# get truth for a given data-generating mechanism
# NOTE: g0 and Q0 are functions of {A, W1, W2, W3} only
get_truth <- function(n_samp = 1e7,
                      delta,
                      dgp_type = c("1a", "1b", "2a"),
                      param_type = c("pie", "tsm")) {
  # input check
  dgp_type <- match.arg(dgp_type)
  param_type <- match.arg(param_type)

  # compute large data set from data-generating mechanism
  dgp <- make_sim_data(n_samp = n_samp, dgp_type = dgp_type)
  wow_so_much_data <- dgp$data_obs

  # extract helper functions
  g0 <- dgp$dgp_funs$g0
  Q0 <- dgp$dgp_funs$Q0

  # compute treatment mechanism in large sample
  gn_mech <- with(wow_so_much_data, g0(W1, W2, W3))

  # compute likelihood factors
  if (dgp_type %in% c("1a", "1b")) {
    gn_Aminusdelta <- dnorm(wow_so_much_data$A - delta,
      mean = gn_mech$mu,
      sd = sqrt(gn_mech$sigma2)
    )
    gn_Anat <- dnorm(wow_so_much_data$A,
      mean = gn_mech$mu,
      sd = sqrt(gn_mech$sigma2)
    )
  } else if (dgp_type == "2a") {
    gn_Aminusdelta <- dpois(wow_so_much_data$A - delta,
      lambda = gn_mech$rate
    )
    gn_Anat <- dpois(wow_so_much_data$A,
      lambda = gn_mech$rate
    )
  }
  Qbar_noshift <- with(wow_so_much_data, Q0(A, W1, W2, W3))
  Qbar_shift <- with(wow_so_much_data, Q0(A + delta, W1, W2, W3))

  # EIF for shift parameter
  Hn <- gn_Aminusdelta / gn_Anat
  eif_shift <- Hn * (wow_so_much_data$Y - Qbar_noshift) +
    (Qbar_shift - mean(Qbar_shift))

  if (param_type == "pie") {
    # truth in terms of PIE = EY - EY_delta
    EY <- mean(wow_so_much_data$Y)
    eif_EY <- wow_so_much_data$Y - EY
    eif_pie <- eif_EY - eif_shift
    true_psi <- EY - mean(Qbar_shift)
    eff_bound <- var(eif_pie)
  } else if (param_type == "tsm") {
    # truth in terms of TSM EY_delta
    true_psi <- mean(Qbar_shift)
    eff_bound <- var(eif_shift)
  }

  # output
  return(list(
    psi = true_psi,
    eff_bound = eff_bound
  ))
}
nhejazi/haldensify documentation built on Feb. 23, 2024, 8:25 a.m.