#' @title Unbiased treatment effect estimation with Horvitz-Thompson
#'
#' @description Produce Horvitz-Thompson estimators of treatment assignment with standard error estimates, confidence intervals
#' and hypothesis tests.
#'
#' @param outcome Outcome vector for a given experiment.
#' @param assignment Assignment vector for the experiment.
#' @param contrasts A list of contrasts. For example, c(-1, 1, 0) would compare treatment 1 to treatment 2 in the example above. In a factorial design we might instead want to compute the AMCE (see Hainmueller et al.). For example, with a 2x2x2 factorial design (8 treatment arms) we might specify c(0.25, -0.25, 0.25, -0.25, 0.25, -0.25, 0.25, -0.25), and the elements of the treatment vector should be in {1, 2, 3, 4, 5, 6, 7, 8}.
#' @param prob_matrix Probability matrix of assignment, as generated by createProbMatrix.
#' @param approx Options for bounding unidentified part of equation.
#' Default: "youngs" - Youngs inequality (see Aronow and Middleton).
#' Other options:
#' "constant effects" - constant effects assumption (See Aronow dissertation section 2.5),
#' "sharp null" - sharp null hypothesis (a special case of the constant effects assumption).
#' @param totals Calculate outcome totals rather than means, defaults to FALSE.
#' @param cluster_id Cluster identifier, if data is in clusters. Outcomes will then be converted to cluster-totals.
#' @param verbose Print additional status messages during estimation, defaults to FALSE.
#' @param parallel Use a registered foreach parallel backend if available, defaults to TRUE.
#' @return estimate, standard_error, p value (two-tailed test of null)
#'
#' @examples
#'
#' # Example using data from RI package.
#' y <- c(8,6,2,0,3,1,1,1,2,2,0,1,0,2,2,4,1,1)
#' z <- c(1,1,0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0)
#' # Generate 10,000 random permutations of the assignment vector.
#' perms = ri::genperms(z, maxiter=10000)
#' # Estimate the probability of assignment for each unit and assignment level.
#' prob_matrix = createProbMatrix(perms)
#' # Estimate the treatment effect using Horvitz-Thompson.
#' htestimate(y, z, contrasts = c(-1, 1), prob_matrix = prob_matrix)
#'
htestimate = function(outcome, assignment, contrasts, prob_matrix,
                      approx = "youngs", totals = FALSE, cluster_id = NULL,
                      verbose = FALSE, parallel = TRUE) {
  # Keep the original assignment vector before we renumber the levels to 1..k.
  raw_assignment = assignment

  # Handle clustering if the cluster id was defined.
  if (!is.null(cluster_id)) {
    cat("Handling clustering with cluster_id variable.\n")
    # Count the distinct assignment values within each cluster (base R; the
    # previous dplyr::n_distinct() usage was not a valid call).
    assignments_per_cluster = tapply(raw_assignment, cluster_id,
                                     function(x) length(unique(x)))
    # Every cluster must receive a single assignment; otherwise error out.
    # NOTE(fix): the original condition was inverted - it raised the error
    # precisely when all clusters were uniformly assigned - and used throw(),
    # which is not an R function.
    if (any(assignments_per_cluster != 1)) {
      stop("Assignments differ within clusters.")
    }
    # TODO: Aggregate the outcome data into cluster totals.
    # TODO: Aggregate the assignment data into cluster totals.
    # TODO: do we need to collapse the probability matrix to clusters?
  }

  # Sort the raw assignment levels in ascending order in case they are not
  # consecutive natural numbers, then renumber each unit's assignment to its
  # rank (1..k). match() is equivalent to the previous dictionary lookup.
  raw_assignment_levels = sort(unique(raw_assignment))
  assignment = match(raw_assignment, raw_assignment_levels)
  # NOTE: createProbMatrix does a similar conversion of assignment levels.
  assignment_levels = seq_along(raw_assignment_levels)
  # K is the number of unique assignment values; N is the number of units.
  k = length(assignment_levels)
  n = length(assignment)

  # 1. Calculate the effect estimate - see Aronow diss pages 14-15, esp. eq 2.8.
  # For each assignment level, scale the outcomes of units with that assignment
  # by the inverse of their probability of assignment (the diagonal of the
  # corresponding block of the probability matrix), generating a level total.
  outcome_totals = rep(NA, k)
  for (level_i in 1:k) {
    # Locate this level's block within the stacked probability matrix.
    cells = getRawMatrixEntries(level_i, level_i, n)
    weights = prob_matrix[cells$start_row:cells$end_row,
                          cells$start_col:cells$end_col][assignment == level_i,
                                                         assignment == level_i]
    # Number of observations with this assignment level.
    n_obs = length(outcome[assignment == level_i])
    # Inverse-probability-weighted (Horvitz-Thompson) total for this level.
    # TODO: convert to a dot product for speed.
    outcome_totals[level_i] = 0
    for (diag_i in 1:n_obs) {
      # With a single observation the subset above is a scalar rather than a
      # matrix, so diag() cannot be used on it.
      if (n_obs > 1) {
        wgt = diag(weights)[diag_i]
      } else {
        wgt = weights
      }
      if (is.na(wgt)) {
        cat("NA for wgt :( Diag_i=", diag_i, "Level_i=", level_i, "\n")
      }
      outcome_totals[level_i] = outcome_totals[level_i] +
        outcome[assignment == level_i][diag_i] / wgt
    }
  }

  # Weight the level totals by the contrast weights and sum.
  if (totals) {
    # Calculate the difference in totals.
    # TODO: confirm this is the correct way with Joel.
    estimate = sum(outcome_totals * contrasts)
  } else {
    # Calculate the difference in means (default).
    estimate = sum(outcome_totals * contrasts) / n
  }

  # 2. Calculate the SE.
  # If we assume constant effects we need potential outcomes for every
  # (unit, assignment) pair; under the sharp null every potential outcome
  # equals the observed outcome.
  if (approx %in% c("constant effects", "sharp null")) {
    potential_outcomes = matrix(nrow = n, ncol = k)
    # TODO: convert to matrix calculations.
    for (unit_i in 1:n) {
      for (assign_a in 1:k) {
        if (assignment[unit_i] == assign_a | approx == "sharp null") {
          # Use the observed value if it exists or if we're using the sharp null.
          temp_outcome = outcome[unit_i]
        } else {
          # Impute the unobserved potential outcome using tau-hat.
          # CK 12/15 - is dividing by n correct here, or should it be based on
          # the # of observed units at that assignment level?
          # TODO: double-check this to make sure imputation is correct.
          temp_outcome = outcome[unit_i] +
            (outcome_totals[assign_a] - outcome_totals[assignment[unit_i]]) / n
        }
        potential_outcomes[unit_i, assign_a] = temp_outcome
      }
    }
    rm(temp_outcome)
  } else {
    # Not used for Young's inequality, but included in the returned results.
    potential_outcomes = NA
  }

  # TODO: create cov_combined for all types of approximation.
  cov_combined = NA
  if (approx == "constant effects") {
    # Setup parallelism. Thanks to Jeremy Coyle's origami package for this approach.
    `%do_op%` = foreach::`%do%`
    # Use parallelism if there is a backend registered, unless parallel == FALSE.
    if (foreach::getDoParRegistered() && parallel) {
      `%do_op%` = foreach::`%dopar%`
      if (verbose) cat("Parallel backend detected: using foreach parallelization.\n")
    } else {
      if (verbose) cat("No parallel backend detected. Operating sequentially.\n")
    }
    # CUE EQ#32: loop over each assignment level and calculate the variance of
    # the total.
    # NOTE(review): the "youngs" branches inside this block are currently
    # unreachable (the block only runs when approx == "constant effects");
    # they are kept for a future generalization.
    # TODO: generalize EQ#32 to multiple treatment arms for reference.
    variance_of_totals = rep(NA, k)
    covariances = matrix(nrow = k, ncol = k)
    for (assign_a in 1:k) {
      if (verbose) {
        cat("Calculating variances for assignment level", assign_a, "of", k, "\n")
      }
      # The same block of the probability matrix is used for every observation,
      # so extract its coordinates outside the unit loops.
      cells = getRawMatrixEntries(assign_a, assign_a, n)
      # Sum the per-observation variance contributions (possibly in parallel).
      var_running_sum = foreach::foreach(obs_k = 1:n, .combine = "sum") %do_op% {
        obs_k_sum = 0
        # Initialize to NA for debugging - helps to track down logic errors.
        first_component = NA
        # Pi_ak: marginal probability that unit k receives assignment a.
        pi_ak = prob_matrix[cells$start_row:cells$end_row,
                            cells$start_col:cells$end_col][obs_k, obs_k]
        if (approx == "youngs") {
          # Equation UEATE#32, first component: T'k(1 - pi_1k)*(Y_k/Pi_1k)^2
          # TODO: does this need to check if pi_ak == 0? Doesn't seem to in the formula.
          first_component = (assignment[obs_k] == assign_a) * (1 - pi_ak) *
            (outcome[obs_k] / pi_ak)^2
        } else if (approx %in% c("constant effects", "sharp null")) {
          # TODO 10/13/15: confirm that we can use sharp null here.
          # Aronow dissertation, Eq 2.15 (p. 18); line 1.
          first_component = pi_ak * (1 - pi_ak) *
            (potential_outcomes[obs_k, assign_a] / pi_ak)^2
        }
        # Stop here if first_component is still NA, so we can find the error.
        stopifnot(!is.na(first_component))
        obs_k_sum = first_component
        # Second and third components of equation 32.
        for (obs_l in 1:n) {
          # Skip this iteration if k == l.
          if (obs_k == obs_l) next
          # Set the joint component to NA as a safety feature.
          joint_component = NA
          # Pi_al: marginal probability that unit l receives assignment a.
          pi_al = prob_matrix[cells$start_row:cells$end_row,
                              cells$start_col:cells$end_col][obs_l, obs_l]
          # Pi_ak_al: joint probability that units k and l both receive a.
          pi_ak_al = prob_matrix[cells$start_row:cells$end_row,
                                 cells$start_col:cells$end_col][obs_k, obs_l]
          if (approx == "youngs") {
            if (pi_ak_al > 0) {
              # Option 1: pi_ij > 0. Second component of equation 32.
              joint_component = (assignment[obs_k] == assign_a) *
                (assignment[obs_l] == assign_a) / pi_ak_al *
                (pi_ak_al - pi_ak * pi_al) *
                outcome[obs_k] / pi_ak * outcome[obs_l] / pi_al
            } else {
              # Option 2: pi_ij = 0. Third component (Young's inequality bound).
              # NOTE(fix): the second term previously started on its own line
              # with a leading "+", so R parsed it as a separate statement and
              # silently dropped it from the sum.
              joint_component = (assignment[obs_k] == assign_a) *
                outcome[obs_k]^2 / (2 * pi_ak) +
                (assignment[obs_l] == assign_a) *
                outcome[obs_l]^2 / (2 * pi_al)
            }
          } else if (approx %in% c("constant effects", "sharp null")) {
            # TODO 10/13/15: confirm that we can use the sharp null here.
            # Aronow diss, EQ 2.15 (p. 18) line 2.
            joint_component = (pi_ak_al - pi_ak * pi_al) *
              potential_outcomes[obs_k, assign_a] / pi_ak *
              potential_outcomes[obs_l, assign_a] / pi_al
          }
          # Stop here if joint_component is still NA.
          stopifnot(!is.na(joint_component))
          obs_k_sum = obs_k_sum + joint_component
        }
        obs_k_sum
      }
      variance_of_totals[assign_a] = var_running_sum
      # CK 12/15 - not storing the variance on the diagonal here because the
      # old_var calculation below assumes the diagonals are NA.
      # Create the covariances for this assignment level. CUE EQ#34.
      # TODO: save work by computing only one triangle of the symmetric matrix.
      for (assign_b in 1:k) {
        # Skip when we are at our own level.
        if (assign_a == assign_b) next
        # Reset the running sum for each level we compare to assign_a.
        cov_running_sum = 0
        for (obs_k in 1:n) {
          cells = getRawMatrixEntries(assign_a, assign_a, n)
          pi_ak = prob_matrix[cells$start_row:cells$end_row,
                              cells$start_col:cells$end_col][obs_k, obs_k]
          for (obs_l in 1:n) {
            # Skip the same observation.
            if (obs_k == obs_l) next
            # Set to NA for safety.
            first_component = NA
            cells = getRawMatrixEntries(assign_a, assign_b, n)
            pi_ak_bl = prob_matrix[cells$start_row:cells$end_row,
                                   cells$start_col:cells$end_col][obs_k, obs_l]
            cells = getRawMatrixEntries(assign_b, assign_b, n)
            pi_bl = prob_matrix[cells$start_row:cells$end_row,
                                cells$start_col:cells$end_col][obs_l, obs_l]
            if (approx == "youngs") {
              # Equation #34 (Young's inequality). Per CUE p. 147, if the joint
              # probability is 0 then set it to 1 to avoid dividing by zero.
              pi_ak_bl_epp = ifelse(pi_ak_bl == 0, 1, pi_ak_bl)
              # First component:
              first_component = (assignment[obs_k] == assign_a) *
                (assignment[obs_l] == assign_b) / pi_ak_bl_epp *
                (pi_ak_bl - pi_ak * pi_bl) *
                outcome[obs_k] * outcome[obs_l] / (pi_ak * pi_bl)
              if (is.nan(first_component)) {
                cat("Generated NaN in cov calculation. assign_a=", assign_a,
                    "assign_b=", assign_b, "obs_k=", obs_k, "obs_l=", obs_l,
                    "Pi_ak=", pi_ak, "Pi_bl=", pi_bl, "Pi_ak_bl=", pi_ak_bl, "\n")
              }
            } else if (approx %in% c("constant effects", "sharp null")) {
              # TODO 10/13/15: confirm that we can use the sharp null here.
              # Aronow diss Eq. 2.15 (p. 18) line 5 (effectively also line 6).
              first_component = (pi_ak_bl - pi_ak * pi_bl) *
                potential_outcomes[obs_k, assign_a] *
                potential_outcomes[obs_l, assign_b] / (pi_ak * pi_bl)
            }
            # Confirm that we generated a new value for first_component.
            stopifnot(!is.na(first_component))
            cov_running_sum = cov_running_sum + first_component
          }
          if (approx == "youngs") {
            # Eq#34 second component:
            cov_running_sum = cov_running_sum -
              (assign_a == assignment[obs_k]) * outcome[obs_k]^2 / (2 * pi_ak)
            # Eq#34 third component:
            cells = getRawMatrixEntries(assign_b, assign_b, n)
            pi_bk = prob_matrix[cells$start_row:cells$end_row,
                                cells$start_col:cells$end_col][obs_k, obs_k]
            cov_running_sum = cov_running_sum -
              (assign_b == assignment[obs_k]) * outcome[obs_k]^2 / (2 * pi_bk)
          }
        }
        covariances[assign_a, assign_b] = cov_running_sum
      }
    }
    # Sampling variance of the estimator: (T = total, 1 = treatment, 0 = control)
    # 1/N^2 * (Var(Y1_T) + Var(Y0_T) - 2Cov(Y1_T, Y0_T)), generalized as a
    # weighted sum of variance and covariance terms. General formula:
    # https://en.wikipedia.org/wiki/Variance#Weighted_sum_of_variables
    # We don't multiply the covariances by two because we use both triangles of
    # the symmetric matrix. na.rm = TRUE because the diagonals are NA - the
    # variances are added separately.
    old_var = sum(variance_of_totals * contrasts^2,
                  sum(contrasts %*% t(contrasts) * covariances, na.rm = TRUE))
    # Merge the variance estimates into the diagonals of the covariance matrix.
    cov_combined = covariances
    for (var_i in 1:length(variance_of_totals)) {
      cov_combined[var_i, var_i] = variance_of_totals[var_i]
    }
    # Should give us the same covariance results in a simpler formula.
    var = sum(contrasts %*% t(contrasts) * cov_combined)
    if (is.na(var) || var < 0) {
      cat("Error: estimated variance is negative or NA:", var,
          ". Contrasts:", paste(contrasts, collapse = ", "), "\n")
      cat("contrasts %*% t(constrasts): \n")
      print(contrasts %*% t(contrasts))
      cat("Covariances:\n")
      print(cov_combined)
    } else {
      # Double-check that both formulas agree, for debugging purposes.
      if (abs(old_var - var) > .Machine$double.eps * 1000) {
        cat("Warning: new formula for variance doesn't match the result from the old variance formula\n")
        cat("Old var:\n")
        print(old_var)
        cat("New var:\n")
        print(var)
      }
    }
  }

  # Joel's matrix formula, generalized to multiple treatments.
  # TODO: expand to also work for constant treatment effects.
  if (approx == "sharp null") {
    # Stack the contrast-weighted observed outcomes by assignment level.
    # (The previous unused y_long / y_exp computation, including a needless
    # matrix inversion, has been removed.)
    y_contrast = NULL
    for (assign_i in 1:k) {
      y_contrast = rbind(y_contrast,
                         as.matrix((assignment == assign_i) * contrasts[assign_i] * outcome))
    }
    p = prob_matrix
    p_diag_inv = solve(diag(diag(p)))
    D_star = p_diag_inv %*% (p - diag(p) %*% t(diag(p))) %*% p_diag_inv
    # sum() converts the 1x1 matrix to a scalar.
    var = sum(t(y_contrast) %*% D_star %*% y_contrast)
  } else if (approx == "youngs") {
    # Stack the contrast-weighted observed outcomes by assignment level.
    y_long = NULL
    for (assign_i in 1:k) {
      y_long = rbind(y_long,
                     as.matrix((assignment == assign_i) * contrasts[assign_i] * outcome))
    }
    # Use Joel's matrix formulas.
    p = prob_matrix
    # Replace zero joint probabilities to avoid division by zero.
    p_plus = p
    p_plus[p_plus == 0] = 0.000001
    p_diag_inv = solve(diag(diag(p)))
    D = p_diag_inv %*% ((p - diag(p) %*% t(diag(p))) / p_plus) %*% p_diag_inv
    M = D + p_diag_inv
    # sum() converts the 1x1 matrix to a scalar, consistent with the sharp
    # null branch above (previously var stayed a 1x1 matrix here).
    var = sum(t(y_long) %*% M %*% y_long)
  }

  # Divide by n^2 if we're not calculating the variance of totals.
  if (!totals) {
    var = var / n^2
  }
  # This will give a NaN if the estimated variance is negative.
  se = sqrt(var)

  # 3. Calculate the two-tailed p-value using the normal approximation.
  # TODO: could we use RI or something else as another way to generate the
  # p-value? E.g. pass in permutations.
  p_value = 2 * pnorm(-abs(estimate / se))

  # Return the results. The variances, covariances, potential outcomes, and
  # outcome totals are included for debugging purposes only.
  result = list(estimate = estimate, std_err = se, p_value = p_value,
                covariances = cov_combined, totals = outcome_totals,
                potential_outcomes = potential_outcomes)
  return(result)
}