#' Calculate lower and upper median
#'
#' Calculates the lower and upper median of a vector, as proposed in Comment 4.2 of the paper.
#'
#' @param x A numeric vector.
#'
#' @return
#' A list with the lower median, the upper median, and the usual median (the latter is the average of the former two).
#'
#' @references
#' Chernozhukov V., Demirer M., Duflo E., Fernández-Val I. (2020). \dQuote{Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments.} \emph{arXiv preprint arXiv:1712.04802}. URL: \url{https://arxiv.org/abs/1712.04802}.
#'
#' @examples
#' set.seed(1)
#' x <- runif(100)
#' Med(x)
#'
#' @export
Med <- function(x){

  # get lower median and upper median
  lower <- med_lo(x)
  upper <- med_up(x)

  return(list(lower_median = lower,
              upper_median = upper,
              median = mean(c(lower, upper))))

} # END FUN
## lower median
med_lo <- function(x) stats::quantile(x, probs = 0.5, type = 1, names = FALSE)

## upper median
med_up <- function(x)
{
  x_rev <- -x # negate to reverse the ordering
  -stats::quantile(x_rev, probs = 0.5, type = 1, names = FALSE) # undo the negation
}
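## Usage sketch (not run): for an even number of observations, the lower and
## upper medians differ, and Med() averages them.
# Med(c(1, 2, 3, 4))
# #> lower_median = 2, upper_median = 3, median = 2.5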
#' Partition a vector into quantile groups
#'
#' Partitions a vector into quantile groups and returns a logical matrix indicating group membership.
#'
#' @param x A numeric vector to be partitioned.
#' @param cutoffs A numeric vector denoting the quantile cutoffs for the partition. Default are the quartiles: \code{c(0.25, 0.5, 0.75)}.
#'
#' @return
#' An object of type \code{"quantile_group"}, which is a logical matrix indicating group membership.
#'
#' @examples
#' set.seed(1)
#' x <- runif(100)
#' cutoffs <- c(0.25, 0.5, 0.75)
#' quantile_group(x, cutoffs)
#'
#' @export
quantile_group <- function(x,
                           cutoffs = c(0.25, 0.5, 0.75)){

  # input checks
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(cutoffs))
  stopifnot(0 < min(cutoffs) & max(cutoffs) < 1)

  # return
  quantile_group_NoChecks(x = x, cutoffs = cutoffs)

} # FUN
# same as above, just w/o input checks
quantile_group_NoChecks <- function(x,
                                    cutoffs){

  ## number of groups
  num_groups <- length(cutoffs) + 1L

  # to have non-zero within-group variation, we require at least 2 observations per group
  n <- length(x)
  group_size_min <- 2L

  ### obtain the grouping
  # We require a while loop here because grouping on the raw x might be
  # illegal, that is, the groups' sizes do not correspond to what one would
  # expect under a continuous variable (see above). Such violations can happen
  # when sufficiently large subsets of x have zero variation. To overcome this
  # problem (if it is present), we add tiny random noise to x and repeat the
  # grouping until a legal grouping is found.
  # NB: this problem was first spotted by Lukas Kitzmueller, who proposed the
  # original bugfix (which we have adapted since). Many thanks to Lukas!
  grouping_unfinished <- TRUE
  ct <- 0L

  while(grouping_unfinished)
  {
    # get empirical quantiles
    q <- stats::quantile(x, cutoffs)

    # get names of breaks
    qnam <- breaks_format(breaks = q, dig.lab = 3L)

    # initialize out matrix and helper objects
    out <- matrix(NA, n, num_groups)
    legal_grouping <- rep(TRUE, num_groups)
    groupnam <- rep(NA_character_, num_groups)

    for(k in seq_len(num_groups))
    {
      ## get the grouping
      if(k == 1L)
      {
        bool_k <- x < q[k]
        groupnam[k] <- paste0("(-Inf, ", qnam[k], ")")
      } else if(k == num_groups)
      {
        bool_k <- x >= q[k-1L]
        groupnam[k] <- paste0("[", qnam[k-1L], ", Inf)")
      } else
      {
        bool_k <- q[k-1L] <= x & x < q[k]
        groupnam[k] <- paste0("[", qnam[k-1L], ", ", qnam[k], ")")
      } # IF

      ## check if grouping is legal
      # a grouping is illegal if a group doesn't have the minimum size,
      # which can happen if there is very little variation in x
      size_k <- sum(bool_k)
      if(size_k < group_size_min)
      {
        legal_grouping[k] <- FALSE
      } # IF

      ## store the candidate grouping
      out[,k] <- bool_k

    } # FOR k

    ## column naming
    colnames(out) <- groupnam

    ## check if the grouping is complete and legal
    if(all(legal_grouping))
    {
      grouping_unfinished <- FALSE # legal grouping -> will break the while loop
    } else
    {
      ## illegal grouping: induce tiny noise to increase variation
      x <- x + stats::rnorm(n, mean = 0.0, sd = 0.001)
    } # IF

    ## update counter
    ct <- ct + 1L
    if(ct > 3L)
    {
      stop("The specified quantile cutoffs do not allow for a grouping that results in groups with nonzero within-group variation. Increase the expected group size through the quantile cutoffs argument.")
    } # IF

  } # WHILE

  # return
  return(structure(out, type = "quantile_group"))

} # FUN
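## Usage sketch (not run): the returned logical matrix has one column per
## quantile group, and each observation belongs to exactly one group, so
## every row sums to one.
# g <- quantile_group(runif(100))
# colnames(g)          # interval labels such as "(-Inf, 0.263)", "[0.263, 0.512)", ...
# all(rowSums(g) == 1) # TRUE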
#' Single iteration of the GenericML algorithm
#'
#' Performs generic ML inference for a single learning technique and a given split of the data. Can be seen as a single iteration of Algorithm 1 in the paper.
#'
#' @param Z A numeric design matrix that holds the covariates in its columns.
#' @param D A binary vector of treatment assignment. Value one denotes assignment to the treatment group and value zero assignment to the control group.
#' @param Y A numeric vector containing the response variable.
#' @param learner A character specifying the machine learner to be used for estimating the baseline conditional average (BCA) and conditional average treatment effect (CATE). Either \code{'lasso'}, \code{'random_forest'}, \code{'tree'}, or a custom learner specified with \code{mlr3} syntax. In the latter case, do \emph{not} specify in the \code{mlr3} syntax whether the learner is a regression or classification learner. Example: \code{'mlr3::lrn("ranger", num.trees = 100)'} for a random forest learner with 100 trees. Note that the learner is passed as a string and that the \code{classif.} and \code{regr.} prefixes are omitted. See \url{https://mlr3learners.mlr-org.com} for a list of \code{mlr3} learners.
#' @param propensity_scores A numeric vector of propensity score estimates.
#' @param M_set A numerical vector of indices of observations in the main sample.
#' @param A_set A numerical vector of indices of observations in the auxiliary sample. Default is complementary set to \code{M_set}.
#' @param Z_CLAN A numeric matrix holding variables on which classification analysis (CLAN) shall be performed. CLAN will be performed on each column of the matrix. If \code{NULL} (default), then \code{Z_CLAN = Z}, i.e. CLAN is performed for all variables in \code{Z}.
#' @param HT Logical. If \code{TRUE}, a Horvitz-Thompson (HT) transformation is applied in the BLP and GATES regressions. Default is \code{FALSE}.
#' @param quantile_cutoffs The cutoff points of the quantiles that shall be used for GATES grouping. Default is \code{c(0.25, 0.5, 0.75)}, which partitions the observations into the four quartile groups.
#' @param X1_BLP Specifies the design matrix \eqn{X_1} in the regression. Must be an object of class \code{"\link{setup_X1}"}. See the documentation of \code{\link{setup_X1}()} for details.
#' @param X1_GATES Same as \code{X1_BLP}, just for the GATES regression.
#' @param diff_GATES Specifies the generic targets of GATES. Must be an object of class \code{"\link{setup_diff}"}. See the documentation of \code{\link{setup_diff}()} for details.
#' @param diff_CLAN Same as \code{diff_GATES}, just for the CLAN generic targets.
#' @param vcov_BLP Specifies the covariance matrix estimator in the BLP regression. Must be an object of class \code{"\link{setup_vcov}"}. See the documentation of \code{\link{setup_vcov}()} for details.
#' @param vcov_GATES Same as \code{vcov_BLP}, just for the GATES regression.
#' @param monotonize Logical. Should GATES point estimates and confidence bounds be rearranged to be monotonically increasing following the monotonization method of Chernozhukov et al. (2009, Biometrika)? Default is \code{TRUE}.
#' @param equal_variances_CLAN \bold{(deprecated and will be removed in a future release)} Logical. If \code{TRUE}, then all within-group variances of the CLAN groups are assumed to be equal. Default is \code{FALSE}. This specification is required for heteroskedasticity-robust variance estimation on the difference of two CLAN generic targets (i.e. variance of the difference of two means). If \code{TRUE} (corresponds to homoskedasticity assumption), the pooled variance is used. If \code{FALSE} (heteroskedasticity), the variance of Welch's t-test is used.
#' @param external_weights Optional vector of external numeric weights for weighted means in CLAN and weighted regression in BLP and GATES (in addition to the standard weights used when \code{HT = FALSE}).
#' @param significance_level Significance level for VEIN. Default is 0.05.
#' @param min_variation Specifies a threshold for the minimum variation of the BCA/CATE predictions. If the variation of a BCA/CATE prediction falls below this threshold, random noise with distribution \eqn{N(0, var(Y)/20)} is added to it. Default is \code{1e-05}.
#'
#' @return
#' A list with the following components:
#' \describe{
#' \item{\code{BLP}}{An object of class \code{"\link{BLP}"}.}
#' \item{\code{GATES}}{An object of class \code{"\link{GATES}"}.}
#' \item{\code{CLAN}}{An object of class \code{"\link{CLAN}"}.}
#' \item{\code{proxy_BCA}}{An object of class \code{"\link{proxy_BCA}"}.}
#' \item{\code{proxy_CATE}}{An object of class \code{"\link{proxy_CATE}"}.}
#' \item{\code{best}}{Estimates of the \eqn{\Lambda} parameters for finding the best learner. Returned by \code{\link{lambda_parameters}()}.}
#' }
#'
#' @details
#' The specifications \code{"lasso"}, \code{"random_forest"}, and \code{"tree"} in \code{learner} correspond to the following \code{mlr3} specifications (we omit the keywords \code{classif.} and \code{regr.}). \code{"lasso"} is a cross-validated Lasso estimator, which corresponds to \code{'mlr3::lrn("cv_glmnet", s = "lambda.min", alpha = 1)'}. \code{"random_forest"} is a random forest with 500 trees, which corresponds to \code{'mlr3::lrn("ranger", num.trees = 500)'}. \code{"tree"} is a tree learner, which corresponds to \code{'mlr3::lrn("rpart")'}.
#'
#' @references
#' Chernozhukov V., Demirer M., Duflo E., Fernández-Val I. (2020). \dQuote{Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments.} \emph{arXiv preprint arXiv:1712.04802}. URL: \url{https://arxiv.org/abs/1712.04802}.
#'
#' Lang M., Binder M., Richter J., Schratz P., Pfisterer F., Coors S., Au Q., Casalicchio G., Kotthoff L., Bischl B. (2019). \dQuote{mlr3: A Modern Object-Oriented Machine Learning Framework in R.} \emph{Journal of Open Source Software}, \bold{4}(44), 1903. \doi{10.21105/joss.01903}.
#'
#' Chernozhukov V., Fernández-Val I., Galichon, A. (2009). \dQuote{Improving Point and Interval Estimators of Monotone Functions by Rearrangement.} \emph{Biometrika}, \bold{96}(3), 559--575. \doi{10.1093/biomet/asp030}.
#'
#' @seealso
#' \code{\link{GenericML}()}
#'
#' @examples
#' if(require("ranger")){
#' ## generate data
#' set.seed(1)
#' n <- 150 # number of observations
#' p <- 5 # number of covariates
#' Z <- matrix(runif(n*p), n, p) # design matrix
#' D <- rbinom(n, 1, 0.5) # random treatment assignment
#' Y <- runif(n) # outcome variable
#' propensity_scores <- rep(0.5, n) # propensity scores
#' M_set <- sample(1:n, size = n/2) # main set
#'
#' ## specify learner
#' learner <- "mlr3::lrn('ranger', num.trees = 10)"
#'
#' ## run single GenericML iteration
#' GenericML_single(Z, D, Y, learner, propensity_scores, M_set)
#' }
#'
#' @export
GenericML_single <- function(Z, D, Y,
                             learner,
                             propensity_scores,
                             M_set,
                             A_set = setdiff(1:length(Y), M_set),
                             Z_CLAN = NULL,
                             HT = FALSE,
                             quantile_cutoffs = c(0.25, 0.5, 0.75),
                             X1_BLP = setup_X1(),
                             X1_GATES = setup_X1(),
                             diff_GATES = setup_diff(),
                             diff_CLAN = setup_diff(),
                             vcov_BLP = setup_vcov(),
                             vcov_GATES = setup_vcov(),
                             monotonize = TRUE,
                             equal_variances_CLAN = FALSE,
                             external_weights = NULL,
                             significance_level = 0.05,
                             min_variation = 1e-05){

  # input checks
  InputChecks_D(D)
  InputChecks_Y(Y)
  InputChecks_Z(Z)
  InputChecks_Z_CLAN(Z_CLAN)
  InputChecks_equal.length3(D, Y, Z)
  N <- length(Y)
  stopifnot(is.character(learner))
  stopifnot(length(learner) == 1)
  InputChecks_index_set(A_set, N)
  InputChecks_index_set(M_set, N)
  InputChecks_X1(X1_BLP, N)
  InputChecks_X1(X1_GATES, N)
  InputChecks_vcov.control(vcov_BLP)
  InputChecks_vcov.control(vcov_GATES)
  InputChecks_diff(diff_GATES, K = length(quantile_cutoffs) + 1)
  InputChecks_diff(diff_CLAN, K = length(quantile_cutoffs) + 1)
  stopifnot(is.numeric(quantile_cutoffs))
  stopifnot(0 < min(quantile_cutoffs) & max(quantile_cutoffs) < 1)
  stopifnot(is.logical(equal_variances_CLAN))
  stopifnot(is.logical(HT))
  stopifnot(is.logical(monotonize))
  stopifnot(is.numeric(significance_level) & length(significance_level) == 1)
  stopifnot(0.0 < significance_level & significance_level < 0.5)
  stopifnot(is.numeric(min_variation) & min_variation > 0)
  stopifnot(is.numeric(propensity_scores))
  InputChecks_equal.length2(Y, propensity_scores)
  InputChecks_propensity_scores(propensity_scores)
  InputChecks_external_weights(external_weights, N)
  message_changes()

  # if no input provided, set Z_CLAN equal to Z
  if(is.null(Z_CLAN)) Z_CLAN <- Z
  InputChecks_equal.length2(Z, Z_CLAN)

  # set variable names for CLAN
  if(is.null(colnames(Z_CLAN))) colnames(Z_CLAN) <- paste0("V", 1:ncol(Z_CLAN))
  if(any(colnames(Z_CLAN) == "")){
    idx <- which(colnames(Z_CLAN) == "")
    colnames(Z_CLAN)[idx] <- paste0("V", idx)
  } # IF

  # render the learner an mlr3 environment
  learner <- get.learner_regr(make.mlr3.environment(learner, regr = TRUE))

  # call the main function
  GenericML_single_NoChecks(Z = Z, D = D, Y = Y,
                            propensity_scores = propensity_scores,
                            learner = learner,
                            M_set = M_set, A_set = A_set,
                            Z_CLAN = Z_CLAN,
                            X1_BLP = X1_BLP,
                            X1_GATES = X1_GATES,
                            HT = HT,
                            vcov_BLP = setup_vcov_align(vcov_BLP),     # align for consistency
                            vcov_GATES = setup_vcov_align(vcov_GATES), # align for consistency
                            equal_variances_CLAN = equal_variances_CLAN,
                            quantile_cutoffs = quantile_cutoffs,
                            diff_GATES = diff_GATES,
                            diff_CLAN = diff_CLAN,
                            monotonize = monotonize,
                            significance_level = significance_level,
                            external_weights = external_weights,
                            min_variation = min_variation)

} # END FUN
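## Usage sketch (not run; hypothetical settings): a single iteration with a
## Horvitz-Thompson transformation, tercile grouping, and HC3 standard errors.
## Assumes the objects of the roxygen example above (Z, D, Y, learner,
## propensity_scores, M_set) are in scope.
# GenericML_single(Z, D, Y, learner, propensity_scores, M_set,
#                  HT               = TRUE,
#                  quantile_cutoffs = c(1/3, 2/3),
#                  vcov_BLP         = setup_vcov(estimator = "vcovHC",
#                                                arguments = list(type = "HC3")))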
# helper that skips the input checks
GenericML_single_NoChecks <-
  function(Z, D, Y,
           propensity_scores,
           learner,
           M_set, A_set,
           Z_CLAN = NULL,
           X1_BLP = setup_X1(),
           X1_GATES = setup_X1(),
           HT = FALSE,
           vcov_BLP = setup_vcov(),
           vcov_GATES = setup_vcov(),
           equal_variances_CLAN = FALSE,
           quantile_cutoffs = c(0.25, 0.5, 0.75),
           diff_GATES = setup_diff(),
           diff_CLAN = setup_diff(),
           monotonize = TRUE,
           external_weights = NULL,
           significance_level = 0.05,
           min_variation = 1e-05){

    ### step 1a: learn proxy predictors by using the auxiliary set ----
    # get proxy baseline estimates
    proxy_BCA.obj <- proxy_BCA_NoChecks(
      Z = Z, D = D, Y = Y,
      A_set = A_set,
      learner = learner,
      min_variation = min_variation)

    # get estimates on main sample
    proxy_BCA_M <- proxy_BCA.obj$estimates[M_set]

    # get the proxy estimates of the CATE
    proxy_CATE.obj <-
      proxy_CATE_NoChecks(
        Z = Z, D = D, Y = Y,
        A_set = A_set,
        learner = learner,
        proxy_BCA = proxy_BCA.obj$estimates,
        min_variation = min_variation)

    # get estimates on main sample
    proxy_CATE_M <- proxy_CATE.obj$estimates$CATE[M_set]

    # restrict the X1_controls to M_set
    X1_BLP_M <-
      setup_X1_NoChecks(
        funs_Z = X1_BLP$funs_Z,
        covariates = X1_BLP$covariates[M_set,,drop = FALSE],
        fixed_effects = X1_BLP$fixed_effects[M_set])
    X1_GATES_M <-
      setup_X1_NoChecks(
        funs_Z = X1_GATES$funs_Z,
        covariates = X1_GATES$covariates[M_set,,drop = FALSE],
        fixed_effects = X1_GATES$fixed_effects[M_set])

    # restrict the vcov controls to M_set
    vcov_BLP_M <- setup_vcov_subset(vcov_BLP, idx = M_set)
    vcov_GATES_M <- setup_vcov_subset(vcov_GATES, idx = M_set)

    ### step 1b: estimate BLP parameters ----
    blp.obj <- BLP_NoChecks(
      D = D[M_set],
      Y = Y[M_set],
      propensity_scores = propensity_scores[M_set],
      proxy_BCA = proxy_BCA_M,
      proxy_CATE = proxy_CATE_M,
      HT = HT,
      X1_control = X1_BLP_M,
      vcov_control = vcov_BLP_M,
      external_weights = external_weights[M_set],
      significance_level = significance_level)

    ### step 1c: estimate GATES parameters by OLS ----
    # group the proxy estimators for the CATE in the main sample by quantiles
    membership_M <- quantile_group_NoChecks(proxy_CATE_M,
                                            cutoffs = quantile_cutoffs)

    # estimate GATES
    gates.obj <- GATES_NoChecks(
      D = D[M_set],
      Y = Y[M_set],
      propensity_scores = propensity_scores[M_set],
      proxy_BCA = proxy_BCA_M,
      proxy_CATE = proxy_CATE_M,
      membership = membership_M,
      HT = HT,
      X1_control = X1_GATES_M,
      vcov_control = vcov_GATES_M,
      monotonize = monotonize,
      diff = diff_GATES,
      external_weights = external_weights[M_set],
      significance_level = significance_level)

    ### step 1d: estimate CLAN parameters on the main sample ----
    clan.obj <- CLAN_NoChecks(
      Z_CLAN = Z_CLAN[M_set,,drop = FALSE],
      membership = membership_M,
      equal_variances = equal_variances_CLAN,
      diff = diff_CLAN,
      external_weights = external_weights[M_set],
      significance_level = significance_level)

    ### step 1e: get parameters over which we maximize to find the "best" ML method ----
    best.obj <- lambda_parameters_NoChecks(BLP = blp.obj,
                                           GATES = gates.obj,
                                           proxy_CATE = proxy_CATE_M,
                                           membership = membership_M)

    ### organize output in a list ----
    return(list(BLP = blp.obj,
                GATES = gates.obj,
                CLAN = clan.obj,
                proxy_CATE = proxy_CATE.obj,
                proxy_BCA = proxy_BCA.obj,
                best = best.obj))

  } # FUN
#' Estimate the two lambda parameters
#'
#' Estimates the lambda parameters \eqn{\Lambda} and \eqn{\bar{\Lambda}} whose medians are used to find the best ML method.
#'
#' @param BLP An object of class \code{"\link{BLP}"}.
#' @param GATES An object of class \code{"\link{GATES}"}.
#' @param proxy_CATE Proxy estimates of the CATE.
#' @param membership A logical matrix that indicates the group membership of each observation in \code{Z_CLAN}. Needs to be of type \code{"\link{quantile_group}"}. Typically, the grouping is based on CATE estimates, which are for instance returned by \code{\link{proxy_CATE}()}.
#'
#' @return
#' A list containing the estimates of \eqn{\Lambda} and \eqn{\bar{\Lambda}}, denoted \code{lambda} and \code{lambda.bar}, respectively.
#'
#' @references
#' Chernozhukov V., Demirer M., Duflo E., Fernández-Val I. (2020). \dQuote{Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments.} \emph{arXiv preprint arXiv:1712.04802}. URL: \url{https://arxiv.org/abs/1712.04802}.
#'
#' @examples
#' ## generate data
#' set.seed(1)
#' n <- 200 # number of observations
#' p <- 5 # number of covariates
#' D <- rbinom(n, 1, 0.5) # random treatment assignment
#' Y <- runif(n) # outcome variable
#' propensity_scores <- rep(0.5, n) # propensity scores
#' proxy_BCA <- runif(n) # proxy BCA estimates
#' proxy_CATE <- runif(n) # proxy CATE estimates
#' membership <- quantile_group(proxy_CATE) # group membership
#'
#' ## perform BLP
#' BLP <- BLP(Y, D, propensity_scores, proxy_BCA, proxy_CATE)
#'
#' ## perform GATES
#' GATES <- GATES(Y, D, propensity_scores, proxy_BCA, proxy_CATE, membership)
#'
#' ## get estimates of the lambda parameters
#' lambda_parameters(BLP, GATES, proxy_CATE, membership)
#'
#' @export
lambda_parameters <- function(BLP,
                              GATES,
                              proxy_CATE,
                              membership){

  # input checks
  if(!inherits(x = BLP, what = "BLP", which = FALSE)){
    stop("The BLP object must be an instance of BLP()")
  }
  if(!inherits(x = GATES, what = "GATES", which = FALSE)){
    stop("The GATES object must be an instance of GATES()")
  }
  InputChecks_group.membership(membership)
  stopifnot(is.numeric(proxy_CATE))
  InputChecks_equal.length2(proxy_CATE, membership)

  temp <- GATES$coefficients
  gates.coefs <- temp[startsWith(rownames(temp), "gamma."), "Estimate"]
  if(ncol(membership) != length(gates.coefs)){
    stop("The number of columns of 'membership' must be equal to the number of GATES gamma coefficients")
  }

  # return
  lambda_parameters_NoChecks(BLP = BLP,
                             GATES = GATES,
                             proxy_CATE = proxy_CATE,
                             membership = membership)

} # END FUN
# same as above, but w/o input checks
lambda_parameters_NoChecks <- function(BLP,
                                       GATES,
                                       proxy_CATE,
                                       membership){

  temp <- GATES$coefficients
  gates.coefs <- temp[startsWith(rownames(temp), "gamma."), "Estimate"]

  return(list(lambda     = BLP$coefficients["beta.2", "Estimate"]^2 * stats::var(proxy_CATE),
              lambda.bar = as.numeric(colMeans(membership) %*% gates.coefs^2)))

} # FUN
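## Sketch (not run): lambda.bar is the membership-share-weighted mean of the
## squared GATES coefficients, so it can be recomputed by hand from the
## objects of the roxygen example above.
# lam <- lambda_parameters(BLP, GATES, proxy_CATE, membership)
# gam <- GATES$coefficients[startsWith(rownames(GATES$coefficients), "gamma."), "Estimate"]
# all.equal(lam$lambda.bar, sum(colMeans(membership) * gam^2)) # TRUE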