R/misc.R


#' Calculate lower and upper median
#'
#' Calculates the lower and upper median of a vector as proposed in Comment 4.2 in the paper.
#'
#' @param x A numeric vector.
#'
#' @return
#' A list with the lower median, the upper median, and the usual median (the latter being the average of the former two).
#'
#' @references
#' Chernozhukov V., Demirer M., Duflo E., Fernández-Val I. (2020). \dQuote{Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments.} \emph{arXiv preprint arXiv:1712.04802}. URL: \url{https://arxiv.org/abs/1712.04802}.
#'
#' @examples
#' set.seed(1)
#' x <- runif(100)
#' Med(x)
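#'
#' ## for an even-length vector, the lower and upper medians differ
#' Med(1:4) # lower median 2, upper median 3, overall median 2.5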
#'
#' @export
Med <- function(x){

  # get lower median and upper median
  lower <- med_lo(x)
  upper <- med_up(x)

  return(list(lower_median = lower,
              upper_median = upper,
              median = mean(c(lower, upper))))

} # END FUN


## lower median
med_lo <- function(x) stats::quantile(x, probs = 0.5, type = 1, names = FALSE)

## upper median
med_up <- function(x)
{
  x_rev <- -x # negate so that the upper median becomes a lower-median problem
  -stats::quantile(x_rev, probs = 0.5, type = 1, names = FALSE) # negate back
}
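
## Both helpers use type-1 (inverse empirical CDF) quantiles, so each returns
## an observed data point; for odd-length x the lower and upper medians
## coincide, and Med() then reduces to the usual sample median.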



#' Partition a vector into quantile groups
#'
#' Partitions a vector into quantile groups and returns a logical matrix indicating group membership.
#'
#' @param x A numeric vector to be partitioned.
#' @param cutoffs A numeric vector denoting the quantile cutoffs for the partition. The default, \code{c(0.25, 0.5, 0.75)}, partitions \code{x} into quartile groups.
#'
#' @return
#' An object of type \code{"quantile_group"}, which is a logical matrix indicating group membership.
#'
#' @examples
#' set.seed(1)
#' x <- runif(100)
#' cutoffs <- c(0.25, 0.5, 0.75)
#' quantile_group(x, cutoffs)
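#'
#' ## each observation falls into exactly one quantile group
#' all(rowSums(quantile_group(x, cutoffs)) == 1)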
#'
#' @export
quantile_group <- function(x,
                           cutoffs = c(0.25, 0.5, 0.75)){


  # input checks
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(cutoffs))
  stopifnot(0 < min(cutoffs) & max(cutoffs) < 1)

  # return
  quantile_group_NoChecks(x = x, cutoffs = cutoffs)


} # FUN


# same as above, just w/o input checks
quantile_group_NoChecks <- function(x = x,
                                    cutoffs = cutoffs){

  ## number of groups
  num_groups <- length(cutoffs) + 1L

  # to have non-zero within-group variation, we require at least 2 observations per group
  n              <- length(x)
  group_size_min <- 2L

  ### obtain the grouping
  # We require a while loop here because grouping on the raw x might be
  # illegal, that is, the groups' sizes may not correspond to what one would
  # expect under a continuous variable (see above). Such violations can happen
  # when sufficiently large subsets of x have zero variation. To overcome this
  # problem (if present), we add tiny random noise to x and repeat the
  # grouping until a legal grouping is found.
  # NB: this problem was first spotted by Lukas Kitzmueller, who proposed the original bugfix (which we have adapted since). Many thanks to Lukas!
  grouping_unfinished <- TRUE
  ct <- 0L

  while(grouping_unfinished)
  {
    # get empirical quantiles
    q <- stats::quantile(x, cutoffs)

    # get names of breaks
    qnam <- breaks_format(breaks = q, dig.lab = 3L)

    # initialize out matrix and helper objects
    out <- matrix(NA, n, num_groups)
    legal_grouping <- rep(TRUE, num_groups)
    groupnam <- rep(NA_character_, num_groups)

    for(k in seq_len(num_groups))
    {
      ## get the grouping
      if(k == 1L)
      {
        bool_k <- x < q[k]
        groupnam[k] <- paste0("(-Inf, ", qnam[k], ")")

      } else if(k == num_groups)
      {
        bool_k <- x >= q[k-1L]
        groupnam[k] <- paste0("[", qnam[k-1L], ", Inf)")
      } else
      {
        bool_k <- q[k-1L] <= x & x < q[k]
        groupnam[k] <- paste0("[", qnam[k-1L], ", ",  qnam[k], ")")
      } # IF

      ## check if grouping is legal
      # a grouping is illegal if group doesn't have minimum size
      # this can happen if there is very little variation in x
      size_k <- sum(bool_k)
      if(size_k < group_size_min)
      {
        legal_grouping[k] <- FALSE
      } # IF

      ## store the candidate grouping
      out[,k] <- bool_k
    } # FOR k

    ## column naming
    colnames(out) <- groupnam

    ## check if the grouping is complete and legal
    if(all(legal_grouping))
    {
      grouping_unfinished <- FALSE # legal grouping -> will break the while loop
    } else
    {
      ## illegal grouping: induce tiny noise to increase variation
      x <- x + stats::rnorm(n, mean = 0.0, sd = 0.001)
    } # IF

    ## update counter
    ct <- ct + 1L

    if(ct > 3L)
    {
      stop("The specified quantile cutoffs do not allow for a grouping that results in groups with nonzero within-group variation. Increase the expected group size through the quantile cutoffs argument.")
    } # IF
  } # WHILE

  # return
  return(structure(out, type = "quantile_group"))

} # FUN
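

## Illustration of the noise fallback above (a minimal sketch, guarded so it
## is not executed at package load): with many ties, the raw quartile grouping
## is illegal because some groups would be (near-)empty, so tiny noise is
## added until every group has at least two members.
if(FALSE)
{
  set.seed(1)
  x_ties <- c(rep(0, 80), runif(20)) # 80% of observations are tied at zero
  colSums(quantile_group(x_ties))    # all four quartile groups are non-empty
}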



#' Single iteration of the GenericML algorithm
#'
#' Performs generic ML inference for a single learning technique and a given split of the data. Can be seen as a single iteration of Algorithm 1 in the paper.
#'
#' @param Z A numeric design matrix that holds the covariates in its columns.
#' @param D A binary vector of treatment assignment. Value one denotes assignment to the treatment group and value zero assignment to the control group.
#' @param Y A numeric vector containing the response variable.
#' @param learner A character specifying the machine learner to be used for estimating the baseline conditional average (BCA) and conditional average treatment effect (CATE). Either \code{'lasso'}, \code{'random_forest'}, \code{'tree'}, or a custom learner specified with \code{mlr3} syntax. In the latter case, do \emph{not} specify in the \code{mlr3} syntax whether the learner is a regression or classification learner. Example: \code{'mlr3::lrn("ranger", num.trees = 100)'} for a random forest learner with 100 trees. Note that this is a string and that the \code{classif.} and \code{regr.} prefixes are omitted. See \url{https://mlr3learners.mlr-org.com} for a list of \code{mlr3} learners.
#' @param propensity_scores A numeric vector of propensity score estimates.
#' @param M_set A numeric vector of indices of observations in the main sample.
#' @param A_set A numeric vector of indices of observations in the auxiliary sample. Default is the complement of \code{M_set}.
#' @param Z_CLAN A numeric matrix holding variables on which classification analysis (CLAN) shall be performed. CLAN will be performed on each column of the matrix. If \code{NULL} (default), then \code{Z_CLAN = Z}, i.e. CLAN is performed for all variables in \code{Z}.
#' @param HT Logical. If \code{TRUE}, a Horvitz-Thompson (HT) transformation is applied in the BLP and GATES regressions. Default is \code{FALSE}.
#' @param quantile_cutoffs The cutoff points of the quantiles that shall be used for GATES grouping. Default is \code{c(0.25, 0.5, 0.75)}, which corresponds to the four quartile groups.
#' @param X1_BLP Specifies the design matrix \eqn{X_1} in the regression. Must be an object of class \code{"\link{setup_X1}"}. See the documentation of \code{\link{setup_X1}()} for details.
#' @param X1_GATES Same as \code{X1_BLP}, just for the GATES regression.
#' @param diff_GATES Specifies the generic targets of GATES. Must be an object of class \code{"\link{setup_diff}"}. See the documentation of \code{\link{setup_diff}()} for details.
#' @param diff_CLAN Same as \code{diff_GATES}, just for the CLAN generic targets.
#' @param vcov_BLP Specifies the covariance matrix estimator in the BLP regression. Must be an object of class \code{"\link{setup_vcov}"}. See the documentation of \code{\link{setup_vcov}()} for details.
#' @param vcov_GATES Same as \code{vcov_BLP}, just for the GATES regression.
#' @param monotonize Logical. Should GATES point estimates and confidence bounds be rearranged to be monotonically increasing following the monotonization method of Chernozhukov et al. (2009, Biometrika)? Default is \code{TRUE}.
#' @param equal_variances_CLAN \bold{(deprecated and will be removed in a future release)} Logical. If \code{TRUE}, then all within-group variances of the CLAN groups are assumed to be equal. Default is \code{FALSE}. This specification is required for heteroskedasticity-robust variance estimation on the difference of two CLAN generic targets (i.e. variance of the difference of two means). If \code{TRUE} (corresponds to homoskedasticity assumption), the pooled variance is used. If \code{FALSE} (heteroskedasticity), the variance of Welch's t-test is used.
#' @param external_weights Optional vector of external numeric weights for weighted means in CLAN and weighted regression in BLP and GATES (in addition to the standard weights used when \code{HT = FALSE}).
#' @param significance_level Significance level for VEIN. Default is 0.05.
#' @param min_variation Specifies a threshold for the minimum variation of the BCA/CATE predictions. If the variation of a BCA/CATE prediction falls below this threshold, random noise with distribution \eqn{N(0, var(Y)/20)} is added to it. Default is \code{1e-05}.
#'
#' @return
#' A list with the following components:
#' \describe{
#' \item{\code{BLP}}{An object of class \code{"\link{BLP}"}.}
#' \item{\code{GATES}}{An object of class \code{"\link{GATES}"}.}
#' \item{\code{CLAN}}{An object of class \code{"\link{CLAN}"}.}
#' \item{\code{proxy_BCA}}{An object of class \code{"\link{proxy_BCA}"}.}
#' \item{\code{proxy_CATE}}{An object of class \code{"\link{proxy_CATE}"}.}
#' \item{\code{best}}{Estimates of the \eqn{\Lambda} parameters for finding the best learner. Returned by \code{\link{lambda_parameters}()}.}
#' }
#'
#' @details
#' The specifications \code{"lasso"}, \code{"random_forest"}, and \code{"tree"} in \code{learner} correspond to the following \code{mlr3} specifications (we omit the keywords \code{classif.} and \code{regr.}). \code{"lasso"} is a cross-validated Lasso estimator, which corresponds to \code{'mlr3::lrn("cv_glmnet", s = "lambda.min", alpha = 1)'}. \code{"random_forest"} is a random forest with 500 trees, which corresponds to \code{'mlr3::lrn("ranger", num.trees = 500)'}. \code{"tree"} is a tree learner, which corresponds to \code{'mlr3::lrn("rpart")'}.
#'
#' @references
#' Chernozhukov V., Demirer M., Duflo E., Fernández-Val I. (2020). \dQuote{Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments.} \emph{arXiv preprint arXiv:1712.04802}. URL: \url{https://arxiv.org/abs/1712.04802}.
#'
#' Lang M., Binder M., Richter J., Schratz P., Pfisterer F., Coors S., Au Q., Casalicchio G., Kotthoff L., Bischl B. (2019). \dQuote{mlr3: A Modern Object-Oriented Machine Learning Framework in R.} \emph{Journal of Open Source Software}, \bold{4}(44), 1903. \doi{10.21105/joss.01903}.
#'
#' Chernozhukov V., Fernández-Val I., Galichon, A. (2009). \dQuote{Improving Point and Interval Estimators of Monotone Functions by Rearrangement.} \emph{Biometrika}, \bold{96}(3), 559--575. \doi{10.1093/biomet/asp030}.
#'
#' @seealso
#' \code{\link{GenericML}()}
#'
#' @examples
#' if(require("ranger")){
#' ## generate data
#' set.seed(1)
#' n  <- 150                        # number of observations
#' p  <- 5                          # number of covariates
#' Z  <- matrix(runif(n*p), n, p)   # design matrix
#' D  <- rbinom(n, 1, 0.5)          # random treatment assignment
#' Y  <- runif(n)                   # outcome variable
#' propensity_scores <- rep(0.5, n) # propensity scores
#' M_set <- sample(1:n, size = n/2) # main set
#'
#' ## specify learner
#' learner <- "mlr3::lrn('ranger', num.trees = 10)"
#'
#' ## run single GenericML iteration
#' single <- GenericML_single(Z, D, Y, learner, propensity_scores, M_set)
#' single
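#'
#' ## inspect the BLP coefficient estimates of this split
#' single$BLP$coefficients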
#' }
#'
#' @export
GenericML_single <- function(Z, D, Y,
                             learner,
                             propensity_scores,
                             M_set,
                             A_set                = setdiff(1:length(Y), M_set),
                             Z_CLAN               = NULL,
                             HT                   = FALSE,
                             quantile_cutoffs     = c(0.25, 0.5, 0.75),
                             X1_BLP               = setup_X1(),
                             X1_GATES             = setup_X1(),
                             diff_GATES           = setup_diff(),
                             diff_CLAN            = setup_diff(),
                             vcov_BLP             = setup_vcov(),
                             vcov_GATES           = setup_vcov(),
                             monotonize           = TRUE,
                             equal_variances_CLAN = FALSE,
                             external_weights     = NULL,
                             significance_level   = 0.05,
                             min_variation        = 1e-05){

  # input checks
  InputChecks_D(D)
  InputChecks_Y(Y)
  InputChecks_Z(Z)
  InputChecks_Z_CLAN(Z_CLAN)
  InputChecks_equal.length3(D, Y, Z)
  N <- length(Y)
  stopifnot(is.character(learner))
  stopifnot(length(learner) == 1)
  InputChecks_index_set(A_set, N)
  InputChecks_index_set(M_set, N)
  InputChecks_X1(X1_BLP, N)
  InputChecks_X1(X1_GATES, N)
  InputChecks_vcov.control(vcov_BLP)
  InputChecks_vcov.control(vcov_GATES)
  InputChecks_diff(diff_GATES, K = length(quantile_cutoffs) + 1)
  InputChecks_diff(diff_CLAN, K = length(quantile_cutoffs) + 1)
  stopifnot(is.numeric(quantile_cutoffs))
  stopifnot(0 < min(quantile_cutoffs) & max(quantile_cutoffs) < 1)
  stopifnot(is.logical(equal_variances_CLAN))
  stopifnot(is.logical(HT))
  stopifnot(is.logical(monotonize))
  stopifnot(is.numeric(significance_level) & length(significance_level) == 1)
  stopifnot(0.0 < significance_level & significance_level < 0.5)
  stopifnot(is.numeric(min_variation) & min_variation > 0)
  stopifnot(is.numeric(propensity_scores))
  InputChecks_equal.length2(Y, propensity_scores)
  InputChecks_propensity_scores(propensity_scores)
  InputChecks_external_weights(external_weights, N)
  message_changes()


  # if no input provided, set Z_CLAN equal to Z
  if(is.null(Z_CLAN)) Z_CLAN <- Z
  InputChecks_equal.length2(Z, Z_CLAN)

  # set variable names for CLAN
  if(is.null(colnames(Z_CLAN))) colnames(Z_CLAN) <- paste0("V", 1:ncol(Z_CLAN))
  if(any(colnames(Z_CLAN) == "")){

    idx <- which(colnames(Z_CLAN) == "")
    colnames(Z_CLAN)[idx] <- paste0("V", idx)

  } # IF

  # convert the learner string into an mlr3 regression learner
  learner <- get.learner_regr(make.mlr3.environment(learner, regr = TRUE))


  # call the main function
  GenericML_single_NoChecks(Z = Z, D = D, Y = Y,
                            propensity_scores = propensity_scores,
                            learner = learner,
                            M_set = M_set, A_set = A_set,
                            Z_CLAN                     = Z_CLAN,
                            X1_BLP                     = X1_BLP,
                            X1_GATES                   = X1_GATES,
                            HT                         = HT,
                            vcov_BLP                   = setup_vcov_align(vcov_BLP),   # align for consistency
                            vcov_GATES                 = setup_vcov_align(vcov_GATES), # align for consistency
                            equal_variances_CLAN       = equal_variances_CLAN,
                            quantile_cutoffs           = quantile_cutoffs,
                            diff_GATES                 = diff_GATES,
                            diff_CLAN                  = diff_CLAN,
                            monotonize                 = monotonize,
                            significance_level         = significance_level,
                            external_weights           = external_weights,
                            min_variation              = min_variation)

} # END FUN
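

## A minimal sketch (illustration only, reusing the objects from the example
## above; the packaged GenericML() routine performs this properly, including
## median-based VEIN inference): repeat the single iteration over random
## sample splits and aggregate a target estimate by its median, here the BLP
## heterogeneity coefficient beta.2.
if(FALSE)
{
  beta2 <- replicate(5, {
    M <- sample(seq_along(Y), size = floor(length(Y) / 2))
    iter <- GenericML_single(Z, D, Y, learner, propensity_scores, M_set = M)
    iter$BLP$coefficients["beta.2", "Estimate"]
  })
  Med(beta2)$median # median over splits guards against split uncertainty
}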


# helper that skips the input checks
GenericML_single_NoChecks <-
  function(Z, D, Y,
           propensity_scores,
           learner,
           M_set, A_set,
           Z_CLAN                     = NULL,
           X1_BLP                     = setup_X1(),
           X1_GATES                   = setup_X1(),
           HT                         = FALSE,
           vcov_BLP                   = setup_vcov(),
           vcov_GATES                 = setup_vcov(),
           equal_variances_CLAN       = FALSE,
           quantile_cutoffs           = c(0.25, 0.5, 0.75),
           diff_GATES                 = setup_diff(),
           diff_CLAN                  = setup_diff(),
           monotonize                 = TRUE,
           external_weights           = NULL,
           significance_level         = 0.05,
           min_variation              = 1e-05){


    ### step 1a: learn proxy predictors by using the auxiliary set ----

    # get proxy baseline estimates
    proxy_BCA.obj <- proxy_BCA_NoChecks(
      Z = Z, D = D, Y = Y,
      A_set         = A_set,
      learner       = learner,
      min_variation = min_variation)

    # get estimates on main sample
    proxy_BCA_M     <- proxy_BCA.obj$estimates[M_set]

    # get the proxy estimates of the CATE
    proxy_CATE.obj <-
      proxy_CATE_NoChecks(
        Z = Z, D = D, Y = Y,
        A_set          = A_set,
        learner        = learner,
        proxy_BCA      = proxy_BCA.obj$estimates,
        min_variation  = min_variation)

    # get estimates on main sample
    proxy_CATE_M <- proxy_CATE.obj$estimates$CATE[M_set]

    # restrict the X1_controls to M_set
    X1_BLP_M   <-
      setup_X1_NoChecks(
       funs_Z        = X1_BLP$funs_Z,
       covariates    = X1_BLP$covariates[M_set,,drop = FALSE],
       fixed_effects = X1_BLP$fixed_effects[M_set])

    X1_GATES_M <-
      setup_X1_NoChecks(
        funs_Z        = X1_GATES$funs_Z,
        covariates    = X1_GATES$covariates[M_set,,drop = FALSE],
        fixed_effects = X1_GATES$fixed_effects[M_set])

    # restrict the vcov controls to M_set
    vcov_BLP_M   <- setup_vcov_subset(vcov_BLP, idx = M_set)
    vcov_GATES_M <- setup_vcov_subset(vcov_GATES, idx = M_set)


    ### step 1b: estimate BLP parameters ----
    blp.obj <- BLP_NoChecks(
      D                  = D[M_set],
      Y                  = Y[M_set],
      propensity_scores  = propensity_scores[M_set],
      proxy_BCA          = proxy_BCA_M,
      proxy_CATE         = proxy_CATE_M,
      HT                 = HT,
      X1_control         = X1_BLP_M,
      vcov_control       = vcov_BLP_M,
      external_weights   = external_weights[M_set],
      significance_level = significance_level)


    ### step 1c: estimate GATES parameters by OLS ----
    # group the proxy estimators for the CATE in the main sample by quantiles
    membership_M <- quantile_group_NoChecks(proxy_CATE_M,
                                            cutoffs = quantile_cutoffs)

    # estimate GATES
    gates.obj <- GATES_NoChecks(
      D                   = D[M_set],
      Y                   = Y[M_set],
      propensity_scores   = propensity_scores[M_set],
      proxy_BCA           = proxy_BCA_M,
      proxy_CATE          = proxy_CATE_M,
      membership          = membership_M,
      HT                  = HT,
      X1_control          = X1_GATES_M,
      vcov_control        = vcov_GATES_M,
      monotonize          = monotonize,
      diff                = diff_GATES,
      external_weights    = external_weights[M_set],
      significance_level  = significance_level)


    ### step 1d: estimate CLAN parameters on the main sample ----
    clan.obj <- CLAN_NoChecks(
      Z_CLAN             = Z_CLAN[M_set,,drop = FALSE],
      membership         = membership_M,
      equal_variances    = equal_variances_CLAN,
      diff               = diff_CLAN,
      external_weights   = external_weights[M_set],
      significance_level = significance_level)


    ### step 1e: get parameters over which we maximize to find the "best" ML method ----
    best.obj <- lambda_parameters_NoChecks(BLP        = blp.obj,
                                           GATES      = gates.obj,
                                           proxy_CATE = proxy_CATE_M,
                                           membership = membership_M)

    ### organize output in a list ----
    return(list(BLP            = blp.obj,
                GATES          = gates.obj,
                CLAN           = clan.obj,
                proxy_CATE     = proxy_CATE.obj,
                proxy_BCA      = proxy_BCA.obj,
                best           = best.obj
    ))

  } # FUN



#' Estimate the two lambda parameters
#'
#' Estimates the lambda parameters \eqn{\Lambda} and \eqn{\bar{\Lambda}} whose medians are used to find the best ML method.
#'
#' @param BLP An object of class \code{"\link{BLP}"}.
#' @param GATES An object of class \code{"\link{GATES}"}.
#' @param proxy_CATE Proxy estimates of the CATE.
#' @param membership A logical matrix that indicates the group membership of each observation in \code{Z_CLAN}. Needs to be of type \code{"\link{quantile_group}"}. Typically, the grouping is based on CATE estimates, which are for instance returned by \code{\link{proxy_CATE}()}.
#'
#' @return
#' A list containing the estimates of \eqn{\Lambda} and \eqn{\bar{\Lambda}}, denoted \code{lambda} and \code{lambda.bar}, respectively.
#'
#' @references
#' Chernozhukov V., Demirer M., Duflo E., Fernández-Val I. (2020). \dQuote{Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments.} \emph{arXiv preprint arXiv:1712.04802}. URL: \url{https://arxiv.org/abs/1712.04802}.
#'
#' @examples
#' ## generate data
#' set.seed(1)
#' n  <- 200                                # number of observations
#' p  <- 5                                  # number of covariates
#' D  <- rbinom(n, 1, 0.5)                  # random treatment assignment
#' Y  <- runif(n)                           # outcome variable
#' propensity_scores <- rep(0.5, n)         # propensity scores
#' proxy_BCA         <- runif(n)            # proxy BCA estimates
#' proxy_CATE        <- runif(n)            # proxy CATE estimates
#' membership <- quantile_group(proxy_CATE) # group membership
#'
#' ## perform BLP
#' BLP <- BLP(Y, D, propensity_scores, proxy_BCA, proxy_CATE)
#'
#' ## perform GATES
#' GATES <- GATES(Y, D, propensity_scores, proxy_BCA, proxy_CATE, membership)
#'
#' ## get estimates of the lambda parameters
#' lambda_parameters(BLP, GATES, proxy_CATE, membership)
#'
#' @export
lambda_parameters <- function(BLP,
                              GATES,
                              proxy_CATE,
                              membership){

  if(!inherits(x = BLP, what = "BLP", which = FALSE)){
    stop("The BLP object must be an instance of BLP()")
  }
  if(!inherits(x = GATES, what = "GATES", which = FALSE)){
    stop("The GATES object must be an instance of GATES()")
  }
  InputChecks_group.membership(membership)
  stopifnot(is.numeric(proxy_CATE))
  InputChecks_equal.length2(proxy_CATE, membership)
  temp <- GATES$coefficients
  gates.coefs <- temp[startsWith(rownames(temp), "gamma."), "Estimate"]

  if(ncol(membership) != length(gates.coefs)){
    stop("The number of columns of 'membership' must be equal to the number of GATES gamma coefficients")
  }

  # return
  lambda_parameters_NoChecks(BLP = BLP,
                             GATES = GATES,
                             proxy_CATE = proxy_CATE,
                             membership = membership)

} # END FUN


# same as above, but w/o input checks
lambda_parameters_NoChecks <- function(BLP,
                                       GATES,
                                       proxy_CATE,
                                       membership){

  temp <- GATES$coefficients
  gates.coefs <- temp[startsWith(rownames(temp), "gamma."), "Estimate"]

  return(list(lambda = BLP$coefficients["beta.2", "Estimate"]^2 * stats::var(proxy_CATE),
              lambda.bar = as.numeric(colMeans(membership) %*%  gates.coefs^2)))

} # FUN
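

## By-hand check of the two estimands (a sketch reusing the objects created in
## the lambda_parameters() example above): lambda equals the squared BLP
## heterogeneity coefficient beta.2 times the sample variance of the CATE
## proxy, and lambda.bar is the group-share-weighted mean of the squared GATES
## coefficients.
if(FALSE)
{
  lp <- lambda_parameters(BLP, GATES, proxy_CATE, membership)
  all.equal(lp$lambda,
            BLP$coefficients["beta.2", "Estimate"]^2 * stats::var(proxy_CATE))
}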