# R/identificationDML.R
#
# Defines functions: identificationDML
#
# Documented in: identificationDML

#' Testing identification with double machine learning
#'
#' @param y Dependent variable, must not contain missings.
#' @param d Treatment variable, must be discrete, must not contain missings.
#' @param x Covariates, must not contain missings.
#' @param z Instrument, must not contain missings.
#' @param score Orthogonal score used for testing identification, either \code{"DR"} for using the average of the doubly robust (DR) score function (see Section 6 of Huber and Kueck, 2022) for testing, or \code{"squared"} for using squared differences in the conditional means outcomes (see Section 7 of Huber and Kueck, 2022). Default is \code{"DR"}. Note that this argument is ignored if \code{bootstrap=TRUE}.
#' @param bootstrap If set to \code{TRUE}, testing identification is based on the DR score function within data-driven partitioning of the data (using a random forest with 200 trees) as described at the end of Sections 6 and 8 in Huber and Kueck (2022). Default is \code{FALSE}. Note that the argument \code{score} is ignored if \code{bootstrap=TRUE}.
#' @param ztreat Value of the instrument in the "treatment" group. Default is 1.
#' @param zcontrol Value of the instrument in the "control" group. Default is 0.
#' @param seed Default is 123.
#' @param MLmethod Machine learning method for estimating the nuisance parameters based on the \code{SuperLearner} package. Must be either  \code{"lasso"} (default) for lasso estimation,  \code{"randomforest"} for random forests, \code{"xgboost"} for xg boosting,  \code{"svm"} for support vector machines, \code{"ensemble"} for using an ensemble algorithm based on all previously mentioned machine learners, or \code{"parametric"} for linear or logit regression.
#' @param k Number of folds in k-fold cross-fitting. Default is 3.
#' @param DR_parameters List of input parameters to test identification using the doubly robust score:
#' s: Indicator function for defining a subpopulation for which the treatment effect is estimated as a function of the subpopulation's distribution of \code{x}. Default is \code{NULL} (estimation of the average treatment effect in the total population).
#' normalized: If set to \code{TRUE}, then the inverse probability-based weights are normalized such that they add up to 1 within treatment groups. Default is \code{TRUE}
#' trim: Trimming rule for discarding observations with treatment propensity scores that are smaller than \code{trim} or larger than \code{1-trim} (to avoid too small denominators in weighting by the inverse of the propensity scores). Default is 0.01.
#' @param squared_parameters List of input parameters to test identification using the squared deviation:
#' zeta_sigma: standard deviation of the normally distributed errors to avoid a degenerate limit distribution. Default is min(0.5, 500/n), where n is the number of observations.
#' @param bootstrap_parameters List of input parameters to test identification using the DR score and sample splitting to detect heterogeneity (if \code{bootstrap=TRUE}):
#' B: number of bootstrap samples to be used in the multiplier bootstrap. Default is 2000.
#' importance: upper quantile of covariates in terms of their predictive importance for heterogeneity in the DR score function according to a random forest (with 200 trees). The data are split into subsets based on the median values of these predictive covariates (entering the upper quantile). Default is 0.95.
#' alpha: level of the statistical test. Default is 0.1.
#' share: share of observations used to detect heterogeneity in the DR score function by the random forest (while the remaining observations are used for hypothesis testing). Default is 0.5.
#' @details Testing the identification of causal effects of a treatment \code{d} on an outcome \code{y} in observational data using a supposed instrument \code{z} and controlling for observed covariates \code{x}.
#' @return An \code{identificationDML} object contains different parameters, at least the two following:
#' @return \code{effect}: estimate of the target parameter(s).
#' @return \code{pval}: p-value(s) of the identification test.
#' @references Huber, M., & Kueck, J. (2022): Testing the identification of causal effects in observational data. arXiv:2203.15890.
#' @examples # Two examples with simulated data
#' \dontrun{
#' set.seed(777)
#' n <- 20000  # sample size
#' p <- 50    # number of covariates
#' s <- 5  # sparsity (relevant covariates)
#' alpha <- 0.1    # level
#'
#' # control violation of identification
#' delta <- 2    # effect of unobservable in outcome on index of treatment - either 0 or 2
#' gamma <- 0   # direct effect of the instrument on outcome  - either 0 or 0.1
#'
#' # DGP - general
#' xcorr <- 1   # if 1, then non-zero covariance between regressors
#' if (xcorr == 0) {
#'  sigmax <- diag(1,p)}       # covariate matrix at baseline
#' if (xcorr != 0){
#'  sigmax = matrix(NA,p,p)
#' for (i in 1:p){
#'    for (j in 1:p){
#'      sigmax[i,j] = 0.5^(abs(i-j))
#'    }
#'  }}
#' sparse = FALSE # if FALSE, an approximate sparse setting is considered
#' beta = rep(0,p)
#' if (sparse == TRUE){
#'  for (j in 1:s){ beta[j] <- 1} }
#' if (sparse != TRUE){
#'  for (j in 1:p) beta[j] <- (1/j)}
#' noise_U <- 0.1 # control signal-to-noise
#' noise_V <- 0.1
#' noise_W <- 0.25
#' x <- (rmvnorm(n,rep(0,p),sigmax))
#' w <- rnorm(n,0,sd=noise_W)
#' z <- 1*(rnorm(n)>0)
#' d <- (x%*%beta+z+w+rnorm(n,0,sd=noise_V)>0)*1         # treatment equation
#'
#' # DGP 1 - effect homogeneity
#'
#' y <- x%*%beta+d+gamma*z+delta*w+rnorm(n,0,sd=noise_U)
#'
#' output1 <- identificationDML(y = y, d=d, x=x, z=z, score = "DR", bootstrap = FALSE,
#' ztreat = 1, zcontrol = 0 , seed = 123, MLmethod ="lasso", k = 3,
#' DR_parameters = list(s = NULL , normalized = TRUE, trim = 0.01))
#' output1$pval
#' output2 <- identificationDML(y=y, d=d, x=x, z=z, score = "squared", bootstrap = FALSE,
#' ztreat = 1, zcontrol =0 , seed = 123, MLmethod ="lasso", k = 3)
#' output2$pval
#' output3 <- identificationDML(y=y, d=d, x=x, z=z, score = "squared", bootstrap = TRUE,
#' ztreat = 1, zcontrol =0 , seed = 123, MLmethod ="lasso", k = 3,
#' DR_parameters = list(s = NULL , normalized = TRUE, trim = 0.005),
#' bootstrap_parameters = list(B = 2000, importance = 0.95, alpha = 0.1, share = 0.5))
#' output3$pval
#'
#' # DGP 2 - effect heterogeneity
#'
#' y <- x%*%beta+d+gamma*z*x[,1]+gamma*z*x[,2]+delta*w*x[,1]+delta*w*x[,2]+rnorm(n,0,sd=noise_U)
#'
#' output1 <- identificationDML(y = y, d=d, x=x, z=z, score = "DR", bootstrap = FALSE,
#' ztreat = 1, zcontrol = 0 , seed = 123, MLmethod ="lasso", k = 3,
#' DR_parameters = list(s = NULL , normalized = TRUE, trim = 0.01))
#' output1$pval
#' output2 <- identificationDML(y=y, d=d, x=x, z=z, score = "squared", bootstrap = FALSE,
#' ztreat = 1, zcontrol =0 , seed = 123, MLmethod ="lasso", k = 3)
#' output2$pval
#' output3 <- identificationDML(y=y, d=d, x=x, z=z, score = "DR", bootstrap = TRUE,
#' ztreat = 1, zcontrol =0 , seed = 123, MLmethod ="lasso", k = 3,
#' DR_parameters = list(s = NULL , normalized = TRUE, trim = 0.005),
#' bootstrap_parameters = list(B = 2000, importance = 0.95, alpha = 0.1, share = 0.5))
#' output3$pval
#' }
#' @importFrom stats binomial fitted.values glm lm pnorm sd rnorm dnorm quantile coef fitted gaussian median qnorm
#' @import SuperLearner glmnet ranger xgboost e1071 mvtnorm grf checkmate
#' @export

identificationDML <- function(y, d, x, z, score = "DR", bootstrap = FALSE, ztreat = 1, zcontrol = 0,
  seed = 123, MLmethod = "lasso", k = 3,
  DR_parameters = list(s = NULL, normalized = TRUE, trim = 0.01),
  # NROW() (not dim(y)[1]) so the default is well-defined when y is a plain
  # vector rather than a one-column matrix; identical value for matrix input.
  squared_parameters = list(zeta_sigma = min(0.5, 500 / NROW(y))),
  bootstrap_parameters = list(B = 2000, importance = 0.95, alpha = 0.1, share = 0.5))
{
  # Test identification of causal effects (Huber & Kueck, 2022) by treating the
  # instrument z as the "treatment" in a DML estimator and including the actual
  # treatment d among the covariates (cbind(d, x)). Under identification, the
  # resulting effect of z should be zero; pval reports the corresponding test.

  #### Checking arguments ####
  checkmate::assertChoice(score, c("DR", "squared"))
  checkmate::assertLogical(bootstrap)
  checkmate::assertIntegerish(k, lower = 1)
  checkmate::assertList(DR_parameters)
  checkmate::assertList(squared_parameters)
  checkmate::assertList(bootstrap_parameters)

  if (!bootstrap) {
    if (score == "DR") {
      # Average of the doubly robust score function (Section 6 of the paper).
      # NOTE(review): seed is not forwarded here, unlike the other two branches
      # - confirm treatDML() handles seeding internally.
      output <- treatDML(y = y, d = z, x = cbind(d, x), dtreat = ztreat, dcontrol = zcontrol,
        MLmethod = MLmethod, k = k,
        s = DR_parameters$s, normalized = DR_parameters$normalized,
        trim = DR_parameters$trim)
    } else {
      # Squared differences in conditional mean outcomes (Section 7 of the paper).
      output <- treatDML_newscore(y = y, d = z, x = cbind(d, x), dtreat = ztreat, dcontrol = zcontrol,
        seed = seed, MLmethod = MLmethod, k = k,
        zeta_sigma = squared_parameters$zeta_sigma)
    }
  } else {
    # DR score within data-driven partitions of the data plus a multiplier
    # bootstrap (end of Sections 6 and 8 of the paper); `score` is ignored here.
    output <- treatDML_bootstrap(y = y, d = z, x = cbind(d, x), dtreat = ztreat, dcontrol = zcontrol,
      seed = seed, MLmethod = MLmethod, k = k,
      s = DR_parameters$s, normalized = DR_parameters$normalized,
      trim = DR_parameters$trim,
      B = bootstrap_parameters$B, importance = bootstrap_parameters$importance,
      alpha = bootstrap_parameters$alpha, share = bootstrap_parameters$share)
  }

  # Return the result visibly: previously the function ended with an
  # assignment, whose value is returned invisibly, contradicting the
  # documented return of an identificationDML object.
  output
}

# Try the causalweight package in your browser
#
# Any scripts or data that you put into this service are public.
#
# causalweight documentation built on May 4, 2023, 5:10 p.m.