hdm.R
In hdm: High-Dimensional Metrics

## ----R_setup, include=FALSE---------------------------------------------------
library(knitr)
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(tidy=TRUE, tidy.opts=list(width.cutoff=80)) #{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}

# # source: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
# knitr::opts_chunk$set(linewidth = NULL)
# hook_output = knit_hooks$get('output')
# knit_hooks$set(output = function(x, options) {
#   # this hook is used only when the linewidth option is not NULL
#   if (!is.null(n <- options$linewidth)) {
#     x = knitr:::split_lines(x)
#     # any lines wider than n should be wrapped
#     if (any(nchar(x) > n)) x = strwrap(x, width = n)
#     x = paste(x, collapse = '\n')
#   }
#   hook_output(x, options)
# })

## ----eval=FALSE---------------------------------------------------------------
#  install.packages("hdm")

## ----eval=FALSE---------------------------------------------------------------
#  remotes::install_github("MartinSpindler/hdm")

## ----echo=TRUE----------------------------------------------------------------
library(hdm)

## ----echo=FALSE,results='hide'----------------------------
library(hdm); library(ggplot2); library(Formula)
options(width=60)


## ----eval=FALSE-------------------------------------------
#  help(package="hdm")
#  help(rlasso)

## ----eval=FALSE-------------------------------------------
#   help.start()

## ----eval=FALSE-------------------------------------------
#  example(rlasso)

## ----DGP_lasso1-------------------------------------------
set.seed(12345)
n = 100 #sample size
p = 100 # number of variables
s = 3 # nubmer of variables with non-zero coefficients
X = matrix(rnorm(n*p), ncol=p)
beta = c(rep(5,s), rep(0,p-s))
Y = X%*%beta + rnorm(n)

## ----Estimation_lasso1------------------------------------
lasso.reg = rlasso(Y~X,post=FALSE)  # use lasso, not-Post-lasso
# lasso.reg = rlasso(X, Y, post=FALSE)
sum.lasso <- summary(lasso.reg, all=FALSE) # can also do print(lasso.reg, all=FALSE)
yhat.lasso = predict(lasso.reg)   #in-sample prediction
Xnew = matrix(rnorm(n*p), ncol=p)  # new X
Ynew =  Xnew%*%beta + rnorm(n)  #new Y
yhat.lasso.new = predict(lasso.reg, newdata=Xnew)  #out-of-sample prediction

post.lasso.reg = rlasso(Y~X,post=TRUE) #now use post-lasso
print(post.lasso.reg, all=FALSE)    # or use  summary(post.lasso.reg, all=FALSE) 
yhat.postlasso = predict(post.lasso.reg)  #in-sample prediction
yhat.postlasso.new = predict(post.lasso.reg, newdata=Xnew)  #out-of-sample prediction
MAE<- apply(cbind(abs(Ynew-yhat.lasso.new), abs(Ynew - yhat.postlasso.new)),2, mean)
names(MAE)<- c("lasso MAE", "Post-lasso MAE")
print(MAE, digits=2)  # MAE for Lasso and Post-Lasso

## ----simulation_partialling_out---------------------------
set.seed(1)
n =5000; p = 20;  X = matrix(rnorm(n*p), ncol=p)
colnames(X) = c("d", paste("x", 1:19, sep=""));xnames = colnames(X)[-1]
beta = rep(1,20)
y = X%*%beta + rnorm(n)
dat = data.frame(y=y, X)

## ----simulation_partialling_out_full_fit------------------
# full fit
fmla = as.formula(paste("y ~ ", paste(colnames(X), collapse= "+")))
full.fit= lm(fmla, data=dat)
summary(full.fit)$coef["d",1:2]

## ----simulation_partialling_out_partial_fit---------------
fmla.y = as.formula(paste("y ~ ", paste(xnames,  collapse= "+")))
fmla.d = as.formula(paste("d ~ ", paste(xnames, collapse= "+")))
# partial fit via ols
rY = lm(fmla.y, data = dat)$res
rD = lm(fmla.d, data = dat)$res
partial.fit.ls= lm(rY~rD)
summary(partial.fit.ls)$coef["rD",1:2]

## ----simulation_partialling_out_partial_fit_lasso---------
# partial fit via post-lasso
rY = rlasso(fmla.y, data =dat)$res
rD = rlasso(fmla.d, data =dat)$res
partial.fit.postlasso= lm(rY~rD)
summary(partial.fit.postlasso)$coef["rD",1:2]

## ----simulation_partialling_out_rlassoEffectone-----------
Eff= rlassoEffect(X[,-1],y,X[,1], method="partialling out")
summary(Eff)$coef[,1:2]

## ----simulation_doubleselection_rlassoEffectone-----------
Eff= rlassoEffect(X[,-1],y,X[,1], method="double selection")
summary(Eff)$coef[,1:2]

## ----DGP_lasso--------------------------------------------
set.seed(1)
n = 100 #sample size
p = 100 # number of variables
s = 3 # nubmer of non-zero variables
X = matrix(rnorm(n*p), ncol=p)
colnames(X) <- paste("X", 1:p, sep="")
beta = c(rep(3,s), rep(0,p-s))
y = 1 + X%*%beta + rnorm(n)
data = data.frame(cbind(y,X))
colnames(data)[1] <- "y"
fm = paste("y ~", paste(colnames(X), collapse="+"))
fm = as.formula(fm)                 

## ----Estimation_inference---------------------------------
#lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50))
lasso.effect = rlassoEffects(fm, I = ~ X1 + X2 + X3 + X50, data=data)
print(lasso.effect)
summary(lasso.effect)
confint(lasso.effect)

## ----joint_Estimation_inference---------------------------
confint(lasso.effect, level=0.95, joint=TRUE)

## ----lasso_plot, eval=FALSE-------------------------------
#  plot(lasso.effect, main="Confidence Intervals")

## ----cps_example------------------------------------------
library(hdm)
data(cps2012)
X <- model.matrix( ~ -1 + female + female:(widowed + divorced + separated + nevermarried  + 
hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3) + 
+ (widowed + divorced + separated + nevermarried  +
 hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3)^2, data=cps2012)
dim(X)
X <- X[,which(apply(X, 2, var)!=0)] # exclude all constant variables
dim(X)
index.gender <- grep("female", colnames(X))
y <- cps2012$lnw

## ----Joint_Estimation-------------------------------------
effects.female <- rlassoEffects(x=X, y=y, index=index.gender)
summary(effects.female)

## ----CPS_example------------------------------------------
joint.CI <- confint(effects.female, level = 0.95, joint = TRUE)
joint.CI
# plot(effects.female, joint=TRUE, level=0.95) # plot of the effects

## ----CPS_Example_formula, eval=FALSE----------------------
#  effects.female <- rlassoEffects(lnw ~ female + female:(widowed + divorced + separated + nevermarried  +
#  hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3)+
#  (widowed + divorced + separated + nevermarried  +
#  hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3)^2, data=cps2012,
#  I = ~  female + female:(widowed + divorced + separated + nevermarried  +
#   hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3))

## ----Growth_processing------------------------------------
data(GrowthData)
dim(GrowthData)
y = GrowthData[,1,drop=F]
d = GrowthData[,3, drop=F]
X = as.matrix(GrowthData)[,-c(1,2,3)]
varnames = colnames(GrowthData)

## ---------------------------------------------------------
xnames= varnames[-c(1,2,3)] # names of X variables
dandxnames= varnames[-c(1,2)] # names of D and X variables
# create formulas by pasting names (this saves typing times)
fmla= as.formula(paste("Outcome ~ ", paste(dandxnames, collapse= "+")))
ls.effect= lm(fmla, data=GrowthData)

## ----Growth_analysis--------------------------------------
dX = as.matrix(cbind(d,X))
lasso.effect = rlassoEffect(x=X, y=y, d=d, method="partialling out")
summary(lasso.effect)

## ----Growth_analysis2-------------------------------------
dX = as.matrix(cbind(d,X))
doublesel.effect = rlassoEffect(x=X, y=y, d=d, method="double selection")
summary(doublesel.effect)

## ----summary_results--------------------------------------
library(knitr)
table= rbind(summary(ls.effect)$coef["gdpsh465",1:2],
  summary(lasso.effect)$coef[,1:2],
  summary(doublesel.effect)$coef[,1:2])
colnames(table)= c("Estimate", "Std. Error") #names(summary(full.fit)$coef)[1:2]
rownames(table)= c("full reg via ols",  "partial reg
via post-lasso ", "partial reg via double selection")
kable(table)

## ----AJR_processing---------------------------------------
data(AJR); y = AJR$GDP; d = AJR$Exprop; z = AJR$logMort
x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + 
                  Asia + Namer + Samer)^2, data=AJR)
dim(x)

## ----AJR_analysis-----------------------------------------
#AJR.Xselect = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=FALSE)
AJR.Xselect = rlassoIV(GDP ~ Exprop +  (Latitude + Latitude2 + Africa + 
                  Asia + Namer + Samer)^2 | logMort +  (Latitude + Latitude2 + Africa + 
                  Asia + Namer + Samer)^2, data=AJR, select.X=TRUE, select.Z=FALSE)
summary(AJR.Xselect)
confint(AJR.Xselect)

## ----AJR_partiallingout_ols-------------------------------
# parialling out by linear model
fmla.y = GDP ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
fmla.d = Exprop ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
fmla.z = logMort ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
rY = lm(fmla.y, data = AJR)$res
rD = lm(fmla.d, data = AJR)$res
rZ = lm(fmla.z, data = AJR)$res
#ivfit.lm = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
ivfit.lm = tsls(rY ~ rD | rZ, intercept = FALSE)
print(cbind(ivfit.lm$coef, ivfit.lm$se),digits=3)

## ----AJR_partiallingout_lasso-----------------------------
# parialling out by lasso
rY = rlasso(fmla.y, data = AJR)$res
rD = rlasso(fmla.d, data = AJR)$res
rZ = rlasso(fmla.z, data = AJR)$res
#ivfit.lasso = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
ivfit.lasso = tsls(rY ~ rD | rZ, intercept = FALSE)
print(cbind(ivfit.lasso$coef, ivfit.lasso$se), digits=3)

## ----lassoIV----------------------------------------------
data(EminentDomain)
z <- as.matrix(EminentDomain$logGDP$z)
x <- as.matrix(EminentDomain$logGDP$x)
y <- EminentDomain$logGDP$y
d <- EminentDomain$logGDP$d
x <- x[,apply(x, 2, mean, na.rm=TRUE) > 0.05] #
z <- z[,apply(z, 2, mean, na.rm=TRUE) > 0.05] # 

## ----OLS_EminentDomina------------------------------------
ED.ols = lm(y~cbind(d,x))
ED.2sls = tsls(y=y, d=d, x=x, z=z[,1:2], intercept=FALSE)

## ----ED_analysis------------------------------------------
lasso.IV.Z = rlassoIV(x=x, d=d, y=y, z=z, select.X=FALSE, select.Z=TRUE) 
# or lasso.IV.Z = rlassoIVselectZt(x=X, d=d, y=y, z=z) 
summary(lasso.IV.Z)
confint(lasso.IV.Z)

## ----lassoIV_analysis-------------------------------------
lasso.IV.XZ = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=TRUE) 
summary(lasso.IV.XZ)
confint(lasso.IV.XZ)

## ----summary_results_ED,  include=TRUE--------------------
table= matrix(0, 4, 2)
table[1,]= summary(ED.ols)$coef[2,1:2]
table[2,]= cbind(ED.2sls$coef[1], ED.2sls$se[1])
table[3,]= summary(lasso.IV.Z)[,1:2]
table[4,]= summary(lasso.IV.XZ)[,1:2]
colnames(table)= c("Estimate", "Std. Error")
rownames(table)= c("ols regression",  "IV estimation ", "selection on Z", "selection on X and Z")
kable(table)

## ----401_processing---------------------------------------
data(pension)
y = pension$tw; d = pension$p401; z = pension$e401
X = pension[,c("i2", "i3", "i4", "i5", "i6", "i7", "a2", "a3", "a4", "a5",
    "fsize", "hs", "smcol", "col", "marr", "twoearn", "db", "pira", "hown")] # simple model
xvar = c("i2", "i3", "i4", "i5", "i6", "i7", "a2", "a3", "a4", "a5",
    "fsize", "hs", "smcol", "col", "marr", "twoearn", "db", "pira", "hown")
xpart = paste(xvar, collapse = "+")
form <- as.formula(paste("tw ~ ", paste(c("p401", xvar), collapse ="+"), "|",
paste(xvar, collapse = "+")))
formZ <- as.formula(paste("tw ~ ", paste(c("p401", xvar), collapse ="+"), "|",
paste(c("e401", xvar), collapse = "+")))

## ----401_analysis_ATE-------------------------------------
#pension.ate =  rlassoATE(X,d,y)
pension.ate = rlassoATE(form, data = pension)
summary(pension.ate)
#pension.atet =  rlassoATET(X,d,y)
pension.atet = rlassoATET(form, data = pension)
summary(pension.atet)

## ----401_analysis_LATE------------------------------------
pension.late = rlassoLATE(X,d,y,z, always_takers = FALSE)
#pension.late = rlassoLATE(formZ, data=pension, always_takers = FALSE)
summary(pension.late)
pension.latet = rlassoLATET(X,d,y,z, always_takers = FALSE)
#pension.latet = rlassoLATET(formZ, data=pension, always_takers = FALSE)
summary(pension.latet)

## ----summary_pension, include=TRUE------------------------
table= matrix(0, 4, 2)
table[1,]= summary(pension.ate)[,1:2]
table[2,]= summary(pension.atet)[,1:2]
table[3,]= summary(pension.late)[,1:2]
table[4,]= summary(pension.latet)[,1:2]
colnames(table)= c("Estimate", "Std. Error")
rownames(table)= c("ATE",  "ATET ", "LATE", "LATET")
kable(table)

## ----401_processing_interaction---------------------------

# generate all interactions of X's
xvar2 = paste("(", paste(xvar, collapse = "+"), ")^2", sep="")

# ATE and ATE with interactions
forminteract = formula(paste("tw ~", xvar2, " + p401", 
                                     "|", 
                                     xvar2, sep=""))
# LATE and LATET  with interactions
formZinteract = formula(paste("tw ~", xvar2, " + p401", 
                                     "|", 
                                     xvar2, " + e401", sep=""))

## ----401_analysis_interaction, results="hide", echo="FALSE"----
# pension.ate= rlassoATE(forminteract, data = pension)
# summary(pension.ate)
# 
# pension.atet= rlassoATET(forminteract, data = pension)
# summary(pension.atet)
# 
# pension.late= rlassoLATE(formZinteract, data = pension, always_takers = FALSE)
# summary(pension.late)
# 
# pension.latet= rlassoLATET(formZinteract, data = pension, always_takers = FALSE)
# summary(pension.latet)
# 
# table= matrix(0, 4, 2)
# table[1,]= summary(pension.ate)[,1:2]
# table[2,]= summary(pension.atet)[,1:2]
# table[3,]= summary(pension.late)[,1:2]
# table[4,]= summary(pension.latet)[,1:2]
# colnames(table)= c("Estimate", "Std. Error")
# rownames(table)= c("ATE",  "ATET ", "LATE", "LATET")
# tab= xtable(table, digits=c(2, 2,2))

## ----results="asis",echo=FALSE----------------------------
# tab

## ----lasso------------------------------------------------
#library(hdm)
#library(stats)
set.seed(1)
n = 100
p1 = 20
p2= 20
D= matrix(rnorm(n*p1), n, p1)  # Causes
W= matrix(rnorm(n*p2), n, p2)  # Controls
X = cbind(D,W)   # Regressors
Y = D[,1]*5 + W[,1]*5 + rnorm(n)  #Outcome
confint(rlassoEffects(X,Y, index=c(1:p1)), joint=TRUE) 
#BCK Joint Confidence Band for Reg Coefficients 1 to 20

## ----siminf-----------------------------------------------
library(mvtnorm) 
set.seed(1)
n = 100
p = 80
s = 9
covar=toeplitz(0.9^(0:(p-1)))
diag(covar) = rep(1,p)
mu = rep(0,p)
X = mvtnorm::rmvnorm(n=n, mean=mu, sigma=covar)  # Regressors
beta = c(s:1, rep(0, p-s))
Y = X%*% beta + rnorm(n, sd = 5)  #Outcome
# Estimate rlassoEffects
rl = rlassoEffects(X,Y, index=c(1:p))

# unadjusted
p.unadj = p_adjust(rl, method = "none")
# Number of rejections at a prespecified significance level
sum(p.unadj[,2]<0.05)

## ----siminf2----------------------------------------------
# Romano-Wolf Stepdown Correction
p.rw = p_adjust(rl, method = "RW", B = 1000)
# Number of rejections at a prespecified significance level (5%)
sum(p.rw[,2]<0.05)

## ----siminf3----------------------------------------------
# Adjust with Bonferroni correction
p.bonf = p_adjust(rl, method = "bonferroni")
# Number of rejections at a prespecified significance level
sum(p.bonf[,2]<0.05)

# Romano-Wolf Stepdown Correction
p.bh = p_adjust(rl, method = "BH")
# Number of rejections at a prespecified significance level
sum(p.bh[,2]<0.05)

## ----results="hide"---------------------------------------
data(pension)

## ----eval=FALSE-------------------------------------------
#  help(pension)

## ----results="hide"---------------------------------------
data(GrowthData)

## ----results="hide"---------------------------------------
data(AJR)

## ----results="hide"---------------------------------------
data(EminentDomain)

## ----results="hide"---------------------------------------
data(BLP)

## ----results="hide"---------------------------------------
data(cps2012)