################################################################################
# #
# Simulation Body Part #
# #
################################################################################
#' Simulation Body Part
#'
#' @param kfold number of folds for sample splitting
#' @param data data
#' @param trimLowerBound.pi trimming lower bound of pi estimation
#' @param trimLowerBound.pibar trimming lower bound of pibar estimation
#' @param trimUpperBound.pi trimming upper bound of pi estimation
#' @param trimUpperBound.pibar trimming upper bound of pibar estimation
#' @param method estimation method we choose from double machine learning method ("CDML")
#' and simple regression method ("SR"), Hirano & Imbens method ("HI")
#' @param g.method method for regression estimation
#'
#' @param gps.method method for generalized propensity score estimation
#' @param trimLowerBound.t trimming lower bound of treatment vector
#' @param trimUpperBound.t trimming upper bound of treatment vector
#' @param model model varies from "IV", "CTE"
#' @param variation choose from "whole","train.data".
#' Estimate the probability measure by empirical measure
#' of covariates. "whole" uses every data sample,
#' "train.data" only trainning data for estimating empirical measure.
#' As default, we select "whole"
#' @param verbose print in console step by step
#' @param detoured if FALSE method will be done normally;
#' otherwise a special simulation will be examined, where CDML is chosen method, CTE is chosen model,
#' gps.method "series" is not allowed.
#'
#' @return a list of estimated value "yhat" evaluated at "t.grid", with "record" to record details of this simulation
#' @export
simulation <- function(kfold,
data = data,
trimLowerBound.pi = 0.00001,
trimLowerBound.pibar = 0.00001,
trimUpperBound.pi = 10,
trimUpperBound.pibar = 10,
method = "CDML",
g.method = "rf",
gps.method = "series",
trimLowerBound.t = -10,
trimUpperBound.t = 10,
model,
variation = "whole",
verbose = TRUE,
detoured = FALSE)
{
## check whether the number of folds we set is an integer
check.integer <- function(N) {
!grepl("[^[:digit:]]", format(N, digits = 20, scientific = FALSE))
}
## check whether the model name input makes sense
check.model <- function(name) {
return(name == "CTE" | name == "IV")
}
## check whether the method name input makes sense
check.method <- function(name) {
return(name == "CDML" | name == "SR" | name == "HI")
}
## check whether the variation name is valid
check.variation <- function(name) {
return(name == "whole" | name == "data.train")
}
## check whether the g.method name input is valid
check.g.method <- function(name) {
return(name == "rf" |
name == "nnet" |
name == "trees" | name == "MARS" | name == "lasso")
}
## check whether the gps.method name input is valid
check.gps.method <- function(name) {
return(
name == "series" |
name == "linear&normal" |
name == "rf&normal" |
name == "nnet&normal" | name == "rf&boxcox" |
name == "nnet&boxcox" |
name == "linear&boxcox" | name == "boosting&normal" | name == "quantregForest" | name == "quantregForest&normal"
)
}
if (!check.model(model)) {
stop("Please input a valid model name!")
}
if (!check.method(method)) {
stop("Please input a valid method name!")
}
if (!check.variation(variation)) {
stop("Please input a valid variation name!")
}
if (!check.integer(kfold)) {
stop("Please set argument kfold as an integer!")
}
if (!check.g.method(g.method)) {
stop("Please input a valid g.method name!")
}
if (!check.gps.method(gps.method)) {
stop("Please input a valid gps.method name!")
}
if(detoured == TRUE & kfold == 1){
object2 <-
gps.method.estimation(
data = data,
gps.method = gps.method,
treatment.max = trimUpperBound.t,
treatment.min = trimLowerBound.t,
verbose = verbose,
detoured = TRUE
)
dat <-
gps.predict(
data.chunk = data,
data.eval = data,
gps.method = gps.method,
object = object2,
detoured = TRUE
)
return(dat)
}
if (kfold == 1 & detoured == FALSE) {
print("Simulation starts!")
## choose which estimation model we choose
# here we choose double machine learning method
if (method == "CDML") {
## if model is choosen "CTE"
if (model == "CTE") {
## estimation of g and gbar
object <-
g.method.estimation(data = data, g.method = g.method)
g.rf.result <-
g.predict(
object = object,
data.eval = data,
data.chunk = data,
mode = method,
verbose = verbose,
g.method = g.method
)
## combine g estimation to our data
## combine gbar estimation to our data
gbar <- rowMeans(g.rf.result$matrix)
data.with.gbar.g <-
cbind(g = g.rf.result$vector, gbar = gbar, data)
if (length(gbar) != dim(data)[1] &&
length(g.rf.result$vector) != dim(data)[1]) {
stop("g estimation or gbar estimation is invalid!")
}
## estimation of pi, pibar
object2 <-
gps.method.estimation(
data = data,
gps.method = gps.method,
treatment.max = trimUpperBound.t,
treatment.min = trimLowerBound.t,
verbose = verbose
)
pi.series.result <-
gps.predict(
data.chunk = data,
gps.method = gps.method,
data.eval = data,
object = object2
)
## combine pi estimation to our data
## combine pibar estimation to our data
pibar <- rowMeans(pi.series.result$matrix)
data.complete <-
cbind(pi = pi.series.result$vector,
pibar = pibar,
data.with.gbar.g)
if (length(pibar) != dim(data)[1] &&
length(pi.series.result$vector) != dim(data)[1]) {
stop("pi estimation or pibar estimation is invalid!")
}
}
## if model is choosen "IV"
if (model == "IV") {
## estimation of g and gbar
## the instrument does not influence the g estiamtion
object <-
g.method.estimation(data = data[, !(names(data) %in% "z")], g.method = g.method)
## debug ##
##
g.rf.result <-
g.predict(
object = object,
data.eval = data[, !(names(data) %in% "z")],
data.chunk = data[, !(names(data) %in% "z")],
mode = method,
verbose = verbose,
g.method = g.method
)
## combine g estimation to our data
## combine gbar estimation to our data
gbar <- rowMeans(g.rf.result$matrix)
data.with.gbar.g <-
cbind(g = g.rf.result$vector, gbar = gbar, data)
if (length(gbar) != dim(data)[1] &&
length(g.rf.result$vector) != dim(data)[1]) {
stop("g estimation or gbar estimation is invalid!")
}
## estimation of pi, pibar
data.iv <- data[, !(names(data) %in% "t")]
colnames(data.iv)[which(names(data.iv) == "z")] <- "t"
object2 <-
gps.method.estimation(
data = data.iv,
gps.method = gps.method,
treatment.min = min(data.iv[, (names(data.iv) %in% "t")]),
treatment.max = max(data.iv[, (names(data.iv) %in% "t")]),
verbose = verbose
)
pi.series.result <-
gps.predict(
data.chunk = data.iv,
gps.method = gps.method,
data.eval = data.iv,
object = object2
)
## combine pi estimation to our data
## combine pibar estimation to our data
pibar <- rowMeans(pi.series.result$matrix)
data.complete <-
cbind(pi = pi.series.result$vector,
pibar = pibar,
data.with.gbar.g)
if (length(pibar) != dim(data)[1] &&
length(pi.series.result$vector) != dim(data)[1]) {
stop("pi estimation or pibar estimation is invalid!")
}
}
}
## choose which estimation model we choose
# here we choose simple regression method
if (method == "SR") {
if(model == "CTE"){
## estimation of g an gbar
object <-
g.method.estimation(data = data, g.method = g.method)
## generate a evaluate grid
temp.grid <-
seq(
from = trimLowerBound.t,
to = trimUpperBound.t,
length.out = 100 * (trimUpperBound.t - trimLowerBound.t)
)
g.rf.result <-
g.predict(
object = object,
data.eval = temp.grid,
data.chunk = data,
mode = method,
verbose = verbose,
g.method = g.method
)
## combine gbar estimation to our data
gbar <- rowMeans(g.rf.result$matrix)
}
if(model == "IV"){
## estimation of g an gbar
object <-
g.method.estimation(data = data[, !(names(data) %in% "z")], g.method = g.method)
## generate a evaluate grid
temp.grid <-
seq(
from = trimLowerBound.t,
to = trimUpperBound.t,
length.out = 100 * (trimUpperBound.t - trimLowerBound.t)
)
g.rf.result <-
g.predict(
object = object,
data.eval = temp.grid,
data.chunk = data[, !(names(data) %in% "z")],
mode = method,
verbose = verbose,
g.method = g.method
)
## combine gbar estimation to our data
gbar <- rowMeans(g.rf.result$matrix)
}
}
## choose which estimation model we choose
# here we choose Hirano & Imbens method
if (method == "HI") {
## estimation of gps
object <-
gps.method.estimation(
data = data,
gps.method = gps.method,
treatment.max = trimUpperBound.t,
treatment.min = trimLowerBound.t,
verbose = verbose
)
pi.series.result <-
gps.predict(
data.chunk = NULL,
data.eval = data,
gps.method = gps.method,
object = object,
t.grid = seq(
from = trimLowerBound.t,
to = trimUpperBound.t,
length.out = 100 * (trimUpperBound.t - trimLowerBound.t)
)
)
# our REAL complete data to pass down are data.complete and
# data.complete.comple (complements)
data.complete.comple <- pi.series.result$matrix
data.complete <-
cbind(pi = pi.series.result$vector, data)
}
}
## we will consider the sample splitting case in below the else bracket
if(kfold > 1){
print("Simulation starts!")
################### data splitting ##########################
## data splitting
split <- runif(nrow(data))
split.group <-
as.numeric(cut(split, quantile(split, probs = seq(0, 1, 1 / kfold)), include.lowest = TRUE))
# everything okay, now we store the data in this fold to data.complete
data.complete <- NULL
## focus on individual splitting
for (jth.fold in 1:kfold) {
print(paste("fold", jth.fold))
ii <- split.group == jth.fold
nii <- split.group != jth.fold
# training data & evaluation data
data.train <- data[nii,]
data.eval <- data[ii,]
if(detoured == TRUE){
object2 <-
gps.method.estimation(
data = data.train,
gps.method = gps.method,
treatment.max = trimUpperBound.t,
treatment.min = trimLowerBound.t,
verbose = verbose,
detoured = TRUE
)
if (variation == "data.train") {
res <-
gps.predict(
data.chunk = data.train,
data.eval = data.eval,
gps.method = gps.method,
object = object2,
detoured = TRUE
)
}
if (variation == "whole") {
res <-
gps.predict(
data.chunk = data,
data.eval = data.eval,
gps.method = gps.method,
object = object2,
detoured = TRUE
)
}
data.complete <-
rbind(data.complete, res)
}
if(detoured == FALSE){
## choose which estimation model we choose
# here we choose double machine learning method
if (method == "CDML") {
## if the chosen model is "CTE"
if (model == "CTE") {
## estimation of g an gbar
object <-
g.method.estimation(data = data.train, g.method = g.method)
if (variation == "data.train") {
g.rf.result <-
g.predict(
object = object,
data.eval = data.eval,
data.chunk = data.train,
verbose = verbose,
g.method = g.method
)
}
if (variation == "whole") {
g.rf.result <-
g.predict(
object = object,
data.eval = data.eval,
data.chunk = data,
verbose = verbose,
g.method = g.method
)
}
## combine g estimation to our data
## combine gbar estimation to our data
gbar <- rowMeans(g.rf.result$matrix)
data.with.gbar.g <-
cbind(g = g.rf.result$vector,
gbar = gbar,
data.eval)
if (length(gbar) != dim(data.eval)[1] &&
length(g.rf.result$vector) != dim(data.eval)[1]) {
stop("g estimation or gbar estimation is invalid!")
}
## estimation of pi, pibar
object2 <-
gps.method.estimation(
data = data.train,
gps.method = gps.method,
treatment.max = trimUpperBound.t,
treatment.min = trimLowerBound.t,
verbose = verbose
)
if (variation == "data.train") {
pi.series.result <-
gps.predict(
data.chunk = data.train,
data.eval = data.eval,
gps.method = gps.method,
object = object2
)
}
if (variation == "whole") {
pi.series.result <-
gps.predict(
data.chunk = data,
data.eval = data.eval,
gps.method = gps.method,
object = object2
)
}
## combine pi estimation to our data
## combine pibar estimation to our data
pibar <- rowMeans(pi.series.result$matrix)
data.complete.foldwise <-
cbind(pi = pi.series.result$vector,
pibar = pibar,
data.with.gbar.g)
if (length(pibar) != dim(data.eval)[1] &&
length(pi.series.result$vector) != dim(data.eval)[1]) {
stop("pi estimation or pibar estimation is invalid!")
}
data.complete <-
rbind(data.complete, data.complete.foldwise)
}
## if the chosen model is "IV"
if (model == "IV") {
## estimation of g an gbar
object <-
g.method.estimation(data = data.train[, !(names(data.train) %in% "z")], g.method = g.method)
if (variation == "data.train") {
g.rf.result <-
g.predict(
object = object,
data.eval = data.eval[, !(names(data.train) %in% "z")],
data.chunk = data.train[, !(names(data.train) %in% "z")],
verbose = verbose,
g.method = g.method
)
}
if (variation == "whole") {
g.rf.result <-
g.predict(
object = object,
data.eval = data.eval[, !(names(data.train) %in% "z")],
data.chunk = data[, !(names(data.train) %in% "z")],
verbose = verbose,
g.method = g.method
)
}
## combine g estimation to our data
## combine gbar estimation to our data
gbar <- rowMeans(g.rf.result$matrix)
data.with.gbar.g <-
cbind(g = g.rf.result$vector,
gbar = gbar,
data.eval)
if (length(gbar) != dim(data.eval)[1] &&
length(g.rf.result$vector) != dim(data.eval)[1]) {
stop("g estimation or gbar estimation is invalid!")
}
## estimation of pi, pibar
data.iv <- data[, !(names(data) %in% "t")]
colnames(data.iv)[which(names(data.iv) == "z")] <- "t"
data.train.iv <-
data.train[, !(names(data.train) %in% "t")]
colnames(data.train.iv)[which(names(data.train.iv) == "z")] <-
"t"
data.eval.iv <- data.eval[, !(names(data.eval) %in% "t")]
colnames(data.eval.iv)[which(names(data.eval.iv) == "z")] <-
"t"
object2 <-
gps.method.estimation(
data = data.train.iv,
gps.method = gps.method,
treatment.min = min(data.iv[, (names(data.iv) %in% "t")]),
treatment.max = max(data.iv[, (names(data.iv) %in% "t")]),
verbose = verbose
)
if (variation == "data.train") {
pi.series.result <-
gps.predict(
data.chunk = data.train.iv,
data.eval = data.eval.iv,
gps.method = gps.method,
object = object2
)
}
if (variation == "whole") {
pi.series.result <-
gps.predict(
data.chunk = data.iv,
data.eval = data.eval.iv,
gps.method = gps.method,
object = object2
)
}
## combine pi estimation to our data
## combine pibar estimation to our data
pibar <- rowMeans(pi.series.result$matrix)
data.complete.foldwise <-
cbind(pi = pi.series.result$vector,
pibar = pibar,
data.with.gbar.g)
if (length(pibar) != dim(data.eval)[1] &&
length(pi.series.result$vector) != dim(data.eval)[1]) {
stop("pi estimation or pibar estimation is invalid!")
}
data.complete <-
rbind(data.complete, data.complete.foldwise)
}
}
}
#################### the end of data sampling part ####################
}
}
##################### post-processing ######################
if(detoured == FALSE){
## post-processing for CDML method
if (method == "CDML") {
####### we have to filter the conditional density so that ub > pi, pibar > lb ######
data.complete <-
data.complete[(
data.complete$pibar > trimLowerBound.pibar &
trimUpperBound.pibar > data.complete$pibar
),]
data.complete <-
data.complete[(data.complete$pi > trimLowerBound.pi &
trimUpperBound.pi > data.complete$pi),]
row.names(data.complete) <- 1:nrow(data.complete)
##############################################################
## The formula: we use here CDML
pseudo.out <-
(data.complete$y - data.complete$g) / (data.complete$pi / data.complete$pibar) + data.complete$gbar
data.complete <- cbind(data.complete, ybar = pseudo.out)
## to what degree out debiased technique will influence the pseudo-output
influence.pseudo.out <-
abs(
(data.complete$y - data.complete$g) / (data.complete$pi / data.complete$pibar) /
data.complete$gbar
)
###################### local linear regression part! #########################
treatment <- data.complete$t
response <- data.complete$ybar
## centralize and plot!
x <- quantile(response, c(0.01, 0.99))
response.cleaned <-
response[response >= x[1] & response <= x[2]]
treatment.cleaned <-
treatment[response >= x[1] & response <= x[2]]
#plot(treatment.cleaned, response.cleaned)
h <- 0
# choose bandwidth by https://stat.ethz.ch/R-manual/R-devel/library/KernSmooth/html/dpill.html
try(h <- dpill(treatment, response))
# this happens sometimes that by dpill we cannot select bandwidth h
# hence below try to fix this scenerio and report when h is NA
if (is.na(h) | h <= 0) {
print("Caution: dpill fails to choose h. We will choose h manually")
# manually select bandwidth from a given range
bw.from = 0.01 # bandwidth has to be strictly positive
bw.to = (max(treatment) - min(treatment)) / 3 # bandwidth cant be too wide
bw.length.out = 100
epsGrid = seq(bw.from, bw.to, length.out = bw.length.out) # range of tuning parameters from the kernel (bandwidth)
error = rep(NA, length(epsGrid)) # vector to store the errors of the models
# when using different parameter value
# tuning epsilon:
for (ii in 1:length(epsGrid))
{
eps = epsGrid[ii]
object = locpoly(treatment,
response,
bandwidth = eps,
degree = 1)
error[ii] = sum((object$y - response) ^ 2)
}
bestEps = epsGrid[which.min(error)] # best bandwidth
# run this seriesCDE again with best bandwidth and set to choose Delta
final.result = locpoly(
treatment,
response,
bandwidth = bestEps,
degree = 1,
range.x = c(trimLowerBound.t, trimUpperBound.t),
gridsize = 100 * (trimUpperBound.t - trimLowerBound.t)
)
#lines(final.result)
print(h)
} else{
# final.result as the result of local linear regression
final.result <-
locpoly(
treatment,
response,
bandwidth = h,
degree = 1,
range.x = c(trimLowerBound.t, trimUpperBound.t),
gridsize = 100 * (trimUpperBound.t - trimLowerBound.t)
)
#lines(final.result)
}
########## NA-value of CDML estimation filling #############
if (any(is.na(final.result$y)) |
any(is.infinite(final.result$y))) {
## to inform the user that we have done this NA-value filling process
print("CDML estimation cannot estimate some values at the edge.")
cat("\n")
print("Hence we fill the missing edge values with the closest value to the missing ones.")
print("Or we change the Inf value to the closest non-Inf value")
# this problem is because of locpoly does weirdly when estimating treatment effects far from data
### convert value in result.cdml$yhat to NA if the value == infinite
indices <- which(is.infinite(final.result$y))
##debug
print(indices)
for (indice in indices) {
final.result$y[indice] <- NA
}
### change the value in result.cdml$yhat to the closest value, if the value == NA
indices <- which(is.na(final.result$y))
## debug
print(indices)
for (indice in indices) {
if (abs(indice - 1) > abs(indice - length(final.result$y))) {
final.result$y[indice] <-
final.result$y[max(which(!is.na(final.result$y)))]
} else{
final.result$y[indice] <-
final.result$y[min(which(!is.na(final.result$y)))]
}
}
}
}
## post-processing for simple regression method
# if (method == "SR") {
# }
## post-processing for Hirano & Imbens method
if (method == "HI") {
hi.formula <- y ~ pi + I(pi ^ 2) + t + I(t ^ 2) + t * pi
hi.model <- lm(hi.formula, data = data.complete)
t.grid <-
seq(
from = trimLowerBound.t,
to = trimUpperBound.t,
length.out = 100 * (trimUpperBound.t - trimLowerBound.t)
)
yhat <-
rep(NA, length(t.grid)) # vector to save the final result
for (i in 1:length(t.grid)) {
data.temp <- data.complete
data.temp$t <- t.grid[i]
data.temp$pi <- data.complete.comple[i, ]
## do things!
yhat[i] <- mean(predict(hi.model, newdata = data.temp))
}
#plot(x = t.grid,
# y = yhat,
# type = "l")
}
################# returns ###################
########### legend to record ##########
## record the number of covariates
if (model == "CTE") {
numofcov <- dim(data)[2] - 2
}
if (model == "IV") {
numofcov <- dim(data)[2] - 3
}
legend <-
list(
"g.method" = g.method,
"gps.method" = gps.method,
"trimming" = c(trimLowerBound.t, trimUpperBound.t),
"method" = method,
"fold" = kfold,
"covariates" = numofcov,
"samples" = dim(data)[1]
)
# ######################################
if (method == "HI") {
return.list <-
list("yhat" = yhat,
# "t.grid" = t.grid,
"record" = legend)
}
if (method == "CDML") {
return.list <-
list("yhat" = final.result$y,
# "t.grid" = final.result$x,
"record" = legend)
}
if (method == "SR") {
return.list <-
list("yhat" = gbar,
# "t.grid" = temp.grid,
"record" = legend)
#plot(return.list$t.grid, return.list$yhat, type = "l")
}
return(return.list)
}
if(detoured == TRUE)
return(data.complete)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.