# R/wrapperFunc.R In jackiemauro/hurdleIV:

#### Documented in hurdle.IV

```#' Hurdle IV regression
#'
#' @description Run a hurdle IV regression, either lognormal IV or cragg IV.
#' You should specify the model you want to fit. Currently this does not have
#' a nice summary function for the output.
#'
#' @param formula the second stage regression: y~exogenous + endogenous
#' @param inst a vector of your instrument(s): c(inst1,inst2)
#' @param endog a vector of your endogenous variable(s): c(end1,end2)
#' @param exog a vector of your exogenous variable(s): c(ex1,ex2)
#' @param data the dataframe
#' @param endog_reg a list of endogenous regression formulae. By default will
#' estimate endogenous ~ all exogenous variables + all instruments
#' @param start_val an optional list of start values. By default the function
#' will find start values using simple linear/probit regressions of the specified
#' formulae
#' @param type either "lognormal" or "cragg"
#' @param options: cholesky true or false; maxit = maximal number of iterations;
#' trace = 0; method = "BFGS". Options are similar to those for optim, see
#' optim documentation for more details
#'
#' @return Returns estimated parameters as well as a the hessian and standard
#' deviations

hurdle.IV<-function(formula,
inst,
endog,
exog,
data,
endog_reg = list(),
start_val = list(),
type = "lognormal",
options = list(cholesky = T
, maxit = 5000
, trace = 0
, method = "BFGS")
){

# helper fn
rename.input <- function(input){
if(class(input)=="name"){
name = toString(input)
}
else if(class(input) == "call"){
text = input[-1]
k = length(text)
name = NULL
for(i in 1:k){name[i] = toString(text[[i]])}
return(name)
}
}

###### format your data, grouping variable types #####
data = data[complete.cases(data),]
attach(data)
inst_mat = as.data.frame(matrix(inst,nrow = dim(data)[1], byrow = F))
inst_names = rename.input(substitute(inst))
names(inst_mat)<-inst_names

endog_mat = as.data.frame(matrix(endog, nrow = dim(data)[1], byrow = F))
endog_names = rename.input(substitute(endog))
names(endog_mat)<- endog_names

if(is.null(exog)){
# if you want to run with no covariates
exog_mat = data.frame(none = rep(0,dim(data)[1]))
exog_names = NULL
}
else{
exog_mat = as.data.frame(matrix(exog, nrow = dim(data)[1], byrow = F))
exog_names = rename.input(substitute(exog))
}
names(exog_mat)<- exog_names

outcome = eval(parse(text=paste("data\$",formula[[2]],sep = "")))
y_mat = model.matrix(formula)
y_colnames = colnames(y_mat)
y_exog = exog_mat[names(exog_mat) %in% y_colnames]
y_endog = endog_mat[names(endog_mat) %in% y_colnames]
if(any(names(inst_mat)%in% y_colnames)){
print("Error: main regression cannot include instruments")
return(NA)
}
######### checks #########
#check you have a formula
tryCatch({
form = as.formula(formula)
}, error = function(e){
detach(data)
stop("Error: formula must be a formula, eg: y~x1+x2")
}
)
# check the formula includes endogenous variables
if(dim(y_endog)[2]==0){
detach(data)
stop("Error: endogenous variables must be included in main regression")
}

# check no more endogenous variables than instruments
if(length(inst)<length(endog)){
detach(data)
stop("Error: More endogenous variables than instruments")
}

# if not specified, replace endog_reg with formula that has all exog's, all inst's
if(!is.list(endog_reg)){
stop("Error: endog_reg must be a list (can be empty)")
}

##### make endog reg #####
if(length(endog_reg) == 0){
a = names(endog_mat)[1]
d = paste(names(inst_mat), collapse = "+")
if(is.null(exog)){endog_reg = list(as.formula(paste(a,"~",d)))}
else{
b = paste(names(exog_mat),collapse = "+")
endog_reg = list(as.formula(paste(a,"~",b,"+",d)))
}
}
else{
if(length(endog_reg)!=dim(endog_mat)[2]){
detach(data)
stop("Error: Need one regression for each endogenous variable")
}
}

ER_mat = lapply(endog_reg, function(x) model.matrix(x))
ER_colnames = lapply(ER_mat, function(x) colnames(x))
ER_exog = lapply(ER_colnames, function(x) exog_mat[names(exog_mat) %in% x])
ER_inst = lapply(ER_colnames, function(x) inst_mat[names(inst_mat) %in% x])

### drop NA's
if(is.null(exog)){mf = cbind(outcome, inst_mat, endog_mat)}
else{mf = cbind(outcome, exog_mat, inst_mat, endog_mat)}
mf = mf[complete.cases(mf),] #drop NA's
detach(data)
attach(mf)

############# get start values #######
# if start values aren't specified, get start values
if(length(start_val) == 0){
start_val = start.val(formula = update(form,outcome~.)
, endog_reg = endog_reg
, data = mf
, type = type)
}

else{
if(class(start_val) != "list"){
detach(mf)
stop("Error: start values must be a list")
}
if(length(start_val[['beta']])!=length(start_val[['gamma']])){
detach(mf)
stop("Error: beta and gamma must be equal length vectors")
}
k = dim(endog_mat)[2]
if(length(start_val\$pi)!=k | length(start_val\$tau0)!= k | length(start_val\$tau1)!=k){
detach(mf)
stop("Error: tau0, tau1 and number of pi coordinates must equal number of endogenous variables")
}
if(is.null(names(start_val\$gammas))){
names(start_val\$gamma)<-c("(Intercept)", exog_names, endog_names)
}
if(is.null(names(start_val\$beta))){
names(start_val\$beta)<-c("(Intercept)", exog_names, endog_names)
}
}

# start cov values should be cholesky-fied if that option is true
cov_start = make.cov(rho=start_val\$rho
,tau0=start_val\$tau0
,tau1=start_val\$tau1
,y_sd=start_val\$y_sd
,endog_sd=start_val\$endog_sd)
if(options\$cholesky == T){
cov_start = chol(cov_start)
}

########## run optimizer #############
# save info about parameters and model matrices
pars = list(len_cov = length(which(upper.tri(cov_start, diag = T)))-1
,num_endog = dim(endog_mat)[2]
,num_betas = dim(y_mat)[2]
#  ,num_pis = lapply(ER_mat, function(x) dim(x)[2])
,numpis = lapply(ER_mat, function(x) dim(x)[2])
,myChol = options\$cholesky
,ER_mat = ER_mat
,y_mat = y_mat
,endog_mat = endog_mat
,inst_names = inst_names
,exog_names = exog_names
,endog_names = endog_names)
attach(pars)

cov_in = cov_start[upper.tri(cov_start, diag = T)][-1]
start_vec = c(cov_in, start_val\$beta, start_val\$gamma, unlist(start_val\$pi))

if(type == "lognormal"){
ll = loglik_lgiv
}
else if(type == "cragg"){
ll = loglik_craggiv
}

out = optim(start_vec
, ll
, method= options\$method
, hessian  = T
, control = list(maxit = options\$maxit,trace = options\$trace)
)
name_pars = name.pieces(out\$par)
name_pars\$cov = reconstitute.cov(vals=name_pars\$cov_start
,num=num_endog
,chol=myChol)
name_pars\$cov_start <-NULL
detach(mf)
detach(pars)
sds = 1/(diag(out\$hessian))

if(min(eigen(out\$hessian)\$value)*max(eigen(out\$hessian)\$value)<0){