#' Simulate a dataset
#'
#' @description Simulates a dataset from either Cragg or Lognormal distributions.
#' Allows you to specify the data generating process, though it still
#' can't handle more than one endogenous regression
#'
#' @param formula For now, must be False. In the future, will allow you to leave
#' out exogenous variables you use in first stage regressions from second
#' stage regressions.
#' @param pi a vector of the coefficients on your first stage regression.
#' Should be in the order: intercept, exogenous variables, instruments.
#' Defaults to (1,-1,3).
#' @param gamma a vector of the coefficients on your second stage probit
#' regression. Should be in the order: intercept, exogenous variables,
#' endogenous variable. Defaults to (-.2,.8,.07)
#' @param beta a vector of the coefficients on your second stage linear
#' regression. Should be in the order: intercept, exogenous variables,
#' endogenous variable. Defaults to (.05,.06,.02)
#' @param endog_reg for now, leave as an empty list. Will create one endogenous
#' regression which includes the instrument and all exogenous variables
#' @param exog_mean the mean(s) of your exogenous variable(s). Defaults to 1
#' @param exog_sd the standard deviation(s) of your exogenous variable(s).
#' Defaults to 1.
#' @param z_mean the mean(s) of your instrument(s). Defaults to 3.
#' @param z_sd the standard deviation(s) of your instrument(s).
#' Defaults to 1.
#' @param y_sd the standard deviation of your second stage linear regression.
#' Defaults to 2.
#' @param rho the covariance term between second stage linear and probit
#' regressions. Defaults to 0.2.
#' @param tau0 the covariance term between the second stage probit and the
#' first stage regression. Defaults to 0.3.
#' @param tau1 the covariance term between the second stage linear regression and the
#' first stage regression. Defaults to 0.1.
#' @param n the number of observations. Defaults to 10,000.
#' @param type defaults to lognormal. Enter "cragg" for cragg model.
#' @param options Options are: silent = F (will print histograms and percent censored),
#' cragg_errors = {1,2,3,4} cragg error type 4 will generate x2 and y0 first, then
#' generate y1 from a conditional distribution on the first two and on itself being
#' positive.
#' See documentation for cragg_errs for more detail.
#'
#' @return returns a simulated dataset
hurdle.IV.sim <- function(formula = F,
pi = c(1,-1,3), #intercept, exog, inst
gamma = c(-.2,.8,.07), #intercept, exog, endog
beta = c(.05,.06,.02), #intercept, exog, endog
endog_reg = list(),
exog_mean = 1,
exog_sd = 1,
z_mean = 3,
z_sd = 1,
endog_sd = 5,
y_sd = 2,
rho = .2,
tau0 = .3,
tau1 = .1,
n = 10000,
options = list(silent = F, cragg_errors = 4),
type = "lognormal"){
#checks
n_z = length(z_mean)
n_x2 = length(endog_sd)
if(n_x2 > n_z){
stop("Error: more endogenous variables than instruments")
}
if(length(gamma)!= length(beta) ){
stop("Error: coefficient vector lengths differ")
}
if(length(tau0)!=n_x2|length(tau1)!=n_x2){
stop("Error: length of tau vectors must equal number of endogenous variables")
}
if(length(endog_reg)!=0){
# need to allow you to exclude some exogenous variables
stop("This functionality not developed yet, set endog_reg = list()")
}
if(formula!=F){
# need to allow you to exclude some exogenous variables
stop("This functionality not developed yet, set formula = F")
}
# make instruments and exogenous variables as random normals
z = make.df(mean = z_mean, sd = z_sd, n = n, pref = "inst")
x1 = make.df(mean = exog_mean, sd = exog_sd, n = n, pref = "exog")
df = data.frame(x1,z)
# make covariance matrix
require(MASS)
cov = make.cov(rho=rho
,tau0=tau0
,tau1=tau1
,y_sd=y_sd
,endog_sd=endog_sd)
if(type == "cragg"){
if(options$cragg_errors == 1){
temp = cragg_errs1(cov=cov,pi=pi,x1=x1,gamma=gamma,beta=beta,n=n,z=z)
}
else if(options$cragg_errors == 2){
temp = cragg_errs2(cov=cov,pi=pi,x1=x1,gamma=gamma,beta=beta,n=n,z=z)
}
else if(options$cragg_errors == 3){
temp = cragg_errs3(cov=cov,pi=pi,x1=x1,gamma=gamma,beta=beta,n=n,z=z)
}
else if(options$cragg_errors == 4){
temp = cragg_errs_MG(cov=cov,pi=pi,x1=x1,gamma=gamma,beta=beta,n=n,z=z)
}
endog = temp['endog'][[1]]
errors = temp['errors'][[1]]
y0 = temp['y0'][[1]]
yStar = temp['yStar'][[1]]
y = y0*yStar
}
else{
errors = mvrnorm(n,rep(0,dim(cov)[1]),cov)
frame = as.matrix(cbind(1,df))
endog = frame%*%pi + errors[,3]
frame = as.matrix(cbind(1,x1,endog))
y0 = as.numeric(frame%*%gamma + errors[,1] > 0)
yStar = frame%*%beta + errors[,2]
y = exp(yStar)*y0
}
out = data.frame(y,y0,endog,df)
if(options$silent == F){
par(mfrow = c(1,2))
hist(log(out$y), main = "log(y)",xlab = "")
hist(out$y, main = "y", xlab = "")
par(mfrow = c(1,1))
print(paste(mean(out$y0)*100," percent uncensored"))
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.