#' Generate data for
#'
#' @param n desired sample size.
#' @param dgp character string indicating desired data-generating process. Currently the following DGPs are implemented, each based on a different publication: \code{ks}, Kang and Shafer; \code{ld}, Lunceford and Davidian; \code{fi} Fan and Imai; \code{ls}, Leacy and Stuart; \code{ik} Iacus and King; \code{iw} Waernbaum; \code{pa} Austin.
#' @param correct_outcome logical indicating whether the outcome model should eb correctly specified.
#' @param correct_ps logical indicating whether the propensity score model should be correctly specified.
#'
#' @return a list of objects including
#' @export
#' @importFrom stringr str_c
#' @importFrom dplyr sample_n
#' @importFrom mvnfast rmvn
#'
#' @examples
#' gen_mod <- generate_data(n = 100,
#' dgp = 'ks',
#' correct_outcome = FALSE,
#' correct_ps = TRUE)
generate_data <- function(n, dgp, correct_outcome = TRUE, correct_ps = TRUE) {
genfn <- get(paste0('generate_model_', dgp))
mod <- genfn(correct_outcome, correct_ps)
data <- mod$data_fn(n)
list(data = data,
dgp = dgp,
outcome_fm = mod$outcome_fm,
outcome_fam = mod$outcome_fam,
ps_fm = mod$ps_fm,
ps_fam = mod$ps_fam,
cov_ids = mod$cov_ids,
true_ate = mod$true_ate)
}
#--------------------------------
## all data-generating mechanisms
#--------------------------------
inv.logit <- plogis
generate_data_pa <- function(n, pi = 0.2) {
d <- rbinom(n, 1, pi)
x1 <- rnorm(n,d*0.2)
x2 <- rnorm(n,d*0.3)
x3 <- rnorm(n,d*0.4)
x4 <- rnorm(n,d*0.5)
x5 <- rnorm(n,d*0.6)
v1 <- rbinom(n, 1, 0.1*(1-d) + 0.168*d)
v2 <- rbinom(n, 1, 0.2*(1-d) + 0.331*d)
v3 <- rbinom(n, 1, 0.3*(1-d) + 0.492*d)
v4 <- rbinom(n, 1, 0.4*(1-d) + 0.642*d)
v5 <- rbinom(n, 1, 0.5*(1-d) + 0.776*d)
y <- -2.8 + 1.5*x1 + 2*x2 + 3*x3 + 4*x4 + 5*x5 + 5*v1 + 4*v2 + 3*v3 + 2*v4 + 1.5*v5 + d + rnorm(n, sd = 2)
data.frame(id = 1:n, x1, x2, x3, x4, x5, v1, v2, v3, v4, v5, y = y, d = d)
}
generate_model_pa <- function(correct_outcome, correct_ps) {
cov_ids <- c(stringr::str_c('x', 1:5), stringr::str_c('v', 1:5))
if (correct_outcome) {
outcome_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
outcome_fm <- stringr::str_c(cov_ids[-c(3)], collapse = ' + ')
}
if (correct_ps) {
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
cov_ids <- c(str_c('x', c(1,2,4,5)), str_c('v', c(1,2,4,5)))
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
}
data_fn <- function(n) generate_data_pa(n, pi = 0.2)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids, true_ate = 1)
}
generate_data_r <- function(n, pi) {
d <- rbinom(n, 1, pi)
x1 <- rnorm(n,d*0.2)
x2 <- rnorm(n,d*0.3)
x3 <- rnorm(n,d*0.4)
x4 <- rnorm(n,d*0.5)
x5 <- rnorm(n,d*0.6)
v1 <- rbinom(n, 1, 0.1*(1-d) + 0.168*d)
v2 <- rbinom(n, 1, 0.2*(1-d) + 0.331*d)
v3 <- rbinom(n, 1, 0.3*(1-d) + 0.492*d)
v4 <- rbinom(n, 1, 0.4*(1-d) + 0.642*d)
v5 <- rbinom(n, 1, 0.5*(1-d) + 0.776*d)
y <- -2.8 + 1.5*(x1 > 1) + 2*(x2 < -1) + 3*(x3 > 1)*(x1 < 1) + 4*(x4 < 1) + 5*(x5 > 0) +
5*v1 + 4*v2 + 3*v3 + 2*v4 + 1.5*v5 + d + rnorm(n, sd = 2)
data.frame(id = 1:n, x1, x2, x3, x4, x5, v1, v2, v3, v4, v5, y = y, d = d)
}
generate_model_r <- function(correct_outcome, correct_ps) {
cov_ids <- c(stringr::str_c('x', 1:5), stringr::str_c('v', 1:5))
if (correct_outcome) {
outcome_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
outcome_fm <- stringr::str_c(cov_ids[-c(3)], collapse = ' + ')
}
if (correct_ps) {
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
cov_ids <- c(str_c('x', c(1,2,4,5)), str_c('v', c(1,2,4,5)))
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
}
data_fn <- function(n) generate_data_r(n, pi = 0.2)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam)
}
generate_data_fi <- function(n, correct_ps, correct_outcome) {
z <- rnorm(4*n, mean = 3, sd = 2) %>% matrix(n, 4)
x <- z
x[,1] <- exp(z[,1]/3)
x[,2] = z[,2]/(1+exp(z[,1])) + 10
x[,3] = z[,1]*z[,3]/25 + 0.6
x[,4] = z[,2]+z[,4]+20
if (correct_ps) {
pi_0 <- cbind(1, z) %*% c(0, -1, 0.5, -0.25, -0.1)
} else {
pi_0 <- cbind(1, x) %*% c(0, -1, 0.5, -0.25, -0.1)
}
d <- rbinom(n, 1, inv.logit(pi_0))
if (correct_outcome) {
mu_0 <- 200 + 27.4*z[,1]*d + 13.5*z[,2] + 13.5*z[,3] + 13.5*z[,4]
} else {
mu_0 <- 200 + 27.4*z[,1]^2*d + 13.5*z[,2]^2 + 13.5*z[,3]^2 + 13.5*z[,4]^2
}
y <- rnorm(n, mu_0, 1)
data_0 <- data.frame(id = 1:n, x = x, z = z, d = d, y = y)
data_0
}
generate_model_fi <- function(correct_outcome, correct_ps) {
cov_ids <- stringr::str_c('z.', 1:4)
outcome_fm <- ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
data_fn <- function(n) generate_data_fi(n, correct_ps = correct_ps,
correct_outcome = correct_outcome)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids,
true_ate = case_when(
correct_outcome ~ 0,
TRUE ~ 27.4
))
}
#
generate_data_ik <- function(n) {
library(Matching)
data(lalonde)
ik_data <- dplyr::sample_n(lalonde, n, replace = TRUE)
ik_data <- ik_data %>% mutate(
muhat = 1 + 1.428e-4 * age^2 - 2.918e-3 * educ^2 -
0.2275 * black - 0.8276 * hisp + 0.2071 * married -
0.8232 * nodegr - 1.236e-9 * re74^2 +
5.865e-10 * re75^2 - 0.04328 * u74 - 0.3804 * u75,
pi_0 = 1 + 0.5*muhat + 0.01 * age^2 - 0.3 * educ^2 -
0.01 * log((re74 + 0.01)^2) + 0.01 * log((re75 + 0.01)^2),
d = rbinom(n, 1, inv.logit(pi_0)),
l75 = log((re75 + 0.01)^2),
ll74 = log((re74 + 0.01 + 0.7 * l75)^2),
mu_0 = 1000*d + 0.1 * exp(0.7 * ll74),
y = mu_0 + rnorm(n, 0, 10),
age2 = age^2,
educ2 = educ^2,
l74 = log((re74 + 0.01)^2),
re74_2 = re74^2,
re75_2 = re75^2,
id = 1:n
)
}
generate_model_ik <- function(correct_outcome, correct_ps) {
if (correct_outcome) {
outcome_fm <- c('bs(re74)*bs(re75)')
} else {
outcome_fm <- 'age + educ + re74 + re75 + black +
hisp + married + nodegr + u74 + u75'
}
if (correct_ps) {
cov_ids <- c('age2', 'educ2', 'black', 'hisp', 'married',
'nodegr', #'u74', 'u75',
'l74', 'l75', 're74_2', 're75_2')
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
cov_ids <- c('age', 'educ', 'black', 'hisp', 'married',
'nodegr', 'u74', 'u75',
're74', 're75')
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
}
data_fn <- function(n) generate_data_ik(n)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids, true_ate = 1000)
}
generate_data_ks <- function(n, a = c(0,-1,0.5,-0.25,-0.1),
b = c(210,40,27.4,13.7,13.7,13.7),
sigma = 1) {
z <- rnorm(4*n) %>% matrix(n, 4)
x <- z
x[,1] <- exp(z[,1]/2)
x[,2] = z[,2]/(1+exp(z[,1])) + 10
x[,3] = (z[,1]*z[,3]/25 + 0.6)^3
x[,4] = (z[,2]+z[,4]+20)^2
pi_0 <- cbind(1, z) %*% a
d <- rbinom(n, 1, inv.logit(pi_0))
mu_0 <- cbind(1, d, z) %*% b
y <- rnorm(n, mu_0, sigma)
data_0 <- data.frame(id = 1:n, x = x, z = z, d = d, y = y)
data_0
}
generate_model_ks <- function(correct_outcome, correct_ps) {
if (correct_outcome) {
cov_ids <- stringr::str_c('z.', 1:4)
outcome_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
cov_ids <- stringr::str_c('x.', 1:4)
outcome_fm <- stringr::str_c(cov_ids[-c(3)], collapse = ' + ')
}
if (correct_ps) {
cov_ids <- stringr::str_c('z.', 1:4)
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
cov_ids <- stringr::str_c('x.', 1:4)
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
}
data_fn <- function(n) generate_data_ks(n)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids, true_ate = 40)
}
generate_data_iw <- function(n) {
x1 <- runif(n, 1, 5)
x3 <- x1^2
x2 <- runif(n, 1, 5)
x4 <- x2^2
pi <- (1 + exp(-0.36 + 1.25*x1 + 1.25*x2 - 0.36*x1^2 - 0.35*x2^2))^-1
t <- rbinom(n, prob= pi, size= 1)
mu <- 2*t + exp(1 + 0.2*x1 + 0.2*x2)
y <- rpois(n, lambda = mu)
data.frame(id = 1:n, x.1 = x1, x.2 = x2, x.3 = x3, x.4 = x4, d = t, y = y)
}
generate_model_iw <- function(correct_outcome, correct_ps) {
if (correct_outcome) {
outcome_fam <- poisson
} else {
outcome_fam <- poisson(link = 'identity')
}
if (correct_ps) {
ps_fam <- binomial
} else {
ps_fam <- binomial(link = 'probit')
}
data_fn <- function(n) generate_data_iw(n)
cov_ids <- stringr::str_c('x.', 1:2)
outcome_fm <- ps_fm <- stringr::str_c(stringr::str_c('x.', 1:4), collapse = ' + ')
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids, true_ate = 2)
}
generate_data_ld <- function(n) {
beta <- c(0,0.6,-0.6,0.6)
zeta <- c(-1,1,1)
nu <- c(0, -1, 1, -1, 2)
tau1 <- c(1,1,-1,-1)
tau0 <- c(-1,-1,1,1)
Sigma <- matrix(c(1,0.5,-0.5,-0.5,
0.5,1,-0.5,-0.5,
-0.5,-0.5,1,0.5,
-0.5,-0.5,0.5,1), nrow=4, byrow=TRUE)
X3 <- rbinom(n,1,0.2)
V3 <- rbinom(n,1,0.75*X3+0.25*(1-X3))
other_covariates <- matrix(NA, n, 4)
other_covariates[X3 == 1,] <- mvnfast::rmvn(sum(X3 == 1), mu = tau1, sigma = Sigma)
other_covariates[X3 == 0,] <- mvnfast::rmvn(sum(X3 == 0), mu = tau0, sigma = Sigma)
X1 <- other_covariates[,1]
V1 <- other_covariates[,2]
X2 <- other_covariates[,3]
V2 <- other_covariates[,4]
e = inv.logit(beta[1]+beta[2]*X1+beta[3]*X2+beta[4]*X3)
Z = rbinom(n,1,e)
Y = nu[1] + nu[2]*X1 + nu[3]*X2 + nu[4]*X3 + nu[5]*Z +
zeta[1]*V1 + zeta[2]*V2 + zeta[3]*V3 + rnorm(n)
# plot(e,Y,col=ifelse(Z==1,"black","red"))
# list(Y=Y,Z=Z,X1=X1,X2=X2,X3=X3,V1=V1,V2=V2,V3=V3)
data.frame(id = 1:n, y = Y, d = Z, x.1 = X1, x.2 = X2, x.3 = X3,
v.1 = V1, v.2 = V2, v.3 = V3)
}
generate_model_ld <- function(correct_outcome, correct_ps) {
cov_ids <- c(stringr::str_c('x.', 1:3), stringr::str_c('v.', 1:3))
if (correct_outcome) {
outcome_fm <- ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
outcome_fm <- stringr::str_c(cov_ids[-c(1,4)], collapse = ' + ')
}
if (correct_ps) {
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
} else {
cov_ids <- c(stringr::str_c('x.', 2:3), stringr::str_c('v.', 2:3))
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
}
data_fn <- function(n) generate_data_ld(n)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids, true_ate = 2)
}
generate_data_ls <- function(n, ps_correct, outcome_correct) {
sigma <- diag(10)
sigma[1,5] <- sigma[5,1] <- sigma[3,8] <- sigma[8,3] <- 0.2
sigma[2,6] <- sigma[6,2] <- sigma[3,9] <- sigma[9,3] <- 0.9
w <- mvnfast::rmvn(n, mu = rep(0, 10), sigma = sigma)
w[,c(1,3,5,6,8,9)] <- ifelse(w[,c(1,3,5,6,8,9)] > 0, 1, 0)
w_ps <- cbind(1, w, w[,2]^2, w[,4]^2, w[,7]^2, w[,1]*w[,3],
w[,2]*w[,4], w[,3]*w[,5], w[,6]*w[,4],
w[,5]*w[,7], w[,1]*w[,6], w[,2]*w[,3],
w[,3]*w[,4], w[,5]*w[,4], w[,5]*w[,6])
alpha <- c(-1.897, 0.8, -0.25, 0.6, -0.4, -0.8, -0.5, 0.7, 0, 0, 0,
-0.25, -0.4, 0.7, 0.5*0.8,
0.7*(-0.25), 0.5*0.6, 0.7*(-0.4),
0.5*(-0.8), 0.5*0.8, 0.7*(-0.25),
0.5*0.6, 0.5*(-0.4), 0.5*(-0.8))
w_o <- cbind(1, w, w[,2]^2, w[,4]^2, w[,10]^2,
w[,1]*w[,3],
w[,2]*w[,4], w[,3]*w[,8], w[,9]*w[,4],
w[,8]*w[,10], w[,1]*w[,9], w[,2]*w[,3],
w[,3]*w[,4], w[,8]*w[,4], w[,8]*w[,9])
beta <- c(-1.386, 0.3, -0.36, -0.73, -0.2, 0, 0, 0, 0.71, -0.19, 0.26,
-0.36, -0.2, 0.26,
0.5*0.3, 0.7*(-0.36), 0.5*(-0.73), 0.7*(-0.2),
0.5*0.71, 0.5*0.3, 0.7*(-0.36),
0.5*(-0.73), 0.5*(-0.2), 0.5*(0.71))
pi_0 <- w_ps %*% alpha
d <- rbinom(n, 1, plogis(pi_0))
y <- w_o %*% beta - 0.4*d + rnorm(n, sd = 0.1)
data.frame(id = 1:n, y = y, d = d, ww = w, w_ps = w_ps, w_o = w_o)
}
generate_model_ls <- function(correct_outcome, correct_ps) {
if (correct_outcome) {
outcome_fm <- stringr::str_c(stringr::str_c('w_o.', 2:24),
collapse = ' + ')
} else {
outcome_fm <- stringr::str_c(stringr::str_c('ww.', 1:10),
collapse = ' + ')
}
if (correct_ps) {
cov_ids <- stringr::str_c('w_ps.', 2:24)
} else {
cov_ids <- stringr::str_c('ww.', 1:10)
}
ps_fm <- stringr::str_c(cov_ids, collapse = ' + ')
data_fn <- function(n) generate_data_ls(n)
outcome_fam <- gaussian
ps_fam <- binomial
list(data_fn = data_fn, outcome_fm = outcome_fm,
outcome_fam = outcome_fam,
ps_fm = ps_fm, ps_fam = ps_fam,
cov_ids = cov_ids, true_ate = -0.4)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.