rm(list = ls())
library(speff2trial) # dataset for part 4
library(dplyr)
library(SuperLearner)
library(ggplot2)
library(mgcv)
################### Section 3 : Simulation Study #####################
################### no sample splitting ##############################
nseq <- c(500, 1000, 2000, 3000, 4000)
expit <- function(x){ exp(x)/(1+exp(x))}
tau <- function(x){
x1 <- x[,1]
x2 <- x[,2]
res <- x1*x1*(x1 + 7/5) + 25*x2*x2/9
return (res)
}
t <- 100
res <- data.frame(matrix(nrow = length(nseq), ncol = 8))
rownames(res) <- nseq
colnames(res) <- c('psi_1_hat', 'psi_1_hat_bias', 'variance_1', 'coverage_1', 'psi_2_hat', 'psi_2_hat_bias', 'variance_2', 'coverage_2')
for (j in 1:length(nseq)){
n <- nseq[j]
tmp_res <- data.frame(matrix(nrow = t, ncol = 8))
colnames(tmp_res) <- c('psi_1_hat', 'psi_1_hat_bias', 'variance_1', 'coverage_1', 'psi_2_hat', 'psi_2_hat_bias', 'variance_2', 'coverage_2')
for (i in 1:t){
x <- as.data.frame(replicate(2, runif(n, -1, 1)))
a <- rbinom(n, 1, expit(-0.4*x[,1] + 0.1*x[,1]*x[,2]))
y <- rnorm(n, x[,1]*x[,2] + 2*x[,2]^2 - x[,1] + a*tau(x), 1)
# true estimand values
tau_p <- mean(tau(x))
theta_p <- var(tau(x))
tau_1 <- 25/9*x[,2]^2 + 7/15
tau_2 <- x[,1]^2*(x[,1] + 7/5) + 25/27
psi_1 <- 1 - var(tau_1)/var(tau(x))
psi_2 <- 1 - var(tau_2)/var(tau(x))
# estimated values
data <- cbind(y, a, x)
colnames(data) <- c('y', 'a', 'x1', 'x2')
data_0 <- data[a == 0, , drop = FALSE]
data_1 <- data[a == 1, , drop = FALSE]
mu0hat <- predict(gam(y ~ s(x1) + s(x2) + te(x1, x2), data = data_0), newdata = data)
mu1hat <- predict(gam(y ~ s(x1) + s(x2) + te(x1, x2), data = data_1), newdata = data)
# pihat <- predict(gam(a ~ s(x1) + s(x2), family = 'binomial', data = data), newdata = data)
pihat <- expit(predict(gam(a ~ x1 + x2 + x1*x2, family = 'binomial', data = data), newdata = data))
pseudo_hat <- ((a-pihat)/(pihat*(1-pihat)))*(y-a*mu1hat-(1-a)*mu0hat) + mu1hat-mu0hat
# T-learner, algorithm A
# tau_hat <- mu1hat - mu0hat
# DR-learner, algorithm B
tau_hat <- predict(gam(pseudo_hat ~ s(x1) + s(x2), data = cbind(data, pseudo_hat)), newdata = cbind(data, pseudo_hat))
# try mgcv
tau_1_hat <- predict(gam(tau_hat ~ s(x2), data = cbind(data, tau_hat)), newdata = cbind(data, tau_hat))
tau_2_hat <- predict(gam(tau_hat ~ s(x1), data = cbind(data, tau_hat)), newdata = cbind(data, tau_hat))
tau_1_hat <- tau_1
tau_2_hat <- tau_2
tau_hat <- tau(x)
pihat <- expit(-0.4*x[,1] + 0.1*x[,1]*x[,2])
mu1hat <- x[,1]*x[,2] + 2*x[,2]^2 - x[,1] + tau(x)
mu0hat <- x[,1]*x[,2] + 2*x[,2]^2 - x[,1]
pseudo_hat <- ((a-pihat)/(pihat*(1-pihat)))*(y-a*mu1hat-(1-a)*mu0hat) + mu1hat-mu0hat
tau_p_hat <- mean(tau_hat)
theta_1_hat <- mean((pseudo_hat - tau_1_hat)^2 - (pseudo_hat - tau_hat)^2)
theta_2_hat <- mean((pseudo_hat - tau_2_hat)^2 - (pseudo_hat - tau_hat)^2)
theta_p_hat <- mean((pseudo_hat - tau_p_hat)^2 - (pseudo_hat - tau_hat)^2)
psi_1_hat <- theta_1_hat / theta_p_hat
psi_2_hat <- theta_2_hat / theta_p_hat
# bias
tmp_res[i, 1] <- psi_1_hat
tmp_res[i, 2] <- (psi_1_hat - psi_1) * sqrt(n)
tmp_res[i, 5] <- psi_2_hat
tmp_res[i, 6] <- (psi_2_hat - psi_2) * sqrt(n)
# variance
phi_1_hat <- ((pseudo_hat - tau_1_hat)^2 - psi_1_hat*(pseudo_hat - tau_p_hat)^2 + (psi_1_hat - 1)*(pseudo_hat - tau_hat)^2) / theta_p_hat
variance_1 <- 1/n^2 * sum((phi_1_hat)^2)
phi_2_hat <- ((pseudo_hat - tau_2_hat)^2 - psi_2_hat*(pseudo_hat - tau_p_hat)^2 + (psi_2_hat - 1)*(pseudo_hat - tau_hat)^2) / theta_p_hat
variance_2 <- 1/n^2 * sum((phi_2_hat)^2)
tmp_res[i, 3] <- variance_1 * n
tmp_res[i, 7] <- variance_2 * n
# coverage
if (psi_1 <= psi_1_hat + 1.96 * sqrt(variance_1) & psi_1 >= psi_1_hat - 1.96 * sqrt(variance_1)){
coverage_1 <- 1
} else {
coverage_1 <- 0
}
if (psi_2 <= psi_2_hat + 1.96 * sqrt(variance_2) & psi_2 >= psi_2_hat - 1.96 * sqrt(variance_2)){
coverage_2 <- 1
} else {
coverage_2 <- 0
}
tmp_res[i, 4] <- coverage_1
tmp_res[i, 8] <- coverage_2
print(paste('Iteration', i, 'finished!'))
}
print(tmp_res)
res[j, ] <- colMeans(tmp_res)
}
res
################### Section 4: Real World Example #####################
dta_aids <- get(data(ACTG175))
dta2 <- filter(dta_aids, arms %in% c(1,3))
# add treatment
dta2 <- mutate(dta2, treatment = ifelse(arms == 3, 0, 1))
# dataset used in section 4:
dta2 <- dta2[, c('cd420', 'treatment',
'age', 'wtkg', 'karnof', 'cd40', 'cd80', 'gender', 'homo', 'race', 'symptom', 'drugs', 'hemo', 'str2')]
y <- dta2[, 'cd420']
a <- dta2[, 'treatment']
x <- dta2[, 3:14]
sl.lib <- c("SL.glm", "SL.gam", "SL.glmnet", "SL.xgboost", "SL.ranger")
# sl.lib <- c("SL.glm", "SL.gam", "SL.glmnet")
# 1. without sample splitting.
cate_estimate <- function(x, y, a, x.estimate, model){
# function
# x, y, a: standard input
# model: A for t-learner, B for df-learner
mu0hat <- SuperLearner(Y = y[a == 0], X = data.frame(x[a == 0, , drop = FALSE]), SL.library = sl.lib,
newX = data.frame(x))$SL.predict
mu1hat <- SuperLearner(Y = y[a == 1], X = data.frame(x[a == 1, , drop = FALSE]), SL.library = sl.lib,
newX = data.frame(x))$SL.predict
pihat <- mean(a)
# pihat <- SuperLearner(Y = a, X = data.frame(x), SL.library = sl.lib)$SL.predict
# pihat <- mean(a)
pseudo <- ((a-pihat)/(pihat*(1-pihat)))*(y-a*mu1hat-(1-a)*mu0hat) + mu1hat-mu0hat
if (model == 'A'){
tau <- mu1hat - mu0hat
}
else if (model == 'B'){
tau <- SuperLearner(Y = pseudo, X = as.data.frame(x), SL.library = sl.lib,
newX = as.data.frame(x))$SL.predict
}
else {
stop('invalid type!')
}
out <- list(tau = tau, pseudo = pseudo)
return (out)
}
## T learner
VIM_2a <- data.frame(matrix(nrow = length(x), ncol = 3))
rownames(VIM_2a) <- colnames(x)
colnames(VIM_2a) <- c('T', 'Lower_Bound', 'Upper_Bound')
cate <- cate_estimate(x, y, a, x, 'A')
tau_hat <- cate$tau
pseudo_hat <- cate$pseudo
tau_p_hat <- mean(tau_hat)
for (i in 1:length(x)){
tau_s_hat <- SuperLearner(Y = tau_hat, X = x[,-i],
SL.library = sl.lib,
newX = x[,-i])$SL.predict
theta_s_hat <- mean((pseudo_hat - tau_s_hat)^2 - (pseudo_hat - tau_hat)^2)
theta_p_hat <- mean((pseudo_hat - tau_p_hat)^2 - (pseudo_hat - tau_hat)^2)
# theta_p_hat <- as.numeric(var(tau_hat))
psi_hat <- theta_s_hat / theta_p_hat
phi_hat <- ((pseudo_hat - tau_s_hat)^2 - psi_hat * (pseudo_hat - tau_p_hat)^2 + (psi_hat - 1)*(pseudo_hat - tau_hat)^2) / theta_p_hat
var <- 1/length(phi_hat)^2 * sum((phi_hat)^2)
VIM_2a[i, 1] <- psi_hat
VIM_2a[i, 2] <- VIM_2a[i, 1] - 1.96 * sqrt(var)
VIM_2a[i, 3] <- VIM_2a[i, 1] + 1.96 * sqrt(var)
print(c(VIM_2a[i,1], VIM_2a[i,2], VIM_2a[i,3], theta_s_hat, theta_p_hat))
}
VIM_2a <- VIM_2a[order(VIM_2a$T, decreasing = FALSE),]
### draw picture
VIM_2a <- cbind(as.matrix(row.names(VIM_2a), nrow(VIM_2a), 1), VIM_2a)
colnames(VIM_2a) <- c('variable', 'T', 'Lower_Bound', 'Upper_Bound')
VIM_2a <- cbind(as.matrix(row.names(VIM_2a), nrow(VIM_2a), 1), VIM_2a)
VIM_2a <- VIM_2a[order(VIM_2a$T, decreasing = FALSE),]
order_2a <- VIM_2a$variable
fig_2a <- ggplot(VIM_2a, aes(x = factor(variable, level = order_2a), y = T))+
geom_point(size = 3) +
geom_errorbar(aes(ymin = Lower_Bound, ymax = Upper_Bound), width=.1, position=position_dodge(0.2))+
theme(axis.title.y=element_blank(),
axis.text.y = element_text(color = "grey20", size = 10),
axis.text.x = element_text(color = "grey20", size = 10)) +
coord_flip() +
ylim(0, 1)
fig_2a
## DR learner
VIM_2b <- data.frame(matrix(nrow = length(x), ncol = 3))
rownames(VIM_2b) <- colnames(x)
colnames(VIM_2b) <- c('DR', 'Lower_Bound', 'Upper_Bound')
cate <- cate_estimate(x, y, a, x, 'B')
tau_hat <- cate$tau
pseudo_hat <- cate$pseudo
tau_p_hat <- mean(tau_hat)
for (i in 1:length(x)){
tau_s_hat <- SuperLearner(Y = tau_hat, X = x[,-i],
SL.library = sl.lib,
newX = x[,-i])$SL.predict
theta_s_hat <- mean((pseudo_hat - tau_s_hat)^2 - (pseudo_hat - tau_hat)^2)
theta_p_hat <- mean((pseudo_hat - tau_p_hat)^2 - (pseudo_hat - tau_hat)^2)
# theta_p_hat <- as.numeric(var(tau_hat))
psi_hat <- theta_s_hat / theta_p_hat
phi_hat <- ((pseudo_hat - tau_s_hat)^2 - psi_hat * (pseudo_hat - tau_p_hat)^2 + (psi_hat - 1)*(pseudo_hat - tau_hat)^2) / theta_p_hat
var <- 1/length(phi_hat)^2 * sum((phi_hat)^2)
VIM_2b[i, 1] <- psi_hat
VIM_2b[i, 2] <- VIM_2b[i, 1] - 1.96 * sqrt(var)
VIM_2b[i, 3] <- VIM_2b[i, 1] + 1.96 * sqrt(var)
print(c(VIM_2b[i,1], VIM_2b[i,2], VIM_2b[i,3], theta_s_hat, theta_p_hat))
}
VIM_2b <- VIM_2b[order(VIM_2b$DR, decreasing = FALSE),]
# VIM_2b <- VIM_2b[order(VIM_2b$DR),]
### draw picture
VIM_2b <- cbind(as.matrix(row.names(VIM_2b), nrow(VIM_2b), 1), VIM_2b)
colnames(VIM_2b) <- c('variable', 'DR', 'Lower_Bound', 'Upper_Bound')
VIM_2b <- VIM_2b[order(VIM_2b$DR, decreasing = FALSE),]
order_2b <- VIM_2b$variable
fig_2b <- ggplot(VIM_2b, aes(x = factor(variable, level = order_2b), y = DR))+
geom_point(size = 3) +
geom_errorbar(aes(ymin = Lower_Bound, ymax = Upper_Bound), width=.1, position=position_dodge(0.2))+
theme(axis.title.y=element_blank(),
axis.text.y = element_text(color = "grey20", size = 10),
axis.text.x = element_text(color = "grey20", size = 10)) +
coord_flip() +
ylim(0, 1)
fig_2b
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.