knitr::opts_chunk$set(echo = TRUE)

Generate data

# devtools::install_github("Larsvanderlaan/npOddsRatio)
library(npOddsRatio)
n <- 1000
# Baseline covariates
X1 <- runif(n, min = -1, max = 1)
X2 <- runif(n, min = -1, max = 1)
# Binary treatment
A <- rbinom(n, size = 1, prob = plogis(X1 + X2))
# Binary outcome
Y <- rbinom(n, size = 1, prob = plogis(-1 + A + A * (X1 + X2) + X1 + X2 + X1 * X2))
# Binary missingness indicator for the outcome (Delta = 1 means Y is observed)
Delta <- rbinom(n, size = 1, prob = 1 - 0.1 * plogis(X1 + X2 + A - 0.5))
# Construct data.frame; Y is zeroed out wherever it is missing
data <- data.frame(X1, X2, A, Delta, Y = Delta * Y)
print(head(data))
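
As a quick sanity check, the data-generating process above implies a conditional log odds ratio that is exactly linear in (X1, X2). The helper below is illustrative only and not part of the package.

# logit P(Y=1|A=1,X) - logit P(Y=1|A=0,X) = 1 + X1 + X2,
# so the true coefficients for the working model ~ 1 + X1 + X2 are c(1, 1, 1).
logOR_dgp <- function(x1, x2) {
  qlogis(plogis(-1 + 1 + (x1 + x2) + x1 + x2 + x1 * x2)) -
    qlogis(plogis(-1 + x1 + x2 + x1 * x2))
}
logOR_dgp(0.5, -0.3)  # equals 1 + 0.5 - 0.3 = 1.2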

Fit conditional odds ratio using sl3

# devtools::install_github("tlverse/sl3)
library(sl3)
# Useful learners
lrnr_gam <- Lrnr_gam$new()
lrnr_glmnet <- Lrnr_glmnet$new()
lrnr_hal <- Lrnr_hal9001$new(max_degree = 2, smoothness_orders = 1,
                             num_knots = c(5, 3), fit_control = list(parallel = TRUE))
lrnr_xgboost <- Lrnr_xgboost$new(max_depth = 4)

# Create a stack of candidate learners
stack <- make_learner(Stack, lrnr_gam, lrnr_glmnet, lrnr_xgboost)
# Create a super learner (ensemble) from the stack
lrnr_SL <- Lrnr_sl$new(stack)
# Alternatively, use cross-validated selection of a single best learner:
# cross-validate the stack, then pick the learner minimizing the binomial log-likelihood loss
lrnr_cv <- Lrnr_cv$new(stack)
lrnr_cv <- make_learner(Pipeline, lrnr_cv, Lrnr_cv_selector$new(loss_loglik_binomial))
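
Either lrnr_SL or lrnr_cv can be passed wherever an sl3 learner is expected. As a standalone sketch of the sl3 API (the task construction below is illustrative and not taken from npOddsRatio), the cross-validated selector can be trained directly on the propensity regression:

# Train the CV-selected learner on P(A=1|X1,X2) among complete cases
task_A <- sl3_Task$new(
  data = data[data$Delta == 1, ],
  covariates = c("X1", "X2"),
  outcome = "A",
  outcome_type = "binomial"
)
fit_A <- lrnr_cv$train(task_A)
head(fit_A$predict(task_A))  # estimated propensity scores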
######

# True coefficients of the log odds ratio model (see the sanity check above)
true_coefs <- c(1, 1, 1)
# Register a parallel backend (used by HAL when fit_control = list(parallel = TRUE))
library(doMC)
registerDoMC()

# Specify formula for the log odds ratio
logOR_formula <- ~ 1 + X1 + X2
# Fit the semiparametric odds ratio model
fit_sp <- spOR(logOR_formula,
               W = data[, c("X1", "X2")],
               A = data$A,
               Y = data$Y,
               Delta = data$Delta,
               # HAL estimator settings for the nuisance P(Y=1|A=0,W):
               max_degree_Y0W = 2,          # generate spline basis functions with up to two-way interactions
               num_knots_Y0W = c(10, 5, 0), # knots per covariate for each interaction degree (1, 2, 3)
               sl3_learner_A = lrnr_cv)     # learner for P(A=1|X, Delta=1)


# Or fit fully nonparametrically: npOR gives correct inference even when the
# parametric model is misspecified and serves only as an approximation.
fit_np <- npOR(logOR_formula,
               W = data[, c("X1", "X2")],
               A = data$A,
               Y = data$Y,
               Delta = data$Delta,
               sl3_learner_OR = lrnr_hal,   # learner for the true odds ratio
               sl3_learner_Y0W = lrnr_hal,  # learner for the nuisance P(Y=1|A=0,W)
               sl3_learner_A = lrnr_cv,     # learner for P(A=1|W)
               sl3_learner_Delta = lrnr_cv) # learner for P(Delta=1|W,A)

Get estimates and inference

#### Coefficients
# print(coef(fit_sp)); print(coef(fit_np))
#### Summarize coefficients
print("semiparametric")
print(summary(fit_sp))
print("nonparametric")
print(summary(fit_np))

# Estimates are similar since the semiparametric model is correctly specified;
# the nonparametric method has wider confidence intervals.
#### Get predictions and inference for the log odds ratio at observations
predict(fit_sp, data[1:10, c("X1", "X2")])
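
Predictions are on the log odds ratio scale. Assuming the returned object holds point estimates and confidence bounds in numeric columns (an assumption here; inspect the output of your package version), exponentiating maps everything to the odds-ratio scale:

# Sketch under the column-layout assumption above:
# preds <- predict(fit_sp, data[1:10, c("X1", "X2")])
# exp(as.matrix(preds))  # odds ratios with transformed CI bounds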

Fit conditional odds ratio using glm

# Specify formula for the log odds ratio
logOR_formula <- ~ 1 + X1 + X2
# glm formula for P(A=1|X, Delta=1)
glm_formula_A <- ~ 1 + X1 + X2 + X1*X2
# glm formula for P(Y=1|A=0,X, Delta=1)
glm_formula_Y0W <- ~ 1 + X1 + X2 + X1*X2
# Fit the odds ratio model with glm nuisance estimators
fit_glm <- spOR(logOR_formula,
                W = data[, c("X1", "X2")],
                A = data$A,
                Y = data$Y,
                Delta = data$Delta,
                glm_formula_A = glm_formula_A,
                glm_formula_Y0W = glm_formula_Y0W)

Get estimates and inference

#### Coefficients
# print(coef(fit_glm))
#### Summarize coefficients
print(summary(fit_glm))


#### Get predictions and inference for the log odds ratio at observations
predict(fit_glm, data[1:10, c("X1", "X2")])

Simulation for coverage

passes1 <- c()
passes2 <- c()
library(doMC)
registerDoMC()

for (i in 1:1000) {
  print(i)
  n <- 5000
  X1 <- runif(n, min = -1, max = 1)
  X2 <- runif(n, min = -1, max = 1)
  A <- rbinom(n, size = 1, prob = plogis(X1 + X2))
  Y <- rbinom(n, size = 1, prob = plogis(-1 + A + A * (X1 + X2) + X1 + X2 + X1 * X2))
  Delta <- rbinom(n, size = 1, prob = 1 - 0.3 * plogis(X1 + X2))
  data <- data.frame(X1, X2, A, Delta, Y = Y, DeltaY = Delta * Y)

  # True coefficients of the log odds ratio model
  true <- c(1, 1, 1)

  # Fit with outcome missingness handled via Delta
  fit <- spOR(~ 1 + X1 + X2, W = data[, c("X1", "X2")], A = data$A, Y = data$DeltaY,
              Delta = data$Delta, num_knots_Y0W = c(5, 3, 0), max_degree_Y0W = 2,
              sl3_learner_A = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(5, 2),
                                                      fit_control = list(parallel = TRUE)),
              parallel = TRUE, ncores = 4)

  # Columns 4 and 5 of fit$coefs are the CI lower and upper bounds
  ci <- fit$coefs
  passes1 <- cbind(passes1, ci[, 4] <= true & ci[, 5] >= true)
  print(rowMeans(passes1))

  # Fit treating all outcomes as observed (Delta = 1 for everyone)
  fit <- spOR(~ 1 + X1 + X2, W = data[, c("X1", "X2")], A = data$A, Y = data$Y,
              Delta = rep(1, n), num_knots_Y0W = c(5, 3, 0), max_degree_Y0W = 2,
              sl3_learner_A = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(5, 2),
                                                      fit_control = list(parallel = TRUE)),
              parallel = TRUE, ncores = 4)

  ci <- fit$coefs
  passes2 <- cbind(passes2, ci[, 4] <= true & ci[, 5] >= true)
  print(rowMeans(passes2))
}
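
# Final empirical coverage per coefficient (nominal level 0.95)
rowMeans(passes1)  # outcome missingness handled via Delta
rowMeans(passes2)  # full-data fit (Delta = 1 for all observations)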
ci <- c()
ci1 <- c()
ci2 <- c()
passes <- c()
passesb <- c()
passesc <- c()
passes1 <- c()
ests <- c()
ests1 <- c()
library(simcausal)  # provides DAG.empty(), node(), set.DAG(), sim()
for (i in 1:200) {
  print(i)

  D <- DAG.empty()
  D <- D +
    node("W1", distr = "runif", min = -1, max = 1) +
    node("W2", distr = "runif", min = -1, max = 1) +
    node("Tfull", distr = "rweibull", shape = 4, scale = exp(1)) +
    # pmin, not min: truncate each draw at 5 elementwise (simcausal node
    # formulas are evaluated vectorized, so min() would collapse to one scalar)
    node("T", distr = "rconst", const = pmin(Tfull / 2, 5)) +
    node("A", distr = "rbinom", size = 1, prob = plogis(0.7 * (W1 + W2 + T - 1))) +
    node("R", distr = "rbinom", size = 1, prob = 1) +
    node("mu", distr = "rconst",
         const = plogis(0.8 * (-1.25 + A + T + 0.8 * (T + W1 + W2 - W1*W2 + T*W2 + T*W1)))) +
    node("J", distr = "rbinom", size = 1, prob = mu) +
    node("G", distr = "rconst", const = 1 - 0.25 * plogis(W1 + W2)) +
    node("Delta", distr = "rbinom", size = 1, prob = 1) +  # no missingness in this run
    node("Jobs", distr = "rconst", const = J * Delta * R)

  setD <- set.DAG(D)
  # Sanity check of the true coefficients on a large sample:
  # data <- sim(setD, n = 250000)
  # fit_glm <- glm(Jobs ~ T + W1 + W2 + A + A*T, family = binomial(), data = data[data$Delta == 1, ])
  # coef(fit_glm)
  data <- sim(setD, n = 150)
  print(quantile(data$mu))
  table(data$Delta)





  # True log odds ratio: mu puts coefficient 0.8 on A
  true <- 0.8 * c(1)
  library(doMC)
  registerDoMC(10)

  fit <- spOR(~ 1, W = data[, c("W1", "W2", "T")], A = data$A, Y = data$Jobs, data = data,
              Delta = data$Delta, num_knots_Y0W = c(20, 10, 0), max_degree_Y0W = 2,
              sl3_learner_A = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(20, 10, 2),
                                                      fit_control = list(parallel = TRUE, n_folds = 20)),
              fit_control = list(parallel = TRUE, n_folds = 20),
              targeting_method = "universal", smoothness_order_Y0W = 1, fit_Q0W_separate = FALSE)

  tmpci <- fit$coefs
  print(summary(fit))
  passes <- cbind(passes, tmpci[, 4] <= true & tmpci[, 5] >= true)

  # Same fit but with the default fit_Q0W_separate and 10 CV folds
  fit <- spOR(~ 1, W = data[, c("W1", "W2", "T")], A = data$A, Y = data$Jobs, data = data,
              Delta = data$Delta, num_knots_Y0W = c(20, 10, 0), max_degree_Y0W = 2,
              sl3_learner_A = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(20, 10, 2),
                                                      fit_control = list(parallel = TRUE, n_folds = 10)),
              fit_control = list(parallel = TRUE, n_folds = 10),
              targeting_method = "universal", smoothness_order_Y0W = 1)

  tmpci <- fit$coefs
  print(summary(fit))
  passes1 <- cbind(passes1, tmpci[, 4] <= true & tmpci[, 5] >= true)

  print(rowMeans(passes))
  print(rowMeans(passes1))



}
# ci <- c()
# ci1 <- c()
# ci2 <- c()
# passes <- c()
# passesb <- c()
# passesc <- c()
# passes1 <- c()
# ests <- c()
# ests1 <- c()
for (i in 1:100) {
  print(i)
  n <- 5000
  D <- DAG.empty()
  D <- D +
    node("W1", distr = "runif", min = -1, max = 1) +
    node("W2", distr = "runif", min = -1, max = 1) +
    node("Tfull", distr = "rweibull", shape = 4, scale = exp(1)) +
    node("T", distr = "rconst", const = pmin(Tfull / 2, 5)) +  # elementwise truncation at 5
    node("A", distr = "rbinom", size = 1, prob = plogis(0.5 * (W1 + W2 + T - 1))) +
    node("R", distr = "rbinom", size = 1, prob = 1) +
    node("mu", distr = "rconst",
         const = plogis(0.8 * (-1.25 + A - 1 * A * T + 0.8 * (T + W1 + W2 - W1*W2 + T*W2 + T*W1)))) +
    node("J", distr = "rbinom", size = 1, prob = mu) +
    node("G", distr = "rconst", const = 1 - 0.3 * plogis(W1 + W2)) +
    node("Delta", distr = "rbinom", size = 1, prob = G) +  # outcome observed with probability G
    node("Jobs", distr = "rconst", const = J * Delta * R)

  setD <- set.DAG(D)
  # Sanity check of the true coefficients on a large sample:
  # data <- sim(setD, n = 250000)
  # fit_glm <- glm(Jobs ~ T + W1 + W2 + A + A*T, family = binomial(), data = data[data$Delta == 1, ])
  # coef(fit_glm)
  data <- sim(setD, n = n)
  print(quantile(data$mu))
  table(data$Delta)




  # True coefficients: mu puts coefficient 0.8 on A and -0.8 on A*T
  true <- 0.8 * c(1, -1)
  library(doMC)
  registerDoMC(10)

  fit <- spOR(~ 1 + T, W = data[, c("W1", "W2", "T")], A = data$A, Y = data$Jobs,
              Delta = data$Delta, num_knots_Y0W = c(15, 10, 0), max_degree_Y0W = 2,
              sl3_learner_A = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(10, 5, 2),
                                                      fit_control = list(parallel = TRUE, n_folds = 10)),
              fit_control = list(parallel = TRUE, n_folds = 10),
              targeting_method = "universal", smoothness_order_Y0W = 1, fit_Q0W_separate = FALSE)

  tmpci <- fit$coefs
  print(tmpci)
  ests <- cbind(ests, fit$coefs[, 1])
  ci <- cbind(ci, apply(tmpci[, c(4, 5)], 1, diff))  # CI widths
  passes <- cbind(passes, tmpci[, 4] <= true & tmpci[, 5] >= true)
  print(rowMeans(passes))

  ##########

  # Semiparametric fit with a glm nuisance for P(Y=1|A=0,W)
  fit <- spOR(~ 1 + T, W = data[, c("W1", "W2", "T")], A = data$A, Y = data$Jobs,
              Delta = data$Delta, glm_formula_Y0W = ~ .^2,
              sl3_learner_A = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(10, 5, 2),
                                                      fit_control = list(parallel = TRUE, n_folds = 10)))

  tmpci <- fit$coefs
  passesb <- cbind(passesb, tmpci[, 4] <= true & tmpci[, 5] >= true)
  print(rowMeans(passesb))


  ########

  # Fully nonparametric fit
  fit <- npOR(~ 1 + T, W = data[, c("W1", "W2", "T")], A = data$A, Y = data$Jobs,
              Delta = data$Delta,
              sl3_learner_default = Lrnr_hal9001_custom$new(max_degree = 2, num_knots = c(10, 5, 0),
                                                            fit_control = list(parallel = TRUE)))

  tmpci <- fit$coefs
  print(tmpci)
  ci2 <- cbind(ci2, apply(tmpci[, c(4, 5)], 1, diff))  # CI widths
  passesc <- cbind(passesc, tmpci[, 4] <= true & tmpci[, 5] >= true)
  print(rowMeans(passesc))


  ########

  # Comparison: complete-case glm (its outcome model omits the W1*W2, T*W1,
  # and T*W2 interactions present in the truth, so it is misspecified)
  fit_glm <- glm(Jobs ~ T + W1 + W2 + A + A*T, family = binomial(), data = data[data$Delta == 1, ])
  tmpci1 <- confint(fit_glm)[5:6, , drop = FALSE]  # rows for the A and T:A coefficients
  ci1 <- cbind(ci1, apply(tmpci1, 1, diff))  # CI widths
  ests1 <- cbind(ests1, coef(fit_glm)[5:6])
  passes1 <- cbind(passes1, tmpci1[, 1] <= true & tmpci1[, 2] >= true)
  print(rowMeans(passes1))


}
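
# Average CI widths across replicates (columns collected by the loop above)
rowMeans(ci)   # spOR with HAL nuisance estimators
rowMeans(ci1)  # complete-case glm
rowMeans(ci2)  # npOR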

