# borrowr: estimate population average treatment effects with borrowing between data sources.
# Copyright (C) 2019 Jeffrey A. Verdoliva Boatman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
fit_mems <- function(mf, estimator, ndpost, exch_prob, ...) {
# if(estimator != "bayesian_lm")
# stop("'bayesian_lm' is currently the only estimator implemented.")
formula <- attr(mf, "formula")
src_var <- attr(mf, "src_var")
trt_var <- attr(mf, "trt_var")
com_var <- attr(mf, "com_var")
nc <- !is.na(com_var)
ps <- attr(mf, "primary_source")
sl <- levels(mf[, src_var])
nl <- length(sl)
# bad idea:
# nm <- match(c(src_var, trt_var), names(mf))
# names(mf)[nm] <- c("src", "trt")
# sl <- levels(mf[, src_var])
# ps <- sl[1L]
# nl <- length(sl)
il <- replicate(nl - 1, c(TRUE, FALSE), simplify = FALSE)
il <- c(TRUE, il)
im <- do.call(expand.grid, il)
im <- t(as.matrix(im))
dimnames(im) <- list(
source = sl,
MEM = seq_len(ncol(im))
)
prior_mem <- numeric(ncol(im))
names(prior_mem) <- seq_len(ncol(im))
names(exch_prob) <- rownames(im)[-1]
mem_pate_post <- array(dim = c(ndpost, ncol(im)))
mem_EY0 <- array(dim = c(ndpost, ncol(im)))
mem_EY1 <- array(dim = c(ndpost, ncol(im)))
dimnames(mem_pate_post) <- list(
draw = seq_len(ndpost),
MEM = seq_len(ncol(im))
)
dimnames(mem_EY0) <- dimnames(mem_pate_post)
dimnames(mem_EY1) <- dimnames(mem_pate_post)
# Y <- matrix(model.response(mf, "numeric"), ncol = 1)
# Y <- mf[, all.vars(formula[[2]])]
# Y <- matrix(Y, ncol = 1)
log_marg_like <- numeric(ncol(im))
names(log_marg_like) <- paste0("MEM_", seq_len(ncol(im)))
# in the event of bugs, check sorting and splitting
Xf <- mf
Xf <- Xf[order(Xf[, src_var]), ]
Y <- Xf[, all.vars(formula[[2]])]
# Y <- matrix(Y, ncol = 1)
# for bayesian lm:
if(estimator == "bayesian_lm") {
Xm <- as.data.frame(model.matrix(formula, Xf))
X0 <- Xf[Xf[, src_var] == ps, ]
X1 <- Xf[Xf[, src_var] == ps, ]
X0[, trt_var] <- 0
X1[, trt_var] <- 1
if (nc) {
X0[, com_var] <- 1
X1[, com_var] <- 1
}
X0 <- as.data.frame(model.matrix(formula, X0))
X1 <- as.data.frame(model.matrix(formula, X1))
beta_post_mean <- array(dim = c(ncol(Xm), ncol(im)))
beta_post_var <- array(dim = c(ncol(Xm), ncol(im)))
dimnames(beta_post_mean) <- list(
coef = colnames(Xm),
MEM = seq_len(ncol(im))
)
dimnames(beta_post_var) <- list(
var = colnames(Xm),
MEM = seq_len(ncol(im))
)
} else if (estimator == "BART") {
Xm <- Xf[, c(all.vars(formula[[3]]), src_var), drop = FALSE]
beta_post_mean <- NA
beta_post_var <- NA
}
attr(Xm, "src_var") <- src_var
attr(Xm, "trt_var") <- trt_var
attr(Xm, "primary_source") <- ps
attr(Xm, "com_var") <- com_var
sro <- Xf[, src_var]
Xs <- split(Xm, sro)
#Xfs <- lapply(Xfs, as.matrix)
Ys <- split(Y, sro)
# X0 <- Xs[[1]]
# X1 <- Xs[[1]]
# X0[, trt_var] <- 0 # bayes_lm only?
# X1[, trt_var] <- 1 # bayes_lm only?
#sepfits <- list()
# ...
# fit the BART model for each level of the source variable.
# if(bart) {
#
# }
if(estimator == "bayesian_lm") {
sepfits <- mapply(bayes_lm,
Y = Ys,
X = Xs,
X0 = list(X0),
X1 = list(X1),
ndpost = ndpost,
SIMPLIFY = FALSE)
multicollinearity_flag <- any(sapply(sepfits, "[[", "multicollinearity_flag"))
if (multicollinearity_flag)
warning("Multicollinearity in design matrix X.")
} else if (estimator == "BART") {
#for BART only
message("Fitting BART model to each data source...")
sepfits <- mapply(bart,
Y = Ys,
X = Xs,
#X0 = list(X0),
#X1 = list(X1),
estimate = as.list(rep(c(TRUE, FALSE), c(1, nl - 1))),
ndpost = ndpost,
# theta = 10,
sparse = TRUE,
cont = FALSE,
SIMPLIFY = FALSE,
...)
}
for(mem in seq_len(ncol(im))) {
if (estimator == "BART")
message(sprintf("Fitting MEM %d of %d ...", mem, ncol(im)))
cm <- im[, mem]
# cn <- names(cm)[which(cm)]
exch <- 1 * cm[-1]
prior_mem[mem] <- prod(exch * exch_prob + (1 - exch) * (1 - exch_prob))
tfits <- sepfits
if (any(cm[-1L])) {
tempX <- do.call(rbind, Xs[cm])
tempY <- do.call(c, Ys[cm])
if (estimator == "bayesian_lm") {
tfits[[1]] <- bayes_lm(Y = tempY,
X = tempX,
X0 = X0,
X1 = X1,
ndpost = ndpost)
} else if (estimator == "BART") {
tfits[[1]] <- bart(Y = tempY,
X = tempX,
#X0 = X0,
#X1 = X1,
estimate = TRUE,
ndpost = ndpost,
# theta = 10,
sparse = TRUE,
cont = FALSE,
...)
}
}
ll <- sapply(tfits, "[[", "log_marg_like")[c(TRUE, !cm[-1L])]
log_marg_like[mem] <- sum(ll)
mem_pate_post[, mem] <- tfits[[1]]$pate_post
mem_EY0[, mem] <- tfits[[1]]$EY0
mem_EY1[, mem] <- tfits[[1]]$EY1
if (estimator == "bayesian_lm") {
beta_post_mean[, mem] <- colMeans(tfits[[1]]$beta_post)
beta_post_var[, mem] <- apply(tfits[[1]]$beta_post, 2, var)
}
}
# if(estimator == "BART") {
# Xf <- mf
# Xf <- Xf[, all.vars(attr(mf, "terms")[[3]])]
# attr(Xf, "src_var") <- src_var
# attr(Xf, "trt_var") <- trt_var
# attr(Xf, "primary_source") <- attr(mf, "primary_source")
# # attr(Xf, "in_prim") <- attr(mf, "in_prim")
# Xfo <- Xf[order(Xf[, src_var]), ]
# sro <- Xfo[, src_var]
# # Xfo[, src_var] <- NULL
# Xfs <- split(Xfo, sro)
# #Xfs <- lapply(Xfs, as.matrix)
# Ys <- split(Y, sro)
# #X0 <- Xfs[[1]]
# #X1 <- Xfs[[1]]
# # X0[, trt_var] <- 0 # now done within the bart function
# # X1[, trt_var] <- 1 # now done within the bart function
# #sepfits <- list()
# # ...
# # fit the BART model for each level of the source variable.
# message("Fitting BART model to each data source...")
# sepfits <- mapply(bart,
# Y = Ys,
# X = Xfs,
# #X0 = list(X0),
# #X1 = list(X1),
# estimate = as.list(rep(c(TRUE, FALSE), c(1, nl - 1))),
# ndpost = ndpost,
# beta = beta,
# gf = gf,
# eb = eb,
# # theta = 10,
# sparse = TRUE,
# cont = FALSE,
# SIMPLIFY = FALSE,
# ...)
# for(mem in seq_len(ncol(im))) {
# message(sprintf("Computing MEM %d of %d ...", mem, ncol(im)))
# cm <- im[, mem]
# # cn <- names(cm)[which(cm)]
# tfits <- sepfits
# if (any(cm[-1L])) {
# tempX <- do.call(rbind, Xfs[cm])
# tempY <- do.call(c, Ys[cm])
# tfits[[1]] <- bart(Y = tempY,
# X = tempX,
# #X0 = X0,
# #X1 = X1,
# estimate = TRUE,
# ndpost = ndpost,
# beta = beta,
# gf = gf,
# eb = eb,
# # theta = 10,
# sparse = TRUE,
# cont = FALSE,
# ...)
# }
# ll <- sapply(tfits, "[[", "log_marg_like")[c(TRUE, !cm[-1L])]
# log_marg_like[mem] <- sum(ll)
# mem_pate_post[, mem] <- tfits[[1]]$pate_post
# }
# } else if (estimator == "bayesian_lm") {
# for(mem in seq_len(ncol(im))) {
# message(sprintf("Computing MEM %d of %d ...", mem, ncol(im)))
# cm <- im[, mem]
# cn <- names(cm)[which(cm)]
# temp_mf <- mf
# temp_mf[, src_var][temp_mf[, src_var] %in% cn] <- ps
# X <- model.matrix(terms(mf), temp_mf)
# X <- apply(X, 2, dropzeros)
# if(is.list(X))
# X <- do.call(cbind, X)
# X0 <- X1 <- X[mf[, src_var] == ps, ]
# X0[, trt_var] <- 0
# X1[, trt_var] <- 1
# fit <- bayes_lm(Y, X, X0, X1, ndpost)
# mem_pate_post[, mem] <- fit$pate_post
# log_marg_like[mem] <- fit$log_marg_like
# }
# }
lml <- log_marg_like
cons <- -max(lml)
# post_probs <- exp(cons + log_marg_like) / sum(exp(cons + log_marg_like))
post_probs <- exp(cons + lml) * prior_mem / sum(exp(cons + lml) * prior_mem)
# post_probs[which(is.na(post_probs))] <- 0
pate_post <- sample_posterior(mem_pate_post, post_probs)
# better to return the matrices rather than the
# weighted averages.
EY0 <- sample_posterior(mem_EY0, post_probs)
EY1 <- sample_posterior(mem_EY1, post_probs)
out <- list(
estimator = estimator,
pate_post = pate_post,
log_marg_like = log_marg_like,
post_probs = post_probs,
MEMs = im,
exch_prob = exch_prob
)
out$mem_pate_post <- mem_pate_post
out$mem_EY0 <- mem_EY0
out$mem_EY1 <- mem_EY1
out$EY0 <- EY0
out$EY1 <- EY1
out$beta_post_mean <- beta_post_mean
out$beta_post_var <- beta_post_var
out
}
# set.seed(100)
# n <- 100
# dat <- data.frame(
# src = sample(letters[1:3], n, replace = TRUE),
# trt = sample(c(1, 0), n, replace = TRUE),
# x1 = rnorm(n),
# x2 = rnorm(n),
# y = rnorm(n)
# )
#
# dat$src <- relevel(dat$src, ref = "b")
# str(dat)
#
# mf <- model.frame(y ~ src + trt + x1 * x2, dat)
# attr(mf, "src_var") <- "src"
# attr(mf, "trt_var") <- "trt"
# attr(mf, "primary_source") <- "b"
# # debug(fit_mems)
# # undebug(bart)
# # undebug(BART::wbart)
# foo <- fit_mems(mf, "BART", 10, 2)
# bar <- fit_mems(mf, "bayesian_lm")
# foo$log_marg_like
# bar$log_marg_like
# foo$post_probs
# bar$post_probs
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.