#' Propensity Score Calibration (Conditional Expectation Method)
#'
#' Implements the "conditional expectation" version of propensity score
#' calibration as described by Sturmer et al. (\emph{Am. J. Epidemiol.} 2005).
#' For the "algebraic" version, see \code{\link{psc_algebraic}}.
#'
#' The true disease model is a GLM:
#'
#' g[E(Y)] = beta_0 + beta_x X + beta_g G
#'
#' where G = P(X|\strong{C},\strong{Z}), with \strong{C} but not \strong{Z}
#' available in the main study.
#'
#' In a validation study with (X, \strong{C}, \strong{Z}), logistic regression
#' is used to obtain fitted probabilities for G as well as an error-prone
#' version G* = P(X|\strong{C}). A linear model is fitted to map from G* to
#' E(G|G*). Finally, in the main study, G*'s are calculated, then E(G|G*),
#' and the disease model is fit for Y vs. (X, E(G|G*)).
#'
#' @inheritParams rc_cond_exp
#'
#' @param x_var Character string specifying name of X variable.
#' @param gs_vars Character vector specifying names of variables for gold
#' standard propensity score.
#' @param ep_vars Character vector specifying names of variables for error-prone
#' propensity score.
#' @param surrogacy Logical value for whether to assume surrogacy, which means
#' that the error-prone propensity score is not informative of Y given X and the
#' gold standard propensity score. Have to assume surrogacy if validation data
#' is external.
#' @param ep_data Character string controlling what data is used to fit the
#' error-prone propensity score model. Choices are \code{"validation"} for
#' validation study data, \code{"all"} for main study and validation study data,
#' and \code{"separate"} for validation data for first step and main study data
#' for second step.
#'
#'
#' @references
#' Sturmer, T., Schneeweiss, S., Avorn, J. and Glynn, R.J. (2005) "Adjusting
#' effect estimates for unmeasured confounding with validation data using
#' propensity score calibration." \emph{Am. J. Epidemiol.} \strong{162}(3):
#' 279-289.
#'
#'
#' @export
# # Data for testing
# n.m <- 5000
# n.e <- 5000
# n <- n.m + n.e
#
# gammas <- c(0, 0.25, 0.25)
# betas <- c(0, 0.25, 0.1, 0.2)
# sigsq_e <- 0.5
#
# c <- rnorm(n)
# z <- rnorm(n)
# x <- rbinom(n, size = 1, prob = (1 + exp(-gammas[1] - gammas[2] * c - gammas[3] * z))^(-1))
# y <- betas[1] + betas[2] * z + betas[3] * x + betas[4] * c + rnorm(n, sd = sqrt(sigsq_e))
#
# all_data <- data.frame(y = y, z = z, x = x, c = c)
# all_data[1: n.e, 1] <- NA
# all_data[(n.e + 1): n, 2] <- NA
# main <- internal <- external <- NULL
# y_var <- "y"
# x_var <- "x"
# gs_vars <- c("z", "c")
# ep_vars <- "c"
# tdm_family <- "gaussian"
# surrogacy <- TRUE
# ep_data <- "validation"
# beta_0_formula <- 1
# delta_var <- TRUE
# boot_var <- TRUE
# boots <- 100
#
# fit <- psc_cond_exp(all_data = all_data,
# y_var = "y",
# x_var = "x",
# gs_vars = c("z", "c"),
# ep_vars = "c")
psc_cond_exp <- function(all_data = NULL,
main = NULL,
internal = NULL,
external = NULL,
y_var,
x_var,
gs_vars,
ep_vars,
tdm_family = "gaussian",
surrogacy = TRUE,
ep_data = "separate",
boot_var = FALSE, boots = 100,
alpha = 0.05) {
# Get full list of covariates
covariates <- unique(c(x_var, gs_vars, ep_vars))
# Get list of covariates subject to missingness
covariates.missing <- setdiff(gs_vars, ep_vars)
# If all_data not specified, create it from main, internal, and external
if (is.null(all_data)) {
if (! is.null(main)) {
if (! all(covariates.missing %in% names(main))) {
main[, covariates.missing] <- NA
}
main <- main[, c(y_var, covariates)]
all_data <- main
}
if (! is.null(internal)) {
internal <- internal[, c(y_var, covariates)]
all_data <- rbind(all_data, internal)
}
if (! is.null(external)) {
if (! y_var %in% names(external)) {
external[, y_var] <- NA
}
external <- external[, c(y_var, covariates)]
all_data <- rbind(all_data, external)
}
}
# Subset validation data with (X, Z, C) and data with outcome variable
locs.val <- which(complete.cases(all_data[, covariates]))
val_data <- all_data[locs.val, ]
locs.y <- which(! is.na(all_data[, y_var]))
y_data <- all_data[locs.y, ]
locs.main <- which(! is.na(all_data[, y_var]) & ! complete.cases(all_data[, covariates]))
main <- all_data[locs.main, ]
# Figure out type of validation data
val.type <- ifelse(length(locs.main) == length(locs.y), "external", "internal")
# Fit error-prone propensity score model
ep.formula <- paste(paste(x_var, "~"), paste(ep_vars, collapse = " + "))
if (ep_data %in% c("validation", "separate")) {
fit.ep <- glm(ep.formula, data = val_data, family = "binomial")
} else if (ep_data == "all") {
fit.ep <- glm(ep.formula, data = all_data, family = "binomial")
}
fitted.gstar <- predict(fit.ep, newdata = val_data, type = "response")
# Fit gold standard propensity score model
gs.formula <- paste(paste(x_var, "~"), paste(gs_vars, collapse = " + "))
fit.gs <- glm(gs.formula, data = val_data, family = "binomial")
fitted.g <- fit.gs$fitted
# Fit MEM for G vs. (X, G*)
fit.mem <- lm(fitted.g ~ val_data[, x_var] + fitted.gstar)
# Add G*'s and G's to y_data for internal validation subjects
y_data$gstar <- NA
y_data$g <- NA
if (val.type == "internal") {
y_data$gstar[locs.val] <- fitted.gstar
y_data$g[locs.val] <- fitted.g
}
# Re-fit error-prone propensity score model if requested
if (ep_data == "separate") {
fit.ep <- glm(ep.formula, data = main, family = "binomial")
}
# Calculate G*'s and E(G)'s for main study subjects
y_data$gstar[locs.main] <- predict(fit.ep, newdata = main, type = "response")
y_data$g[locs.main] <- as.matrix(cbind(1, y_data[locs.main, c(x_var, "gstar")])) %*% fit.mem$coef
# Fit TDM and return beta estimate
if (surrogacy) {
tdm.formula <- paste(paste(y_var, "~"), paste(c(x_var, "g"), collapse = " + "))
beta.labels <- paste("beta", c("0", x_var, "g"), sep = "_")
} else {
tdm.formula <- paste(paste(y_var, "~"), paste(c(x_var, "g", "gstar"), collapse = " + "))
beta.labels <- paste("beta", c("0", x_var, "g", "gstar"), sep = "_")
}
tdm.fit <- glm(tdm.formula, data = y_data, family = tdm_family)
beta.hat <- tdm.fit$coef
names(beta.hat) <- beta.labels
# Add beta.hat to ret.list
ret.list <- list(beta.hat = beta.hat)
# Get bootstrap variance estimate if requested
if (boot_var) {
# Various data types
locs.m <- which(! is.na(all_data[, y_var]) & ! complete.cases(all_data[, covariates]))
locs.i <- which(! is.na(all_data[, y_var]) & complete.cases(all_data[, covariates]))
locs.e <- which(is.na(all_data[, y_var]) & complete.cases(all_data[, covariates]))
# Initialize matrix for theta estimates
beta.hat.boots <- matrix(NA, ncol = length(beta.hat), nrow = boots)
# Bootstrap
for (ii in 1: boots) {
beta.hat.boots[ii, ] <- psc_cond_exp(
all_data = all_data[c(sample(locs.m, replace = TRUE),
sample(locs.i, replace = TRUE),
sample(locs.e, replace = TRUE)), ],
y_var = y_var,
x_var = x_var,
gs_vars = gs_vars,
ep_vars = ep_vars,
tdm_family = tdm_family,
surrogacy = surrogacy,
ep_data = ep_data
)
}
# Calculate bootstrap variance estimates
boot.variance <- var(beta.hat.boots)
rownames(boot.variance) <- colnames(boot.variance) <- beta.labels
ret.list$boot.var <- boot.variance
boot.ci <- apply(beta.hat.boots, 2, function(x)
quantile(x, probs = c(alpha / 2, 1 - alpha / 2)))
colnames(boot.ci) <- beta.labels
ret.list$boot.ci <- boot.ci
}
# Return ret.list
if (length(ret.list) == 1) {
ret.list <- ret.list[[1]]
}
return(ret.list)
}
# Archived 6/5/18 - was in middle of modifying
# psc <- function(all_data = NULL,
# main = NULL,
# internal = NULL,
# external = NULL,
# y_var,
# x_var,
# gs_vars,
# ep_vars,
# tdm_family = "gaussian",
# surrogacy = TRUE,
# ep_data = "validation",
# all_imputed = FALSE,
# boot_var = FALSE, boots = 100,
# alpha = 0.05) {
#
# # Get full list of covariates
# covariates <- unique(c(x_var, gs_vars, ep_vars))
#
# # Get list of covariates subject to missingness
# covariates.missing <- setdiff(gs_vars, ep_vars)
#
# # If all_data not specified, create it from main, internal, and external
# if (is.null(all_data)) {
#
# if (! is.null(main)) {
# if (! all(covariates.missing %in% names(main))) {
# main[, covariates.missing] <- NA
# }
# main <- main[, c(y_var, covariates)]
# all_data <- main
# }
#
# if (! is.null(internal)) {
# internal <- internal[, c(y_var, covariates)]
# all_data <- rbind(all_data, internal)
# }
#
# if (! is.null(external)) {
# if (! y_var %in% names(external)) {
# external[, y_var] <- NA
# }
# external <- external[, c(y_var, covariates)]
# all_data <- rbind(all_data, external)
# }
#
# }
#
# # Get validation data with (X, Z, C)
# val_data <- all_data[complete.cases(all_data[, covariates]), ]
#
# # Fit error-prone propensity score model
# ep.formula <- paste(paste(x_var, "~"), paste(ep_vars, collapse = " + "))
# if (ep_data %in% c("validation", "separate")) {
# fit.ep <- glm(ep.formula, data = val_data, family = "binomial")
# } else if (ep_data == "all") {
# fit.ep <- glm(ep.formula, data = all_data, family = "binomial")
# }
#
# # Fit gold standard propensity score model
# gs.formula <- paste(paste(x_var, "~"), paste(gs_vars, collapse = " + "))
# fit.gs <- glm(gs.formula, data = val_data, family = "binomial")
#
# # Fit MEM for G vs. (X, G*)
# fitted.gstar <- predict(fit.ep, newdata = val_data, type = "response")
# fitted.g <- fit.gs$fitted
# fit.mem <- lm(fitted.g ~ val_data[, x_var] + fitted.gstar)
#
# # Calculate G's for data with Y
# y_data <- all_data[! is.na(all_data[, y_var]), ]
#
# # Calculate G's for subjects with (C, Z), unless all_imputed is TRUE
# y_data$g <- NA
# if (! all_imputed) {
# y_data$g <- predict(fit.gs, newdata = y_data, type = "response")
# }
#
# # Re-fit error-prone propensity score model if requested
# if (ep_data == "separate") {
# fit.ep <- glm(ep.formula, data = y_data, family = "binomial")
# }
#
# # Calculate E(G|X,G*)
# locs <- which(is.na(y_data$g))
# y_data$gstar <- NA
# y_data$gstar[locs] <- predict(fit.ep, newdata = y_data[locs, ], type = "response")
# y_data$g[locs] <- as.matrix(cbind(1, y_data[locs, c(x_var, "gstar")])) %*% fit.mem$coef
#
# # Fit TDM and return beta estimate
# if (surrogacy) {
# tdm.formula <- paste(paste(y_var, "~"), paste(c(x_var, "g"), collapse = " + "))
# beta.labels <- paste("beta", c("0", x_var, "g"), sep = "_")
# } else {
# tdm.formula <- paste(paste(y_var, "~"), paste(c(x_var, "g", "gstar"), collapse = " + "))
# beta.labels <- paste("beta", c("0", x_var, "g", "gstar"), sep = "_")
# }
# tdm.fit <- glm(tdm.formula, data = y_data, family = tdm_family)
# beta.hat <- tdm.fit$coef
# names(beta.hat) <- beta.labels
#
# # Add beta.hat to ret.list
# ret.list <- list(beta.hat = beta.hat)
#
# # Get bootstrap variance estimate if requested
# if (boot_var) {
#
# # Various data types
# locs.m <- which(! is.na(all_data[, y_var]) & ! complete.cases(all_data[, covariates]))
# locs.i <- which(! is.na(all_data[, y_var]) & complete.cases(all_data[, covariates]))
# locs.e <- which(is.na(all_data[, y_var]) & complete.cases(all_data[, covariates]))
#
# # Initialize matrix for theta estimates
# beta.hat.boots <- matrix(NA, ncol = length(beta.hat), nrow = boots)
#
# # Bootstrap
# for (ii in 1: boots) {
#
# beta.hat.boots[ii, ] <- psc(
# all_data = all_data[c(sample(locs.m, replace = TRUE),
# sample(locs.i, replace = TRUE),
# sample(locs.e, replace = TRUE)), ],
# y_var = y_var,
# x_var = x_var,
# gs_vars = gs_vars,
# ep_vars = ep_vars,
# tdm_family = tdm_family,
# surrogacy = surrogacy,
# ep_data = ep_data,
# all_imputed = all_imputed
# )
#
# }
#
# # Calculate bootstrap variance estimates
# boot.variance <- var(beta.hat.boots)
# rownames(boot.variance) <- colnames(boot.variance) <- beta.labels
# ret.list$boot.var <- boot.variance
#
# boot.ci <- apply(beta.hat.boots, 2, function(x)
# quantile(x, probs = c(alpha / 2, 1 - alpha / 2)))
# colnames(boot.ci) <- beta.labels
# ret.list$boot.ci <- boot.ci
#
# }
#
# # Return ret.list
# if (length(ret.list) == 1) {
# ret.list <- ret.list[[1]]
# }
# return(ret.list)
#
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.