#' Take the softmax of a matrix, row-wise
#'
#' @param x N x K Matrix
#' @return N x K matrix with row sums equal to 1
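#' @examples
#' \dontrun{
#' # Illustrative only: random 3 x 2 matrix of scores
#' scores <- matrix(rnorm(6), nrow = 3, ncol = 2)
#' probs <- row_softmax(scores)
#' rowSums(probs)  # each row sums to 1
#' }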
row_softmax <- function(x){
# Subtract the row max before exponentiating, for numerical stability
# (mathematically equivalent to exp(x) / rowSums(exp(x)))
score_exp <- exp(sweep(x, 1, apply(x, 1, max), "-"))
probs <- sweep(score_exp, 1, rowSums(score_exp), "/")
return(probs)
}
#' Calculate the naive residual disparity for data application
#'
#' @param seer_file_path Path to raw SEER txt file
#' @param am_intx Whether to include A * M interaction in Y model
#' @param B Number of bootstrap samples to take
#' @return Length-B vector of bootstrapped naive residual disparity estimates
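#' @examples
#' \dontrun{
#' # Illustrative call; the SEER file path is an assumption (data not shipped with the
#' # package) and B is reduced from the default for a quick check
#' nder_boot <- calculate_data_app_naive(
#'   seer_file_path = "../Data/Raw/data_SEER_for_BSA_summer_project.txt",
#'   am_intx = 1, B = 200)
#' quantile(nder_boot, probs = c(0.025, 0.5, 0.975))
#' }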
#' @export
calculate_data_app_naive <- function(seer_file_path, am_intx = 1, B = 2000) {
# Read in the SEER data from the file path and set up variable names
seer <- read.table(path.expand(seer_file_path))
mediator <- "stage_cat"
outcome <- "fiveyearsurv"
tx <- "black"
unmeasured <- "pov"
n <- NROW(seer)
K_m <- length(unique(seer[[mediator]]))
# Make middle age category the reference
seer$age_cat <- relevel(as.factor(seer$age_cat), ref = 2)
seer$female <- ifelse(seer$female == 2, 1, 0)
# Collapse regions to match CanCORS
seer$region[seer$region == 2] <- 1
seer$region <- factor(seer$region,
labels = c("Other", "South", "West"))
formulas <- list()
formulas$mediator <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black)
formulas$unmeasured <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black)
if (am_intx) {
formulas$outcome <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black + as.factor(stage_cat)
+ black*as.factor(stage_cat))
} else {
formulas$outcome <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black + as.factor(stage_cat))
}
# Create the generic column names used in the formulas below; Y, M, and A are
# the observed variables, and U is a random placeholder that the naive analysis ignores
seer[["Y"]] <- seer[[outcome]]
seer[["M"]] <- seer[[mediator]]
seer[["U"]] <- sample(0:1, size = n, replace = TRUE)
seer[["A"]] <- seer[[tx]]
# Construct formulas
formulas$mediator <- update.formula(formulas$mediator, M ~ .)
formulas$outcome <- update.formula(formulas$outcome, Y ~ .)
nder_naive <- rep(NA, B)
for (b in 1:B) {
new_booted <- booted <- boot_samp(seer)
# If convergence is an issue, consider adding
# control = VGAM::vglm.control(maxit = 500, stepsize = 0.7)
fit_m <- VGAM::vglm(formulas$mediator, family = VGAM::multinomial(refLevel = 1),
data = booted)
fit_y <- glm(formulas$outcome, family = binomial(link = "logit"),
data = booted)
# Predict the mediator distribution under A = 0
new_booted$A <- new_booted$black <- 0
pm_a0 <- VGAM::predict(fit_m, newdata = new_booted, type = "response")
nder_indiv <- rep(0, NROW(booted))
for (m in 1:K_m) { # loop over mediator (stage) levels
new_booted$M <- new_booted$stage_cat <- m
new_booted$A <- new_booted$black <- 1
py10 <- stats::predict(fit_y, newdata = new_booted, type = "response")
new_booted$A <- new_booted$black <- 0
py00 <- stats::predict(fit_y, newdata = new_booted, type = "response")
nder_indiv <- nder_indiv + unname((py10 - py00) * pm_a0[, m])
}
nder_naive[b] <- mean(nder_indiv)
}
return(nder_naive)
}
#' Make design matrix arrays for the outcome, mediator, and unmeasured confounder models
#'
#' @param Xmat_Y Design matrix for outcome regression
#' @param Xmat_M Design matrix for mediator regression
#' @param Xmat_U Design matrix for unmeasured confounder regression
#' @param am_intx 0/1 indicator of exposure-mediator interaction
#' @param n_patt Number of covariate patterns
#' @param K_m Number of levels for mediator
#' @return Named list containing the design matrix arrays xy, xm, and xu
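#' @examples
#' \dontrun{
#' # Sketch mirroring the usage in process_data_app_gf(); the SEER path is an assumption
#' dfl <- process_seer_raw(seer_file_path = "../Data/Raw/data_SEER_for_BSA_summer_project.txt",
#'                         am_intx = 1)
#' xml <- make_xy_xm_xu(Xmat_Y = dfl$Xmat_Y, Xmat_M = dfl$Xmat_M, Xmat_U = dfl$Xmat_U,
#'                      am_intx = 1, n_patt = dfl$n_patt, K_m = dfl$K_m)
#' dim(xml$xy)  # n_patt x D_y x 2 x 2 x K_m
#' }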
#' @export
make_xy_xm_xu <- function(Xmat_Y, Xmat_M, Xmat_U, am_intx, n_patt, K_m) {
# U design: dimensions (pattern, column, a); the A column is assumed to be the
# last column of Xmat_U, so its index is D_u
D_u <- ncol(Xmat_U)
xu <- array(Xmat_U, dim = c(n_patt, D_u, 2))
wa_xu <- D_u
# M design: dimensions (pattern, column, a, u); A is assumed to sit at column D_u
# and U in the last column
D_m <- ncol(Xmat_M)
xm <- array(Xmat_M, dim = c(n_patt, D_m, 2, 2))
wa_xm <- D_u
wu_xm <- D_m
# Y design: dimensions (pattern, column, a, u, m); columns are assumed to be ordered
# as baseline covariates, A, the K_m - 1 mediator dummies, optional A*M interaction
# dummies, and U last, matching the coefficient ordering used in the Stan fit
D_y <- ncol(Xmat_Y)
xy <- array(Xmat_Y, dim = c(n_patt, D_y, 2, 2, K_m))
wa_xy <- D_u
wm_xy <- (wa_xy + 1):(wa_xy + K_m - 1)
if (am_intx) {
wam_xy <- (max(wm_xy) + 1):(max(wm_xy) + K_m - 1)
}
wu_xy <- D_y
# Fill in values for the design matrices
for (a in 0:1) {
xu[ , wa_xu, a + 1] <- a
xm[ , wa_xm, a + 1, ] <- a
xy[ , wa_xy, a + 1, , ] <- a
for (u in 0:1) {
xm[ , wu_xm, , u + 1] <- u
xy[ , wu_xy, a + 1, u + 1, ] <- u
for (m in 1:K_m) {
# initialize all M dummies to 0
xy[ , wm_xy, a + 1, u + 1, m] <- 0
if (am_intx) {
xy[ , wam_xy, a + 1, u + 1, m] <- 0
}
if (m != 1) {
# make the one non-zero dummy
xy[ , wm_xy[1] + (m - 2), a + 1, u + 1, m] <- 1
if (am_intx && (a == 1)) {
xy[ , wam_xy[1] + (m - 2), a + 1, u + 1, m] <- 1
}
}
} # end m loop
} # end u loop
} # end a loop
return(list(xy = xy, xm = xm, xu = xu))
}
#' Get distribution of M marginalized over U
#'
#' P(M = m | Z = z, A = a) = sum_u P(M = m | z, A = a, U = u) * P(U = u | z, A = a)
#'
#' @param pm Array with element (i,j,k,l) = P(M = j | Z_i, A = k - 1, U = l - 1)
#' @param pu1 N x 2 matrix with element (i,j) = P(U = 1 | Z_i, A = (j - 1))
#' @param a Value of A to evaluate
#' @return N x 4 matrix with element (i,j) = P(M = j | Z = Z_i, A = a)
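#' @examples
#' \dontrun{
#' # Toy inputs (not from the SEER application): 5 covariate patterns, 4 mediator levels
#' N <- 5; K_m <- 4
#' pm  <- array(1 / K_m, dim = c(N, K_m, 2, 2))  # uniform P(M = m | Z, A, U)
#' pu1 <- matrix(0.3, nrow = N, ncol = 2)        # P(U = 1 | Z, A)
#' get_marginal_pm(pm, pu1, a = 1)               # each row sums to 1
#' }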
get_marginal_pm <- function(pm, pu1, a) {
pmmarg <- pm[ , , a + 1, 1] * (1 - pu1[ , a + 1]) +
pm[ , , a + 1, 2] * (pu1[ , a + 1])
return(pmmarg)
}
#' Get distribution of Y marginalized over M and U
#'
#' @param py1 Array with element (i,j,k,l) =
#'     P(Y = 1 | Z_i, A = (j - 1), U = (k - 1), M = l)
#' @param pmmarg N x 4 matrix with element (i,j) = P(M = j | Z = Z_i, A = a)
#' @param pu1 N x 2 matrix with element (i,j) = P(U = 1 | Z_i, A = (j - 1))
#' @param a Value of A to evaluate
#' @return Length-N vector with element (i) = P(Y = 1 | Z_i, A = a), where marginalization
#'     over M and U has occurred with appropriate probabilities
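#' @examples
#' \dontrun{
#' # Toy inputs with assumed dimensions: 5 covariate patterns, 4 mediator levels
#' N <- 5; K_m <- 4
#' py1    <- array(runif(N * 2 * 2 * K_m), dim = c(N, 2, 2, K_m))
#' pmmarg <- matrix(1 / K_m, nrow = N, ncol = K_m)
#' pu1    <- matrix(0.3, nrow = N, ncol = 2)
#' get_marginal_ey(py1, pmmarg, pu1, a = 0)  # length-N vector
#' }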
get_marginal_ey <- function(py1, pmmarg, pu1, a) {
py1margu <- py1[ , a + 1, 1, ] * (1 - pu1[ , a + 1]) +
py1[ , a + 1, 2, ] * (pu1[ , a + 1])
return(rowSums(py1margu * pmmarg))
}
#' Make quartet of Y counterfactuals for (r)NDE and (r)NIE
#'
#' @param py1 Array with element (i,j,k,l) =
#'     P(Y = 1 | Z_i, A = (j - 1), U = (k - 1), M = l)
#' @param pm Array with element (i,j,k,l) = P(M = j | Z_i, A = k - 1, U = l - 1)
#' @param pu1 N x 2 matrix with element (i,j) = P(U = 1 | Z_i, A = (j - 1))
#' @return N x 4 matrix with columns ey00, ey01, ey10, and ey11
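#' @examples
#' \dontrun{
#' # Toy inputs with assumed dimensions, illustrating the output layout
#' N <- 5; K_m <- 4
#' py1 <- array(runif(N * 2 * 2 * K_m), dim = c(N, 2, 2, K_m))
#' pm  <- array(1 / K_m, dim = c(N, K_m, 2, 2))
#' pu1 <- matrix(0.3, nrow = N, ncol = 2)
#' make_quartet(py1, pm, pu1)  # columns ey00, ey01, ey10, ey11
#' }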
#' @export
make_quartet <- function(py1, pm, pu1) {
pm_a1 <- get_marginal_pm(pm, pu1, a = 1)
pm_a0 <- get_marginal_pm(pm, pu1, a = 0)
ey00 <- get_marginal_ey(py1, pmmarg = pm_a0, pu1, a = 0)
ey01 <- get_marginal_ey(py1, pmmarg = pm_a1, pu1, a = 0)
ey10 <- get_marginal_ey(py1, pmmarg = pm_a0, pu1, a = 1)
ey11 <- get_marginal_ey(py1, pmmarg = pm_a1, pu1, a = 1)
return(cbind(ey00 = ey00, ey01 = ey01, ey10 = ey10, ey11 = ey11))
}
#' Function to calculate a Bayesian bootstrap weighted mean
#'
#' @param statistic Length-n vector of a statistic at each covariate pattern
#' @param counts Length-n vector of counts for each covariate pattern
#' @return Scalar Bayesian bootstrap weighted mean of the statistic across covariate patterns
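#' @examples
#' \dontrun{
#' # Toy example: 3 covariate patterns with counts 10, 20, and 30
#' set.seed(1)
#' bayes_boot_weighted_mean(statistic = c(0.1, 0.2, 0.3), counts = c(10, 20, 30))
#' }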
#' @export
bayes_boot_weighted_mean <- function(statistic, counts) {
stopifnot(length(statistic) == length(counts))
# Dirichlet(counts) weights via independent Gamma(shape = count, scale = 1) draws
w <- rgamma(n = length(counts), shape = counts, scale = 1)
w <- w / sum(w)
return(weighted.mean(statistic, w = w))
}
#' Process SEER data to make design matrices and counts for weighting
#'
#' @param seer_file_path Path to raw SEER txt file
#' @param am_intx Whether to include A * M interaction in Y model
#' @return Named list of design matrices Xmat_Y, Xmat_M, and Xmat_U with n_patt unique
#'     rows, frequency counts for each covariate pattern, the number of unique
#'     patterns (n_patt), and the number of unique mediator values (K_m)
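#' @examples
#' \dontrun{
#' # Illustrative call; the SEER file path is an assumption (data not shipped with the package)
#' dfl <- process_seer_raw(seer_file_path = "../Data/Raw/data_SEER_for_BSA_summer_project.txt",
#'                         am_intx = 1)
#' dfl$n_patt
#' dfl$K_m
#' }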
process_seer_raw <- function(seer_file_path, am_intx = 1) {
# Read in the SEER data from the file path and set up variable names
seer <- read.table(path.expand(seer_file_path))
mediator <- "stage_cat"
outcome <- "fiveyearsurv"
tx <- "black"
unmeasured <- "pov"
n <- NROW(seer)
K_m <- length(unique(seer[[mediator]]))
# Make middle age category the reference
seer$age_cat <- relevel(as.factor(seer$age_cat), ref = 2)
seer$female <- ifelse(seer$female == 2, 1, 0)
# Collapse regions to match CanCORS
seer$region[seer$region == 2] <- 1
seer$region <- factor(seer$region,
labels = c("Other", "South", "West"))
formulas <- list()
formulas$mediator <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black)
formulas$unmeasured <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black)
if (am_intx) {
formulas$outcome <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black + as.factor(stage_cat)
+ black*as.factor(stage_cat))
} else {
formulas$outcome <- formula(~ female + as.factor(age_cat) + as.factor(region)
+ black + as.factor(stage_cat))
}
# These random draws get overwritten; they only ensure the model matrices can be built
seer[["Y"]] <- seer[[outcome]]
seer[["M"]] <- seer[[mediator]] <- sample(1:K_m, size = n, replace = TRUE)
seer[["U"]] <- sample(0:1, size = n, replace = TRUE) # gets overwritten
seer[["A"]] <- seer[[tx]] <- sample(0:1, size = n, replace = TRUE)
# Construct formulas
formulas$u_noU <- formulas$unmeasured
formulas$unmeasured <- update.formula(formulas$unmeasured, U ~ .)
formulas$mediator <- update.formula(formulas$mediator, M ~ .)
formulas$outcome <- update.formula(formulas$outcome, Y ~ .)
formulas$mediatorU <- update.formula(formulas$mediator, . ~ . + U)
formulas$outcomeU <- update.formula(formulas$outcome, . ~ . + U)
# Isolate baseline covariates from mediator regression model
covs <- attr(terms(formulas$mediator), "term.labels")
formulas$baseline <- as.formula(paste("~", paste(covs[covs != tx],
collapse = " + ")))
# Simplify into unique patterns for Bayesian bootstrap
full_bl <- cbind(count = 1, model.matrix(formulas$baseline, data = seer))
all_counts <- aggregate(count ~ ., data = full_bl, FUN = sum)
patts <- unique(full_bl[, -1])
patts_with_counts <- merge(patts, all_counts, sort = FALSE)
counts <- patts_with_counts$count
n_patt <- nrow(patts)
# Make shell design matrices for unique covariate patterns
Xmat_Y <- matrix(NA,
ncol = ncol(model.matrix(formulas$outcomeU, seer)),
nrow = nrow(patts))
Xmat_U <- matrix(NA,
ncol = ncol(model.matrix(formulas$unmeasured, seer)),
nrow = nrow(patts))
Xmat_M <- matrix(NA,
ncol = ncol(model.matrix(formulas$mediatorU, seer)),
nrow = nrow(patts))
Xmat_Y[1:nrow(patts), 1:ncol(patts)] <- patts
Xmat_U[1:nrow(patts), 1:ncol(patts)] <- patts
Xmat_M[1:nrow(patts), 1:ncol(patts)] <- patts
return(list(Xmat_Y = Xmat_Y, Xmat_M = Xmat_M, Xmat_U = Xmat_U,
counts = counts, n_patt = n_patt, K_m = K_m))
}
#' Process data application stan fit to obtain posterior draws of mediation estimands
#'
#' @param seer_file_path Path to SEER data
#' @param stan_fit Stan fit from data application
#' @param am_intx Whether an exposure-mediator interaction was included
#' @return Named list of mediation quantities from closed form g-formula approach
#' @examples
#' \dontrun{
#' process_data_app_gf(seer_file_path = "../Data/Raw/data_SEER_for_BSA_summer_project.txt",
#' stan_fit = readRDS("../data_app_res_20180703.rds"),
#' am_intx = 1)
#' }
#' @export
process_data_app_gf <- function(seer_file_path, stan_fit, am_intx) {
# Make design matrix arrays
dfl <- process_seer_raw(seer_file_path = seer_file_path, am_intx = am_intx)
xml <- make_xy_xm_xu(Xmat_Y = dfl$Xmat_Y, Xmat_M = dfl$Xmat_M,
Xmat_U = dfl$Xmat_U, am_intx = am_intx,
n_patt = dfl$n_patt, K_m = dfl$K_m)
counts <- dfl$counts
K_m <- dfl$K_m
n_patt <- dfl$n_patt
rm(dfl)
xy <- xml$xy
xm <- xml$xm
xu <- xml$xu
rm(xml)
# Extract coefficients from model
# rstan::extract() with permuted = TRUE returns a named list of posterior draw arrays
coefs <- rstan::extract(stan_fit, pars = c("alpha", "beta", "gamma"))
coef_u_arr <- t(coefs$gamma)
coef_m_arr <- aperm(coefs$beta, c(3, 2, 1)) # have last dimension be R
coef_y_arr <- t(coefs$alpha)
R <- dim(coef_u_arr)[2]
# Initialize containers
# i = covariate pattern, a = tx level, m = mediator level, u = unmeasured
# confounder level
pu1 <- array(NA, dim = c(n_patt, 2)) #i, a
pm <- array(NA, dim = c(n_patt, K_m, 2, 2)) #i, m, a, u
py1 <- array(NA, dim = c(n_patt, 2, 2, K_m)) #i, a, u, m
nder_gfs <- nier_gfs <- ter_gfs <- rep(NA, R)
# Loop over MCMC iterations
for (r in 1:R) {
coef_u <- coef_u_arr[, r]
coef_m <- coef_m_arr[, , r]
coef_y <- coef_y_arr[, r]
for (a in 0:1) {
pu1[ , a + 1] <- plogis(xu[ , , a + 1] %*% coef_u)
for (u in 0:1) {
pm[ , , a + 1, u + 1] <- row_softmax(xm[ , , a + 1, u + 1] %*% coef_m)
for (m in 1:K_m) {
py1[ , a + 1, u + 1, m] <- plogis(xy[ , , a + 1, u + 1, m] %*% coef_y)
}
}
}
q <- make_quartet(py1, pm, pu1)
nder_gfs[r] <- bayes_boot_weighted_mean(q[ , "ey10"] - q[ , "ey00"],
counts = counts)
nier_gfs[r] <- bayes_boot_weighted_mean(q[ , "ey11"] - q[ , "ey10"],
counts = counts)
ter_gfs[r] <- bayes_boot_weighted_mean(q[ , "ey11"] - q[ , "ey00"],
counts = counts)
}
# Simulated versions
nder_gcs <- c(unlist(rstan::extract(stan_fit, pars = "meffects[1]")))
nier_gcs <- c(unlist(rstan::extract(stan_fit, pars = "meffects[2]")))
ter_gcs <- c(unlist(rstan::extract(stan_fit, pars = "meffects[3]")))
nder_gc <- mean(nder_gcs)
nier_gc <- mean(nier_gcs)
ter_gc <- mean(ter_gcs)
ci_nder_gc <- make_ci(nder_gcs)
ci_nier_gc <- make_ci(nier_gcs)
ci_ter_gc <- make_ci(ter_gcs)
# Posterior summaries
nder_gf <- mean(nder_gfs)
nier_gf <- mean(nier_gfs)
ter_gf <- mean(ter_gfs)
ci_nier_gf <- make_ci(nier_gfs)
ci_nder_gf <- make_ci(nder_gfs)
ci_ter_gf <- make_ci(ter_gfs)
return(list(nder_gcs = nder_gcs, nder_gc = nder_gc, ci_nder_gc = ci_nder_gc,
nier_gcs = nier_gcs, nier_gc = nier_gc, ci_nier_gc = ci_nier_gc,
ter_gcs = ter_gcs, ter_gc = ter_gc, ci_ter_gc = ci_ter_gc,
nder_gfs = nder_gfs, nder_gf = nder_gf, ci_nder_gf = ci_nder_gf,
nier_gfs = nier_gfs, nier_gf = nier_gf, ci_nier_gf = ci_nier_gf,
ter_gfs = ter_gfs, ter_gf = ter_gf, ci_ter_gf = ci_ter_gf))
}