#' Make short variables for frequentist corrections
#'
#' @param df Data application data frame
#' @return Data frame with additional aliased variables
#' @export
make_da_shortcut_vars <- function(df) {
df$z1 <- df$female
df$z2 <- as.factor(df$age_cat)
df$z3 <- as.factor(df$region)
df$a <- df$black
df$m <- df$stage_cat
df$y <- df$fiveyearsurv
if ("pov" %in% colnames(df)) {
df$u <- df$pov
}
return(df)
}
#' Process SEER for frequentist corrections
#'
#' @param seer_file_path File path to raw SEER data
#' @return SEER data with aliased variables z1, z2, ...
#' @export
make_da_big_df <- function(seer_file_path) {
seer <- read.table(seer_file_path)
# Make middle age category the reference
seer$age_cat <- relevel(as.factor(seer$age_cat), ref = 2)
seer$female <- ifelse(seer$female == 2, 1, 0)
# Collapse regions to match CanCors
seer$region[seer$region == 2] <- 1
seer$region <- factor(seer$region,
labels = c("Other", "South", "West"))
seer <- make_da_shortcut_vars(seer)
return(seer)
}
#' Process CanCORS for frequentist corrections
#'
#' @param cancors_file_path File path to raw CanCORS data
#' @return CanCORS data with aliased variables z1, z2, ...
#' @export
make_da_small_df <- function(cancors_file_path) {
# Read in raw data
cancors <- read.csv(cancors_file_path)
# Make Cancors variable names match SEER
cancors <- na.omit(cancors)
cancors$stage_cat <- cancors$stage
cancors$age_cat <- relevel(as.factor(cancors$age_dx), ref = 2)
cancors$female <- ifelse(cancors$gender == 2, 1, 0)
cancors$pov <- ifelse(cancors$income == 1, 1, 0)
cancors$black <- cancors$race
cancors$region <- factor(cancors$region,
labels = c("Other", "South", "West"))
cancors$fiveyearsurv <- ifelse(cancors$surv < 365.25 * 5, 0, 1)
cancors <- make_da_shortcut_vars(cancors)
return(cancors)
}
#' Count up baseline covariate patterns for predictions
#'
#' @param df Data frame containing z1, z2, z3
#' @return Data frame containing unique combinations of z1, z2, z2, and frequency counts
#' @export
make_da_cov_df_with_counts <- function(df) {
cov_df <- df[, c("z1", "z2", "z3")]
cov_df <- dplyr::group_by(.data = cov_df, z1, z2, z3)
cov_df <- as.data.frame(dplyr::tally(x = cov_df))
colnames(cov_df)[colnames(cov_df) == "n"] <- "counts"
return(cov_df)
}
#' Data application interaction correction
#'
#' @param df Big data frame containing y, a, m, z1, z2, z3
#' @param small_df Small data containing y, a, m, z1, z2, z3, u
#' @param nonparametric Whether to calculate everything nonparametrically
#' @return Estimated bias-corrected population (r)NDE
#' @export
run_da_intx_corr <- function(df, small_df, nonparametric = TRUE) {
if (nonparametric) {
formula_y <- formula(y ~ 1 + (z1 + z2 + z3 + a + m + u)^6)
} else {
formula_y <- formula(y ~ 1 + z1 + z2 + z3 + a + m + a:m + u)
}
# Big data sample frequencies of covariate patterns
cov_df <- make_da_cov_df_with_counts(df)
a <- 1
a_star <- 0
# Fit models to calculate B_nde
fit_y <- glm(y ~ 1 + (z1 + z2 + z3 + a + m + u)^6, data = small_df,
family = binomial(link = "logit"))
fit_u_cond <- glm(u ~ 1 + (z1 + z2 + z3 + a + m)^5, data = small_df,
family = binomial(link = "logit")) # weird but correct
fit_u_marg <- glm(u ~ 1 + (z1 + z2 + z3)^3, data = small_df,
family = binomial(link = "logit"))
# Calculate bias for every covariate pattern, summing across m and u
cov_df$B_nde <- 0
for (m in 1:4) {
small_df$meqm <- (small_df$m == m) * 1
fit_m_cond <- glm(meqm ~ 1 + (z1 + z2 + z3 + a + u)^5, data = small_df,
family = binomial(link = "logit"))
fit_m_marg <- glm(meqm ~ 1 + (z1 + z2 + z3 + a)^4, data = small_df,
family = binomial(link = "logit"))
for (u in 0:1) {
ey1 <- predict(fit_y, newdata = cbind(cov_df, a = a, m = m, u = u),
type = "response")
ey2 <- predict(fit_y, newdata = cbind(cov_df, a = a_star, m = m, u = u),
type = "response")
pm3 <- predict(fit_m_marg, newdata = cbind(cov_df, a = a_star),
type = "response")
pm4 <- predict(fit_m_cond, newdata = cbind(cov_df, a = a_star, u = u),
type = "response")
qu5 <- predict(fit_u_cond, newdata = cbind(cov_df, a = a, m = m),
type = "response")
pu5 <- u * qu5 + (1 - u) * (1 - qu5)
qu6 <- predict(fit_u_cond, newdata = cbind(cov_df, a = a_star, m = m),
type = "response")
pu6 <- u * qu6 + (1 - u) * (1 - qu6)
qu7 <- predict(fit_u_marg, newdata = cbind(cov_df),
type = "response")
pu7 <- u * qu7 + (1 - u) * (1 - qu7)
cov_df$B_nde <- cov_df$B_nde +
(ey1 * pu5 * pm3
- ey2 * pu6 * pm3
- ey1 * pm4 * pu7
+ ey2 * pm4 * pu7)
}
}
naive_nder <- calculate_categorical_naive_nder(df = df,
nonparametric = nonparametric)
# Subtract off bias and average by covariate pattern frequency
intx_nder <- naive_nder - cov_df$B_nde
avg_intx_nder <- weighted.mean(intx_nder, w = cov_df$counts)
return(avg_intx_nder)
}
#' Calculate the naive NDER for a categorical mediator
#'
#' @param df Data frame for calculationg
#' @param nonparametric Whether to fit nearly-saturated models (i.e., do NP
#' estimation). Note: this is not actually nonparametric because we need a little
#' bit of smoothing due to empty cells
#' @param avg Whether to take weighted average
#' @return Vector (if avg = FALSE) of naive NDEs for each covariate pattern or scalar
#' NDE
#' @export
calculate_categorical_naive_nder <- function(df, nonparametric = FALSE, avg = FALSE) {
a <- 1
astar <- 0
cov_df <- make_da_cov_df_with_counts(df)
cov_df$NDE <- 0
if (nonparametric) {
fit_y <- lm(y ~ 1 + (z1 + z2 + z3 + a + m)^4, data = df)
} else {
fit_y <- glm(y ~ 1 + z1 + z2 + z3 + a * m, data = df,
family = binomial(link = "logit"))
}
for (m in 1:4) {
df$meqm <- (df$m == m) * 1
if (nonparametric) {
fit_meqm <- glm(meqm ~ 1 + (z1 + z2 + z3 + a)^2, data = df,
family = binomial(link = "logit"))
} else {
fit_meqm <- glm(meqm ~ 1 + z1 + z2 + z3 + a, data = df,
family = binomial(link = "logit"))
}
# browser()
pm <- predict(fit_meqm, newdata = cbind(cov_df, a = astar), type = "response")
ey1 <- predict(fit_y, newdata = cbind(cov_df, m = m, a = a), type = "response")
ey2 <- predict(fit_y, newdata = cbind(cov_df, m = m, a = astar), type = "response")
cov_df$NDE <- cov_df$NDE + (ey1 - ey2) * pm
}
if (avg) {
naive <- weighted.mean(x = cov_df$NDE, w = cov_df$counts)
} else {
naive <- cov_df$NDE
}
return(naive)
}
# seer <- make_da_big_df(seer_file_path = "~/Dropbox/mediation-BSA/Data/Raw/data_SEER_for_BSA_summer_project.txt")
# cancors <- make_da_small_df(cancors_file_path = "~/Dropbox/mediation-BSA/Data/Raw/selected_cancors_data_2016_3_14.csv")
# calculate_categorical_naive_nder(df = seer, nonparametric = TRUE, avg = TRUE)
# calculate_categorical_naive_nder(df = cancors, nonparametric = FALSE, avg = TRUE)
# calculate_categorical_naive_nder(df = cancors, nonparametric = TRUE, avg = TRUE)
# run_da_intx_corr(df = seer, small_df = cancors, nonparametric = FALSE)
# run_da_intx_corr(df = seer, small_df = cancors, nonparametric = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.