# Doc header --------------------------------------------------------------
# Pairwise binomial mixed models
# Jan van den Brand, PhD
# jan.vandenbrand@kuleuven.be
# janvandenbrand@gmail.com
# Project: DKF19OK003
# The work here is based on The work is based on Fieuws S., Verbeke, G., Boen, F. and Delecluse, C. (2006)
# "High dimensional multivariate mixed models for binary questionnaire data", Applied Statistics, 55 (4), 449-460.
# For the orginal SAS macro, refer to https://ibiostat.be/online-resources/uploads/pairwiseMGLMM.zip
# Combine custom model with the code for pairwise model fitting
# Compare results to work by Fieuws.
# 0: Preliminaries ------------------------------------------------------
setwd("C:/Users/jajgv/Documents/repos/highdimjm")
source("functions/mmm_functions.R")
load("output/pbc2_analysis.RData")
# 1: Extract -----------------------------------------
con <- url("https://raw.github.com/drizopoulos/Repeated_Measurements/master/Data.RData")
load(con)
close(con)
rm(list = c("aids", "aids.id", "glaucoma", "pbc2.id", "prothro", "con"))
names(pbc2) <- tolower(names(pbc2))
pbc2 <- pbc2 %>%
mutate(year = year - years,
status2 = ifelse(years > 5, 0, 1)) %>%
group_by(id) %>%
mutate(years = min(years, 5)) %>%
ungroup()
pbc2 <- pbc2 %>%
filter(year >= -5)
## Visualize the data
### Bar charts for categorical variables
factors <- c("sex", "ascites", "hepatomegaly", "spiders", "edema", "status")
bar_charts <- lapply(factors, make_bar_plot, data = pbc2)
do.call(grid.arrange, bar_charts)
### Histograms for continuous variables
markers <- c("serbilir", "serchol", "albumin", "alkaline", "sgot", "platelets", "prothrombin")
histograms <- lapply(markers, make_histogram, data = pbc2)
do.call(grid.arrange, histograms)
### Spaghetti plots for longitudinal variables
plots <- lapply(markers, make_spaghetti_plots, data = pbc2, time = "year", id = "id", by = "status2")
pdf(file = "plots/spaghetti-plots.pdf")
plots
dev.off()
# 2: Transform ---------------------------------------------------------------
pbc2 <- pbc2 %>%
mutate(serbilir = scale(log(serbilir+1)),
serchol = scale(log(serchol+1)),
albumin = scale(albumin),
alkaline = scale(log(alkaline+1)),
sgot = scale(log(sgot+1)),
platelets = scale(platelets),
prothrombin = scale(log(prothrombin+1)),
age = scale(age)
)
plots <- lapply(markers, make_spaghetti_plots, data = pbc2, time = "year", id = "id", by = "status2")
pdf(file = "plots/spaghetti-plots-transformed.pdf")
plots
dev.off()
## binomial model accepts only c(0,1) level outcomes
pbc2 %>% dplyr::select(hepatomegaly, spiders, ascites, edema) %>% summary()
pbc2 <- pbc2 %>%
mutate(hepatomegaly = as.integer(hepatomegaly)-1,
spiders = as.integer(spiders)-1,
ascites = as.integer(ascites)-1,
edema = ifelse(as.integer(edema) >= 2, 1 , 0))
pbc2 %>% dplyr::select(hepatomegaly, spiders, ascites, edema) %>% summary()
## covariates can only be numeric
pbc2 <- pbc2 %>% mutate(sex = as.numeric(sex)-1,
drug = as.numeric(drug)-1)
pbc2 <- pbc2[complete.cases(pbc2),]
pbc2 <- pbc2 %>% mutate(id = as.character(factor(id)))
plots <- lapply(factors[c(-1,-6)], make_spaghetti_plots, data = pbc2, time = "year", id = "id", by = "status2")
pdf(file = "plots/spaghetti-plots-factors.pdf")
plots
dev.off()
# 3: Load -------------------------------------------------------------
outcomes <- c("spiders", "edema", "serbilir", "sgot")
pairs <- make_pairs(outcomes)
model_info <- test_input_datatypes(data = pbc2, pairs = pairs)
## split the data by outcome
pbc2_fail <- pbc2 %>% filter(status2 == 1)
summary(pbc2_fail)
lapply(c("spiders", "edema"), make_bar_plot, data = pbc2_fail)
lapply(outcomes, make_spaghetti_plots, data = pbc2_fail, time = "year", id = "id")
pbc2_nofail <- pbc2 %>% filter(status2 == 0)
summary(pbc2_nofail)
lapply(c("spiders", "edema"), make_bar_plot, data = pbc2_nofail)
lapply(outcomes, make_spaghetti_plots, data = pbc2_nofail, time = "year", id = "id")
# Stack data
stacked_fail <- stack_data(pairs = pairs, id = "id", data = pbc2_fail,
covars = c("year", "years", "age", "sex", "drug"))
stacked_nofail <- stack_data(pairs = pairs, id = "id", data = pbc2_nofail,
covars = c("year", "age", "sex", "drug"))
## Tests to verify data stacking, these should all return TRUE
test_compare_stacked_to_original_data(data = pbc2_fail, stacked_data = stacked_fail, pairs = pairs)
test_compare_stacked_to_original_data(data = pbc2_nofail, stacked_data = stacked_nofail, pairs = pairs)
# 4: Explore univariate mixed models ----
#
# m_spiders_fail <- mixed_model(fixed = spiders ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_fail,
# family = binomial())
# m_spiders_fail_slopes <- update(m_spiders_fail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_spiders_fail)
# anova(m_spiders_fail, m_spiders_fail_slopes)
# # Use random intercept
# m_edema_fail <- mixed_model(fixed = edema ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_fail,
# family = binomial())
# m_edema_fail_slopes <- update(m_edema_fail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_edema_fail)
# anova(m_edema_fail, m_edema_fail_slopes)
# # Use random intercepts only
# m_serbilir_fail <- mixed_model(fixed = serbilir ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_fail,
# family = normal(),
# n_phis = 1)
# m_serbilir_fail_slopes <- update(m_serbilir_fail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_serbilir_fail)
# anova(m_serbilir_fail, m_serbilir_fail_slopes)
# # Use random intercepts only
# m_albumin_fail <- mixed_model(fixed = albumin ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_fail,
# family = normal(),
# n_phis = 1)
# m_albumin_fail_slopes <- update(m_albumin_fail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_albumin_fail_slopes)
# anova(m_albumin_fail, m_albumin_fail_slopes)
# # Random slopes as well...
#
# m_spiders_nofail <- mixed_model(fixed = spiders ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_nofail,
# family = binomial())
# m_spiders_nofail_slopes <- update(m_spiders_nofail, random = ~ 1 + year | id,
# iter_EM = 300, iter_qN_outer = 100)
# summary(m_spiders_nofail_slopes)
# anova(m_spiders_nofail, m_spiders_nofail_slopes)
# # Hessian not positive definite
# # Use drop the random intercepts only.
# m_edema_nofail <- mixed_model(fixed = edema ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_nofail,
# family = binomial())
# m_edema_nofail_slopes <- update(m_edema_nofail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_edema_nofail_slopes)
# anova(m_edema_nofail, m_edema_nofail_slopes)
# # Use random slopes
# m_serbilir_nofail <- mixed_model(fixed = serbilir ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_nofail,
# family = normal(),
# n_phis = 1)
# m_serbilir_nofail_slopes <- update(m_serbilir_nofail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_serbilir_nofail_slopes)
# anova(m_serbilir_fail, m_serbilir_fail_slopes)
# # Use random intercepts only
# m_albumin_nofail <- mixed_model(fixed = albumin ~ 1 + year + age + sex + drug,
# random = ~ 1 | id,
# data = pbc2_nofail,
# family = normal(),
# n_phis = 1)
# m_albumin_nofail_slopes <- update(m_albumin_nofail, random = ~ 1 + year | id,
# iter_EM = 100, iter_qN_outer = 30)
# summary(m_albumin_nofail_slopes)
# anova(m_albumin_nofail, m_albumin_nofail_slopes)
# Use random slopes
# FIX: how to deal with non-linear evolution over time in fixed effects?
# 5: Fit the multivariate mixed models ----------------------------------------------
# FIX: Fit some models with, and some without random slopes
fixed_nofail <- "Y ~ -1 +
intercept_Y1 + intercept_Y1:X + year_Y1 + drug_Y1 + age_Y1 +
intercept_Y2 + intercept_Y2:X + year_Y2 + drug_Y2 + age_Y2"
random_nofail <- "~ -1 + intercept_Y1 + intercept_Y2 | id"
fixed_fail <- "Y ~ -1 +
intercept_Y1 + intercept_Y1:X + year_Y1 + years_Y1 + drug_Y1 + age_Y1 +
intercept_Y2 + intercept_Y2:X + year_Y2 + years_Y2 + drug_Y2 + age_Y2"
random_fail <- "~ -1 + intercept_Y1 + intercept_Y2 | id"
# covars <- list("model1" = c("year", "sex", "age", "drug"),
# "model2" = c("year", "sex", "age", "drug"),
# "model3" = c("year", "sex", "age", "drug"),
# "model4" = c("year", "sex", "age", "drug"),
# "model5" = c("year", "sex", "age", "drug"),
# "model6" = c("year", "sex", "age", "drug"))
# fixed_formulas_nofail <- lapply(covars, make_fixed_formula)
#
# covars <- list("model1" = NULL,
# "model2" = NULL,
# "model3" = "year",
# "model4" = NULL,
# "model5" = "year",
# "model6" = "year")
# random_formulas_nofail <- lapply(covars, make_random_formula)
#
# model_info$formula <- list("fixed" = fixed_formulas_nofail,
# "random" = random_formulas_nofail)
# The next step is optional, if skipped starting values are set to 0 by default in mixed_model.
# However it helps search for optimal starting values.
# FIX: search for n_fixed and n_random from model formulas.
model_families <- get_random_start(model_info, n_fixed = 10, n_random = 2)
model_nofail <- mmm_model(fixed = fixed_nofail,
random = random_nofail,
id = "id",
data = pbc2_nofail,
stacked_data = stacked_nofail,
pairs = pairs,
model_families = model_families,
iter_EM = 0,
iter_qN_outer = 100,
nAGQ = 7,
parallel_plan = "multisession")
eigen(model_nofail$vcov)
# all eigen values should be >0 otherwise the Hessian is not a positive definite.
# This means that either the model is overly complex (i.e. random effects are not different from 0), or
# the model converged to a local optimum (saddle point).
# 1: scale the covariates (best practice anyway). v
# 2: reconsider specification of the random effects (usually solves the issue). v
# 3: increase the iterations (worth a try, although it cannot save from converging to a local optimum). v
# 4: consider other start values (e.g. a grid search). v
# 5: sometimes the model just does not fit the data. <-
# FIX: select parameter names to build model matrices X and Z
# FIX: allow for different specifications of the univariate mixed models.
# Pass these in the model_info object (e.g. fixed == model_info$fixed[i])
model_fail <- mmm_model(fixed = fixed_fail,
random = random_fail,
id = "id",
data = pbc2_fail,
stacked_data = stacked_fail,
pairs = pairs,
model_families = model_families,
iter_EM = 300,
iter_qN_outer = 100,
nAGQ = 7,
parallel_plan = "multisession",
verbose = TRUE)
eigen(model_fail$vcov)
# 5: Get predictions from the MMM -----------------------------------------
# FIX: show warning if vcov is not positive definite + suggestions to resolve,
mu <- setNames(rep(0, length(outcomes)), outcomes)
re_samples_nofail <- mvrnorm(1e3,mu = mu, Sigma = model_nofail$vcov)
re_samples_fail <- mvrnorm(1e3,mu = mu, Sigma = model_fail$vcov)
prior <- get_priors(data = pbc2,
time_failure = "years",
failure = "status2",
horizon = 5,
interval = 1/12,
dist = "weibull")
outcomes <- get_outcome_type(data = pbc2,
outcomes = outcomes[1:3])
df_pred <- pbc2 %>% filter(id == 3)
plan(sequential)
predictions <- mmm_predictions(data = df_pred,
outcomes = outcomes,
fixed_formula_nofail = fixed_nofail,
random_formula_nofail = random,
random_effects_nofail = re_samples_nofail,
parameters_nofail = model_nofail$estimates,
fixed_formula_fail = fixed_fail,
random_formula_fail = random,
random_effects_fail = re_samples_fail,
parameters_fail = model_fail$estimates,
time = "year",
failure = "status2",
failure_time = "years",
prior = "prior",
id = "id",
horizon = 5,
interval = 1/12)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.