highdimjm: High Dimensional Joint Models

# Doc header --------------------------------------------------------------

# Pairwise binomial mixed models 
# Jan van den Brand, PhD
# jan.vandenbrand@kuleuven.be
# janvandenbrand@gmail.com
# Project: DKF19OK003

# The work here is based on The work is based on Fieuws S., Verbeke, G., Boen, F. and Delecluse, C. (2006) 
# "High dimensional multivariate mixed models for binary questionnaire data", Applied Statistics, 55 (4), 449-460.
# For the orginal SAS macro, refer to https://ibiostat.be/online-resources/uploads/pairwiseMGLMM.zip

# Combine custom model with the code for pairwise model fitting
# Compare results to work by Fieuws.

# 0: Preliminaries  ------------------------------------------------------
setwd("C:/Users/jajgv/Documents/repos/highdimjm")
source("functions/mmm_functions.R")
load("output/pbc2_analysis.RData")

# 1: Extract  -----------------------------------------

con <- url("https://raw.github.com/drizopoulos/Repeated_Measurements/master/Data.RData")
load(con)
close(con)
rm(list = c("aids", "aids.id", "glaucoma", "pbc2.id", "prothro", "con"))

names(pbc2) <- tolower(names(pbc2))
pbc2 <- pbc2 %>% 
  mutate(year = year - years,
         status2 = ifelse(years > 5, 0, 1)) %>% 
  group_by(id) %>%
  mutate(years = min(years, 5)) %>%
  ungroup()
pbc2 <- pbc2 %>% 
  filter(year >= -5)

## Visualize the data
### Bar charts for categorical variables
factors <- c("sex", "ascites", "hepatomegaly", "spiders", "edema", "status")
bar_charts <- lapply(factors, make_bar_plot, data = pbc2)
do.call(grid.arrange, bar_charts)

### Histograms for continuous variables
markers <- c("serbilir", "serchol", "albumin", "alkaline", "sgot", "platelets", "prothrombin") 
histograms <- lapply(markers, make_histogram, data = pbc2)
do.call(grid.arrange, histograms)

### Spaghetti plots for longitudinal variables
plots <- lapply(markers, make_spaghetti_plots, data = pbc2, time = "year", id = "id", by = "status2")
pdf(file = "plots/spaghetti-plots.pdf")
plots
dev.off()

# 2: Transform ---------------------------------------------------------------

pbc2 <- pbc2 %>% 
  mutate(serbilir = scale(log(serbilir+1)),
         serchol = scale(log(serchol+1)),
         albumin = scale(albumin),
         alkaline = scale(log(alkaline+1)),
         sgot = scale(log(sgot+1)),
         platelets = scale(platelets),
         prothrombin = scale(log(prothrombin+1)),
         age = scale(age)
  )
plots <- lapply(markers, make_spaghetti_plots, data = pbc2, time = "year", id = "id", by = "status2")
pdf(file = "plots/spaghetti-plots-transformed.pdf")
plots
dev.off()

## binomial model accepts only c(0,1) level outcomes
pbc2 %>% dplyr::select(hepatomegaly, spiders, ascites, edema) %>% summary()
pbc2 <- pbc2 %>% 
  mutate(hepatomegaly = as.integer(hepatomegaly)-1,
         spiders = as.integer(spiders)-1,
         ascites = as.integer(ascites)-1,
         edema = ifelse(as.integer(edema) >= 2, 1 , 0))
pbc2 %>% dplyr::select(hepatomegaly, spiders, ascites, edema) %>% summary()
## covariates can only be numeric
pbc2 <- pbc2 %>% mutate(sex = as.numeric(sex)-1,
                drug = as.numeric(drug)-1)
pbc2 <- pbc2[complete.cases(pbc2),]
pbc2 <- pbc2 %>% mutate(id = as.character(factor(id)))
plots <- lapply(factors[c(-1,-6)], make_spaghetti_plots, data = pbc2, time = "year", id = "id", by = "status2")
pdf(file = "plots/spaghetti-plots-factors.pdf")
plots
dev.off()

# 3: Load -------------------------------------------------------------

outcomes <- c("spiders", "edema", "serbilir", "sgot")
pairs <- make_pairs(outcomes)
model_info <- test_input_datatypes(data = pbc2, pairs = pairs)
## split the data by outcome
pbc2_fail <- pbc2 %>% filter(status2 == 1)
summary(pbc2_fail)
lapply(c("spiders", "edema"), make_bar_plot, data = pbc2_fail)
lapply(outcomes, make_spaghetti_plots, data = pbc2_fail, time = "year", id = "id")

pbc2_nofail <- pbc2 %>% filter(status2 == 0)
summary(pbc2_nofail)
lapply(c("spiders", "edema"), make_bar_plot, data = pbc2_nofail)
lapply(outcomes, make_spaghetti_plots, data = pbc2_nofail, time = "year", id = "id")
# Stack data
stacked_fail <- stack_data(pairs = pairs, id = "id", data = pbc2_fail,  
                           covars = c("year", "years", "age", "sex", "drug"))
stacked_nofail <- stack_data(pairs = pairs, id = "id", data = pbc2_nofail, 
                             covars = c("year", "age", "sex", "drug"))
## Tests to verify data stacking, these should all return TRUE
test_compare_stacked_to_original_data(data = pbc2_fail, stacked_data = stacked_fail, pairs = pairs)
test_compare_stacked_to_original_data(data = pbc2_nofail, stacked_data = stacked_nofail, pairs = pairs)

# 4: Explore univariate mixed models ----
# 
# m_spiders_fail <- mixed_model(fixed = spiders ~ 1 + year + age + sex + drug,
#                          random = ~ 1 | id,
#                          data = pbc2_fail,
#                          family = binomial())
# m_spiders_fail_slopes <- update(m_spiders_fail, random = ~ 1 + year | id,
#                                 iter_EM = 100, iter_qN_outer = 30)
# summary(m_spiders_fail)
# anova(m_spiders_fail, m_spiders_fail_slopes)
# # Use random intercept
# m_edema_fail <- mixed_model(fixed = edema ~ 1 + year + age + sex + drug,
#                               random = ~ 1 | id,
#                               data = pbc2_fail,
#                               family = binomial())
# m_edema_fail_slopes <- update(m_edema_fail, random = ~ 1 + year | id,
#                                      iter_EM = 100, iter_qN_outer = 30)
# summary(m_edema_fail)
# anova(m_edema_fail, m_edema_fail_slopes)
# # Use random intercepts only
# m_serbilir_fail <- mixed_model(fixed = serbilir ~ 1 + year + age + sex + drug,
#                                    random = ~ 1 | id,
#                                    data = pbc2_fail,
#                                    family = normal(),
#                                n_phis = 1)
# m_serbilir_fail_slopes <- update(m_serbilir_fail, random = ~ 1 + year | id,
#                                      iter_EM = 100, iter_qN_outer = 30)
# summary(m_serbilir_fail)
# anova(m_serbilir_fail, m_serbilir_fail_slopes)
# # Use random intercepts only
# m_albumin_fail <- mixed_model(fixed = albumin ~ 1 + year + age + sex + drug,
#                                random = ~ 1 | id,
#                                data = pbc2_fail,
#                                family = normal(),
#                                n_phis = 1)
# m_albumin_fail_slopes <- update(m_albumin_fail, random = ~ 1 + year | id,
#                                  iter_EM = 100, iter_qN_outer = 30)
# summary(m_albumin_fail_slopes)
# anova(m_albumin_fail, m_albumin_fail_slopes)
# # Random slopes as well...
# 
# m_spiders_nofail <- mixed_model(fixed = spiders ~ 1 + year + age + sex + drug,
#                               random = ~ 1 | id,
#                               data = pbc2_nofail,
#                               family = binomial())
# m_spiders_nofail_slopes <- update(m_spiders_nofail, random = ~ 1 + year | id,
#                                 iter_EM = 300, iter_qN_outer = 100)
# summary(m_spiders_nofail_slopes)
# anova(m_spiders_nofail, m_spiders_nofail_slopes)
# # Hessian not positive definite
# # Use drop the random intercepts only.
# m_edema_nofail <- mixed_model(fixed = edema ~ 1 + year + age + sex + drug,
#                                    random = ~ 1 | id,
#                                    data = pbc2_nofail,
#                                    family = binomial())
# m_edema_nofail_slopes <- update(m_edema_nofail, random = ~ 1 + year | id,
#                                      iter_EM = 100, iter_qN_outer = 30)
# summary(m_edema_nofail_slopes)
# anova(m_edema_nofail, m_edema_nofail_slopes)
# # Use random slopes
# m_serbilir_nofail <- mixed_model(fixed = serbilir ~ 1 + year + age + sex + drug,
#                                random = ~ 1 | id,
#                                data = pbc2_nofail,
#                                family = normal(),
#                                n_phis = 1)
# m_serbilir_nofail_slopes <- update(m_serbilir_nofail, random = ~ 1 + year | id,
#                                  iter_EM = 100, iter_qN_outer = 30)
# summary(m_serbilir_nofail_slopes)
# anova(m_serbilir_fail, m_serbilir_fail_slopes)
# # Use random intercepts only
# m_albumin_nofail <- mixed_model(fixed = albumin ~ 1 + year + age + sex + drug,
#                               random = ~ 1 | id,
#                               data = pbc2_nofail,
#                               family = normal(),
#                               n_phis = 1)
# m_albumin_nofail_slopes <- update(m_albumin_nofail, random = ~ 1 + year | id,
#                                 iter_EM = 100, iter_qN_outer = 30)
# summary(m_albumin_nofail_slopes)
# anova(m_albumin_nofail, m_albumin_nofail_slopes)
# Use random slopes
# FIX: how to deal with non-linear evolution over time in fixed effects?

# 5: Fit the multivariate mixed models  ----------------------------------------------

# FIX: Fit some models with, and some without random slopes 
fixed_nofail <- "Y ~ -1 +
intercept_Y1 + intercept_Y1:X + year_Y1 + drug_Y1 + age_Y1 +
intercept_Y2 + intercept_Y2:X + year_Y2 + drug_Y2 + age_Y2" 
random_nofail <- "~ -1 + intercept_Y1 + intercept_Y2 | id"
fixed_fail <- "Y ~ -1 +
intercept_Y1 + intercept_Y1:X + year_Y1 + years_Y1 + drug_Y1 + age_Y1 +
intercept_Y2 + intercept_Y2:X + year_Y2 + years_Y2 + drug_Y2 + age_Y2"
random_fail <- "~ -1 + intercept_Y1 + intercept_Y2 | id"

# covars <- list("model1" = c("year", "sex", "age", "drug"),
#                "model2" = c("year", "sex", "age", "drug"),
#                "model3" = c("year", "sex", "age", "drug"),
#                "model4" = c("year", "sex", "age", "drug"),
#                "model5" = c("year", "sex", "age", "drug"),
#                "model6" = c("year", "sex", "age", "drug"))
# fixed_formulas_nofail <- lapply(covars, make_fixed_formula)
# 
# covars <- list("model1" = NULL,
#                "model2" = NULL,
#                "model3" = "year",
#                "model4" = NULL,
#                "model5" = "year",
#                "model6" = "year")
# random_formulas_nofail <- lapply(covars, make_random_formula)
# 
# model_info$formula <- list("fixed" = fixed_formulas_nofail,
#                            "random" = random_formulas_nofail)

# The next step is optional, if skipped starting values are set to 0 by default in mixed_model.
# However it helps search for optimal starting values. 
# FIX: search for n_fixed and n_random from model formulas. 
model_families <- get_random_start(model_info, n_fixed = 10, n_random = 2)

model_nofail <- mmm_model(fixed = fixed_nofail,
                        random  = random_nofail,
                        id = "id",
                        data = pbc2_nofail,
                        stacked_data = stacked_nofail,
                        pairs = pairs,
                        model_families = model_families,
                        iter_EM = 0,
                        iter_qN_outer = 100,
                        nAGQ = 7,
                        parallel_plan = "multisession")
eigen(model_nofail$vcov) 
 # all eigen values should be >0 otherwise the Hessian is not a positive definite.
# This means that either the model is overly complex (i.e. random effects are not different from 0), or
# the model converged to a local optimum (saddle point).
# 1: scale the covariates (best practice anyway). v
# 2: reconsider specification of the random effects (usually solves the issue). v
# 3: increase the iterations (worth a try, although it cannot save from converging to a local optimum). v
# 4: consider other start values (e.g. a grid search). v
# 5: sometimes the model just does not fit the data. <-

# FIX: select parameter names to build model matrices X and Z 
# FIX: allow for different specifications of the univariate mixed models.
# Pass these in the model_info object (e.g. fixed == model_info$fixed[i])

model_fail <- mmm_model(fixed = fixed_fail,
                        random = random_fail,
                        id = "id",
                        data = pbc2_fail,
                        stacked_data = stacked_fail,
                        pairs = pairs,
                        model_families = model_families,
                        iter_EM = 300,
                        iter_qN_outer = 100,
                        nAGQ = 7,
                        parallel_plan = "multisession",
                        verbose = TRUE)
eigen(model_fail$vcov) 

# 5: Get predictions from the MMM -----------------------------------------

# FIX: show warning if vcov is not positive definite + suggestions to resolve,

mu <- setNames(rep(0, length(outcomes)), outcomes)
re_samples_nofail <- mvrnorm(1e3,mu = mu, Sigma = model_nofail$vcov)
re_samples_fail <- mvrnorm(1e3,mu = mu, Sigma = model_fail$vcov)

prior <- get_priors(data = pbc2,
                    time_failure = "years", 
                    failure = "status2",
                    horizon = 5,
                    interval = 1/12, 
                    dist = "weibull")

outcomes <- get_outcome_type(data = pbc2,
                             outcomes = outcomes[1:3])

df_pred <- pbc2 %>% filter(id == 3)
plan(sequential)
predictions <- mmm_predictions(data = df_pred,
                               outcomes = outcomes,
                               fixed_formula_nofail = fixed_nofail,
                               random_formula_nofail = random,
                               random_effects_nofail = re_samples_nofail,
                               parameters_nofail = model_nofail$estimates,
                               fixed_formula_fail = fixed_fail,
                               random_formula_fail = random,
                               random_effects_fail = re_samples_fail,
                               parameters_fail = model_fail$estimates,
                               time = "year",
                               failure = "status2",
                               failure_time = "years",
                               prior = "prior",
                               id = "id",
                               horizon = 5, 
                               interval = 1/12)