highdimjm: High Dimensional Joint Models

# Doc header ----------------------------------------------------------------------------------

# Create simulated data for multivariate pairwise MM



# Initialize ----------------------------------------------------------------------------------

setwd("C:/Users/jajgv/Documents/repos/highdimjm")
pkg <- c("optimx", "ggplot2","lattice","MASS", "Matrix", "gridExtra", "tidyverse",
        "GLMMadaptive", "stringr", "flexsurv", "future.apply", "scales", "microbenchmark")
lapply(pkg, library, character.only = TRUE)
source("R/mmm_functions.R")

# Data --------------------------------------------------

set.seed(1234)
n <- 100 # number of subjects
K <- 8 # number of measurements per subject
t_max <- 5 # maximum follow-up time

# we constuct a data frame with the design:
# everyone has a baseline measurment, and then measurements at random follow-up times
df <- data.frame(id = as.character(rep(seq_len(n), each = K)),
                 time = c(replicate(n, c(0, sort(runif(K - 1, 0, t_max))))),
                 sex = rep(gl(2, n/2, labels = c("male", "female")), each = K))

# design matrices for the fixed effects
X1 <- model.matrix(~ sex + time, data = df)
X2 <- model.matrix(~ time, data = df)
X3 <- model.matrix(~ sex + time, data = df)
X4 <- model.matrix(~ time, data = df)
# Random intercepts model
Z1 <-
  Z2 <-
  Z3 <-
  Z4 <- model.matrix(~ 1, data = df)
# fixed effects coefficients
betas1 <- c(-0.67, -0.25, 0.24)
betas2 <- c(-1.15, 0.3)
betas3 <- c(1.34, -0.13, -0.04)
betas4 <- c(0.25, -0.17)
# random effects matrix
D <- c(0.8, 0.7, 0.6, 0.5)
resid <- c(0.5, 0.87) # standard deviation error terms

# we simulate random effects
b <- cbind(rnorm(n, sd = sqrt(D[1])), rnorm(n, sd = sqrt(D[2])), rnorm(n, sd = sqrt(D[3])), rnorm(n, D[4]))

eta_y1 <- as.vector(X1 %*% betas1 + rowSums(Z1 * b[as.integer(df$id), 1, drop = FALSE]))
eta_y2 <- as.vector(X2 %*% betas2 + rowSums(Z2 * b[as.integer(df$id), 2, drop = FALSE]))
eta_y3 <- as.vector(X3 %*% betas3 + rowSums(Z3 * b[as.integer(df$id), 3, drop = FALSE]))
eta_y4 <- as.vector(X4 %*% betas4 + rowSums(Z4 * b[as.integer(df$id), 4, drop = FALSE]))

# simulate normal longitudinal data
df$y1 <- rnorm(n * K, mean = eta_y1, sd = resid[1])
df$y2 <- rnorm(n * K, mean = eta_y2, sd = resid[2])
# we set binary outcome from the logistic regression
df$y3 <- as.numeric(as.logical(rbinom(n * K, size = 1, prob = plogis(eta_y3))))
df$y4 <- as.numeric(as.logical(rbinom(n * K, size = 1, prob = plogis(eta_y4))))
# Set failures
df$failure <- rnorm(n * K, mean = -0.5*eta_y1+0.8*eta_y2-0.33*eta_y3+0.15*eta_y4, sd = 1.5*mean(resid))
df$failure <- exp(df$failure) / (1 + exp(df$failure))
df$failure <- ifelse(df$failure >= 0.6 & df$time >0, 1, 0)
# Set failure times
subjects <- split(df, df$id)
for (i in seq_along(subjects)) {
  if ( summarise(subjects[[i]], max(failure)) == 0 ) {
    subjects[[i]]$time_failure <- unlist(summarise(subjects[[i]], max(time)))["max(time)"]
  } else {
    subjects[[i]]$time_failure <- unlist(summarise(subjects[[i]] %>%
                                                     group_by(failure) %>%
                                                     filter(failure == 1), min(time)))["min(time)"]
  }
  subjects[[i]]$failure <- ifelse(max(subjects[[i]]$failure) == 1, 1, 0)
  subjects[[i]] <- subjects[[i]] %>% filter(time <= time_failure)
}
df <- do.call(rbind, subjects)
df <- df %>% mutate(sex = as.numeric(sex))
summary(df)

#scale the parameters
df <- df %>% mutate(y1 = scale(y1),
                    y2 = scale(y2))

# Fit the mmm ---------------------------------------------------------------------------------

pairs <- make_pairs(outcomes = c("y1", "y2", "y3", "y4"))

model_info <- test_input_datatypes(data = df, pairs = pairs)

df_fail <- df %>% filter(failure == 1)
df_fail_stacked <- stack_data(data = df_fail, id = "id", pairs = pairs, covars = c("time", "sex", "time_failure"))

df_nofail <- df %>% filter(failure == 0)
df_nofail_stacked <- stack_data(data = df_nofail, id = "id", pairs = pairs, covars = c("time", "sex"))

fixed_nofail <- make_fixed_formula(covars = c("time", "sex"))
fixed_fail <- make_fixed_formula(covars = c("time", "sex", "time_failure"))
random <- make_random_formula(id="id")

model_nofail <- mmm_model(fixed = fixed_nofail,
                          random = random,
                          id = "id",
                          data = df_nofail,
                          stacked_data = df_nofail_stacked,
                          pairs = pairs,
                          model_families = model_info,
                          iter_EM = 100,
                          iter_qN_outer = 30,
                          nAGQ = 11)

model_fail <- mmm_model(fixed = fixed_fail,
                        random = random,
                        id = "id",
                        data = df_fail,
                        stacked_data = df_fail_stacked,
                        pairs = pairs,
                        model_families = model_info,
                        iter_EM = 100,
                        iter_qN_outer = 30,
                        nAGQ = 11,
                        update_GH_every = 5,
                        parallel_plan = "multisession")

# Get the predictions -----------------------------------------------------
# Simulate random effects
outcomes <- paste0("y",1:4)
mu <- setNames(rep(0, length(outcomes)), outcomes)
re_samples_nofail <- mvrnorm(1e3,mu = mu, Sigma = model_nofail$vcov)
re_samples_fail <- mvrnorm(1e3,mu = mu, Sigma = model_fail$vcov)
prior <- get_priors(data = df,
                    time_failure = "time_failure",
                    failure = "failure",
                    horizon = 5,
                    interval = 1/12)
outcomes <- get_outcome_type(data = df,
                             outcomes)
df_pred <- df %>% filter(id == 10)
# FIX: check argument validity
predictions <- mmm_predictions(data = df_pred,
                               outcomes = outcomes,
                               fixed_formula_nofail = "~ time + sex + y1 + y2 + y3 + y4",
                               random_formula_nofail = "~ 1 | id",
                               random_effects_nofail = re_samples_nofail,
                               parameters_nofail = model_nofail$estimates,
                               fixed_formula_fail = "~ time + sex + time_failure + y1 + y2 + y3 + y4",
                               random_formula_fail = "~ 1 | id",
                               random_effects_fail = re_samples_fail,
                               parameters_fail = model_fail$estimates,
                               time = "time",
                               failure = "failure",
                               failure_time = "time_failure",
                               prior = prior,
                               id = "id",
                               horizon = 5,
                               interval = 1/12)
# Visualize the predictions
get_plots(newdata = df_pred,
          predictions = predictions,
          outcomes = outcomes,
          interval = 1/12)