R/custom_2part_model.R

# Doc header --------------------------------------------------------------

# Jan van den Brand
# jan.vandenbrand@kuleuven.be
# janvandenbrand@gmail.com


# Create custom model family object

# Aim is to develop a two part model that can accomodate continuous and binary outcomes
# See https://drizopoulos.github.io/GLMMadaptive/articles/ZeroInflated_and_TwoPart_Models.html 
# Use the 'zi' formulation 

# Preliminaries -----------------------------------------------------------

setwd("C:/Users/jajgv/Documents/repos/highdimjm")
pkg <- c("ggplot2","gridExtra", "dplyr", "optimParallel", "GLMMadaptive", "Matrix", "MASS")
lapply(pkg, library, character.only = TRUE)
cluster <- makeCluster(detectCores()); setDefaultCluster(cl = cluster)

# Simulate some data --------------------------------------------------
# From https://drizopoulos.github.io/GLMMadaptive/articles/ZeroInflated_and_TwoPart_Models.html#two-part-mixed-effects-model-for-semi-continuous-data-1

set.seed(1234)
n <- 10 # number of subjects
K <- 8 # number of measurements per subject
t_max <- 5 # maximum follow-up time

# we constuct a data frame with the design: 
# everyone has a baseline measurement, and then measurements at random follow-up times
df <- data.frame(id = rep(seq_len(n), each = K),
                 time = c(replicate(n, c(0, sort(runif(K - 1, 0, t_max))))),
                 sex = rep(gl(2, n/2, labels = c("male", "female")), each = K))

# design matrices for the fixed and random effects non-zero part
X <- model.matrix(~ sex * time, data = df)
Z <- model.matrix(~ time, data = df)
# design matrices for the fixed and random effects zero part (i.e. the zero inflated part)
X_zi <- model.matrix(~ sex, data = df)
Z_zi <- model.matrix(~ 1, data = df)

betas <- c(-2.13, -0.25, 0.24, -0.05) # fixed effects coefficients non-zero part
sigma <- 0.5 # standard deviation error terms non-zero part
gammas <- c(-1.5, 0.5) # fixed effects coefficients zero part
D11 <- 0.5 # variance of random intercepts non-zero part
D22 <- 0.1 # variance of random slopes non-zero part
D33 <- 0.4 # variance of random intercepts zero part

# we simulate random effects
b <- cbind(rnorm(n, sd = sqrt(D11)), rnorm(n, sd = sqrt(D22)), rnorm(n, sd = sqrt(D33)))
# linear predictor continuous part
eta_y <- as.vector(X %*% betas + rowSums(Z * b[df$id, 1:2, drop = FALSE])) 
# linear predictor binomial part
eta_zi <- as.vector(X_zi %*% gammas + rowSums(Z_zi * b[df$id, 3, drop = FALSE]))
# we simulate normal longitudinal data
df$y <- rnorm(n * K, mean = eta_y, sd = sigma)
# we simulate the binomial data
df$y2 <- as.logical(rbinom(n * K, size = 1, prob = plogis(eta_zi)))


# Determine the log density ---------------------------------------------------

y <- c(df$y, df$y2)
ind <- rep(0:1, each = length(y)/2)
(y[ind == 0])
(y[ind == 1])
# Covariate data just gets copied 
eta <- as.matrix(eta_y)
eta_zi <- as.matrix(eta_zi)
out <- rbind(eta, eta_zi)
(out[ind == 1, ])
(out[ind == 0, ])
# linear mixed model (continuous part)
out[ind==0,] <- dnorm(y[ind == 0], eta, sigma, log = TRUE)
# logistic mixed model (binary part)
probs <- exp(eta_zi) / (1 + exp(eta_zi))
out[ind==1,] <- dbinom(y[ind == 1], 1, probs, log = TRUE)
out


# Cast into a function ----------------------------------------------------

binary.normal <- function () {
  stats <- make.link("identity")
  log_dens <- function (y, eta, mu_fun, phis, eta_zi) {
    ### Determine the probability density for a combined binary and normal outcome
    
    # Uses finite difference approximation for optimization of the loss function
    
    # Args:
    # y: the outcome vector holding the binary and normal outcome
    # (optional) eta: the linear predictor
    # (optional) mu_fun: optional inverse of the linear predictor used during optimization
    # phis: the number of phi parameters, default = 1 
    # (optional) eta_zi Zero Inflated model part
    ###
    
    sigma <- exp(phis)
    ind <- rep(0:1, times = length(y)/2)   # FIX let user assign id's
    eta <- as.matrix(eta)
    eta_zi <- as.matrix(eta_zi)
    out <- rbind(eta, eta_zi)
    # linear mixed model (continuous part)
    out[ind==0,] <- dnorm(y[ind == 0], eta, sigma, log = TRUE)
    # logistic mixed model (binary part)
    probs <- exp(eta_zi) / (1 + exp(eta_zi))
    out[ind==1,] <- dbinom(y[ind == 1], 1, probs, log = TRUE)
    out
  }
  structure(list(family = "user binary normal", 
                 link = stats$name, 
                 linkfun = stats$linkfun,
                 linkinv = stats$linkinv, 
                 log_dens = log_dens),
            class = "family")
}


# Stack the data ----------------------------------------------------------

d <- df
# Append the data
pairs <- c("y", "y2")
# Note that id was stored as a factor in the pbc data. This may cause errors.
ids <- rep(d[["id"]], each = 2)
Y <- d %>% dplyr::select(y1 = paste(pairs[1]), y2 = paste(pairs[2]))
Y <- t(as.matrix(Y))
Y <- as.vector(Y)
Y <- data.frame(id = as.character(ids), 
                Y = Y)
# 2) create the design matrix
X <- d %>% dplyr::select(x1 = paste(pairs[2]), x2 = paste(pairs[1]))
X <- t(as.matrix(X))
X <- as.vector(X)
X <- data.frame(id = as.character(ids), 
                X = X)
# 3) add covariates
Z <- d %>% mutate(sex = as.integer(sex)) %>% dplyr::select(time, sex) # sex: 1 = male, 2 = female
Z <- as.matrix(Z)
# 4) Add dummies for X
dummy <- kronecker(rep(1, times = nrow(d)), diag(1, nrow = 2))
Zk <- kronecker(Z, diag(1, nrow = 2))
Z <- cbind(dummy, Zk)
# 5 Join the Y and design matrices
data <- cbind(Y, X[-1], Z)
names(data) <- c("id", "Y", "X", "intercept.Y1", "intercept.Y2","time.Y1", "time.Y2", "sex.Y1", "sex.Y2")

# test data consistency
checkY <-  data %>% dplyr::filter(intercept.Y1 == 1) %>% dplyr::select(Y)
all(d["y"] == checkY)
checkX <- data %>% dplyr::filter(intercept.Y1 == 1) %>% dplyr::select(X)
all(d["y2"] == checkX)

# assert that y1 is NOT dichotomous
all(sort(unique(d$y)) != c(0,1))
# assert that y2 is dichotomous
all(sort(unique(d$y2)) == c(0,1))
# assert that y is numeric
class(d$y) == "numeric"

rm(list = setdiff(ls(), "data",  "binary.normal"))

# Test the model -----------------------------------------------------------

data$id <- as.numeric(data$id)
model <- mixed_model(fixed = Y ~ -1 + intercept.Y1 + intercept.Y1:X + time.Y1,
                     random = ~ -1 + intercept.Y1 | id,
                     zi_fixed = ~ -1 + intercept.Y2 + intercept.Y2:X + time.Y2,
                     zi_random = ~ -1 + intercept.Y2 | id,
                     data = data, 
                     family = binary.normal(), 
                     n_phis = 1,
                     control = list(optimizer = "optim",
                                    optim_method = "L-BFGS-B",
                                    iter_EM = 1e3,
                                    verbose = TRUE))
JanvandenBrand/highdimjm documentation built on Dec. 18, 2021, 12:32 a.m.