# Doc header --------------------------------------------------------------
# Pairwise binomial mixed models
# Jan van den Brand, PhD
# jan.vandenbrand@kuleuven.be
# janvandenbrand@gmail.com
# The work here is based on The work is based on Fieuws, Verbeke, Maes and Vanrenterghem. Biostatistics 2008.
# https://doi.org/10.1093/biostatistics/kxm041
# 0: Preliminaries ------------------------------------------------------
setwd("C:/Users/jajgv/Documents/repos/highdimjm")
pkg <- c("ggplot2","gridExtra", "dplyr", "optimParallel", "GLMMadaptive", "Matrix", "MASS")
lapply(pkg, library, character.only = TRUE)
cluster <- makeCluster(detectCores()); setDefaultCluster(cl = cluster)
# Data ------------------------------------------------------------------
# For this example I will used the pbc (primary billiary cirrhosis) data from Rizopoulos Join Models for Longitudinal and Time-to-Event Data, CRC press, 2012.
ds <- url("https://raw.github.com/drizopoulos/Repeated_Measurements/master/Data.RData")
load(ds)
rm(list = c("aids", "aids.id", "glaucoma", "pbc2.id", "prothro", "ds"))
# Print overview of data
str(pbc2)
summary(pbc2)
head(pbc2)
# Pie/bar charts for categorical variables
factors <- c("sex", "ascites", "hepatomegaly", "spiders", "edema", "status")
make.pie <- function(v) {
pie_plot <- ggplot(data = pbc2, aes(x = factor(1), fill = pbc2[,v])) +
geom_bar(width = 1) +
coord_polar('y', start = 0) +
theme_minimal() +
labs(x = "", y = "") +
guides(fill = guide_legend(title = paste(v)))
}
pie_charts <- lapply(factors, make.pie)
do.call(grid.arrange, pie_charts)
# create histograms for continuous variables
markers <- c("serBilir", "serChol", "albumin", "alkaline", "SGOT", "platelets", "prothrombin")
make.hist <- function(v, data) {
plot <- ggplot() +
geom_histogram(data = data, aes(x = data[,v]), bins = 20) +
theme_classic() +
labs(x = paste("histogram of", v))
return(plot)
}
histograms <- lapply(markers, make.hist, data = pbc2)
do.call(grid.arrange, histograms)
# Spaghetti plots for longitudinal variables
make.plots <- function(v) {
plot <- ggplot() +
geom_line(data = pbc2[pbc2$status2 == 0,],
aes(x = year - years, y = pbc2[pbc2$status2 == 0, v], group = id),
size = 0.1, alpha = 0.2) +
geom_smooth(data = pbc2[pbc2$status2 == 0,],
aes(x = year - years, y = pbc2[pbc2$status2 == 0, v])) +
geom_line(data = pbc2[pbc2$status2 == 1,],
aes(x = year - years, y = pbc2[pbc2$status2 == 1, v], group = id),
color = "red", size = 0.1, alpha = 0.2) +
geom_smooth(data = pbc2[pbc2$status2 == 1,],
aes(x = year - years, y = pbc2[pbc2$status2 == 1, v]), color = "red") +
scale_x_continuous(breaks = seq(floor(min(-pbc2$years, na.rm = TRUE)),0,3)) +
theme_classic() +
labs(x = "Follow-up time in years", y = paste(v))
return(plot)
}
plots <- lapply(markers, make.plots)
do.call(grid.arrange, plots)
# Data preparation --------------------------------------------------------
# FEATURE: Functionize the data preparation
# TODO: arguments for times, status, id, factors, covariates,
# Select variables in the model
d <- pbc2 %>% dplyr::select(id, years, status2, drug, age, sex, year,
ascites, hepatomegaly, spiders, edema, serBilir, albumin, SGOT, prothrombin)
# set the id column
id <- c("id")
# set the follow-up times
event.time <- c("years")
times <- c("year")
d[,times] <- d[,times] - d[,event.time]
# rescale follow-up times
d[,times] <- d[,times]/5
status <- c("status2")
# turn categorical covariates into factor
factors <- c("drug", "sex")
d[,factors] <- sapply(d[,factors], as.factor)
# recode categorical outcomes into integers)
outcome_factors <- c("ascites", "hepatomegaly", "spiders", "edema")
d[,outcome_factors] <- sapply(d[,outcome_factors], function(x) {as.integer(x)-1})
# turn all covariates into numerical
covariates <- c("age", "serBilir", "albumin", "SGOT", "prothrombin")
d[,covariates] <- sapply(d[,covariates],as.numeric)
# WARNING: Y VARIABLES NEED TO BE CLOSE TO NORMAL IN ORDER FOR AVERAGING TO WORK
# BoxCox transform
boxcoxtransform <- function(v) {
y <- unlist(v)
bc <- boxcox(y ~ 1, lambda = seq(-5,5,0.5))
lambda <- bc$x[which.max(bc$y)]
y <- (y^lambda-1)/lambda
return(y)
}
d[,covariates] <- sapply(d[,covariates],boxcoxtransform)
summary(d[,covariates])
# Center the outcomes
d[,covariates] <- lapply(d[,covariates], scale)
histograms <- lapply(covariates, make.hist, data = d)
do.call(grid.arrange, histograms)
# Split the data set into data.frames with pairs of the longitudinal outcomes.
outcomes <- c("ascites", "hepatomegaly", "spiders", "edema", "serBilir", "albumin", "SGOT", "prothrombin")
N <- length(outcomes)
Npair <- N * (N-1)/2
pairs <- matrix(data = 0, nrow = Npair, ncol = 4)
h <- 1
for (i in 1:(N-1)) {
for (j in (i+1):N) {
pairs[h,1] <- outcomes[i]
pairs[h,2] <- outcomes[j]
pairs[h,3] <- i
pairs[h,4] <- j
h <- h + 1
}
}
# TEST: 3 continuous outcomes
pairs <- pairs[c(23,24,26),]
# Unroll the outcome matrix -----------------------------------------------
# FEATURE: functionize the data wrangling for better modularity and testing
npair <- nrow(pairs)
# 1) unroll the Y matrix
# Note that id was stored as a factor in the pbc data. This may cause errors.
ids <- rep(d[["id"]], each = 2)
Y <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
Y[[i]] <- d %>% dplyr::select(y1 = paste(pairs[i,1]), y2 = paste(pairs[i,2]))
Y[[i]] <- t(as.matrix(Y[[i]]))
Y[[i]] <- as.vector(Y[[i]])
Y[[i]] <- data.frame(id = as.character(ids),
Y = Y[[i]]
)
}
# 2) create the design matrix
X <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)){
X[[i]] <- d %>% dplyr::select(x1 = paste(pairs[i,2]), x2 = paste(pairs[i,1]))
X[[i]] <- t(as.matrix(X[[i]]))
X[[i]] <- as.vector(X[[i]])
X[[i]] <- data.frame(id = as.character(ids),
X = X[[i]]
)
}
# 3) add covariates
Z <- unlist(dplyr::select(d ,paste(times)))
Z <- as.matrix(Z)
# 4) Add dummies
dummy <- kronecker(rep(1, times = nrow(d)), diag(1, nrow = 2))
Zk <- kronecker(Z, diag(1, nrow = 2))
Z <- cbind(dummy, Zk)
# 5 Join the Y and design matrices
data.list <- vector("list", length=nrow(pairs))
for (i in 1:nrow(pairs)) {
data.list[[i]] <- cbind(Y[[i]],X[[i]][-1], Z)
names(data.list[[i]]) <- c("id", "Y", "X", "intercept.Y1", "intercept.Y2",
paste0(times,".Y1"), paste0(times,".Y2"))
}
# test data consistency
checkY <- data.list[[1]] %>% dplyr::filter(intercept.Y1 == 1) %>% dplyr::select(Y)
all(unlist(d[, "serBilir"]) == checkY)
checkX <- data.list[[1]] %>% dplyr::filter(intercept.Y1 == 1) %>% dplyr::select(X)
all(unlist(d[, "albumin"]) == checkX)
# Clean-up
rm(Y, X, Z, dummy, Zk, pbc2, histograms, pie_charts, plots)
# Fit the individual models for comparison --------------------------------
fit_serBilir_albumin <- mixed_model(fixed = serBilir ~ year + albumin + year:albumin,
random = ~ year | id,
data = d,
family = students.t(50),
n_phis = 1,
nAGQ = 7,
control = list(optimizer = "OptimParallel",
optim_method = "L-BFGS-B",
numeric_deriv = "fd",
iter_EM = 100)
)
fit_serBilir_albumin_check <- mixed_model(fixed = Y ~ -1 + intercept.Y1 + year.Y1 + X:intercept.Y1 + year.Y1:X,
random = ~ -1 + intercept.Y1 + year.Y1 | id,
data = dplyr::filter(data.list[[1]], intercept.Y1 == 1),
family = students.t(50),
n_phis = 1,
nAGQ = 7,
control = list(optimizer = "OptimParallel",
optim_method = "L-BFGS-B",
numeric_deriv = "fd",
iter_EM = 100)
)
all(fit_serBilir_albumin$coefficients == fit_serBilir_albumin_check$coefficients)
all(fit_serBilir_albumin$D == fit_serBilir_albumin_check$D)
# Get start values --------------------------------------------------------
# FEATURE: combine binomial and gaussian models.
# Option 1: ifelse for Y depending on Ylabel?
# Option 2: bblme --> features a general way to estimate the log likelihood but requires in depth specification
# FEATURE: Ordinal and multinomial logit models.
# FEATURE: function to parallelize
# arguments: fixed, random, data.list
# Fit the pairwise models
fits <- vector("list", length = nrow(pairs))
starts <- vector("list", length = nrow(pairs))
time.start <- proc.time()
for (i in 1:nrow(pairs)) {
print(paste("Running model", i, "out of", nrow(pairs)))
fixed <- paste("Y ~ -1 + intercept.Y1 + intercept.Y2 +", # intercepts
"year.Y1 + year.Y2 +", # effect of follow-up time on population average evolution of biomarkers
"X:intercept.Y1 + X:intercept.Y2 +", # effect of biomarker evolution on the longitudinal biomarker
"X:year.Y1 + X:year.Y2") # interaction between follow-up time and the
random <- paste("~ -1 + intercept.Y1 + intercept.Y2 +", # random intercepts
"year.Y1 + year.Y2 | id") # random slopes for Y over time.
fits[[i]] <- mixed_model(fixed = as.formula(fixed),
random = as.formula(random),
data = data.list[[i]],
family = students.t(50),
n_phis = 1,
nAGQ = 7,
control = list(optimizer = "optimParellel",
optim_method = "L-BFGS-B",
numeric_deriv = "fd",
iter_EM = 100,
verbose = TRUE)
)
starts[[i]] <- list(betas = fixef(fits[[i]]), phis = fits[[i]]$phis, D = fits[[i]]$D)
}
(proc.time() - time.start)/60
save.image(file = "pbc2_models.RData")
# load("pbc2_models.RData")
# Get derivatives ----------------------------------------------------------
# FEATURE: function to parallel
# Split data using https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/split
# Initialize the data.frames to store results
time.start <- proc.time()
options(warn = -1)
df.hessians <- vector("list", length = nrow(pairs))
df.gradients <- vector("list", length = nrow(pairs))
# Outer loop over the pairs
for (i in 1:nrow(pairs)) {
print(paste("Running model", i, "out of", nrow(pairs)))
# create lists to store subject level results
hessians <- vector("list", length = nrow(unique(d["id"])))
gradients <- vector("list", length = nrow(unique(d["id"])))
# construct the formulas from pairs
fixed <- paste("Y ~ -1 + intercept.Y1 + intercept.Y2 +
year.Y1 + year.Y2 +
X:intercept.Y1 + X:intercept.Y2 +
X:year.Y1 + X:year.Y2")
random <- paste("~ -1 + intercept.Y1 + intercept.Y2 + year.Y1 + year.Y2 | id")
# Inner loop over the subjects
for (j in 1:nrow(unique(data.list[[i]]["id"]))) {
# get subject level data within the pairs data
ds <- data.list[[i]] %>% filter(id == j)
# Copying the subject is needed to hack the optimization algorithm
ds[,"id"] <- paste0(j,"a")
ds <- rbind(data.list[[i]] %>% filter(id == j), ds)
# Fit the model for the subject
fit <- mixed_model(fixed = as.formula(fixed),
random = as.formula(random),
data = ds,
family = students.t(50),
n_phis = 1,
initial_values = starts[[i]], # input start values
nAGQ = 7,
control = list(optimizer = "optimParallel",
optim_method = "L-BFGS-B",
numeric_deriv = "fd",
iter_EM = 0,
iter_qN_outer = 0) # this avoids iterating from the start values
)
# Store the Hessian
h <- cbind("id" = j, fit$Hessian/2) # divide by 2 to undo the hack
# Store the subject level gradients
g <- c("id" = j,
colSums(fit$score_vect_contributions$score.betas/2), #divide by two to undo the hack
colSums(fit$score_vect_contributions$score.D/2))
# List the individual level results
hessians[[j]] <- h[1:nrow(h)-fit$call$n_phis, 1:ncol(h)-fit$call$n_phis]
gradients[[j]] <- g
}
# Append the lists with individual level results into data.frames
df.hessians[[i]] <- do.call(rbind, hessians)
df.gradients[[i]] <- do.call(rbind, gradients)
}
round((proc.time() - time.start)/60,1)
options(warn = 0)
# Stack output ------------------------------------------------------------
parameters <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
parameters[[i]] <- data.frame(
# build up labels:
# b_<predictor>_<outcome>
# var_<predictor>_<outcome>
# cov_<predictor>_<outcome (col)>_<predictor>_<outcome (row)>
"parameter" = c(paste0("int_", pairs[i,3]),
paste0("int_", pairs[i,4]),
paste0("b_year_", pairs[i,3]),
paste0("b_year_", pairs[i,4]),
paste0("b_", pairs[i,4], "_", pairs[i,3]),
paste0("b_", pairs[i,3], "_", pairs[i,4]),
paste0("b_", pairs[i,4], ":year_", pairs[i,3]),
paste0("b_", pairs[i,3], ":year_", pairs[i,4]),
paste0("var_int_",pairs[i,3]),
paste0("cov_int_",pairs[i,4], "_int_",pairs[i,3]),
paste0("cov_year:",pairs[i,3], "_int_",pairs[i,3]),
paste0("cov_year:",pairs[i,4], "_int_",pairs[i,3]),
paste0("var_int_",pairs[i,4]),
paste0("cov_year:",pairs[i,3], "_int_",pairs[i,4]),
paste0("cov_year:",pairs[i,4], "_int_",pairs[i,4]),
paste0("var_year_",pairs[i,3]),
paste0("cov_year:",pairs[i,4], "_year:",pairs[i,3]),
paste0("var_year_",pairs[i,4])
),
"estimates" = c(fixef(fits[[i]]),
c(fits[[i]]$D[lower.tri(fits[[i]]$D, diag = TRUE)]))
)
}
parameters <- do.call(rbind, parameters)
# Stack the gradients
gradients <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
g <- vector("list", length = nrow(unique(d["id"])))
for (j in 1:nrow(unique(d["id"]))) {
g[[j]] <- data.frame(
"id" = df.gradients[[i]][j,"id"],
"parameter" = c(paste0("int_", pairs[i,3]),
paste0("int_", pairs[i,4]),
paste0("b_year_", pairs[i,3]),
paste0("b_year_", pairs[i,4]),
paste0("b_", pairs[i,4], "_", pairs[i,3]),
paste0("b_", pairs[i,3], "_", pairs[i,4]),
paste0("b_", pairs[i,4], ":year_", pairs[i,3]),
paste0("b_", pairs[i,3], ":year_", pairs[i,4]),
paste0("var_int_",pairs[i,3]),
paste0("cov_int_",pairs[i,4], "_int_",pairs[i,3]),
paste0("cov_year:",pairs[i,3], "_int_",pairs[i,3]),
paste0("cov_year:",pairs[i,4], "_int_",pairs[i,3]),
paste0("var_int_",pairs[i,4]),
paste0("cov_year:",pairs[i,3], "_int_",pairs[i,4]),
paste0("cov_year:",pairs[i,4], "_int_",pairs[i,4]),
paste0("var_year_",pairs[i,3]),
paste0("cov_year:",pairs[i,4], "_year:",pairs[i,3]),
paste0("var_year_",pairs[i,4])
),
"gradient" = df.gradients[[i]][j,-1]
)
}
gradients[[i]] <- do.call(rbind, g)
}
gradients <- do.call(rbind, gradients)
gradients <- gradients[order(gradients$id),]
# Hessians have already been stacked into df.hessians
hessians <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
hessians[[i]] <- as.data.frame(df.hessians[[i]])
}
# Calculate standard errors -------------------------------------------------
# The A matrix is of p rows, equal to the number of unique parameters in the MMM, which is npar.
# and q columns equal to the total number of parameters estimated in the pairwise modeling step: npar times npairs.
A <- matrix(0, nrow = length(unique(parameters[,"parameter"])), ncol = nrow(parameters))
dimnames(A) <- list(
unique(parameters[,"parameter"]),
parameters[,"parameter"]
)
# The values of A are 1 if a parameter was estimated, or 0 if it was not estimated in the pairwise step.
for (i in 1:nrow(A)) {
for (j in 1:ncol(A)) {
if (rownames(A)[i] == colnames(A)[j]) {
A[i,j] <- 1
}
}
}
# Get number of parameters per model (note that we assume all models to have an equal number of parameters)
npar <- ncol(A)/nrow(pairs)
# Get the cross product of the subject specific gradient vector and input into CPgradient.
CPgrad <- matrix(0, nrow(pairs)*npar, nrow(pairs)*npar,
dimnames = list(
parameters[,"parameter"],
parameters[,"parameter"]
)
)
# The loop sums all the subject level contributions to the gradient.
for (i in 1:nrow(unique(d["id"]))) {
# validation: only execute if the names of the gradients and parameters match
if (all(gradients[gradients$id == i,]$parameter == colnames(CPgrad))) {
gsub <- gradients[gradients$id == i,]$gradient # subject level contribution to the gradients
cpgsub <- tcrossprod(gsub) # takes the cross product to get a block matrix
} else {
stop("The parameter names and gradient names do not match. Are you sure the parameter and gradients are in the same order?")
}
CPgrad <- CPgrad + cpgsub # Add to the total
}
# Average the gradient
CPgrad <- CPgrad * (1/nrow(unique(d["id"])))
# Average the Hessians
Hessian <- matrix(0, nrow(pairs)*npar, nrow(pairs)*npar)
for (j in 1:nrow(pairs)) {
hess_pair <- matrix(0, npar, npar)
# Inner loop: subject level contributions to the pairwise Hessian
for (i in 1:nrow(unique(d["id"]))) {
hsub <- as.matrix(hessians[[j]][hessians[[j]]$id == i,])
hsub <- hsub[,2:ncol(hsub)]
hess_pair <- hess_pair + hsub
}
# Outer loop: create block Sparse matrix of the multivariate mixed model Hessian
if (j == 1) {
Hessian <- hess_pair
} else {
Hessian <- as.matrix(bdiag(Hessian, hess_pair))
}
}
# Average the Hessian
Hessian <- Hessian * (1/nrow(unique(d["id"])))
dimnames(Hessian) <- list(
colnames(A),
colnames(A)
)
# Covariance of sqrt(n) * (theta - theta_0) Fieuws 2006 eq(4) p453 (Robust sandwich estimator)
JKJ <- ginv(Hessian) %*% CPgrad %*% ginv(Hessian)
dimnames(JKJ) <- list(
colnames(A),
colnames(A)
)
# Covariance of (theta - theta_0)
cov <- JKJ * (1/nrow(unique(d["id"])))
# Average of the parameters
A <- A * (1/rowSums(A))
avg <- A %*% parameters[,"estimates"]
covrobust <- A %*% cov %*% t(A)
serobust <- sqrt(abs(diag(covrobust)))
B <- data.frame(
"parameter" = rownames(avg),
"estimate_pair" = avg ,
"se_pair" = serobust
)
# Report the parameters estimates, vcov and correlation matrix ------------
# Replace labels from pairs
for (i in 1:length(outcomes)) {
B$parameter <- gsub(i,outcomes[i],B$parameter)
}
# Fixed effects
FE <- B %>% filter(grepl("^var_", parameter) == FALSE & grepl("^cov_", parameter) == FALSE )
FE <- FE[order(row.names(FE)),]
# Random effects
var <- B %>% filter(grepl("^var_", parameter) == TRUE)
var <- var[order(row.names(var)),]
covar <- B %>% filter(grepl("^cov_", parameter) == TRUE)
covar <- covar[row.names(covar),]
# Select the covariance parameters
est_D <- matrix(0, nrow = nrow(var),ncol = nrow(var))
dimnames(est_D) <- list(
sub("var_","",var$parameter),
sub("var_","",var$parameter)
)
diag(est_D) <- var$estimate_pair
est_D[upper.tri(est_D)] <- covar$estimate_pair
est_D[lower.tri(est_D)] <- t(est_D)[lower.tri(est_D)]
# Calculate the correlation matrix
est_Dcorr <- cov2cor(est_D)
# outcome table
estimates <- rbind(FE, var, covar)
print(estimates)
print(est_D)
print(est_Dcorr)
# FEATURE: predictions from the MMM
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.