# R/pairwise_outcometypes.R

# Doc header --------------------------------------------------------------

# Pairwise binomial mixed models 
# Jan van den Brand, PhD
# jan.vandenbrand@kuleuven.be
# janvandenbrand@gmail.com


# The work here is based on Fieuws, Verbeke, Maes and Vanrenterghem, Biostatistics 2008.
# https://doi.org/10.1093/biostatistics/kxm041
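
# In outline, the script below: (1) fits all bivariate ("pairwise") mixed
# models for the selected longitudinal outcomes, (2) stacks the pairwise
# estimates, subject-level gradients and Hessians, (3) averages the estimates
# that are shared across pairs, and (4) derives robust (sandwich) standard
# errors for the averaged estimates.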

# 0: Preliminaries  ------------------------------------------------------

setwd("C:/Users/jajgv/Documents/repos/highdimjm")
pkg <- c("ggplot2", "gridExtra", "dplyr", "parallel", "optimParallel", "GLMMadaptive", "Matrix", "MASS")
lapply(pkg, library, character.only = TRUE)
cluster <- makeCluster(detectCores()); setDefaultCluster(cl = cluster)

# Data ------------------------------------------------------------------

# For this example I will use the pbc2 (primary biliary cirrhosis) data from Rizopoulos, Joint Models for Longitudinal and Time-to-Event Data, CRC Press, 2012.

ds <- url("https://raw.github.com/drizopoulos/Repeated_Measurements/master/Data.RData")
load(ds)
rm(list = c("aids", "aids.id", "glaucoma", "pbc2.id", "prothro", "ds"))

# Print overview of data
str(pbc2)
summary(pbc2)
head(pbc2)

# Pie/bar charts for categorical variables
factors <- c("sex", "ascites", "hepatomegaly", "spiders", "edema", "status")
make.pie <- function(v) {
  pie_plot <- ggplot(data = pbc2, aes(x = factor(1), fill = .data[[v]])) +
    geom_bar(width = 1) +
    coord_polar('y', start = 0) +
    theme_minimal() +
    labs(x = "", y = "") +
    guides(fill = guide_legend(title = v))
  return(pie_plot)
}
pie_charts <- lapply(factors, make.pie)
do.call(grid.arrange, pie_charts)

# create histograms for continuous variables
markers <- c("serBilir", "serChol", "albumin", "alkaline", "SGOT", "platelets", "prothrombin") 
make.hist <- function(v, data) {
  plot <- ggplot(data = data, aes(x = .data[[v]])) +
    geom_histogram(bins = 20) +
    theme_classic() +
    labs(x = paste("histogram of", v))
  return(plot)
}
histograms <- lapply(markers, make.hist, data = pbc2)
do.call(grid.arrange, histograms)

# Spaghetti plots for longitudinal variables
make.plots <- function(v) {
  survivors <- pbc2[pbc2$status2 == 0, ]
  events <- pbc2[pbc2$status2 == 1, ]
  plot <- ggplot(mapping = aes(x = year - years, y = .data[[v]])) +
    geom_line(data = survivors, aes(group = id), size = 0.1, alpha = 0.2) +
    geom_smooth(data = survivors) +
    geom_line(data = events, aes(group = id), color = "red", size = 0.1, alpha = 0.2) +
    geom_smooth(data = events, color = "red") +
    scale_x_continuous(breaks = seq(floor(min(-pbc2$years, na.rm = TRUE)), 0, 3)) +
    theme_classic() +
    labs(x = "Follow-up time in years", y = v)
  return(plot)
}
plots <- lapply(markers, make.plots)
do.call(grid.arrange, plots)

# Data preparation --------------------------------------------------------

# FEATURE: Functionize the data preparation
# TODO: add arguments for times, status, id, factors, and covariates

# Select variables in the model
d <- pbc2 %>% dplyr::select(id, years, status2, drug, age, sex, year, 
                     ascites, hepatomegaly, spiders, edema, serBilir, albumin, SGOT, prothrombin)
# set the id column
id <- c("id")
# set the follow-up times
event.time <- c("years")
times <- c("year")
d[,times] <- d[,times] - d[,event.time] 
# rescale follow-up times
d[,times] <- d[,times]/5
status <- c("status2")
# turn categorical covariates into factor
factors <- c("drug", "sex") 
d[,factors] <- lapply(d[,factors], as.factor) # lapply keeps the factor class; sapply would collapse to integer codes
# recode categorical outcomes into integer codes starting at 0
outcome_factors <- c("ascites", "hepatomegaly", "spiders", "edema")
d[,outcome_factors] <- sapply(d[,outcome_factors], function(x) {as.integer(x)-1})
# turn all covariates into numeric
covariates <-  c("age", "serBilir", "albumin", "SGOT", "prothrombin")
d[,covariates] <- sapply(d[,covariates],as.numeric)
# WARNING: Y VARIABLES NEED TO BE CLOSE TO NORMAL IN ORDER FOR AVERAGING TO WORK
# BoxCox transform
boxcoxtransform <- function(v) {
  y <- unlist(v)
  bc <- boxcox(y ~ 1, lambda = seq(-5, 5, 0.5), plotit = FALSE)
  lambda <- bc$x[which.max(bc$y)]
  # lambda == 0 corresponds to the log transform
  if (abs(lambda) < .Machine$double.eps^0.5) {
    y <- log(y)
  } else {
    y <- (y^lambda - 1)/lambda
  }
  return(y)
}
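
# Sanity check (a sketch, not part of the original pipeline): the Box-Cox
# transform is only defined for strictly positive values, so verify the
# covariates before transforming.
stopifnot(all(sapply(d[, covariates], function(x) all(x > 0, na.rm = TRUE))))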
d[,covariates] <- sapply(d[,covariates],boxcoxtransform)
summary(d[,covariates])
# Center and scale the continuous variables
d[,covariates] <- lapply(d[,covariates], function(x) as.numeric(scale(x)))

histograms <- lapply(covariates, make.hist, data = d)
do.call(grid.arrange, histograms)

# Split the data set into data.frames with pairs of the longitudinal outcomes. 
outcomes <- c("ascites", "hepatomegaly", "spiders", "edema", "serBilir", "albumin", "SGOT", "prothrombin")
N <- length(outcomes)
Npair <- N * (N-1)/2
pairs <- matrix(data = 0, nrow = Npair, ncol = 4)
h <- 1
for (i in 1:(N-1)) {
  for (j in (i+1):N) {
    pairs[h,1] <- outcomes[i]
    pairs[h,2] <- outcomes[j]
    pairs[h,3] <- i
    pairs[h,4] <- j
    h <- h + 1
  }
}
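
# Equivalent, more compact construction (a sketch using base R's combn;
# the explicit loop above is kept because it is easier to read):
# pairs <- cbind(t(combn(outcomes, 2)), t(combn(N, 2)))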
# TEST: 3 continuous outcomes
pairs <- pairs[c(23,24,26),]

# Unroll the outcome matrix -----------------------------------------------

# FEATURE: functionize the data wrangling for better modularity and testing
npair <- nrow(pairs)
# 1) unroll the Y matrix
# Note that id was stored as a factor in the pbc data. This may cause errors.
ids <- rep(d[["id"]], each = 2)
Y <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
  Y[[i]] <- d %>% dplyr::select(y1 = paste(pairs[i,1]), y2 = paste(pairs[i,2]))
  Y[[i]] <- t(as.matrix(Y[[i]]))
  Y[[i]] <- as.vector(Y[[i]])
  Y[[i]] <- data.frame(id = as.character(ids), 
                       Y = Y[[i]]
                       )
}
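
# The t()/as.vector() combination above interleaves the two outcomes per
# visit, i.e. y1, y2, y1, y2, ... A toy illustration (not part of the pipeline):
# as.vector(t(as.matrix(data.frame(y1 = c(1, 2), y2 = c(10, 20)))))
# returns c(1, 10, 2, 20).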
# 2) create the design matrix
X <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)){
  X[[i]] <- d %>% dplyr::select(x1 = paste(pairs[i,2]), x2 = paste(pairs[i,1]))
  X[[i]] <- t(as.matrix(X[[i]]))
  X[[i]] <- as.vector(X[[i]])
  X[[i]] <- data.frame(id = as.character(ids), 
                       X = X[[i]]
                       )
}
# 3) add covariates
Z <- unlist(dplyr::select(d, all_of(times)))
Z <- as.matrix(Z)
# 4) Add dummies
dummy <- kronecker(rep(1, times = nrow(d)), diag(1, nrow = 2))
Zk <- kronecker(Z, diag(1, nrow = 2))
Z <- cbind(dummy, Zk)
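
# The kronecker products expand every original row into two rows, one per
# outcome of the pair, so Z has columns (intercept.Y1, intercept.Y2,
# time.Y1, time.Y2). As a toy illustration (not part of the pipeline), for a
# single visit at time t, kronecker(matrix(t), diag(1, nrow = 2)) equals
# rbind(c(t, 0), c(0, t)): row 1 carries the design for outcome 1, row 2 for
# outcome 2.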
# 5) Join the Y and design matrices
data.list <- vector("list", length=nrow(pairs))
for (i in 1:nrow(pairs)) {
  data.list[[i]] <- cbind(Y[[i]],X[[i]][-1], Z)
  names(data.list[[i]]) <- c("id", "Y", "X", "intercept.Y1", "intercept.Y2",
                             paste0(times,".Y1"), paste0(times,".Y2"))
}

# test data consistency
checkY <- data.list[[1]] %>% dplyr::filter(intercept.Y1 == 1) %>% dplyr::select(Y)
all(d$serBilir == checkY$Y)
checkX <- data.list[[1]] %>% dplyr::filter(intercept.Y1 == 1) %>% dplyr::select(X)
all(d$albumin == checkX$X)



# Clean-up
rm(Y, X, Z, dummy, Zk, pbc2, histograms, pie_charts, plots)

# Fit the individual models for comparison --------------------------------

fit_serBilir_albumin <- mixed_model(fixed = serBilir ~ year + albumin + year:albumin,
                            random = ~ year | id,
                            data = d,
                            family = students.t(50),
                            n_phis = 1,
                            nAGQ = 7,
                            control = list(optimizer = "optimParallel",
                                           optim_method = "L-BFGS-B",
                                           numeric_deriv = "fd",
                                           iter_EM = 100)
)
fit_serBilir_albumin_check <- mixed_model(fixed = Y ~ -1 + intercept.Y1 + year.Y1 + X:intercept.Y1 + year.Y1:X,
                                    random = ~ -1 + intercept.Y1 + year.Y1 | id,
                                    data = dplyr::filter(data.list[[1]], intercept.Y1 == 1),
                                    family = students.t(50),
                                    n_phis = 1,
                                    nAGQ = 7,
                                    control = list(optimizer = "optimParallel",
                                                   optim_method = "L-BFGS-B",
                                                   numeric_deriv = "fd",
                                                   iter_EM = 100)
)
all.equal(unname(fixef(fit_serBilir_albumin)), unname(fixef(fit_serBilir_albumin_check))) # all.equal tolerates floating point noise
all.equal(fit_serBilir_albumin$D, fit_serBilir_albumin_check$D, check.attributes = FALSE)

# Get start values --------------------------------------------------------

# FEATURE: combine binomial and gaussian models.
  # Option 1: ifelse for Y depending on Ylabel?
  # Option 2: bbmle --> offers a general way to specify the log likelihood but requires in-depth specification
# FEATURE: Ordinal and multinomial logit models.
# FEATURE: function to parallelize
  # arguments: fixed, random, data.list

# Fit the pairwise models
fits <- vector("list", length = nrow(pairs))
starts <- vector("list", length = nrow(pairs))
time.start <- proc.time()
for (i in 1:nrow(pairs)) {
  print(paste("Running model", i, "out of", nrow(pairs)))
  fixed <- paste("Y ~ -1 + intercept.Y1 + intercept.Y2 +", # intercepts
                 "year.Y1 + year.Y2 +", # effect of follow-up time on population average evolution of biomarkers
                 "X:intercept.Y1 + X:intercept.Y2 +", # effect of biomarker evolution on the longitudinal biomarker
                 "X:year.Y1 + X:year.Y2") # interaction between follow-up time and the 
  random <- paste("~ -1 + intercept.Y1 + intercept.Y2 +", # random intercepts
                  "year.Y1 + year.Y2 | id") # random slopes for Y over time. 
  fits[[i]] <- mixed_model(fixed = as.formula(fixed),
                           random = as.formula(random), 
                           data = data.list[[i]],
                           family = students.t(50),
                           n_phis = 1,
                           nAGQ = 7,
                           control = list(optimizer = "optimParallel",
                                          optim_method = "L-BFGS-B",
                                          numeric_deriv = "fd",
                                          iter_EM = 100,
                                          verbose = TRUE)
  )
  starts[[i]] <- list(betas = fixef(fits[[i]]), phis = fits[[i]]$phis, D = fits[[i]]$D)
}
(proc.time() - time.start)/60

save.image(file = "pbc2_models.RData")
# load("pbc2_models.RData")

# Get derivatives ----------------------------------------------------------

#  FEATURE: function to parallelize
#   Split data using https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/split

# Initialize the data.frames to store results
time.start <- proc.time()
options(warn = -1)
df.hessians <- vector("list", length = nrow(pairs))
df.gradients <- vector("list", length = nrow(pairs))
# Outer loop over the pairs
for (i in 1:nrow(pairs)) {
  print(paste("Running model", i, "out of", nrow(pairs)))
  # create lists to store subject level results
  hessians <- vector("list", length = nrow(unique(d["id"])))
  gradients <- vector("list", length = nrow(unique(d["id"])))
  # construct the formulas from pairs
  fixed <- paste("Y ~ -1 + intercept.Y1 + intercept.Y2 +
                 year.Y1 + year.Y2 +
                 X:intercept.Y1 + X:intercept.Y2 + 
                 X:year.Y1 + X:year.Y2")
  random <- paste("~ -1 + intercept.Y1 + intercept.Y2 + year.Y1 + year.Y2 | id") 
  # Inner loop over the subjects
  for (j in 1:nrow(unique(data.list[[i]]["id"]))) {
    # get subject level data within the pairs data
    ds <- data.list[[i]] %>% filter(id == j)
    # Duplicate the subject under a new id so the optimizer sees two
    # "subjects"; the halving below undoes this duplication.
    ds[,"id"] <- paste0(j,"a")
    ds <- rbind(data.list[[i]] %>% filter(id == j), ds)
    # Fit the model for the subject
    fit <- mixed_model(fixed = as.formula(fixed),
                       random = as.formula(random), 
                       data = ds,
                       family = students.t(50),
                       n_phis = 1,
                       initial_values = starts[[i]], # input start values
                       nAGQ = 7,
                       control = list(optimizer = "optimParallel", 
                                      optim_method = "L-BFGS-B", 
                                      numeric_deriv = "fd",
                                      iter_EM = 0, 
                                      iter_qN_outer = 0) # zero iterations: evaluate the score and Hessian at the start values
    )
    # Store the Hessian
    h <- cbind("id" = j, fit$Hessian/2) # divide by 2 to undo the duplication
    # Store the subject level gradients
    g <- c("id" = j,
           colSums(fit$score_vect_contributions$score.betas/2), # divide by 2 to undo the duplication
           colSums(fit$score_vect_contributions$score.D/2))
    # Drop the scale (phi) row and column, then list the individual level results
    n_phis <- fit$call$n_phis
    hessians[[j]] <- h[1:(nrow(h) - n_phis), 1:(ncol(h) - n_phis)]
    gradients[[j]] <- g 
  }
  # Append the lists with individual level results into data.frames
  df.hessians[[i]] <- do.call(rbind, hessians)
  df.gradients[[i]] <- do.call(rbind, gradients)
}
round((proc.time() - time.start)/60,1)
options(warn = 0)

# Stack output ------------------------------------------------------------

parameters <- vector("list", length = nrow(pairs)) 
for (i in 1:nrow(pairs)) {
  parameters[[i]] <- data.frame(
    # build up labels:
    # b_<predictor>_<outcome>
    # var_<predictor>_<outcome>
    # cov_<predictor>_<outcome (col)>_<predictor>_<outcome (row)>
    "parameter" = c(paste0("int_", pairs[i,3]),
                    paste0("int_", pairs[i,4]),
                    paste0("b_year_", pairs[i,3]),
                    paste0("b_year_", pairs[i,4]),
                    paste0("b_", pairs[i,4], "_", pairs[i,3]),
                    paste0("b_", pairs[i,3], "_", pairs[i,4]),
                    paste0("b_", pairs[i,4], ":year_", pairs[i,3]),
                    paste0("b_", pairs[i,3], ":year_", pairs[i,4]),
                    paste0("var_int_",pairs[i,3]),
                    paste0("cov_int_",pairs[i,4], "_int_",pairs[i,3]),
                    paste0("cov_year:",pairs[i,3], "_int_",pairs[i,3]),
                    paste0("cov_year:",pairs[i,4], "_int_",pairs[i,3]),
                    paste0("var_int_",pairs[i,4]),
                    paste0("cov_year:",pairs[i,3], "_int_",pairs[i,4]),
                    paste0("cov_year:",pairs[i,4], "_int_",pairs[i,4]),
                    paste0("var_year_",pairs[i,3]),
                    paste0("cov_year:",pairs[i,4], "_year:",pairs[i,3]),
                    paste0("var_year_",pairs[i,4])
    ),
    "estimates" = c(fixef(fits[[i]]),
                    c(fits[[i]]$D[lower.tri(fits[[i]]$D, diag = TRUE)]))
  )
}
parameters <- do.call(rbind, parameters)

# Stack the gradients
gradients <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
  g <- vector("list", length = nrow(unique(d["id"])))
  for (j in 1:nrow(unique(d["id"]))) {
    g[[j]] <- data.frame(
      "id" = df.gradients[[i]][j,"id"],
      "parameter" = c(paste0("int_", pairs[i,3]),
                      paste0("int_", pairs[i,4]),
                      paste0("b_year_", pairs[i,3]),
                      paste0("b_year_", pairs[i,4]),
                      paste0("b_", pairs[i,4], "_", pairs[i,3]),
                      paste0("b_", pairs[i,3], "_", pairs[i,4]),
                      paste0("b_", pairs[i,4], ":year_", pairs[i,3]),
                      paste0("b_", pairs[i,3], ":year_", pairs[i,4]),
                      paste0("var_int_",pairs[i,3]),
                      paste0("cov_int_",pairs[i,4], "_int_",pairs[i,3]),
                      paste0("cov_year:",pairs[i,3], "_int_",pairs[i,3]),
                      paste0("cov_year:",pairs[i,4], "_int_",pairs[i,3]),
                      paste0("var_int_",pairs[i,4]),
                      paste0("cov_year:",pairs[i,3], "_int_",pairs[i,4]),
                      paste0("cov_year:",pairs[i,4], "_int_",pairs[i,4]),
                      paste0("var_year_",pairs[i,3]),
                      paste0("cov_year:",pairs[i,4], "_year:",pairs[i,3]),
                      paste0("var_year_",pairs[i,4])
      ),
      "gradient" = df.gradients[[i]][j,-1]
    )
  }
  gradients[[i]] <- do.call(rbind, g)
}
gradients <- do.call(rbind, gradients)
gradients <- gradients[order(gradients$id),]

# Hessians have already been stacked into df.hessians
hessians <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
  hessians[[i]] <- as.data.frame(df.hessians[[i]])
}

# Calculate standard errors -------------------------------------------------

# The A matrix has p rows, equal to the number of unique parameters in the MMM,
# and q columns, equal to the total number of parameters estimated in the pairwise modeling step (npar per pair times the number of pairs).
A <- matrix(0, nrow = length(unique(parameters[,"parameter"])), ncol = nrow(parameters))
dimnames(A) <- list(
  unique(parameters[,"parameter"]),
  parameters[,"parameter"]
)
# The values of A are 1 if a parameter was estimated, or 0 if it was not estimated in the pairwise step.
for (i in 1:nrow(A)) {
  for (j in 1:ncol(A)) {
    if (rownames(A)[i] == colnames(A)[j]) { 
      A[i,j] <- 1
    }
  }
}
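
# Structural check (a sketch, not in the original script): each pairwise
# parameter should map to exactly one unique parameter, and each row of A
# counts how many pairs estimate that parameter.
stopifnot(all(colSums(A) == 1))
table(rowSums(A)) # number of pairwise replicates per unique parameter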
# Get number of parameters per model (note that we assume all models to have an equal number of parameters)
npar <- ncol(A)/nrow(pairs) 
# Sum the cross products of the subject specific gradient vectors into CPgrad.
CPgrad <- matrix(0, nrow(pairs)*npar, nrow(pairs)*npar, 
                 dimnames = list(
                   parameters[,"parameter"],
                   parameters[,"parameter"]
                 )
) 
# The loop sums all the subject level contributions to the gradient.
for (i in 1:nrow(unique(d["id"]))) { 
  # validation: only execute if the names of the gradients and parameters match
  if (all(gradients[gradients$id == i,]$parameter == colnames(CPgrad))) { 
    gsub <- gradients[gradients$id == i,]$gradient # subject level contribution to the gradients
    cpgsub <- tcrossprod(gsub) # takes the cross product to get a block matrix
  } else {
    stop("The parameter names and gradient names do not match. Are you sure the parameter and gradients are in the same order?")
  }
  CPgrad <- CPgrad + cpgsub # Add to the total
}
# Average the gradient cross products over subjects
CPgrad <- CPgrad * (1/nrow(unique(d["id"])))

# Average the Hessians 
Hessian <- matrix(0, nrow(pairs)*npar, nrow(pairs)*npar)
for (j in 1:nrow(pairs)) {
  hess_pair <- matrix(0, npar, npar)
  # Inner loop: subject level contributions to the pairwise Hessian
  for (i in 1:nrow(unique(d["id"]))) {
    hsub <- as.matrix(hessians[[j]][hessians[[j]]$id == i,])
    hsub <- hsub[,2:ncol(hsub)]
    hess_pair <- hess_pair + hsub
  }
  # Outer loop: assemble the block-diagonal Hessian of the multivariate mixed model
  if (j == 1) {
    Hessian <- hess_pair
  } else {
    Hessian <- as.matrix(bdiag(Hessian, hess_pair))
  }
}
# Average the Hessian
Hessian <- Hessian * (1/nrow(unique(d["id"])))
dimnames(Hessian) <- list(
  colnames(A),
  colnames(A)
)

# Robust sandwich covariance of sqrt(n) * (theta - theta_0), Fieuws 2006 eq (4) p. 453:
# ginv(H) %*% K %*% ginv(H), with H the averaged Hessian and K the averaged
# cross product of the subject level gradients.
JKJ <- ginv(Hessian) %*% CPgrad %*% ginv(Hessian)
dimnames(JKJ) <- list(
  colnames(A),
  colnames(A)
)

# Covariance of (theta - theta_0)
cov <- JKJ * (1/nrow(unique(d["id"]))) 
# Average the shared parameters: rescale A so that each row sums to 1
A <- A * (1/rowSums(A))
avg <- A %*% parameters[,"estimates"]
covrobust <- A %*% cov %*% t(A)
serobust <- sqrt(abs(diag(covrobust)))
B <- data.frame(
  "parameter" = rownames(avg),
  "estimate_pair" = avg ,
  "se_pair" = serobust
)
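
# Sanity check (a sketch, not in the original script): the sandwich
# covariance should be symmetric up to numerical error.
stopifnot(isTRUE(all.equal(covrobust, t(covrobust), check.attributes = FALSE)))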

# Report the parameters estimates, vcov and correlation matrix ------------

# Replace labels from pairs
for (i in 1:length(outcomes)) {
  B$parameter <- gsub(i,outcomes[i],B$parameter)  
}
# Fixed effects
FE <- B %>% filter(!grepl("^var_", parameter) & !grepl("^cov_", parameter))
FE <- FE[order(row.names(FE)),]
# Random effects
var <- B %>% filter(grepl("^var_", parameter))
var <- var[order(row.names(var)),]
covar <- B %>% filter(grepl("^cov_", parameter))
covar <- covar[order(row.names(covar)),]
# Assemble the random effects covariance matrix D
est_D <- matrix(0, nrow = nrow(var),ncol = nrow(var))
dimnames(est_D) <- list(
  sub("var_","",var$parameter),
  sub("var_","",var$parameter)
)
diag(est_D) <- var$estimate_pair
est_D[upper.tri(est_D)] <- covar$estimate_pair 
est_D[lower.tri(est_D)] <- t(est_D)[lower.tri(est_D)]
# Calculate the correlation matrix
est_Dcorr <- cov2cor(est_D)
# outcome table
estimates <- rbind(FE, var, covar)
  
print(estimates)
print(est_D)
print(est_Dcorr)

# FEATURE: predictions from the MMM