R/pairwise_adaptiveglmm.R

# Doc header --------------------------------------------------------------

# Pairwise binomial mixed models 
# Jan van den Brand, PhD
# jan.vandenbrand@kuleuven.be
# janvandenbrand@gmail.com
# Project: DKF19OK003

# The work here is based on The work is based on Fieuws S., Verbeke, G., Boen, F. and Delecluse, C. (2006) 
# "High dimensional multivariate mixed models for binary questionnaire data", Applied Statistics, 55 (4), 449-460.
# For the orginal SAS macro, refer to https://ibiostat.be/online-resources/uploads/pairwiseMGLMM.zip

# 0: Preliminaries  ------------------------------------------------------

packages <- c("optimx", "ggplot2","lattice","MASS", "Matrix", "gridExtra", "dplyr", "optimParallel", "GLMMadaptive")
lapply(packages, library, character.only = TRUE)
cluster <- makeCluster(detectCores()); setDefaultCluster(cl = cluster)

setwd("C:/Users/jajgv/Documents/NSN19OK003/Data")

# 1: The Data -------------------------------------------------------------

# For this example I will use the questions data provide by Fieuws.

df <- read.csv(file = "questiondata.txt", sep = " ")
names(df) <- c("id", "dc", "resp", "physical", "selfesteem", "barriers",  "psychic", "selfperception", "selfeffic", "motivation")
df <- mutate(df,
             id = as.character(id),
             resp = factor(resp,
                           labels = c("negative", "positive"),
                           exclude = ".")
)

# 2: Data exploration -----------------------------------------------------

# Print overview of data
str(df)
head(df)

# 3 Data preparation ------------------------------------------------------

# Split the data set into data.frames with pairs of the longitudinal outcomes. 
outcomes <- names(df)[-1:-3]
# outcomes <- outcomes[1:3] # TEst only the first 3 outcomes
N <- length(outcomes)
Npair <- N * (N-1)/2
pairs <- matrix(data = 0, nrow = Npair, ncol = 5)
h <- 1
for (i in 1:(N-1)) {
  for (j in (i+1):N) {
    pairs[h,1] <- outcomes[i]
    pairs[h,2] <- outcomes[j]
    pairs[h,3] <- paste0(i,j)
    pairs[h,4] <- paste0(i)
    pairs[h,5] <- paste0(j)
    h <- h + 1
  }
}
print(pairs)
pairs <- pairs[c(1,2,7),]
d <- df

# 4 Get starting values  --------------------------------------------------

# NB mixed_model return the individual level gradients in score_vect_contributions.

# Fit the pairwise models
fits <- vector("list", length = nrow(pairs))
starts <- vector("list", length = nrow(pairs))
tstart <- Sys.time()
for (i in 1:nrow(pairs)) {
  loopstart <- Sys.time()
  fixed <- paste("resp ~ -1 +", # omit the fixed intercept
                 paste(pairs[i, 1]), # fixed intercept for pair==1
                 "+", paste(pairs[i, 2]), # fixed intercept for pair==2
                 "+",  paste0(pairs[i,1],":dc +"), # fixed slope for pair==1 x dc 
                 paste0(pairs[i,2],":dc")) # fixed slope for pair==2 x dc
  random <- paste("~ -1 + ",            
                  paste(pairs[i,1]),"+", # random slope for pair == 1 
                  paste(pairs[i,2]), "| id" ) # random slope for pair == 2 
  ds <- filter(d, d[paste(pairs[i, 1])] == 1 | d[paste(pairs[i, 2])] == 1) # Filter the data for the relevant pair of outcomes
  fits[[i]] <- mixed_model(fixed = as.formula(fixed),
                           random = as.formula(random), 
                           data = ds,
                           family = binomial(),
                           nAGQ = 11,
                           control = list(optimizer = "optimParellel", 
                                          optim_method = "L-BFGS-B",
                                          numeric_deriv = "fd",
                                          iter_EM = 100,
                                          verbose = TRUE)
                           )
  starts[[i]] <- list(betas = fixef(fits[[i]]), D = fits[[i]]$D)
  loopend <- Sys.time()
  print(round(loopend - loopstart,1))
}
tend <- Sys.time()
print(round(tend - tstart,1))

# 5: fit pairwise models on subject level to calculate the first and second order derivatives ------------

# FEATURE: Get score_vect_contributions$score.betas directly from mixed_model output.
# FEATURE: do data split outside loop for parallelization

# Initialize the data.frames to store results
tstart <- Sys.time()
options(warn = -1)
df.hessians <- vector("list", length = nrow(pairs))
df.gradients <- vector("list", length = nrow(pairs))
# Outer loop over the pairs
for (i in 1:nrow(pairs)) {
  loopstart <- Sys.time()   # review run time
  # create lists to store subject level results
  hessians <- vector("list", length = nrow(unique(d["id"])))
  gradients <- vector("list", length = nrow(unique(d["id"])))
  # construct the formulas from pairs
  fixed <- paste("resp ~ -1 +", # omit the fixed intercept
                 paste(pairs[i, 1]), # fixed intercept for pair==1
                 "+", paste(pairs[i, 2]), # fixed intercept for pair==2
                 "+",  paste0(pairs[i,1],":dc +"), # fixed slope for pair==1 x dc 
                 paste0(pairs[i,2],":dc")) # fixed slope for pair==2 x dc
  random <- paste("~ -1 + ", # omit random intercepts          
                  paste(pairs[i,1]),"+", # random slope for pair == 1 
                  paste(pairs[i,2]), "| id" ) # random slope for pair == 2 
    # Inner loop over the subjects
    for (j in 1:nrow(unique(d["id"]))) {
    # get subject level data within the pairs data
    ds <- d %>% filter((d[paste(pairs[i, 1])] == 1 | d[paste(pairs[i, 2])] == 1) & id == j)
    # Copying the subject is needed to hack the optimization algorithm 
    ds[,"id"] <- paste0(j,"a")
    ds <- rbind(d %>% filter((d[paste(pairs[i, 1])] == 1 | d[paste(pairs[i, 2])] == 1) & id == j), ds)
    # Fit the model for the subject
    fit <- mixed_model(fixed = as.formula(fixed),
                       random = as.formula(random), 
                       data = ds,
                       family = binomial(),
                       initial_values = starts[[i]], # input start values obtained at step 4.
                       nAGQ = 11,
                       control = list(optimizer = "optimParallel", 
                                      optim_method = "L-BFGS-B", 
                                      numeric_deriv = "fd",
                                      iter_EM = 0, 
                                      iter_qN_outer = 0) # this avoids iterating from the start values
    )
    # Store the Hessian
    h <- cbind("id" = j, fit$Hessian/2) # divide by 2 to undo the hack  
    # Store the subject level gradients 
    g <- c("id" = j,
           colSums(fit$score_vect_contributions$score.betas/2), #divide by two to undo the hack 
           colSums(fit$score_vect_contributions$score.D/2)) # ==> these do not match SAS
    names(g) <- c("id",colnames(fit$Hessian))
    # List the individual level results
    hessians[[j]] <- h
    gradients[[j]] <- g
  }
  # Append the lists with individual level results into data.frames
  df.hessians[[i]] <- do.call(rbind, hessians)
  df.gradients[[i]] <- do.call(rbind, gradients)
  loopend <- Sys.time()
  print(round(loopend - loopstart,1))
}
tend <- Sys.time()
print(round(tend - tstart, 1))
options(warn = 0)

# 6.1: Combine the pairs ----------------------------------------------------

# Stack the parameters
parameters <- vector("list", length = nrow(pairs)) 
for (i in 1:nrow(pairs)) {
  parameters[[i]] <- data.frame(
    "parameter" = c(paste0("alpha", pairs[i,4]),
                    paste0("alpha", pairs[i,5]),
                    paste0("beta", pairs[i,4]),
                    paste0("beta", pairs[i,5]),
                    paste0("su",pairs[i,4]),
                    paste0("su",pairs[i,4],pairs[i,5]),
                    paste0("su",pairs[i,5])),
    "estimates" = c(fixef(fits[[i]]),
                    c(fits[[i]]$D[lower.tri(fits[[i]]$D, diag = TRUE)])
    )
  )
}
parameters <- do.call(rbind, parameters)

# Stack the gradients
gradients <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
  g <- vector("list", length = nrow(unique(d["id"])))
  for (j in 1:nrow(unique(d["id"]))) {
    g[[j]] <- data.frame(
      "id" = df.gradients[[i]][j,"id"],
      "parameter" = c(paste0("alpha", pairs[i,4]),
                      paste0("alpha", pairs[i,5]),
                      paste0("beta", pairs[i,4]),
                      paste0("beta", pairs[i,5]),
                      paste0("su",pairs[i,4]),
                      paste0("su",pairs[i,4],pairs[i,5]),
                      paste0("su",pairs[i,5])),
      "gradient" = df.gradients[[i]][j,-1]
    )
  }
  gradients[[i]] <- do.call(rbind, g)
}
gradients <- do.call(rbind, gradients)
gradients <- gradients[order(gradients$id),]

# Hessians have already been stacked into df.hessians
hessians <- vector("list", length = nrow(pairs))
for (i in 1:nrow(pairs)) {
  hessians[[i]] <- as.data.frame(df.hessians[[i]])
}

# 6.2: Create the A matrix -------------------------------------------------

# The A matrix is of p rows, equal to the number of unique parameters in the MMM, which is npar.# and q columns equal to the total number of parameters estimated in the pairwise modeling step: npar times npairs.
A <- matrix(0, nrow = length(unique(parameters[,"parameter"])), ncol = nrow(parameters))
dimnames(A) <- list(
  unique(parameters[,"parameter"]),
  parameters[,"parameter"]
)
# The values of A are 1 if a parameter was estimated, or 0 if it was not estimated in the pairwise step.
for (i in 1:nrow(A)) {
  for (j in 1:ncol(A)) {
    if (rownames(A)[i] == colnames(A)[j]) { 
      A[i,j] <- 1
    }
  }
}

# 6.3: Calculate the covariance matrix for fixed effects ----------------------

npar <- 7 

# Get the cross product of the subject specific gradient vector and input into CPgradient.
CPgrad <- matrix(0, nrow(pairs)*npar, nrow(pairs)*npar, 
                 dimnames = list(
                   parameters[,"parameter"],
                   parameters[,"parameter"]
                   )
                 ) 
# The loop sums all the subject level contributions to the gradient.
for (i in 1:nrow(unique(d["id"]))){ 
  gsub <- gradients[gradients$id == i,]$gradient # subject level contribution to the gradients
  cpgsub <- tcrossprod(gsub) # takes the cross product to get a block matrix
  CPgrad <- CPgrad + cpgsub # Add to the total
}
# Average the gradient
CPgrad <- CPgrad * (1/nrow(unique(d["id"])))

# Average the Hessians 
Hessian <- matrix(0, nrow(pairs)*npar, nrow(pairs)*npar)
for (j in 1:nrow(pairs)) {
  hess_pair <- matrix(0, npar, npar)
  # Inner loop: subject level contributions to the pairwise Hessian
  for (i in 1:nrow(unique(d["id"]))) {
    hsub <- as.matrix(hessians[[j]][hessians[[j]]$id == i,-1])
    hess_pair <- hess_pair + hsub
  }
  # Outer loop: create block Sparse matrix of the multivariate mixed model Hessian
  if (j == 1) {
    Hessian <- hess_pair
  } else {
    Hessian <- as.matrix(bdiag(Hessian, hess_pair))
  }
}
# Average the Hessian
Hessian <- Hessian * (1/nrow(unique(d["id"])))
dimnames(Hessian) <- list(
  colnames(A),
  colnames(A)
)
# Covariance of sqrt(n) * (theta - theta_0) Fieuws 2006 eq(4) p453
JKJ <- ginv(Hessian) %*% CPgrad %*% ginv(Hessian)
dimnames(JKJ) <- list(
  colnames(A),
  colnames(A)
)
# Covariance of (theta - theta_0)
cov <- JKJ * (1/nrow(unique(d["id"]))) 
# Average of the parameters
A <- A * (1/rowSums(A))
avg <- A %*% parameters[,"estimates"]
covrobust <- A %*% cov %*% t(A)
serobust <- sqrt(diag(covrobust))
B <- data.frame(
  "parameter" = rownames(avg),
  "estimate_pair" = avg ,
  "se_pair" = serobust
)

# 6.4: Create the covariance and correlation matrix of RE -----------------

# The parameters and standard errors random effects are reported on the variance and not std scale.

# Fixed effects
FE <- B %>% filter(grepl("^su", parameter) == FALSE)
FE <- FE[order(FE$parameter),]
# Random effects
RE <- B %>% filter(grepl("^su", parameter) == TRUE)
RE <- RE[order(RE$parameter),]
# Select the covariance parameters
est_D <- matrix(0, nrow = N,ncol = N) 
est_D[lower.tri(est_D, diag = TRUE)] <- RE$estimate_pair 
est_D[upper.tri(est_D)] <- t(est_D)[upper.tri(est_D)]
# Calculate the correlation matrix
est_Dcorr <- cov2cor(est_D)
# outcome table
estimates <- rbind(FE, RE)

print(estimates)
print(est_D)
print(est_Dcorr)

save.image(file = paste0("mglmm_",format(Sys.Date(),"%Y%m%d"),".RData"))

write.csv2(estimates , file = "C:/Users/jajgv/Downloads/estimates.csv", row.names = FALSE)
write.csv2(round(est_Dcorr,2), file = "C:/Users/jajgv/Downloads/correlations.csv", row.names = FALSE)
JanvandenBrand/highdimjm documentation built on Dec. 18, 2021, 12:32 a.m.