# R/MErf.R
#
# Defines functions: MErf (S3 generic) and MErf (implementation; note the
# second assignment overwrites the generic)
#
# Documented in MErf

#' S3 generic for mixed-effect random forest models.
#'
#' Dispatches on the class of \code{X}; see the default implementation below.
#'
#' @param X object used for S3 method dispatch.
#' @param \dots further arguments passed on to methods.
#' @export
MErf <- function(X, ...) {
  UseMethod("MErf")
}


#' @title Mixed Effect random forest  
#' @description 
#' Trains a Mixed Effect random forest 
#' for longitudinal continuous and binary 
#' data. A rule-based version of these methods 
#' using the \code{inTrees} package is also implemented (see [1]).  

#' @param form formula  
#' @param dat  data.frame with predictors 
#' @param groups character name of the column containing the group identifier
#' @param family a GLM family: continuous data set family = "gaussian", binary data set 
#' family = "binomial" see \code{glm} and \code{family}
#' @param rand.vars random effect variables 
#' @param para named list of random forest training parameters 
#' @param glmer.Control \code{glmer} or \code{lmer} control 
#' @param tol convergence tolerance 
#' @param max.iter maximum number of iterations  
#' @param include.RE (logical) include the random effect Zb as a predictor in the random forest?  
#' @param verbose verbose for lme4 
#' @param likelihoodCheck (logical) to use log likelihood of glmer to check for convergence? 
#' @param type of predictions of gbm to pass to lme4 as population estimates (these will be used as offset) 
#' @param \dots Further arguments passed to or from other methods.
#' @return An object of class MErf; a list with items 
#' \item{rf.fit}{fitted random forest model}
#' \item{glmer.fit}{fitted mixed effect logistic regression model}
#' \item{logLik}{log likelihood of mixed effect logistic regression} 
#' \item{random.effects}{random effect parameter estimates}
#' \item{glmer.form}{lme4 formula} 
#' \item{glmer.CI}{estimates of mixed effect logistic regression with 
#'     approximate confidence intervals on the logit scale. More accurate values 
#'     can be obtained by bootstrap}
#' \item{threshold}{classification cut-off}
#' \item{predRules}{fitted rules}
#' \item{Y.star}{fitted "transform" outcome. This is the same as the predicted outcome for binary data}
#
#' @author  Che Ngufor <Ngufor.Che@@mayo.edu>
#
#' @references
#' Che Ngufor,  Holly Van Houten, Brian S. Caffo , Nilay D. Shah, Rozalina G. McCoy 
#' Mixed Effect Machine Learning: a framework for predicting longitudinal change in hemoglobin A1c,
#' in Journal of Biomedical Informatics, 2018 

#' @import lme4 caret partykit inTrees 

#' @export
MErf <- function(form, 
                      dat,  
                      groups = NULL, 
                      family, 
                      rand.vars="1", 
                      para = NULL,   
                      tol= 1e-5, 
                      max.iter =100, 
                      include.RE =FALSE, 
                      verbose = FALSE, maxdepth=5,
                      glmer.Control=glmerControl(optimizer = "bobyqa"), 
                      nAGQ=0, 
                      likelihoodCheck = TRUE,
                      K=3, 
                      decay = 0.05, ...){
  ## Mixed-effect random forest fit by alternating (EM-style) estimation:
  ##   1. fit a mixed-effects model to obtain the random-effect component Zb;
  ##   2. fit a regularized random forest (RRF) to the adjusted working
  ##      response Y* - Zb;
  ##   3. summarize the forest as rules (inTrees) and refit the mixed-effects
  ##      model with the per-row rule assignment as the fixed-effect predictor.
  ## Iterate until the change in log-likelihood (or in the random-effect
  ## vector b, when likelihoodCheck = FALSE) falls below `tol`, or `max.iter`
  ## passes have been made.
  ## NOTE(review): this assignment overwrites the S3 generic `MErf` defined
  ## above — presumably intended to be a method such as MErf.default; confirm
  ## against callers before renaming.
  
  if(is.null(groups)) stop("please provide grouping variable")
  ## rhs.form()/lhs.form() are package helpers that extract the predictor and
  ## response variable names from the model formula.
  rhs.vars <- rhs.form(form)
  resp.vars <- lhs.form(form)
  ## Copy the response into a fixed column name "Y" used by the internal
  ## lme4 formulas below. Assumes `resp.vars` names a single column.
  dat$Y <- Y <- dat[, resp.vars]

  old.lik <- -Inf    
  if(likelihoodCheck == FALSE){
    ## Convergence will be tracked on the random-effect vector b instead of
    ## the log-likelihood: one coefficient per random term per group level.
    n.re <- sum(rand.vars != 1)+1  
    b.old <-rep(0, n.re*nlevels(factor(dat[, groups])))  ## initial random effect parameters 
  }
  
  ## Resolve `family` the same way glm() does: accept a string, a generator
  ## function, or a family object.
  if(is.character(family))
    family <- get(family, mode = "function", envir = parent.frame())
  if(is.function(family)) family <- family()
  if(is.null(family$family)) {
    print(family)
    stop("'family' not recognized")
  }
  
  ### initial random effect component: fit a LME model with no fixed effects, ie 
  ### a priori known mean of 0 
  ## Builds e.g.  Y ~ 1 + (rand.vars | groups).  nAGQ is hard-coded to 0 here
  ## for a fast initial fit; the user-supplied nAGQ is used in the main loop.
  form.glmer <- as.formula(paste0("Y ~ ", paste0(c(paste0("1", collapse = "+"), "+", "(", 
                                                   paste0(c(rand.vars), collapse = "+"), "|", groups, ")"), collapse = "")))             
  glmer.fit <- myglmer(form = form.glmer, dat = dat, family = family, control = glmer.Control, nAGQ=0, 
                     verbose = as.numeric(verbose))
  
  pp = predict(glmer.fit, newdata = dat, type = "response") 
  
  if (family$family == "binomial"){
    ## Clamp fitted probabilities away from exactly 0/1, then form the IRLS
    ## working response on the logit scale: eta + (Y - p) / (p * (1 - p)).
    pp <- ifelse(abs(pp -1) <= 1e-16, pp-1e-16, ifelse(pp <= 1e-16, pp+1e-16, pp))     
    w = pp*(1-pp)
    Y.star <- qlogis(pp) + (Y - pp)/w
  }
  else if(family$family == "gaussian") {
    ## Identity link: the working response is just the observed outcome.
    Y.star = Y
    w = rep(1, length(Y))
  } else {
    print(family)
    stop("'family' is not yet implemented")
  }
  
  ### get the random effect component 
  ## Zb = Z %*% b, the fitted random-effect contribution on the link scale
  ## (cprod is a package helper; here it forms crossprod of Zt and b).
  Zt <-  getME(glmer.fit, name = "Zt")
  b  <-  getME(glmer.fit, name = "b")	
  Zb <-  as.numeric(cprod(x = Zt, y = b))       
  ### adjusted target 
  ## The forest is trained on the working response minus the random effects.
  Target <- Y.star - Zb 
  dat[, "Target"] <- Target 
  dat[, "Zb"] <- Zb
  
  if(include.RE)   rhs.vars <- unique(c(rhs.vars, "Zb"))
  rf.form <- as.formula(paste0("Target ~", paste0(rhs.vars, collapse = "+")))
  
  ## Inside the loop the mixed model's only fixed-effect predictor is the
  ## forest-derived rule assignment, stored in column "TreeConditions".
  partvars <- c("TreeConditions")
  ### glmer formula 
  glmer.form  <- as.formula(paste0("Y ~ ", paste0(c(paste0(partvars, collapse="+"), "+",
                            "(", paste0(c(rand.vars), collapse = "+"), "|", groups, ")"), collapse = "")))             
  
  for(ii in 1:max.iter){    	  	    
    #### fit a regularized random forest (RRF) to the adjusted target
        if(para$opt.para) {
          ## Tune mtry via caret resampling, then refit RRF with the winner.
          fitControl <- trainControl(method = para$method, number =  para$number)
          mod <-  train(form=rf.form, data = dat, method = "RRF", tuneLength = para$tuneLength, trControl = fitControl, metric = "RMSE",
                                      ntree = para$ntree, importance = TRUE, keep.inbag = TRUE, replace = FALSE, proximity=FALSE)
          para$mtry  <-  mod$bestTune$mtry
          
          rf.fit <- RRF(dat[, rhs.form(rf.form)], dat[, lhs.form(rf.form)], ntree=para$ntree, mtry = para$mtry, 
                        importance = FALSE, proximity=FALSE, keep.inbag=TRUE, replace=FALSE)
          
        } else {
          ## No tuning: use the regression default heuristic mtry = p/3.
          rf.fit <- RRF(dat[, rhs.form(rf.form)], dat[, lhs.form(rf.form)], ntree=para$ntree, mtry = max(floor(length(rhs.form(rf.form))/3), 1), 
                        importance = FALSE, proximity=FALSE, keep.inbag=TRUE, replace=FALSE)
          
        }
    
    ## Summarize the forest as a rule learner (inTrees pipeline): extract
    ## rules from half the trees, dedupe, score, prune with `decay`, assemble
    ## a learner, and assign each row the condition (rule) it satisfies.
    treeList <- RF2List(rf.fit)    
    ruleExec = myextractRules(treeList, dat[, rhs.form(rf.form)], ntree=floor(0.5*para$ntree), maxdepth = maxdepth, random=FALSE)    
    ruleExec <- unique(ruleExec)
    
    #    splt <- SplitVector(Target, K = K)
    #    target <- splt$newV
    
    ruleMetric <- getRuleMetric(ruleExec,dat[,rhs.vars],Target)
    ruleMetric <- pruneRule(ruleMetric,dat[,rhs.vars], Target, decay)
    ruleMetric <- unique(ruleMetric)    
    learner <- buildLearner(ruleMetric,dat[,rhs.vars], Target)
    pred <- myapplyLearner(learner, dat[,rhs.vars])
    dat[, "TreeConditions"] <- factor(pred$predCond)
    ## Relabel the rule conditions as Rule.1, Rule.2, ... for readability.
    levels(dat[, "TreeConditions"]) <- paste0("Rule.", 1:nlevels(dat[, "TreeConditions"]))
    
    ## Degenerate case: a single rule would make the fixed effect constant,
    ## so collapse the column to the scalar 1 (intercept-only fixed effect).
    if(length(unique(dat[, "TreeConditions"])) < 2) dat[, "TreeConditions"] <- 1
    
    ## Refit the mixed-effects model with the rule assignment as predictor.
    glmer.fit <- myglmer(form = glmer.form, dat= dat,  family = family, control = glmer.Control, 
                       nAGQ=  nAGQ, verbose = as.numeric(verbose))   
    
    #  get predicted probabilities and compute transformed response 
    pp <- predict(glmer.fit, newdata = dat, type = "response")
     
    if (family$family == "binomial"){
      ## Same clamped working-response update as in the initialization step.
      pp <- ifelse(abs(pp -1) <= 1e-16, pp-1e-16, ifelse(pp <= 1e-16, pp+1e-16, pp))     
      w = pp*(1-pp)
      Y.star <- qlogis(pp) + (Y - pp)/w
    }
    else if(family$family == "gaussian") {
      Y.star = Y
      w = rep(1, length(Y))
    } else {
      print(family)
      stop("'family' is not yet implemented")
    }
    
    ## Updated random-effect component and adjusted target for the next pass.
    Z <-   getME(glmer.fit, name = "Z")
    b  <-  getME(glmer.fit, name = "b")	
    Zb <-  as.numeric(Z%*%b)
    Target  <- Y.star  - Zb
    #    target <- round(w*(Target+ Zb -  qlogis(pp) ) + pp) 
    
    dat[, "Target"] <- Target 
    dat[, "Zb"] <- Zb
    
    ### test for convergence             
    if(likelihoodCheck){
      ## r is the absolute change in log-likelihood (written as a 1x1
      ## quadratic form, which equals |new - old| for scalars).
      new.lik <- as.numeric(logLik(glmer.fit))
      r <- as.numeric(sqrt(t(new.lik - old.lik)%*%(new.lik-old.lik)))
      old.lik <- new.lik	
    } else {
      ## Euclidean distance between successive random-effect vectors.
      r <- as.numeric(sqrt(t(b - b.old)%*%(b-b.old)))
      b.old <- b
    } 
#    if(verbose) 
#    print(paste("Error: ", r))    
    if( r < tol) break     
  } ## for loop 
  
  if(r > tol) warning("EM algorithm did not converge")
  
  
  #  fitted = predict(rf.fit, newdata = dat, type = "raw") + Zb 
  ## Final fitted values on the link scale: forest prediction plus the
  ## random-effect component from the last mixed-model fit.
  zz <- predict(rf.fit, newdata = dat, type = "response")
  fitted = zz  + Zb 
  
  threshold = NULL 
  if(family$family == "binomial"){
    ## Map fitted link values to probabilities and derive a classification
    ## cut-off (Performance.measures is a package helper).
    pp <- sigmoid(fitted) 
    pp <- ifelse(abs(pp -1) <= 1e-16, pp-1e-16, ifelse(pp <= 1e-16, pp+1e-16, pp))  
    perf <- Performance.measures(pp, Y)
    threshold <- perf$threshold	
  }  
  
  rule.levels <- levels(dat[, "TreeConditions"])
  ### get confidence intervals for mixed effect logistic regresion: rough estimates using the SEs
  ## Wald-style 95% intervals on the link scale: estimate +/- 1.96 * SE.
  ## More accurate intervals would require bootstrapping (see roxygen docs).
  se <- sqrt(diag(as.matrix(vcov(glmer.fit)))) 
  tab <- cbind(Est = fixef(glmer.fit), LL = fixef(glmer.fit) - 1.96 * se, 
               UL = fixef(glmer.fit) + 1.96 *se)
  
  #   splitV <- splt$splitV
  #   newV <- rep(0, nrow(learner))
  #   labs <- learner[, "pred"]
  #   xx <- unique(labs)
  #   ix <- 1:length(xx)
  #   is <- which(xx == "L1")
  #   newV[which(labs == "L1")] <- splt$splitV[1]
  #   ll <- xx[xx != "L1"]
  #   names(splitV) <- ll
  #   
  #   if (length(splt$splitV) >= 2) {
  #     for (jj in 1:(length(ll)-1)) {    
  #       newV[which(labs == ll[jj])] <- 0.5*(splt$splitV[jj] + splt$splitV[jj+1])
  #     }
  #   }
  #   newV[which(labs == ll[length(ll)])] <- splt$splitV[length(ll)] + 0.5 
  
  
  ## Re-score the final rule set against the observed outcome Y (rather than
  ## the working Target) to produce the reportable rule learner. Note that
  ## pruneRule here uses its default decay, not the `decay` argument.
  ruleMetric <- getRuleMetric(ruleExec,dat[,rhs.vars],Y)
  ruleMetric <- pruneRule(ruleMetric,dat[,rhs.vars], Y)
  ruleMetric <- unique(ruleMetric)    
  learner <- buildLearner(ruleMetric,dat[,rhs.vars], Y)  
  readableLearner <- data.frame(presentRules(learner, rhs.vars))
  
  ## Assemble the fitted-model object; `Y.star` holds the final fitted link
  ## values (forest + random effect), per the documented return value.
  res <- list(rf.fit = rf.fit, 
              glmer.fit = glmer.fit,  
              groups = groups, para=para,
              rand.vars=rand.vars, 
              rhs.vars = rhs.vars, 
              logLik=as.numeric(logLik(glmer.fit)), 
              random.effects =ranef(glmer.fit), 
              glmer.form = glmer.form, 
              glmer.CI =tab,
              Y.star = fitted, 
              rf.form = rf.form,
              threshold = threshold, 
              include.RE=include.RE, 
              rfRules = learner, 
              rfReadableRules = readableLearner, 
              predRules = pred, 
              rule.levels = rule.levels)
  class(res) <- "MErf"         
  return(res)         
}
# nguforche/MEml documentation built on April 20, 2020, 7:26 a.m.