# R/MplusTrees.R

#' Recursive partitioning trees with Mplus models
#'
#' Generates recursive partitioning trees using M\emph{plus} models. \code{MplusTrees()} takes an
#' M\emph{plus} model written in the form of an \code{MplusAutomation} script, uses
#' \code{MplusAutomation} to fit the model in M\emph{plus}, and performs recursive partitioning
#' using \code{rpart}.
#'
#' @param script An \code{MplusAutomation} script (e.g., an \code{mplusObject}) specifying the M\emph{plus} model
#' @param data The dataset specified in the script
#' @param rPartFormula Formula of the form \code{~ covariate1 + covariate2 + ...} listing the partitioning variables
#' @param catvars Vector of names of categorical covariates
#' @param group The id variable. If not specified, an id variable is created for each row
#' @param control Control object for \code{rpart}
#' @param se Whether to print standard errors and \emph{p} values. In general this should be set to \code{FALSE}
#' @param psplit Whether to use likelihood ratio \emph{p} values as a splitting criterion
#' @param palpha Type I error rate (alpha level) for rejecting with the likelihood ratio test when
#' \code{psplit} is set to \code{TRUE}
#' @param cv Whether to perform \emph{k}-fold cross-validation to select the value of \code{cp}
#' @param k Number of folds for cross-validation
#' @details The function temporarily changes the working directory to the temporary directory. Files used
#' and generated by M\emph{plus} are stored here and can be accessed using \code{tempdir()}.
#'
#' By default \code{MplusTrees()} only splits on the criteria specified in the \code{control}
#' argument, the most important of which is the \code{cp} parameter. The user can also split on the
#' \emph{p} value generated from the likelihood ratio test comparing the parent node to a multiple group
#' model consisting of 2 groups (the daughter nodes). This \emph{p} value criterion is used in addition
#' to the \code{cp} criterion in that both must be met for a split to be made. The \code{psplit} argument
#' turns this option on, and \code{palpha} sets the alpha level criterion for rejection.
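#'
#' As a usage sketch (reusing the dataset and covariates from the example below), a call of the
#' following form would make a split only if it satisfies the \code{cp} criterion \emph{and}
#' yields a likelihood ratio test \emph{p} value below .05:
#' \preformatted{
#' MplusTrees(script, HolzingerSwineford1939, group = ~id,
#'            rPartFormula = ~sex + school + grade,
#'            control = rpart.control(cp = .01),
#'            psplit = TRUE, palpha = .05)
#' }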
#'
#' Cross-validation (CV) can also be used to choose the \code{cp} parameter. If this option is used, any
#' user-specified \code{cp} value will be overridden by the optimal \code{cp} value chosen by CV. CV fits
#' the model to the training set and calculates an expected minus 2 log-likelihood (-2LL) for each terminal
#' node. In the test set, individuals are assigned to terminal nodes based on the tree structure found in
#' the training set. Their "expected" values are the -2LL values from the respective training set terminal
#' nodes. The "observed" values are the -2LL values from fitting a multiple group model, with each terminal
#' node as a group. The \code{cp} value chosen is the one that produces the smallest MSE.
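#'
#' As a sketch, 5-fold cross-validation could be requested as follows; any \code{cp} supplied in
#' \code{control} is then replaced by the CV-selected value:
#' \preformatted{
#' MplusTrees(script, HolzingerSwineford1939, group = ~id,
#'            rPartFormula = ~sex + school + grade,
#'            control = rpart.control(minsplit = 100, minbucket = 100),
#'            cv = TRUE, k = 5)
#' }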
#'
#' CV should only be used when (1) the M\emph{plus} model can be fit relatively quickly, (2) there are only
#' a few covariates with a few response options, and (3) the sample size is large enough that the user is
#' confident the model can be fit without issue both in a sample of size \emph{N/k} and in the smaller
#' samples created as a tree partitions this sample further. If these conditions are not met, the process
#' could take prohibitively long to arrive
#' at a solution. Note that if even a single model fails to produce a valid log-likelihood value, the
#' function will terminate with an error.
#'
#' @references Serang, S., Jacobucci, R., Stegmann, G., Brandmaier, A. M., Culianos, D., & Grimm, K. J. (2021).
#' Mplus Trees: Structural equation model trees using Mplus. \emph{Structural Equation Modeling, 28}, 127-137.
#' @return An object of class '\code{mplustree}'. \code{rpart_out} provides the tree structure, \code{terminal}
#' gives a vector of terminal nodes, \code{where} shows the terminal node of each id, and \code{estimates} gives
#' the parameter estimates for each terminal node.
#' @author Ross Jacobucci and Sarfaraz Serang
#' @import MplusAutomation
#' @import rpart
#' @import nlme
#' @importFrom stats terms as.formula model.matrix pchisq quantile predict
#' @export
#' @examples
#' \dontrun{
#' library(lavaan)
#'
#' script = mplusObject(
#'    TITLE = "Example #1 - Factor Model;",
#'    MODEL = "f1 BY x1-x3; f2 BY x4-x6; f3 BY x7-x9;",
#'    usevariables = c('x1','x2','x3','x4','x5','x6','x7','x8','x9'),
#'    rdata = HolzingerSwineford1939)
#'
#' fit = MplusTrees(script, HolzingerSwineford1939, group=~id,
#'    rPartFormula=~sex+school+grade,
#'    control=rpart.control(minsplit=100, minbucket=100, cp=.01))
#'
#' fit
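#'
#' # A usage sketch: inspect the tree structure and the node-specific
#' # parameter estimates (see 'Value' for the components returned)
#' fit$rpart_out
#' fit$estimates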
#' }

MplusTrees <- function(script,
                       data,
                       rPartFormula,
                       catvars = NULL,
                       group = ~id,
                       control = rpart.control(),
                       se = FALSE,
                       psplit = FALSE,
                       palpha = .05,
                       cv = FALSE,
                       k = 5){

  wd=getwd()
  on.exit(setwd(wd))
  setwd(tempdir())

  if(missing(group)){data$id = 1:nrow(data)}

  if(!is.null(catvars)){
    catdata = indicator(data,catvars=catvars,rPartFormula=rPartFormula)
    data = catdata$data
    rPartFormula = catdata$rpf
  }

  groupingName = attr(terms(splitFormula(group,'~')[[1]]),"term.labels")
  groupingFactor = data[,names(data)==groupingName]


  evaluation <- function(y, wt, parms){
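    # rpart "eval" method: fit the Mplus model to the observations whose
    # grouping ids fall in this node and return -2 log-likelihood as the
    # node's label and deviance.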
    script$rdata = parms[groupingFactor %in% y, ]
    fit = mplusModeler(script,run=1L,modelout = "Model.1.inp")
    deviance = -2*(fit$results$summaries$LL)
    list(label=deviance, deviance=deviance)
  }


  split.tree <- function(y, wt, x, parms, continuous) {
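    # rpart "split" method: for each candidate cutpoint (or ordered category
    # subset), fit separate Mplus models to the left and right daughter nodes
    # (subject to minbucket) and score the split by the combined daughter
    # log-likelihoods offset by the parent fit; disallowed splits score 0.
    # When psplit = TRUE, the likelihood ratio test p value must also fall
    # below palpha for a split to receive a nonzero goodness value.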
    print(paste("splitting:", length(unique(x)), "values"))
    dev = vector()
    xUnique = unique(x)
    script$rdata = parms[groupingFactor %in% y, ]
    fit = mplusModeler(script,run=1L,modelout = "Model.1.inp")
    rootDev = fit$results$summaries$LL

    mplusmodwrap = function(index,script,run,modelout){
      if(index == 1){
        script$rdata = parms[groupingFactor %in% yLeft, ]
        out = mplusModeler(script,run=run,modelout=modelout)
      }
      if(index == 2){
        script$rdata = parms[groupingFactor %in% yRight, ]
        out = mplusModeler(script,run=run,modelout=modelout)
      }
      return(out)
    }

    if (continuous) {
      for (i in xUnique) {
        yLeft = y[x <= i]
        yRight = y[x > i]
        if (length(yLeft) < control$minbucket || length(yRight) <
            control$minbucket) {
          dev = c(dev, 0)
        }
        else{
          script$rdata = parms[groupingFactor %in% yLeft, ]
          modelLeft = mplusModeler(script,run=1L,modelout = "Model.1.inp")
          script$rdata = parms[groupingFactor %in% yRight, ]
          modelRight = mplusModeler(script,run=1L,modelout = "Model.1.inp")

          if(psplit==T){
            chisq = -2*(fit$results$summaries$LL-
              modelLeft$results$summaries$LL-modelRight$results$summaries$LL)
            p0 = fit$results$summaries$NIndependentVars +
              fit$results$summaries$NDependentVars
            df = p0^2 + 3*p0 - fit$results$summaries$Parameters -
              modelLeft$results$summaries$ChiSqM_DF -
              modelRight$results$summaries$ChiSqM_DF
            pval = 1 - pchisq(chisq,df)
            if(length(pval)!=0){
              if(pval >= palpha){
                dev = c(dev, 0)
              }
              else if(pval < palpha){
                dev = c(dev, modelLeft$results$summaries$LL +
                          modelRight$results$summaries$LL)
              }
            }
            else{
              dev = c(dev, modelLeft$results$summaries$LL +
                        modelRight$results$summaries$LL)
            }
          }
          else if(psplit==F){
            dev = c(dev, modelLeft$results$summaries$LL +
                      modelRight$results$summaries$LL)
          }
        }
      }
      good = rep(0, length(x))
      for (i in 1:length(xUnique)) {
        good[x == xUnique[i]] = dev[i]
      }
      good = good[1:(length(good) - 1)]
      list(goodness = good + abs(rootDev) * (good != 0) * 2,
           direction = rep(-1, length(good)))
    }
    else {
      order = rep(0, length(xUnique))
      dir = sort(order, index.return = TRUE)$ix
      for (i in 1:(length(dir) - 1)) {
        yLeft = y[x %in% dir[1:i]]
        yRight = y[x %in% dir[(i + 1):length(dir)]]
        if (length(yLeft) < control$minbucket ||
            length(yRight) < control$minbucket) {
          dev = c(dev, 0)
        }
        else {
          script$rdata = parms[groupingFactor %in% yLeft, ]
          modelLeft = mplusModeler(script,run=1L,modelout = "Model.1.inp")
          script$rdata = parms[groupingFactor %in% yRight, ]
          modelRight = mplusModeler(script,run=1L,modelout = "Model.1.inp")

          if(psplit==T){
            chisq = -2*(fit$results$summaries$LL-
              modelLeft$results$summaries$LL-modelRight$results$summaries$LL)
            p0 = fit$results$summaries$NIndependentVars +
              fit$results$summaries$NDependentVars
            df = p0^2 + 3*p0 - fit$results$summaries$Parameters -
              modelLeft$results$summaries$ChiSqM_DF -
              modelRight$results$summaries$ChiSqM_DF
            pval = 1 - pchisq(chisq,df)
            if(length(pval)!=0){
              if(pval >= palpha){
                dev = c(dev, 0)
              }
              else if(pval < palpha){
                dev = c(dev, modelLeft$results$summaries$LL +
                          modelRight$results$summaries$LL)
              }
            }
            else{
              dev = c(dev, modelLeft$results$summaries$LL +
                        modelRight$results$summaries$LL)
            }
          }
          else if(psplit==F){
            dev = c(dev, modelLeft$results$summaries$LL +
                      modelRight$results$summaries$LL)
          }
        }
      }
      list(goodness = dev + abs(rootDev) * (dev != 0) * 2, direction = dir)
    }
  }


  initialize <- function(y,offset,parms=0,wt){
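    # rpart "init" method: declare a single response and a single prediction
    # per node, and supply a summary line reporting the node deviance (-2LL).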
    list(
      y=y,
      parms=parms,
      numresp=1,
      numy=1,
      summary=function(yval,dev,wt,ylevel,digits){paste("deviance (-2logLik)",
                       format(signif(dev, 3)))}
    )
  }

  #cross-validation
  if(cv==T){
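    # k-fold cross-validation for cp: grow an unpruned tree (cp = 0) on one
    # training set to obtain candidate cp values; then, for each cp and fold,
    # grow a tree on the training data, compute the expected -2LL for test
    # observations from their assigned training-set leaves, refit the model
    # within each test-set leaf to get the observed -2LL, and keep the cp
    # with the smallest mean squared error.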
    crossval = function(data=data,k=k,control=control,groupingName=groupingName,
                        rPartFormula=rPartFormula){
      folds = foldmaker(data, k=k)
      agg = foldcombiner(folds)
      contr1 = contr2 = control
      contr1$cp = 0
      cprange = rpart(paste(groupingName,c(rPartFormula)),
                      method=list(eval=evaluation, split=split.tree,init=initialize),
                      control=contr1,data=agg[[1]]$train,parms=agg[[1]]$train)

      cps = cpcalc(cprange)

      cvscript = script
      mses = matrix(nrow=k,ncol=length(cps))

      for(icp in 1:length(cps)){
        cpval = cps[icp]
        contr2$cp = cpval
        for(ik in 1:k){
          devs = data.frame(matrix(nrow=nrow(agg[[ik]]$test),ncol=3))
          names(devs) = c("group","exp","obs")
          devs[,1] = agg[[ik]]$test[,paste(groupingName)]
          trainmod = rpart(paste(groupingName,c(rPartFormula)),
                           method=list(eval=evaluation, split=split.tree,init=initialize),
                           control=contr2,data=agg[[ik]]$train,parms=agg[[ik]]$train)
          devs[,2] = predict(trainmod,agg[[ik]]$test)
          leafgps = leafgrouper(trainmod,agg[[ik]]$test)

          for(ileaf in 1:length(leafgps)){
            cvscript$rdata = leafgps[[ileaf]]
            cvtestout = mplusModeler(cvscript,run=1L,modelout = "Model.1.inp")
            devs[which(devs[,1]%in%leafgps[[ileaf]][,paste(groupingName)]),3] =
              -2*cvtestout$results$summaries$LL
          }
          mses[ik,icp] = mean((devs[,3]-devs[,2])^2,na.rm=T)
        }
      }
      cvcp = cps[which(colMeans(mses,na.rm=T)==min(colMeans(mses,na.rm=T)))]
      return(cvcp)
    }

    cvcp = crossval(data=data,k=k,control=control,groupingName=groupingName,
                           rPartFormula=rPartFormula)
    control$cp = cvcp
  }

  model <- list()
  estimates <- list()
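  # Grow the tree with rpart, plugging in the custom evaluation, splitting,
  # and initialization methods; the full dataset is passed through parms so
  # the Mplus model can be refit within each candidate node.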
  model.rpart = rpart(paste(groupingName,c(rPartFormula)),
                      method=list(eval=evaluation, split=split.tree,init=initialize),
                      control=control,data=data,parms=data)

  model$rpart_out <- model.rpart
  frame <- model.rpart$frame
  node2 = row.names(frame[frame[,"var"] == "<leaf>",])
  model$terminal = node2

  model$where = model.rpart$where
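  # model.rpart$where indexes rows of the rpart frame; relabel it below with
  # the terminal node numbers (frame row names) so each id maps to its leaf.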
  oldwhere = names(table(model$where))
  owind = vector("list",length(oldwhere))
  for(i in 1:length(oldwhere)){
    owind[[i]] = which(model$where == oldwhere[i])
  }
  for(i in 1:length(oldwhere)){
    model$where = replace(model$where,owind[[i]],node2[i])
  }

  for(j in 1:length(table(model.rpart$where))){
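    # Refit the Mplus model within each terminal node to obtain its parameter
    # estimates (with or without standard errors and p values, per 'se').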
    id <- names(table(model.rpart$where))[j]==model.rpart$where
    script$rdata = data[id,]
    fit = mplusModeler(script,run=1L,modelout = "Model.1.inp")
    if(se==F){estimates[[node2[j]]] = fit$results$parameters[[1]][,1:3]}
    else if(se==T){estimates[[node2[j]]] = fit$results$parameters}
  }
  model$estimates = estimates

  class(model) <- "mplustree"

  return(model)
}
