Nothing
#' Recursive partitioning trees with Mplus models
#'
#' Generates recursive partitioning trees using M\emph{plus} models. \code{MplusTrees()} takes an
#' M\emph{plus} model written in the form of an \code{MplusAutomation} script, uses
#' \code{MplusAutomation} to fit the model in M\emph{plus}, and performs recursive partitioning
#' using \code{rpart}.
#'
#' @param script An \code{MplusAutomation} script file
#' @param data Dataset that is specified in the script
#' @param rPartFormula Formula of the form ~ variable names
#' @param catvars Vector of names of categorical covariates
#' @param group id variable. If not specified an id variable is created for each row
#' @param control Control object for \code{rpart}
#' @param se Whether to print standard errors and \emph{p} values. In general should be set to FALSE
#' @param psplit Whether to use likelihood ratio \emph{p} values as a splitting criterion
#' @param palpha Type I error rate (alpha level) for rejecting with likelihood ratio test when
#' \code{psplit} set to \code{TRUE}
#' @param cv Performs k-fold cross-validation to select value of \code{cp}
#' @param k number of folds for cross-validation
#' @details The function temporarily changes the working directory to the temporary directory. Files used
#' and generated by M\emph{plus} are stored here and can be accessed using \code{tempdir()}.
#'
#' By default \code{MplusTrees()} only splits on the criteria specified in the \code{control}
#' argument, the most important of which is the \code{cp} parameter. The user can also split on the
#' \emph{p} value generated from the likelihood ratio test comparing the parent node to a multiple group
#' model consisting of 2 groups (the daughter nodes). This \emph{p} value criterion is used in addition
#' to the \code{cp} criterion in that both must be met for a split to be made. The \code{psplit} argument
#' turns this option on, and \code{palpha} sets the alpha level criterion for rejection.
#'
#' Cross-validation (CV) can also be used to choose the \code{cp} parameter. If this option is used, any
#' user-specified \code{cp} value will be overridden by the optimal \code{cp} value chosen by CV. CV fits
#' the model to the training set and calculates an expected minus 2 log-likelihood (-2LL) for each terminal
#' node. In the test set, individuals are assigned to terminal nodes based on the tree structure found in
#' the training set. Their "expected" values are the -2LL values from the respective training set terminal
#' nodes. The "observed" values are the -2LL values from fitting a multiple group model, with each terminal
#' node as a group. The \code{cp} value chosen is the one that produces the smallest MSE.
#'
#' CV should only be used when (1) the M\emph{plus} model can be fit relatively quickly, (2) there are only
#' a few covariates with a few response options, and (3) the sample size is large enough that the user is
#' confident the model can be fit without issue in a sample of size \emph{N/k} and a tree that partitions
#' this sample further. If these conditions are not met, the process could take prohibitively long to arrive
#' at a solution. Note that if even a single model fails to produce a valid log-likelihood value, the
#' function will terminate with an error.
#'
#' @references Serang, S., Jacobucci, R., Stegmann, G., Brandmaier, A. M., Culianos, D., & Grimm, K. J. (2021).
#' Mplus Trees: Structural equation model trees using Mplus. Structural Equation Modeling, 28, 127-137.
#' @return An object of class '\code{mplustree}'. \code{rpart_out} provides the tree structure, \code{terminal}
#' gives a vector of terminal nodes, \code{where} shows the terminal node of each id, and \code{estimates} gives
#' the parameter estimates for each terminal node.
#' @author Ross Jacobucci and Sarfaraz Serang
#' @import MplusAutomation
#' @import rpart
#' @import nlme
#' @importFrom stats terms as.formula model.matrix pchisq quantile predict
#' @export
#' @examples
#' \dontrun{
#' library(lavaan)
#'
#' script = mplusObject(
#' TITLE = "Example #1 - Factor Model;",
#' MODEL = "f1 BY x1-x3; f2 BY x4-x6; f3 BY x7-x9;",
#' usevariables = c('x1','x2','x3','x4','x5','x6','x7','x8','x9'),
#' rdata = HolzingerSwineford1939)
#'
#' fit = MplusTrees(script, HolzingerSwineford1939, group=~id,
#' rPartFormula=~sex+school+grade,
#' control=rpart.control(minsplit=100, minbucket=100, cp=.01))
#'
#' fit
#' }
MplusTrees <- function(script,
data,
rPartFormula,
catvars = NULL,
group= ~ id,
control = rpart.control(),
se = F,
psplit = F,
palpha = .05,
cv = F,
k = 5){
wd=getwd()
on.exit(setwd(wd))
setwd(tempdir())
if(missing(group)){data$id = 1:nrow(data)}
if(!is.null(catvars)){
catdata = indicator(data,catvars=catvars,rPartFormula=rPartFormula)
data = catdata$data
rPartFormula = catdata$rpf
}
groupingName = attr(terms(splitFormula(group,'~')[[1]]),"term.labels")
groupingFactor = data[,names(data)==groupingName]
evaluation <- function(y, wt, parms){
script$rdata = data=parms[groupingFactor%in%y,]
fit = mplusModeler(script,run=1L,modelout = "Model.1.inp")
deviance = -2*(fit$results$summaries$LL)
list(label=deviance, deviance=deviance)
}
split.tree <- function(y, wt, x, parms, continuous) {
print(paste("splitting:", length(unique(x)), "values"))
dev = vector()
xUnique = unique(x)
script$rdata = data=parms[groupingFactor%in%y,]
fit = mplusModeler(script,run=1L,modelout = "Model.1.inp")
rootDev = fit$results$summaries$LL
mplusmodwrap = function(index,script,run,modelout){
if(index == 1){
script$rdata = parms[groupingFactor %in% yLeft, ]
out = mplusModeler(script,run=run,modelout=modelout)
}
if(index == 2){
script$rdata = parms[groupingFactor %in% yRight, ]
out = mplusModeler(script,run=run,modelout=modelout)
}
return(out)
}
if (continuous) {
for (i in xUnique) {
yLeft = y[x <= i]
yRight = y[x > i]
if (length(yLeft) < control$minbucket || length(yRight) <
control$minbucket) {
dev = c(dev, 0)
}
else{
script$rdata = parms[groupingFactor %in% yLeft, ]
modelLeft = mplusModeler(script,run=1L,modelout = "Model.1.inp")
script$rdata = parms[groupingFactor %in% yRight, ]
modelRight = mplusModeler(script,run=1L,modelout = "Model.1.inp")
if(psplit==T){
chisq = -2*(fit$results$summaries$LL-
modelLeft$results$summaries$LL-modelRight$results$summaries$LL)
p0 = fit$results$summaries$NIndependentVars +
fit$results$summaries$NDependentVars
df = p0^2 + 3*p0 - fit$results$summaries$Parameters -
modelLeft$results$summaries$ChiSqM_DF -
modelRight$results$summaries$ChiSqM_DF
pval = 1 - pchisq(chisq,df)
if(length(pval)!=0){
if(pval >= palpha){
dev = c(dev, 0)
}
else if(pval < palpha){
dev = c(dev, modelLeft$results$summaries$LL +
modelRight$results$summaries$LL)
}
}
else{
dev = c(dev, modelLeft$results$summaries$LL +
modelRight$results$summaries$LL)
}
}
else if(psplit==F){
dev = c(dev, modelLeft$results$summaries$LL +
modelRight$results$summaries$LL)
}
}
}
good = rep(0, length(x))
for (i in 1:length(xUnique)) {
good[x == xUnique[i]] = dev[i]
}
good = good[1:(length(good) - 1)]
list(goodness = good + abs(rootDev) * (good != 0) * 2,
direction = rep(-1, length(good)))
}
else {
order = rep(0, length(xUnique))
dir = sort(order, index.return = TRUE)$ix
for (i in 1:(length(dir) - 1)) {
yLeft = y[x %in% dir[1:i]]
yRight = y[x %in% dir[(i + 1):length(dir)]]
if (length(yLeft) < control$minbucket ||
length(yRight) < control$minbucket) {
dev = c(dev, 0)
}
else {
script$rdata = parms[groupingFactor %in% yLeft, ]
modelLeft = mplusModeler(script,run=1L,modelout = "Model.1.inp")
script$rdata = parms[groupingFactor %in% yRight, ]
modelRight = mplusModeler(script,run=1L,modelout = "Model.1.inp")
if(psplit==T){
chisq = -2*(fit$results$summaries$LL-
modelLeft$results$summaries$LL-modelRight$results$summaries$LL)
p0 = fit$results$summaries$NIndependentVars +
fit$results$summaries$NDependentVars
df = p0^2 + 3*p0 - fit$results$summaries$Parameters -
modelLeft$results$summaries$ChiSqM_DF -
modelRight$results$summaries$ChiSqM_DF
pval = 1 - pchisq(chisq,df)
if(length(pval)!=0){
if(pval >= palpha){
dev = c(dev, 0)
}
else if(pval < palpha){
dev = c(dev, modelLeft$results$summaries$LL +
modelRight$results$summaries$LL)
}
}
else{
dev = c(dev, modelLeft$results$summaries$LL +
modelRight$results$summaries$LL)
}
}
else if(psplit==F){
dev = c(dev, modelLeft$results$summaries$LL +
modelRight$results$summaries$LL)
}
}
}
list(goodness = dev + abs(rootDev) * (dev != 0) * 2, direction = dir)
}
}
initialize <- function(y,offset,parms=0,wt){
list(
y=y,
parms=parms,
numresp=1,
numy=1,
summary=function(yval,dev,wt,ylevel,digits){paste("deviance (-2logLik)",
format(signif(dev),3))}
)
}
#cross-validation
if(cv==T){
crossval = function(data=data,k=k,control=control,groupingName=groupingName,
rPartFormula=rPartFormula){
folds = foldmaker(data, k=k)
agg = foldcombiner(folds)
contr1 = contr2 = control
contr1$cp = 0
cprange = rpart(paste(groupingName,c(rPartFormula)),
method=list(eval=evaluation, split=split.tree,init=initialize),
control=contr1,data=agg[[1]]$train,parms=agg[[1]]$train)
cps = cpcalc(cprange)
cvscript = script
mses = matrix(nrow=k,ncol=length(cps))
for(icp in 1:length(cps)){
cpval = cps[icp]
contr2$cp = cpval
for(ik in 1:k){
devs = data.frame(matrix(nrow=nrow(agg[[ik]]$test),ncol=3))
names(devs) = c("group","exp","obs")
devs[,1] = agg[[ik]]$test[,paste(groupingName)]
trainmod = rpart(paste(groupingName,c(rPartFormula)),
method=list(eval=evaluation, split=split.tree,init=initialize),
control=contr2,data=agg[[ik]]$train,parms=agg[[ik]]$train)
devs[,2] = predict(trainmod,agg[[ik]]$test)
leafgps = leafgrouper(trainmod,agg[[ik]]$test)
for(ileaf in 1:length(leafgps)){
cvscript$rdata = leafgps[[ileaf]]
cvtestout = mplusModeler(cvscript,run=1L,modelout = "Model.1.inp")
devs[which(devs[,1]%in%leafgps[[ileaf]][,paste(groupingName)]),3] =
-2*cvtestout$results$summaries$LL
}
mses[ik,icp] = mean((devs[,3]-devs[,2])^2,na.rm=T)
}
}
cvcp = cps[which(colMeans(mses,na.rm=T)==min(colMeans(mses,na.rm=T)))]
return(cvcp)
}
cvcp = crossval(data=data,k=k,control=control,groupingName=groupingName,
rPartFormula=rPartFormula)
control$cp = cvcp
}
model <- list()
estimates <- list()
model.rpart = rpart(paste(groupingName,c(rPartFormula)),
method=list(eval=evaluation, split=split.tree,init=initialize),
control=control,data=data,parms=data)
model$rpart_out <- model.rpart
frame <- model.rpart$frame
node2 = row.names(frame[frame[,"var"] == "<leaf>",])
model$terminal = node2
model$where = model.rpart$where
oldwhere = names(table(model$where))
owind = vector("list",length(oldwhere))
for(i in 1:length(oldwhere)){
owind[[i]] = which(model$where == oldwhere[i])
}
for(i in 1:length(oldwhere)){
model$where = replace(model$where,owind[[i]],node2[i])
}
for(j in 1:length(table(model.rpart$where))){
id <- names(table(model.rpart$where))[j]==model.rpart$where
script$rdata = data[id,]
fit = mplusModeler(script,run=1L,modelout = "Model.1.inp")
if(se==F){estimates[[node2[j]]] = fit$results$parameters[[1]][,1:3]}
else if(se==T){estimates[[node2[j]]] = fit$results$parameters}
}
model$estimates = estimates
class(model) <- "mplustree"
return(model)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.