R/compareModels.R
In MplusAutomation: Automating Mplus Model Estimation and Interpretation

Documented in compareModels

#' Compare the output of two Mplus models
#'
#' The \code{compareModels} function compares the output of two Mplus files and prints similarities and
#' differences in the model summary statistics and parameter estimates. Options are provided
#' for filtering out fixed parameters and nonsignificant parameters. When requested, \code{compareModels}
#' will compute the chi-square difference test for nested models (does not apply to MLMV, WLSM, and WLSMV
#' estimators, where DIFFTEST in Mplus is needed).
#' Model outputs to be compared can be full summaries and parameters (generated by \code{readModels}),
#' summary statistics only (\code{extractModelSummaries}), or parameters only (\code{extractModelParameters}).
#'
#'
#' The \code{show} parameter can be one or more of the following, which can be passed as a vector, such as c("equal", "pdiff").
#'
#' \describe{
#'   \item{show}{
#'     \describe{
#'       \item{"all"}{Display all available model comparison. Equivalent to
#'         c("summaries", "equal", "diff", "pdiff", "unique").}
#'       \item{"summaries"}{Print a comparison of model summary statistics. Compares the following summary
#'         statistics (where available): c("Title", "Observations", "Estimator", "Parameters", "LL",
#'         "AIC", "BIC", "ChiSqM_Value", "ChiSqM_DF", "CFI", "TLI", "RMSEA", "SRMR", "WRMR")}
#' 	 \item{"allsummaries"}{Prints a comparison of all summary statistics available in each model. May
#'         generate a lot of output.}
#'       \item{"equal"}{Print parameter estimates that are equal between models
#'         (i.e., \code{<= equalityMargin["param"]})}.
#'       \item{"diff"}{Print parameter estimates that are different between models
#'         (i.e., \code{> equalityMargin["param"]})}.
#'       \item{"pdiff"}{Print parameter estimates where the p-values differ between models
#'         (i.e., \code{> equalityMargin["pvalue"]})}.
#'       \item{"unique"}{Print parameter estimates that are unique to each model.}
#'     }
#'   }
#' }
#'
#' The \code{sort} parameter determines the order in which parameter estimates are displayed. The following options are available:
#'
#' \describe{
#'   \item{sort}{
#'     \describe{
#'       \item{"none"}{No sorting is performed, so parameters are output in the order presented in Mplus. (Default)}
#'       \item{"type"}{Sort parameters by their role in the model. This groups output by regression coefficient (ON),
#'         factor loadings (BY), covariances (WITH), and so on. Within each type, output is alphabetical.}
#'       \item{"alphabetical"}{Sort parameters in alphabetical order.}
#'       \item{"maxDiff"}{Sort parameter output by the largest differences between models (high to low).}
#'     }
#'   }
#' }
#'
#' @param m1 The first Mplus model to be compared. Generated by \code{readModels},
#'   	\code{extractModelSummaries}, or \code{extractModelParameters}.
#' @param m2 The second Mplus model to be compared.
#' @param show What aspects of the models should be compared. Options are
#'   	"all", "summaries", "equal", "diff", "pdiff", and "unique". See below for details.
#' @param equalityMargin Defines the discrepancy between models that is considered equal.
#'   	Different margins can be specified for p-value equality
#'   	versus parameter equality. Defaults to .0001 for both.
#' @param sort How to sort the output of parameter comparisons.
#'   	Options are "none", "type", "alphabetical", and "maxDiff". See below for details.
#' @param showFixed Whether to display fixed parameters in the output (identified
#'   	where the est/se = 999.000, per Mplus convention). Default to \code{FALSE}.
#' @param showNS Whether to display non-significant parameter estimates. Can be
#'   	\code{TRUE} or \code{FALSE}, or a numeric value (e.g., .10) that defines
#' 	what p-value is filtered as non-significant.
#' @param diffTest Whether to compute a chi-square difference test between the models.
#'   	Assumes that the models are nested. Not available for MLMV, WLSMV, and ULSMV estimators.
#'   	Use DIFFTEST in Mplus instead.
#' @return No value is returned by this function. It is used to print model differences to the R console.
#' @author Michael Hallquist
#' @keywords interface
#' @importFrom plyr rbind.fill rename
#' @export
#' @examples
#'
#' # make me!!!
compareModels <- function(m1, m2, show="all", equalityMargin=c(param=0.0001, pvalue=0.0001), sort="none", showFixed=FALSE, showNS=TRUE, diffTest=FALSE) {
  # TODO: Make work with standardized parameter estimates
  #options for show:
  # -all: print equal params, unequal params, and unique params
  # -unique: print only unique params
  # -diff: print parameters that differ and unique params
  # -pdiff: print parameters where pvalues differ and unique params
  # -equal: print parameters that are equal
  # -summaries: print comparison of summaries
  # -allsummaries

  # defaults to show equal and unequal params
  #require(plyr)

  # if a single scalar is passed in with no name, then use that value for param and pvalue diffs
  if (length(equalityMargin) == 1) {
    if (is.null(names(equalityMargin)))
      equalityMargin <- c(param=equalityMargin, pvalue=equalityMargin)
    else {
      if (!"param" %in% names(equalityMargin))
        equalityMargin["param"] <- .0001
      if (!"pvalue" %in% names(equalityMargin))
        equalityMargin["pvalue"] <- .0001
    }
  }

  cat("\n==============\n\nMplus model comparison\n----------------------\n\n")

  if (!is.null(fn <- attr(m1, "filename")))
    cat("------\nModel 1: ", fn, "\n")

  if (!is.null(fn <- attr(m2, "filename")))
    cat("Model 2: ", fn, "\n------\n")

  if (inherits(m1, c("mplus.summaries", "mplus.model")) &&
      inherits(m2, c("mplus.summaries", "mplus.model")) &&
      any(c("all", "allsummaries", "summaries") %in% show)) {

    # compare summaries
    if (inherits(m1, "mplus.model")) m1Summaries <- m1$summaries
    else m1Summaries <- m1 #just copy summaries as is

    if (inherits(m2, "mplus.model")) m2Summaries <- m2$summaries
    else m2Summaries <- m2 #just copy summaries as is

    cat("\nModel Summary Comparison\n------------------------\n\n")

    # compare basic stuff

    #sumCombined <- plyr::rbind.fill(m1Summaries, m2Summaries)
    sumCombined <- rbind.fill(m1Summaries, m2Summaries)

    if (!"allsummaries" %in% show) {
      comparators <- c("Title", "Observations", "Estimator", "Parameters",
                       "LL", "AIC", "BIC", "ChiSqM_Value", "ChiSqM_DF",
                       "CFI", "TLI", "RMSEA_Estimate", "SRMR", "WRMR")

      sumCombined <- sumCombined[, intersect(comparators, names(sumCombined))]
    }

    # manual loop seems easier here (tried apply, sapply combos... too painful)
    sumCombinedL <- as.list(sumCombined)
    nameList <- names(sumCombinedL)
    totalPad <- 0

    for (var in 1:length(sumCombinedL)) {
      sumCombinedL[[var]] <- lapply(sumCombinedL[[var]], strwrap, width=40, exdent=2)
      len1 <- length(sumCombinedL[[var]][[1]])
      len2 <- length(sumCombinedL[[var]][[2]])
      if (len1 > len2) sumCombinedL[[var]][[2]] <- c(sumCombinedL[[var]][[2]], rep("", len1-len2))
      else if (len2 > len1) sumCombinedL[[var]][[1]] <- c(sumCombinedL[[var]][[1]], rep("", len2-len1))
      padLen <- max(len2, len1) - 1
      if ((var+totalPad+1) <= length(nameList))
        nameList <- c(nameList[1:(var+totalPad)], rep("", padLen), nameList[(var+totalPad+1):length(nameList)])
      else
        nameList <- c(nameList[1:(var+totalPad)], rep("", padLen))

      totalPad <- totalPad + padLen
    }

    wrappedOutput <- cbind(m1=do.call("c", lapply(sumCombinedL, "[[", 1)),
        m2=do.call("c", lapply(sumCombinedL, "[[", 2)))

    nameList <- encodeString(nameList, justify="left", width=NA)
    rownames(wrappedOutput) <- nameList

    print(wrappedOutput, quote=FALSE)

    #chi-square difference testing
    if (diffTest == TRUE) {
      if ("ChiSqDiffTest_Value" %in% unique(c(names(m1Summaries), names(m2Summaries)))) {
        #computed using difftest
        if ("ChiSqDiffTest_Value" %in% names(m1Summaries)) {
          val <- m1Summaries$ChiSqDiffTest_Value
          dfDiff <- m1Summaries$ChiSqDiffTest_DF
          pval <- m1Summaries$ChiSqDiffTest_PValue
        } else {
          val <- m2Summaries$ChiSqDiffTest_Value
          dfDiff <- m2Summaries$ChiSqDiffTest_DF
          pval <- m2Summaries$ChiSqDiffTest_PValue
        }

        cat("\n  Chi-Square Difference Test for Nested Models using DIFFTEST in Mplus\n  -----------------------------------------------------------------------\n\n")
        cat("  Chi-square difference: ", round(val, 4), "\n")
        cat("  Diff degrees of freedom: ", dfDiff, "\n")
        cat("  P-value: ", round(pval, 4), "\n")

      } else if (m1Summaries$Estimator %in% c("ML", "MLM", "MLR", "WLS", "WLSM") && m1Summaries$Parameters != m2Summaries$Parameters) {
        if (m1Summaries$Estimator == m2Summaries$Estimator ) {
          if (m1Summaries$Parameters < m2Summaries$Parameters) H0m <- m1Summaries
          else H0m <- m2Summaries

          if (m1Summaries$Parameters > m2Summaries$Parameters) H1m <- m1Summaries
          else H1m <- m2Summaries

          if (m1Summaries$Estimator == "MLR") {
            #Chi-square difference test for MLR models based on loglikelihood
            cd <- (H0m$Parameters*H0m$LLCorrectionFactor - H1m$Parameters*H1m$LLCorrectionFactor)/(H0m$Parameters - H1m$Parameters)

            ChiSqDiff <- -2*(H0m$LL - H1m$LL)/cd
            #use difference in the number of parameters for dfDiff since for some models, only LL and Params are available.
            dfDiff <- (H1m$Parameters - H0m$Parameters)

            cat("\n  MLR Chi-Square Difference Test for Nested Models Based on Loglikelihood\n  -----------------------------------------------------------------------\n\n")
            cat("  Difference Test Scaling Correction: ", cd, "\n")
            cat("  Chi-square difference: ", round(ChiSqDiff, 4), "\n")
            cat("  Diff degrees of freedom: ", dfDiff, "\n")
            cat("  P-value: ", round(pchisq(ChiSqDiff, dfDiff, lower.tail=FALSE), 4), "\n")
            cat("\n  Note: The chi-square difference test assumes that these models are nested.\n  It is up to you to verify this assumption.\n")
          }
          if (m1Summaries$Estimator %in% c("ML", "MLR", "MLM", "WLSM")) {
            if (m1Summaries$Estimator %in% c("ML", "WLS"))
              #for ML and WLS,  no need to correct chi-square with scaling factors
              ChiSqDiff <- H0m$ChiSqM_Value - H1m$ChiSqM_Value
            else {
              cd <- (H0m$ChiSqM_DF*H0m$ChiSqM_ScalingCorrection - H1m$ChiSqM_DF*H1m$ChiSqM_ScalingCorrection)/(H0m$ChiSqM_DF - H1m$ChiSqM_DF)
              ChiSqDiff <- (H0m$ChiSqM_Value*H0m$ChiSqM_ScalingCorrection - H1m$ChiSqM_Value*H1m$ChiSqM_ScalingCorrection)/cd
            }

            dfDiff <- (H0m$ChiSqM_DF - H1m$ChiSqM_DF)

            cat("\n  ", m1Summaries$Estimator, " Chi-Square Difference test for nested models\n  --------------------------------------------\n\n", sep="")
            if (! m1Summaries$Estimator %in% c("WLS", "ML")) cat("  Difference Test Scaling Correction:", cd, "\n")
            cat("  Chi-square difference:", round(ChiSqDiff, 4), "\n")
            cat("  Diff degrees of freedom:", dfDiff, "\n")
            cat("  P-value:", round(pchisq(ChiSqDiff, dfDiff, lower.tail=FALSE), 4), "\n")
            cat("\nNote: The chi-square difference test assumes that these models are nested.\n  It is up to you to verify this assumption.\n")
          }
        } else
          warning("Cannot compute chi-square difference test because different estimators used: ", m1Summaries$Estimator, " and ", m2Summaries$Estimator)
      }
    }
  }


  ###PARAMETER COMPARISON
  #will blow up in cases where std output with no est_se or pval

  #fuzzy equality
  testEquality <- function(v1, v2, margin=0.0000) {
    if (length(v1) != length(v2)) stop ("v1 length != v2 length")

    if (!is.numeric(v1) || !is.numeric(v2)) stop("Non-numeric arguments passed to testEquality.")

    retVec <- aaply(cbind(v1, v2), 1, function(row) {
          if(abs(row["v1"]-row["v2"]) <= margin) return(TRUE)
          else return(FALSE)
        })

    return(retVec)
  }

  if (inherits(m1, c("mplus.model", "mplus.params")) &&
      inherits(m2, c("mplus.model", "mplus.params")) &&
      any(c("all", "equal", "diff", "pdiff", "unique") %in% show)) {

    if (inherits(m1, "mplus.model")) m1Params <- m1$parameters$unstandardized
    else m1Params <- m1

    if (inherits(m2, "mplus.model")) m2Params <- m2$parameters$unstandardized
    else m2Params <- m2

	#check for multiple groups, latent classes, and multilevel output
	uniqueParamCols <- c("paramHeader", "param")

  if ("Group" %in% names(m1Params)) {
    uniqueParamCols <- c(uniqueParamCols, "Group")
    multipleGroups <- TRUE
  } else multipleGroups <- FALSE

  if ("LatentClass" %in% names(m1Params)) {
    uniqueParamCols <- c(uniqueParamCols, "LatentClass")
    latentClasses <- TRUE
  } else latentClasses <- FALSE

  if ("BetweenWithin" %in% names(m1Params)) {
    uniqueParamCols <- c(uniqueParamCols, "BetweenWithin")
    multilevel <- TRUE
  } else multilevel <- FALSE

  #match up paramheader and param combinations
  m1Params$paramCombined <- apply(m1Params[,uniqueParamCols], 1, paste, collapse=".")
  m2Params$paramCombined <- apply(m2Params[,uniqueParamCols], 1, paste, collapse=".")

  #identify columns present in both data.frames
  matchCols <- intersect(m1Params$paramCombined, m2Params$paramCombined)
  m1Only <- setdiff(m1Params$paramCombined, m2Params$paramCombined)
  m2Only <- setdiff(m2Params$paramCombined, m1Params$paramCombined)

  #m1Params <- plyr::rename(m1Params, c(est="m1_est", se="m1_se", est_se="m1_est_se", pval="m1_pval"))
  #m2Params <- plyr::rename(m2Params, c(est="m2_est", se="m2_se", est_se="m2_est_se", pval="m2_pval"))
  m1Params <- rename(m1Params, c(est="m1_est", se="m1_se", est_se="m1_est_se", pval="m1_pval"))
  m2Params <- rename(m2Params, c(est="m2_est", se="m2_se", est_se="m2_est_se", pval="m2_pval"))

  #perhaps merge on uniqueParamCols (ensures that any group, bw/wi, and latent classes match)
  #matchDF <- merge(m1Params[m1Params$paramCombined %in% matchCols,], m2Params[m2Params$paramCombined %in% matchCols,], by=c("paramHeader", "param"), all.x=TRUE, all.y=TRUE)
  matchDF <- merge(m1Params[m1Params$paramCombined %in% matchCols, ],
                   m2Params[m2Params$paramCombined %in% matchCols, ],
                   by=uniqueParamCols, all.x=TRUE, all.y=TRUE)
  matchDF <- subset(matchDF, select=c(-paramCombined.x, -paramCombined.y)) #don't retain the matching column

  #remove fixed parameters (only if fixed in both models)
  if (!showFixed) {
    matchDF <- subset(matchDF, !(m1_est_se == 999.000 & m2_est_se == 999.000))
    #N.B. I'm dubious about removing fixed from m1/m2 params because then the "unique" output omits these
    m1Params <- subset(m1Params, !m1_est_se == 999.000)
    m2Params <- subset(m2Params, !m2_est_se == 999.000)
  }

  #remove non-significant effects, if requested
  if (is.logical(showNS) && showNS==FALSE) showNS <- .05 #default alpha for filtering N.S. results
  if (!showNS==TRUE) {
    matchDF <- subset(matchDF, !(m1_pval > showNS & m2_pval > showNS))
    m1Params <- subset(m1Params, !(m1_pval > showNS))
    m2Params <- subset(m2Params, !(m2_pval > showNS))
  }

  if (sort=="type") {
      #add a leading . to the replacement to filter on, by, and with to the top of the results output
      paramType <- sapply(matchDF$paramHeader, sub, pattern=".*\\.(ON|WITH|BY)$", replacement=".\\1", ignore.case=TRUE, perl=TRUE)
      matchDF <- matchDF[order(paramType, matchDF$paramHeader, matchDF$param), ]
    }
    else if (sort=="alphabetical") {
      matchDF <- matchDF[order(matchDF$paramHeader, matchDF$param),]
    }

    #order by each parameter match (could make optional)
    matchDF <- matchDF[,c(uniqueParamCols, "m1_est", "m2_est", "m1_se", "m2_se", "m1_est_se", "m2_est_se", "m1_pval", "m2_pval")]
    matchDF <- cbind(matchDF[,c(uniqueParamCols, "m1_est", "m2_est")], .=rep("|", nrow(matchDF)),
        matchDF[,c("m1_se", "m2_se")], .=rep("|", nrow(matchDF)),
        matchDF[,c("m1_est_se", "m2_est_se")], .=rep("|", nrow(matchDF)),
        matchDF[,c("m1_pval", "m2_pval")])

    cat("\n=========\n\nModel parameter comparison\n--------------------------\n")
    paramsEqual <- with(matchDF, testEquality(m1_est, m2_est, equalityMargin["param"]))
    pvalsEqual <- with(matchDF, testEquality(m1_pval, m2_pval, equalityMargin["pvalue"]))

    if (any(c("all", "equal") %in% show)) {
      cat("  Parameters present in both models\n=========\n\n")
      cat("  Equal in both models (param. est. diff <= ", equalityMargin["param"], ")\n\n")
      if (any(paramsEqual))
        print(matchDF[paramsEqual==TRUE, ], row.names=FALSE)
      else
        cat("None\n")
    }

    if (any(c("all", "diff") %in% show)) {
      cat("\n\n  Parameter estimates that differ between models (param. est. diff > ", equalityMargin["param"], ")\n  ----------------------------------------------\n", sep="")
      if (any(!paramsEqual)) {
        diffDF <- matchDF[paramsEqual==FALSE,]
        if (sort=="maxDiff") {
          paramDiff <- abs(diffDF$m1_est - diffDF$m2_est)
          diffDF <- diffDF[order(paramDiff, decreasing=TRUE),]
        }
        print(diffDF, row.names=FALSE)
      }
      else
        cat("None\n")
    }

    if (any(c("all", "pdiff") %in% show)) {
      cat("\n\n  P-values that differ between models (p-value diff > ", equalityMargin["pvalue"], ")\n  -----------------------------------\n", sep="")
      if (any(!pvalsEqual)) {
        diffDF <- matchDF[pvalsEqual==FALSE,]
        if (sort=="maxDiff") {
          paramDiff <- abs(diffDF$m1_pval - diffDF$m2_pval)
          diffDF <- diffDF[order(paramDiff, decreasing=TRUE),]
        }
        print(diffDF, row.names=FALSE)
      }
      else
        cat("None\n")
    }

    #sort doesn't affect this because it uses matchDF... sort earlier?
    if (any(c("unique", "all") %in% show)) {
      cat("\n\n  Parameters unique to model 1: ", length(m1Only), "\n  -----------------------------\n\n", sep="")
      if (length(m1Only) > 0) {
        m1Remaining <- sum(m1Params$paramCombined %in% m1Only)
        nFiltered <- length(m1Only) - m1Remaining

        if (m1Remaining > 0)
          print(m1Params[m1Params$paramCombined %in% m1Only,!names(m1Params)=="paramCombined"], row.names=FALSE)

        if (nFiltered > 0) {
          cat("\n\n  ", nFiltered, " filtered from output (fixed and/or n.s.)\n\n", sep="")
          filtered <- setdiff(m1Only, m1Params$paramCombined)
          cat(paste(strwrap(paste(filtered, collapse=", "), width=80, indent=4, exdent=4), collapse="\n"), "\n\n")
        }
      }
      else
        cat("  None\n\n")

      cat("\n  Parameters unique to model 2: ", length(m2Only), "\n  -----------------------------\n\n", sep="")
      if (length(m2Only) > 0) {
        m2Remaining <- sum(m2Params$paramCombined %in% m2Only)
        nFiltered <- length(m2Only) - m2Remaining

        if (m2Remaining > 0)
          print(m2Params[m2Params$paramCombined %in% m2Only,!names(m2Params)=="paramCombined"], row.names=FALSE)

        if (nFiltered > 0) {
          cat("\n\n  ", nFiltered, " filtered from output (fixed and/or n.s.)\n\n", sep="")
          filtered <- setdiff(m2Only, m2Params$paramCombined)
          cat(paste(strwrap(paste(filtered, collapse=", "), width=80, indent=4, exdent=4), collapse="\n"), "\n\n")
        }

      }
      else
        cat(" None\n\n")
    }

  }

  cat("\n==============\n")
}