flexplot: Graphically Based Data Analysis Using 'flexplot'

Documented in compare.fits

##' Compare the fits of two models
##'
##' This function takes two fitted models as input and plots them to visually compare how the two differ in terms of fit.
##' It can take a \code{glm}, \code{rlm}, \code{lm}, and \code{randomForest} model (and maybe others as well). The function takes
##' a \code{\link{flexplot}}-like formula as input.  
##'	
##' @param formula A formula that can be used in flexplot. The variables inside must not include variables outside the fitted models. 
##' @param data The dataset containing the variables in formula
##' @param model1 The fitted model object (e.g., lm) containing the variables specified in the formula
##' @param model2 The second fitted model object (e.g., lm) containing the variables specified in the formula
##' @param return.preds Should the function return the predictions instead of a graphic? Defaults to F
##' @param report.se Should standard errors be reported alongside the estimates? Defaults to F. 
##' @param re Should random effects be predicted? Only applies to mixed models. Defaults to F. 
##' @param pred.type What type of predictions should be outputted? This is mostly for \code{glm} models. Defaults to "response." 
##' @param num_points Number of points used for predictions. Larger numbers = slower algorithm, but smoother predictions. 
##' @param clusters For visualizing mixed models, this specifies the number of clusters to display
##' @param ... Other parameters passed to flexplot
##' @author Dustin Fife
##' @return Either a graphic or the predictions for the specified model(s)
##' @export
##' @examples 
##' data(exercise_data)
##' mod1 = lm(weight.loss~therapy.type + motivation, data=exercise_data)
##' mod2 = lm(weight.loss~therapy.type * motivation, data=exercise_data)
##' compare.fits(weight.loss~therapy.type | motivation, data=exercise_data, mod1, mod2)
compare.fits = function(formula, data, model1, model2=NULL, 
                        return.preds=F, report.se=F, re=F, 
                        pred.type="response", num_points = 50,
                        clusters=3,...){
  
  if (is.null(model2)) runme = "yes"
  
  #### if mod2 is null..
  if (is.null(model2)) model2 = model1
  
  #### get type of model
  model1.type = class(model1)[1]
  model2.type = class(model2)[1]	

  #### get all variables
  variables_mod1 = get_terms(model1)
  variables_mod2 = get_terms(model2)
  testme = unique(c(variables_mod1$predictors, variables_mod2$predictors))
  all_variables = unique(c(variables_mod1$predictors, variables_mod2$predictors, variables_mod1$response, variables_mod2$response))
  
  if (tibble::is_tibble(data)){
    data = as.data.frame(data)
  }
  
  #### for the rare occasion where deleting missing data changes the levels...
  data = check_missing(model1, model2, data, all_variables)
  
  #### make sure, if they have lme4, both models are lme4 objects
  test_same_class(model1, model2)
  
  #### convert random effects to factors for mixed models
  data = subset_random_model(model1, formula, d=data, samp.size = clusters)

  ### make sure they have the same outcome
  if (variables_mod1$response != variables_mod2$response) {
    stop("It looks like your two models have different outcome variables. That's not permitted, my friend!")
  }
  
  ##### extract variable names from FORMULA
  variables = all.vars(formula)
  outcome = variables[1]
  predictors = variables[-1]
  
  # check for errors
  compare_fits_errors(data, outcome, predictors, testme)

  # generate predictor values
  pred.values = generate_predictors(data, formula, model1, ...)

  # ensure pred.values have same class as original data
  # but don't change RE; because prior to this there's been sampling of the data and this would revert that
  randef = extract_random_term(model1)
  
  all_predictors_minus_re = ifelse(length(randef)>0, predictors[!(predictors==randef)], predictors)

  
  # for intercept only models
  if (nrow(pred.values)==0) pred.values = data.frame("(Intercept)" = 1)
  
  pred.mod1 = generate_predictions(model1, re, pred.values, pred.type, report.se)
  

  
  # when RE = T, we should return BOTH
  ### there's no fixed effect if we don't have these lines
  
  model1.type = class(model1)[1]
  if (!exists("runme")) {
    pred.mod2 = generate_predictions(model2, re, pred.values, pred.type, report.se)
  } else {
    pred.mod2 = pred.mod1
  }

  # if they provide two models AND re=T, return just the random effects
  if (re & !exists("runme")) {
    pred.mod1 = pred.mod1[pred.mod1$model == "random effects",]
    pred.mod2 = pred.mod2[pred.mod2$model == "random effects",]
    pred.mod1$model = deparse(substitute(model1))
    pred.mod2$model = deparse(substitute(model2))
  }
  #### convert polyr back to numeric (if applicable)
  if (model1.type == "polr" | model2.type == "polr"){
    data[,outcome] = as.numeric(as.character(data[,outcome]))		
    pred.mod1$prediction = as.numeric(as.character(pred.mod1$prediction))
    pred.mod2$prediction = as.numeric(as.character(pred.mod2$prediction))		
  }
  
  #### if they have the same name, just call them model1 and model2
  if (!re){
    pred.mod1$model = paste0(deparse(substitute(model1)), " (", model1.type, ")", collapse="")
    if (pred.mod1$model[1] == pred.mod2$model[1]){
      pred.mod2$model = paste0(deparse(substitute(model2)), " (", model2.type, " 2)", collapse="")
    } else {
      pred.mod2$model = paste0(deparse(substitute(model2)), " (", model2.type, ")", collapse="")
    }
  }
  
  # if pred.mod1 has both fixed and random effects, we need to double the size of pred.values
  if (nrow(pred.mod1) == 2*nrow(pred.values)) {
    pred.values = rbind(pred.values, pred.values)
  }
  
  
  #### report one or two coefficients, depending on if they supplied it
  if (!exists("runme") | exists("old.mod")){
    prediction.model = rbind(pred.mod1, pred.mod2)
    pred.values_doubled = rbind(pred.values, pred.values)
    prediction.model = suppressWarnings(cbind(pred.values_doubled, prediction.model))
  } else {
    prediction.model = pred.mod1
    prediction.model = suppressWarnings(cbind(pred.values, prediction.model))
  }
  
  #### eliminate those predictions that are higher than the range of the data
  if (!is.factor(data[,outcome])){
    min.dat = min(data[,outcome], na.rm=T); max.dat = max(data[,outcome], na.rm=T)
    if (length(which(
        prediction.model$prediction>(max.dat))>0 |
          length(which(prediction.model$prediction<(min.dat))))){
      #prediction.model  = prediction.model[-which(prediction.model$prediction>max.dat | prediction.model$prediction<min.dat), ]
      warning("Some of the model's predicted values are beyond the range of the original y-values. 
              I'm truncating the y-axis to preserve the original scale.")
      
    }
  } else {
    #### if they supply a factor, convert it to a number!!!!!
    prediction.model$prediction = round(as.numeric(as.character(prediction.model$prediction)), digits=3)
  }
  
  #### create flexplot
  if (return.preds){
    prediction.model
  } else {
    
    ### for logistic and factor outcome variable, add one to the predictions
    # (otherwise the fitted line falls below the range of y values)
    if (should_shift_predictions(model1.type, model1, outcome, prediction.model$prediction, data)) {
      prediction.model$prediction = prediction.model$prediction + 1
    }
    

    final_geom = return_lims_geom(outcome, data, model1)
    # remove duplicate rows
    prediction.model = prediction.model[!duplicated(prediction.model),]
    #when we have an intercept only model
    if (nrow(prediction.model)==1) { prediction.model = NULL; final_geom = theme_bw() }
    
    # if outcome is a factor, for logistic regresion

    flexplot(formula, data=data, prediction=prediction.model, suppress_smooth=T, se=F, ...) +
      final_geom
  }	
  
}	

return_lims_geom = function(outcome, data, model1) {
  if (!(class(model1)[1] == "lm" | class(model1)[1] == "glm" | class(model1)[1] == "lmerMod")) return(theme_bw())
  if (family(model1)$link=="logit" & !is.numeric(data[,outcome[1]])) return(theme_bw())
  if (is.factor(data[,outcome]) | is.character(data[,outcome])) return(theme_bw())
  return(coord_cartesian(ylim=c(min(data[,outcome]), max(data[,outcome]))))
}