R/regr.R
In ufs: A Collection of Utilities

### Not exported - normally in `rosetta`
regr <- function(formula, data=NULL, conf.level=.95, digits=2,
                 pvalueDigits = 3, coefficients=c("raw", "scaled"),
                 plot=FALSE, pointAlpha = .5,
                 collinearity = FALSE, influential = FALSE,
                 ci.method = c("widest", "r.con", "olkinfinn"),
                 ci.method.note = FALSE, env=parent.frame()) {

  dat <- data;

  ### Generate object to store input, intermediate outcomes, and results
  res <- list(input = as.list(environment()),
              intermediate = list(),
              output = list());

  ### Extract variables from formula
  res$intermediate$variableNames <- all.vars(formula);

  ### Convert formula to a character string
  res$intermediate$formula.as.character <-
    paste0(as.character(formula)[c(2, 1, 3)], collapse=" ");

  if (is.null(dat)) {
    ### Extract variables from formula
    res$intermediate$variables <-
      as.character(as.list(attr(stats::terms(formula), 'variables'))[-1]);

    ### Store variablesnames only for naming in raw dataframe
    res$intermediate$variables_namesOnly <- unlist(
      lapply(strsplit(res$intermediate$variables, "\\$"), utils::tail, 1));

    ### Store variables in a dataframe
    res$intermediate$dat.raw <- list();
    for (varToGet in 1:length(res$intermediate$variables)) {
      res$intermediate$dat.raw[[res$intermediate$variables_namesOnly[varToGet]]] <-
        eval(parse(text=res$intermediate$variables[varToGet]), envir=env);
    }
    ### Convert the list to a dataframe
    res$intermediate$dat.raw <- data.frame(res$intermediate$dat.raw);

    ### Convert variable names to bare variable names
    for (currentVariableIndex in 1:length(res$intermediate$variables)) {
      res$intermediate$formula.as.character <-
        gsub(res$intermediate$variables[currentVariableIndex],
             res$intermediate$variables_namesOnly[currentVariableIndex],
             res$intermediate$formula.as.character, fixed=TRUE);
    }

  } else {
    ### Store variables in a dataframe
    res$intermediate$dat.raw <- dat[, res$intermediate$variableNames];
    res$intermediate$variables_namesOnly <- res$intermediate$variableNames;
  }

  res$intermediate$formula <- formula(res$intermediate$formula.as.character,
                                      env = environment());

  ### Standardize numeric variables
  res$intermediate$dat.scaled <- res$intermediate$dat.raw;
  res$intermediate$dat.scaled[, sapply(res$intermediate$dat.scaled, is.numeric)] <-
    as.data.frame(scale(res$intermediate$dat.scaled[, sapply(res$intermediate$dat.scaled, is.numeric)]));

  ### Run and store lm objects
  res$intermediate$lm.raw <-
    stats::lm(formula=res$intermediate$formula, data=res$intermediate$dat.raw);
  res$intermediate$lm.scaled <-
    stats::lm(formula=res$intermediate$formula, data=res$intermediate$dat.scaled);

  ### R^2 confidence interval based on formula at
  ### http://www.danielsoper.com/statcalc3/calc.aspx?id=28
  res$intermediate$rsq <- rsq <- summary(res$intermediate$lm.raw)$r.squared;
  res$intermediate$k <- k <- length(res$intermediate$lm.raw$terms) - 1;
  res$intermediate$n <- n <- res$intermediate$lm.raw$df.residual + k + 1;
  res$intermediate$rsq.se <- sqrt((4*rsq*(1-rsq)^2*(n-k-1)^2)/
                                  ((n^2-1)*(3+n)));
  res$intermediate$rsq.t.crit <- stats::qt(p=1-(1-conf.level)/2, df=n-k-1);
  res$output$rsq.ci.olkinfinn <- c(res$intermediate$rsq -
                                   res$intermediate$rsq.t.crit *
                                   res$intermediate$rsq.se,
                                 res$intermediate$rsq +
                                   res$intermediate$rsq.t.crit *
                                   res$intermediate$rsq.se);

  res$output$rsq.ci.olkinfinn <- ifelse(res$output$rsq.ci.olkinfinn < 0,
                                        0, res$output$rsq.ci.olkinfinn);

  res$output$rsq.ci.r.con <- psych::r.con(sqrt(rsq), n);
  res$output$rsq.ci.r.con <- ifelse(res$output$rsq.ci.r.con < 0,
                                    0, res$output$rsq.ci.r.con) ^ 2;

  if ("widest" %IN% ci.method) {
    res$output$rsq.ci <- ifelse(range(res$output$rsq.ci.r.con) >
                                range(res$output$rsq.ci.olkinfinn),
                                res$output$rsq.ci.r.con,
                                res$output$rsq.ci.olkinfinn);
  } else if ("r.con" %IN% ci.method) {
    res$output$rsq.ci <- res$output$rsq.ci.r.con;
  } else {
    res$output$rsq.ci <- res$output$rsq.ci.olkinfinn;
  }


  ### Run confint on lm object
  res$intermediate$confint.raw <-
    stats::confint(res$intermediate$lm.raw, level=conf.level);
  res$intermediate$confint.scaled <-
    stats::confint(res$intermediate$lm.scaled, level=conf.level);

  ### Run lm.influence on lm object
  res$intermediate$lm.influence.raw <-
    stats::lm.influence(res$intermediate$lm.raw);
  res$intermediate$lm.influence.scaled <-
    stats::lm.influence(res$intermediate$lm.scaled);

  ### Get variance inflation factors and compute tolerances
  if (collinearity && (length(res$intermediate$variables) > 2)) {
    if (requireNamespace('car')) {
      res$intermediate$vif.raw <- car::vif(res$intermediate$lm.raw);
      res$intermediate$vif.scaled <- car::vif(res$intermediate$lm.scaled);
      if (is.vector(res$intermediate$vif.raw)) {
        res$intermediate$tolerance.raw <- 1/res$intermediate$vif.raw;
        res$intermediate$tolerance.scaled <- 1/res$intermediate$vif.scaled;
      }
    } else {
      warning(paste0("You need to have package 'car' installed to ",
                     "request collineary diagnostics!"));
      res$intermediate$vif.raw <- NA;
      res$intermediate$vif.scaled <- NA;
    }
  }

  ### get summary for lm objects
  res$intermediate$summary.raw <- summary(res$intermediate$lm.raw);
  res$intermediate$summary.scaled <- summary(res$intermediate$lm.scaled);

  ### Generate output
  res$output$coef.raw <- cbind(data.frame(res$intermediate$confint.raw[, 1],
                                          res$intermediate$confint.raw[, 2]),
                               res$intermediate$summary.raw$coefficients);
  res$output$coef.scaled <- cbind(data.frame(res$intermediate$confint.scaled[, 1],
                                             res$intermediate$confint.scaled[, 2]),
                                  res$intermediate$summary.scaled$coefficients);

  names(res$output$coef.raw) <- names(res$output$coef.scaled) <-
    c(paste0(conf.level*100,"% CI, lo"),
      paste0(conf.level*100,"% CI, hi"),
      'estimate', 'se', 't', 'p');

  if (plot) {
    if (length(res$intermediate$variables_namesOnly) == 2) {
      res$output$plot <-
        ggplot2::ggplot(res$intermediate$dat.raw,
                        ggplot2::aes_string(y=res$intermediate$variables_namesOnly[1],
                        x=res$intermediate$variables_namesOnly[2])) +
        ggplot2::geom_point(alpha = pointAlpha) +
        ggplot2::geom_smooth(method='lm') +
        ggplot2::theme_bw();
    } else if (length(res$intermediate$variables_namesOnly) == 3) {

      if (is.numeric(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[2]]) &&
          is.numeric(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[3]])) {
        ### Both numeric, so we treat the second one as moderator and compute the line

        predictorName <- res$intermediate$variables_namesOnly[2];
        moderatorName <- res$intermediate$variables_namesOnly[3];

        predictorMean <- mean(res$intermediate$dat.raw[, predictorName],
                              na.rm=TRUE);
        predictorSD <- stats::sd(res$intermediate$dat.raw[, predictorName],
                          na.rm=TRUE);
        # loPredictorValue <- predictorMean - predictorSD;
        # hiPredictorValue <- predictorMean + predictorSD;
        loPredictorValue <- min(res$intermediate$dat.raw[, predictorName],
                                na.rm=TRUE);
        hiPredictorValue <- max(res$intermediate$dat.raw[, predictorName],
                                na.rm=TRUE);

        moderatorMean <- mean(res$intermediate$dat.raw[, moderatorName],
                              na.rm=TRUE);
        moderatorSD <- stats::sd(res$intermediate$dat.raw[, moderatorName],
                                 na.rm=TRUE);
        loModeratorValue <- moderatorMean - moderatorSD;
        hiModeratorValue <- moderatorMean + moderatorSD;

        intercept <- res$output$coef.raw['(Intercept)', 'estimate'];
        predictorSlope <- res$output$coef.raw[predictorName, 'estimate'];
        moderatorSlope <- res$output$coef.raw[moderatorName, 'estimate'];
        interactionSlope <- res$output$coef.raw[paste0(predictorName,
                                                       ":",
                                                       moderatorName),
                                                'estimate'];

        depVarValue_loPred_loMod <- intercept +
          loPredictorValue * predictorSlope +
          loModeratorValue * moderatorSlope +
          loPredictorValue * loModeratorValue * interactionSlope;
        depVarValue_loPred_hiMod <- intercept +
          loPredictorValue * predictorSlope +
          hiModeratorValue * moderatorSlope +
          loPredictorValue * hiModeratorValue * interactionSlope;
        depVarValue_hiPred_loMod <- intercept +
          hiPredictorValue * predictorSlope +
          loModeratorValue * moderatorSlope +
          hiPredictorValue * loModeratorValue * interactionSlope;
        depVarValue_hiPred_hiMod <- intercept +
          hiPredictorValue * predictorSlope +
          hiModeratorValue * moderatorSlope +
          hiPredictorValue * hiModeratorValue * interactionSlope;

        moderatorDat <- data.frame(x = c(loPredictorValue, loPredictorValue),
                                   xend = c(hiPredictorValue, hiPredictorValue),
                                   y = c(depVarValue_loPred_loMod, depVarValue_loPred_hiMod),
                                   yend = c(depVarValue_hiPred_loMod, depVarValue_hiPred_hiMod),
                                   modVal = c("Low", "High"));
        names(moderatorDat)[5] <- res$intermediate$variables_namesOnly[3];

        res$output$plot <-
          ggplot2::ggplot(res$intermediate$dat.raw,
                          ggplot2::aes_string(y=res$intermediate$variables_namesOnly[1],
                                              x=res$intermediate$variables_namesOnly[2])) +
          ggplot2::geom_point(alpha = pointAlpha) +
          ggplot2::geom_smooth(method='lm') +
          ggplot2::theme_bw() +
          ggplot2::geom_segment(data = moderatorDat,
                                ggplot2::aes_string(x = 'x', xend = 'xend',
                                                    y = 'y', yend = 'yend',
                                                    group = res$intermediate$variables_namesOnly[3],
                                                    color = res$intermediate$variables_namesOnly[3]),
                                size=1) +
          ggplot2::scale_color_viridis_d();

      } else if ((is.numeric(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[2]]) &&
                 (is.factor(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[3]]))) ||
                 (is.numeric(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[3]]) &&
                 (is.factor(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[2]])))) {
        ### One of the predictors is a factor, so we jsut plot lines for each value
        if (is.numeric(res$intermediate$dat.raw[, res$intermediate$variables_namesOnly[2]])) {
          predictor <- res$intermediate$variables_namesOnly[2];
          moderator <- res$intermediate$variables_namesOnly[3];
        } else {
          predictor <- res$intermediate$variables_namesOnly[3];
          moderator <- res$intermediate$variables_namesOnly[2];
        }
        res$output$plot <-
          ggplot2::ggplot(res$intermediate$dat.raw,
                          ggplot2::aes_string(y=res$intermediate$variables_namesOnly[1],
                                              x=predictor)) +
          ggplot2::geom_point(alpha = pointAlpha) +
          ggplot2::geom_smooth(method='lm') +
          ggplot2::theme_bw() +
          ggplot2::geom_smooth(ggplot2::aes_string(y=res$intermediate$variables_namesOnly[1],
                                                   x=predictor,
                                                   group=moderator,
                                                   color=moderator),
                               method="lm",
                               se = FALSE) +
          ggplot2::scale_color_viridis_d();
      }
    } else {
      warning("You requested a plot, but for now plots are ",
              "only available for regression analyses with one predictor.");
    }
  }

  ### Use Z = (b1 - b2) / sqrt(SE_b1^2 + SE_b2^2) to test whether
  ### the coefficients differ; add arguments such as
  ### compareCoefficients=FALSE, p.adjust="fdr" to enable user to
  ### request & correct this.

  class(res) <- 'regr';
  return(res);

}

#' @export
print.regr <- function(x, digits=x$input$digits,
                       pvalueDigits=x$input$pvalueDigits, ...) {

  cat(paste0("Regression analysis for formula: ",
             x$intermediate$formula.as.character, "\n\n",
             "Significance test of the entire model (all predictors together):\n",
             "  Multiple R-squared: [",
             round(x$output$rsq.ci[1], digits), ", ",
             round(x$output$rsq.ci[2], digits),
             "] (point estimate = ",
             round(x$intermediate$summary.raw$r.squared, digits),
             ", adjusted = ",
             round(x$intermediate$summary.raw$adj.r.squared, digits), ")",
             ifelse(x$input$ci.method.note, "*\n", "\n"),
             "  Test for significance: F[",
             x$intermediate$summary.raw$fstatistic[2], ", ",
             x$intermediate$summary.raw$fstatistic[3], "] = ",
             round(x$intermediate$summary.raw$fstatistic[1], digits),
             ", ", ufs::formatPvalue(stats::pf(x$intermediate$summary.raw$fstatistic[1],
                                               x$intermediate$summary.raw$fstatistic[2],
                                               x$intermediate$summary.raw$fstatistic[3],
                                               lower.tail=FALSE), digits=pvalueDigits), "\n"));
  if ("raw" %in% x$input$coefficients) {
    cat("\nRaw regression coefficients (unstandardized beta values, called 'B' in SPSS):\n\n");
    tmpDat <- round(x$output$coef.raw[, 1:5], digits);
    tmpDat[[1]] <- paste0("[", tmpDat[[1]], "; ", tmpDat[[2]], "]");
    tmpDat[[2]] <- NULL;
    names(tmpDat)[1] <- paste0(x$input$conf.level*100, "% conf. int.");
    tmpDat$p <- ufs::formatPvalue(x$output$coef.raw$p,
                                  digits=pvalueDigits,
                                  includeP=FALSE);
    print(tmpDat, ...);
  }
  if ("scaled" %in% x$input$coefficients) {
    cat("\nScaled regression coefficients (standardized beta values, called 'Beta' in SPSS):\n\n");
    tmpDat <- round(x$output$coef.scaled[, 1:5], digits);
    tmpDat[[1]] <- paste0("[", tmpDat[[1]], "; ", tmpDat[[2]], "]");
    tmpDat[[2]] <- NULL;
    names(tmpDat)[1] <- paste0(x$input$conf.level*100, "% conf. int.");
    tmpDat$p <- ufs::formatPvalue(x$output$coef.scaled$p,
                                  digits=pvalueDigits,
                                  includeP=FALSE);
    print(tmpDat, ...);
  }

  if (x$input$collinearity && (!is.null(x$intermediate$vif.raw))) {
    cat0("\nCollinearity diagnostics:\n\n");
    if (is.vector(x$intermediate$vif.raw)) {
      cat0("  For the raw regression coefficients:\n\n");
      collinearityDat <- data.frame(VIF = x$intermediate$vif.raw,
                                    Tolerance = x$intermediate$tolerance.raw);
      row.names(collinearityDat) <- paste0(ufs::repStr(4), names(x$intermediate$vif.raw));
      print(collinearityDat);
      cat0("\n  For the standardized regression coefficients:\n\n");
      collinearityDat <- data.frame(VIF = x$intermediate$vif.scaled,
                                    Tolerance = x$intermediate$tolerance.scaled);
      row.names(collinearityDat) <- paste0(ufs::repStr(4), names(x$intermediate$vif.raw));
      print(collinearityDat);
    }
  }

  ciMsg <- "\n* Note that the confidence interval for R^2 is based on ";
  if (x$input$ci.method[1] == 'r.con') {
    ciMsg <- paste0(ciMsg,
                    "the confidence interval for the Pearson Correlation of ",
                    "the multiple correlation using r.con from the 'psych' ",
                    "package because that was specified using the 'ci.method' ",
                    "argument.");
  } else if (x$input$ci.method[1] == 'olkinfinn') {
    ciMsg <- paste0(ciMsg,
                    "the formula reported by Olkin and Finn (1995) in their Correlation ",
                    "Redux paper, because this was specified using the 'ci.method' ",
                    "argument. This may not work well for very low values. Set the ",
                    "argument to 'widest' to also compute the confidence interval ",
                    "of the multiple correlation using the r.con function from ",
                    "the 'psych' package and selecting the widest interval.");
  } else if (identical(x$output$rsq.ci, x$output$rsq.ci.r.con)) {
    ciMsg <- paste0(ciMsg,
                    "the confidence interval for the Pearson Correlation of ",
                    "the multiple correlation using r.con from the 'psych' ",
                    "package because that was the widest interval, which ",
                    "should be used because the 'ci.method' was set to 'widest'.");
  } else if (identical(x$output$rsq.ci, x$output$rsq.ci.olkinfinn)) {
    ciMsg <- paste0(ciMsg,
                    "the formula reported by Olkin and Finn (1995) in their Correlation ",
                    "Redux paper, because this was the widest interval, which ",
                    "should be used because the 'ci.method' was set to 'widest'.");
  } else {
    ciMsg <- paste0(ciMsg,
                    " -- I don't know actually, something appears to have gone wrong. ",
                    "The 'ci.method' argument was set to ", ufs::vecTxtQ(x$input$ci.method),
                    ".");
  }

  if (x$input$ci.method.note) {
    cat("\n");
    cat(strwrap(ciMsg), sep="\n");
  }

  if (!is.null(x$output$plot)) {
    print(x$output$plot);
  }
  invisible();

}

#' @export
pander.regr <- function (x, digits = x$input$digits, pvalueDigits = x$input$pvalueDigits, ...) {
  pander::pandoc.p(paste0("\n\n#### Regression analysis for formula: ", x$intermediate$formula.as.character));
  pander::pandoc.p("\n\n##### Significance test of the entire model (all predictors together):\n\n");
  pander::pandoc.p(paste0("Multiple R-squared: [", round(x$output$rsq.ci[1],
                                                 digits), ", ", round(x$output$rsq.ci[2], digits),
                  "] (point estimate = ", round(x$intermediate$summary.raw$r.squared,
                                                digits), ", adjusted = ", round(x$intermediate$summary.raw$adj.r.squared,
                                                                                digits), ")"))
  pander::pandoc.p(paste0("Test for significance: F[", x$intermediate$summary.raw$fstatistic[2],
                  ", ", x$intermediate$summary.raw$fstatistic[3], "] = ",
                  round(x$intermediate$summary.raw$fstatistic[1], digits),
                  ", ", ufs::formatPvalue(stats::pf(x$intermediate$summary.raw$fstatistic[1],
                                                    x$intermediate$summary.raw$fstatistic[2], x$intermediate$summary.raw$fstatistic[3],
                                                    lower.tail = FALSE), digits = pvalueDigits), "\n"));

  if ("raw" %in% x$input$coefficients) {
    pander::pandoc.p("\n\n##### Raw regression coefficients (unstandardized beta values, called 'B' in SPSS):\n\n");
    tmpDat <- round(x$output$coef.raw[, 1:5], digits);
    tmpDat[[1]] <- paste0("[", tmpDat[[1]], "; ", tmpDat[[2]], "]");
    tmpDat[[2]] <- NULL;
    names(tmpDat)[1] <- paste0(x$input$conf.level * 100, "% conf. int.");
    tmpDat$p <- ufs::formatPvalue(x$output$coef.raw$p, digits = pvalueDigits, includeP = FALSE);
    pander::pander(tmpDat, missing="");
  }
  if ("scaled" %in% x$input$coefficients) {
    pander::pandoc.p("\n\n##### Scaled regression coefficients (standardized beta values, called 'Beta' in SPSS):\n\n");
    tmpDat <- round(x$output$coef.scaled[, 1:5], digits);
    tmpDat[[1]] <- paste0("[", tmpDat[[1]], "; ", tmpDat[[2]], "]");
    tmpDat[[2]] <- NULL;
    names(tmpDat)[1] <- paste0(x$input$conf.level * 100, "% conf. int.");
    tmpDat$p <- ufs::formatPvalue(x$output$coef.scaled$p, digits = pvalueDigits, includeP = FALSE);
    pander::pander(tmpDat, missing="");
  }
}