# R/Autoreg.R
#
# Defines functions: autoreg
# Documented in: autoreg

#' @title Multiple Linear Regression with Backward Elimination
#'
#' @description
#' Performs multiple linear regression using backward elimination based on
#' p-value threshold and provides full model diagnostics including ANOVA,
#' multicollinearity, heteroscedasticity, normality test, and plots.
#'
#' @param data A data frame containing dependent variable (y) in the first column
#' and independent variables (x's) in remaining columns
#' @param threshold Significance level for variable removal (default = 0.10)
#'
#' @return A list containing:
#' \itemize{
#'   \item final_model: Final regression model
#'   \item model_summary: Summary of final model
#'   \item selected_variables: Variables retained in final model
#'   \item anova_table: ANOVA table for final model
#'   \item vif: Variance Inflation Factor values (if applicable)
#'   \item gq_test: Goldfeld-Quandt test result
#'   \item shapiro_test: Shapiro-Wilk normality test result
#'   \item actual_vs_fitted: Data frame of actual vs fitted values
#' }
#'
#' @details
#' The function starts with a full model and iteratively removes the variable
#' with the highest p-value greater than the specified threshold until all
#' variables are significant.
#'
#' @importFrom stats lm resid predict shapiro.test pf as.formula coef
#' @importFrom graphics abline par text plot
#' @importFrom car vif
#' @importFrom lmtest gqtest
#' @export
#'
#' @examples {
#' library(car)
#' library(lmtest)
#'
#' set.seed(123)
#' n <- 40
#'
#' x1 <- rnorm(n, 50, 10)
#' x2 <- rnorm(n, 30, 5)
#' x3 <- rnorm(n, 70, 15)
#' x4 <- rnorm(n, 20, 7)
#' x5 <- rnorm(n, 100, 20)
#' x6 <- rnorm(n, 10, 3)
#'
#' y <- 0.5*x1 - 0.3*x2 + 0.2*x3 +
#'      0.1*x4 - 0.05*x5 + 0.3*x6 +
#'      rnorm(n, 0, 15)
#'
#' df <- data.frame(y, x1, x2, x3, x4, x5, x6)
#'
#' result <- autoreg(df, threshold = 0.10)
#' result$selected_variables
#' }
autoreg <- function(data, threshold = 0.10) {

  # --- Input validation ---------------------------------------------------
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }

  if (!all(sapply(data, is.numeric))) {
    stop("All variables must be numeric")
  }

  # Convention: response is the first column, predictors are the rest.
  y_name <- names(data)[1]
  x_vars <- names(data)[-1]

  if (length(x_vars) < 1) {
    stop("At least one predictor is required")
  }

  # --- Initial full model -------------------------------------------------
  formula <- stats::as.formula(
    paste(y_name, "~", paste(x_vars, collapse = "+"))
  )
  model <- stats::lm(formula, data = data)

  message("\nInitial Full Model:\n")
  print(summary(model))

  # --- Backward elimination -----------------------------------------------
  # Repeatedly drop the predictor with the largest p-value above the
  # threshold and refit, until every remaining predictor is significant
  # or only a single predictor is left (which is then retained even if
  # non-significant, rather than reducing to an intercept-only model).
  repeat {

    # Extract slope p-values as a *named* vector. Indexing the column
    # first ([, 4]) keeps coefficient names even when only one predictor
    # remains; the form coefficients[-1, 4] collapses a single row to an
    # unnamed scalar, which would make names(which.max(...)) NULL.
    p_vals <- summary(model)$coefficients[, 4][-1]
    max_p <- max(p_vals)

    # All remaining predictors are significant: stop.
    if (max_p <= threshold) break

    remove_var <- names(which.max(p_vals))
    remaining_vars <- setdiff(names(p_vals), remove_var)

    # Guard BEFORE announcing removal: do not claim to drop the last
    # predictor and then silently keep it.
    if (length(remaining_vars) == 0) break

    message("\nRemoving variable: ", remove_var,
            " (p = ", round(max_p, 4), ")\n")

    formula_new <- stats::as.formula(
      paste(y_name, "~", paste(remaining_vars, collapse = " + "))
    )

    model <- stats::lm(formula_new, data = data)
  }

  message("\nFinal Model Summary:\n")
  # print() is required: inside a function the bare expression
  # summary(model) is evaluated and discarded, showing nothing.
  print(summary(model))

  selected_vars <- names(stats::coef(model))[-1]

  # --- Manual ANOVA decomposition -----------------------------------------
  y <- data[[1]]
  n <- nrow(data)

  SST <- sum((y - mean(y))^2)        # total sum of squares
  SSE <- sum(stats::resid(model)^2)  # error (residual) sum of squares
  SSR <- SST - SSE                   # regression sum of squares

  df_reg <- length(selected_vars)
  df_res <- n - df_reg - 1
  df_total <- n - 1

  MSR <- SSR / df_reg
  MSE <- SSE / df_res
  F_value <- MSR / MSE
  p_value <- stats::pf(F_value, df_reg, df_res, lower.tail = FALSE)

  anova_table <- data.frame(
    Source = c("Regression", "Error", "Total"),
    DF = c(df_reg, df_res, df_total),
    SS = round(c(SSR, SSE, SST), 3),
    MS = round(c(MSR, MSE, NA), 3),
    F  = round(c(F_value, NA, NA), 3),
    "Pr(>F)" = c(round(p_value, 5), NA, NA),
    # Keep the literal column name "Pr(>F)"; the default check.names =
    # TRUE would mangle it to "Pr..F.".
    check.names = FALSE
  )

  # --- Diagnostics --------------------------------------------------------
  # VIF is only defined for models with two or more predictors.
  if (length(selected_vars) > 1) {
    vif_vals <- car::vif(model)
  } else {
    vif_vals <- NA
  }

  gq <- lmtest::gqtest(model)                          # heteroscedasticity
  shapiro <- stats::shapiro.test(stats::resid(model))  # residual normality

  fitted_vals <- stats::predict(model)

  actual_vs_fitted <- data.frame(
    Actual = round(y, 2),
    Fitted = round(fitted_vals, 2)
  )

  # --- Plots --------------------------------------------------------------
  # Save and restore the user's graphical parameters no matter how we exit.
  oldpar <- graphics::par(no.readonly = TRUE)
  on.exit(graphics::par(oldpar), add = TRUE)

  graphics::par(family = "serif", font = 2, mfrow = c(2, 3))

  plot(model)  # the four standard lm() diagnostic plots

  r2 <- summary(model)$r.squared

  plot(y, fitted_vals,
       xlab = "Actual Values",
       ylab = "Fitted Values",
       main = "Actual vs Fitted",
       pch = 19)

  graphics::abline(0, 1, col = "red", lwd = 2)  # perfect-fit reference line

  # Annotate R-squared in the top-left corner of the plot region.
  graphics::text(
    x = min(y) + 0.05 * diff(range(y)),
    y = max(fitted_vals) - 0.05 * diff(range(fitted_vals)),
    labels = bquote(bold(R^2 == .(round(r2, 2)))),
    col = "blue",
    adj = c(0, 1),
    cex = 1.3
  )

  plot(fitted_vals, stats::resid(model),
       xlab = "Fitted Values",
       ylab = "Residuals",
       main = "Residuals vs Fitted",
       pch = 19)

  graphics::abline(h = 0, col = "blue", lwd = 2)

  return(list(
    final_model = model,
    model_summary = summary(model),
    selected_variables = selected_vars,
    anova_table = anova_table,
    vif = vif_vals,
    gq_test = gq,
    shapiro_test = shapiro,
    actual_vs_fitted = actual_vs_fitted
  ))
}

# Try the linreg package in your browser
#
# Any scripts or data that you put into this service are public.
#
# linreg documentation built on April 17, 2026, 1:07 a.m.