Nothing
#' @title Multiple Linear Regression with Backward Elimination
#'
#' @description
#' Performs multiple linear regression using backward elimination based on
#' p-value threshold and provides full model diagnostics including ANOVA,
#' multicollinearity, heteroscedasticity, normality test, and plots.
#'
#' @param data A data frame containing dependent variable (y) in the first column
#' and independent variables (x's) in remaining columns
#' @param threshold Significance level for variable removal (default = 0.10)
#'
#' @return A list containing:
#' \itemize{
#' \item final_model: Final regression model
#' \item model_summary: Summary of final model
#' \item selected_variables: Variables retained in final model
#' \item anova_table: ANOVA table for final model
#' \item vif: Variance Inflation Factor values (if applicable)
#' \item gq_test: Goldfeld-Quandt test result
#' \item shapiro_test: Shapiro-Wilk normality test result
#' \item actual_vs_fitted: Data frame of actual vs fitted values
#' }
#'
#' @details
#' The function starts with a full model and iteratively removes the variable
#' with the highest p-value greater than the specified threshold until all
#' variables are significant.
#'
#' @importFrom stats lm resid predict shapiro.test pf as.formula coef
#' @importFrom graphics abline par text plot
#' @importFrom car vif
#' @importFrom lmtest gqtest
#' @export
#'
#' @examples {
#' library(car)
#' library(lmtest)
#'
#' set.seed(123)
#' n <- 40
#'
#' x1 <- rnorm(n, 50, 10)
#' x2 <- rnorm(n, 30, 5)
#' x3 <- rnorm(n, 70, 15)
#' x4 <- rnorm(n, 20, 7)
#' x5 <- rnorm(n, 100, 20)
#' x6 <- rnorm(n, 10, 3)
#'
#' y <- 0.5*x1 - 0.3*x2 + 0.2*x3 +
#' 0.1*x4 - 0.05*x5 + 0.3*x6 +
#' rnorm(n, 0, 15)
#'
#' df <- data.frame(y, x1, x2, x3, x4, x5, x6)
#'
#' result <- autoreg(df, threshold = 0.10)
#' result$selected_variables
#' }
autoreg <- function(data, threshold = 0.10) {
if (!is.data.frame(data)) {
stop("data must be a data frame")
}
if (!all(sapply(data, is.numeric))) {
stop("All variables must be numeric")
}
y_name <- names(data)[1]
x_vars <- names(data)[-1]
if (length(x_vars) < 1) {
stop("At least one predictor is required")
}
formula <- stats::as.formula(
paste(y_name, "~", paste(x_vars, collapse = "+"))
)
model <- stats::lm(formula, data = data)
message("\nInitial Full Model:\n")
print(summary(model))
repeat {
p_vals <- summary(model)$coefficients[-1, 4]
max_p <- max(p_vals)
if (max_p > threshold) {
remove_var <- names(which.max(p_vals))
message("\nRemoving variable:", remove_var,
"(p =", round(max_p, 4), ")\n")
remaining_vars <- setdiff(names(p_vals), remove_var)
if (length(remaining_vars) == 0) break
formula_new <- stats::as.formula(
paste(y_name, "~", paste(remaining_vars, collapse = " + "))
)
model <- stats::lm(formula_new, data = data)
} else {
break
}
}
message("\nFinal Model Summary:\n")
summary(model)
selected_vars <- names(stats::coef(model))[-1]
selected_vars
y <- data[[1]]
n <- nrow(data)
SST <- sum((y - mean(y))^2)
SSE <- sum(stats::resid(model)^2)
SSR <- SST - SSE
df_reg <- length(selected_vars)
df_res <- n - df_reg - 1
df_total <- n - 1
MSR <- SSR / df_reg
MSE <- SSE / df_res
F_value <- MSR / MSE
p_value <- stats::pf(F_value, df_reg, df_res, lower.tail = FALSE)
anova_table <- data.frame(
Source = c("Regression", "Error", "Total"),
DF = c(df_reg, df_res, df_total),
SS = round(c(SSR, SSE, SST), 3),
MS = round(c(MSR, MSE, NA), 3),
F = round(c(F_value, NA, NA), 3),
"Pr(>F)" = c(round(p_value, 5), NA, NA)
)
if (length(selected_vars) > 1) {
vif_vals <- car::vif(model)
} else {
vif_vals <- NA
}
gq <- lmtest::gqtest(model)
shapiro <- stats::shapiro.test(stats::resid(model))
fitted_vals <- stats::predict(model)
actual_vs_fitted <- data.frame(
Actual = round(y, 2),
Fitted = round(fitted_vals, 2)
)
oldpar <- graphics::par(no.readonly = TRUE)
on.exit(graphics::par(oldpar))
graphics::par(family = "serif", font = 2, mfrow = c(2, 3))
plot(model)
r2 <- summary(model)$r.squared
plot(y, fitted_vals,
xlab = "Actual Values",
ylab = "Fitted Values",
main = "Actual vs Fitted",
pch = 19)
graphics::abline(0, 1, col = "red", lwd = 2)
graphics::text(
x = min(y) + 0.05 * diff(range(y)),
y = max(fitted_vals) - 0.05 * diff(range(fitted_vals)),
labels = bquote(bold(R^2 == .(round(r2, 2)))),
col = "blue",
adj = c(0, 1),
cex = 1.3
)
plot(fitted_vals, stats::resid(model),
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residuals vs Fitted",
pch = 19)
graphics::abline(h = 0, col = "blue", lwd = 2)
graphics::par(mfrow = c(1, 1))
return(list(
final_model = model,
model_summary = summary(model),
selected_variables = selected_vars,
anova_table = anova_table,
vif = vif_vals,
gq_test = gq,
shapiro_test = shapiro,
actual_vs_fitted = actual_vs_fitted
))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.