Nothing
#' Stepwise Multiple Regression Search based on Information Criteria
#'
#' @param data Raw data containing only the variables to be tested within the multiple regression as dependent or independent variable. May include missing values.
#' @param n Numeric value specifying the sample size used in calculating information criteria for model search.
#' If not provided, it will be computed based on the data.
#' If a correlation matrix (`mat`) is supplied instead of raw data, `n` must be provided.
#' @param mat Optional covariance or correlation matrix for the variables to be used within the multiple regression.
#' #' Used only if \code{data} is \code{NULL}.
#' @param dep_ind Index of the column within a data set to be used as dependent variable within in the regression model.
#' @param n_calc Method for calculating the sample size for node-wise regression models. Can be one of:
#' `"individual"` (sample size for each variable is the number of non-missing observations for that variable),
#' `"average"` (sample size is the average number of non-missing observations across all variables),
#' `"max"` (sample size is the maximum number of non-missing observations across all variables),
#' `"total"` (sample size is the total number of observations across in the data set / number of rows).
#' @param missing_handling Method for estimating the correlation matrix in the presence of missing data.
#' `"tow-step-em"` uses a classic EM algorithm to estimate the covariance matrix from the data.
#' `"stacked-mi"` uses multiple imputation to estimate the covariance matrix from the data.
#' `"pairwise"` uses pairwise deletion to estimate the covariance matrix from the data.
#' `"listwise"` uses listwise deletion to estimate the covariance matrix from the data.
#' @param k Penalty per parameter (number of predictors + 1) to be used in node-wise regressions; the default log(n) (number of observations observation) is the classical BIC. Alternatively, classical AIC would be `k = 2`.
#' @param nimp Number of multiple imputations to perform when using multiple imputation for missing data (default: 20).
#'
#' @return A list with the following elements:
#' \describe{
#' \item{regression}{Named vector of regression coefficients for the dependent variable.}
#' \item{R2}{R-squared value of the regression model.}
#' \item{n}{Sample size used in the regression model.}
#' \item{args}{List of arguments used in the regression model, including `k`, `missing_handling`, and `nimp`.}
#' }
#'
#' @export
#'
#' @examples
#' # For full data using AIC
#' # First variable of the data set as dependent variable
#' result <- regression_opt(
#' data = mantar_dummy_full,
#' dep_ind = 1,
#' k = "2"
#' )
#'
#' # View regression coefficients and R-squared
#' result$regression
#' result$R2
#'
#' # For data with missingess using BIC
#' # Second variable of the data set as dependent variable
#' # Using individual sample size of the dependent variable and stacked Multiple Imputation
#'
#' result_mis <- regression_opt(
#' data = mantar_dummy_mis,
#' dep_ind = 2,
#' n_calc = "individual",
#' missing_handling = "two-step-em",
#' )
#'
#' # View regression coefficients and R-squared
#' result_mis$regression
#' result_mis$R2
regression_opt <- function(data = NULL, n = NULL, mat = NULL, dep_ind,
n_calc = "individual",
missing_handling = "stacked-mi",
k = "log(n)", nimp = 20) {
n_calc <- match.arg(tolower(n_calc), choices =c("average", "individual", "max", "total"))
missing_handling <- match.arg(tolower(missing_handling), choices = c("two-step-em", "stacked-mi", "pairwise", "listwise"))
# Check: Which input is provided?
checker(data = data, mat = mat)
# Determine column names relevant for dep_ind
col_names <- if (!is.null(data)) colnames(data) else colnames(mat)
# Resolve dep_ind to a column index
if (is.character(dep_ind)) {
if (is.null(col_names)) {
stop("dep_ind was given as a name, but the selected input (data or mat) has no column names.")
}
if (!(dep_ind %in% col_names)) {
stop("dep_ind must be a valid column name.")
}
dep_ind <- which(col_names == dep_ind)
} else if (is.numeric(dep_ind)) {
max_cols <- if (!is.null(col_names)) length(col_names) else if (!is.null(data)) ncol(data) else ncol(mat)
if (dep_ind < 1 || dep_ind > max_cols) {
stop("dep_ind must be a valid column index.")
}
} else {
stop("dep_ind must be either a numeric index or a character string.")
}
if (!is.null(data)) {
if (anyNA(data)){
if (missing_handling == "two-step-em"){
if (!requireNamespace("lavaan", quietly = TRUE)) {
stop(
"Package \"lavaan\" must be installed to use this function.",
call. = FALSE
)
}
lavobject <- suppressWarnings(try(lavaan::lavCor(data,
missing = "ml", se = "none", meanstructure = TRUE,
estimator = "ML", output = "fit")))
if (inherits(lavobject, "try-error")) stop("lavaan::lavCor failed. Check your data.")
mat <- try(stats::cov2cor(lavaan::inspect(lavobject, "cov.ov")))
nimp <- NULL
} else if (missing_handling == "stacked-mi"){
if (!requireNamespace("mice", quietly = TRUE)) {
stop(
"Package \"mice\" must be installed to use this function.",
call. = FALSE
)
}
original_names <- colnames(data)
colnames(data) <- make.names(original_names)
imputed_data <- suppressMessages(mice::mice(data, m = nimp, maxit = 10, method = 'pmm',
ridge = 0, donors = 5, ls.method = "qr"))
stacked_data <- mice::complete(imputed_data, c(1:nimp))
colnames(stacked_data) <- original_names # restore original column names
mat <- stats::cor(stacked_data)
} else if (missing_handling == "pairwise"){
mat <- stats::cov2cor(stats::cov(data, use = "pairwise.complete.obs"))
nimp <- NULL
} else if (missing_handling == "listwise"){
mat <- stats::cov2cor(stats::cov(data, use = "complete.obs"))
nimp <- NULL
}
} else {
mat <- stats::cov2cor(stats::cov(data))
nimp <- missing_handling <- NULL
}
if (is.null(n)){
ns <- calculate_sample_size(data = data, n_calc = n_calc)
n <- ns[dep_ind] # use the sample size for the dependent variable
} else if (length(n) != 1){
stop("Length of 'n' must be 1.")
}
} else if (!is.null(mat)) {
if (is.null(n)) {
stop("If 'mat' is provided, 'n' must also be specified.")
}
if (length(n) != 1) {
stop("Length of 'n' must be 1.")
}
mat <- stats::cov2cor(mat)
nimp <- missing_handling <- NULL
}
mod <- pred_search(mat = mat, dep_ind = dep_ind,
possible_pred_ind = setdiff(1:ncol(mat), dep_ind),
n = n, k = k)
regression <- mod$actual_betas
names(regression) <- colnames(mat)[mod$actual_preds]
result <- list(
regression = regression,
R2 = 1 - mod$actual_resid_var,
n = n,
args = list(k = k, missing_handling = missing_handling, nimp = nimp)
)
class(result) <- c("mantar_regression")
return(result)
}
pred_search <- function(mat, dep_ind, possible_pred_ind, n, k = log(n)){
null_mod <- matrix_regression(mat = mat, dep_ind = dep_ind, pred_ind = NULL)
null_IC <- reg_ic_calc(resid_var = null_mod$resid_var, n = n, n_preds = 0, k = k)
actual_preds <- c() # initialize vector for actual predictors
actual_betas <- c() # initialize vector for actual betas
actual_resid_var <- null_mod$resid_var # initialize object for actual residual variance
best_IC <- null_IC # initialize best IC with null model
while(TRUE){
mods <- lapply(possible_pred_ind, function(pred){
# if the predictor is not already in the model
if (!(pred %in% actual_preds)){
# add the predictor to the model
mod_preds <- sort(c(pred, actual_preds))
# calculate regression coefficients
mod <- matrix_regression(mat = mat, dep_ind = dep_ind, pred_ind = mod_preds)
mod_betas <- mod$beta
mod_resid_var <- mod$resid_var
}
# if the predictor is already in the model
else{
# remove the predictor from the model
mod_preds <- sort(actual_preds[actual_preds != pred])
# if there are no predictors left - empty model
if (length(mod_preds) == 0){
mod_preds <- NULL
mod_betas <- NULL
} else{
# if there are predictors left - calculate regression coefficients
mod <- matrix_regression(mat = mat, dep_ind = dep_ind, pred_ind = mod_preds)
mod_betas <- mod$beta
mod_resid_var <- mod$resid_var
}
}
# calculate information criteria for the model
# if there are no predictors, the information criteria is the one from the null model
if(is.null(mod_betas)) {
mod_resid_var <- null_mod$resid_var
IC <- null_IC
} else {
# if there are predictors, calculate the information criteria for the model
IC <- reg_ic_calc(resid_var = mod_resid_var, n = n, n_preds = length(mod_preds), k = k)
}
return(list(IC = IC, mod_preds = mod_preds, mod_betas = mod_betas, mod_resid_var = mod_resid_var))
})
ICs <- lapply(mods, "[[", "IC") |> unlist() # extract information criteria from models
# if there is a model with a lower information criteria than the best model so far
if (any(ICs< best_IC)){
# extract predictors and regression coefficients from all models
mod_betas <- lapply(mods, "[[", "mod_betas")
mod_preds <- lapply(mods, "[[", "mod_preds")
mod_resid_vars <- lapply(mods, "[[", "mod_resid_var")
# find the model with the lowest information criteria
mod_num <- which.min(ICs)
best_IC <- ICs[mod_num]
# extract predictors and regression coefficients from the best model
actual_preds <- mod_preds[mod_num] |> unlist()
actual_betas <- mod_betas[mod_num] |> unlist()
actual_resid_var <- mod_resid_vars[mod_num] |> unlist()
} else{
# if there is no model with a lower information criteria than the best model so far
break
}
}
return(list(actual_preds = actual_preds, actual_betas = actual_betas, actual_resid_var = actual_resid_var, best_IC = best_IC))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.