#' An exhaustive search regression built on base R
#'
#' @export
#'
#' @param data A `data.frame` that contains a dependent variable and the
#' independent variables.
#' @param dependent The dependent variable for the regression.
#' @param independent A vector of independent variables to be used. These must
#' match the column names from `data`. These can also include interaction
#' terms made from column names from `data`. This allows for specific
#' interaction terms to be used, rather than every possible interaction as is
#' done with `interactions = TRUE`.
#' @param minvar (Optional) The minimum number of independent variables to be
#' used in the regression. Defaults to 1.
#' @param maxvar The maximum number of independent variables to be used in the
#' regression. Must be equal to or less than the number of independent
#' variables. If interaction terms are used, they count as one independent
#' variable.
#' @param family The type of regression. Passed to `glm`. See
#' \code{\link[stats:glm]{glm}} for more information.
#' @param topN (Optional) The number of top results to be printed upon run
#' completion. Defaults to 0.
#' @param interactions (Optional) A boolean indicating whether or not
#' interaction terms should be used. Defaults to `FALSE`.
#' @param multi (Optional) A boolean indicating whether or not multithreading
#' should be used. Defaults to `FALSE`. It is highly recommended to use
#' multithreading.
#' @param ... (Optional) Function arguments to be passed to
#' \code{\link[stats]{glm}}
#'
#' @return Returns a `data.table` of information on the regressions run. The
#' resulting data.table is sorted in descending order by the rSquare divided
#' by the mean p-value. This is generally reliable in pushing quality
#' regressions to the top of the list. \item{`formula`}{The regression formula
#' used.} \item{`aic`}{The aic for the regression.} \item{`rSquare`}{The
#' calculated r-square for the regression.} \item{`warn`}{Currently unused.}
#' \item{independent}{Each variable column contains the p-values for that
#' variable or interaction term in a given regression.}
#'
#' @import data.table pbapply parallel
#' @importFrom stats as.formula formula glm BIC
#' @importFrom utils combn
#'
#' @examples
#' # Creating dummy data
#' dt <- data.frame("dependent" = sample(c(0, 1), 100, replace = TRUE),
#' "ind_1" = runif(100, 0, 1),
#' "ind_2" = runif(100, 0, 1),
#' "ind_3" = runif(100, 0, 1),
#' "ind_4" = runif(100, 0, 1))
#'
#' # Without interaction terms and multithreading
#' ## Two top results
#' regsearch(dt, "dependent", c("ind_1", "ind_2", "ind_3", "ind_4"),
#' 1, 4, "binomial", 2)
#' ## No top results
#' regsearch(dt, "dependent", c("ind_1", "ind_2", "ind_3", "ind_4"),
#' 1, 4, "binomial", FALSE, FALSE)
#'
#' # With interaction terms and multithreading
#' regsearch(dt, "dependent", c("ind_1", "ind_2", "ind_3", "ind_4"),
#' 1, 4, "binomial", TRUE, TRUE)
regsearch <- function(data,
dependent, independent,
minvar = 1, maxvar,
family,
topN = 0,
interactions = FALSE,
multi = FALSE,
...) {
# warnings for missing variables in function call
if(missing(interactions)) {
warning("Missing 'interactions' argument. Defaulting to FALSE.")
interactions <- FALSE
}
if(missing(multi)) {
warning("Missing 'multi' argument. Defaulting to FALSE.")
multi <- FALSE
}
# stop if `interactions` or `multi` are not logical
if(!interactions %in% c(TRUE, FALSE) | !multi %in% c(TRUE, FALSE))
stop("Arguments 'interactions' and 'multi' must be logical TRUE/FALSE")
# stop if `parallel` or `pbapply` are not installed
# `parallel` should always be installed beacuse it is part of base R installs
if (!requireNamespace(c("parallel", "pbapply"), quietly = TRUE)) {
stop("Packages \"parallel\" and \"pbapply\" must be installed.",
call. = FALSE)
}
maxvarTest <-
length(independent) + ifelse(interactions, choose(length(independent), 2), 0)
if(maxvar > maxvarTest) {
warning(paste("`maxvar` is greater than the number of independent variables. Setting maxvar to",
maxvarTest, collapse = ""))
maxvar <- maxvarTest
}
# create the cluster for multithreading
if(multi) { clust <- makeCluster(detectCores()) }
# Get every combination of the independent variables
if(dependent %in% independent) {
warning("Dropping dependent variable from independent variables")
independent <- independent[independent != dependent]
}
factorVars <- sapply(data, is.factor)
# append underscores to factor variables so that when the regression is run,
# they will show as `factor_1`, `factor_2`, and so on
varNames <- ifelse(independent %in% sapply(data, is.factor),
paste0(independent, "_"),
independent)
# create and sort the list of independent variables
if(interactions) {
independent <- sort(independent)
# create a vector that includes every possible combination of
# interaction terms and the independent variables
print("Gathering variables...")
if (multi) {
independent <- c(unlist(pblapply(cl = clust,
combn(independent, 2, simplify = FALSE),
function(x) { paste(x, collapse = "*") }),
recursive = FALSE),
independent)
} else {
print("WARNING: Using interaction terms without multithreading may take a very long time")
independent <- c(unlist(pblapply(combn(independent, 2, simplify = FALSE),
function(x) { paste(x, collapse = "*") }),
recursive = FALSE),
independent)
}
} else { independent <- sort(independent) }
# create the formulas that will be used for the regressions
# when an interaction term is encountered, any superfluous terms are dropped
# i.e. `a*b + a + b` is dropped in favor of `a*b` since `+ a + b` is implied
print("Assembling regresions...")
if(multi) {
combs <- pblapply(cl = clust, minvar:maxvar, function(x) {
lapply(combn(independent, x, simplify = FALSE), dropDuplicates)
})
} else {
combs <- pblapply(minvar:maxvar, function(x) {
lapply(combn(independent, x, simplify = FALSE), dropDuplicates)
})
}
combs <- unlist(combs, recursive = FALSE)
print(paste("Creating",
length(combs),
"formulas. Please be patient, this may take a while."))
# create a base data.frame that has all independent variables
# `*` in interaction terms is converted to a `.`
reg <- data.frame(matrix(data = NA, nrow = 0, ncol = length(varNames) + 5))
colnames(reg) <- c("aic", "bic", "rSquare", "warn", "X.Intercept.",
gsub("\\*", ".", varNames))
# convert the list of independent variables into full regression formulas
print("Creating regressions...")
if (multi) {
clusterExport(clust, c("combs"), envir = environment())
forms <-
pblapply(cl = clust, combs, createRegression, dependent = dependent)
} else { forms <- pblapply(combs, createRegression, dependent = dependent) }
forms <- unlist(forms)
print(paste("Running", length(forms), "regressions. Please be patient, this may take a while."))
# runs the list of regressions and builds a data.table with the results of
# every regression
print("Running regressions...")
if(multi) {
clusterEvalQ(clust, library(data.table))
clusterExport(clust, c("forms", "family", "data"), envir = environment())
regs <- rbindlist(pblapply(cl = clust, forms, summariseReg,
data = data, family = family, reg = reg, ...),
fill = TRUE)
} else {
regs <- rbindlist(pblapply(forms, summariseReg,
data = data, family = family, reg = reg, ...),
fill = TRUE)
}
# attaches the formulas to the data.table
regs <- data.table("formula" = forms, regs)
# renames the `X.Intercept.` column to something easier to work with
setnames(regs, "X.Intercept.", "xIntercept")
# ranks the results based on my arbitrary system
regs$rowMeans <-
rowMeans(regs[,!c("formula", "aic", "bic", "rSquare", "warn", "xIntercept")],
na.rm = TRUE)
regs$rank <- regs$rSquare / regs$rowMeans
regs <- regs[order(rank, decreasing = TRUE), !c("rank", "rowMeans")]
# stops the cluster
if(multi) { stopCluster(clust) }
# prints the `topN` number of regressions
# because regressions themselves are not stored, they are run again
if (topN > 0) {
sapply(regs[1:topN, formula], function(x) {
print(summary(glm(formula = as.formula(x), data = data, family = family)))
})
}
# return the final data.table
return(regs)
}
# takes a character vector or formula and runs the requested regression
# returns a single row data.frame representing the results of that regression
summariseReg <- function(x, data, family, reg, ...) {
options(warn = 0)
lm <- glm(formula = as.formula(x), data = data, family = family, ...)
summ <- summary(lm)
# warn <- tryCatch({ names(last.warning) },
# warning = function(e) {""},
# error = function(e) {""})
coefs <- t(summ$coefficients[,4])
names(coefs) <- gsub(":", ".", names(coefs))
summ <- data.frame("aic" = summ$aic,
"bic" = BIC(lm),
"rSquare" = round(1 - (summ$deviance / summ$null.deviance), 5),
# "warn" = warn,
"warn" = NA,
coefs)
return(rbindlist(list(reg, summ), fill = TRUE))
}
# compares expanded (interactions) to unexpanded single variables
# returns the input if no unexpanded variables are in the expanded variables
dropDuplicates <- function(x) {
x <- x[!is.na(x)]
expanded <- paste0(unlist(strsplit(x[grepl("\\*", x)], "\\*")), "")
notExpanded <- paste0(x[!grepl("\\*", x)], "")
if (sum(notExpanded %in% expanded) == 0) { return(x) }
else { return(NULL) }
}
# pastes variables into a usable format for regressions
createRegression <- function(x, dependent) {
if (is.null(x)) { return(NULL) }
return(paste(paste(dependent, "~"),
paste("+", x[!is.na(x)], collapse = " "),
collapse = " "))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.