#' @title dredge_regression
#'
#' @description data dredge a \code{\link[base]{data.frame}} using \code{\link[stats]{lm}}
#'
#' @export
#'
#' @param data a \code{\link[base]{data.frame}} to operate over
#' @param grid a search grid in the form of \code{\link[mndredge]{make_regression_grid}} to use in guiding the dredging
#' @param min_pvalue the maximum the pvalue from \code{\link[stats]{lm}} can be before the result is filtered out. this generaly is an issue only in overfitted models or synthetic data
#' @param max_pvalue the maximum the pvalue from \code{\link[stats]{lm}} can be before the result is filtered out
#' @param ... addtional parameters to pass to \code{\link[stats]{lm}}
#' @return a tidy \code{\link[base]{data.frame}} containing the following components:
#' \describe{
#' \item{formula}{the model being applied to the \code{\link[base]{data.frame}}}
#' \item{best_pvalue}{the best p.value for any model coefficient}
#' \item{adjusted_r_squared}{the overall r for the model}
#' }
#'
#' @author Mark Newman, \email{mark@trinetteandmark.com}
#' @keywords utilities regression
#' @family dredging
#'
#' @examples
#' \dontshow{
#' library(magrittr)
#' library(doParallel)
#' library(mndredge) }
#' set.seed(0)
#' y1 <- c(rnorm(10), rnorm(10, mean = 2))
#' y2 <- rep(c("a", "b"), c(10,10))
#' registerDoParallel(cores=2)
#' data <-
#' data.frame(y1, y2) %>%
#' adjust_grid_for_regression()
#' grid <- data %>% make_regression_grid()
#' data %>% dredge_regression(grid)
#'
dredge_regression <-
function(
data,
grid,
min_pvalue = getOption("mndredge_min_pvalue", 0),
max_pvalue = getOption("mndredge_max_pvalue", 0.05),
...) {
# quite incorrect `devtools::check()` note
i <- NULL
stopifnot(
data %>% is.data.frame(),
data %>% nrow() >= 3,
grid %>% is.data.frame(),
min_pvalue %>% is.numeric(),
min_pvalue >= 0,
min_pvalue < 1,
max_pvalue %>% is.numeric(),
max_pvalue >= 0,
max_pvalue < 1,
min_pvalue < max_pvalue)
safe_lm_summary <- function(data, formula, ...) {
tryCatch(
expr = { lm(formula = formula, data = data, ...) %>% summary() },
warning = function(x) {} ) }
get_best_pvalue <- function(model_summary) {
min(model_summary$coefficients[-1, 4]) }
grid_res <-
foreach(
i = iter(grid, by = "row"),
.packages = c("magrittr")) %dopar% {
model_summary <-
data %>%
safe_lm_summary(i %>% as.formula())
exclude <-
model_summary %>% is.null() ||
(arv <- model_summary$adj.r.squared) %>% is.nan() ||
(bpv <- get_best_pvalue(model_summary)) < min_pvalue ||
max_pvalue < bpv
if(exclude) {
NULL }
else {
c(i, bpv, arv)}}
res_df <-
do.call(rbind, grid_res) %>%
data.frame(stringsAsFactors = F)
if((res_df %>% nrow()) == 0) {
data.frame(
formula = character(),
best_pvalue = numeric(),
adjusted_r_squared = numeric())
}
else {
res_df %>%
make_column_numeric(2) %>%
make_column_numeric(3) %>%
set_colnames(c("formula", "best_pvalue", "adjusted_r_squared")) %>%
set_rownames(NULL)
}}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.