R/dredge_regression.r

#' @title dredge_regression
#'
#' @description data dredge a \code{\link[base]{data.frame}} using \code{\link[stats]{lm}}
#'
#' @export
#'
#' @param data a \code{\link[base]{data.frame}} to operate over
 #' @param grid a search grid in the form of \code{\link[mndredge]{make_regression_grid}} to use in guiding the dredging
#' @param min_pvalue the maximum the pvalue from \code{\link[stats]{lm}} can be before the result is filtered out. this generaly is an issue only in overfitted models or synthetic data
#' @param max_pvalue the maximum the pvalue from \code{\link[stats]{lm}} can be before the result is filtered out
#' @param ... addtional parameters to pass to \code{\link[stats]{lm}}
#' @return a tidy \code{\link[base]{data.frame}} containing the following components:
#' \describe{
#'   \item{formula}{the model being applied to the \code{\link[base]{data.frame}}}
#'   \item{best_pvalue}{the best p.value for any model coefficient}
#'   \item{adjusted_r_squared}{the overall r for the model}
#' }
#'
#' @author Mark Newman, \email{mark@trinetteandmark.com}
#' @keywords utilities regression
#' @family dredging
#'
#' @examples
#'   \dontshow{
#'     library(magrittr)
#'     library(doParallel)
#'     library(mndredge) }
#'   set.seed(0)
#'   y1 <- c(rnorm(10), rnorm(10, mean = 2))
#'   y2 <- rep(c("a", "b"), c(10,10))
#'   registerDoParallel(cores=2)
#'   data <-
#'     data.frame(y1, y2) %>%
#'     adjust_grid_for_regression()
#'   grid <- data %>% make_regression_grid()
#'   data %>% dredge_regression(grid)
#'
dredge_regression <-
  function(
    data,
    grid,
    min_pvalue = getOption("mndredge_min_pvalue", 0),
    max_pvalue = getOption("mndredge_max_pvalue", 0.05),
    ...) {
    
    # quite incorrect `devtools::check()` note
    i <- NULL
  
    stopifnot(
      data %>% is.data.frame(),
      data %>% nrow() >= 3,
      grid %>% is.data.frame(),
      min_pvalue %>% is.numeric(),
      min_pvalue >= 0,
      min_pvalue < 1,
      max_pvalue %>% is.numeric(),
      max_pvalue >= 0,
      max_pvalue < 1,
      min_pvalue < max_pvalue)

    safe_lm_summary <- function(data, formula, ...) {
      tryCatch(
        expr = { lm(formula = formula, data = data, ...) %>% summary() },
        warning = function(x) {} ) }
    get_best_pvalue <- function(model_summary) {
      min(model_summary$coefficients[-1, 4]) }
  
    grid_res <-
      foreach(
        i = iter(grid, by = "row"),
        .packages = c("magrittr")) %dopar% {
          
          model_summary <-
            data %>%
            safe_lm_summary(i %>% as.formula())
  
          exclude <-
            model_summary %>% is.null() ||
            (arv <- model_summary$adj.r.squared) %>% is.nan() ||
            (bpv <- get_best_pvalue(model_summary)) < min_pvalue ||
            max_pvalue < bpv
          if(exclude) {
            NULL }
          else {
            c(i, bpv, arv)}}
  
    res_df <-
      do.call(rbind, grid_res) %>%
      data.frame(stringsAsFactors = F)
  
    if((res_df %>% nrow()) == 0) {
      data.frame(
        formula = character(),
        best_pvalue = numeric(),
        adjusted_r_squared = numeric())
    }
    else {
      res_df %>%
        make_column_numeric(2) %>%
        make_column_numeric(3) %>%
        set_colnames(c("formula", "best_pvalue", "adjusted_r_squared")) %>%
        set_rownames(NULL)
    }}
markanewman/mndredge documentation built on May 9, 2019, 5:52 a.m.