R/make_regression_grid.r

#' @title make_regression_grid
#'
#' @description makes the search grid for \code{\link[mndredge]{dredge_regression}}
#'
#' @export
#'
#' @param data a \code{\link[base]{data.frame}} to generate the search grid from
#' @param max_columns the maximum columns that can go into \code{\link[stats]{lm}} formula
#' @param max_correlation max pair-wise \code{\link[base]{abs}} of \code{\link[stats]{cor}} for the predictors because \code{\link[stats]{lm}} performs poorly with corrlated columns.
#' @param beam_width the maximum the maximum number of regressions to try for the \code{\link[base]{data.frame}}
#' @return a tidy \code{\link[base]{data.frame}} containing the following components:
#' \describe{
#'   \item{formula}{the formula to use in \code{\link[stats]{lm}}}
#' }
#'
#' @author Mark Newman, \email{mark@trinetteandmark.com}
#' @keywords iteration
#' @family search grids
#'
#' @examples
#'   \dontshow{
#'     library(magrittr)
#'     library(mndredge) }
#'
make_regression_grid <-
  function(
    data,
    max_columns = getOption("mndredge_max_columns", 4),
    max_correlation = getOption("mndredge_max_correlation", .85),
    beam_width = getOption("mndredge_beam_width", 1000)) {
    
    # quite incorrect `devtools::check()` note
    uniqueness <- name <- formula <- NULL

    stopifnot(
      data %>% is.data.frame(),
      max_columns %>% is.numeric(),
      max_columns > 0,
      max_correlation %>% is.numeric(),
      max_correlation > 0,
      max_correlation < 1,
      beam_width %>% is.numeric(),
      beam_width > 0)
    
    empty_result <- data.frame(formula = character())
    
    if(
      data %>% ncol() < 2 |
      data %>% nrow() < 3 ) {
      return(empty_result) }
    
    is_prediction <-
      function(x) {
        x == "numeric" |
          x == "integer" |
          x == "factor" }
    is_response <-
      function(x) {
        x == "numeric" |
          x == "integer" }

    d2 <-
      data %>%
      colnames() %>%
      as_tibble() %>%
      set_colnames(c("name")) %>%
      add_column(
        class =
          data %>%
          sapply(class)) %>%
      add_column(
        uniqueness =
          data %>%
          lapply(unique) %>%
          sapply(length)) %>%
      filter(uniqueness > 1) %>%
      add_column(
        is_prediction =
          is_prediction(.$class)) %>%
      add_column(
        is_response =
          is_response(.$class))
    
    d2c <- d2 %>% tally() %>% .[[1,1]]
    ipc <- d2 %>% tally(is_prediction) %>% .[[1,1]]
    irc <- d2 %>% tally(is_response) %>% .[[1,1]]
    
    if(d2c < 2 | ipc == 0 | irc == 0) {
      return(empty_result) }
  
    max_columns <- min(max_columns, ipc)
    beam_width <- min(
      beam_width,
      max(1, irc * sum(choose(ipc, 1:max_columns))))
    result <- character(length = beam_width)
    
    predictors <-
      d2 %>%
      filter(is_prediction) %>%
      arrange(uniqueness %>% desc()) %>%
      pull(name)
    responses <-
      d2 %>%
      filter(is_response) %>%
      arrange(uniqueness) %>%
      pull(name)

    indx <- 1
    for(i in max_columns:1) {
      
      grid <- predictors %>% combn_limited(i, beam_width)
      
      for(j in 1:(grid %>% ncol())) {
        
        p2 <- grid[,j] %>% paste(collapse = "*")
        exclude <-
          ((i >= 2) &&
             ((mc <-
                 data %>%
                 calculate_max_correlation(grid[,j])) > max_correlation))
        
        if(!exclude) {
          for(k in 1:irc) {
            
            r2 <- responses[k]
            exclude <- r2 %in% grid[,j]
            
            if(!exclude) {
              result[indx] <- sprintf("%s~%s", r2, p2)
              if(indx == beam_width) {
                return(
                  data.frame(
                    formula = result,
                    stringsAsFactors = F)) }
              indx <- indx + 1
            }}}}}
    
    data.frame(
      formula =
        result %>%
        as_tibble() %>%
        set_colnames(c("formula")) %>%
        filter(formula != ""),
      stringsAsFactors = F)
  }
markanewman/mndredge documentation built on May 9, 2019, 5:52 a.m.