#' @title make_regression_grid
#'
#' @description makes the search grid for \code{\link[mndredge]{dredge_regression}}
#'
#' @export
#'
#' @param data a \code{\link[base]{data.frame}} to generate the search grid from
#' @param max_columns the maximum columns that can go into \code{\link[stats]{lm}} formula
#' @param max_correlation max pair-wise \code{\link[base]{abs}} of \code{\link[stats]{cor}} for the predictors because \code{\link[stats]{lm}} performs poorly with corrlated columns.
#' @param beam_width the maximum the maximum number of regressions to try for the \code{\link[base]{data.frame}}
#' @return a tidy \code{\link[base]{data.frame}} containing the following components:
#' \describe{
#' \item{formula}{the formula to use in \code{\link[stats]{lm}}}
#' }
#'
#' @author Mark Newman, \email{mark@trinetteandmark.com}
#' @keywords iteration
#' @family search grids
#'
#' @examples
#' \dontshow{
#' library(magrittr)
#' library(mndredge) }
#'
make_regression_grid <-
function(
data,
max_columns = getOption("mndredge_max_columns", 4),
max_correlation = getOption("mndredge_max_correlation", .85),
beam_width = getOption("mndredge_beam_width", 1000)) {
# quite incorrect `devtools::check()` note
uniqueness <- name <- formula <- NULL
stopifnot(
data %>% is.data.frame(),
max_columns %>% is.numeric(),
max_columns > 0,
max_correlation %>% is.numeric(),
max_correlation > 0,
max_correlation < 1,
beam_width %>% is.numeric(),
beam_width > 0)
empty_result <- data.frame(formula = character())
if(
data %>% ncol() < 2 |
data %>% nrow() < 3 ) {
return(empty_result) }
is_prediction <-
function(x) {
x == "numeric" |
x == "integer" |
x == "factor" }
is_response <-
function(x) {
x == "numeric" |
x == "integer" }
d2 <-
data %>%
colnames() %>%
as_tibble() %>%
set_colnames(c("name")) %>%
add_column(
class =
data %>%
sapply(class)) %>%
add_column(
uniqueness =
data %>%
lapply(unique) %>%
sapply(length)) %>%
filter(uniqueness > 1) %>%
add_column(
is_prediction =
is_prediction(.$class)) %>%
add_column(
is_response =
is_response(.$class))
d2c <- d2 %>% tally() %>% .[[1,1]]
ipc <- d2 %>% tally(is_prediction) %>% .[[1,1]]
irc <- d2 %>% tally(is_response) %>% .[[1,1]]
if(d2c < 2 | ipc == 0 | irc == 0) {
return(empty_result) }
max_columns <- min(max_columns, ipc)
beam_width <- min(
beam_width,
max(1, irc * sum(choose(ipc, 1:max_columns))))
result <- character(length = beam_width)
predictors <-
d2 %>%
filter(is_prediction) %>%
arrange(uniqueness %>% desc()) %>%
pull(name)
responses <-
d2 %>%
filter(is_response) %>%
arrange(uniqueness) %>%
pull(name)
indx <- 1
for(i in max_columns:1) {
grid <- predictors %>% combn_limited(i, beam_width)
for(j in 1:(grid %>% ncol())) {
p2 <- grid[,j] %>% paste(collapse = "*")
exclude <-
((i >= 2) &&
((mc <-
data %>%
calculate_max_correlation(grid[,j])) > max_correlation))
if(!exclude) {
for(k in 1:irc) {
r2 <- responses[k]
exclude <- r2 %in% grid[,j]
if(!exclude) {
result[indx] <- sprintf("%s~%s", r2, p2)
if(indx == beam_width) {
return(
data.frame(
formula = result,
stringsAsFactors = F)) }
indx <- indx + 1
}}}}}
data.frame(
formula =
result %>%
as_tibble() %>%
set_colnames(c("formula")) %>%
filter(formula != ""),
stringsAsFactors = F)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.