R/choose_on_aic.R

Defines functions choose_on_aic

Documented in choose_on_aic

#' Choose best stepwise regression model based on AIC
#'
#' @description choose_on_aic() takes a tibble generated by stepwise_regression() as input.
#'
#' For each given combination of dependent variable, independent variable of interest and covariates, stepwise_regression() produces five models (original full, minimal, forward selection, stepwise selection, backward elimination).
#'
#' choose_on_aic() simply uses AIC to determine the best model within each set of five models.
#'
#' @details When AIC could be computed for at least one the stepwise regression models (forward selection, stepwise selection, backward elimination), the best model is defined as the one with the lowest AIC value. In case the lowest AIC is shared by the stepwise selection model and other stepwise regression model(s), the stepwise selection model is chosen.
#'
#' When df = 0 for the original full model and all three stepwise regression models (F and AIC could not be computed), the minimal model (dependent variable ~ independent variable of interest) is defined as the best model if and only if it has an F p value <= 0.05 AND a t p value (related to the independent variable of interest) <= 0.05 ("sign" label = TRUE).
#'
#' The combinations of dependent variable, independent variable of interest and covariates for which a best model could not be defined (df = 0 for the original full model and all three stepwise regression models; F p value > 0.05 and/or a t p value > 0.05 for the minimal model) are discarded.
#'
#' Note: there may be some combinations of dependent variable, independent variable of interest and covariates for which the best model i) is a stepwise regression model with "sign" label = FALSE (F p value > 0.05 and/or t p value > 0.05) while their minimal model received a "sign" label = TRUE, or ii) the independent variable of interest has been excluded from the model.
#'
#' @param tibble a tibble produced by stepwise_regression().
#' @param all.stepwise a logical indicating whether model selection based on AIC should be skipped. If TRUE, the stepwise selection model is systematically chosen as the best model as long as its AIC value could be computed. Default is FALSE.
#'
#' @return A tibble containing, for each combination of dependent variable, independent variable of interest and covariates, i) the corresponding best model, ii) its associated statistics (F p value, t p value) and iii) indication as to whether the independent variable of interest is included or not.
#'
#' Combinations of dependent variable, independent variable of interest and covariates for which a best model could not be defined are discarded, so the output may be shorter than the input tibble.
#'
#' @export
#'
#' @importFrom dplyr bind_rows
#' @importFrom dplyr case_when
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr near
#' @importFrom dplyr select
#' @importFrom magrittr %>%
#' @importFrom purrr map2_chr
#' @importFrom purrr map2_lgl
#' @importFrom purrr map_chr
#' @importFrom purrr map_int
#' @importFrom purrr pmap_chr
#' @importFrom purrr pmap_dbl
#' @importFrom purrr pmap_lgl
#' @importFrom stringr str_c
#' @importFrom stringr str_split
#' @importFrom stringr str_subset
#' @importFrom tidyselect everything
#'
#' @examples
#'
choose_on_aic <- function(tibble, all.stepwise = FALSE) {

  out1 <- tibble %>%
    filter(resid_df_sup_0 == FALSE & !(n_obs > (n_terms_forw + 1)) & !(n_obs > (n_terms_step + 1))) %>%
    filter(minoi_sign == TRUE) %>%
    mutate(best_mod_nm = "minimal model of interest",
           alt_mod_nm = "none",
           best_mod_terms = minoi_terms,
           n_terms_best_mod = n_terms_minoi,
           best_mod_val = minoi_mod)

  if (all.stepwise == FALSE) {
    out2 <- tibble %>%
      filter(resid_df_sup_0 == TRUE | n_obs > (n_terms_forw + 1) | n_obs > (n_terms_step + 1)) %>%
      mutate(forw_terms_alph = map_chr(forw_terms,
                                       ~ str_split(., " ") %>% unlist() %>%
                                         str_subset("[^\\+]") %>% sort() %>% str_c(collapse = " + ")),
             step_terms_alph = map_chr(step_terms,
                                       ~ str_split(., " ") %>% unlist() %>%
                                         str_subset("[^\\+]") %>% sort() %>% str_c(collapse = " + ")),
             back_terms_alph = map_chr(back_terms,
                                       ~ str_split(., " ") %>% unlist() %>%
                                         str_subset("[^\\+]") %>% sort() %>% str_c(collapse = " + ")),
             step_forw_identical = map2_lgl(step_terms_alph, forw_terms_alph, ~ identical(.x, .y)),
             step_back_identical = map2_lgl(step_terms_alph, back_terms_alph, ~ identical(.x, .y)),
             forw_back_identical = map2_lgl(forw_terms_alph, back_terms_alph, ~ identical(.x, .y)),
             step_forw_back_identical = pmap_lgl(list(step_forw_identical,
                                                      step_back_identical,
                                                      forw_back_identical),
                                                 ~ case_when(isTRUE(..1) & isTRUE(..2) & isTRUE(..3) ~ TRUE,
                                                             TRUE ~ FALSE)),
             step_mod_equality = pmap_chr(list(step_forw_back_identical,
                                               step_forw_identical,
                                               step_back_identical,
                                               forw_back_identical),
                                          ~ case_when(isTRUE(..1) ~ "stepwise / forward / backward",
                                                      isTRUE(..2) ~ "stepwise / forward",
                                                      isTRUE(..3) ~ "stepwise / backward",
                                                      isTRUE(..4) ~ "forward / backward",
                                                      TRUE ~ "none identical")),
             step_min_AIC = pmap_dbl(list(step_AIC,
                                          forw_AIC,
                                          back_AIC),
                                     ~ min(..., na.rm = TRUE)) %>%
               {case_when(
                 near(step_AIC, .) & near(forw_AIC, .) & near(back_AIC, .) ~ "stepwise / forward / backward",
                 near(step_AIC, .) & near(forw_AIC, .) ~ "stepwise / forward",
                 near(step_AIC, .) & near(back_AIC, .) ~ "stepwise / backward",
                 near(forw_AIC, .) & near(back_AIC, .) ~ "forward / backward",
                 near(step_AIC, .) ~ "stepwise",
                 near(forw_AIC, .) ~ "forward",
                 near(back_AIC, .) ~ "backward")},
             best_mod_nm = map2_chr(step_min_AIC, step_mod_equality,
                                    ~ case_when(.x == "stepwise / forward / backward" & .y == "stepwise / forward / backward"
                                                ~ "stepwise selection",
                                                .x == "stepwise / forward" & .y == "stepwise / forward"
                                                ~ "stepwise selection",
                                                .x == "stepwise / backward" & .y == "stepwise / backward"
                                                ~ "stepwise selection",
                                                .x == "forward / backward" & .y == "forward / backward"
                                                ~ "forward selection",
                                                .x == "stepwise" & !.y %in% c("stepwise / forward / backward", "stepwise / forward", "stepwise / backward")
                                                ~ "stepwise selection",
                                                .x == "forward" & !.y %in% c("stepwise / forward / backward", "stepwise / forward", "forward / backward")
                                                ~ "forward selection",
                                                .x == "backward" & !.y %in% c("stepwise / forward / backward", "stepwise / backward", "forward / backward")
                                                ~ "backward elimination",
                                                TRUE ~ "bug")),
             alt_mod_nm = map2_chr(step_min_AIC, step_mod_equality,
                                   ~ case_when(.x == "stepwise / forward / backward" & .y == "stepwise / forward / backward"
                                               ~ "forward / backward",
                                               .x == "stepwise / forward" & .y == "stepwise / forward"
                                               ~ "forward selection",
                                               .x == "stepwise / backward" & .y == "stepwise / backward"
                                               ~ "backward elimination",
                                               .x == "forward / backward" & .y == "forward / backward"
                                               ~ "backward elimination",
                                               .x == "stepwise" & !.y %in% c("stepwise / forward / backward", "stepwise / forward", "stepwise / backward")
                                               ~ "none",
                                               .x == "forward" & !.y %in% c("stepwise / forward / backward", "stepwise / forward", "forward / backward")
                                               ~ "none",
                                               .x == "backward" & !.y %in% c("stepwise / forward / backward", "stepwise / backward", "forward / backward")
                                               ~ "none",
                                               TRUE ~ "bug")),
             best_mod_terms = case_when(best_mod_nm == "stepwise selection" ~ step_terms,
                                        best_mod_nm == "forward selection" ~ forw_terms,
                                        best_mod_nm == "backward elimination" ~ back_terms,
                                        TRUE ~ "bug"),
             n_terms_best_mod = map_int(best_mod_terms,
                                        ~ str_split(., " ") %>% unlist() %>% str_subset("[^\\+]") %>% length()),
             best_mod_val = case_when(best_mod_nm == "stepwise selection" ~ step_mod,
                                      best_mod_nm == "forward selection" ~ forw_mod,
                                      best_mod_nm == "backward elimination" ~ back_mod))

    output <- bind_rows(out1, out2) %>%
      mutate(indepoi_in_best = pmap_lgl(list(best_mod_nm,
                                             indepoi_in_minoi,
                                             indepoi_in_step,
                                             indepoi_in_forw,
                                             indepoi_in_back),
                                        ~ case_when(..1 == "minimal model of interest" & ..2 ~ TRUE,
                                                    ..1 == "stepwise selection" & ..3 ~ TRUE,
                                                    ..1 == "forward selection" & ..4 ~ TRUE,
                                                    ..1 == "backward elimination" & ..5 ~ TRUE,
                                                    TRUE ~ FALSE)),
             best_mod_r_squared = pmap_dbl(list(best_mod_nm,
                                                n_terms_best_mod,
                                                minoi_r_squared,
                                                step_r_squared,
                                                step_adj_r_squared,
                                                forw_r_squared,
                                                forw_adj_r_squared,
                                                back_r_squared,
                                                back_adj_r_squared),
                                           ~ case_when(..1 == "minimal model of interest" ~ ..3,
                                                       ..1 == "stepwise selection" & ..2 == 1 ~ ..4,
                                                       ..1 == "stepwise selection" & ..2 > 1 ~ ..5,
                                                       ..1 == "forward selection" & ..2 == 1 ~ ..6,
                                                       ..1 == "forward selection" & ..2 > 1 ~ ..7,
                                                       ..1 == "backward elimination" & ..2 == 1 ~ ..8,
                                                       ..1 == "backward elimination" & ..2 > 1 ~ ..9)),
             best_mod_t_p_value = pmap_dbl(list(best_mod_nm,
                                                minoi_t_p_value,
                                                step_t_p_value,
                                                forw_t_p_value,
                                                back_t_p_value),
                                           ~ case_when(..1 == "minimal model of interest" ~ ..2,
                                                       ..1 == "stepwise selection" ~ ..3,
                                                       ..1 == "forward selection" ~ ..4,
                                                       ..1 == "backward elimination" ~ ..5)),
             best_mod_f_p_value = pmap_dbl(list(best_mod_nm,
                                                minoi_f_p_value,
                                                step_f_p_value,
                                                forw_f_p_value,
                                                back_f_p_value),
                                           ~ case_when(..1 == "minimal model of interest" ~ ..2,
                                                       ..1 == "stepwise selection" ~ ..3,
                                                       ..1 == "forward selection" ~ ..4,
                                                       ..1 == "backward elimination" ~ ..5)),
             best_mod_coeff_indepoi = pmap_dbl(list(indepoi_in_best, best_mod_val, indep_var_oi),
                                               ~ ifelse(isTRUE(..1),
                                                        ..2$coefficients[[..3]],
                                                        NA_real_)),
             best_mod_corr_indepoi = case_when(best_mod_coeff_indepoi < 0 ~ "negative",
                                               best_mod_coeff_indepoi > 0 ~ "positive",
                                               TRUE ~ NA_character_)) %>%
      select(1:resid_df_sup_0,
                    best_mod_nm, alt_mod_nm, best_mod_terms, n_terms_best_mod, indepoi_in_best, best_mod_val, best_mod_coeff_indepoi,
                    best_mod_corr_indepoi, best_mod_t_p_value, best_mod_f_p_value, best_mod_r_squared, everything())
  } else {
    out2 <- tibble %>%
      filter(resid_df_sup_0 == TRUE | n_obs > (n_terms_forw + 1) | n_obs > (n_terms_step + 1)) %>%
      mutate(step_terms_alph = map_chr(step_terms,
                                       ~ str_split(., " ") %>% unlist() %>%
                                         str_subset("[^\\+]") %>% sort() %>% str_c(collapse = " + ")),
             best_mod_nm = "stepwise selection",
             best_mod_terms = step_terms,
             n_terms_best_mod = map_int(best_mod_terms,
                                        ~ str_split(., " ") %>% unlist() %>% str_subset("[^\\+]") %>% length()),
             best_mod_val = step_mod)

    output <- bind_rows(out1, out2) %>%
      mutate(indepoi_in_best = pmap_lgl(list(best_mod_nm,
                                             indepoi_in_minoi,
                                             indepoi_in_step),
                                        ~ case_when(..1 == "minimal model of interest" & ..2 ~ TRUE,
                                                    ..1 == "stepwise selection" & ..3 ~ TRUE,
                                                    TRUE ~ FALSE)),
             best_mod_r_squared = pmap_dbl(list(best_mod_nm,
                                                n_terms_best_mod,
                                                minoi_r_squared,
                                                step_r_squared,
                                                step_adj_r_squared),
                                           ~ case_when(..1 == "minimal model of interest" ~ ..3,
                                                       ..1 == "stepwise selection" & ..2 == 1 ~ ..4,
                                                       ..1 == "stepwise selection" & ..2 > 1 ~ ..5)),
             best_mod_t_p_value = pmap_dbl(list(best_mod_nm,
                                                minoi_t_p_value,
                                                step_t_p_value),
                                           ~ case_when(..1 == "minimal model of interest" ~ ..2,
                                                       ..1 == "stepwise selection" ~ ..3)),
             best_mod_f_p_value = pmap_dbl(list(best_mod_nm,
                                                minoi_f_p_value,
                                                step_f_p_value),
                                           ~ case_when(..1 == "minimal model of interest" ~ ..2,
                                                       ..1 == "stepwise selection" ~ ..3)),
             best_mod_coeff_indepoi = pmap_dbl(list(indepoi_in_best, best_mod_val, indep_var_oi),
                                               ~ ifelse(isTRUE(..1),
                                                        ..2$coefficients[[..3]],
                                                        NA_real_)),
             best_mod_corr_indepoi = case_when(best_mod_coeff_indepoi < 0 ~ "negative",
                                               best_mod_coeff_indepoi > 0 ~ "positive",
                                               TRUE ~ NA_character_)) %>%
      select(1:resid_df_sup_0,
                    best_mod_nm, best_mod_terms, n_terms_best_mod, indepoi_in_best, best_mod_val, best_mod_coeff_indepoi,
                    best_mod_corr_indepoi, best_mod_t_p_value, best_mod_f_p_value, best_mod_r_squared, everything())
  }
}
benvallin/banban documentation built on Sept. 29, 2023, 5:46 a.m.