knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

This has a few examples of specifying model subsets and fitting linear models in order to choose the "best" model via a model selection criterion.

library(erikmisc)
library(dplyr)

Using e_model_all_subsets_formula

In this example, variables "a" and "b" are always in the model, while variables "c" and "d" are subject to model selection up to a second-order model.

In biostatistical or epidemiological applications, certain demographic variables such as age, sex, and race are often included regardless of "statistical signicance", and other variables are subject to selection.

# Two variables always in the model with a second-order model of two others
e_model_all_subsets_formula(
    var_formula = NULL
  , x_var_names = list(c("a", "b"), c("c", "d"))
  , y_var_name  = "z"
  , max_scope   = list("always", "SO")
  , sw_return   = c("formula", "table")[2]
  )

Applications

Linear model fit and model selection

Note that in these examples we are ignoring model fit; see e_plot_lm_diagostics for diagnostics.

We'll fit all the linear models between the maximal and minimal models:

Then we select the model with the lowest BIC statistic.

dat_mtcars_e |> head(3)

# table of the candidate models
form_table <-
  e_model_all_subsets_formula(
      var_formula = NULL
    , x_var_names = list(c("cyl", "carb"), c("disp", "hp"), c("wt", "vs", "am", "gear"))
    , y_var_name  = "mpg"
    , max_scope   = list("always", "TWI", "FO")
    , sw_return   = c("formula", "table")[2]
    )
print(form_table)

# generate candidate model formulas
form_list <-
  e_model_all_subsets_formula(
      var_formula = NULL
    , x_var_names = list(c("cyl", "carb"), c("disp", "hp"), c("wt", "vs", "am", "gear"))
    , y_var_name  = "mpg"
    , max_scope   = list("always", "TWI", "FO")
    , sw_return   = c("formula", "table")[1]
    )

# fit models and calculate model selection criteria
lm_fit_list <- list()
lm_fit_list_criteria <- list()

for (i_form in seq_len(length(form_list))) {
  # model fit
  lm_fit_list[[ i_form ]] <-
    stats::lm(
      formula = form_list[[ i_form ]]
    , data    = dat_mtcars_e
    )

  # model selection criteria
  lm_fit_list_criteria[[ i_form ]] <-
    e_lm_model_criteria(
      lm_fit    = lm_fit_list[[ i_form ]]
    , dat_fit   = dat_mtcars_e
    , model_id  = i_form
    )
}

# sort the models by a criterion
lm_fit_criteria <-
  lm_fit_list_criteria |>
  dplyr::bind_rows() |>
  dplyr::arrange(
    # arrange models by this model criterion
    bic               # ascending by bic (best on top)
    # dplyr::desc(r2)   # descending by r2 (best on top)
  )

# print the head and tail of fit criteria table
lm_fit_criteria |> head()
lm_fit_criteria |> tail(3)

# best model ID
lm_fit_criteria$model_id[1]

# summarize "best" model
lm_fit_list[[ lm_fit_criteria$model_id[1] ]] |>
  car::Anova(type = 3)
lm_fit_list[[ lm_fit_criteria$model_id[1] ]] |>
  summary()

Longitudinal linear model fit and model selection

For the purpose of illustrating the method, variables x1, x2, and x3 are generated as indepedent standard normal random variables. The correct model would remove all of those variables and keep only the fixed effect of Type and the random effect (1 | Subject).

We'll fit all the longitudinal linear models between the maximal and minimal models:

Then we select the model with the lowest cAIC statistic.

set.seed(76543)

dat_ergoStool_e <-
  dat_ergoStool_e |>
  # add a few noise variables just to use for model selection example
  dplyr::mutate(
    x1 = stats::rnorm(n = dplyr::n())
  , x2 = stats::rnorm(n = dplyr::n())
  , x3 = stats::rnorm(n = dplyr::n())
  )
dat_ergoStool_e |> head(8)

# table of the candidate models
form_table <-
  e_model_all_subsets_formula(
      var_formula = NULL
    , x_var_names = list(c("Type", "(1 | Subject)"), c("x1", "x2", "x3"))
    , y_var_name  = "Effort"
    , max_scope   = list("always", "SO")
    , sw_return   = c("formula", "table")[2]
    )
print(form_table)

# generate candidate model formulas
form_list <-
  e_model_all_subsets_formula(
      var_formula = NULL
    , x_var_names = list(c("Type", "(1 | Subject)"), c("x1", "x2", "x3"))
    , y_var_name  = "Effort"
    , max_scope   = list("always", "SO")
    , sw_return   = c("formula", "table")[1]
    )

# fit models and calculate model selection criteria
lmer_fit_list <- list()
lmer_fit_list_criteria <- list()

for (i_form in seq_len(length(form_list))) {
  # model fit
  lmer_fit_list[[ i_form ]] <-
    lme4::lmer(
      formula = form_list[[ i_form ]]
    , REML    = TRUE
    , data    = dat_ergoStool_e
    )

  # model selection criteria
  lmer_fit_list_criteria[[ i_form ]] <-
    e_lm_model_criteria(
      lm_fit    = lmer_fit_list[[ i_form ]]
    , dat_fit   = dat_ergoStool_e
    , model_id  = i_form
    )
}

# sort the models by a criterion
lmer_fit_criteria <-
  lmer_fit_list_criteria |>
  dplyr::bind_rows() |>
  dplyr::arrange(
    # arrange models by this model criterion
    caic              # ascending by caic (best on top)
    # dplyr::desc(r2)   # descending by r2 (best on top)
  )

# print the head and tail of fit criteria table
lmer_fit_criteria |> head()
lmer_fit_criteria |> tail(3)

# best model ID
lmer_fit_criteria$model_id[1]

# summarize "best" model
lmer_fit_list[[ lmer_fit_criteria$model_id[1] ]] |>
  car::Anova(type = 3)
lmer_fit_list[[ lmer_fit_criteria$model_id[1] ]] |>
  summary()

Longitudinal linear (in parallel, for 200+ models)

## Set up parallel backend
  ## From: https://www.blasbenito.com/post/02_parallelizing_loops_with_r/
  library(doParallel)
  # use n-1 cores
  n_cores <- parallel::detectCores() - 1
  # create the cluster
  my_cluster <-
    parallel::makeCluster(
      spec = n_cores
    , type = "PSOCK"
    )
  # check cluster definition (optional)
  print(my_cluster)
  # register it to be used by %dopar%
  doParallel::registerDoParallel(cl = my_cluster)
  # check if it is registered (optional)
  foreach::getDoParRegistered()
  # how many workers are available? (optional)
  foreach::getDoParWorkers()


# table of the candidate models
form_table <-
  e_model_all_subsets_formula(
      var_formula = NULL
    , x_var_names = list(c("Type", "(1 | Subject)"), c("x1", "x2", "x3"))
    , y_var_name  = "Effort"
    , max_scope   = list("always", "SO")
    , sw_return   = c("formula", "table")[2]
    )
print(form_table)

# generate candidate model formulas
form_list <-
  e_model_all_subsets_formula(
      var_formula = NULL
    , x_var_names = list(c("Type", "(1 | Subject)"), c("x1", "x2", "x3"))
    , y_var_name  = "Effort"
    , max_scope   = list("always", "SO")
    , sw_return   = c("formula", "table")[1]
    )

# fit models and calculate model selection criteria
lmer_fit_list <-
  foreach::foreach(
    i_form = seq_len(length(form_list))
  , .combine = "c"
  ) %dopar% {
    # model fit
    lme4::lmer(
      formula = form_list[[ i_form ]]
    , REML    = TRUE
    , data    = dat_ergoStool_e
    )
  }

lmer_fit_list_criteria <-
  foreach::foreach(
    i_form = seq_len(length(form_list))
  #, .combine = "c"
  ) %dopar% {
    # model selection criteria
    e_lm_model_criteria(
      lm_fit    = lmer_fit_list[[ i_form ]]
    , dat_fit   = dat_ergoStool_e
    , model_id  = i_form
    )
  }



# sort the models by a criterion
lmer_fit_criteria <-
  lmer_fit_list_criteria |>
  dplyr::bind_rows() |>
  dplyr::arrange(
    # arrange models by this model criterion
    caic              # ascending by caic (best on top)
    # dplyr::desc(r2)   # descending by r2 (best on top)
  )

# print the head and tail of fit criteria table
lmer_fit_criteria |> head()
lmer_fit_criteria |> tail(3)

# best model ID
lmer_fit_criteria$model_id[1]

# summarize "best" model
lmer_fit_list[[ lmer_fit_criteria$model_id[1] ]] |>
  car::Anova(type = 3)
lmer_fit_list[[ lmer_fit_criteria$model_id[1] ]] |>
  summary()


## stop cluster
parallel::stopCluster(cl = my_cluster)


erikerhardt/erikmisc documentation built on April 17, 2025, 10:48 a.m.