knitr::opts_chunk$set(echo = TRUE)
if (params$isonline){
  s_ex07p01_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_mod_sel.csv" 
} else {
  s_ex07p01_path <- file.path(here::here(), "docs", "data", "asm_bw_mod_sel.csv")
}

Problem 1: Model Selection

Given is a dataset with body weight as a response and different other variables and factors. The columns Breed and BCS (Body Condition Score) are taken as factors. All other columns are taken as predictor variables. The column Animal is not used in any model. Use model selection to find the relevant predictor variables and factors for the best linear fixed effect model. Use the estimated mean square error $C_p$ as a quality measure for a single linear model. The dataset to be analysed can be obtained from

cat(s_ex07p01_path, "\n")

Your Tasks

Solution

Because, we need the residual standard deviation of the full model and backward elimination starts with the full model, we start with backward elimination

Backward Elimination

if (params$isonline){
  s_ex07p01_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_mod_sel.csv" 
} else {
  s_ex07p01_path <- file.path(here::here(), "docs", "data", "asm_bw_mod_sel.csv")
}
tbl_ex07p01 <- readr::read_csv(file = s_ex07p01_path)
tbl_ex07p01$BCS <- as.factor(tbl_ex07p01$BCS)
tbl_ex07p01$Breed <- as.factor(tbl_ex07p01$Breed)
s_resp <- "BW"
vec_cols_to_ignore <- c("Animal")
vec_pred_full <- setdiff(colnames(tbl_ex07p01), c(s_resp, vec_cols_to_ignore))
fmlm_full <- as.formula(paste0(s_resp, " ~ ", 
                               paste0(vec_pred_full, collapse = " + "), 
                               collapse = ""))
lm_ex07p01_full <- lm(formula = fmlm_full, data = tbl_ex07p01)
smry_ex07p01_full <- summary(lm_ex07p01_full)
n_sd_full <- smry_ex07p01_full$sigma
n_ssqr_full <- crossprod(residuals(lm_ex07p01_full))
n_ssqr_full

From the full model select one variable at the time, remove that variable, fit a reduced model and compute for that model the residual sum of squares. The model that increases the residual sum of squares the least, is selected and for that model the $C_p$ value is compute.

tbl_belim_res <- NULL
for (p in vec_pred_full){
  fm_update_cur <- as.formula(paste0(". ~ . - ", p, collapse = ""))
  lm_cur <- update(lm_ex07p01_full, fm_update_cur)
  vec_res <- residuals(lm_cur)
  tbl_cur <- tibble::tibble(Variable = p,
                            RSSQ = crossprod(vec_res))
  if (is.null(tbl_belim_res)){
    tbl_belim_res <- tbl_cur
  } else {
    tbl_belim_res <- dplyr::bind_rows(tbl_belim_res, tbl_cur)
  }
}
tbl_belim_res

From tbl_belim_res, we determine the variable which is excluded

n_idx_var_exclude <- which(tbl_belim_res$RSSQ == min(tbl_belim_res$RSSQ))
s_var_exclude <- tbl_belim_res$Variable[n_idx_var_exclude]
s_var_exclude

The model after this first round of elimination corresponds to the model that results when taking away the variable r s_var_exclude from the full model.

vec_pred_cur <- setdiff(vec_pred_full, s_var_exclude)
fm_cur <- as.formula(paste0(s_resp, " ~ ", 
                               paste0(vec_pred_cur, collapse = " + "), 
                               collapse = ""))
lm_cur <- lm(formula = fm_cur, data = tbl_ex07p01)
summary(lm_cur)

For the current model, we have to compute the $C_p$ value

n_nr_obs <- nrow(tbl_ex07p01)
n_rssq <- crossprod(residuals(lm_cur))
# model size is the number of predictors plus the intercept
n_model_size <- length(vec_pred_cur) + 1 
n_cp_cur <- n_rssq / (n_sd_full^2) - n_nr_obs + 2 * n_model_size
n_cp_cur

Verify, according to https://search.r-project.org/CRAN/refmans/olsrr/html/ols_mallows_cp.html

olsrr::ols_mallows_cp(lm_cur, lm_ex07p01_full)
get_subm_back <- function(plm_cur_model){
  # minimal value for RSSQ
  n_rssq_min <- NULL
  lm_result_sub <- NULL
  # obtain the vector of predictor variables and factors
  vec_pred_cur <- attr(terms(plm_cur_model), "term.labels")
  # loop over vector of predictors and compute RSSQ for each sub-model
  for (p in vec_pred_cur){
    # remove p from predictors
    fm_cur_subm <- as.formula(paste0(". ~ . - ", p, collapse = ""))
    lm_cur_subm <- update(plm_cur_model, fm_cur_subm)
    vec_res_subm <- residuals(lm_cur_subm)
    n_rssq_subm <- crossprod(vec_res_subm)
    # check whether n_rssq_sub is minimal
    if (is.null(n_rssq_min)){
      n_rssq_min <- n_rssq_subm
      lm_result_sub <- lm_cur_subm
    } else {
      if (n_rssq_subm < n_rssq_min){
        n_rssq_min <- n_rssq_subm
        lm_result_sub <- lm_cur_subm
      }
    }
  }
  # return model with minimal rssq
  return(lm_result_sub)
}

The function get_subm_back() can be verified by a call with the full model. Then the sub-model with HEI eliminated should result.

lm_ex07p01_first_subm <- get_subm_back(plm_cur_model = lm_ex07p01_full)
lm_ex07p01_first_subm

The second function computes the $C_p$ value for the obtained sub-model.

compute_cp_value <- function(pn_res_sd_full_model, pn_nr_obs, plm_cur_model){
  n_rssq <- crossprod(residuals(plm_cur_model))
  # model size is the number of predictors plus the intercept
  vec_pred_cur <- attr(terms(plm_cur_model), "term.labels")
  n_model_size <- length(vec_pred_cur) + 1 
  n_cp_cur <- n_rssq / (pn_res_sd_full_model^2) - pn_nr_obs + 2 * n_model_size
  return(n_cp_cur)
}

For the first submodel, we get

compute_cp_value(pn_res_sd_full_model = n_sd_full, 
                 pn_nr_obs = nrow(tbl_ex07p01), 
                 plm_cur_model = lm_ex07p01_first_subm)

Now that we have the two functions ready, we can do the repetition of the elimination process of variables from a model. To make it a little bit easier, we start again with the full model.

n_nr_obs <- nrow(tbl_ex07p01)
lm_current <- lm_ex07p01_full
n_sd_current <- summary(lm_current)$sigma
vec_pred_current <- attr(terms(lm_current), "term.labels")
# initialise a result dataframe
tbl_elim_result <- NULL
# loop as long as, there are variables in vec_pred_current
while (length(vec_pred_current) > 0){
  # get variables and C_p of current model
  tbl_elim_current <- tibble::tibble(`Current Model` = as.character(formula(lm_current))[3],
                                      Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                                                            pn_nr_obs = n_nr_obs, 
                                                            plm_cur_model = lm_current))
  # store variables and C_p value of current model in result
  if (is.null(tbl_elim_result)) {
    tbl_elim_result <- tbl_elim_current
  } else {
    tbl_elim_result <- dplyr::bind_rows(tbl_elim_result, tbl_elim_current)
  }
  # get new submodel
  lm_current <- get_subm_back(plm_cur_model = lm_current)
  vec_pred_current <- attr(terms(lm_current), "term.labels")
}
tbl_elim_result

In the above shown result dataframe, the model which only fits an intercept is missing. Hence, we add that model to the results

lm_inter <- lm(BW ~ 1, data = tbl_ex07p01)
tbl_elim_inter <- tibble::tibble(`Current Model` = as.character(formula(lm_inter))[3],
                                      Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                                                            pn_nr_obs = n_nr_obs, 
                                                            plm_cur_model = lm_inter))
tbl_elim_result <- dplyr::bind_rows(tbl_elim_result, tbl_elim_inter)
tbl_elim_result
n_model_idx <- which(tbl_elim_result$Cp == min(tbl_elim_result$Cp))
tbl_elim_result[n_model_idx,]

Forward Selection

In forward selection, we start with the smallest model with only an intercept. Based on the preparation for backward selection, we can start with the iteration after an initialisation of the current model with the smallest model. The major difference between forward selection and backward selection is the way how subsequent submodels are generated. In forward selection, predictor variables or factors are added. This is done in a function called get_subm_forward().

get_subm_forward <- function(plm_cur_model, pvec_pred_full){
  # minimal value for RSSQ
  n_rssq_min <- NULL
  lm_result_sub <- NULL
  # loop over vector of predictors and compute RSSQ for each sub-model
  for (p in pvec_pred_full){
    # remove p from predictors
    fm_cur_subm <- as.formula(paste0(". ~ . + ", p, collapse = ""))
    lm_cur_subm <- update(plm_cur_model, fm_cur_subm)
    vec_res_subm <- residuals(lm_cur_subm)
    n_rssq_subm <- crossprod(vec_res_subm)
    # check whether n_rssq_sub is minimal
    if (is.null(n_rssq_min)){
      n_rssq_min <- n_rssq_subm
      lm_result_sub <- lm_cur_subm
    } else {
      if (n_rssq_subm < n_rssq_min){
        n_rssq_min <- n_rssq_subm
        lm_result_sub <- lm_cur_subm
      }
    }
  }
  # return model with minimal rssq
  return(lm_result_sub)
}

The above function can be used in the iterative process of forward selection

# initialise current model
lm_current_forward <- lm(BW ~ 1, data = tbl_ex07p01)
n_sd_current_forward <- summary(lm_current_forward)$sigma
vec_pred_current_forward <- attr(terms(lm_current_forward), "term.labels")
n_nr_pred_fact <- length(vec_pred_full)
tbl_result_forward <- NULL
# start iteration
while (length(vec_pred_current_forward) < n_nr_pred_fact){
  # results for current model
  tbl_cur_forward <- tibble::tibble(`Current Model` = as.character(formula(lm_current_forward))[3],
                                    Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                                                            pn_nr_obs = n_nr_obs, 
                                                            plm_cur_model = lm_current_forward))
  # collect result
  if (is.null(tbl_result_forward)){
    tbl_result_forward <- tbl_cur_forward
  } else {
    tbl_result_forward <- dplyr::bind_rows(tbl_result_forward, tbl_cur_forward)
  }
  # update current model
  lm_current_forward <- get_subm_forward(plm_cur_model = lm_current_forward, pvec_pred_full = vec_pred_full)
  vec_pred_current_forward <- attr(terms(lm_current_forward), "term.labels")

}
# add full model
tbl_cur_forward <- tibble::tibble(`Current Model` = as.character(formula(lm_current_forward))[3],
                                    Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                                                            pn_nr_obs = n_nr_obs, 
                                                            plm_cur_model = lm_current_forward))
tbl_result_forward <- dplyr::bind_rows(tbl_result_forward, tbl_cur_forward)
tbl_result_forward

The model with the lowest $C_p$ value is

n_model_idx <- which(tbl_result_forward$Cp == min(tbl_result_forward$Cp))
tbl_result_forward[n_model_idx,]

Because the $C_p$ values for backward selection and forward selection are negative, they cannot be used as estimates for the mean square error (MSE), because MSE must be positive. This indicates that $C_p$ is not a good model selection criterion. Often, people just ignore the models with the negative $C_p$ values and take the one that has the smallest positive $C_p$ value besides the full model. In our case, this results in the model

tbl_result_modified <- tbl_result_forward[tbl_result_forward$Cp > 1, ]
tbl_result_modified[tbl_result_modified$Cp == min(tbl_result_modified$Cp),]

It might be worth while to use AIC or BIC as alternative criteria. In R the function MASS::stepAIC() can be used to do model selection based on AIC.

MASS::stepAIC(lm_ex07p01_full)

The result of MASS::stepAIC() also shows the variable BC and the factor Breed to be important. This means the following model would be the best model that is selected from the data.

lm_ex07p01_best <- lm(BW ~ BC + Breed, data = tbl_ex07p01)
(smry_ex07p01_best <- summary(lm_ex07p01_best))

The model that was used to generate the data is the model with only BC and an intercept. Hence the true model is

lm_ex07p01_true <- lm(BW ~ BC, data = tbl_ex07p01)
summary(lm_ex07p01_true)

This shows the difficulty in the analysis of data when there are correlated variables and factors. As it seams they make it almost impossible to find the true model. But in any case there is no problem with the best model, it is a very good model and it is able to explain r round(100 * smry_ex07p01_best$adj.r.squared, digit = 1) percent of the variation in the response.

Problem 2: Verification of Model Selection Results

Use the R-package olsrr to verify the results of Problem 1. Have a look at the documentation of olsrr at https://github.com/rsquaredacademy/olsrr. In a first step, we are going to read the data from

if (params$isonline){
  s_ex07p02_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_mod_sel.csv" 
} else {
  s_ex07p02_path <- file.path(here::here(), "docs", "data", "asm_bw_mod_sel.csv")
}
cat(s_ex07p02_path, "\n")

Solution

tbl_ex07p02 <- readr::read_csv(file = s_ex07p02_path)
tbl_ex07p02$BCS <- as.factor(tbl_ex07p02$BCS)
tbl_ex07p02$Breed <- as.factor(tbl_ex07p02$Breed)
tbl_ex07p02
lm_ex07p02_full <- lm(BW ~ BC + HEI + BCS + Breed, data = tbl_ex07p02)
olsrr::ols_step_best_subset(lm_ex07p02_full)

The results of olsrr::ols_step_best_subset() are consistent with our calculation of Problem 1.



charlotte-ngs/asmss2022 documentation built on June 7, 2022, 1:33 p.m.