# Global knitr chunk options: echo the R code in the rendered document
knitr::opts_chunk$set(echo = TRUE)
# Location of the dataset: the online URL when the document is rendered
# with params$isonline = TRUE, otherwise the local copy in docs/data.
if (params$isonline) {
  s_ex07p01_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_mod_sel.csv"
} else {
  s_ex07p01_path <- file.path(here::here(), "docs", "data", "asm_bw_mod_sel.csv")
}
Given is a dataset with body weight as a response and different other variables and factors. The columns `Breed` and `BCS` (Body Condition Score) are taken as factors. All other columns are taken as predictor variables. The column `Animal` is not used in any model. Use model selection to find the relevant predictor variables and factors for the best linear fixed effect model. Use Mallows' $C_p$ statistic, an estimate of the mean square prediction error, as a quality measure for a single linear model. The dataset to be analysed can be obtained from
# Show the path from which the dataset is read
cat(s_ex07p01_path, "\n")
Because we need the residual standard deviation of the full model, and backward elimination starts with the full model, we begin with backward elimination. In a first step, the data are read and the columns `Breed` and `BCS` are converted
to factors.

# Dataset location (online vs local copy)
if (params$isonline) {
  s_ex07p01_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_mod_sel.csv"
} else {
  s_ex07p01_path <- file.path(here::here(), "docs", "data", "asm_bw_mod_sel.csv")
}
# Read the data and convert BCS and Breed to factors
tbl_ex07p01 <- readr::read_csv(file = s_ex07p01_path)
tbl_ex07p01$BCS <- as.factor(tbl_ex07p01$BCS)
tbl_ex07p01$Breed <- as.factor(tbl_ex07p01$Breed)
# Fit the full linear model with BW as response and every remaining
# column (except Animal) as predictor; keep the residual standard
# deviation and the residual sum of squares of this full model, since
# both are needed for the C_p values of all sub-models.
s_resp <- "BW"
vec_cols_to_ignore <- c("Animal")
vec_pred_full <- setdiff(colnames(tbl_ex07p01), c(s_resp, vec_cols_to_ignore))
# build the model formula BW ~ pred1 + pred2 + ... directly
fmlm_full <- stats::reformulate(vec_pred_full, response = s_resp)
lm_ex07p01_full <- lm(formula = fmlm_full, data = tbl_ex07p01)
smry_ex07p01_full <- summary(lm_ex07p01_full)
n_sd_full <- smry_ex07p01_full$sigma
n_ssqr_full <- crossprod(residuals(lm_ex07p01_full))
n_ssqr_full
From the full model, select one variable at a time, remove that variable, fit a reduced model and compute for that model the residual sum of squares. The model that increases the residual sum of squares the least is selected, and for that model the $C_p$ value is computed.
# First round of backward elimination: drop each predictor in turn from
# the full model and record the residual sum of squares (RSSQ) of the
# reduced model. Results are collected in a list and bound once, which
# avoids re-copying the growing tibble on every iteration.
tbl_belim_res <- dplyr::bind_rows(
  lapply(vec_pred_full, function(p) {
    # reduced model with predictor p removed
    lm_cur <- update(lm_ex07p01_full, as.formula(paste0(". ~ . - ", p)))
    tibble::tibble(Variable = p,
                   RSSQ = as.numeric(crossprod(residuals(lm_cur))))
  })
)
tbl_belim_res
From tbl_belim_res
, we determine the variable which is excluded
# Index of the candidate model with the smallest residual sum of squares.
# which.min() returns a single index even when several RSSQ values are
# tied, so s_var_exclude is guaranteed to be one variable name (the
# original which(x == min(x)) could yield several and break the
# downstream code that expects a scalar).
n_idx_var_exclude <- which.min(tbl_belim_res$RSSQ)
s_var_exclude <- tbl_belim_res$Variable[n_idx_var_exclude]
s_var_exclude
The model after this first round of elimination corresponds to the model that results when taking away the variable stored in `s_var_exclude`
from the full model.
# Refit the model without the eliminated variable
vec_pred_cur <- setdiff(vec_pred_full, s_var_exclude)
fm_cur <- stats::reformulate(vec_pred_cur, response = s_resp)
lm_cur <- lm(formula = fm_cur, data = tbl_ex07p01)
summary(lm_cur)
For the current model, we have to compute the $C_p$ value
# Number of observations in the dataset
n_nr_obs <- nrow(tbl_ex07p01)
# Residual sum of squares of the current (reduced) model
n_rssq <- crossprod(residuals(lm_cur))
# model size is the number of predictors plus the intercept
n_model_size <- length(vec_pred_cur) + 1
# Mallows' C_p: RSS / sigma_full^2 - n + 2 * p, where sigma_full is the
# residual standard deviation of the full model (n_sd_full)
n_cp_cur <- n_rssq / (n_sd_full^2) - n_nr_obs + 2 * n_model_size
n_cp_cur
Verify, according to https://search.r-project.org/CRAN/refmans/olsrr/html/ols_mallows_cp.html
# Cross-check our C_p value against olsrr's implementation of Mallows' C_p
olsrr::ols_mallows_cp(lm_cur, lm_ex07p01_full)
#' Backward-elimination step for a linear model.
#'
#' Drops each term of the given model in turn and returns the sub-model
#' with the smallest residual sum of squares, i.e. the one whose removed
#' term hurts the fit the least.
#'
#' @param plm_cur_model lm object from which one term is to be removed
#' @return lm object of the best sub-model, or NULL when the model has
#'   no terms left to drop
get_subm_back <- function(plm_cur_model){
  lm_result_sub <- NULL
  n_rssq_min <- Inf
  # candidate terms that can be dropped from the current model
  for (p in attr(terms(plm_cur_model), "term.labels")){
    lm_cur_subm <- update(plm_cur_model, as.formula(paste0(". ~ . - ", p)))
    n_rssq_subm <- as.numeric(crossprod(residuals(lm_cur_subm)))
    # keep the sub-model with the minimal residual sum of squares
    if (n_rssq_subm < n_rssq_min){
      n_rssq_min <- n_rssq_subm
      lm_result_sub <- lm_cur_subm
    }
  }
  lm_result_sub
}
The function get_subm_back()
can be verified by a call with the full model. Then the sub-model with HEI
eliminated should result.
# Verify get_subm_back() with the full model: the returned sub-model
# should be the one with HEI eliminated
lm_ex07p01_first_subm <- get_subm_back(plm_cur_model = lm_ex07p01_full)
lm_ex07p01_first_subm
The second function computes the $C_p$ value for the obtained sub-model.
#' Compute Mallows' C_p statistic for a fitted linear model.
#'
#' C_p = RSS / sigma_full^2 - n + 2 * p, where RSS is the residual sum
#' of squares of the given model, sigma_full the residual standard
#' deviation of the full model, n the number of observations and p the
#' number of model parameters (predictor terms plus the intercept).
#'
#' @param pn_res_sd_full_model residual standard deviation of the full model
#' @param pn_nr_obs number of observations
#' @param plm_cur_model lm object for which C_p is computed
#' @return the C_p value (as a 1x1 matrix, like crossprod() returns)
compute_cp_value <- function(pn_res_sd_full_model, pn_nr_obs, plm_cur_model){
  # residual sum of squares of the current model
  n_rssq <- crossprod(residuals(plm_cur_model))
  # model size: number of predictor terms plus one for the intercept
  n_model_size <- length(attr(terms(plm_cur_model), "term.labels")) + 1
  n_rssq / (pn_res_sd_full_model^2) - pn_nr_obs + 2 * n_model_size
}
For the first submodel, we get
# C_p of the first sub-model (HEI removed), based on the residual
# standard deviation of the full model
compute_cp_value(pn_res_sd_full_model = n_sd_full,
                 pn_nr_obs = nrow(tbl_ex07p01),
                 plm_cur_model = lm_ex07p01_first_subm)
Now that we have the two functions ready, we can do the repetition of the elimination process of variables from a model. To make it a little bit easier, we start again with the full model.
# Backward elimination: starting from the full model, repeatedly record
# the current model and its C_p value, then replace the current model by
# the best sub-model (one term removed) until no terms are left.
n_nr_obs <- nrow(tbl_ex07p01)
lm_current <- lm_ex07p01_full
# residual standard deviation of the full model, used in all C_p values
n_sd_current <- summary(lm_current)$sigma
vec_pred_current <- attr(terms(lm_current), "term.labels")
# collect per-model result rows in a list and bind once at the end,
# instead of growing the tibble with bind_rows() in every iteration
l_elim_result <- list()
while (length(vec_pred_current) > 0){
  l_elim_result[[length(l_elim_result) + 1]] <- tibble::tibble(
    `Current Model` = as.character(formula(lm_current))[3],
    Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                          pn_nr_obs = n_nr_obs,
                          plm_cur_model = lm_current))
  # move on to the best sub-model with one term eliminated
  lm_current <- get_subm_back(plm_cur_model = lm_current)
  vec_pred_current <- attr(terms(lm_current), "term.labels")
}
tbl_elim_result <- dplyr::bind_rows(l_elim_result)
tbl_elim_result
In the above shown result dataframe, the model which only fits an intercept is missing. Hence, we add that model to the results
# The elimination loop stops before the intercept-only model, so add
# that model (and its C_p value) to the result table by hand.
lm_inter <- lm(BW ~ 1, data = tbl_ex07p01)
tbl_elim_inter <- tibble::tibble(
  `Current Model` = as.character(formula(lm_inter))[3],
  Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                        pn_nr_obs = n_nr_obs,
                        plm_cur_model = lm_inter))
tbl_elim_result <- dplyr::bind_rows(tbl_elim_result, tbl_elim_inter)
tbl_elim_result
# Show the model(s) with the smallest C_p value
n_model_idx <- which(tbl_elim_result$Cp == min(tbl_elim_result$Cp))
tbl_elim_result[n_model_idx,]
In forward selection, we start with the smallest model with only an intercept. Based on the preparation for backward selection, we can start with the iteration after an initialisation of the current model with the smallest model. The major difference between forward selection and backward selection is the way how subsequent submodels are generated. In forward selection, predictor variables or factors are added. This is done in a function called get_subm_forward()
.
#' Forward-selection step for a linear model.
#'
#' Adds each not-yet-included predictor in turn to the given model and
#' returns the augmented model with the smallest residual sum of
#' squares, i.e. the added term that improves the fit the most.
#'
#' @param plm_cur_model current lm object
#' @param pvec_pred_full vector with all available predictor names
#' @return lm object of the best augmented model, or NULL when every
#'   predictor in pvec_pred_full is already part of the model
get_subm_forward <- function(plm_cur_model, pvec_pred_full){
  n_rssq_min <- NULL
  lm_result_sub <- NULL
  # only consider predictors that are not yet in the current model:
  # updating with ". ~ . + p" for a term already present is a no-op, so
  # the unchanged model could be returned and the surrounding
  # forward-selection iteration would never make progress
  vec_pred_cur <- attr(terms(plm_cur_model), "term.labels")
  for (p in setdiff(pvec_pred_full, vec_pred_cur)){
    # add p to the predictors of the current model
    fm_cur_subm <- as.formula(paste0(". ~ . + ", p))
    lm_cur_subm <- update(plm_cur_model, fm_cur_subm)
    n_rssq_subm <- crossprod(residuals(lm_cur_subm))
    # keep the augmented model with the minimal residual sum of squares
    if (is.null(n_rssq_min) || n_rssq_subm < n_rssq_min){
      n_rssq_min <- n_rssq_subm
      lm_result_sub <- lm_cur_subm
    }
  }
  return(lm_result_sub)
}
The above function can be used in the iterative process of forward selection
# Forward selection: start from the intercept-only model and add the
# best predictor one at a time until the full model is reached.
# NOTE(review): the C_p values below reuse n_sd_current and n_nr_obs
# from the backward-elimination section (residual standard deviation of
# the FULL model and number of observations) — confirm those chunks ran
# first. The original chunk also computed an unused
# n_sd_current_forward from the intercept model; that value would be
# the wrong sigma for C_p and has been removed.
lm_current_forward <- lm(BW ~ 1, data = tbl_ex07p01)
n_nr_pred_fact <- length(vec_pred_full)
# helper: one result row (right-hand side of the model formula and C_p)
get_cp_row_forward <- function(plm){
  tibble::tibble(`Current Model` = as.character(formula(plm))[3],
                 Cp = compute_cp_value(pn_res_sd_full_model = n_sd_current,
                                       pn_nr_obs = n_nr_obs,
                                       plm_cur_model = plm))
}
# collect rows in a list (intercept model first) and bind once
l_result_forward <- list(get_cp_row_forward(lm_current_forward))
while (length(attr(terms(lm_current_forward), "term.labels")) < n_nr_pred_fact){
  lm_current_forward <- get_subm_forward(plm_cur_model = lm_current_forward,
                                         pvec_pred_full = vec_pred_full)
  l_result_forward[[length(l_result_forward) + 1]] <-
    get_cp_row_forward(lm_current_forward)
}
tbl_result_forward <- dplyr::bind_rows(l_result_forward)
tbl_result_forward
The model with the lowest $C_p$ value is
# Model(s) with the smallest C_p value from forward selection
n_model_idx <- which(tbl_result_forward$Cp == min(tbl_result_forward$Cp))
tbl_result_forward[n_model_idx,]
Because some of the $C_p$ values from backward selection and forward selection are negative, they cannot be used as estimates of the mean square error (MSE), since an MSE must be positive. This indicates that $C_p$ is not a good model selection criterion here. Often, the models with negative $C_p$ values are simply ignored and the model with the smallest positive $C_p$ value besides the full model is taken. In our case, this results in the model
# Keep only models with C_p above 1 and take the smallest remaining C_p.
# NOTE(review): the surrounding text speaks of the "smallest positive"
# C_p, but the threshold used here is 1, not 0 — confirm which cutoff
# is intended.
tbl_result_modified <- tbl_result_forward[tbl_result_forward$Cp > 1, ]
tbl_result_modified[tbl_result_modified$Cp == min(tbl_result_modified$Cp),]
It might be worthwhile to use AIC or BIC as alternative criteria. In R, the function MASS::stepAIC()
can be used to do model selection based on AIC.
# Stepwise model selection based on AIC, starting from the full model
MASS::stepAIC(lm_ex07p01_full)
The result of MASS::stepAIC()
also shows the variable BC
and the factor Breed
to be important. This means the following model would be the best model that is selected from the data.
# Best model according to the selection results: BC plus the Breed factor.
# The outer parentheses make the assigned summary auto-print.
lm_ex07p01_best <- lm(BW ~ BC + Breed, data = tbl_ex07p01)
(smry_ex07p01_best <- summary(lm_ex07p01_best))
The model that was used to generate the data is the model with only BC
and an intercept. Hence the true model is
# The true model used to generate the data: intercept and BC only
lm_ex07p01_true <- lm(BW ~ BC, data = tbl_ex07p01)
summary(lm_ex07p01_true)
This shows the difficulty in the analysis of data when there are correlated variables and factors. As it seems, they make it almost impossible to find the true model. But in any case there is no problem with the best model: it is a very good model and it is able to explain about `round(100 * smry_ex07p01_best$adj.r.squared, digits = 1)`
percent of the variation in the response.
Use the R-package olsrr
to verify the results of Problem 1. Have a look at the documentation of olsrr
at https://github.com/rsquaredacademy/olsrr. In a first step, we are going to read the data from
# Problem 2: location of the dataset (online URL vs local copy)
if (params$isonline) {
  s_ex07p02_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_mod_sel.csv"
} else {
  s_ex07p02_path <- file.path(here::here(), "docs", "data", "asm_bw_mod_sel.csv")
}
cat(s_ex07p02_path, "\n")

# Read the data and convert BCS and Breed to factors
tbl_ex07p02 <- readr::read_csv(file = s_ex07p02_path)
tbl_ex07p02$BCS <- as.factor(tbl_ex07p02$BCS)
tbl_ex07p02$Breed <- as.factor(tbl_ex07p02$Breed)
tbl_ex07p02
# Full model for Problem 2 with all predictors and factors
lm_ex07p02_full <- lm(BW ~ BC + HEI + BCS + Breed, data = tbl_ex07p02)
# Best-subset selection over all sub-models of the full model,
# reporting C_p, AIC and other criteria for each subset size
olsrr::ols_step_best_subset(lm_ex07p02_full)
The results of olsrr::ols_step_best_subset()
are consistent with our calculation of Problem 1.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.