knitr::opts_chunk$set(echo = TRUE)
s_ex05p01_data_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_flem.csv" # s_ex05p01_data_path <- file.path(here::here(), "docs", "data", "asm_bw_flem.csv")
Use the dataset of Body Weight
and Breed
to fit a linear model of Body Weight
on Breed
. The aim of this exercise is to use the Helmert
-contrasts instead of the defautl Treatment
contrasts. What are the estimable functions used in the Helmert
-Contrasts and what are the effects that are reported for the different levels of the factor Breed
? Verify your answer by comparing estimable functions of solutions of the least squares normal equations to the effects of lm()
.
The dataset is available under
cat(s_ex05p01_data_path, "\n")
options(contrasts = c("contr.helmert", "contr.helmert"))
to change the default contrasts to the desired Helmert
-Contraststbl_e05p01 <- readr::read_csv(file = s_ex05p01_data_path)
The data is sorted according to the breed
tbl_e05p01 <- tbl_e05p01[order(tbl_e05p01$Breed),] tbl_e05p01
A solution vector depends on the matrix $X$ and on the vector $y$. The vector $y$ is directly obtained from the column Body Weight
of the dataframe.
vec_y <- tbl_e05p01$`Body Weight`
The matrix $X$ can be obtained from the function model.matrix()
.
mat_X <- model.matrix(lm(`Body Weight` ~ 0 + Breed, data = tbl_e05p01)) mat_X <- cbind(matrix(1, nrow = nrow(mat_X), ncol = 1), mat_X) mat_X
A solution for the least squares normal equations is obtained by
mat_xtx_ginv <- MASS::ginv(crossprod(mat_X)) mat_xty <- crossprod(mat_X, vec_y) mat_b0 <- crossprod(mat_xtx_ginv, mat_xty) mat_b0
The solutions correspond to the vector $b^0$ with the components
vec_b0 <- c("\\mu", "\\alpha_1", "\\alpha_2", "\\alpha_3") cat("$$\n") cat("b^0 = ") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_b0), collapse = "\n"), "\n") cat(" = \n") cat(paste0(rmdhelp::bmatrix(pmat = round(mat_b0, digits = 3)), collapse = "\n"), "\n") cat("$$\n")
opts <- options()
Change contrasts
options(contrasts = c("contr.helmert", "contr.helmert")) getOption("contrasts")
tbl_e05p01$Breed <- as.factor(tbl_e05p01$Breed) c_mat_helmert <- contrasts(tbl_e05p01$Breed) c_mat_helmert
Add a columns of all ones to c_mat_helmert
.
c_mat_helmert <- cbind(matrix(1, nrow = nrow(c_mat_helmert), ncol = 1), c_mat_helmert) c_mat_helmert
Compute the inverse of c_mat_helmert
est_mat_helmert <- solve(c_mat_helmert) est_mat_helmert
The first row tells us how the intercept is computed. The intercept ($\hat{b}_0$) here corresponds to
$$\hat{b}0 = {1\over 3}\left( E(y{1.}) + E(y_{2.}) + E(y_{3.})\right)$$
where $E(y_{1.})$, $E(y_{2.})$ and $E(y_{3.})$ are the mean values of Body Weight
for Angus, Limousin and Simmental animals, respectively.
n_mean_angus <- mean(tbl_e05p01[tbl_e05p01$Breed == "Angus", ]$`Body Weight`) n_mean_limousin <- mean(tbl_e05p01[tbl_e05p01$Breed == "Limousin", ]$`Body Weight`) n_mean_simmental <- mean(tbl_e05p01[tbl_e05p01$Breed == "Simmental", ]$`Body Weight`) mean(c(n_mean_angus, n_mean_limousin,n_mean_simmental))
The second row of est_mat_helmert
shows the first estimable function that is used. It corresponds to
$$\hat{b}_1 = {1\over 2}(\alpha_2 - \alpha_1)$$
where $\hat{b}_1$ measures the difference between the breeds Limousin
and Angus
corresponding to
1/2*(mat_b0[3] - mat_b0[2])
The third row of est_mat_helmert
shows how the Body Weight
of the breed Simmental
is compared to the two other breeds. It is
$$\hat{b}_2 = {1\over 6}(2\alpha_3 - \alpha_2 - \alpha_1)$$
which measures the difference between Simmental
and Limousin
and Angus
together.
1/6 * (2*mat_b0[4] - mat_b0[3] - mat_b0[2])
lm()
. The estimate for the intercept islm_helmert <- lm(`Body Weight` ~ Breed, data = tbl_e05p01) coefficients(lm_helmert)["(Intercept)"]
The estimates for the Breed
effects can be seen from the list of all coefficients.
coefficients(lm_helmert)
options(opts)
vec_nobs <- c(10,30,100) s_ex05p02_data_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_flem.csv" # s_ex05p02_data_path <- file.path(here::here(), "docs", "data", "asm_bw_flem.csv")
Use the results of the regression of Body Weight
on Breast Circumference
and simulate three datasets with r vec_nobs[1]
, r vec_nobs[2]
and r vec_nobs[3]
observations respectively. What is the number of observations required to obtain the same regression results from the simulated dataset that you used in the simulation?
The original dataset is available under:
cat(s_ex05p02_data_path, "\n")
Body Weight
on Breast Circumference
tbl_bwbc <- readr::read_csv(file = s_ex05p02_data_path) lm_bwbc <- lm(`Body Weight` ~ `Breast Circumference`, data = tbl_bwbc) b0 <- coefficients(lm_bwbc)["(Intercept)"] b1 <- coefficients(lm_bwbc)["`Breast Circumference`"] mean_bc <- mean(tbl_bwbc$`Breast Circumference`) sd_bc <- sd(tbl_bwbc$`Breast Circumference`) sry_bwbc <- summary(lm_bwbc) sd_res <- sry_bwbc$sigma
Breast Circumference
Breast Circumference
and returns a dataset according to these input values.
simulate_bwbc <- function(pn_nrobs, pn_b0, pn_b1, pn_mean_x, pn_sd_x, pn_sd_res){ vec_bc <- rnorm(pn_nrobs, mean = pn_mean_x, sd = pn_sd_x) vec_bw <- pn_b0 + pn_b1 * vec_bc + rnorm(pn_nrobs, mean = 0, sd = pn_sd_res) tbl_result <- tibble::tibble(Animal = c(1:pn_nrobs), BC = vec_bc, BW = vec_bw) return(tbl_result) }
With the above defined function, we can create a list with the three datasets
set.seed(1928) vec_nobs <- c(10,30,100) l_data_set <- lapply(vec_nobs, simulate_bwbc, pn_b0 = b0, pn_b1 = b1, pn_mean_x = mean_bc, pn_sd_x = sd_bc, pn_sd_res = sd_res)
Each of the datasets is analysed by lm()
and the results are again stored in a list
l_lm_result <- lapply(l_data_set, function(x) lm(BW ~ BC, data = x))
Collect the resutls into a table
tbl_result <- NULL for (cur_res in l_lm_result){ sry_cur_res <- summary(cur_res) tbl_cur <- tibble::tibble(NrObs = length(cur_res$residuals), Intercept_Estimate = coefficients(sry_cur_res)["(Intercept)", "Estimate"], Intercept_StdErr = coefficients(sry_cur_res)["(Intercept)", "Std. Error"], Slope_Estimate = coefficients(sry_cur_res)["BC", "Estimate"], Slope_StdErr = coefficients(sry_cur_res)["BC", "Std. Error"], ResStdErr = sry_cur_res$sigma) if (is.null(tbl_result)){ tbl_result <- tbl_cur } else { tbl_result <- dplyr::bind_rows(tbl_result, tbl_cur) } } knitr::kable(tbl_result)
The true values used in the sumulation are
tbl_true_values <- tibble::tibble(Intercept = b0, Slope = b1, ResStdErr = sd_res) knitr::kable(tbl_true_values, longtable = TRUE, booktabs = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.