knitr::opts_chunk$set(echo = TRUE)
s_ex05p01_data_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_flem.csv"
# s_ex05p01_data_path <- file.path(here::here(), "docs", "data", "asm_bw_flem.csv")

Problem 1: Helmert Contrasts

Use the dataset of Body Weight and Breed to fit a linear model of Body Weight on Breed. The aim of this exercise is to use the Helmert-contrasts instead of the defautl Treatment contrasts. What are the estimable functions used in the Helmert-Contrasts and what are the effects that are reported for the different levels of the factor Breed? Verify your answer by comparing estimable functions of solutions of the least squares normal equations to the effects of lm().

The dataset is available under

cat(s_ex05p01_data_path, "\n")

Hint

Solution

tbl_e05p01 <- readr::read_csv(file = s_ex05p01_data_path)

The data is sorted according to the breed

tbl_e05p01 <- tbl_e05p01[order(tbl_e05p01$Breed),]
tbl_e05p01

A solution vector depends on the matrix $X$ and on the vector $y$. The vector $y$ is directly obtained from the column Body Weight of the dataframe.

vec_y <- tbl_e05p01$`Body Weight`

The matrix $X$ can be obtained from the function model.matrix().

mat_X <- model.matrix(lm(`Body Weight` ~ 0 + Breed, data = tbl_e05p01))
mat_X <- cbind(matrix(1, nrow = nrow(mat_X), ncol = 1), mat_X)
mat_X

A solution for the least squares normal equations is obtained by

mat_xtx_ginv <- MASS::ginv(crossprod(mat_X))
mat_xty <- crossprod(mat_X, vec_y)
mat_b0 <- crossprod(mat_xtx_ginv, mat_xty)
mat_b0

The solutions correspond to the vector $b^0$ with the components

vec_b0 <- c("\\mu", "\\alpha_1", "\\alpha_2", "\\alpha_3")
cat("$$\n")
cat("b^0 = ")
cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_b0), collapse = "\n"), "\n")
cat(" = \n")
cat(paste0(rmdhelp::bmatrix(pmat = round(mat_b0, digits = 3)), collapse = "\n"), "\n")
cat("$$\n")
opts <- options()

Change contrasts

options(contrasts = c("contr.helmert", "contr.helmert"))
getOption("contrasts")
tbl_e05p01$Breed <- as.factor(tbl_e05p01$Breed)
c_mat_helmert <- contrasts(tbl_e05p01$Breed)
c_mat_helmert

Add a columns of all ones to c_mat_helmert.

c_mat_helmert <- cbind(matrix(1, nrow = nrow(c_mat_helmert), ncol = 1), c_mat_helmert)
c_mat_helmert

Compute the inverse of c_mat_helmert

est_mat_helmert <- solve(c_mat_helmert)
est_mat_helmert

The first row tells us how the intercept is computed. The intercept ($\hat{b}_0$) here corresponds to

$$\hat{b}0 = {1\over 3}\left( E(y{1.}) + E(y_{2.}) + E(y_{3.})\right)$$ where $E(y_{1.})$, $E(y_{2.})$ and $E(y_{3.})$ are the mean values of Body Weight for Angus, Limousin and Simmental animals, respectively.

n_mean_angus <- mean(tbl_e05p01[tbl_e05p01$Breed == "Angus", ]$`Body Weight`)
n_mean_limousin <- mean(tbl_e05p01[tbl_e05p01$Breed == "Limousin", ]$`Body Weight`)
n_mean_simmental <- mean(tbl_e05p01[tbl_e05p01$Breed == "Simmental", ]$`Body Weight`)
mean(c(n_mean_angus, n_mean_limousin,n_mean_simmental))

The second row of est_mat_helmert shows the first estimable function that is used. It corresponds to

$$\hat{b}_1 = {1\over 2}(\alpha_2 - \alpha_1)$$ where $\hat{b}_1$ measures the difference between the breeds Limousin and Angus corresponding to

1/2*(mat_b0[3] - mat_b0[2])

The third row of est_mat_helmert shows how the Body Weight of the breed Simmental is compared to the two other breeds. It is

$$\hat{b}_2 = {1\over 6}(2\alpha_3 - \alpha_2 - \alpha_1)$$

which measures the difference between Simmental and Limousin and Angus together.

1/6 * (2*mat_b0[4] - mat_b0[3] - mat_b0[2])
lm_helmert <- lm(`Body Weight` ~ Breed, data = tbl_e05p01)
coefficients(lm_helmert)["(Intercept)"]

The estimates for the Breed effects can be seen from the list of all coefficients.

coefficients(lm_helmert)
options(opts)
vec_nobs <- c(10,30,100)
s_ex05p02_data_path <- "https://charlotte-ngs.github.io/asmss2022/data/asm_bw_flem.csv"
# s_ex05p02_data_path <- file.path(here::here(), "docs", "data", "asm_bw_flem.csv")

Problem 2: Simulation

Use the results of the regression of Body Weight on Breast Circumference and simulate three datasets with r vec_nobs[1], r vec_nobs[2] and r vec_nobs[3] observations respectively. What is the number of observations required to obtain the same regression results from the simulated dataset that you used in the simulation?

The original dataset is available under:

cat(s_ex05p02_data_path, "\n")

Solution

tbl_bwbc <- readr::read_csv(file = s_ex05p02_data_path)
lm_bwbc <- lm(`Body Weight` ~ `Breast Circumference`, data = tbl_bwbc)
b0 <- coefficients(lm_bwbc)["(Intercept)"]
b1 <- coefficients(lm_bwbc)["`Breast Circumference`"]
mean_bc <- mean(tbl_bwbc$`Breast Circumference`)
sd_bc <- sd(tbl_bwbc$`Breast Circumference`)
sry_bwbc <- summary(lm_bwbc)
sd_res <- sry_bwbc$sigma

and returns a dataset according to these input values.

simulate_bwbc <- function(pn_nrobs, pn_b0, pn_b1, pn_mean_x, pn_sd_x, pn_sd_res){
  vec_bc <- rnorm(pn_nrobs, mean = pn_mean_x, sd = pn_sd_x)
  vec_bw <- pn_b0 + pn_b1 * vec_bc + rnorm(pn_nrobs, mean = 0, sd = pn_sd_res)
  tbl_result <- tibble::tibble(Animal = c(1:pn_nrobs),
                               BC = vec_bc,
                               BW = vec_bw)
  return(tbl_result)
}

With the above defined function, we can create a list with the three datasets

set.seed(1928)
vec_nobs <- c(10,30,100)
l_data_set <- lapply(vec_nobs, simulate_bwbc, 
                     pn_b0 = b0, 
                     pn_b1 = b1, 
                     pn_mean_x = mean_bc, 
                     pn_sd_x = sd_bc, 
                     pn_sd_res = sd_res) 

Each of the datasets is analysed by lm() and the results are again stored in a list

l_lm_result <- lapply(l_data_set, function(x) lm(BW ~ BC, data = x))

Collect the resutls into a table

tbl_result <- NULL
for (cur_res in l_lm_result){
  sry_cur_res <- summary(cur_res)
  tbl_cur <- tibble::tibble(NrObs = length(cur_res$residuals),
              Intercept_Estimate = coefficients(sry_cur_res)["(Intercept)", "Estimate"],
              Intercept_StdErr = coefficients(sry_cur_res)["(Intercept)", "Std. Error"],
              Slope_Estimate = coefficients(sry_cur_res)["BC", "Estimate"],
              Slope_StdErr = coefficients(sry_cur_res)["BC", "Std. Error"],
              ResStdErr = sry_cur_res$sigma)
  if (is.null(tbl_result)){
    tbl_result <- tbl_cur
  } else {
    tbl_result <- dplyr::bind_rows(tbl_result, tbl_cur)
  }
}
knitr::kable(tbl_result)

The true values used in the sumulation are

tbl_true_values <- tibble::tibble(Intercept = b0,
                                  Slope = b1,
                                  ResStdErr = sd_res)
knitr::kable(tbl_true_values,
             longtable = TRUE,
             booktabs = TRUE)


charlotte-ngs/asmss2022 documentation built on June 7, 2022, 1:33 p.m.