\newcommand{\points}[1] {\begin{flushright}\textbf{#1}\end{flushright}}
# knitr::opts_chunk$set(echo = FALSE, results = 'hide') #knitr::opts_chunk$set(concordance=TRUE) knitr::knit_hooks$set(hook_convert_odg = rmdhelp::hook_convert_odg) # write the parameters to file b_params_to_file <- FALSE # check whether seed is set and output it to a file s_this_rmd_file = basename(ifelse(rstudioapi::isAvailable(), rstudioapi::getSourceEditorContext()$path, whereami::thisfile())) if (is.null(params$seed)){ stop(" ** Error parameter seed has not been set.") } else { set.seed(params$seed) s_params_file <- paste0(format(Sys.time(), '%Y%m%d%H%M%S'), "_params_", s_this_rmd_file, ".txt", collapse = "") if (b_params_to_file) dput(params, file = s_params_file) }
# Assign Points for Q1 lPointsQ1 <- list(TaskA = 8, TaskB = 2, TaskC = 6, TaskD = 0) nPointQ1Total <- sum(unlist(lPointsQ1))
# Assign Points for Q2 lPointsQ2 <- list(TaskA = 7, TaskB = 15, TaskC = 6, TaskD = 0) nPointQ2Total <- sum(unlist(lPointsQ2))
# Assign Points for Q3 lPointsQ3 <- list(TaskA = 4, TaskB = 5, TaskC = 0, TaskD = 0) nPointQ3Total <- sum(unlist(lPointsQ3))
# Assign Points for Q4 lPointsQ4 <- list(TaskA = 8, TaskB = 15, TaskC = 15, TaskD = 0) nPointQ4Total <- sum(unlist(lPointsQ4))
nPointOverallTotal <- nPointQ1Total + nPointQ2Total + nPointQ3Total + nPointQ4Total
\thispagestyle{empty}
\fcolorbox{white}{white}{ \centering \parbox[t]{1.0\linewidth}{ \fontsize{12pt}{20pt}\selectfont % \vspace*{0.5cm} %
Peter von Rohr \\ Institute of Agricultural Sciences\\ D-USYS\\ ETH Zurich \vspace*{0.5cm} }
}
\vspace*{2cm}
\fcolorbox{white}{white}{ \parbox[t]{1.0\linewidth}{ \centering \fontsize{25pt}{40pt}\selectfont % \vspace*{0.2cm} 751-7602-00 V \ Solutions for Exam \ Applied Statistical Methods \ in Animal Sciences \ SS 2021
\vspace*{0.7cm} % Space between the end of the title and the bottom of the grey box }
}
\vspace*{1cm}
\begin{tabular}{p{3cm}p{6cm}}
Date: & $31^{st}$ May 2021 \
& \
& \
Name: & r params$firstname
r params$name
\
& \
& \
Legi-Nr: & r params$leginr
\
\end{tabular}
\vspace{5ex}
\begin{center}
\begin{tabular}{|p{3cm}|c|c|}
\hline
Problem & Maximum Number of Points & Number of Points Reached\
\hline
1 & r nPointQ1Total
& \
\hline
2 & r nPointQ2Total
& \
\hline
3 & r nPointQ3Total
& \
\hline
4 & r nPointQ4Total
& \
\hline
Total & r nPointOverallTotal
& \
\hline
\end{tabular}
\end{center}
\vspace{0.5cm}
\textit{Questions in German are in italics}
\clearpage \pagebreak
set.seed(1566) n = 30 k = 1 #x = matrix(sample(c(0, 1), n * k, replace = T), nrow = n, ncol = k) x = matrix(rnorm(n*k, mean = 3, sd = 0.5), nrow = n, ncol = k) X = cbind(1, x) betaTrue = c(1.07, 3.32) y = X %*% betaTrue + rnorm(n, 0, 2) dfSimData <- data.frame(X1 = x[,1], y = y)
The same dataset is analysed with two different regression models. The R-Output of both analyses is given by Output A and Output B.
\textit{Wir haben den gleichen Datensatz mit zwei unterschiedlichen linearen Regressionsmodellen analysiert. Der R-Output dieser beiden Analysen ist nachfolgend als Output A und Output B gegeben. }
summary(snp_regA <- lm(y ~ X1, data = dfSimData))
summary(snp_regB <- lm(y ~ -1 + X1, data = dfSimData))
\clearpage \pagebreak
\begin{enumerate}
\item[a)] Give the formulas of both statistical models which belong to Output A and Output B. Where is the main difference between both models?
\textit{Geben Sie die Formeln der beiden statistischen Modelle an, welche zu Output A und Output B geführt haben. Wo liegt der hauptsächliche Unterschied zwischen den beiden Modellen?}
\end{enumerate}
\points{r lPointsQ1$TaskA
}
\clearpage \pagebreak
\begin{enumerate} \item[b)] For both analyses a plot was produced. Assign Plots 1 and 2 to Outputs A and B.
\textit{ Für die zwei Analysen wurden auch zwei Plots gemacht. Ordnen Sie die Plots 1 und 2 den Outputs A und B zu.}
\end{enumerate}
\points{r lPointsQ1$TaskB
}
plot(dfSimData$X1, dfSimData$y, type = "p", xlab = "X1", ylab = "y", xlim = c(0,max(dfSimData$X1)+0.2), ylim = c(-1,max(dfSimData$y)+1) ) abline(snp_regB, col = "red")
plot(dfSimData$X1, dfSimData$y, type = "p", xlab = "X1", ylab = "y", xlim = c(0,max(dfSimData$X1)+0.2), ylim = c(-1,max(dfSimData$y)+1)) abline(snp_regA, col = "red")
\clearpage \pagebreak
\begin{enumerate} \item[c)] Enter the parameter estimates from Outputs A and B in Plots 1 and 2 by marking their lengths in the plots.
\textit{Zeichnen Sie die geschätzten Parameter (Estimate) aus den Outputs A und B in die Plots 1 und 2 ein}
\end{enumerate}
\points{r lPointsQ1$TaskC
}
plot(dfSimData$X1, dfSimData$y, type = "p", xlab = "X1", ylab = "y", xlim = c(0,max(dfSimData$X1)+0.2), ylim = c(-1,max(dfSimData$y)+1) ) abline(snp_regB, col = "red")
plot(dfSimData$X1, dfSimData$y, type = "p", xlab = "X1", ylab = "y", xlim = c(0,max(dfSimData$X1)+0.2), ylim = c(-1,max(dfSimData$y)+1)) abline(snp_regA, col = "red")
The following plot shows parameters from Output B
#rmdhelp::use_odg_graphic(ps_path = "odg/p01plot1.odg") knitr::include_graphics(path = "odg/p01plot1.png")
The following plot shows parameters from Output A
#rmdhelp::use_odg_graphic(ps_path = "odg/p01plot2.odg") knitr::include_graphics(path = "odg/p01plot2.png")
\clearpage \pagebreak
set.seed(1221) n_nr_animal <- 12 n_mean_bw <- 531 n_sd_bw <- 6.1 n_min_bw <- n_mean_bw - 3 * n_sd_bw n_max_bw <- n_mean_bw + 3 * n_sd_bw vec_bw <- round(runif(n_nr_animal, min = n_min_bw, max = n_max_bw), digits = 0) n_reg_coef_sw_bw <- 0.451 n_sd_sw <- 4.7 n_intercept <- -31.73 vec_sw <- round(n_intercept + n_reg_coef_sw_bw * vec_bw + rnorm(n_nr_animal, mean = 0, sd = n_sd_sw), digits = 0) tbl_sw_bw <- tibble::tibble( Animal = c(1:n_nr_animal), BodyWeight = vec_bw, SlaughterWeight = vec_sw ) # write file, if it does not exist s_data_p02_file <- "asm_exam_p02.csv" s_local_data_p02_path <- file.path(here::here(), "docs", "data", s_data_p02_file) if (!file.exists(s_local_data_p02_path)) readr::write_csv2(tbl_sw_bw, file = s_local_data_p02_path)
The following table contains body weight and slaughter weight for r n_nr_animal
animals. Before the farmer sells the animal to the slaughter house, it is weighed on the farm. The slaughter weight is determined by the slaughter house.
\textit{Die folgende Tabelle enthält Lebendgewicht (BodyWeight
) und Schlachtgewicht (SlaughterWeight
) für r n_nr_animal
Tiere. Vor der Schlachtung wird das Tier auf dem Betrieb noch gewogen. Das Schlachtgewicht wird im Schlachthof bestimmt.}
knitr::kable(tbl_sw_bw)
\begin{enumerate}
\item[a)] Please specify the equation that models SlaughterWeight
(response variable) as a regression on BodyWeight
(predictor variable). Based on the specified regression equation and based on the dataset, create a table with knowns and unknowns.
\textit{Bitte geben Sie eine Modellgleichung, welche SlaughterWeight
(Zielgrösse) als Regression auf BodyWeight
(Predictorvariable) modelliert. Basierend auf der spezifizierten Regressionsgleichung und basierend auf dem Datensatz, geben Sie in einer Tabelle an, welche Grössen bekannt und welche unbekannt sind.}
\end{enumerate}
\points{r lPointsQ2$TaskA
}
Regression Model:
Table of knowns and unknowns
tbl_known_unknown <- tibble::tibble(Quantity = c("", "", "", "", ""), `Known/Unknown` = c("", "", "", "", "")) knitr::kable(tbl_known_unknown)
\clearpage \pagebreak
$$y_i = b_0 + b_1 * x_{i} + e_i$$ where $b_0$ is the intercept, $b_1$ the regression slope, $e_i$ the random error and $x_i$ the bodyweight of animal $i$. Using matrix-vector notation, we get
$$y = Xb + e$$ with $y$ being the vector of slaughterweights, $b$ the vector with the intercept and the regression slope, $e$ the vector of random residuals and $X$ is the matrix having $n$ rows and two columns where the first column is all ones and the seond column contains the bodyweight of all animals.
tbl_known_unknown <- tibble::tibble(Quantity = c("$y$", "$X$", "$b$", "$e$", "$\\sigma_e^2$"), `Known/Unknown` = c("known", "known", "unknown", "unknown", "unknown")) knitr::kable(tbl_known_unknown)
\clearpage \pagebreak
\begin{enumerate} \item[b)] The following programming code in R does a Bayesian estimation of the unknowns of the regression model. Please complete the code where indicated (lines after comment starting with "TODO") such that estimates of unknowns are obtained.
\textit{Der nachfolgende Programmcode in R ergibt eine Bayes'sche Schätzung der unbekannten im Regressionsmodell. Bitte vervollständigen Sie den nachfolgenden Programmcode so, dass die Schätzungen der Unbekannten Grössen im Regressionsmodell resultieren. Die zu ergänzenden Stellen sind mit einem Kommentar, welcher mit dem Wort "TODO" beginnt, markiert.}
\end{enumerate}
\points{r lPointsQ2$TaskB
}
s_data_p02_path <- file.path("https://charlotte-ngs.github.io/gelasmss2021/data", s_data_p02_file)
r s_data_p02_path
. 01 # read the data 02 s_data_p02_path <- "https://charlotte-ngs.github.io/gelasmss2021/data/asm_exam_p02.csv" 03 tbl_reg_sw_bw <- readr::read_csv2(file = s_data_p02_path) 04 05 # take number of observations from tbl_reg_sw_bw 06 n_nr_obs <- nrow(tbl_reg_sw_bw) 07 08 # define Matrix X 09 X <- matrix(c(rep(1,n_nr_obs), tbl_reg_sw_bw$BodyWeight), ncol = 2) 10 # observations as vector y 11 y <- tbl_reg_sw_bw$SlaughterWeight 12 # fix constants 13 nuRes <- 4 14 varResidual <- 1 15 scaleRes <- varResidual * (nuRes - 2)/nuRes 16 mu <- mean(y) 17 ycorr <- y - mu 18 # intialise estimates for intercept, slope and residual variance 19 beta <- c(0,0) 20 meanBeta <- c(0, 0) 21 sigma <- 1 22 meanSigma <- 0 23 # loop over iterations of the Gibbs Sampler 24 niter <- 1000 25 for (iter in 1:niter){ 26 # sampling intercept 27 w <- y - X[, 2] * beta[2] 28 x <- X[, 1] 29 xpxi <- 1/(t(x) %*% x) 30 # TODO: compute mean of conditional distribution 31 betaHat <- 32 # TODO: draw sample of intercept from normal distribution 33 beta[1] <- 34 # sampling slope 35 w <- y - X[, 1] * beta[1] 36 x <- X[, 2] 37 xpxi <- 1/(t(x) %*% x) 38 # TODO: compute mean of conditional distribution 39 betaHat <- 40 # TODO: draw sample for slope from normal distribution 41 beta[2] <- 42 # sample residual variance 43 sigma <- (crossprod(ycorr) + nuRes * scaleRes) / rchisq(1, n_nr_obs + nuRes) 44 # TODO: save current sample of beta to meanBeta and sigma to meanSigma 45 meanBeta <- 46 meanSigma <- 47 # output every 200th sample 48 if (iter %% 200 == 0){ 49 cat(" * Iteration: ", iter, "\n") 50 cat(" * Intercept: ", beta[1], "\n") 51 cat(" * Slope: ", beta[2], "\n") 52 cat(" * Residual Variance: ", sigma, "\n") 53 } 54 } 55 56 # output estimates 57 cat(" * Bayes Estimates\n") 58 cat(" * Intercept: ", meanBeta[1]/niter, "\n") 59 cat(" * Slope: ", meanBeta[2]/niter, "\n") 60 cat(" * Residual Variance: ", meanSigma/niter, "\n")
\clearpage \pagebreak
# set seed set.seed(7821) # read the data s_data_p02_path <- "https://charlotte-ngs.github.io/gelasmss2021/data/asm_exam_p02.csv" tbl_reg_sw_bw <- readr::read_csv2(file = s_data_p02_path) # take number of observations from tbl_reg_sw_bw n_nr_obs <- nrow(tbl_reg_sw_bw) # define Matrix X X <- matrix(c(rep(1,n_nr_obs), tbl_reg_sw_bw$BodyWeight), ncol = 2) # observations as vector y y <- tbl_reg_sw_bw$SlaughterWeight # fix constants nuRes <- 4 varResidual <- 1 scaleRes <- varResidual * (nuRes - 2)/nuRes mu <- mean(y) ycorr <- y - mu # intialise estimates for intercept, slope and residual variance beta <- c(0,0) meanBeta <- c(0, 0) sigma <- 1 meanSigma <- 0 # loop over iterations of the Gibbs Sampler niter <- 1000 for (iter in 1:niter){ # sampling intercept w = y - X[, 2] * beta[2] x = X[, 1] xpxi = 1/(t(x) %*% x) # compute mean of conditional distribution betaHat = t(x) %*% w * xpxi # draw sample of intercept from normal distribution beta[1] = rnorm(1, betaHat, sqrt(xpxi * sigma)) # sampling slope w = y - X[, 1] * beta[1] x = X[, 2] xpxi = 1/(t(x) %*% x) # compute mean of conditional distribution betaHat = t(x) %*% w * xpxi # draw sample for slope from normal distribution beta[2] = rnorm(1, betaHat, sqrt(xpxi * sigma)) # sample residual variance sigma <- (crossprod(ycorr) + nuRes * scaleRes) / rchisq(1, n_nr_obs + nuRes) # save current sample of beta to meanBeta and sigma to meanSigma # add beta samples meanBeta <- meanBeta + beta # add sample for sigma meanSigma <- meanSigma + sigma # output every 200th sample if (iter %% 200 == 0){ cat(" * Iteration: ", iter, "\n") cat(" * Intercept: ", beta[1], "\n") cat(" * Slope: ", beta[2], "\n") cat(" * Residual Variance: ", sigma, "\n") } } # output estimates cat(" * Bayes Estimates\n") cat(" * Intercept: ", meanBeta[1]/niter, "\n") cat(" * Slope: ", meanBeta[2]/niter, "\n") cat(" * Residual Variance: ", meanSigma/niter, "\n")
Validation with lm()
fit_sw_bw <- lm(SlaughterWeight ~ BodyWeight, data = tbl_reg_sw_bw) summary(fit_sw_bw)
\clearpage \pagebreak
x_missing <- 513
\begin{enumerate}
\item[c)] We assume that in addition to the r n_nr_obs
animals shown above there is an additional animal with a body weight of r x_missing
kg. In the slaughterhouse the slaughter weight could not be measured, hence it is missing. How are such missing observations handled in a Bayesian analysis? Please fill out the table with the knowns and unknowns once again taking into account the fact that the observation of the slaughterweight for animal r n_nr_obs+1
is missing.
\textit{Wir nehmen an, dass zusätzlich zu den r n_nr_obs
Tieren, welche in der Tabelle oben gezeigt wurden, noch ein zusätzliches Tier mit einem Lebendgewicht von r x_missing
kg hinzukommt. Im Schlachthof konnte das Schlachtgewicht vom Tier r n_nr_obs+1
nicht erfasst werden und fehlt somit. Wie wird diese fehlende Beobachtung in einer Bayes'schen Analyse behandelt? Bitte ergänzen Sie die Tabelle mit den bekannten und den unbekannten Grössen unter Berücksichtigung der fehlenden Beobachtung des Schlachtgewichts von Tier r n_nr_obs+1
.}
\end{enumerate}
\points{r lPointsQ2$TaskC
}
Expanded Table with knowns and unknowns
tbl_exp_known_unknown <- tibble::tibble(Quantity = c("", "", "", "", "", ""), `Known/Unknown` = c("", "", "", "", "", "")) knitr::kable(tbl_exp_known_unknown)
\clearpage \pagebreak
Expanded Table with knowns and unknowns
tbl_exp_known_unknown <- tibble::tibble(Quantity = c("$y[1:12]$", "$y[13]$", "$X$", "$b$", "$e$", "$\\sigma_e^2$"), `Known/Unknown` = c("known", "unknown", "known", "unknown", "unknown", "unknown")) knitr::kable(tbl_exp_known_unknown)
\clearpage \pagebreak
\begin{enumerate} \item[a)] LASSO is an alternative procedure to Least Squares to estimate parameters of a linear model. Which of the following equations belongs to least square and which belongs to LASSO.
\textit{LASSO ist ein alternatives Parameterschätzverfahren zu Least Squares. Ordnen Sie die nachfolgenden Gleichungen zu den beiden Verfahren Least Squares und LASSO zu.}
\end{enumerate}
\points{r lPointsQ3$TaskA
}
\begin{eqnarray} \hat{\beta}1 & = & argmin{\beta} \left{ ||y - X\beta||^2 + \lambda\sum_{j=1}^p|\beta_j| \right}\nonumber \label{eq:LsEstimateBetaLASSO} \end{eqnarray}
\begin{eqnarray} \hat{\beta}2 & = & argmin{\beta}||y - X\beta||^2 \nonumber \label{eq:LsEstimateBetaExpandRSS} \end{eqnarray}
\clearpage \pagebreak
\begin{enumerate} \item[b)] We analyse a genomic dataset with 25 animals which are genotyped at 100 SNP-locations. Only 5 SNPs have an effect on the observed trait. The SNP-effects are estimated with LASSO. The results of this analysis are shown in the two plots below. The second plot can be used to determine the penalty-term (Log Lambda), such that a minimum number of SNP-effects are considered and such that the mean-squared error is still as small as possible (right dotted). Which are the 5 SNPs (indicate the numbers) with the largest absolute effects, when the penalty-term is determined based on the second plot.
\textit{Wir analysieren einen genomischen Datensatz mit 25 Tieren, welche Daten an 100 SNP-Positionen aufweisen. Davon haben nur 5 SNP einen Effekt auf das gemessene Merkmal. Die SNP-Effekte werden mit LASSO geschätzt. Die Resultate sind in den nachfolgenden Plots gezeigt. Im zweiten Plot können wir den Strafterm (Log Lambda) so bestimmen, dass möglichst wenige SNPs berücksichtigt werden und dass gleichzeitig der mittlere quadrierte Fehler minimal bleibt (rechte gestrichelte Linie). Welches sind die 5 SNPs (bitte Nummern angeben) mit den grössten absoluten Effekten, wenn wir den Strafterm aufgrund des zweiten Plots bestimmen.}
\end{enumerate}
\points{r lPointsQ3$TaskB
}
\vspace{1cm}
# set the seed for reproducibility set.seed(1239) # generate the matrix at a given number of SNP-loci # and for a given number of animals n_nr_snp <- 100 n_nr_animals <- 25 n_min_allele_freq <- 0.2 # generate the matrix using sample() mat_snp <- matrix(data = sample(c(0,1,2), n_nr_snp*n_nr_animals, replace = T, prob = c((1-n_min_allele_freq)^2, 2*(1-n_min_allele_freq)*n_min_allele_freq, n_min_allele_freq^2)), nrow = n_nr_animals, ncol = n_nr_snp) # change the coding mat_snp <- mat_snp - 1 # check dimensions # dim(mat_snp) # significant snp n_nr_sign_snp <- 5 vec_snp_sd <- apply(mat_snp, 2, sd) vec_ord_sd <- order(vec_snp_sd, decreasing = TRUE) vec_sign_snp_idx <- vec_ord_sd[1:n_nr_sign_snp] # snp effects n_snp_a_min <- .5 n_snp_a_max <- 13 vec_snp_eff <- runif(n = n_nr_sign_snp, min = n_snp_a_min, max = n_snp_a_max) # observations n_true_inter <- -12.4 n_true_sd <- 3.41 vec_y <- crossprod(t(mat_snp[,vec_sign_snp_idx]), vec_snp_eff) + rnorm(n = n_nr_animals, mean = n_true_inter, sd = n_true_sd) mat_data_lasso <- cbind(vec_y, mat_snp) # dim(mat_data_lasso) # fit suppressPackageStartupMessages(require(glmnet)) fit_snp <- glmnet(x = mat_snp, y = vec_y) # plot showing fit plot(fit_snp, xvar = "lambda", label = TRUE)
\pagebreak
The penalty-term can be determined based on the right dotted line.
\textit{Der Strafterm kann aufgrund der rechten gestrichelten Linie bestimmt werden.}
\vspace{1cm}
# Crossvalidation cvfitsnp <- cv.glmnet(x = mat_snp, y = vec_y, grouped = FALSE) plot(cvfitsnp)
vec_sign_snp_idx
From the diagram, the marker 53, 28, 70, 84 and 18 had the highest absolute effect.
The fit
summary(cvfitsnp)
\clearpage \pagebreak
\begin{enumerate} \item[c)]
\textit{}
\end{enumerate}
\points{r lPointsQ3$TaskC
}
\clearpage \pagebreak
s_data_p04_file <- "asm_exam_p04.csv" s_data_p04_local_path <- file.path(here::here(), "docs", "data", s_data_p04_file) s_data_p04_url_path <- file.path("https://charlotte-ngs.github.io/gelasmss2021/data", s_data_p04_file) n_nr_snp_cons <- 2 # make the simulation reproducible set.seed(732) if (file.exists(s_data_p04_local_path)){ tbl_data_p04 <- readr::read_csv2(file = s_data_p04_local_path) } else { # pedigree n_nr_ani <- 10 n_nr_founder <- 4 n_nr_non_founder <- n_nr_ani - n_nr_founder tbl_ped <- tibble::tibble(ID = c((n_nr_founder + 1):n_nr_ani), Sire = c(1, 1, 2, 2, 5, 5), Dam = c(3, 4, 3, 3, 7, 8)) # install (if necessary) andload the simulation package pkg <- "sim1000G" if (!is.element(pkg, installed.packages())) install.packages(pkg) require(sim1000G) # get the vcf file that comes with the package, TODO: adapt that vcf file examples_dir = system.file("examples", package = "sim1000G") vcf_file = file.path(examples_dir,"region.vcf.gz") # read specified vcf file with parameters (currently as given in vignette) vcf = readVCF( vcf_file, maxNumberOfVariants = 600 , min_maf = 0.01, max_maf = 1) # start simulation startSimulation(vcf, totalNumberOfIndividuals = n_nr_ani) ids = generateUnrelatedIndividuals(n_nr_founder) for (idx in 1:n_nr_non_founder){ ids[n_nr_founder + idx] <- SIM$mate(tbl_ped$Sire[idx],tbl_ped$Dam[idx]) } genotype = retrieveGenotypes(ids) dim(genotype) # select one SNP where animal 2 has genotype 2 #which(genotype[1,] == 2) #which(genotype[2,] == 2) #which(genotype[3,] == 2) # manually pick three snp mat_sim_geno <- genotype[,c(1,156, 22)] # determine the number of SNP n_nr_snp <- ncol(mat_sim_geno) # simulate observations vec_snp_eff <- c(13.3, 4.7, 2.1) vec_sex <- c("M" = 7.1, "F" = 3.3) sd_res <- 11.9 n_intercept <- -5.3 # augment pedigree with founders tbl_ped_aug <- tibble::tibble(ID = c(1:n_nr_ani), Sire = c(rep(NA, n_nr_founder), tbl_ped$Sire), Dam = c(rep(NA, n_nr_founder), tbl_ped$Dam), Sex = c("M", "M", "F", "F", "M", rep("F", 5))) # observations vec_y <- n_intercept + vec_sex[tbl_ped_aug$Sex] + crossprod(t(mat_sim_geno), vec_snp_eff) + rnorm(1, mean = 0, sd = sd_res) # data set tbl_data_p04 <- tibble::tibble(Animal = tbl_ped_aug$ID, Sire = tbl_ped_aug$Sire, Dam = tbl_ped_aug$Dam, Sex = tbl_ped_aug$Sex, Observation = round(vec_y[,1], digits = 2)) # add snp Info # for (idx in 1:n_nr_snp_cons){ for (idx in 1:n_nr_snp){ tbl_data_p04 <- dplyr::bind_cols(tbl_data_p04, tibble::tibble(SNP = mat_sim_geno[,idx]-1)) names(tbl_data_p04)[ncol(tbl_data_p04)] <- paste0("SNP", idx) } # write data to file readr::write_csv2(tbl_data_p04, path = s_data_p04_local_path) }
The data shown in the following table is available to predict genomic breeding values using different methods. The data is available from the URL shown below:
\textit{Die Daten in der nachfolgenden Tabelle sollen für die Schätzung von genomischen Zuchtwerten mit verschiedenen Methoden verwendet werden. Die Daten können vom folgenden URL heruntergeladen werden: }
r s_data_p04_url_path
.
knitr::kable(tbl_data_p04)
\begin{enumerate} \item[a)] Use the two-step approach to predict genomic breeding values. Because, the number of SNP is smaller than the number of animals, marker effects can be estimated using least squares. Please indicate the type of model and specify the all the model components used to estimated marker effects. Also, describe how the genomic breeding values are computed from the marker effects.
\textit{Verwenden Sie die Zwei-Schritt Methode zur Schätzung der genomischen Zuchtwerte. Da die Anzahl SNP kleiner ist als die Anzahl Tiere im Datensatz können die Markereffekte mit Least Squares geschätzt werden. Bitte geben Sie den Modell-Typ an und spezifizieren Sie alle Komponenten des Modells, welches zur Schätzung der Markereffekte verwendet wird. Beschreiben Sie auch, wie Sie aus den Markereffekten die genomischen Zuchtwerte berechnen.}
\end{enumerate}
\points{r lPointsQ4$TaskA
}
The two steps in the two-step approach are
Marker effects are estimated using a fixed linear effect model (model type). The observation $y_i$ of animal $i$ can be modelled as:
$$y_i = b_0 + b_{sex,i} + \alpha_{SNP_1,i} + \alpha_{SNP_2,i} + e_i = b_0 + b_{sex,i} + \sum_{j=1}^2 \alpha_{SNP_j, i} + e_i$$
with $b_0$: intercept, $b_{sex,i}$ effect of the sex of animal $i$, $\alpha_{SNP_j, i}$ the allele substitution effect according to the genotype of animal $i$ and SNP $j$, $e_i$ is the random residual with variance $\sigma_e^2$.
Because the number of SNP is smaller than the number of animals, the marker effects can be estimated using least squares. Hence, the function lm()
is used.
# read data tbl_data_p04 <- readr::read_csv2(file = s_data_p04_url_path) lm_mrk_eff <- lm(Observation ~ Sex + SNP1 + SNP2, data = tbl_data_p04) summary(lm_mrk_eff)
The genomic breeding values are predicted by the matrix vector multiplication of the genotype matrix times the marker effects.
mat_geno_snp <- matrix(c(tbl_data_p04$SNP1, tbl_data_p04$SNP2), ncol = 2) coef_mrk_eff <- effects(lm_mrk_eff) vec_mrk_eff <- c(coef_mrk_eff[["SNP1"]], coef_mrk_eff[["SNP2"]]) mat_pred_gbv <- crossprod(t(mat_geno_snp), vec_mrk_eff) # table tbl_res_ts_gbv <- tibble::tibble(Animal = tbl_data_p04$Animal, `Predicted Breeding Value` = round(mat_pred_gbv[,1], digits = 4)) knitr::kable(tbl_res_ts_gbv)
Ranking of the animals
order(mat_pred_gbv[,1])
\clearpage \pagebreak
\begin{enumerate} \item[b)] Use a single-step marker-effect model to predict breeding values using the data shown above. Please specify the model type and all the components of the marker-effect model and describe how genomic breeding values are predicted. The ration ($\lambda$) between the residual variance ($\sigma_e^2$) and the QTL-variance ($\sigma_q^2$) can assumed to be $1$.
\textit{Verwenden Sie ein "Single-Step" Markereffektmodell für die Schätzung der genomischen Zuchtwerte. Bitte geben Sie den Modelltyp und alle Komponenten des Markereffektmodells and und beschreiben Sie, wie die genomischen Zuchtwerte berechnet werden.}
\end{enumerate}
\points{r lPointsQ4$TaskB
}
In a single-step marker effect model a mixed linear effect model is used to predict marker effects. Marker effects are treated as random effects. Genomic breeding values are predicted by summing over the appropriate marker effects. The linear mixed effect model to predict the marker effects is as follows
$$y = Xb + Wq + e$$ where $y$ is the vector of observations, $b$ the vector of fixed effects, $q$ the vector of random marker effects, $e$ the vector of random residuals, $X$ and $W$ are the design-matrices. The matrix $W$ encodes the genotypes where $-1$ corresponds to $G_2G_2$, $0$ to $G_1G_2$ and $1$ to $G_1G_1$, assuming $G_1$ is the positive allele.
Inserting the values of the dataset into the model components results in
vec_y <- tbl_data_p04$Observation n_nr_obs <- length(vec_y) n_nr_snp <- 2 mat_X <- matrix(data = c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1), ncol = 2) mat_W <- matrix(data = c(tbl_data_p04$SNP1, tbl_data_p04$SNP2), ncol = n_nr_snp) # vector of unknowns vec_b <- c("b_M", "b_F") vec_q <- rmdhelp::vecGetVecElem(psBaseElement = "q", pnVecLen = n_nr_snp, psResult = "latex") vec_e <- rmdhelp::vecGetVecElem(psBaseElement = "e", pnVecLen = n_nr_obs, psResult = "latex") # output cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_y, ps_name = "y", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bmatrix(pmat = mat_X, ps_name = "X", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bmatrix(pmat = mat_W, ps_name = "W", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_b, ps_name = "b", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_q, ps_name = "q", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_e, ps_name = "e", ps_env = "$$"), collapse = "\n"), "\n")
The mixed model equations
$$ \left[ \begin{array}{cc} X^TX & X^TW \ W^TX & W^TW + \lambda * I \end{array} \right] \left[ \begin{array}{c} \hat{b} \ \hat{q} \end{array} \right] = \left[ \begin{array}{c} X^Ty \ W^Ty \end{array} \right] $$
Get the solution of the mixed model equations
lambda <- 1 mat_XtX <- crossprod(mat_X) mat_XtW <- crossprod(mat_X, mat_W) mat_WtX <- crossprod(mat_W, mat_X) mat_WtWlambdaI <- crossprod(mat_W) + lambda * diag(1, nrow = n_nr_snp) mat_coef <- rbind(cbind(mat_XtX, mat_XtW), cbind(mat_WtX, mat_WtWlambdaI)) mat_rhs <- rbind(crossprod(mat_X, vec_y), crossprod(mat_W, vec_y)) mat_sol <- solve(mat_coef, mat_rhs) vec_sol_fix <- mat_sol[1:2,]
From the marker effects
n_nr_sol <- nrow(mat_sol) mrk_eff <- mat_sol[(n_nr_sol - n_nr_snp + 1):n_nr_sol,] mrk_eff
the genomic breeding values are computed by the product $\hat{g} = W\hat{q}$
mat_gen_bv <- crossprod(t(mat_W), mrk_eff) vec_sol_gbv <- round(mat_gen_bv[,1], digits = 4)
The solution for the fixed effects ($\hat{b}$) and for the predicted breeding values ($\hat{g}$) are
cat(paste0(rmdhelp::bcolumn_vector(pvec = round(vec_sol_fix, digits = 4), ps_name = "\\hat{b}", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_sol_gbv, ps_name = "\\hat{g}", ps_env = "$$"), collapse = "\n"), "\n")
The ranking of the animals according to the predicted breeding value
order(vec_sol_gbv)
Setting the predicted breeding value of the first animal to $0$ leads to
vec_sol_gbv - vec_sol_gbv[1]
\clearpage \pagebreak
\begin{enumerate} \item[c)] Use a single-step Genomic BLUP (GBLUP) model to predict genomic breeding values. Please specify the type of model used and also list all the model components of the GBLUP model used. The ration ($\lambda$) between the residual variance ($\sigma_e^2$) and the genetic variance ($\sigma_g^2$) can assumed to be $1$.
\textit{Verwenden Sie ein "Ein-Schritt" genomisches BLUP (GBLUP) Modell zur Schätzung der genomischen Zuchtwerte. Bitte geben Sie den Modelltyp und alle Modellkomponenten an, welche im GBLUP-Modell vorkommen.}
\end{enumerate}
\points{r lPointsQ4$TaskC
}
The GBLUP model is a mixed linear effect model where genomic breeding values are treated as random effects. The mixed linear effect is
$$y = Xb + Zg + e$$ where $y$ is the vector of observations, $b$ the vector of fixed effects, $g$ the vector of random genomic breeding values, $e$ the vector of random residuals, $X$ and $Z$ are the design-matrices.
vec_y <- tbl_data_p04$Observation n_nr_obs <- length(vec_y) mat_X <- matrix(data = c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1), ncol = 2) mat_Z <- diag(1, n_nr_obs) lambda <- 1 # unknowns vec_b <- c("b_M", "b_F") vec_g <- rmdhelp::vecGetVecElem(psBaseElement = "g", pnVecLen = n_nr_obs, psResult = "latex") vec_e <- rmdhelp::vecGetVecElem(psBaseElement = "e", pnVecLen = n_nr_obs, psResult = "latex") # show quantities cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_y, ps_name = "y", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bmatrix(pmat = mat_X, ps_name = "X", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bmatrix(pmat = mat_Z, ps_name = "Z", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_b, ps_name = "b", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_g, ps_name = "g", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = vec_e, ps_name = "e", ps_env = "$$"), collapse = "\n"), "\n")
The mixed model equations
$$ \left[ \begin{array}{cc} X^TX & X^TZ \ Z^TX & Z^TZ + \lambda * H^{-1} \end{array} \right] \left[ \begin{array}{c} \hat{b} \ \hat{g} \end{array} \right] = \left[ \begin{array}{c} X^Ty \ Z^Ty \end{array} \right] $$
The genomic relationship matrix $G$ is computed using the following function
computeMatGrm <- function(pmatData) { matData <- pmatData # check the coding, if matData is -1, 0, 1 coded, then add 1 to get to 0, 1, 2 coding if (min(matData) < 0) matData <- matData + 1 # Allele frequencies, column vector of P and sum of frequency products freq <- apply(matData, 2, mean) / 2 P <- 2 * (freq - 0.5) sumpq <- sum(freq*(1-freq)) # Changing the coding from (0,1,2) to (-1,0,1) and subtract matrix P Z <- matData - 1 - matrix(P, nrow = nrow(matData), ncol = ncol(matData), byrow = TRUE) # Z%*%Zt is replaced by tcrossprod(Z) return(tcrossprod(Z)/(2*sumpq)) } # take matrix with genotypes mat_W <- matrix(data = c(tbl_data_p04$SNP1, tbl_data_p04$SNP2), ncol = 2) # compute the genomic relationship matrix mat_G <- computeMatGrm(pmatData = mat_W) # use the numerator relationship matrix A to blend G into H ped <- pedigreemm::pedigree(sire = tbl_data_p04$Sire, dam = tbl_data_p04$Dam, label = as.character(tbl_data_p04$Animal)) mat_A <- as.matrix(pedigreemm::getA(ped = ped)) n_blend_fact <- 0.05 mat_H <- (1-n_blend_fact) * mat_G + n_blend_fact * mat_A
Get the solution of the mixed model equations
lambda <- 1 mat_XtX <- crossprod(mat_X) mat_XtZ <- crossprod(mat_X, mat_Z) mat_ZtX <- crossprod(mat_Z, mat_X) mat_ZtZlambdaHinv <- crossprod(mat_Z) + lambda * solve(mat_H) mat_coef <- rbind(cbind(mat_XtX, mat_XtZ), cbind(mat_ZtX, mat_ZtZlambdaHinv)) mat_rhs <- rbind(crossprod(mat_X, vec_y), crossprod(mat_Z, vec_y)) mat_sol <- solve(mat_coef, mat_rhs) n_nr_sol <- nrow(mat_sol) vec_sol_fix <- mat_sol[1:2,] vec_sol_gbv <- mat_sol[(n_nr_sol - n_nr_obs + 1):n_nr_sol, ]
The solution for the fixed effects ($\hat{b}$) and for the predicted breeding values ($\hat{g}$) are
cat(paste0(rmdhelp::bcolumn_vector(pvec = round(vec_sol_fix, digits = 4), ps_name = "\\hat{b}", ps_env = "$$"), collapse = "\n"), "\n") cat(paste0(rmdhelp::bcolumn_vector(pvec = round(vec_sol_gbv, digits = 4), ps_name = "\\hat{g}", ps_env = "$$"), collapse = "\n"), "\n")
order(vec_sol_gbv)
Setting the predicted breeding value of the first animal to $0$ leads to
vec_sol_gbv - vec_sol_gbv[1]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.