#' Data dimensionality reduction by modeling genetic effects using the PCs of
#' the genomic relationship matrix.
#'
#' @description
#' Computes the genomic relationship after centering and scaling genotype matrix
#' of the training set. Eigenvalues and eigenvectors of the G matrix are
#' estimated. PCA is applied on the training set and the same transformation is
#' applied on the test set. The goal is to use principal components in
#' prediction models as a smaller number of variables instead of all the marker
#' predictors.
#'
#' @param split An object of class \code{split}, corresponding to one element of
#' the total `cv_object` generated by one of the functions [predict_cv0()],
#' [predict_cv00()], [predict_cv1()], or [predict_cv2()],
#' and containing the following items:
#' * **training**: \code{data.frame} Training dataset
#' * **test**: \code{data.frame} Test dataset
#'
#' @param geno \code{data.frame} It corresponds to a `geno` element
#' within an object of class `METData`.
#'
#' @return pc_values A \code{data.frame} containing the principal components
#' in columns and the names of all lines used in the study is contained in the
#' first column 'geno_ID'. PCs for the lines present in the test set were
#' computed based on the transformation done on the training set.
#'
#' @param num_pcs \code{integer} Maximal number of principal components to
#' extract.
#'
#'
#' @author Cathy C. Westhues \email{cathy.jubin@@hotmail.com}
#' @export
#'
#'
#'
apply_pcs_G_Add <- function(split,
geno,
num_pcs = 200,
...) {
cat('The number of PCs to be derived is',num_pcs)
geno <- as.data.frame(geno)
geno$geno_ID = row.names(geno)
geno_training = geno[geno$geno_ID %in% unique(split[[1]][, 'geno_ID']), ]
geno_training = unique(geno_training)
geno_test = geno[geno$geno_ID %in% unique(split[[2]][, 'geno_ID']), ]
geno_test = unique(geno_test)
rec_snps <- recipes::recipe( ~ . ,
data = geno_training) %>%
recipes::update_role(geno_ID, new_role = 'outcome') %>%
recipes::step_nzv(recipes::all_predictors()) %>%
recipes::step_normalize(recipes::all_numeric_predictors())
rec_snps <-
recipes::prep(rec_snps,
training = geno_training,
strings_as_factors = FALSE)
snps_data_tr <-
as.data.frame(recipes::bake(rec_snps, new_data = geno_training))
row.names(snps_data_tr) <- snps_data_tr$geno_ID
snps_data_te <-
as.data.frame(recipes::bake(rec_snps, new_data = geno_test))
row.names(snps_data_te) <- snps_data_te$geno_ID
X_tr_scaled <- as.matrix(snps_data_tr %>% dplyr::select(-geno_ID))
X_te_scaled <- as.matrix(snps_data_te %>% dplyr::select(-geno_ID))
G_tr <- tcrossprod(X_tr_scaled)
G_tr <- G_tr / mean(diag(G_tr))
EVD.G = eigen(G_tr)
EVD.G$vectors = EVD.G$vectors[, EVD.G$values > 1e-5]
EVD.G$values = EVD.G$values[EVD.G$values > 1e-5]
PC.G <- EVD.G$vectors
rownames(PC.G) <- rownames(G_tr)
for (i in 1:ncol(PC.G)) {
PC.G[, i] <- PC.G[, i] * sqrt(EVD.G$values[i])
}
Zv <- model.matrix( ~ split[[1]][, 'geno_ID'] - 1)
ZPC.G_tr <- as.data.frame(Zv %*% PC.G)
ZPC.G_tr$geno_ID <- split[[1]][, 'geno_ID']
TMP <-
EVD.G$vectors
for (i in 1:ncol(EVD.G$vectors)) {
TMP[, i] = TMP[, i] / sqrt(EVD.G$values[i])
}
GInv <- tcrossprod(TMP)
G_te <- tcrossprod(X_te_scaled, X_tr_scaled) / ncol(X_tr_scaled)
PC.G_te <- G_te %*% GInv %*% PC.G
Zv <- model.matrix( ~ split[[2]][, 'geno_ID'] - 1)
ZPC.G_te <- as.data.frame(Zv %*% PC.G_te)
ZPC.G_te$geno_ID <- split[[2]][, 'geno_ID']
## Number of PCs
ZPC.G_tr <-
ZPC.G_tr[, c(which(colnames(ZPC.G_tr) == 'geno_ID'), which(colnames(ZPC.G_tr) %in% paste0('V', 1:ncol(ZPC.G_tr) -
1)))]
ZPC.G_te <-
ZPC.G_te[, c(which(colnames(ZPC.G_te) == 'geno_ID'), which(colnames(ZPC.G_te) %in% paste0('V', 1:ncol(ZPC.G_te) -
1)))]
colnames(ZPC.G_tr) <-
c('geno_ID', paste0('PC', 1:(ncol(ZPC.G_tr) - 1)))
colnames(ZPC.G_te) <-
c('geno_ID', paste0('PC', 1:(ncol(ZPC.G_te) - 1)))
vars <- c('geno_ID', paste0('PC', 1:num_pcs))
if (ncol(ZPC.G_tr) - 1 > num_pcs) {
ZPC.G_tr <- ZPC.G_tr %>% dplyr::select(all_of(vars))
ZPC.G_te <- ZPC.G_te %>% dplyr::select(all_of(vars))
}
training <-
merge(split[[1]], ZPC.G_tr, by = 'geno_ID', all.x = T)
test <-
merge(split[[2]], ZPC.G_te, by = 'geno_ID', all.x = T)
return(list(training, test))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.