R/apply_pcs_G_Add.R

Defines functions apply_pcs_G_Add

Documented in apply_pcs_G_Add

#' Data dimensionality reduction by modeling genetic effects using the PCs of
#' the genomic relationship matrix.
#'
#' @description
#' Computes the genomic relationship after centering and scaling genotype matrix
#' of the training set. Eigenvalues and eigenvectors of the G matrix are
#' estimated. PCA is applied on the training set and the same transformation is
#' applied on the test set. The goal is to use principal components in
#' prediction models as a smaller number of variables instead of all the marker
#' predictors.
#'
#' @param split An object of class \code{split}, corresponding to one element of
#'   the total `cv_object` generated by one of the functions [predict_cv0()],
#'   [predict_cv00()], [predict_cv1()], or [predict_cv2()],
#'   and containing the following items:
#'   * **training**: \code{data.frame} Training dataset
#'   * **test**: \code{data.frame} Test dataset
#'
#' @param geno \code{data.frame} It corresponds to a `geno` element
#'   within an object of class `METData`.
#'
#' @return pc_values A \code{data.frame} containing the principal components
#'   in columns and the names of all lines used in the study is contained in the
#'   first column 'geno_ID'. PCs for the lines present in the test set were
#'   computed based on the transformation done on the training set.
#'
#' @param num_pcs \code{integer} Maximal number of principal components to
#'   extract.
#'
#'
#' @author Cathy C. Westhues \email{cathy.jubin@@hotmail.com}
#' @export
#'
#'
#'


apply_pcs_G_Add <- function(split, 
                            geno, 
                            num_pcs = 200,
                            ...) {
  cat('The number of PCs to be derived is',num_pcs)
  geno <- as.data.frame(geno)
  geno$geno_ID = row.names(geno)
  
  geno_training = geno[geno$geno_ID %in% unique(split[[1]][, 'geno_ID']), ]
  geno_training = unique(geno_training)
  geno_test =  geno[geno$geno_ID %in% unique(split[[2]][, 'geno_ID']), ]
  geno_test = unique(geno_test)
  
  rec_snps <- recipes::recipe( ~ . ,
                               data = geno_training) %>%
    recipes::update_role(geno_ID, new_role = 'outcome') %>%
    recipes::step_nzv(recipes::all_predictors()) %>%
    recipes::step_normalize(recipes::all_numeric_predictors())
  
  rec_snps <-
    recipes::prep(rec_snps,
                  training = geno_training,
                  strings_as_factors = FALSE)
  
  snps_data_tr <-
    as.data.frame(recipes::bake(rec_snps, new_data = geno_training))
  row.names(snps_data_tr) <- snps_data_tr$geno_ID
  snps_data_te <-
    as.data.frame(recipes::bake(rec_snps, new_data = geno_test))
  row.names(snps_data_te) <- snps_data_te$geno_ID
  
  X_tr_scaled <- as.matrix(snps_data_tr %>% dplyr::select(-geno_ID))
  X_te_scaled <- as.matrix(snps_data_te %>% dplyr::select(-geno_ID))
  
  G_tr <- tcrossprod(X_tr_scaled)
  G_tr <- G_tr / mean(diag(G_tr))
  
  EVD.G = eigen(G_tr)
  EVD.G$vectors = EVD.G$vectors[, EVD.G$values > 1e-5]
  EVD.G$values = EVD.G$values[EVD.G$values > 1e-5]
  PC.G <- EVD.G$vectors
  rownames(PC.G) <- rownames(G_tr)
  for (i in 1:ncol(PC.G)) {
    PC.G[, i] <- PC.G[, i] * sqrt(EVD.G$values[i])
  }
  Zv <- model.matrix( ~ split[[1]][, 'geno_ID'] - 1)
  ZPC.G_tr <- as.data.frame(Zv %*% PC.G)
  ZPC.G_tr$geno_ID <- split[[1]][, 'geno_ID']
  
  
  
  TMP <-
    EVD.G$vectors
  for (i in 1:ncol(EVD.G$vectors)) {
    TMP[, i] = TMP[, i] / sqrt(EVD.G$values[i])
  }
  GInv <- tcrossprod(TMP)
  
  
  G_te <- tcrossprod(X_te_scaled, X_tr_scaled) / ncol(X_tr_scaled)
  PC.G_te <- G_te %*% GInv %*% PC.G
  Zv <- model.matrix( ~ split[[2]][, 'geno_ID'] - 1)
  ZPC.G_te <- as.data.frame(Zv %*% PC.G_te)
  ZPC.G_te$geno_ID <- split[[2]][, 'geno_ID']
  
  
  ## Number of PCs
  ZPC.G_tr <-
    ZPC.G_tr[, c(which(colnames(ZPC.G_tr) == 'geno_ID'), which(colnames(ZPC.G_tr) %in% paste0('V', 1:ncol(ZPC.G_tr) -
                                                                                                1)))]
  ZPC.G_te <-
    ZPC.G_te[, c(which(colnames(ZPC.G_te) == 'geno_ID'), which(colnames(ZPC.G_te) %in% paste0('V', 1:ncol(ZPC.G_te) -
                                                                                                1)))]
  
  colnames(ZPC.G_tr) <-
    c('geno_ID', paste0('PC', 1:(ncol(ZPC.G_tr) - 1)))
  colnames(ZPC.G_te) <-
    c('geno_ID', paste0('PC', 1:(ncol(ZPC.G_te) - 1)))
  
  vars <- c('geno_ID', paste0('PC', 1:num_pcs))
  
  if (ncol(ZPC.G_tr) - 1 > num_pcs) {
    ZPC.G_tr <- ZPC.G_tr %>% dplyr::select(all_of(vars))
    ZPC.G_te <- ZPC.G_te %>% dplyr::select(all_of(vars))
  }
  
  training <-
    merge(split[[1]], ZPC.G_tr, by = 'geno_ID', all.x = T)
  
  test <-
    merge(split[[2]], ZPC.G_te, by = 'geno_ID', all.x = T)
  
  
  
  return(list(training, test))
  
}
cjubin/learnMET documentation built on Nov. 4, 2024, 6:23 p.m.