R/apply_PCA.R

Defines functions apply_pca

Documented in apply_pca

#' Data dimensionality reduction using PCA on a split object.
#'
#' @description
#' Fit PCA on the training set and apply the same transformation to the test 
#' set. The goal is to use principal components in prediction models as a 
#' smaller number of variables instead of all the marker predictors.
#'
#' @param split An object of class \code{split}, corresponding to one element of
#'   the total `cv_object` generated by one of the functions [predict_cv0()],
#'   [predict_cv00()], [predict_cv1()], or [predict_cv2()], 
#'   and containing the following items:
#'   * **training**: \code{data.frame} Training dataset
#'   * **test**: \code{data.frame} Test dataset
#'
#' @param geno \code{data.frame} It corresponds to a `geno` element 
#'   within an object of class `METData`.
#'   
#' @param threshold \code{numeric} A fraction of the total variance that 
#'   should be covered by the components
#'
#' @return pc_values A \code{data.frame} containing the principal components
#'   in columns and the names of all lines used in the study is contained in the
#'   first column 'geno_ID'. PCs for the lines present in the test set were
#'   computed based on the transformation done on the training set.
#'
#' @author Cathy C. Westhues \email{cathy.jubin@@hotmail.com}
#' @export
#'
#'
#'


apply_pca <- function(split, 
                      geno, 
                      threshold = 0.95,
                      ...) {
  
  
  geno <- as.data.frame(geno)
  geno$geno_ID = row.names(geno)
  
  geno_training = geno[geno$geno_ID%in%unique(split[['training']][,'geno_ID']),]
  
  geno_training = unique(geno_training)
  
  geno_test = geno[geno$geno_ID%in%unique(split[['test']][,'geno_ID']),]
  geno_test = unique(geno_test)
  
  
  rec <- recipes::recipe(geno_ID ~ . ,
                         data = geno_training) %>%
    recipes::update_role(geno_ID, new_role = 'outcome') %>%
    recipes::step_nzv(recipes::all_predictors()) %>%
    recipes::step_pca(recipes::all_predictors(),
                      threshold = threshold,
                      options = list(center = T, scale. = T))
  
  norm_obj <- recipes::prep(rec, training = geno_training,strings_as_factors = FALSE)
  
  training_pca <- recipes::bake(norm_obj, geno_training)
  test_pca <- recipes::bake(norm_obj, geno_test)
  
  training <-
    plyr::join(split[[1]], training_pca, by = 'geno_ID')
  
  test <-
    plyr::join(split[[2]], test_pca, by = 'geno_ID')
  
  
  return(list(training,test))
  
}
cjubin/learnMET documentation built on Nov. 4, 2024, 6:23 p.m.