R/predict_cv00.R

Defines functions predict_cv00

Documented in predict_cv00

#' Get train/test splits of the phenotypic MET dataset based on CV0.
#'
#' @description Get train/test splits of the phenotypic MET dataset based on a
#' number of random k-folds partitions determined by the user, according to the
#' type CV00. Creation of the list of train/test splits based on phenotypic 
#' data, so that all the phenotypes from the same environment/year/site appear 
#' in the same fold, according to the type of the CV00 scheme. In addition to
#' CV0 scheme, information on lines present in the test set evaluated in other
#' environments are removed from the training set --> prediction of new 
#' genotypes in new environments.
#'
#' @param pheno_data \code{data.frame} Dataset containing phenotypic outcome
#'   data, as well as the predictor variables.
#'
#' @param cv0_type \code{character} either `leave-one-environment-out`,
#'   `leave-one-site-out`, `leave-one-year-out` or `forward-prediction`.
#'
#' @return a \code{cv_object} object which contains the train/test splits of the
#'   CV scheme. Each element of the object corresponds to a `split` object with
#'   two elements:
#'   \describe{
#'     \item{training}{\code{data.frame} Dataset with all observations for the
#'      training set.}
#'     \item{test}{\code{data.frame} Dataset with all observations for the test
#'      set.}
#'   }
#' @author Cathy C. Westhues \email{cathy.jubin@@hotmail.com}
#' @references
#' \insertRef{jarquin2017increasing}{learnMET}
#' \insertRef{jarquin2014reaction}{learnMET}
#' @export

predict_cv00 <-
  function(pheno_data,
           cv0_type) {
    
    if (cv0_type == 'leave-one-year-out') {
      # Create data frame with unique names of year in the dataset
      
      unique_years <-
        as.character(unique(pheno_data[, 'year']))
      
      partition_data <- function(data, year) {
        test_data = data[data$year == year, ]
        training_data = data[data$year != year, ]
        lines_test_set = unique(test_data$geno_ID)
        training_data = training_data[training_data$geno_ID %notin% lines_test_set, ]
        
        split <-
          list("training_data" = training_data, "test_data" = test_data)
        class(split) <- c('split')
        names(split) <- c('training', 'test')
        return(split)
        
      }
      
      train_test_splits <- purrr::map(
        unique_years,
        .f = function (x)
          partition_data(year = x, data = pheno_data)
      )
      class(train_test_splits) <- c('cv_object')
      return(train_test_splits)
    }
    
    if (cv0_type == 'leave-one-environment-out') {
      # Create data frame with unique names of environments (YearxLoc) in the
      # dataset
      
      unique_environments <-
        as.character(unique(pheno_data[, 'IDenv']))
      
      partition_data <- function(data, IDenv) {
        training_data = data[data$IDenv != IDenv, ]
        test_data = data[data$IDenv == IDenv, ]
        lines_test_set = unique(test_data$geno_ID)
        training_data = training_data[training_data$geno_ID %notin% lines_test_set, ]
        
        split <-
          list("training_data" = training_data, "test_data" = test_data)
        class(split) <- c('split')
        names(split) <- c('training', 'test')
        return(split)
        
      }
      
      train_test_splits <- map(
        unique_environments,
        .f = function (x)
          partition_data(IDenv = x, data = pheno_data)
      )
      class(train_test_splits) <- c('cv_object')
      return(train_test_splits)
    }
    
    if (cv0_type == 'leave-one-site-out') {
      # Create data frame with unique names of location in the dataset
      
      unique_sites <-
        as.character(unique(pheno_data[, 'location']))
      
      partition_data <- function(data, location) {
        training_data = data[data$location != location, ]
        test_data = data[data$location == location, ]
        lines_test_set = unique(test_data$geno_ID)
        training_data = training_data[training_data$geno_ID %notin% lines_test_set, ]
        
        split <-
          list("training_data" = training_data, "test_data" = test_data)
        class(split) <- c('split')
        names(split) <- c('training', 'test')
        return(split)
        
      }
      
      train_test_splits <- map(
        unique_environments,
        .f = function (x)
          partition_data(location = x, data = pheno_data)
      )
      class(train_test_splits) <- c('cv_object')
      return(train_test_splits)
    }
    
    
    if (cv0_type == 'forward-prediction') {
      # Create data frame with unique names of year in the dataset
      
      unique_years = unique(as.numeric(as.character(pheno_data$year)))
      
      unique_years <- unique_years[-which.min(unique_years)]
      
      pheno_data$year <- as.numeric(as.character(pheno_data$year))
      
      partition_data <- function(data, year) {
        training_data = data[data$year < year, ]
        training_data$year = as.factor(training_data$year)
        test_data = data[data$year == year, ]
        test_data$year = as.factor(test_data$year)
        lines_test_set = unique(test_data$geno_ID)
        training_data = training_data[training_data$geno_ID %notin% lines_test_set, ]
        
        split <-
          list("training_data" = training_data, "test_data" = test_data)
        class(split) <- c('split')
        names(split) <- c('training', 'test')
        return(split)
        
      }
      
      train_test_splits <- map(
        unique_years,
        .f = function (x)
          partition_data(year = x, data = pheno_data)
      )
      
      train_test_splits <-
        train_test_splits[lengths(train_test_splits) != 0]
      class(train_test_splits) <- c('cv_object')
      return(train_test_splits)
      
      
    }
  }
cjubin/learnMET documentation built on Nov. 4, 2024, 6:23 p.m.