R/dataSplit.R

Defines functions dataSplit

Documented in dataSplit

#' @title Data split
#'
#' @description Split dataset into training, vaidation and testing subsets.
#'
#' @param y A numeric or factor response vector of size \code{N}, where \code{N} is
#' the number of observations (spectra).
#'
#' @param train_size Proportion of observations to be assigned to the training subset.
#'
#' @param valid_size Proportion of observations to be assigned to the validating subset.
#'
#' @param test_size Proportion of observations to be assigned to the testing subset.
#'
#' @param reps Number of randomisations of the training/validating/testing subsets to
#' produce.
#'
#' @param balanced Whether the dataset should be balanced (TRUE) or not (FALSE).
#' See \code{\link{fdaML_train}} for more details.
#'
#' @param fnrc Force Nonempty Response Class. Ensures that each replication has
#' samples from each possible level of the response.
#'
#' @return A list with the indices (running from \code{1} to \code{length(y)}) of
#' training, validation and testing observations. Each of these 3 lists is a list
#' of \code{reps} elements, each element being a vector of the length
#' \code{train_size}/\code{valid_size}/\code{test_size}, accordingly.
#'
#' @export
#'
dataSplit <- function(y, train_size, valid_size, test_size, reps, balanced, fnrc=TRUE){

  id_train <- id_valid <- id_test <- list()
  uniq <- sort(unique(y))
  lu <- length(uniq)
  ly <- length(y)

  if(balanced){

    nmin <- sum(y==uniq[[1]])
    ntr <- floor(train_size * nmin);   nva <- floor(valid_size * nmin);   nte <- nmin - ntr - nva  #floor(test_size  * nmin)
    positions <- matrix(0, nmin, lu)
    for(v in 1:lu){
      positions[,v] <- which(y==uniq[v])
    }
    for(r in 1:reps){
      positions <- positions[sample(nrow(positions)),]  # shuffle indices
      itr <- iva <- ite <- c()                          # define indicies for training/validation/testing sets
      for(v in 1:lu){
        itr <- c(itr, positions[1:ntr,v])
        iva <- c(iva, positions[(ntr+1):(ntr+nva),v])
        ite <- c(ite, positions[(ntr+nva+1):nmin,v])
      }
      id_train[[r]] <- itr
      id_valid[[r]] <- iva
      id_test[[r]]  <- ite
    }

  }else{

    ntr <- floor(ly * train_size)
    nva <- floor(ly * valid_size)
    nte <- floor(ly * test_size)
    while((ntr + nva + nte) != ly){
      ntr <- ntr + 1
    }
    for(r in 1:reps){
      if(fnrc){
        # make sure that each replication has samples from each possible response
        status <- T
        while(status){
          id_valid[[r]] <- sample(x = 1:ly, size = nva, replace = F)
          id_test[[r]]  <- sample(x = (1:ly)[!((1:ly) %in% id_valid[[r]])], size = nte, replace = F)
          id_train[[r]] <- sample(x = (1:ly)[!((1:ly) %in% c(id_valid[[r]],id_test[[r]]))], size = ntr, replace = F)
          status <- (length(unique(y[id_valid[[r]]])) != lu) || (length(unique(y[id_test[[r]]])) != lu)
        }
      }else{
        id_valid[[r]] <- sample(x = 1:ly, size = nva, replace = F)
        id_test[[r]]  <- sample(x = (1:ly)[!((1:ly) %in% id_valid[[r]])], size = nte, replace = F)
        id_train[[r]] <- sample(x = (1:ly)[!((1:ly) %in% c(id_valid[[r]],id_test[[r]]))], size = ntr, replace = F)
      }
    }
  }

  return(list(id_train = id_train,
              id_valid = id_valid,
              id_test  = id_test  ))
}
pmesperanca/mlevcm documentation built on March 17, 2021, 10:03 p.m.