# R/Kcross.R

#' Produce indexes to split data into training and test groups K times
#'
#' Builds row indexes for K-fold cross-validation of a data frame. Rows are
#' assigned to folds at random, optionally stratified by a grouping variable
#' so that every level of that variable is spread across the folds.
#'
#' \code{Kcross(DF, K = 10, Strat = NULL)}
#'
#' @param DF data frame which is to be split
#' @param K number of 'folds' to split the data into; a single positive
#'   whole number
#' @param Strat name or index of a single grouping variable (column) in
#'   \code{DF}, or \code{NULL} (the default) for no stratification
#'
#' @details Fold labels are dealt out cyclically and then shuffled, so within
#' each stratum (or over the whole data frame when \code{Strat} is
#' \code{NULL}) the fold sizes differ by at most one row. The cycle starts at
#' fold 2, so when a group does not divide evenly by \code{K} the lower
#' numbered folds are the last to receive an extra row. Rows whose grouping
#' value is \code{NA} are not assigned to any fold and therefore appear in
#' neither 'train' nor 'test'.
#'
#' @return (invisibly) a list of \code{K} lists named "G1", "G2", ..., each
#' holding two sorted integer vectors of row indexes: 'train' (rows outside
#' the fold) and 'test' (rows inside the fold).
Kcross <- function(DF, K = 10, Strat = NULL) {
  stopifnot(
    is.numeric(K), length(K) == 1L, is.finite(K),
    K >= 1, K == round(K)
  )

  # Randomly ordered fold labels (values in 1..K) for a group of n rows.
  # Shuffling by position with sample.int() avoids the sample() pitfall in
  # which a single number x >= 1 means "sample from 1:x", which returned the
  # wrong number of labels for one-row groups and misaligned all later rows.
  fold_labels <- function(n) (seq_len(n) %% K + 1)[sample.int(n)]

  if (!is.null(Strat)) {
    strat_values <- DF[[Strat]]
    if (is.null(strat_values)) {
      stop("'Strat' does not identify a column of 'DF'", call. = FALSE)
    }
    # factor() drops unused levels, so every stratum has at least one row;
    # split() leaves out rows whose grouping value is NA.
    rows_by_level <- split(seq_len(nrow(DF)), factor(strat_values))
    # Row indexes arranged stratum by stratum ...
    row_idx <- as.integer(unlist(rows_by_level, use.names = FALSE))
    # ... and fold labels drawn per stratum, in the same order as row_idx.
    fold_idx <- as.numeric(unlist(
      lapply(lengths(rows_by_level), fold_labels),
      use.names = FALSE
    ))
  } else {
    # No stratification: shuffle the rows and the fold labels independently.
    n_rows <- nrow(DF)
    row_idx <- sample.int(n_rows)
    fold_idx <- fold_labels(n_rows)
  }

  folds <- lapply(seq_len(K), function(k) {
    in_fold <- fold_idx == k
    list(
      "train" = sort(row_idx[!in_fold]),
      "test" = sort(row_idx[in_fold])
    )
  })
  names(folds) <- paste0("G", seq_len(K))
  invisible(folds)
}
# helophilus/ColsTools documentation built on May 30, 2019, 4:03 p.m.