# inst/doc/vtreatSplitting.R

## ----setup, include=FALSE-----------------------------------------------------
# Chunk defaults for the vignette: echo the code and use 7-inch-wide figures.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.width = 7
)

## -----------------------------------------------------------------------------
# Tiny illustration of a split function call: 3 rows, 2 splits, and no data
# frame or outcome supplied.
vtreat::kWayStratifiedY(nRows = 3, nSplits = 2, dframe = NULL, y = NULL)

## -----------------------------------------------------------------------------
# Caution: splitting by row index modulo nSplits is not a great idea, because
# the data may contain structure that strides in exactly the same pattern as
# the split. That is technically possible for any split, but we typically rely
# on pseudo-random structure (different from one split call to the next) to
# make such a coincidence unlikely.
#
# Arguments follow the split-function signature used in this file:
#   nRows   number of rows to split
#   nSplits modulus used to assign rows to folds
#   dframe  unused here
#   y       unused here
# Returns a list with one entry per distinct fold label (in order of first
# appearance); each entry is list(train = <row indices outside the fold>,
# app = <row indices inside the fold>).
modularSplit <- function(nRows, nSplits, dframe, y) {
  # Fold label of every row: its 1-based index modulo nSplits.
  foldOfRow <- seq_len(nRows) %% nSplits
  foldLabels <- unique(foldOfRow)
  plan <- vector("list", length(foldLabels))
  for (k in seq_along(foldLabels)) {
    heldOut <- foldOfRow == foldLabels[[k]]
    plan[[k]] <- list(train = which(!heldOut),
                      app = which(heldOut))
  }
  plan
}

## -----------------------------------------------------------------------------
# Hand the custom modular splitter to buildEvalSets for 25 rows and 3 splits.
vtreat::buildEvalSets(
  nRows = 25,
  nSplits = 3,
  splitFunction = modularSplit
)

## -----------------------------------------------------------------------------
# Deliberately poor split function: returns a single-entry plan in which every
# row is used both for training and for application. nSplits, dframe and y
# are accepted for signature compatibility but ignored.
badSplit <- function(nRows, nSplits, dframe, y) {
  everyRow <- seq_len(nRows)
  onlyFold <- list(train = everyRow, app = everyRow)
  list(onlyFold)
}
# Offer the overlapping train/app plan to buildEvalSets (5 rows, 3 splits) to
# see how vtreat responds to it.
vtreat::buildEvalSets(
  nRows = 5,
  nSplits = 3,
  splitFunction = badSplit
)

## ----warning=FALSE------------------------------------------------------------
library('vtreat')

## -----------------------------------------------------------------------------
# Fix the RNG seed and build the toy data: y is sin() evaluated at 1..100.
set.seed(23255)
d <- data.frame(y = sin(1:100))

# Stratified 5-fold cross validation plan over the rows of d.
pStrat <- kWayStratifiedY(nrow(d), 5, d, d$y)
# Check whether the plan is a good partition; a NULL result means no problem
# was found, anything else is reported as the problem.
check <- vtreat::problemAppPlan(nrow(d), 5, pStrat, TRUE)
if (!is.null(check)) {
  print(paste0("Problem with plan: ", check))
} else {
  print("Plan is good")
}
# Per-row application labels taken from the plan; used further down to group
# y by fold.
d$stratGroup <- vtreat::getSplitPlanAppLabels(nrow(d), pStrat)

# Unstratified (simple) 5-fold cross validation plan over the rows of d.
pSimple <- kWayCrossValidation(nrow(d), 5, d, d$y)
# Check whether the plan is a good partition; a NULL result means no problem
# was found, anything else is reported as the problem.
check <- vtreat::problemAppPlan(nrow(d), 5, pSimple, TRUE)
if (!is.null(check)) {
  print(paste0("Problem with plan: ", check))
} else {
  print("Plan is good")
}
# Per-row application labels taken from the plan; used further down to group
# y by fold.
d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d), pSimple)

# mean(y) for each fold, unstratified (simple) plan
tapply(d$y, d$simpleGroup, mean)
# standard deviation of those per-fold means (empirical standard error of
# mean(y) across folds)
sd(tapply(d$y, d$simpleGroup, mean))

# mean(y) for each fold, stratified plan
tapply(d$y, d$stratGroup, mean)
# standard deviation of those per-fold means (empirical standard error of
# mean(y) across folds)
sd(tapply(d$y, d$stratGroup, mean))

# Try the vtreat package in your browser
#
# Any scripts or data that you put into this service are public.
#
# vtreat documentation built on Aug. 20, 2023, 1:08 a.m.