inst/doc/vtreatRareLevels.R

## -----------------------------------------------------------------------------
library('vtreat')

set.seed(2325)
populationFrame <- data.frame(
   popsize = round(rlnorm(100,meanlog=log(4000),sdlog=1)),
   stringsAsFactors = FALSE)
populationFrame$code <- paste0('z',formatC(sample.int(100000,
                                              size=nrow(populationFrame),
                                              replace=FALSE),width=5,flag='0'))
rareCodes <- populationFrame$code[populationFrame$popsize<1000]

# Draw individuals from code-regions proportional to size of code region
# (or uniformly over all individuals labeled by code region).
# Also add the outcome which has altered conditional probability for rareCodes.
drawIndividualsAndReturnCodes <- function(n) {
  ords <- sort(sample.int(sum(populationFrame$popsize),size=n,replace=TRUE))
  cs <- cumsum(populationFrame$popsize)
  indexes <- findInterval(ords,cs)+1
  indexes <- indexes[sample.int(n,size=n,replace=FALSE)]
  samp <- data.frame(code=populationFrame$code[indexes],
                     stringsAsFactors = FALSE)
  samp$inClass <- runif(n) < ifelse(samp$code %in% rareCodes,0.3,0.01)
  samp
}

## -----------------------------------------------------------------------------
testSet <- drawIndividualsAndReturnCodes(2000)
table(generatedAsRare=testSet$code %in% rareCodes,inClass=testSet$inClass)

## -----------------------------------------------------------------------------
designSet <- drawIndividualsAndReturnCodes(2000)
treatments <- vtreat::designTreatmentsC(designSet,'code','inClass',TRUE,
                                        rareCount=5,rareSig=NULL,
                                        verbose=FALSE)
treatments$scoreFrame[,c('varName','sig'),drop=FALSE]

## -----------------------------------------------------------------------------
designSetTreated <-  vtreat::prepare(treatments,designSet,pruneSig=0.5)
designSetTreated$code <- designSet$code
summary(as.numeric(table(designSetTreated$code[designSetTreated$code_lev_rare==1])))
summary(as.numeric(table(designSetTreated$code[designSetTreated$code_lev_rare!=1])))

## ---- fig.width=6-------------------------------------------------------------
testSetTreated <- vtreat::prepare(treatments,testSet,pruneSig=0.5)
testSetTreated$code <- testSet$code
testSetTreated$newCode <- !(testSetTreated$code %in% unique(designSet$code))
testSetTreated$generatedAsRareCode <- testSetTreated$code %in% rareCodes

# Show code_lev_rare==1 corresponds to a subset of rows with elevated inClass==TRUE rate.
table(code_lev_rare=testSetTreated$code_lev_rare,
      inClass=testSetTreated$inClass)

# Show newCodes get coded with code_level_rare==1.

table(newCode=testSetTreated$newCode,code_lev_rare=testSetTreated$code_lev_rare)

# Show newCodes tend to come from defined rareCodes.
table(newCode=testSetTreated$newCode,
      generatedAsRare=testSetTreated$generatedAsRareCode)

## ---- fig.width=6-------------------------------------------------------------
# Show code_catP's behavior on rare and novel levels.
summary(testSetTreated$code_catP)

summary(testSetTreated$code_catP[testSetTreated$code_lev_rare==1])

summary(testSetTreated$code_catP[testSetTreated$newCode])

summary(testSetTreated$code_catP[testSetTreated$generatedAsRareCode])

Try the vtreat package in your browser

Any scripts or data that you put into this service are public.

vtreat documentation built on Aug. 20, 2023, 1:08 a.m.