inst/doc/vtreatVariableTypes.R

## ----categoricalexample, tidy=FALSE-------------------------------------------
library(vtreat)
dTrainC <- data.frame(x=c('a','a','a','b','b',NA),
   z=c(1,2,3,4,NA,6),y=c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE),
   stringsAsFactors = FALSE)
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
scoreColsToPrint <- c('origName','varName','code','rsq','sig','extraModelDegrees')
print(treatmentsC$scoreFrame[,scoreColsToPrint])

## ----map----------------------------------------------------------------------
# Build a map from vtreat names back to reasonable display names
vmap <- as.list(treatmentsC$scoreFrame$origName)
names(vmap) <- treatmentsC$scoreFrame$varName
print(vmap['x_catB'])

# Map significances back to original variables
aggregate(sig~origName,data=treatmentsC$scoreFrame,FUN=min)

## ----numericexample, tidy=FALSE-----------------------------------------------
library(vtreat)
dTrainN <- data.frame(x=c('a','a','a','b','b',NA),
   z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)),
   stringsAsFactors = FALSE)
treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y')
print(treatmentsN$scoreFrame[,scoreColsToPrint])

## ----notargetexample, tidy=FALSE----------------------------------------------
library(vtreat)
dTrainZ <- data.frame(x=c('a','a','a','b','b',NA),
   z=c(1,2,3,4,NA,6),
   stringsAsFactors = FALSE)
treatmentsZ <- designTreatmentsZ(dTrainZ,colnames(dTrainZ))
print(treatmentsZ$scoreFrame[, c('origName','varName','code','extraModelDegrees')])

## ----restrict1----------------------------------------------------------------
dTrainN <- data.frame(x=c('a','a','a','b','b',NA),
   z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)),
   stringsAsFactors = FALSE)

treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y',
                                 codeRestriction = c('lev', 
                                                      'catN',
                                                      'clean',
                                                      'isBAD'),
                                 verbose=FALSE)

# no catP or catD variables
print(treatmentsN$scoreFrame[,scoreColsToPrint])


## ----restrict2----------------------------------------------------------------
dTreated = prepare(treatmentsN, dTrainN, 
                   codeRestriction = c('lev','clean', 'isBAD'))

# no catN variables
head(dTreated)

## ----selectvars---------------------------------------------------------------
dTrainN <- data.frame(x=c('a','a','a','b','b',NA),
   z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)),
   stringsAsFactors = FALSE)
treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y',
                                  codeRestriction = c('lev', 
                                                      'catN',
                                                      'clean',
                                                      'isBAD'),
                                 verbose=FALSE)
print(treatmentsN$scoreFrame[,scoreColsToPrint])

pruneSig <- 1.0 # don't filter on significance for this tiny example
vScoreFrame <- treatmentsN$scoreFrame
varsToUse <- vScoreFrame$varName[(vScoreFrame$sig<=pruneSig)]
print(varsToUse)
origVarNames <- sort(unique(vScoreFrame$origName[vScoreFrame$varName %in% varsToUse]))
print(origVarNames)

# prepare a treated data frame using only the "significant" variables
dTreated = prepare(treatmentsN, dTrainN, 
                   varRestriction = varsToUse)

head(dTreated)


## ----displayvars--------------------------------------------------------------
origVarNames <- sort(unique(vScoreFrame$origName[vScoreFrame$varName %in% varsToUse]))
print(origVarNames)

origVarSigs <- vScoreFrame[vScoreFrame$varName %in% varsToUse,]
aggregate(sig~origName,data=origVarSigs,FUN=min)

Try the vtreat package in your browser

Any scripts or data that you put into this service are public.

vtreat documentation built on Aug. 20, 2023, 1:08 a.m.