Nothing
## -----------------------------------------------------------------------------
set.seed(22626)
mkData <- function(n) {
d <- data.frame(xBad1=sample(paste('level',1:1000,sep=''),n,replace=TRUE),
xBad2=sample(paste('level',1:1000,sep=''),n,replace=TRUE),
xBad3=sample(paste('level',1:1000,sep=''),n,replace=TRUE),
xGood1=rnorm(n),
xGood2=rnorm(n))
# outcome only depends on "good" variables
d$y <- rnorm(nrow(d))+0.2*d$xGood1 + 0.3*d$xGood2>0.5
# the random group used for splitting the data set, not a variable.
d$rgroup <- sample(c("cal","train","test"),nrow(d),replace=TRUE)
d
}
d <- mkData(2000)
# devtools::install_github("WinVector/WVPlots")
# library('WVPlots')
plotRes <- function(d,predName,yName,title) {
print(title)
tab <- table(truth=d[[yName]],pred=d[[predName]]>0.5)
print(tab)
diag <- sum(vapply(seq_len(min(dim(tab))),
function(i) tab[i,i],numeric(1)))
acc <- diag/sum(tab)
# if(requireNamespace("WVPlots",quietly=TRUE)) {
# print(WVPlots::ROCPlot(d,predName,yName,title))
# }
print(paste('accuracy',acc))
}
## ----badmixcalandtrain--------------------------------------------------------
dTrain <- d[d$rgroup!='test',,drop=FALSE]
dTest <- d[d$rgroup=='test',,drop=FALSE]
treatments <- vtreat::designTreatmentsC(dTrain,c('xBad1','xBad2','xBad3','xGood1','xGood2'),
'y',TRUE,
rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem
)
dTrainTreated <- vtreat::prepare(treatments,dTrain,
pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problems
)
f <- wrapr::mk_formula("y", treatments$scoreFrame$varName)
print(f)
m1 <- glm(f,
data=dTrainTreated,family=binomial(link='logit'))
print(summary(m1)) # notice low residual deviance
dTrain$predM1 <- predict(m1,newdata=dTrainTreated,type='response')
plotRes(dTrain,'predM1','y','model1 on train')
dTestTreated <- vtreat::prepare(treatments,dTest,pruneSig=c())
dTest$predM1 <- predict(m1,newdata=dTestTreated,type='response')
plotRes(dTest,'predM1','y','model1 on test')
## ----separatecalandtrain------------------------------------------------------
dCal <- d[d$rgroup=='cal',,drop=FALSE]
dTrain <- d[d$rgroup=='train',,drop=FALSE]
dTest <- d[d$rgroup=='test',,drop=FALSE]
# a nice heuristic,
# expect only a constant number of noise variables to sneak past
pruneSig <- 1/ncol(dTrain)
treatments <- vtreat::designTreatmentsC(dCal,
c('xBad1','xBad2','xBad3','xGood1','xGood2'),
'y',TRUE,
rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem
)
dTrainTreated <- vtreat::prepare(treatments,dTrain,
pruneSig=pruneSig)
newvars <- setdiff(colnames(dTrainTreated),'y')
m1 <- glm(paste('y',paste(newvars,collapse=' + '),sep=' ~ '),
data=dTrainTreated,family=binomial(link='logit'))
print(summary(m1))
dTrain$predM1 <- predict(m1,newdata=dTrainTreated,type='response')
plotRes(dTrain,'predM1','y','model1 on train')
dTestTreated <- vtreat::prepare(treatments,dTest,
pruneSig=pruneSig)
dTest$predM1 <- predict(m1,newdata=dTestTreated,type='response')
plotRes(dTest,'predM1','y','model1 on test')
## ----crossframes--------------------------------------------------------------
dTrain <- d[d$rgroup!='test',,drop=FALSE]
dTest <- d[d$rgroup=='test',,drop=FALSE]
prep <- vtreat::mkCrossFrameCExperiment(dTrain,
c('xBad1','xBad2','xBad3','xGood1','xGood2'),
'y',TRUE,
rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problems
)
treatments <- prep$treatments
knitr::kable(treatments$scoreFrame[,c('varName','sig')])
colnames(prep$crossFrame)
# vtreat::mkCrossFrameCExperiment doesn't take a pruneSig argument, but we can
# prune on our own.
print(pruneSig)
newvars <- treatments$scoreFrame$varName[treatments$scoreFrame$sig<=pruneSig]
# force in bad variables, to show we "belt and suspenders" deal with them
# in that things go well in the cross-frame even if they sneak past pruning
newvars <- sort(union(newvars,c("xBad1_catB","xBad2_catB","xBad3_catB")))
print(newvars)
dTrainTreated <- prep$crossFrame
## ----xframemodel--------------------------------------------------------------
m1 <- glm(paste('y',paste(newvars,collapse=' + '),sep=' ~ '),
data=dTrainTreated,family=binomial(link='logit'))
print(summary(m1))
dTrain$predM1 <- predict(m1,newdata=dTrainTreated,type='response')
plotRes(dTrain,'predM1','y','model1 on train')
dTestTreated <- vtreat::prepare(treatments,dTest,
pruneSig=c(),varRestriction=newvars)
knitr::kable(head(dTestTreated))
dTest$predM1 <- predict(m1,newdata=dTestTreated,type='response')
plotRes(dTest,'predM1','y','model1 on test')
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.