John Mount
KDD2009 example using the
CVRTSEncoder
R
package.
date()
## [1] "Mon May 27 10:43:01 2019"
#load some libraries
library('vtreat')
library('WVPlots')
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library('CVRTSEncoder')
library('sigr')
library('parallel')
library("glmnet")
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
dir = "../../PracticalDataScienceWithR2nd/PDSwR2/KDD2009/"
d <- read.table(paste(dir, 'orange_small_train.data.gz', sep = "/"),
header = TRUE,
sep = '\t',
na.strings = c('NA', ''))
churn <- read.table(paste(dir, 'orange_small_train_churn.labels.txt', sep = "/"),
header = FALSE, sep = '\t')
d$churn <- churn$V1
set.seed(729375)
rgroup <- base::sample(c('train', 'test'),
nrow(d),
prob = c(0.9, 0.1),
replace = TRUE)
dTrain <- d[rgroup=='train', , drop = FALSE]
dTest <- d[rgroup == 'test', , drop = FALSE]
outcome <- 'churn'
vars <- setdiff(colnames(dTrain), outcome)
rm(list=c('d', 'churn', 'rgroup'))
set.seed(239525)
ncore <- parallel::detectCores()
(cl = parallel::makeCluster(ncore))
## socket cluster with 4 nodes on host 'localhost'
yName <- "churn"
yTarget <- 1
# prepare plotting frames
trainPlot = dTrain[, yName, drop=FALSE]
testPlot = dTest[, yName, drop=FALSE]
alpha = 0.5
date()
## [1] "Mon May 27 10:43:10 2019"
date()
## [1] "Mon May 27 10:43:10 2019"
# Run other models (with proper coding/training separation).
#
# This gets us back to AUC 0.74 range
customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise,
'n.PiecewiseV.num' = vtreat::solve_piecewise,
'c.knearest.num' = vtreat::square_window,
'n.knearest.num' = vtreat::square_window)
cfe = mkCrossFrameCExperiment(dTrain,
vars,yName,yTarget,
customCoders=customCoders,
smFactor=2.0,
parallelCluster=cl)
## [1] "vtreat 1.4.0 start initial treatment design Mon May 27 10:43:10 2019"
## [1] " start cross frame work Mon May 27 10:48:57 2019"
## [1] " vtreat::mkCrossFrameCExperiment done Mon May 27 10:51:48 2019"
treatmentsC = cfe$treatments
scoreFrame = treatmentsC$scoreFrame
table(scoreFrame$code)
##
## catB catP clean isBAD knearest lev
## 33 33 173 171 3 137
## PiecewiseV
## 166
selvars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)]
treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE]
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget
treatedTest = prepare(treatmentsC,
dTest,
pruneSig=NULL,
varRestriction = selvars,
parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget
date()
## [1] "Mon May 27 10:51:49 2019"
date()
## [1] "Mon May 27 10:51:49 2019"
mname = 'glmnet_pred'
print(paste(mname,length(selvars)))
## [1] "glmnet_pred 359"
model <-
cv.glmnet(as.matrix(treatedTrainM[, selvars, drop = FALSE]),
treatedTrainM[[yName]]==yTarget,
alpha = alpha,
family = "binomial")
trainPlot[[mname]] = as.numeric(predict(
model,
newx = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
type = 'response',
s = "lambda.min"))
testPlot[[mname]] = as.numeric(predict(
model,
newx = as.matrix(treatedTest[, selvars, drop = FALSE]),
type = 'response',
s = "lambda.min"))
date()
## [1] "Mon May 27 10:59:21 2019"
calcAUC(testPlot[[mname]], testPlot[[yName]]==yTarget)
## [1] 0.735378
permTestAUC(testPlot, mname, yName, yTarget = yTarget)
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.7354, s.d.=0.01568, p<1e-05)."
wrapChiSqTest(testPlot, mname, yName, yTarget = yTarget)
## [1] "Chi-Square Test summary: pseudo-R2=0.1015 (X2(1,N=4972)=258.6, p<1e-05)."
date()
## [1] "Mon May 27 10:59:21 2019"
t1 = paste(mname,'trainingM data')
print(DoubleDensityPlot(trainPlot, mname, yName,
title=t1))
print(ROCPlot(trainPlot, mname, yName, yTarget,
title=t1))
print(WVPlots::PRPlot(trainPlot, mname, yName, yTarget,
title=t1))
t2 = paste(mname,'test data')
print(DoubleDensityPlot(testPlot, mname, yName,
title=t2))
print(ROCPlot(testPlot, mname, yName, yTarget,
title=t2))
print(WVPlots::PRPlot(testPlot, mname, yName, yTarget,
title=t2))
print(date())
## [1] "Mon May 27 10:59:24 2019"
print("*****************************")
## [1] "*****************************"
date()
## [1] "Mon May 27 10:59:24 2019"
# enrich with CVRRS encoded variables
date()
## [1] "Mon May 27 10:59:26 2019"
# encode as in https://github.com/WinVector/CVRTSEncoder
is_cat_var <- vapply(
vars,
function(ci) {
is.character(dTrain[[ci]]) || is.factor(dTrain[[ci]])
}, logical(1))
categorical_cols <- vars[is_cat_var]
numeric_cols <- vars[!is_cat_var]
cross_enc <- estimate_residual_encoding_c(
data = dTrain,
avars = c(numeric_cols, categorical_cols),
evars = categorical_cols,
dep_var = yName,
dep_target = yTarget,
n_comp = 5,
cl = cl
)
te_vars <- colnames(cross_enc$cross_frame)
vars <- c(numeric_cols, te_vars)
dTrain <- cbind(dTrain, cross_enc$cross_frame)
dTest <- cbind(dTest,prepare(cross_enc$coder, dTest))
date()
## [1] "Mon May 27 11:39:16 2019"
date()
## [1] "Mon May 27 11:39:16 2019"
# Run other models (with proper coding/training separation).
customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise,
'n.PiecewiseV.num' = vtreat::solve_piecewise,
'c.knearest.num' = vtreat::square_window,
'n.knearest.num' = vtreat::square_window)
cfe = mkCrossFrameCExperiment(dTrain,
vars,yName,yTarget,
customCoders=customCoders,
smFactor=2.0,
parallelCluster=cl)
## [1] "vtreat 1.4.0 start initial treatment design Mon May 27 11:39:16 2019"
## [1] " start cross frame work Mon May 27 11:44:46 2019"
## [1] " vtreat::mkCrossFrameCExperiment done Mon May 27 11:49:35 2019"
treatmentsC = cfe$treatments
scoreFrame = treatmentsC$scoreFrame
table(scoreFrame$code)
##
## clean isBAD knearest PiecewiseV
## 178 171 8 171
selvars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)]
treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE]
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget
treatedTest = prepare(treatmentsC,
dTest,
pruneSig=NULL,
varRestriction = selvars,
parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget
date()
## [1] "Mon May 27 11:49:35 2019"
date()
## [1] "Mon May 27 11:49:36 2019"
mname = 'glmnet_pred_CVRTS'
print(paste(mname,length(selvars)))
## [1] "glmnet_pred_CVRTS 256"
model <- cv.glmnet(as.matrix(treatedTrainM[, selvars, drop = FALSE]),
treatedTrainM[[yName]]==yTarget,
alpha = alpha,
family = "binomial")
trainPlot[[mname]] = as.numeric(predict(
model,
newx = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
type = 'response',
s = "lambda.min"))
testPlot[[mname]] = as.numeric(predict(
model,
newx = as.matrix(treatedTest[, selvars, drop = FALSE]),
type = 'response',
s = "lambda.min"))
date()
## [1] "Mon May 27 11:53:21 2019"
calcAUC(testPlot[[mname]], testPlot[[yName]]==yTarget)
## [1] 0.7100485
permTestAUC(testPlot, mname, yName, yTarget = yTarget)
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.71, s.d.=0.01354, p<1e-05)."
wrapChiSqTest(testPlot, mname, yName, yTarget = yTarget)
## [1] "Chi-Square Test summary: pseudo-R2=0.07978 (X2(1,N=4972)=203.3, p<1e-05)."
date()
## [1] "Mon May 27 11:53:21 2019"
t1 = paste(mname,'trainingM data')
print(DoubleDensityPlot(trainPlot, mname, yName,
title=t1))
print(ROCPlot(trainPlot, mname, yName, yTarget,
title=t1))
print(WVPlots::PRPlot(trainPlot, mname, yName, yTarget,
title=t1))
t2 = paste(mname,'test data')
print(DoubleDensityPlot(testPlot, mname, yName,
title=t2))
print(ROCPlot(testPlot, mname, yName, yTarget,
title=t2))
print(WVPlots::PRPlot(testPlot, mname, yName, yTarget,
title=t2))
print(date())
## [1] "Mon May 27 11:53:25 2019"
print("*****************************")
## [1] "*****************************"
date()
## [1] "Mon May 27 11:53:25 2019"
WVPlots::ROCPlotPair(testPlot,
"glmnet_pred", "glmnet_pred_CVRTS",
yName, yTarget, "ROC on test",
estimate_sig = TRUE,
parallelCluster = cl)
if(!is.null(cl)) {
parallel::stopCluster(cl)
cl = NULL
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.