KDD2009 example using the CVRTSEncoder R package.

date()
#load some libraries
library('vtreat')
library('WVPlots') 
library('CVRTSEncoder')
library('sigr')
library('parallel')
library("glmnet")


dir = "../../PracticalDataScienceWithR2nd/PDSwR2/KDD2009/"


d <- read.table(paste(dir, 'orange_small_train.data.gz', sep = "/"), 
   header = TRUE,
   sep = '\t',
   na.strings = c('NA', '')) 

churn <- read.table(paste(dir, 'orange_small_train_churn.labels.txt', sep = "/"),
   header = FALSE, sep = '\t')  
d$churn <- churn$V1 

set.seed(729375) 
rgroup <- base::sample(c('train', 'test'),   
   nrow(d), 
   prob = c(0.9, 0.1),
   replace = TRUE)
dTrain <- d[rgroup=='train', , drop = FALSE]
dTest <- d[rgroup == 'test', , drop = FALSE]

outcome <- 'churn' 
vars <- setdiff(colnames(dTrain), outcome)


rm(list=c('d', 'churn', 'rgroup'))  

set.seed(239525)

ncore <- parallel::detectCores()
(cl = parallel::makeCluster(ncore))

yName <- "churn"
yTarget <- 1

# prepare plotting frames
trainPlot = dTrain[, yName, drop=FALSE]
testPlot = dTest[, yName, drop=FALSE]

alpha = 0.5

date()
date()

# Run other models (with proper coding/training separation).
#
# This gets us back to AUC 0.74 range

customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise,
                    'n.PiecewiseV.num' = vtreat::solve_piecewise,
                    'c.knearest.num' = vtreat::square_window,
                    'n.knearest.num' = vtreat::square_window)
cfe = mkCrossFrameCExperiment(dTrain,
                              vars,yName,yTarget,
                              customCoders=customCoders,
                              smFactor=2.0, 
                              parallelCluster=cl)


treatmentsC = cfe$treatments
scoreFrame = treatmentsC$scoreFrame
table(scoreFrame$code)
selvars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)]
treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE]
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget

treatedTest = prepare(treatmentsC,
                      dTest,
                      pruneSig=NULL, 
                      varRestriction = selvars,
                      parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget


date()
date()
mname = 'glmnet_pred'
print(paste(mname,length(selvars)))

model <- 
  cv.glmnet(as.matrix(treatedTrainM[, selvars, drop = FALSE]),
                   treatedTrainM[[yName]]==yTarget,
                   alpha = alpha,
                   family = "binomial")
trainPlot[[mname]] = as.numeric(predict(
  model, 
  newx = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
  type = 'response',
  s = "lambda.min"))
testPlot[[mname]] = as.numeric(predict(
  model,
  newx = as.matrix(treatedTest[, selvars, drop = FALSE]),
  type = 'response',
  s = "lambda.min"))
date()
calcAUC(testPlot[[mname]], testPlot[[yName]]==yTarget)

permTestAUC(testPlot, mname, yName, yTarget = yTarget)

wrapChiSqTest(testPlot, mname, yName, yTarget = yTarget)
date()


t1 = paste(mname,'trainingM data')
print(DoubleDensityPlot(trainPlot, mname, yName, 
                        title=t1))
print(ROCPlot(trainPlot, mname, yName, yTarget,
              title=t1))
print(WVPlots::PRPlot(trainPlot, mname, yName, yTarget,
              title=t1))

t2 = paste(mname,'test data')
print(DoubleDensityPlot(testPlot, mname, yName, 
                        title=t2))
print(ROCPlot(testPlot, mname, yName, yTarget,
              title=t2))
print(WVPlots::PRPlot(testPlot, mname, yName, yTarget,
              title=t2))

print(date())
print("*****************************")
date()
# enrich with CVRRS encoded variables
date()

# encode as in https://github.com/WinVector/CVRTSEncoder
is_cat_var <- vapply(
  vars,
  function(ci) {
    is.character(dTrain[[ci]]) || is.factor(dTrain[[ci]])
  }, logical(1))
categorical_cols <- vars[is_cat_var]
numeric_cols <- vars[!is_cat_var]

cross_enc <- estimate_residual_encoding_c(
  data = dTrain,
  avars = c(numeric_cols, categorical_cols),
  evars = categorical_cols,
  dep_var = yName,
  dep_target = yTarget,
  n_comp = 5,
  cl = cl
)
te_vars <- colnames(cross_enc$cross_frame)
vars <- c(numeric_cols, te_vars)
dTrain <- cbind(dTrain, cross_enc$cross_frame)
dTest <- cbind(dTest,prepare(cross_enc$coder, dTest))

date()
date()

# Run other models (with proper coding/training separation).

customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise,
                    'n.PiecewiseV.num' = vtreat::solve_piecewise,
                    'c.knearest.num' = vtreat::square_window,
                    'n.knearest.num' = vtreat::square_window)
cfe = mkCrossFrameCExperiment(dTrain,
                              vars,yName,yTarget,
                              customCoders=customCoders,
                              smFactor=2.0, 
                              parallelCluster=cl)


treatmentsC = cfe$treatments
scoreFrame = treatmentsC$scoreFrame
table(scoreFrame$code)
selvars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)]
treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE]
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget

treatedTest = prepare(treatmentsC,
                      dTest,
                      pruneSig=NULL, 
                      varRestriction = selvars,
                      parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget

date()
date()
mname = 'glmnet_pred_CVRTS'
print(paste(mname,length(selvars)))

model <- cv.glmnet(as.matrix(treatedTrainM[, selvars, drop = FALSE]),
                   treatedTrainM[[yName]]==yTarget,
                   alpha = alpha,
                   family = "binomial")
trainPlot[[mname]] = as.numeric(predict(
  model, 
  newx = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
  type = 'response',
  s = "lambda.min"))
testPlot[[mname]] = as.numeric(predict(
  model,
  newx = as.matrix(treatedTest[, selvars, drop = FALSE]),
  type = 'response',
  s = "lambda.min"))
date()
calcAUC(testPlot[[mname]], testPlot[[yName]]==yTarget)

permTestAUC(testPlot, mname, yName, yTarget = yTarget)

wrapChiSqTest(testPlot, mname, yName, yTarget = yTarget)
date()


t1 = paste(mname,'trainingM data')
print(DoubleDensityPlot(trainPlot, mname, yName, 
                        title=t1))
print(ROCPlot(trainPlot, mname, yName, yTarget,
              title=t1))
print(WVPlots::PRPlot(trainPlot, mname, yName, yTarget,
              title=t1))

t2 = paste(mname,'test data')
print(DoubleDensityPlot(testPlot, mname, yName, 
                        title=t2))
print(ROCPlot(testPlot, mname, yName, yTarget,
              title=t2))
print(WVPlots::PRPlot(testPlot, mname, yName, yTarget,
              title=t2))

print(date())
print("*****************************")
date()
WVPlots::ROCPlotPair(testPlot, 
                     "glmnet_pred", "glmnet_pred_CVRTS",
                     yName, yTarget, "ROC on test",
                     estimate_sig = TRUE,
                     parallelCluster = cl)
if(!is.null(cl)) {
    parallel::stopCluster(cl)
    cl = NULL
}


WinVector/CVRTSEncoder documentation built on June 7, 2019, 9:53 a.m.