KDD2009 example using the CVRTSEncoder
R
package.
date() #load some libraries library('vtreat') library('WVPlots') library('CVRTSEncoder') library('sigr') library('parallel') library("glmnet") dir = "../../PracticalDataScienceWithR2nd/PDSwR2/KDD2009/" d <- read.table(paste(dir, 'orange_small_train.data.gz', sep = "/"), header = TRUE, sep = '\t', na.strings = c('NA', '')) churn <- read.table(paste(dir, 'orange_small_train_churn.labels.txt', sep = "/"), header = FALSE, sep = '\t') d$churn <- churn$V1 set.seed(729375) rgroup <- base::sample(c('train', 'test'), nrow(d), prob = c(0.9, 0.1), replace = TRUE) dTrain <- d[rgroup=='train', , drop = FALSE] dTest <- d[rgroup == 'test', , drop = FALSE] outcome <- 'churn' vars <- setdiff(colnames(dTrain), outcome) rm(list=c('d', 'churn', 'rgroup')) set.seed(239525) ncore <- parallel::detectCores() (cl = parallel::makeCluster(ncore)) yName <- "churn" yTarget <- 1 # prepare plotting frames trainPlot = dTrain[, yName, drop=FALSE] testPlot = dTest[, yName, drop=FALSE] alpha = 0.5 date()
date() # Run other models (with proper coding/training separation). # # This gets us back to AUC 0.74 range customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise, 'n.PiecewiseV.num' = vtreat::solve_piecewise, 'c.knearest.num' = vtreat::square_window, 'n.knearest.num' = vtreat::square_window) cfe = mkCrossFrameCExperiment(dTrain, vars,yName,yTarget, customCoders=customCoders, smFactor=2.0, parallelCluster=cl) treatmentsC = cfe$treatments scoreFrame = treatmentsC$scoreFrame table(scoreFrame$code) selvars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)] treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE] treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget treatedTest = prepare(treatmentsC, dTest, pruneSig=NULL, varRestriction = selvars, parallelCluster=cl) treatedTest[[yName]] = treatedTest[[yName]]==yTarget date()
date() mname = 'glmnet_pred' print(paste(mname,length(selvars))) model <- cv.glmnet(as.matrix(treatedTrainM[, selvars, drop = FALSE]), treatedTrainM[[yName]]==yTarget, alpha = alpha, family = "binomial") trainPlot[[mname]] = as.numeric(predict( model, newx = as.matrix(treatedTrainM[, selvars, drop = FALSE]), type = 'response', s = "lambda.min")) testPlot[[mname]] = as.numeric(predict( model, newx = as.matrix(treatedTest[, selvars, drop = FALSE]), type = 'response', s = "lambda.min")) date()
calcAUC(testPlot[[mname]], testPlot[[yName]]==yTarget) permTestAUC(testPlot, mname, yName, yTarget = yTarget) wrapChiSqTest(testPlot, mname, yName, yTarget = yTarget)
date() t1 = paste(mname,'trainingM data') print(DoubleDensityPlot(trainPlot, mname, yName, title=t1)) print(ROCPlot(trainPlot, mname, yName, yTarget, title=t1)) print(WVPlots::PRPlot(trainPlot, mname, yName, yTarget, title=t1)) t2 = paste(mname,'test data') print(DoubleDensityPlot(testPlot, mname, yName, title=t2)) print(ROCPlot(testPlot, mname, yName, yTarget, title=t2)) print(WVPlots::PRPlot(testPlot, mname, yName, yTarget, title=t2)) print(date()) print("*****************************") date()
# enrich with CVRRS encoded variables date() # encode as in https://github.com/WinVector/CVRTSEncoder is_cat_var <- vapply( vars, function(ci) { is.character(dTrain[[ci]]) || is.factor(dTrain[[ci]]) }, logical(1)) categorical_cols <- vars[is_cat_var] numeric_cols <- vars[!is_cat_var] cross_enc <- estimate_residual_encoding_c( data = dTrain, avars = c(numeric_cols, categorical_cols), evars = categorical_cols, dep_var = yName, dep_target = yTarget, n_comp = 5, cl = cl ) te_vars <- colnames(cross_enc$cross_frame) vars <- c(numeric_cols, te_vars) dTrain <- cbind(dTrain, cross_enc$cross_frame) dTest <- cbind(dTest,prepare(cross_enc$coder, dTest)) date()
date() # Run other models (with proper coding/training separation). customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise, 'n.PiecewiseV.num' = vtreat::solve_piecewise, 'c.knearest.num' = vtreat::square_window, 'n.knearest.num' = vtreat::square_window) cfe = mkCrossFrameCExperiment(dTrain, vars,yName,yTarget, customCoders=customCoders, smFactor=2.0, parallelCluster=cl) treatmentsC = cfe$treatments scoreFrame = treatmentsC$scoreFrame table(scoreFrame$code) selvars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)] treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE] treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget treatedTest = prepare(treatmentsC, dTest, pruneSig=NULL, varRestriction = selvars, parallelCluster=cl) treatedTest[[yName]] = treatedTest[[yName]]==yTarget date()
date() mname = 'glmnet_pred_CVRTS' print(paste(mname,length(selvars))) model <- cv.glmnet(as.matrix(treatedTrainM[, selvars, drop = FALSE]), treatedTrainM[[yName]]==yTarget, alpha = alpha, family = "binomial") trainPlot[[mname]] = as.numeric(predict( model, newx = as.matrix(treatedTrainM[, selvars, drop = FALSE]), type = 'response', s = "lambda.min")) testPlot[[mname]] = as.numeric(predict( model, newx = as.matrix(treatedTest[, selvars, drop = FALSE]), type = 'response', s = "lambda.min")) date()
calcAUC(testPlot[[mname]], testPlot[[yName]]==yTarget) permTestAUC(testPlot, mname, yName, yTarget = yTarget) wrapChiSqTest(testPlot, mname, yName, yTarget = yTarget)
date() t1 = paste(mname,'trainingM data') print(DoubleDensityPlot(trainPlot, mname, yName, title=t1)) print(ROCPlot(trainPlot, mname, yName, yTarget, title=t1)) print(WVPlots::PRPlot(trainPlot, mname, yName, yTarget, title=t1)) t2 = paste(mname,'test data') print(DoubleDensityPlot(testPlot, mname, yName, title=t2)) print(ROCPlot(testPlot, mname, yName, yTarget, title=t2)) print(WVPlots::PRPlot(testPlot, mname, yName, yTarget, title=t2)) print(date()) print("*****************************") date()
WVPlots::ROCPlotPair(testPlot, "glmnet_pred", "glmnet_pred_CVRTS", yName, yTarget, "ROC on test", estimate_sig = TRUE, parallelCluster = cl)
if(!is.null(cl)) { parallel::stopCluster(cl) cl = NULL }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.