Load packages
#load some libraries library('vtreat') library('WVPlots') library('sigr') library('parallel') library('xgboost')
Read in explanatory variables
d <- read.table('orange_small_train.data.gz', header = TRUE, sep = '\t', na.strings = c('NA', ''))
Read in dependent variables we are trying to predict
churn <- read.table('orange_small_train_churn.labels.txt', header = FALSE, sep = '\t') colnames(churn) <- "churn" d$churn = churn$churn table(d$churn)
Arrange test/train split. The reported performance runs of this example were sensitive to the prevalance of the churn variable in the test set, so we are cutting down on this source of evaluation variance by using a stratified split.
set.seed(729375) plan <- kWayStratifiedY(nrow(d), 10, d, d["churn"]) # use the first fold as our test/train split fold1 <- plan[[1]] train_idx <- fold1$train # training set indices test_idx <- fold1$app d_train <- d[train_idx, , drop = FALSE] d_test <- d[test_idx, , drop = FALSE] outcome <- 'churn' vars <- setdiff(colnames(d_train), outcome)
Try to use xgboost without data treatment. This fails.
model <- xgboost(data = as.matrix(d_train[, vars, drop = FALSE]), label = d_train[[outcome]], nrounds = 10, params = list(objective = "binary:logistic"))
Even if you fix this, you run into missing values, overly large categoricals, etc.
dim(d_train) # count the number of missing values in each column nNAs <- vapply(d_train[, vars], function(v) sum(is.na(v)), numeric(1)) summary(nNAs) is_categorical <- vapply(d_train[, vars], function(v) !is.numeric(v), logical(1)) # count the number of levels in each categorical column nlevels <- vapply(d_train[, is_categorical], function(v) length(unique(v)), numeric(1)) summary(nlevels)
vtreat
to prepare a clean data frameThe correct way, with cross-validated training data to avoid nested-model bias.
yTarget <- 1 # not strictly necessary, just for speed ncore <- parallel::detectCores() cl <- parallel::makeCluster(ncore) # fit the treatment plan and construct new # treated training data unpack[ transform = treatments, # treatment plan d_prepared = crossFrame # treated training data ] <- mkCrossFrameCExperiment(d_train, vars, outcomename=outcome, outcometarge=yTarget, parallelCluster=cl) scoreFrame <- transform$scoreFrame # count the number of different types of synthetic variables table(scoreFrame$code) selvars <- scoreFrame$varName # training data treatedTrainM <- d_prepared[,c(outcome,selvars),drop=FALSE] # change the outcome to boolean treatedTrainM[[outcome]] = treatedTrainM[[outcome]]==yTarget # use the treatment plan to prepare the holdout data treatedTest <- prepare(transform, d_test, parallelCluster=cl) treatedTest[[outcome]] = treatedTest[[outcome]]==yTarget # prepare plotting frames treatedTrainP = treatedTrainM[, outcome, drop=FALSE] treatedTestP = treatedTest[, outcome, drop=FALSE]
Note that the prepared data is all numeric with no missing variables.
# count the number of missing values in each column nNAs <- vapply(treatedTrainM[, selvars], function(v) sum(is.na(v)), numeric(1)) summary(nNAs) is_categorical <- vapply(treatedTrainM[, selvars], function(v) !is.numeric(v), logical(1)) sum(is_categorical)
First, use only the recommended variables
nrow(scoreFrame) # total number of synthetic variables model_vars <- scoreFrame$varName[scoreFrame$recommended] length(model_vars)
Now fit the model.
mname = 'xgboost' print(paste(mname,length(model_vars))) params <- list(max_depth = 5, objective = "binary:logistic", nthread = ncore) # cross-val to determine a good number of trees model <- xgb.cv(data = as.matrix(treatedTrainM[, model_vars, drop = FALSE]), label = treatedTrainM[[outcome]], nrounds = 400, params = params, nfold = 5, early_stopping_rounds = 10, eval_metric = "logloss") nrounds <- model$best_iteration print(paste("nrounds", nrounds)) # fit the model model <- xgboost(data = as.matrix(treatedTrainM[, model_vars, drop = FALSE]), label = treatedTrainM[[outcome]], nrounds = nrounds, params = params)
Get predictions on training and test data.
treatedTrainP[[mname]] = predict( model, newdata = as.matrix(treatedTrainM[, model_vars, drop = FALSE]), n.trees = nTrees, type = 'response') treatedTestP[[mname]] = predict( model, newdata = as.matrix(treatedTest[, model_vars, drop = FALSE]), n.trees = nTrees, type = "response")
calcAUC(treatedTestP[[mname]], treatedTestP[[outcome]]==yTarget) permTestAUC(treatedTestP, mname, outcome, yTarget = yTarget) wrapChiSqTest(treatedTestP, mname, outcome, yTarget = yTarget)
t1 = paste(mname,'model on training') print(DoubleDensityPlot(treatedTrainP, mname, outcome, title=t1)) print(ROCPlot(treatedTrainP, mname, outcome, yTarget, title=t1)) print(WVPlots::PRPlot(treatedTrainP, mname, outcome, yTarget, title=t1)) t2 = paste(mname,'model on test') print(DoubleDensityPlot(treatedTestP, mname, outcome, title=t2)) print(ROCPlot(treatedTestP, mname, outcome, yTarget, title=t2)) print(WVPlots::PRPlot(treatedTestP, mname, outcome, yTarget, title=t2))
Save predictions.
saveRDS( list(train_p = treatedTrainP, test_p = treatedTestP), file = 'predictions.RDS')
What happens if you don't account for nested model bias. Don't do this!
# fit the treatment plan from the training data # without creating cross-validated training data transform <- designTreatmentsC(d_train, vars, outcomename=outcome, outcometarge=yTarget, parallelCluster=cl) # use the treatment plan to treat the treated data directly # (this is the incorrect step) d_prepared = prepare(transform, d_train, parallelCluster=cl) scoreFrame <- transform$scoreFrame selvars <- scoreFrame$varName # training data treatedTrainM <- d_prepared[,c(outcome,selvars),drop=FALSE] # change the outcome to boolean treatedTrainM[[outcome]] = treatedTrainM[[outcome]]==yTarget # use the treatment plan to prepare the holdout data treatedTest <- prepare(transform, d_test, parallelCluster=cl) treatedTest[[outcome]] = treatedTest[[outcome]]==yTarget # prepare plotting frames treatedTrainP = treatedTrainM[, outcome, drop=FALSE] treatedTestP = treatedTest[, outcome, drop=FALSE] # get recommended variables model_vars <- scoreFrame$varName[scoreFrame$recommended]
Fit the model.
mname = 'naive xgboost' print(paste(mname,length(model_vars))) params <- list(max_depth = 5, objective = "binary:logistic", nthread = ncore) # cross-val to determine a good number of trees model <- xgb.cv(data = as.matrix(treatedTrainM[, model_vars, drop = FALSE]), label = treatedTrainM[[outcome]], nrounds = 400, params = params, nfold = 5, early_stopping_rounds = 10, eval_metric = "logloss") nrounds <- model$best_iteration print(paste("nrounds", nrounds)) # fit the model model <- xgboost(data = as.matrix(treatedTrainM[, model_vars, drop = FALSE]), label = treatedTrainM[[outcome]], nrounds = nrounds, params = params)
treatedTrainP[[mname]] = predict( model, newdata = as.matrix(treatedTrainM[, model_vars, drop = FALSE]), n.trees = nTrees, type = 'response') treatedTestP[[mname]] = predict( model, newdata = as.matrix(treatedTest[, model_vars, drop = FALSE]), n.trees = nTrees, type = "response")
Look at the ROC Plots
t1 = paste(mname,'model on training') print(ROCPlot(treatedTrainP, mname, outcome, yTarget, title=t1)) t2 = paste(mname,'model on test') print(ROCPlot(treatedTestP, mname, outcome, yTarget, title=t2))
if(!is.null(cl)) { parallel::stopCluster(cl) cl = NULL }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.