# ComparingSubsets.R
# In datarobot: 'DataRobot' Predictive Modeling API

## ----echo = TRUE--------------------------------------------------------------
# Attach mlbench, then load the PimaIndiansDiabetes data set and print a
# compact overview of its structure.
library(mlbench)
data("PimaIndiansDiabetes")
str(object = PimaIndiansDiabetes)

## ----echo = FALSE, fig.width=7,fig.height=6, fig.cap="Figure 1: Normal QQ plots of four Pima Indians diabetes variables.", warning = FALSE, message = FALSE----
# Draw a 2 x 2 grid of normal QQ plots, one panel per variable.
par(mfrow = c(2, 2))
# Map each plotted column name to the descriptive title of its panel.
qqTitles <- c(glucose = "Plasma glucose concentration",
              pressure = "Diastolic blood pressure",
              triceps = "Triceps skinfold thickness",
              insulin = "Serum insulin")
for (varName in names(qqTitles)) {
  # Supply the title through `main`: qqnorm() otherwise draws its default
  # "Normal Q-Q Plot" heading, and a separate title() call would print the
  # descriptive title on top of that heading.
  qqnorm(PimaIndiansDiabetes[[varName]], main = qqTitles[[varName]],
         ylab = varName)
}

## ----echo = FALSE-------------------------------------------------------------
# Percentage of records whose insulin value equals zero, rounded to one
# decimal place. NA comparisons are not counted.
MissPctInsulin <- round(
  100 * sum(PimaIndiansDiabetes$insulin == 0, na.rm = TRUE) /
    nrow(PimaIndiansDiabetes),
  digits = 1
)

## ----echo = TRUE--------------------------------------------------------------
# Work on a copy so the original data frame is left untouched.
modifiedPima <- PimaIndiansDiabetes
# 0/1 indicator: 1 where the insulin value equals zero.
insulinMissing <- as.numeric(modifiedPima[["insulin"]] == 0)
# Drop the insulin column and append the indicator as the last column.
modifiedPima[["insulin"]] <- NULL
modifiedPima[["insulinMissing"]] <- insulinMissing

## ----echo = TRUE, eval = FALSE------------------------------------------------
# insulinProject <- StartProject(dataSource = modifiedPima,
#                                projectName = "InsulinProject",
#                                target = "insulinMissing",
#                                wait = TRUE)

## ----echo = TRUE, eval = FALSE------------------------------------------------
# insulinModelList <- ListModels(insulinProject)

## ----echo = FALSE, fig.width = 7, fig.height = 6, fig.cap = "Figure 2: Barplot of LogLoss values for the models predicting missingInsulin.", warning = FALSE, message = FALSE----
library(datarobot)
# Single-panel layout for the plot drawn below.
par(mfrow = c(1, 1))
# Read the stored model list from disk and convert it to a data frame
# using the non-simple (simple = FALSE) conversion.
insulinModelList <- readRDS(file = "insulinModelList.rds")
insulinModelFrame <- as.data.frame(x = insulinModelList, simple = FALSE)
# Plot the model list itself (not the data frame).
plot(x = insulinModelList, orderDecreasing = TRUE,
     xpos = 0.25, textSize = 0.6)

## ----echo = FALSE-------------------------------------------------------------
# Row positions of the smallest and largest LogLoss.validation values.
worstIndex <- which.max(insulinModelFrame$LogLoss.validation)
bestIndex <- which.min(insulinModelFrame$LogLoss.validation)
# Show the expandedModel entry of the row with the smallest LogLoss.
insulinModelFrame[bestIndex, "expandedModel"]

## ----echo = FALSE, fig.width = 7, fig.height = 6, fig.cap = "Figure 3: Plot of AUC values for the models predicting missingInsulin.", warning = FALSE, message = FALSE----
# Scatter of AUC.validation against row position, single panel.
par(mfrow = c(1, 1))
plot(insulinModelFrame$AUC.validation,
     ylab = "Area under the ROC curve",
     xlab = "Model number")
# Mark the row selected by bestIndex with a solid red point.
points(x = bestIndex,
       y = insulinModelFrame$AUC.validation[bestIndex],
       col = "red", pch = 16)

## ----echo = TRUE, eval = FALSE------------------------------------------------
# modelList <- vector("list", 9)
# modelList[[1]] <- insulinModelList
# allVars <- colnames(modifiedPima)[1:8]
# permFile <- tempfile(fileext = "permFile.csv")
# for (i in 1:8) {
#   varName <- allVars[i]
#   PermuteColumn("modifiedPima.csv", varName, permFile)
#   projName <- paste("PermProject",varName,sep="")
#   permProject <- StartProject(permFile, projectName = projName, target = "insulinMissing", wait = TRUE)
#   modelList[[i+1]] <- ListModels(permProject)
# }

## ----echo = FALSE, fig.width = 7, fig.height = 6, fig.cap="Figure 4: Beanplot summary of LogLoss shifts versus random permutation.", warning = FALSE, message = FALSE----
par(mfrow = c(1, 1))
library(beanplot)
# Read the stored frame of LogLoss shifts; its first eight columns are plotted.
logLossDeltas <- readRDS(file = "insulinDeltaFrame.rds")
# One bean per column, with axis labels drawn perpendicular (las = 2).
beanplot(logLossDeltas[, seq_len(8)],
         what = c(0, 1, 1, 1),
         col = c("transparent", "red", "red", "blue"),
         ylab = "LogLoss Shift", xlab = "", las = 2)
# Overlay the row with the smallest originalLogLoss as green points.
bestRow <- which.min(logLossDeltas$originalLogLoss)
points(seq_len(8), logLossDeltas[bestRow, seq_len(8)],
       col = "limegreen", pch = 16, cex = 1.5)
legend("topright", legend = c("Best", "Average"),
       col = c("limegreen", "blue"), pch = c(16, 15), cex = 1.2)
# Dashed reference line at zero shift.
abline(h = 0, lty = 2)

## ----echo = FALSE, fig.width = 7, fig.height = 6, fig.cap = "Figure 5: Plot of AUC values for the original models (open circles) and those for the models with the random permutation applied to triceps (solid red triangles)", warning = FALSE, message = FALSE----
par(mfrow = c(1, 1))
AUCshiftFrame <- readRDS(file = "AUCshiftFrame.rds")
# Both AUC series are reordered by increasing originalLogLoss taken from
# logLossDeltas.
# NOTE(review): this assumes the rows of AUCshiftFrame and logLossDeltas
# line up one-to-one -- confirm against how the .rds files were built.
sortIndex <- order(logLossDeltas$originalLogLoss)
# Default plotting symbol for the originalAUC column ...
plot(AUCshiftFrame$originalAUC[sortIndex],
     ylab = "Area under ROC curve",
     xlab = "Model number")
# ... and solid red triangles for the triceps column.
points(AUCshiftFrame$triceps[sortIndex], col = "red", pch = 17)

## ----echo = FALSE-------------------------------------------------------------
# 0/1 indicators of zero-valued triceps and insulin entries.
missingTriceps <- as.numeric(PimaIndiansDiabetes[["triceps"]] == 0)
missingInsulin <- as.numeric(PimaIndiansDiabetes[["insulin"]] == 0)
# Cross-tabulate the two indicators (insulin in rows, triceps in columns).
table(missingInsulin, missingTriceps)

## ----echo = TRUE--------------------------------------------------------------
# Attach insuranceData and load the dataCar data set.
library(insuranceData)
data("dataCar")

## ----echo = TRUE--------------------------------------------------------------
# Columns retained in the reduced data frame.
keepVars <- c("veh_value", "exposure", "claimcst0", "veh_body", "veh_age",
              "gender", "area", "agecat")
# Row positions with a strictly positive claimcst0 value.
lossIndex <- which(dataCar$claimcst0 > 0)
# Keep only those rows and the selected columns.
lossFrame <- dataCar[lossIndex, keepVars, drop = FALSE]

## ----echo = FALSE-------------------------------------------------------------
# Share of all dataCar rows selected by lossIndex, as a percentage to one
# decimal place.
lossPct <- round(100 * length(lossIndex) / nrow(dataCar), 1)
# Rows of lossFrame whose claimcst0 value is exactly 200, and their share
# of the lossIndex rows, as a percentage to one decimal place.
anomIndex <- which(lossFrame[["claimcst0"]] == 200)
anomPct <- round(100 * length(anomIndex) / length(lossIndex), 1)

## ----echo = TRUE, eval = FALSE------------------------------------------------
# anomaly <- as.numeric(lossFrame$claimcst0 == 200)
# anomFrame <- lossFrame
# anomFrame$claimcst0 <- NULL
# anomFrame$anomaly <- anomaly
# anomProject <- StartProject(anomFrame, projectName = "AnomalyProject", target = "anomaly", wait = TRUE)
# anomalyModelList <- ListModels(anomProject)

## ----echo = FALSE, fig.width=7,fig.height=6, fig.cap="Figure 7: Horizontal barplot LogLoss summary of the 64% models for the small loss data.", warning = FALSE, message = FALSE----
par(mfrow = c(1, 1))
# Read the stored model list and convert it to a data frame using the
# non-simple (simple = FALSE) conversion.
anomalyLeaderboard <- readRDS(file = "anomalyModelList.rds")
anomalyLeaderFrame <- as.data.frame(x = anomalyLeaderboard, simple = FALSE)
# The plot is drawn for the largest samplePct value present in the frame.
plotPct <- max(anomalyLeaderFrame$samplePct)
plot(x = anomalyLeaderboard, pct = plotPct,
     orderDecreasing = TRUE, xlim = c(0, 0.45))
# Thick dashed magenta vertical line at the smallest LogLoss.validation value.
abline(v = min(anomalyLeaderFrame$LogLoss.validation),
       col = "magenta", lwd = 2, lty = 2)

## ----echo = FALSE, fig.width=7,fig.height=6, fig.cap="Figure 8: Scatterplot summary of the AUC values for all models from AnomLossProject.", warning = FALSE, message = FALSE----
par(mfrow = c(1, 1))
# AUC.validation and samplePct columns of the leaderboard frame.
AAUC <- anomalyLeaderFrame$AUC.validation
samplePct <- anomalyLeaderFrame$samplePct
# Distinct samplePct values in ascending order. The legend below labels the
# first three as 16%, 32% and 64%.
# NOTE(review): assumes the stored data holds exactly these sizes -- confirm.
sizes <- sort(unique(samplePct))
# Row positions of the models at each of the three sample sizes.
Index16 <- which(samplePct == sizes[1])
Index32 <- which(samplePct == sizes[2])
Index64 <- which(samplePct == sizes[3])
# Base scatter of every AUC value, then colour-coded overlays per size.
plot(AAUC, ylab = "Area under ROC curve", xlab = "Model number")
points(Index64, AAUC[Index64], col = "red", pch = 16)
points(Index32, AAUC[Index32], col = "limegreen", pch = 16)
points(Index16, AAUC[Index16], col = "blue", pch = 16)
legend("bottomleft",
       legend = c("16% data sample", "32% data sample", "64% data sample"),
       col = c("blue", "limegreen", "red"), pch = 16)

## ----echo = FALSE, fig.width=7,fig.height=6, fig.cap="Figure 9: Beanplot summary of AUC shifts versus random permutation.", warning = FALSE, message = FALSE----
anomAUCDeltaFrame <- readRDS(file = "anomAUCDeltaFrame.rds")
# Find the leaderboard row with the smallest LogLoss.validation, then find
# the row(s) of the delta frame carrying the same expandedModel value.
bestIndex <- which.min(anomalyLeaderFrame$LogLoss.validation)
bestExpModel <- as.character(anomalyLeaderFrame$expandedModel)[bestIndex]
bestRow <- which(anomAUCDeltaFrame$expandedModel == bestExpModel)
par(mfrow = c(1, 1))
# One bean per column for the first seven columns, with a fixed y-range.
beanplot(anomAUCDeltaFrame[, seq_len(7)],
         what = c(0, 1, 1, 1),
         col = c("transparent", "red", "red", "blue"),
         ylim = c(-0.1, 0.1),
         ylab = "AUC Shift", xlab = "", las = 2)
# Overlay the matched row as green points.
points(seq_len(7), anomAUCDeltaFrame[bestRow, seq_len(7)],
       col = "limegreen", pch = 16, cex = 1.5)
legend("topright", legend = c("Best", "Average"),
       col = c("limegreen", "blue"), pch = c(16, 15), cex = 1.2)
# Dashed reference line at zero shift.
abline(h = 0, lty = 2)