knitr::opts_chunk$set( fig.width = 5 , fig.height = 3.5, fig.align = 'center' ) oldpar <- list(mar = par()$mar, mfrow = par()$mfrow)
This vignette visualizes classification results from rpart (CART), using tools from the package. The displays in this vignette are discussed in section 4 of Raymaekers and Rousseeuw (2021) (for the training data), and in section A.4 of the Supplementary Material (for the test data).
We first load and inspect the data.
data("data_titanic") traindata <- data_titanic[which(data_titanic$dataType == "train"), -13] dim(traindata) colnames(traindata) # SibSp: number of siblings + spouses aboard # Parch: number of parents + children aboard str(traindata) table(traindata$y)
Now we want to predict the response y by CART (rpart). We set the seed as rpart is not deterministic. We also inspect the resulting classification tree.
set.seed(123) rpart.out <- rpart(y ~ Pclass + Sex + SibSp + Parch + Fare + Embarked, data = traindata, method = 'class', model = TRUE) ## plot the tree: # pdf(file = "titanic_train_tree.pdf", width = 6, height = 6) rpart.plot::rpart.plot(rpart.out, box.palette = "RdBu") # dev.off()
We now inspect the variable importance, which will be used to calculate the variable weights in the farness computation.
Now we declare the types of the variables. This is used in the calculation of the Daisy distance, which in turn is needed for the farness computation. There are 5 nominal columns and one ordinal. The variables not listed here are interval-scaled by default.
mytype <- list(nominal = c("Name", "Sex", "Ticket", "Cabin", "Embarked"), ordratio = c("Pclass"))
Now we prepare for visualization and inspect the elements of the output.
x_train <- traindata[, -12] y_train <- traindata[, 12] vcrtrain <- vcr.rpart.train(x_train, y_train, rpart.out, mytype) names(vcrtrain) vcrtrain$predint[1:10] # prediction as integer vcrtrain$pred[1:10] # prediction as label vcrtrain$altint[1:10] # alternative label as integer vcrtrain$altlab[1:10] # alternative label # Probability of Alternative Class (PAC) of each object: vcrtrain$PAC[1:3] # summary(vcrtrain$PAC) # f(i, g) is the distance from case i to class g: vcrtrain$fig[1:3, ] # for the first 3 objects: # The farness of an object i is the f(i, g) to its own class: vcrtrain$farness[1:3] # summary(vcrtrain$farness) # The "overall farness" of an object is defined as the # lowest f(i, g) it has to any class g (including its own): summary(vcrtrain$ofarness) sum(vcrtrain$ofarness > 0.99, na.rm = TRUE) # No farness is considered outlying in these data. confmat.vcr(vcrtrain) cols <- c("firebrick", "blue")
We first make the stacked plot, visualizing the confusion matrix:
stackedplot(vcrtrain, classCols = cols, main = "Stacked plot of rpart on Titanic training data")
Now we consider the silhouette plot
silplot(vcrtrain, classCols = cols, main = "Silhouettes of rpart on Titanic training data") # silplot.out <- silplot(vcrtrain, classCols = cols) # ggsave("titanic_train_silhouettes.pdf", silplot.out, # width = 5, height = 5)
Now we construct the quasi residual plot of PAC vs. age for males only. The age variable is not in the model, but including it did not improve the classification. We see that the very young male passengers are often misclassified.
hist(x_train$Age) # Quasi residual plot versus age, for males only: # pdf("titanic_qrp_versus_age_males.pdf", width = 4.8, height = 4.8) PAC <- vcrtrain$PAC[which(x_train$Sex == "male")] feat <- x_train$Age[which(x_train$Sex == "male")] qresplot(PAC, feat, xlab = "Age (years)", opacity = 0.5, main = "quasi residual plot for male passengers", plotLoess = TRUE) text(x = 14, y = 0.60, "loess curve", col = "red", cex = 1) # dev.off()
Now we construct the class maps. First of the casualties.
classmap(vcrtrain, "casualty", classCols = cols) # classmap(vcrtrain, "casualty", classCols = cols, identify = TRUE) # blue points top right: cases "a" and "b" in the paper x_train[which(y_train == "casualty"), ] # Woman in 1st class, should have survived. # x_train[which(y_train == "casualty"), ] # Similar, but child. # red point most to the right: case "c" in the paper x_train[which(y_train == "casualty"), ]
Now the class map of the survivors.
classmap(vcrtrain, "survived", classCols = cols) # classmap(vcrtrain, "survived", classCols = cols, identify = TRUE) # red point with highest farness among highest PAC: case "d" in the paper x_train[which(y_train == "survived")[c(14)], ] # near-coinciding points with highest farness: cases "e" and "f" in the paper # x_train[which(y_train == "survived"), ] x_train[which(y_train == "survived"), ] # man --> predicted as not survived. also paid the highest # fare in the data and has same ticket number as passenger # 265. Also embarked at the same place. It turns out that # Gustave Lesueur was the man servant of the banker Thomas Cardeza: # https://www.encyclopedia-titanica.org/titanic-survivor/thomas-cardeza.html # https://www.encyclopedia-titanica.org/titanic-survivor/gustave-lesueur.html # blue point bottom right: case "g" in the paper x_train[which(y_train == "survived"), ] # # woman + first class so predicted in survivors. # # paid highest fare in whole dataset
In addition to the training data, we consider the titanic test data. First we load the data and inspect.
testdata <- data_titanic[which(data_titanic$dataType == "test"), -13] dim(testdata) x_test <- testdata[, -12] y_test <- testdata[, 12] table(y_test)
Now we prepare for visualization:
vcrtest <- vcr.rpart.newdata(x_test, y_test, vcrtrain) confmat.vcr(vcrtest) cols <- c("firebrick", "blue")
Now we can visualize, starting with the stacked plot:
stackedplot(vcrtest, classCols = cols, main = "Stacked plot of Titanic test data")
Now we construct the silhouette plot:
silplot(vcrtest, classCols = cols, main = "Silhouettes of rpart on Titanic test data")
Finally, we make the class maps. First of the casualties, in which the misclassifications are all female (since all males are predicted as casualty by this model).
classmap(vcrtest, "casualty", classCols = cols)
Now the class map of survivors:
classmap(vcrtest, "survived", classCols = cols)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.