Nothing
## ---- echo = FALSE------------------------------------------------------------
knitr::opts_chunk$set(
fig.width = 5 ,
fig.height = 3.5,
fig.align = 'center'
)
oldpar <- list(mar = par()$mar, mfrow = par()$mfrow)
## -----------------------------------------------------------------------------
library(rpart)
library(classmap)
## -----------------------------------------------------------------------------
data("data_titanic")
traindata <- data_titanic[which(data_titanic$dataType == "train"), -13]
dim(traindata)
colnames(traindata)
# SibSp: number of siblings + spouses aboard
# Parch: number of parents + children aboard
str(traindata)
table(traindata$y)
## -----------------------------------------------------------------------------
set.seed(123)
rpart.out <- rpart(y ~ Pclass + Sex + SibSp +
Parch + Fare + Embarked,
data = traindata, method = 'class',
model = TRUE)
## plot the tree:
# pdf(file = "titanic_train_tree.pdf", width = 6, height = 6)
rpart.plot::rpart.plot(rpart.out, box.palette = "RdBu")
# dev.off()
## -----------------------------------------------------------------------------
rpart.out$variable.importance
## -----------------------------------------------------------------------------
mytype <- list(nominal = c("Name", "Sex", "Ticket", "Cabin", "Embarked"), ordratio = c("Pclass"))
## -----------------------------------------------------------------------------
x_train <- traindata[, -12]
y_train <- traindata[, 12]
vcrtrain <- vcr.rpart.train(x_train, y_train, rpart.out, mytype)
names(vcrtrain)
vcrtrain$predint[1:10] # prediction as integer
vcrtrain$pred[1:10] # prediction as label
vcrtrain$altint[1:10] # alternative label as integer
vcrtrain$altlab[1:10] # alternative label
# Probability of Alternative Class (PAC) of each object:
vcrtrain$PAC[1:3]
#
summary(vcrtrain$PAC)
# f(i, g) is the distance from case i to class g:
vcrtrain$fig[1:3, ] # for the first 3 objects:
# The farness of an object i is the f(i, g) to its own class:
vcrtrain$farness[1:3]
#
summary(vcrtrain$farness)
# The "overall farness" of an object is defined as the
# lowest f(i, g) it has to any class g (including its own):
summary(vcrtrain$ofarness)
sum(vcrtrain$ofarness > 0.99, na.rm = TRUE)
# No farness is considered outlying in these data.
confmat.vcr(vcrtrain)
cols <- c("firebrick", "blue")
## -----------------------------------------------------------------------------
stackedplot(vcrtrain, classCols = cols,
main = "Stacked plot of rpart on Titanic training data")
## -----------------------------------------------------------------------------
silplot(vcrtrain, classCols = cols,
main = "Silhouettes of rpart on Titanic training data")
# silplot.out <- silplot(vcrtrain, classCols = cols)
# ggsave("titanic_train_silhouettes.pdf", silplot.out,
# width = 5, height = 5)
## -----------------------------------------------------------------------------
hist(x_train$Age)
# Quasi residual plot versus age, for males only:
# pdf("titanic_qrp_versus_age_males.pdf", width = 4.8, height = 4.8)
PAC <- vcrtrain$PAC[which(x_train$Sex == "male")]
feat <- x_train$Age[which(x_train$Sex == "male")]
qresplot(PAC, feat, xlab = "Age (years)", opacity = 0.5,
main = "quasi residual plot for male passengers",
plotLoess = TRUE)
text(x = 14, y = 0.60, "loess curve", col = "red", cex = 1)
# dev.off()
## -----------------------------------------------------------------------------
classmap(vcrtrain, "casualty", classCols = cols)
# classmap(vcrtrain, "casualty", classCols = cols, identify = TRUE)
# blue points top right: cases "a" and "b" in the paper
x_train[which(y_train == "casualty")[119], ]
# Woman in 1st class, should have survived.
#
x_train[which(y_train == "casualty")[192], ]
# Similar, but child.
# red point most to the right: case "c" in the paper
x_train[which(y_train == "casualty")[268], ]
## -----------------------------------------------------------------------------
classmap(vcrtrain, "survived", classCols = cols)
# classmap(vcrtrain, "survived", classCols = cols, identify = TRUE)
# red point with highest farness among highest PAC: case "d" in the paper
x_train[which(y_train == "survived")[c(14)], ]
# near-coinciding points with highest farness: cases "e" and "f" in the paper
#
x_train[which(y_train == "survived")[265], ]
x_train[which(y_train == "survived")[287], ]
# man --> predicted as not survived. also paid the highest
# fare in the data and has same ticket number as passenger
# 265. Also embarked at the same place. It turns out that
# Gustave Lesueur was the man servant of the banker Thomas Cardeza:
# https://www.encyclopedia-titanica.org/titanic-survivor/thomas-cardeza.html
# https://www.encyclopedia-titanica.org/titanic-survivor/gustave-lesueur.html
# blue point bottom right: case "g" in the paper
x_train[which(y_train == "survived")[90], ]
# # woman + first class so predicted in survivors.
# # paid highest fare in whole dataset
## -----------------------------------------------------------------------------
testdata <- data_titanic[which(data_titanic$dataType == "test"), -13]
dim(testdata)
x_test <- testdata[, -12]
y_test <- testdata[, 12]
table(y_test)
## -----------------------------------------------------------------------------
vcrtest <- vcr.rpart.newdata(x_test, y_test, vcrtrain)
confmat.vcr(vcrtest)
cols <- c("firebrick", "blue")
## -----------------------------------------------------------------------------
stackedplot(vcrtest, classCols = cols,
main = "Stacked plot of Titanic test data")
## -----------------------------------------------------------------------------
silplot(vcrtest, classCols = cols,
main = "Silhouettes of rpart on Titanic test data")
## -----------------------------------------------------------------------------
classmap(vcrtest, "casualty", classCols = cols)
## -----------------------------------------------------------------------------
classmap(vcrtest, "survived", classCols = cols)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.