inst/doc/Rpart_examples.R

## ---- echo = FALSE------------------------------------------------------------
knitr::opts_chunk$set(
 fig.width  = 5 ,
 fig.height = 3.5,
 fig.align  = 'center'
)
oldpar <- list(mar = par()$mar, mfrow = par()$mfrow)

## -----------------------------------------------------------------------------
library(rpart)
library(classmap)

## -----------------------------------------------------------------------------
data("data_titanic")

traindata <- data_titanic[which(data_titanic$dataType == "train"), -13]

dim(traindata) 
colnames(traindata)
# SibSp: number of siblings + spouses aboard
# Parch: number of parents + children aboard

str(traindata)
table(traindata$y)

## -----------------------------------------------------------------------------
set.seed(123) 
rpart.out <- rpart(y ~ Pclass + Sex + SibSp + 
                    Parch + Fare + Embarked,
                  data = traindata, method = 'class',
                  model = TRUE)

## plot the tree:
# pdf(file = "titanic_train_tree.pdf", width = 6, height = 6)
rpart.plot::rpart.plot(rpart.out, box.palette = "RdBu")
# dev.off()

## -----------------------------------------------------------------------------
rpart.out$variable.importance

## -----------------------------------------------------------------------------
mytype <- list(nominal = c("Name", "Sex", "Ticket", "Cabin", "Embarked"), ordratio = c("Pclass"))

## -----------------------------------------------------------------------------
x_train <- traindata[, -12]
y_train <- traindata[,  12]
vcrtrain <- vcr.rpart.train(x_train, y_train, rpart.out, mytype)
names(vcrtrain)

vcrtrain$predint[1:10] # prediction as integer
vcrtrain$pred[1:10]    # prediction as label
vcrtrain$altint[1:10]  # alternative label as integer
vcrtrain$altlab[1:10]  # alternative label

# Probability of Alternative Class (PAC) of each object:
vcrtrain$PAC[1:3] 
#
summary(vcrtrain$PAC)

# f(i, g) is the distance from case i to class g:
vcrtrain$fig[1:3, ] # for the first 3 objects:

# The farness of an object i is the f(i, g) to its own class: 
vcrtrain$farness[1:3]
#
summary(vcrtrain$farness)

# The "overall farness" of an object is defined as the 
# lowest f(i, g) it has to any class g (including its own):
summary(vcrtrain$ofarness)

sum(vcrtrain$ofarness > 0.99, na.rm = TRUE) 
# No farness is considered outlying in these data.

confmat.vcr(vcrtrain) 

cols <- c("firebrick", "blue")

## -----------------------------------------------------------------------------
stackedplot(vcrtrain, classCols = cols,
            main = "Stacked plot of rpart on Titanic training data")

## -----------------------------------------------------------------------------
silplot(vcrtrain, classCols = cols, 
        main = "Silhouettes of rpart on Titanic training data")
# silplot.out <- silplot(vcrtrain, classCols = cols)
# ggsave("titanic_train_silhouettes.pdf", silplot.out,
#        width = 5, height = 5)

## -----------------------------------------------------------------------------
hist(x_train$Age)

# Quasi residual plot versus age, for males only:

# pdf("titanic_qrp_versus_age_males.pdf", width = 4.8, height = 4.8)
PAC <- vcrtrain$PAC[which(x_train$Sex == "male")]
feat <- x_train$Age[which(x_train$Sex == "male")]
qresplot(PAC, feat, xlab = "Age (years)", opacity = 0.5,
         main = "quasi residual plot for male passengers",
         plotLoess = TRUE)
text(x = 14, y = 0.60, "loess curve", col = "red", cex = 1)
# dev.off()

## -----------------------------------------------------------------------------
classmap(vcrtrain, "casualty", classCols = cols)
# classmap(vcrtrain, "casualty", classCols = cols, identify = TRUE)

# blue points top right: cases "a" and "b" in the paper
x_train[which(y_train == "casualty")[119], ]
# Woman in 1st class, should have survived.
#
x_train[which(y_train == "casualty")[192], ]

# Similar, but child.

# red point most to the right: case "c" in the paper
x_train[which(y_train == "casualty")[268], ]


## -----------------------------------------------------------------------------
classmap(vcrtrain, "survived", classCols = cols)
# classmap(vcrtrain, "survived", classCols = cols, identify = TRUE)

# red point with highest farness among highest PAC: case "d" in the paper
x_train[which(y_train == "survived")[c(14)], ] 

# near-coinciding points with highest farness: cases "e" and "f" in the paper
#
x_train[which(y_train == "survived")[265], ]

x_train[which(y_train == "survived")[287], ] 

# man --> predicted as not survived. also paid the highest 
# fare in the data and has same ticket number as passenger 
# 265. Also embarked at the same place. It turns out that
# Gustave Lesueur was the man servant of the banker Thomas Cardeza:
  
# https://www.encyclopedia-titanica.org/titanic-survivor/thomas-cardeza.html
# https://www.encyclopedia-titanica.org/titanic-survivor/gustave-lesueur.html

# blue point bottom right: case "g" in the paper
x_train[which(y_train == "survived")[90], ]

# # woman  + first class so predicted in survivors.
# # paid highest fare in whole dataset 

## -----------------------------------------------------------------------------
testdata <- data_titanic[which(data_titanic$dataType == "test"), -13]

dim(testdata)
x_test <- testdata[, -12]
y_test <- testdata[, 12]
table(y_test)

## -----------------------------------------------------------------------------
vcrtest <- vcr.rpart.newdata(x_test, y_test, vcrtrain)

confmat.vcr(vcrtest)
cols <- c("firebrick", "blue")

## -----------------------------------------------------------------------------
stackedplot(vcrtest, classCols = cols,
            main = "Stacked plot of Titanic test data")

## -----------------------------------------------------------------------------
silplot(vcrtest, classCols = cols, 
        main = "Silhouettes of rpart on Titanic test data")

## -----------------------------------------------------------------------------
classmap(vcrtest, "casualty", classCols = cols) 

## -----------------------------------------------------------------------------
classmap(vcrtest, "survived", classCols = cols) 

Try the classmap package in your browser

Any scripts or data that you put into this service are public.

classmap documentation built on April 23, 2023, 5:09 p.m.