In mascaretti/moxier: An Introduction To Statistics With R

library(knitr)
library(learnr)
knitr::opts_chunk$set(echo = TRUE, exercise = FALSE)

Classification

Preliminary Analysis

We are going to attempt to classify the gender of a group of humans, based on their heights.

library(MASS)
library(class)
library(kknn)
library(rpart)

mydata <- moxier::cleaneddata
mydata.gender <- mydata[, 1]
mydata.biosocial <- mydata[, 3:8]

group <- mydata.gender
feature1 <- mydata.biosocial[, "height"]

We also make a plot.

plot(feature1, col = group)

LDA and QDA

We fit the models.

lda.fit <- lda(group ~ feature1)
qda.fit <- qda(group ~ feature1)

Estimate the posterior group probabilities provided by the two models for a subject whose height is 175 cm.

predict(lda.fit, data.frame(feature1 = 175))
predict(qda.fit, data.frame(feature1 = 175))

Grid

We estimate posterior probabilities on a regular grid provided by the two models

x <- data.frame(feature1 = seq(min(feature1), max(feature1), length = 100))
lda.posterior <- predict(lda.fit, x)$posterior
qda.posterior <- predict(qda.fit, x)$posterior

Plot posterior

We plot the probabilities we obtain using LDA.

# LDA Group 1
plot(x[, 1], lda.posterior[, 1], type = "l", col = "blue", xlab = "x", ylab = "estimated posterior")
# LDA Group 2
points(x[, 1], lda.posterior[, 2], type = "l", col = "red")

and overplot those obtained with QDA.

# LDA Group 1
plot(x[, 1], lda.posterior[, 1], type = "l", col = "blue", xlab = "x", ylab = "estimated posterior")
# LDA Group 2
points(x[, 1], lda.posterior[, 2], type = "l", col = "red")
# QDA Group 1
points(x[, 1], qda.posterior[, 1], type = "l", col = "blue", lty = 2, xlab = "x", ylab = "estimated posterior")
# QDA Group 2
points(x[, 1], qda.posterior[, 2], type = "l", col = "red", lty = 2)

k-Nearest Neighbour Classifier

We fit the model on a regular grid.

x <- data.frame(feature1 = seq(min(feature1), max(feature1), length = 100))
(knn.fitted <- knn(train = feature1, test = x, cl = group, k = 30, prob = T))

We need to chose $k$.

train.cv <- train.kknn(group ~ feature1,
  data = data.frame(feature1, group),
  kmax = 40, scale = F, kernel = "rectangular"
)
plot(train.cv)

CART

We fit the model

rpart.fit <- rpart(group ~ feature1)

and stimate the posterior group probabilities for a subject whose height is 175cm.

predict(rpart.fit, data.frame(feature1 = 175))

We compute the values on a grid.

x <- data.frame(feature1 = seq(min(feature1), max(feature1), length = 100))
data.frame(x = x, female = predict(rpart.fit, x)[, 1], male = 1 - predict(rpart.fit, x)[, 1])

Multivariate Analysis

We now consider two classes: Female and Male and six features.

group <- mydata.gender
features <- mydata.biosocial
pairs(features, col = group)

We fit the QDA.

qda.fit <- qda(group ~ ., features)

Estimate the posterior group probabilities for the following subject:

kable(data.frame(age = 23, height = 175, weight = 70, phone = 100, facebook = 1000, instagram = 500))

predict(qda.fit, data.frame(age = 23, height = 175, weight = 70, phone = 100, facebook = 1000, instagram = 500))

We also estimate the posterior probabilities for the subjects in the original dataset.

data.frame(predict(qda.fit))

We compute the confusion matrix.

kable(table(True.class = group, Predicted.class = predict(qda.fit)$class))

and the apparent Accuracy and Error Rate.

Conf <- table(True.class = group, Predicted.class = predict(qda.fit)$class)
Accuracy <- sum(diag(Conf)) / sum(Conf)
ErrorRate <- 1 - Accuracy

LOO CV

We compute the confusion matrix

kable(table(True.class = group, Predicted.class = predict(qda.fit, method = "looCV")$class))

and the apparent Accuracy and Error Rate.

ConfCV <- table(True.class = group, Predicted.class = predict(qda.fit, method = "looCV")$class)
kable(ConfCV)
(AccuracyCV <- sum(diag(ConfCV)) / sum(ConfCV))
(ErrorRateCV <- 1 - AccuracyCV)

kNN

We fit the model on the original dataset, after rescaling.

rescale <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

mydata.biosocial.rescaled <- as.data.frame(lapply(mydata.biosocial, rescale))

group <- mydata.gender
features <- mydata.biosocial.rescaled

We fit the algorithm.

knn.fitted <- knn(train = features, test = features, cl = group, k = 11, prob = T)

Estimate the posterior group probabilities for the following subject:

kable(data.frame(age = 23, height = 175, weight = 70, phone = 100, facebook = 1000, instagram = 500))

(knn.fitted <- knn(train = features, test = data.frame(age = 23, height = 175, weight = 70, phone = 100, facebook = 1000, instagram = 500), cl = group, k = 11, prob = T))

We also select the value of $k$ using leave-one-out cross-validation.

train.cv <- train.kknn(group ~ .,
  data = data.frame(features, group),
  kmax = 50, scale = F, kernel = "rectangular"
)
plot(train.cv)

We plot the confusion matrix

knn.fitted <- knn(train = features, test = features, cl = group, k = train.cv$best.parameters$k, prob = T)
kable(table(True.class = group, Predicted.class = knn.fitted))

We also compute the apparent Accuracy and Error Rate.

Conf <- table(True.class = group, Predicted.class = knn.fitted)
kable(Conf)
(Accuracy <- sum(diag(Conf)) / sum(Conf))
(ErrorRate <- 1 - Accuracy)

CART

We fit the model and plot the results.

group <- mydata.gender
features <- mydata.biosocial
rpart.fit <- rpart(group ~ ., data = features)

plot(rpart.fit)
text(rpart.fit)

Estimate the posterior group probabilities for the following subject:

kable(data.frame(age = 23, height = 175, weight = 70, phone = 100, facebook = 1000, instagram = 500))

predict(rpart.fit, data.frame(age = 23, height = 175, weight = 70, phone = 100, facebook = 1000, instagram = 500))

We estimate the probabilities on the original dataset.

data.frame(predict(rpart.fit))

We also compute the confusion matrix, the apparent Accuracy and the error rate.

kable(table(True.class = group, Predicted.class = predict(rpart.fit, type = "class")))

# Compute (apparent) Accuracy and Error Rate
Conf <- table(True.class = group, Predicted.class = predict(rpart.fit, type = "class"))
kable(Conf)
(Accuracy <- sum(diag(Conf)) / sum(Conf))
(ErrorRate <- 1 - Accuracy)