# Document setup: load the rendering/tutorial packages and set the
# default chunk options (show code, no interactive exercises).
library(knitr)
library(learnr)
knitr::opts_chunk$set(echo = TRUE, exercise = FALSE)
We are going to attempt to classify the gender of a group of humans based on their height.
library(MASS) library(class) library(kknn) library(rpart)
# Load the teaching dataset and pull out the response and the features.
mydata <- moxier::cleaneddata
mydata.gender <- mydata[, 1] # first column holds the gender labels (the class)
mydata.biosocial <- mydata[, 3:8] # columns 3-8 hold the biosocial measurements
group <- mydata.gender
# Single predictor for the one-dimensional examples that follow.
feature1 <- mydata.biosocial[, "height"]
We also make a plot.
# Scatter the heights, coloured by gender, to eyeball class separation.
plot(x = feature1, col = group)
We fit the models.
# Fit linear and quadratic discriminant analysis on the single feature.
lda.fit <- lda(group ~ feature1)
qda.fit <- qda(group ~ feature1)
Estimate the posterior group probabilities provided by the two models for a subject whose height is 175 cm.
# Posterior class probabilities for a 175 cm subject under each model.
new.obs <- data.frame(feature1 = 175)
predict(lda.fit, new.obs)
predict(qda.fit, new.obs)
We estimate posterior probabilities on a regular grid provided by the two models
# Evaluate both models on a regular grid spanning the observed heights.
# Fix: `length.out` spelled in full — the original relied on partial
# matching of `length`, which lintr flags as fragile.
x <- data.frame(feature1 = seq(min(feature1), max(feature1), length.out = 100))
lda.posterior <- predict(lda.fit, x)$posterior
qda.posterior <- predict(qda.fit, x)$posterior
We plot the probabilities we obtain using LDA.
# Estimated posterior curves from LDA.
# Fix: the original passed xlab/ylab to points(), which are not valid
# graphical parameters there and only raise warnings; axis labels belong
# to the initial plot() call alone. lines() replaces points(type = "l").
plot(x[, 1], lda.posterior[, 1],
  type = "l", col = "blue",
  xlab = "x", ylab = "estimated posterior"
) # LDA group 1
lines(x[, 1], lda.posterior[, 2], col = "red") # LDA group 2
and overplot those obtained with QDA.
# Estimated posterior curves from LDA (solid) with QDA overplotted (dashed).
# Fix: the original passed xlab/ylab to points(), which are not valid
# graphical parameters there and only raise warnings; axis labels belong
# to the initial plot() call alone. lines() replaces points(type = "l").
plot(x[, 1], lda.posterior[, 1],
  type = "l", col = "blue",
  xlab = "x", ylab = "estimated posterior"
) # LDA group 1
lines(x[, 1], lda.posterior[, 2], col = "red") # LDA group 2
lines(x[, 1], qda.posterior[, 1], col = "blue", lty = 2) # QDA group 1
lines(x[, 1], qda.posterior[, 2], col = "red", lty = 2) # QDA group 2
We fit the model on a regular grid.
# k-nearest-neighbours classification on the grid (k = 30).
# Fixes: `length.out` spelled out (no partial matching) and
# `prob = TRUE` instead of the reassignable shorthand `T`.
x <- data.frame(feature1 = seq(min(feature1), max(feature1), length.out = 100))
(knn.fitted <- knn(train = feature1, test = x, cl = group, k = 30, prob = TRUE))
We need to choose $k$.
# Choose k by leave-one-out cross-validation; the rectangular kernel
# makes kknn behave as plain (unweighted) k-NN.
# Fix: `scale = FALSE` instead of the reassignable shorthand `F`.
train.cv <- train.kknn(group ~ feature1,
  data = data.frame(feature1, group),
  kmax = 40, scale = FALSE, kernel = "rectangular"
)
plot(train.cv)
We fit the model
# Fit a classification tree predicting gender from height alone.
# (rpart infers its method from the response type; presumably `group`
# is a factor, giving method = "class" — confirm upstream.)
rpart.fit <- rpart(group ~ feature1)
and estimate the posterior group probabilities for a subject whose height is 175 cm.
# Posterior class probabilities from the tree for a 175 cm subject.
new.obs <- data.frame(feature1 = 175)
predict(rpart.fit, new.obs)
We compute the values on a grid.
# Tabulate the tree's class-probability estimates on the height grid.
# Fixes: `length.out` spelled out (no partial matching), and predict()
# is called once instead of twice on the same grid.
# NOTE(review): assumes column 1 of the probability matrix is the
# "female" class — this follows the factor levels of `group`; confirm.
x <- data.frame(feature1 = seq(min(feature1), max(feature1), length.out = 100))
tree.prob <- predict(rpart.fit, x)
data.frame(x = x, female = tree.prob[, 1], male = 1 - tree.prob[, 1])
We now consider two classes: Female
and Male
and six features.
# Move to the full six-feature problem and look at pairwise scatterplots,
# coloured by gender.
group <- mydata.gender
features <- mydata.biosocial
pairs(features, col = group)
We fit the QDA.
# Quadratic discriminant analysis on all six biosocial features.
qda.fit <- qda(group ~ ., data = features)
Estimate the posterior group probabilities for the following subject:
# The hypothetical subject whose gender we want to predict.
new.subject <- data.frame(
  age = 23, height = 175, weight = 70,
  phone = 100, facebook = 1000, instagram = 500
)
kable(new.subject)
# Posterior class probabilities for that subject under the QDA model.
predict(qda.fit, data.frame(
  age = 23, height = 175, weight = 70,
  phone = 100, facebook = 1000, instagram = 500
))
We also estimate the posterior probabilities for the subjects in the original dataset.
# Posterior probabilities (and predicted class) for every training subject.
qda.pred <- predict(qda.fit)
data.frame(qda.pred)
We compute the confusion matrix.
# Confusion matrix: true gender versus QDA-predicted gender.
confusion <- table(True.class = group, Predicted.class = predict(qda.fit)$class)
kable(confusion)
and the apparent Accuracy and Error Rate.
# Apparent (training-set) accuracy and error rate from the confusion matrix.
Conf <- table(True.class = group, Predicted.class = predict(qda.fit)$class)
Accuracy <- sum(diag(Conf)) / sum(Conf) # proportion on the diagonal
ErrorRate <- 1 - Accuracy
We compute the confusion matrix using leave-one-out cross-validation
# Leave-one-out cross-validated confusion matrix.
# BUG FIX: predict.qda() has no method = "looCV" — its methods are
# "plug-in", "predictive" and "debiased", so match.arg() errors on
# "looCV". In MASS, LOO-CV is requested at fitting time with
# qda(..., CV = TRUE), whose $class holds the held-out predictions.
qda.cv <- qda(group ~ ., data = features, CV = TRUE)
kable(table(True.class = group, Predicted.class = qda.cv$class))
and the cross-validated Accuracy and Error Rate.
# Cross-validated accuracy and error rate.
# BUG FIX: predict(qda.fit, method = "looCV") is not a valid
# predict.qda() call (match.arg() rejects "looCV"); leave-one-out CV is
# obtained by refitting with qda(..., CV = TRUE) and reading $class.
qda.cv <- qda(group ~ ., data = features, CV = TRUE)
ConfCV <- table(True.class = group, Predicted.class = qda.cv$class)
kable(ConfCV)
(AccuracyCV <- sum(diag(ConfCV)) / sum(ConfCV))
(ErrorRateCV <- 1 - AccuracyCV)
We fit the model on the original dataset, after rescaling.
# Rescale a numeric vector linearly onto [0, 1].
#
# @param x A numeric vector.
# @param na.rm Ignore `NA`s when computing the range? Defaults to FALSE,
#   preserving the original behavior (any NA makes the whole result NA).
# @return A numeric vector the same length as `x`; NAs in `x` stay NA.
#   If `x` is constant, the division is 0/0 and the result is NaN.
rescale <- function(x, na.rm = FALSE) {
  stopifnot(is.numeric(x)) # fail fast with a clear message on bad input
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Rescale every feature column onto [0, 1] before k-NN — nearest-neighbour
# distances are sensitive to the measurement scale.
mydata.biosocial.rescaled <- as.data.frame(lapply(mydata.biosocial, rescale))
group <- mydata.gender
features <- mydata.biosocial.rescaled
We fit the algorithm.
# k-NN (k = 11) on the rescaled features; the training set doubles as
# the test set, giving in-sample predictions.
# Fix: `prob = TRUE` instead of the reassignable shorthand `T`.
knn.fitted <- knn(train = features, test = features, cl = group, k = 11, prob = TRUE)
Estimate the posterior group probabilities for the following subject:
# The hypothetical subject whose gender we want to predict.
new.subject <- data.frame(
  age = 23, height = 175, weight = 70,
  phone = 100, facebook = 1000, instagram = 500
)
kable(new.subject)
# Classify the new subject with k-NN.
# BUG FIX: the classifier is trained on rescaled features, but the
# original passed the subject's raw measurements (e.g. height = 175
# against training values in [0, 1]), so every distance was dominated by
# the raw magnitudes. Rescale the subject with the *training* minima and
# maxima first.
# NOTE(review): assumes the columns of `mydata.biosocial` are
# age/height/weight/phone/facebook/instagram in this order — confirm.
new.subject <- data.frame(
  age = 23, height = 175, weight = 70,
  phone = 100, facebook = 1000, instagram = 500
)
new.subject.rescaled <- as.data.frame(Map(
  function(value, train.col) {
    (value - min(train.col)) / (max(train.col) - min(train.col))
  },
  new.subject, mydata.biosocial
))
(knn.fitted <- knn(
  train = features, test = new.subject.rescaled,
  cl = group, k = 11, prob = TRUE
))
We also select the value of $k$ using leave-one-out cross-validation.
# Choose k by leave-one-out cross-validation on all six features; the
# features were already rescaled by hand, hence scale is off.
# Fix: `scale = FALSE` instead of the reassignable shorthand `F`.
train.cv <- train.kknn(group ~ .,
  data = data.frame(features, group),
  kmax = 50, scale = FALSE, kernel = "rectangular"
)
plot(train.cv)
We display the confusion matrix.
# Refit k-NN with the cross-validated best k and show the confusion matrix.
# Fix: `prob = TRUE` instead of the reassignable shorthand `T`.
knn.fitted <- knn(
  train = features, test = features, cl = group,
  k = train.cv$best.parameters$k, prob = TRUE
)
kable(table(True.class = group, Predicted.class = knn.fitted))
We also compute the apparent Accuracy and Error Rate.
# Apparent (training-set) accuracy and error rate for the tuned k-NN.
Conf <- table(True.class = group, Predicted.class = knn.fitted)
kable(Conf)
(Accuracy <- sum(diag(Conf)) / sum(Conf)) # proportion on the diagonal
(ErrorRate <- 1 - Accuracy)
We fit the model and plot the results.
# Classification tree on the raw six features (trees split on ordering,
# so no rescaling is needed here).
group <- mydata.gender
features <- mydata.biosocial
rpart.fit <- rpart(group ~ ., data = features)
# Draw the fitted tree skeleton, then label its splits and leaves.
plot(rpart.fit)
text(rpart.fit)
Estimate the posterior group probabilities for the following subject:
# The hypothetical subject whose gender we want to predict.
new.subject <- data.frame(
  age = 23, height = 175, weight = 70,
  phone = 100, facebook = 1000, instagram = 500
)
kable(new.subject)
# Posterior class probabilities for that subject under the tree.
predict(rpart.fit, data.frame(
  age = 23, height = 175, weight = 70,
  phone = 100, facebook = 1000, instagram = 500
))
We estimate the probabilities on the original dataset.
# Class-probability estimates for every subject in the training data.
tree.pred <- predict(rpart.fit)
data.frame(tree.pred)
We also compute the confusion matrix, the apparent Accuracy and the error rate.
# Confusion matrix plus apparent accuracy / error rate for the tree.
# Fixes: the original called predict() twice and rendered the identical
# table twice; compute the predictions and the table once.
tree.class <- predict(rpart.fit, type = "class")
Conf <- table(True.class = group, Predicted.class = tree.class)
kable(Conf)
(Accuracy <- sum(diag(Conf)) / sum(Conf))
(ErrorRate <- 1 - Accuracy)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.