knitr::opts_chunk$set(echo = FALSE)
library(DiagrammeR)
library(knitr)
library(kableExtra)
library(vegan)
library(ape)
library(FD)

Session 7: Classification

Classification

Goal: group data into classes of similar observations

Data

library(datasets)
data(iris, package = "datasets")

Exercises (30 min)

Hierarchical clustering

Check data

str(iris)

Check data

# Column-wise maxima and minima of the four measurements
# (named col_max/col_min to avoid masking base max() and min())
col_max <- apply(iris[, 1:4], 2, function(x) max(x, na.rm = TRUE))
col_min <- apply(iris[, 1:4], 2, function(x) min(x, na.rm = TRUE))

Check data

plot(col_max, xaxt = "n", ylim = c(0, 8), ylab = "")
axis(1, at = 1:4, labels = colnames(iris[1:4]))
points(col_min, col = "red")

Load all necessary packages

suppressMessages(library(cluster))
suppressMessages(library(class))
suppressMessages(library(vegan))
suppressMessages(library(ggplot2))
suppressMessages(library(factoextra))
suppressMessages(library(stats))

Hierarchical clustering (1)

# Construct a dissimilarity matrix for sepal length
# (Euclidean distance is suitable for quantitative data)
dis.eu <- vegdist(iris$Sepal.Length, method = "euclidean", na.rm = TRUE)

# Perform agglomerative hierarchical clusterings with different linkage methods
cluster_single <- agnes(dis.eu, method = "single")
cluster_complete <- agnes(dis.eu, method = "complete")
cluster_ward <- agnes(dis.eu, method = "ward")
cluster_average <- agnes(dis.eu, method = "average")
cluster_flexible <- agnes(dis.eu, method = "flexible", par.method = 0.625) # alpha = 0.625 gives beta = -0.25


# Convert the agnes objects to hclust objects for plotting and cutree()
cluster_single <- as.hclust(cluster_single)
cluster_complete <- as.hclust(cluster_complete)
cluster_ward <- as.hclust(cluster_ward)
cluster_average <- as.hclust(cluster_average)
cluster_flexible <- as.hclust(cluster_flexible)

Hierarchical clustering (2)

plot(cluster_single)

Hierarchical clustering (3)

plot(cluster_complete)

Hierarchical clustering (4)

plot(cluster_ward)

Hierarchical clustering (5)

plot(cluster_average)

Hierarchical clustering (6)

plot(cluster_flexible)

Hierarchical clustering (7)

It is difficult to judge visually which linkage method represents the data best.

Recommendation: compare the methods quantitatively, for example with the cophenetic correlation coefficient shown on the next slide.
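Another quantitative option is the agglomerative coefficient reported by agnes(). A minimal sketch (refitting, because the objects above were converted with as.hclust() and no longer carry the coefficient):

# Agglomerative coefficient per linkage; values closer to 1 = stronger structure
ac <- sapply(c("single", "complete", "ward", "average"),
             function(m) agnes(dis.eu, method = m)$ac)
round(ac, 3)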

Hierarchical clustering (8)

# Alternative = cophenetic correlation coefficient
# Evaluate how well the dendrogram represents the original dissimilarity matrix
coph_ward <- cophenetic(cluster_ward)
coph_average <- cophenetic(cluster_average)

# Both score about the same
cor(dis.eu, coph_ward)
cor(dis.eu, coph_average)
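The same check can be run over all five linkages at once. A minimal sketch reusing the hclust objects created earlier:

# Cophenetic correlation for every linkage method fitted above
dendros <- list(single = cluster_single, complete = cluster_complete,
                ward = cluster_ward, average = cluster_average,
                flexible = cluster_flexible)
sapply(dendros, function(h) cor(dis.eu, cophenetic(h)))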

Hierarchical clustering (9)

# Find the optimum number of clusters (Total within-cluster Sums of Squares)

set.seed(1)
k.max <- 10
wss <- sapply(1:k.max,
              function(x){kmeans(iris[, 1], x, nstart = 20, iter.max = 20)$tot.withinss})

Hierarchical clustering (10)

# Visual representation
plot(1:k.max, wss, type = "b", xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")
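factoextra can draw the same elbow plot in one call. A minimal sketch (drop = FALSE keeps the single column as a data frame, which fviz_nbclust expects):

# Elbow plot for sepal length, equivalent to the manual loop above
fviz_nbclust(iris[, 1, drop = FALSE], kmeans, method = "wss", k.max = 10)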

Hierarchical clustering (11)

# What changes when you classify the dendrogram with 3 clusters or 4 clusters?
clusters_three <- table(cutree(cluster_ward, 3))
clusters_three
clusters_four <- table(cutree(cluster_ward, 4))
clusters_four

Hierarchical clustering (12)

# Cluster 1 is split into two new clusters (its 80 observations become 46 and 34)
clusters_difference <- table(cutree(cluster_ward, 3), cutree(cluster_ward, 4))
clusters_difference

Hierarchical clustering (13)

# Visualize the different clusters by plotting rectangles on the dendrogram
plot(cluster_ward)
rect.hclust(cluster_ward , k = 3, border = 2:6)
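To see how the sepal-length clusters line up with the actual species, cross-tabulate the two. A minimal sketch:

# How well do the three dendrogram clusters recover the three species?
table(cutree(cluster_ward, 3), iris$Species)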

Exercises (15 min)

K means

K means (1)

# Find the optimum number of clusters (GAP method)
set.seed(1)
gap_stat <- clusGap(iris[, 3:4], FUN = kmeans, nstart = 20, K.max = 10, B = 50)
k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
k

K means (2)

library(factoextra)
fviz_gap_stat(gap_stat)
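As a cross-check, the average silhouette width suggests a K as well. A minimal sketch using the same factoextra helper:

# Silhouette method: the best K maximises the average silhouette width
fviz_nbclust(iris[, 3:4], kmeans, method = "silhouette", k.max = 10, nstart = 20)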

K means (3)

# Perform a K-means clustering on petal length and width
Kclustering <- kmeans(iris[, 3:4], 3, nstart = 20)
# Compare the cluster assignments with the true species labels
table(Kclustering$cluster, iris$Species)
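Because K-means labels are arbitrary, the table above is not an accuracy score by itself. A rough agreement score maps each cluster to its dominant species first (a minimal sketch):

# Cluster "purity": proportion of observations matching their cluster's dominant species
tab <- table(Kclustering$cluster, iris$Species)
sum(apply(tab, 1, max)) / sum(tab)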

K means (4)

# Visualize the clustering using ggplot2
Kclustering$cluster <- as.factor(Kclustering$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Kclustering$cluster)) + geom_point()

Exercises (15 min)

K nearest neighbours

K nearest neighbours (1)

# Randomly split the data into a training set (70%) and a testing set (30%)
set.seed(1)
trainrows <- sample(1:nrow(iris), replace = FALSE, size = 0.7 * nrow(iris))
train_iris <- iris[trainrows, 1:4]
test_iris <- iris[-trainrows, 1:4]
train_response <- iris[trainrows, 5]
test_response <- iris[-trainrows, 5]
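kNN is distance-based, so variables on larger scales dominate the neighbour search. The iris measurements are on comparable scales, but in general you would standardize with training-set statistics. A minimal sketch (not required for the exercise):

# Scale the test set with the *training* means and SDs to avoid information leakage
train_means <- colMeans(train_iris)
train_sds <- apply(train_iris, 2, sd)
train_scaled <- scale(train_iris, center = train_means, scale = train_sds)
test_scaled <- scale(test_iris, center = train_means, scale = train_sds)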

K nearest neighbours (2)

# Find the optimum number of neighbours by scanning K = 1 to 60
library(class)
error <- numeric(60)
for (i in 1:60) {
  knn.fit <- knn(train = train_iris, test = test_iris, cl = train_response, k = i)
  error[i] <- 1 - mean(knn.fit == test_response)  # test misclassification rate
}
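A candidate K can then be read off directly (without a seed the exact winner can shift, because knn() breaks ties at random):

# K with the lowest test misclassification rate
which.min(error)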

K nearest neighbours (3)

# Results can differ between runs because knn() resolves ties at random
ggplot(data.frame(k = 1:60, error = error), aes(x = k, y = error)) +
  geom_line(color = "blue") + xlab("Number of neighbours (K)")

K nearest neighbours (4)

# Calculate the prediction accuracy using the chosen K value
set.seed(1)
iris_pred <- knn(train = train_iris, test = test_iris, cl = train_response, k = 20)
mean(iris_pred == test_response)  # proportion of test samples classified correctly
predicted_classification <- cbind(test_iris, iris_pred)
true_classification <- cbind(test_iris, test_response)

K nearest neighbours (5)

# Visualize the predicted classes using ggplot2
ggplot(predicted_classification, aes(x = Petal.Width, y = Petal.Length, color = iris_pred)) +
  geom_point()

K nearest neighbours (6)

# Visualize the true classes using ggplot2
ggplot(true_classification, aes(x = Petal.Width, y = Petal.Length, color = test_response)) +
  geom_point()
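Instead of eyeballing the two scatterplots, a confusion matrix shows exactly which test samples were misclassified. A minimal sketch:

# Rows = predicted class, columns = true class
table(predicted = iris_pred, true = test_response)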

K nearest neighbours (7)

Need an extra challenge?

Exercises (30 min)

library(vegan)
data(varespec, package = "vegan")

Indicator species analysis

Indicator species analysis (1)

str(varespec)

Indicator species analysis (2)

A <- vector()
B <- vector()

# Perform NMDS for 1, 2, 3 and 4 dimensions; record convergence and stress
for (i in 1:4) {
  NMDS <- metaMDS(varespec, distance = "bray", k = i, autotransform = TRUE, trymax = 100)
  A <- rbind(A, cbind(solution = NMDS$converged, stress = NMDS$stress))
  B <- rbind(B, i)
}

Indicator species analysis (3)

NMDSoutput <- data.frame(B, A)
names(NMDSoutput) <- c("k", "Solution?", "Stress")
NMDSoutput
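Plotting stress against dimensionality gives a quick scree-style view of this table (a common rule of thumb treats stress below ~0.2 as usable and below ~0.1 as good):

# Stress vs. number of NMDS dimensions
plot(NMDSoutput$k, NMDSoutput$Stress, type = "b",
     xlab = "Dimensions (k)", ylab = "Stress")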

Indicator species analysis (4)

# Pick the number of dimensions from the table above; the exercise continues with k = 4
k <- 4
NMDSfinal <- metaMDS(varespec, distance = "bray", k = k, autotransform = TRUE, trymax = 100)

Indicator species analysis (5)

# Find the optimum number of clusters (Total within-cluster Sums of Squares)
set.seed(1)
k.max <- 10
wss <- c()
for (i in 1:k.max) {
  wss[i] <- kmeans(NMDSfinal$points, i, nstart = 20, iter.max = 20)$tot.withinss
}

Indicator species analysis (6)

# Find the optimum number of clusters (Total within-cluster Sums of Squares)
plot(1:k.max, wss, type = "b", xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")

Indicator species analysis (7)

# Find the optimum number of clusters (GAP method)
set.seed(1)
gap_stat <- clusGap(NMDSfinal$points, FUN = kmeans, nstart = 20, K.max = 10, B = 50)
k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
k

Indicator species analysis (8)

# Find the optimum number of clusters (GAP method)
fviz_gap_stat(gap_stat)

Indicator species analysis (9)

# Perform a K-means clustering of the NMDS site scores
Kclustering <- kmeans(NMDSfinal$points, 3, nstart = 20)
Kclustering$cluster
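The groups are easier to judge on the ordination itself. A minimal sketch using vegan's plotting helpers on the first two NMDS axes:

# Overlay the K-means groups on the NMDS ordination
ordiplot(NMDSfinal, display = "sites", type = "n")
points(NMDSfinal$points[, 1:2], col = Kclustering$cluster, pch = 19)
ordihull(NMDSfinal, groups = Kclustering$cluster, draw = "lines")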

Indicator species analysis (10)

library(indicspecies)
# Relate species abundances to the clusters (r.g = group-equalised point-biserial correlation)
inv <- multipatt(varespec, Kclustering$cluster, func = "r.g", control = how(nperm = 9999))
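For reference, the group-equalised indicator value (IndVal.g) is a common alternative to the correlation statistic. A minimal sketch with an otherwise identical call:

# Same analysis with the IndVal.g statistic instead of the correlation
inv_indval <- multipatt(varespec, Kclustering$cluster, func = "IndVal.g",
                        control = how(nperm = 9999))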

Indicator species analysis (11)

summary(inv)


Assignment 5

Access the assignment:

vignette("assignment-5", package = "edcpR")

Submit your R-script as Surname_FirstName_assignment5.R

Upload everything on Toledo before December 8, 12 pm (= at noon)!


