knitr::opts_chunk$set(echo = FALSE)
library(DiagrammeR)
library(knitr)
library(kableExtra)
library(vegan)
library(ape)
library(FD)

Session 7: Classification

Classification

Goal: group data into classes of similar observations

Data

library(datasets)
data(iris, package = "datasets")

Exercises (30 min)

Hierarchical clustering

Check data

str(iris)

Check data

# Column-wise maxima and minima of the four measurements
# (named col_max/col_min to avoid masking base max() and min())
col_max <- apply(iris[, 1:4], 2, function(x) max(x, na.rm = TRUE))
col_min <- apply(iris[, 1:4], 2, function(x) min(x, na.rm = TRUE))

Check data

plot(col_max, xaxt = "n", ylim = c(0, 8), ylab = "")
axis(1, at = 1:4, labels = colnames(iris[1:4]))
points(col_min, col = "red")

Load all necessary packages

suppressMessages(library(cluster))
suppressMessages(library(class))
suppressMessages(library(vegan))
suppressMessages(library(ggplot2))
suppressMessages(library(factoextra))
suppressMessages(library(stats))

Hierarchical clustering (1)

# Construct a dissimilarity matrix for sepal length
# (Euclidean distance is suitable for quantitative data)
dis.eu <- vegdist(iris$Sepal.Length, method = "euclidean", na.rm = TRUE)

# Perform agglomerative hierarchical clusterings with different linkage methods
cluster_single <- agnes(dis.eu, method = "single")
cluster_complete <- agnes(dis.eu, method = "complete")
cluster_ward <- agnes(dis.eu, method = "ward")
cluster_average <- agnes(dis.eu, method = "average")
cluster_flexible <- agnes(dis.eu, method = "flexible", par.method = 0.625) # alpha = 0.625 gives beta = -0.25


# Convert the agnes objects to hclust objects for plotting and cutree()
cluster_single <- as.hclust(cluster_single)
cluster_complete <- as.hclust(cluster_complete)
cluster_ward <- as.hclust(cluster_ward)
cluster_average <- as.hclust(cluster_average)
cluster_flexible <- as.hclust(cluster_flexible)

Hierarchical clustering (2)

plot(cluster_single)

Hierarchical clustering (3)

plot(cluster_complete)

Hierarchical clustering (4)

plot(cluster_ward)

Hierarchical clustering (5)

plot(cluster_average)

Hierarchical clustering (6)

plot(cluster_flexible)

Hierarchical clustering (7)

It is difficult to judge visually which linkage method represents the data best.

Recommendation: compare the methods quantitatively, for example with the cophenetic correlation coefficient shown on the next slide.
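Another quantitative option is the agglomerative coefficient reported by agnes(). A minimal sketch (refitting, because the objects above were converted with as.hclust() and no longer carry the coefficient):

# Agglomerative coefficient per linkage; values closer to 1 = stronger structure
ac <- sapply(c("single", "complete", "ward", "average"),
             function(m) agnes(dis.eu, method = m)$ac)
round(ac, 3)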

Hierarchical clustering (8)

# Alternative = cophenetic correlation coefficient
# Evaluate how well the dendrogram represents the original dissimilarity matrix
coph_ward <- cophenetic(cluster_ward)
coph_average <- cophenetic(cluster_average)

# Both score about the same
cor(dis.eu, coph_ward)
cor(dis.eu, coph_average)
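The same check can be run over all five linkages at once. A minimal sketch reusing the hclust objects created earlier:

# Cophenetic correlation for every linkage method fitted above
dendros <- list(single = cluster_single, complete = cluster_complete,
                ward = cluster_ward, average = cluster_average,
                flexible = cluster_flexible)
sapply(dendros, function(h) cor(dis.eu, cophenetic(h)))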

Hierarchical clustering (9)

# Find the optimum number of clusters (Total within-cluster Sums of Squares)

set.seed(1)
k.max <- 10
wss <- sapply(1:k.max,
              function(x){kmeans(iris[, 1], x, nstart = 20, iter.max = 20)$tot.withinss})

Hierarchical clustering (10)

# Visual representation
plot(1:k.max, wss, type = "b", xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")
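factoextra can draw the same elbow plot in one call. A minimal sketch (drop = FALSE keeps the single column as a data frame, which fviz_nbclust expects):

# Elbow plot for sepal length, equivalent to the manual loop above
fviz_nbclust(iris[, 1, drop = FALSE], kmeans, method = "wss", k.max = 10)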

Hierarchical clustering (11)

# What changes when you classify the dendrogram with 3 clusters or 4 clusters?
clusters_three <- table(cutree(cluster_ward, 3))
clusters_three
clusters_four <- table(cutree(cluster_ward, 4))
clusters_four

Hierarchical clustering (12)

# Cluster 1 is split into two new clusters (its 80 observations become 46 and 34)
clusters_difference <- table(cutree(cluster_ward, 3), cutree(cluster_ward, 4))
clusters_difference

Hierarchical clustering (13)

# Visualize the different clusters by plotting rectangles on the dendrogram
plot(cluster_ward)
rect.hclust(cluster_ward , k = 3, border = 2:6)
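To see how the sepal-length clusters line up with the actual species, cross-tabulate the two. A minimal sketch:

# How well do the three dendrogram clusters recover the three species?
table(cutree(cluster_ward, 3), iris$Species)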

Exercises (15 min)

K means

K means (1)

# Find the optimum number of clusters (GAP method)
set.seed(1)
gap_stat <- clusGap(iris[, 3:4], FUN = kmeans, nstart = 20, K.max = 10, B = 50)
k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
k

K means (2)

library(factoextra)
fviz_gap_stat(gap_stat)
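As a cross-check, the average silhouette width suggests a K as well. A minimal sketch using the same factoextra helper:

# Silhouette method: the best K maximises the average silhouette width
fviz_nbclust(iris[, 3:4], kmeans, method = "silhouette", k.max = 10, nstart = 20)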

K means (3)

# Perform a K-means clustering on petal length and width
Kclustering <- kmeans(iris[, 3:4], 3, nstart = 20)
# Compare the cluster assignments with the true species labels
table(Kclustering$cluster, iris$Species)
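Because K-means labels are arbitrary, the table above is not an accuracy score by itself. A rough agreement score maps each cluster to its dominant species first (a minimal sketch):

# Cluster "purity": proportion of observations matching their cluster's dominant species
tab <- table(Kclustering$cluster, iris$Species)
sum(apply(tab, 1, max)) / sum(tab)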

K means (4)

# Visualize the clustering using ggplot2
Kclustering$cluster <- as.factor(Kclustering$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Kclustering$cluster)) + geom_point()

Exercises (15 min)

K nearest neighbours

K nearest neighbours (1)

# Randomly split the data into a training set (70%) and a testing set (30%)
set.seed(1)
trainrows <- sample(1:nrow(iris), replace = FALSE, size = 0.7 * nrow(iris))
train_iris <- iris[trainrows, 1:4]
test_iris <- iris[-trainrows, 1:4]
train_response <- iris[trainrows, 5]
test_response <- iris[-trainrows, 5]
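kNN is distance-based, so variables on larger scales dominate the neighbour search. The iris measurements are on comparable scales, but in general you would standardize with training-set statistics. A minimal sketch (not required for the exercise):

# Scale the test set with the *training* means and SDs to avoid information leakage
train_means <- colMeans(train_iris)
train_sds <- apply(train_iris, 2, sd)
train_scaled <- scale(train_iris, center = train_means, scale = train_sds)
test_scaled <- scale(test_iris, center = train_means, scale = train_sds)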

K nearest neighbours (2)

# Find the optimum number of neighbours by scanning K = 1 to 60
library(class)
error <- numeric(60)
for (i in 1:60) {
  knn.fit <- knn(train = train_iris, test = test_iris, cl = train_response, k = i)
  error[i] <- 1 - mean(knn.fit == test_response)  # test misclassification rate
}
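A candidate K can then be read off directly (without a seed the exact winner can shift, because knn() breaks ties at random):

# K with the lowest test misclassification rate
which.min(error)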

K nearest neighbours (3)

# Results can differ between runs because knn() resolves ties at random
ggplot(data.frame(k = 1:60, error = error), aes(x = k, y = error)) +
  geom_line(color = "blue") + xlab("Number of neighbours (K)")

K nearest neighbours (4)

# Calculate the prediction accuracy using the chosen K value
set.seed(1)
iris_pred <- knn(train = train_iris, test = test_iris, cl = train_response, k = 20)
mean(iris_pred == test_response)  # proportion of test samples classified correctly
predicted_classification <- cbind(test_iris, iris_pred)
true_classification <- cbind(test_iris, test_response)

K nearest neighbours (5)

# Visualize the predicted classes using ggplot2
ggplot(predicted_classification, aes(x = Petal.Width, y = Petal.Length, color = iris_pred)) +
  geom_point()

K nearest neighbours (6)

# Visualize the true classes using ggplot2
ggplot(true_classification, aes(x = Petal.Width, y = Petal.Length, color = test_response)) +
  geom_point()
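Instead of eyeballing the two scatterplots, a confusion matrix shows exactly which test samples were misclassified. A minimal sketch:

# Rows = predicted class, columns = true class
table(predicted = iris_pred, true = test_response)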

K nearest neighbours (7)

Need an extra challenge?

Exercises (30 min)

library(vegan)
data(varespec, package = "vegan")

Indicator species analysis

Indicator species analysis (1)

str(varespec)

Indicator species analysis (2)

A <- vector()
B <- vector()

# Perform NMDS for 1, 2, 3 and 4 dimensions; record convergence and stress
for (i in 1:4) {
  NMDS <- metaMDS(varespec, distance = "bray", k = i, autotransform = TRUE, trymax = 100)
  A <- rbind(A, cbind(solution = NMDS$converged, stress = NMDS$stress))
  B <- rbind(B, i)
}

Indicator species analysis (3)

NMDSoutput <- data.frame(B, A)
names(NMDSoutput) <- c("k", "Solution?", "Stress")
NMDSoutput
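Plotting stress against dimensionality gives a quick scree-style view of this table (a common rule of thumb treats stress below ~0.2 as usable and below ~0.1 as good):

# Stress vs. number of NMDS dimensions
plot(NMDSoutput$k, NMDSoutput$Stress, type = "b",
     xlab = "Dimensions (k)", ylab = "Stress")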

Indicator species analysis (4)

# Pick the number of dimensions from the table above; the exercise continues with k = 4
k <- 4
NMDSfinal <- metaMDS(varespec, distance = "bray", k = k, autotransform = TRUE, trymax = 100)

Indicator species analysis (5)

# Find the optimum number of clusters (Total within-cluster Sums of Squares)
set.seed(1)
k.max <- 10
wss <- c()
for (i in 1:k.max) {
  wss[i] <- kmeans(NMDSfinal$points, i, nstart = 20, iter.max = 20)$tot.withinss
}

Indicator species analysis (6)

# Find the optimum number of clusters (Total within-cluster Sums of Squares)
plot(1:k.max, wss, type = "b", xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")

Indicator species analysis (7)

# Find the optimum number of clusters (GAP method)
set.seed(1)
gap_stat <- clusGap(NMDSfinal$points, FUN = kmeans, nstart = 20, K.max = 10, B = 50)
k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
k

Indicator species analysis (8)

# Find the optimum number of clusters (GAP method)
fviz_gap_stat(gap_stat)

Indicator species analysis (9)

# Perform a K-means clustering of the NMDS site scores
Kclustering <- kmeans(NMDSfinal$points, 3, nstart = 20)
Kclustering$cluster
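The groups are easier to judge on the ordination itself. A minimal sketch using vegan's plotting helpers on the first two NMDS axes:

# Overlay the K-means groups on the NMDS ordination
ordiplot(NMDSfinal, display = "sites", type = "n")
points(NMDSfinal$points[, 1:2], col = Kclustering$cluster, pch = 19)
ordihull(NMDSfinal, groups = Kclustering$cluster, draw = "lines")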

Indicator species analysis (10)

library(indicspecies)
# Relate species abundances to the clusters (r.g = group-equalised point-biserial correlation)
inv <- multipatt(varespec, Kclustering$cluster, func = "r.g", control = how(nperm = 9999))
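For reference, the group-equalised indicator value (IndVal.g) is a common alternative to the correlation statistic. A minimal sketch with an otherwise identical call:

# Same analysis with the IndVal.g statistic instead of the correlation
inv_indval <- multipatt(varespec, Kclustering$cluster, func = "IndVal.g",
                        control = how(nperm = 9999))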

Indicator species analysis (11)

summary(inv)


Assignment 5

Access the assignment:

vignette("assignment-5", package = "edcpR")

Submit your R-script as Surname_FirstName_assignment5.R

Upload everything on Toledo before December 8, 12 pm (= at noon)!


