knitr::opts_chunk$set(echo = FALSE)
library(DiagrammeR)
library(knitr)
library(kableExtra)
library(vegan)
library(ape)
library(FD)
Goal: group data into similar classes
library(datasets)
data(iris, package = "datasets")
Hierarchical clustering
str(iris)
# Range of each variable (avoid overwriting base max()/min())
vmax <- apply(iris[, 1:4], 2, function(x) max(x, na.rm = TRUE))
vmin <- apply(iris[, 1:4], 2, function(x) min(x, na.rm = TRUE))
plot(vmax, xaxt = "n", ylim = c(0, 8), ylab = "")
axis(1, at = 1:4, labels = colnames(iris[1:4]))
points(vmin, col = "red")
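The plot shows that the four variables span quite different ranges. As a minimal sketch (not part of the original exercise, which clusters on raw sepal length only), you could standardize the traits with base R's scale() before clustering on more than one of them:

# Sketch: standardize all four variables to mean 0 and standard deviation 1
iris_std <- scale(iris[, 1:4])
apply(iris_std, 2, range)  # the variables now span comparable ranges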
suppressMessages(library(cluster))
suppressMessages(library(class))
suppressMessages(library(vegan))
suppressMessages(library(ggplot2))
suppressMessages(library(factoextra))
suppressMessages(library(stats))
# Construct a dissimilarity matrix for the sepal length
# Euclidean distance for quantitative data
dis.eu <- vegdist(iris$Sepal.Length, method = "euclidean", na.rm = TRUE)

# Perform an agglomerative hierarchical clustering of the dataset
cluster_single   <- agnes(dis.eu, method = "single")
cluster_complete <- agnes(dis.eu, method = "complete")
cluster_ward     <- agnes(dis.eu, method = "ward")
cluster_average  <- agnes(dis.eu, method = "weighted")  # WPGMA (weighted average linkage)
cluster_flexible <- agnes(dis.eu, method = "flexible",
                          par.method = 0.625)  # alpha = 0.625 gives beta = -0.25

# Convert to hclust objects for plotting and cutree()
cluster_single   <- as.hclust(cluster_single)
cluster_complete <- as.hclust(cluster_complete)
cluster_ward     <- as.hclust(cluster_ward)
cluster_average  <- as.hclust(cluster_average)
cluster_flexible <- as.hclust(cluster_flexible)
plot(cluster_single, main = "Single linkage")
plot(cluster_complete, main = "Complete linkage")
plot(cluster_ward, main = "Ward")
plot(cluster_average, main = "Weighted average (WPGMA)")
plot(cluster_flexible, main = "Flexible beta")
It is difficult to judge visually which linkage method performs best, but:
Recommendation:
# Alternative = cophenetic correlation coefficient
# Evaluate how well the dendrogram represents the original dissimilarity matrix
coph_ward <- cophenetic(cluster_ward)
coph_average <- cophenetic(cluster_average)

# Both score about the same
cor(dis.eu, coph_ward)
cor(dis.eu, coph_average)
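As a further cross-check (an assumption, not part of the original exercise), agnes() reports an agglomerative coefficient in its ac component; values closer to 1 indicate a stronger clustering structure, so the linkage methods can also be compared on that basis:

# Sketch: compare agglomerative coefficients across linkage methods
linkages <- c("single", "complete", "ward", "weighted")
sapply(linkages, function(m) agnes(dis.eu, method = m)$ac)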
# Find the optimum number of clusters (total within-cluster sum of squares)
set.seed(1)
k.max <- 10
wss <- sapply(1:k.max, function(x) {
  kmeans(iris[, 1], x, nstart = 20, iter.max = 20)$tot.withinss
})
# Visual representation (elbow plot)
plot(1:k.max, wss, type = "b",
     xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")
# What changes when you cut the dendrogram into 3 clusters versus 4 clusters?
clusters_three <- table(cutree(cluster_ward, 3))
clusters_three
clusters_four <- table(cutree(cluster_ward, 4))
clusters_four
# We see that cluster 1 is split into two new clusters (from 80 to 46 and 34)
clusters_difference <- table(cutree(cluster_ward, 3), cutree(cluster_ward, 4))
clusters_difference
# Visualize the different clusters by plotting rectangles on the dendrogram
plot(cluster_ward)
rect.hclust(cluster_ward, k = 3, border = 2:6)
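Since the species labels are known for iris, a quick sketch (not in the original exercise) is to cross-tabulate the three-cluster solution against the species, showing how well sepal length alone separates them:

# Sketch: compare the 3-cluster solution with the known species
table(cutree(cluster_ward, 3), iris$Species)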
K means
# Find the optimum number of clusters (GAP method)
set.seed(1)
gap_stat <- clusGap(iris[, 3:4], FUN = kmeans, nstart = 20, K.max = 10, B = 50)
k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
k
library(factoextra)
fviz_gap_stat(gap_stat)
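As an alternative cross-check (an assumption, not part of the original exercise), factoextra's fviz_nbclust() can apply the average-silhouette criterion to the same data:

# Sketch: average silhouette width as a second criterion for k
fviz_nbclust(iris[, 3:4], kmeans, method = "silhouette", k.max = 10)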
# Perform a K-means clustering of the iris dataset
Kclustering <- kmeans(iris[, 3:4], 3, nstart = 20)
table(Kclustering$cluster, iris$Species)
# Visualize the clustering using ggplot2
Kclustering$cluster <- as.factor(Kclustering$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Kclustering$cluster)) +
  geom_point()
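A small optional extension (not in the original): kmeans() stores the fitted cluster centres in its $centers component, with the original column names, so they can be overlaid on the same plot:

# Sketch: overlay the fitted cluster centres
centers <- as.data.frame(Kclustering$centers)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Kclustering$cluster)) +
  geom_point() +
  geom_point(data = centers, aes(x = Petal.Length, y = Petal.Width),
             inherit.aes = FALSE, color = "black", shape = 8, size = 4)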
K nearest neighbours
# Randomly sample a training dataset and a testing dataset
set.seed(1)
trainrows <- sample(1:nrow(iris), replace = FALSE, size = 0.7 * nrow(iris))
train_iris <- iris[trainrows, 1:4]
test_iris <- iris[-trainrows, 1:4]
train_response <- iris[trainrows, 5]
test_response <- iris[-trainrows, 5]
# Find the optimum number of neighbours
library(class)
error <- c()
for (i in 1:60) {
  knn.fit <- knn(train = train_iris, test = test_iris, cl = train_response, k = i)
  error[i] <- 1 - mean(knn.fit == test_response)
}
# This result can differ between runs because knn() resolves ties at random
ggplot(data.frame(k = 1:60, error = error), aes(x = k, y = error)) +
  geom_line(color = "blue") +
  xlab("Number of neighbours")
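Instead of reading the best k off the plot, a one-line sketch (not in the original) picks it programmatically; which.min() returns the first minimum, i.e. the smallest such k:

# Sketch: smallest k with the lowest test error
best_k <- which.min(error)
best_k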
# Calculate the prediction accuracy using the chosen k value
set.seed(1)
iris_pred <- knn(train = train_iris, test = test_iris, cl = train_response, k = 20)
predicted_classification <- cbind(test_iris, iris_pred)
true_classification <- cbind(test_iris, test_response)
mean(iris_pred == test_response)  # proportion of test samples classified correctly
# Visualize the predicted classes using ggplot2
ggplot(predicted_classification, aes(x = Petal.Width, y = Petal.Length, color = iris_pred)) +
  geom_point()
# Visualize the true classes using ggplot2
ggplot(true_classification, aes(x = Petal.Width, y = Petal.Length, color = test_response)) +
  geom_point()
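A confusion table (a sketch, not in the original) makes the comparison between the two plots explicit:

# Sketch: confusion table of predicted vs. true classes
table(predicted = iris_pred, true = test_response)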
Need an extra challenge?
library(vegan)
data(varespec, package = "vegan")
Indicator species analysis
str(varespec)
# Perform NMDS for 1, 2, 3 and 4 dimensions; record convergence and stress
A <- vector()
B <- vector()
for (i in 1:4) {
  NMDS <- metaMDS(varespec, distance = "bray", k = i,
                  autotransform = TRUE, trymax = 100)
  A <- rbind(A, cbind(solution = NMDS$converged, stress = NMDS$stress))
  B <- rbind(B, i)
}
NMDSoutput <- data.frame(B, A)
names(NMDSoutput) <- c("k", "Solution?", "Stress")
NMDSoutput
# Run the final NMDS with the chosen number of dimensions
k <- 4
NMDSfinal <- metaMDS(varespec, distance = "bray", k = k,
                     autotransform = TRUE, trymax = 100)
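As an optional diagnostic (not part of the original exercise), vegan's stressplot() draws the Shepard diagram for the final solution, a standard check of how faithfully the ordination distances reproduce the observed dissimilarities:

# Sketch: Shepard diagram for the final NMDS solution
stressplot(NMDSfinal)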
# Find the optimum number of clusters (total within-cluster sum of squares)
set.seed(1)
k.max <- 10
wss <- c()
for (i in 1:k.max) {
  wss[i] <- kmeans(NMDSfinal$points, i, nstart = 20, iter.max = 20)$tot.withinss
}
# Visual representation (elbow plot)
plot(1:k.max, wss, type = "b",
     xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")
# Find the optimum number of clusters (GAP method)
set.seed(1)
gap_stat <- clusGap(NMDSfinal$points, FUN = kmeans, nstart = 20, K.max = 10, B = 50)
k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
k
# Visualize the gap statistic
fviz_gap_stat(gap_stat)
# Perform a K-means clustering on the NMDS site scores
Kclustering <- kmeans(NMDSfinal$points, 3, nstart = 20)
Kclustering$cluster
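A quick visual check (a sketch, not in the original): colour the first two NMDS dimensions by cluster membership:

# Sketch: NMDS site scores coloured by K-means cluster
plot(NMDSfinal$points[, 1:2], col = Kclustering$cluster, pch = 19,
     xlab = "NMDS1", ylab = "NMDS2")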
library(indicspecies)
inv <- multipatt(varespec, Kclustering$cluster, func = "r.g",
                 control = how(nperm = 9999))
summary(inv)
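To pull out only the significant indicator species, a sketch (assuming multipatt() stores the per-species statistics in its sign component, as current indicspecies versions do):

# Sketch: keep species with a permutation p-value below 0.05
inv$sign[which(inv$sign$p.value < 0.05), ]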
Access the assignment:
vignette("assignment-5", package = "edcpR")
Submit your R-script as Surname_FirstName_assignment5.R
Upload everything on Toledo before December 8, 12 pm (noon)!