library(learnr)
knitr::opts_chunk$set(echo = TRUE, exercise = FALSE)

Earthquake Dataset

Data Loading

We load the libraries and the data.

library(plotly)
df <- moxier::earthquakes

Plotting the data

p <- plot_ly(df, x = ~lat, y = ~long, z = ~depth) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p

Rescaling the data

We rescale the data.

rescale <- function(x, na.rm = FALSE) {
  (x - min(x, na.rm = na.rm)) / (max(x, na.rm = na.rm) - min(x, na.rm = na.rm))
}
df_r <- as.data.frame(apply(X = df, MARGIN = 2, FUN = rescale, na.rm = FALSE))

and we plot it.

p <- plot_ly(df_r, x = ~lat, y = ~long, z = ~depth) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p

k-means: finding k

within_ss <- matrix(data = NA, nrow = 1000, ncol = 10)
between_ss <- matrix(data = NA, nrow = 1000, ncol = 10)

for (k in seq.int(from = 1, to = 10)) {
  for (iter in seq.int(from = 1, to = 1000)) {
    clust_iter <- kmeans(x = df_r, centers = k)
    within_ss[iter, k] <- clust_iter$tot.withinss
    between_ss[iter, k] <- clust_iter$betweenss
  }
}

Let us check the result.

boxplot(within_ss, main = "Within SS")
boxplot(between_ss, main = "Between SS")

k-means: clustering

clust_k <- kmeans(x = df_r, centers = 4, iter.max = 50L, nstart = 100L)

We plot the results.

p <- plot_ly(df_r, x = ~lat, y = ~long, z = ~depth, color = ~as.factor(clust_k$cluster)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p

We also inspect what we have obtained.

clust_k$iter
clust_k$cluster
clust_k$size
clust_k$centers
clust_k$tot.withinss
clust_k$betweenss
clust_k$totss

Agglomerative Hierarchical Clustering

We now cluster considering with a different technique.

First of all, we compute the distance matrix.

d <- dist(df_r, method = "euclidean")
image(as.matrix(d), asp = 1)

We perform different clusterings and plot them.

clustc <- hclust(d, method='complete')
clusts <- hclust(d, method='single')
clusta <- hclust(d, method='average')
clustw <- hclust(d, method='ward.D')
par(mfrow=c(2,2))
plot(clustc, hang=0, labels=FALSE, main='Complete', xlab='', sub='')
plot(clusts, hang=0, labels=FALSE, main='Single', xlab='', sub='')
plot(clusta, hang=0, labels=FALSE, main='Average', xlab='', sub='')
plot(clustw, hang=0, labels=FALSE, main='Ward', xlab='', sub='')

For each method, we shall display the results.

Complete

clusterc <- cutree(hclust(d, method='complete'), 2)
p <- plot_ly(df_r, x = ~lat, y = ~long, z = ~depth, color = ~as.factor(clusterc)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p

Single

clusters <- cutree(hclust(d, method='single'), 4)
p <- plot_ly(df_r, x = ~lat, y = ~long, z = ~depth, color = ~as.factor(clusters)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p

Average

clustera <- cutree(hclust(d, method='average'), 2)
p <- plot_ly(df_r, x = ~lat, y = ~long, z = ~depth, color = ~as.factor(clustera)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p

Ward

clusterw <- cutree(hclust(d, method='ward.D'), 3)
p <- plot_ly(df_r, x = ~lat, y = ~long, z = ~depth, color = ~as.factor(clusterw)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Latitude'),
                     yaxis = list(title = 'Longitude'),
                     zaxis = list(title = 'Depth')))
p


mascaretti/moxier documentation built on March 17, 2020, 3:57 p.m.