knitr::opts_chunk$set(
  fig.width = 6,
  fig.height = 4,  
  collapse = TRUE,
  comment = "#>"
)

This vignette provides a tutorial on main functionality of the rpack package.

Libraries

First, load the necessary libraries. In addition to the rpack package, we use tidyverse for sample data manipulation and plotting.

library(rpack)
library(tidyverse)
#library(plotly)

Simulated data

Let's set up some data to be clustered.

set.seed(112)

# Generating 200 points from mixture of 10 normal distributions.
test_dat <- simulate_normal_mixture(n = 200, k = 10)

# Ids in interactive plot
id <-  1:nrow(test_dat)

plot_sim <- ggplot(data = test_dat, aes(x = x, y = y, size = w, label = id)) +
  geom_point() +
  # Scale objects sizes
  scale_size(range = c(2, 6)) +
  # Point size in legend
  guides(
    color = guide_legend(
      override.aes = list(size=5)
    )
  ) +
  labs(x = "x", y = "y", title = "Unclustered data") +
  # Legend position and removing ticks from axis
  theme(
    legend.position = "right",
    axis.text.x = ggplot2::element_blank(),
    axis.text.y = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank()
  )

plot_sim
#ggplotly(plot_sim, tooltip = c("id", "w"))

Bigger datasets take a bit longer to cluster. To get a hang of the required processing times, you could try also the following sets:

# Generating 500 points from mixture of 20 Laplace distributions.
test_dat2 <- simulate_laplace_mixture(n = 500, k = 20)

# Generating 1000 points from mixture of 25 normal distributions.
test_dat3 <- simulate_normal_mixture(n = 1000, k = 25)

Clustering with uniform prior

First, cluster the data into $k=10$ clusters with a uniform prior for the cluster weights. In other words, we let the total weights of clusters vary uniformly within a given range. We set the range as $mean(w) \pm 250 $. As the algorithm is heuristic, the number of iterations needs to be set as well. Here we set $N = 20$. This could be set bigger to ensure that the global optimum is achieved. In addition cluster heads can be placed anywhere.

# Number of clusters
k <- 10

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Cluster size width
pr_width <- 250

# Lower und upper limit for cluster size
L <- pr_mean - pr_width
U <- pr_mean + pr_width

# Number of iterations
N <- 20

# Alternaring algorithm
clust1 <- alt_alg(
  coords = test_dat %>% select(x, y),
  k = k,
  N = N,
  weights = test_dat %>% pull(w),
  range = c(L, U),
  place_to_point = FALSE # Clusters heads can be located anywhere
)

# Save the results of the clustering
test_dat$cl1 <- clust1$clusters
centers1 <- clust1$centers

Plot the clusters with plot_clusters.

plot_cl1 <- plot_clusters(
  coords = test_dat %>% select(x, y),
  weights = test_dat %>% pull(w),
  clusters = test_dat %>% pull(cl1),
  centers = centers1,
  title = paste(
    "Capacitated clustering, k = ",
    k,
    ", squread Euclidean distance",
    sep = ""
  ),
  subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "")
)

plot_cl1_hull <- plot_hull(coords = test_dat %>% select(x, y),
  weights = test_dat %>% pull(w),
  clusters = test_dat %>% pull(cl1),
  centers = centers1)


plot_cl1
plot_cl1_hull

Clustering with different distance metric and add constraints for cluster head locations

Similarly, cluster the points into $k=10$ with same prior for the cluster weights. Before $L_2^2$ distance metric was used as it is the default option. This time use the standard Euclidean distance $L_2$. In addition, constraint the location of the cluster heads to be one of the data points.

# Number of clusters
k <- 10

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Cluster size width
pr_width <- 250

# Lower und upper limit for cluster size
L <- pr_mean - pr_width
U <- pr_mean + pr_width

# Number of iterations
N <- 10

# Alternaring algorithm
clust2 <- alt_alg(
  coords = test_dat %>% select(x, y),
  k = k,
  N = N,
  weights = test_dat %>% pull(w),
  range = c(L, U),
  center_init = "kmpp",
  d = euc_dist, # Euclidean distance
  place_to_point = TRUE # This is TRUE by default
)

# Save the results of the clustering
test_dat$cl2 <- clust2$clusters
centers2 <- clust2$centers

# Plot the clusters
plot_cl2 <- plot_clusters(
  coords = test_dat %>% select(x,y),
  weights = test_dat %>% pull(w),
  clusters = test_dat %>% pull(cl2),
  centers = centers2,
  title = paste("Capacitated clustering, k = ", k, ", Euclidean distance", sep = ""),
  subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "")
)

plot_cl2

Clustering with outgroup extension.

Similarly, cluster the data into $k=10$ clusters. In addition, add outgroup for the data points. Outgroup consists of points that can be seen as outliers from other points and are not allocated to any of the clusters.

# Add some artificial outliers to the data
outlier_dat <- tibble(x = c(0.2, 4, -8),
                      y = c(7, 10, -6),
                      w = c(50, 50, 50),
                      orig_group = rep(factor(99), 3))

test_dat_out <- full_join(test_dat, outlier_dat)

# Plot the data with outliers
plot_dat_out <- ggplot() +
  geom_point(data = outlier_dat,
             mapping = aes(x = x, y = y, size = w),
             color = "red") +
    geom_point(data = test_dat,
             mapping = aes(x = x, y = y, size = w)) +
  ggtitle("New data with outlier points")

plot_dat_out
# Number of clusters
k <- 10

# Mean
pr_mean <- round(sum(test_dat_out$w) / k)

# Max radius for prior
pr_width <- 300

# Lower und upper limit for cluster size
L <- (pr_mean - pr_width)
U <- (pr_mean + pr_width)

# Outgroup parameter lambda, smaller value --> more outliers
lambda1 <- 0.04

# Alternaring algorithm
clust3 <- alt_alg(
  coords = dplyr::select(test_dat_out, x, y),
  weights = test_dat_out$w,
  N = 10,
  k = k,
  range = c(L, U),
  lambda = lambda1
)

# Save the clustering
test_dat_out$cl3 <- clust3$clusters
centers3 <- clust3$centers

# Plot the clusters. Outgroup points are labeled as "NA"
plot_cl3 <- plot_clusters(
  coords = test_dat_out %>% select(x,y),
  weights = test_dat_out %>% pull(w),
  clusters = test_dat_out %>% pull(cl3),
  centers = centers3,
  title = paste("Capacitated clustering, k = ", k, sep = ""),
  subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "")
)

plot_cl3

Clustering with multiple cluster size ranges

Again, cluster the points into $k=10$ clusters. This time choose two different ranges for the cluster sizes

# Number of clusters
k <- 10

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Smaller cluster size range
range_small <- c(600, 900)

# Larger cluster size range
range_large <- c(1200, 1500)

# Matrix of ranges
ranges <- matrix(c(range_small, range_large),
                 byrow = TRUE,
                 ncol = 2, 
                 nrow = 2)

# Number of iterations
N <- 10

# Alternaring algorithm
clust4 <- alt_alg(
  coords = test_dat %>% select(x, y),
  k = k,
  N = N,
  weights = test_dat %>% pull(w),
  range = ranges
)

# Save the results of the clustering
test_dat$cl4 <- clust4$clusters
centers4 <- clust4$centers

# Plot the clusters
plot_cl4 <- plot_clusters(
  coords = test_dat %>% select(x,y),
  weights = test_dat %>% pull(w),
  clusters = test_dat %>% pull(cl4),
  centers = centers4,
  title = paste("Capacitated clustering, k = ", k, ", Euclidean distance", sep = ""),
  subtitle = paste("Ranges [", ranges[1,1], ", ", ranges[1,2], "] and [", ranges[2,1], ", ", ranges[2,2], "] for cluster sizes", sep = "")
)

plot_cl4

Clustering with fixed centers

Some of the cluster heads might be fixed to certain points. This can be typical in facility location where some number of facilities are already placed and the location of new ones should be determined. Predefine 5 of the points as fixed cluster heads and add 5 new cluster heads to the region.

# Choose the fixed centers from the data points
fixed_centers <- test_dat[c(3,45,99,105,148),] %>% select(x,y)

# Number of fixed centers
n_fixed <- nrow(fixed_centers)

# Plot the fixed points
ggplot() +
  geom_point(data = test_dat, 
             mapping = aes(x = x,
                           y = y, 
                           size = w)) +
  geom_point(data = fixed_centers,
             mapping = aes(x = x,
                           y = y),
             stroke = 3, 
             size = 4,
             shape = 4,
             color = "red")


# Total number of clusters
k <- n_fixed + 5

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Cluster size width
pr_width <- 300

# Lower und upper limit for cluster size
L <- pr_mean - pr_width
U <- pr_mean + pr_width

# Number of iterations
N <- 20

# Alternaring algorithm
clust5 <- alt_alg(
  coords = test_dat %>% select(x, y),
  k = k,
  N = N,
  weights = test_dat %>% pull(w),
  range = c(L, U),
  fixed_centers = fixed_centers
)

# Save the results of the clustering
test_dat$cl5 <- clust5$clusters
centers5 <- clust5$centers

# Plot the clusters
plot_cl5 <- plot_clusters(
  coords = test_dat %>% select(x,y),
  weights = test_dat %>% pull(w),
  clusters = test_dat %>% pull(cl5),
  centers = centers5,
  title = paste("Capacitated clustering, k = ", k, ", Euclidean distance", sep = ""),
  subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "")
)

plot_cl5

Clustering with reliability

Cluster the points to $k=10$ clusters. Set some points to be require multiple centers to be assigned to.

# Number of clusters
k <- 10

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Max radius for prior
pr_width <- 250

# Lower und upper limit for cluster size
L <- (pr_mean - pr_width)
U <- (pr_mean + pr_width)

# Ids of the points to be assigned to multiple clusters
multip_id <- c(42, 153, 190, 83, 171)

# n-length vector to indicate the number of centers a point is assigned to.
multip_centers <- rep(1, nrow(test_dat))

# Selected points are assigned to two clusters
multip_centers[multip_id] <- 2

# Alternaring algorithm
clust6 <- alt_alg(
  coords = dplyr::select(test_dat, x, y),
  weights = test_dat$w,
  N = 10,
  k = k,
  range = c(L, U),
  multip_centers = multip_centers
)

test_dat$cl6 <- clust6$clusters
centers6 <- clust6$centers

Plot the clusters and the points with multiple centers.

multip_data <- test_dat[multip_id,]

multip_clust <- sapply(X = multip_id,
                       FUN = function(x){which(clust6$assign_frac[x,] == 1)})
multip_data$clust1 <- as.factor(multip_clust[1,])
multip_data$clust2 <- as.factor(multip_clust[2,])


plot_clusters(
  coords = test_dat %>% select(x,y),
  weights = test_dat %>% pull(w),
  clusters = clust6$clusters,
  centers = centers6,
  title = paste("Capacitated clustering, k = ", k, sep = ""),
  subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "")
) +
  geom_point(data = multip_data,
             mapping = aes(x = x, y = y),
             size = 6,
             color = "black",
             fill = "black",
             shape = 23,
             show.legend = FALSE)+
  geom_point(data = multip_data,
             mapping = aes(x = x, y = y, color = clust1, fill = clust2),
             size = 3,
             shape = 23,
             stroke = 2,
             show.legend = FALSE) + 
  ggplot2::scale_fill_manual(  # Color theme for objects and legend title
      values = c_col[unique(sort(as.numeric(levels(multip_data$clust2))[multip_data$clust2]))],
      #name = "Cluster sizes:",
      #labels = cl_sizes
   )

Clustering with pre-determined center locations

Previously we allowed cluster heads to be one of the demand points. In this example we give all the possible cluster head locations as an input to the algorithm.

# Pre-determined cluster head locations
predet_loc <- dplyr::tibble(x = rep(seq(from = -10, to = 10, by = 4), times = 6),
                     y = rep(seq(from = -10, to = 10, by = 4), each = 6))

# Plotting the demand points. Red points refer to predetermined center locations
ggplot() +
  geom_point(data = test_dat, aes(x = x, y = y, size = w)) +
  scale_size(range = c(2, 6)) +  # Scale objects sizes
  guides(
    color = guide_legend(        # Point size in legend
      override.aes = list(size=5)
    )
  ) +
  labs(x = "x", y = "y", title = "Unclustered data") +
  theme(
    legend.position = "right",            # Legend position and removing ticks from axis
    axis.text.x = ggplot2::element_blank(),
    axis.text.y = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank()
  ) + 
  geom_point(data = predet_loc,
             mapping = aes(x = x, y = y),
             shape = 18,
             col = "red",
             size = 5)

# Number of clusters
k <- 10

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Max radius for prior
pr_width <- 1000

# Lower und upper limit for cluster size
L <- (pr_mean - pr_width)
U <- (pr_mean + pr_width)

# Distance to pre-determined center location
dist_to_centers <- apply(
      X = predet_loc,
      MARGIN = 1,
      FUN = function(x) {
        apply(
          X = test_dat %>% select(x,y),
          MARGIN = 1,
          FUN = euc_dist,
          x2 = x
        )
      }
)

# Number of columns is equal to number of predefined center locations
dim(dist_to_centers)

# Alternaring algorithm
clust_predet <- alt_alg_predet(
  dist_to_centers = dist_to_centers,
  weights = test_dat$w,
  N = 20,
  k = k,
  range = c(L, U),
  place_to_point = TRUE
)

Plot the clusters. Small crosses refer to all the possible cluster center locations.

clust_centers <- predet_loc %>% slice(clust_predet$center_ids) 

plot_clusters(
  coords = test_dat %>% select(x,y),
  weights = test_dat$w,
  clusters = clust_predet$clusters,
  centers = clust_centers,
  title = paste("Capacitated clustering, k = ", k, sep = ""),
  subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "")
) + 
  geom_point(data = predet_loc, 
             mapping = aes(x = x, y = y),
             shape = 4)

Clustering with mobility

test_dat <- simulate_normal_mixture(n = 50, k = 5)

n <- nrow(test_dat)

# First form the distance matrix
dis <- matrix(0, ncol = n, nrow = n)
dis <- sapply(1:n, 
      FUN = function(x) {
        apply(test_dat %>% select(x,y), 
              MARGIN = 1, 
              FUN = euc_dist2, 
              x2 = test_dat[x, 2:3]
              )})

# Mobility is represented via symmetric (n x n)-matrix
#   - choose n smallest distances (n chosen to be 20) and sample approximately 10 % to have mobility
#   - diagonal values are zero 
set.seed(2)
mob <- sapply(X = 1:n, FUN = function(x){
  # nth smallest
  #nth_smallest <- sort(dis[,x])[sample(1:8, size = 1)]
  nth_smallest <- sort(dis[,x])[20]
  rand <- runif(n)
  (dis[,x] <= nth_smallest & dis[,x] > 0 & rand < 0.05)
  })

indices <- (which(mob, arr.ind = TRUE))

# Turn logical values to numerical
mob <- mob * runif(n = n*n, min = 1, max = 10)


# Transform the row numbers to ids
indices[,1] <- test_dat$id[indices[,1]]
indices[,2] <- test_dat$id[indices[,2]]

# Make a tibble
mob_segment_data <- tibble(row = indices[,1], col = indices[,2]) %>% 
  # Remove duplicates
  filter(col != row) %>% 
  # Add new columns to indicate coordinates of the segments
  mutate(xstart = test_dat$x[row], ystart = test_dat$y[row], xend = test_dat$x[col], yend = test_dat$y[col]) 

# Plot points with mobility
ggplot() +
  geom_segment(data = mob_segment_data,
               mapping = aes(x = xstart, y = ystart, xend = xend, yend = yend),
               color = "red",
               size = 1) + 
  geom_point(data = test_dat,
             mapping = aes(x = x, y = y, size = w)) 
# Number of clusters
k <- 5

# Mean
pr_mean <- round(sum(test_dat$w) / k)

# Cluster size width
pr_width <- 200

# Lower und upper limit for cluster size
L <- pr_mean - pr_width
U <- pr_mean + pr_width

# Alternaring algorithm
clust_mob <- alt_alg_mob(
  coords = test_dat %>% select(x, y),
  k = k,
  N = 10,
  print_output = "steps",
  weights = test_dat %>% pull(w),
  range = c(L, U),
  mob = mob,
  lambda_mob = 1
)


terolahderanta/Probabilistic_clustering documentation built on April 22, 2021, 8:44 p.m.