knitr::opts_chunk$set( fig.width = 6, fig.height = 4, collapse = TRUE, comment = "#>" )
This vignette provides a tutorial on main functionality of the rpack
package.
First, load the necessary libraries. In addition to the rpack
package, we use
tidyverse
for sample data manipulation and plotting.
library(rpack) library(tidyverse) #library(plotly)
Let's set up some data to be clustered.
set.seed(112) # Generating 200 points from mixture of 10 normal distributions. test_dat <- simulate_normal_mixture(n = 200, k = 10) # Ids in interactive plot id <- 1:nrow(test_dat) plot_sim <- ggplot(data = test_dat, aes(x = x, y = y, size = w, label = id)) + geom_point() + # Scale objects sizes scale_size(range = c(2, 6)) + # Point size in legend guides( color = guide_legend( override.aes = list(size=5) ) ) + labs(x = "x", y = "y", title = "Unclustered data") + # Legend position and removing ticks from axis theme( legend.position = "right", axis.text.x = ggplot2::element_blank(), axis.text.y = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank() ) plot_sim #ggplotly(plot_sim, tooltip = c("id", "w"))
Bigger datasets take a bit longer to cluster. To get a hang of the required processing times, you could try also the following sets:
# Generating 500 points from mixture of 20 Laplace distributions. test_dat2 <- simulate_laplace_mixture(n = 500, k = 20) # Generating 1000 points from mixture of 25 normal distributions. test_dat3 <- simulate_normal_mixture(n = 1000, k = 25)
First, cluster the data into $k=10$ clusters with a uniform prior for the cluster weights. In other words, we let the total weights of clusters vary uniformly within a given range. We set the range as $mean(w) \pm 250 $. As the algorithm is heuristic, the number of iterations needs to be set as well. Here we set $N = 20$. This could be set bigger to ensure that the global optimum is achieved. In addition cluster heads can be placed anywhere.
# Number of clusters k <- 10 # Mean pr_mean <- round(sum(test_dat$w) / k) # Cluster size width pr_width <- 250 # Lower und upper limit for cluster size L <- pr_mean - pr_width U <- pr_mean + pr_width # Number of iterations N <- 20 # Alternaring algorithm clust1 <- alt_alg( coords = test_dat %>% select(x, y), k = k, N = N, weights = test_dat %>% pull(w), range = c(L, U), place_to_point = FALSE # Clusters heads can be located anywhere ) # Save the results of the clustering test_dat$cl1 <- clust1$clusters centers1 <- clust1$centers
Plot the clusters with plot_clusters
.
plot_cl1 <- plot_clusters( coords = test_dat %>% select(x, y), weights = test_dat %>% pull(w), clusters = test_dat %>% pull(cl1), centers = centers1, title = paste( "Capacitated clustering, k = ", k, ", squread Euclidean distance", sep = "" ), subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "") ) plot_cl1_hull <- plot_hull(coords = test_dat %>% select(x, y), weights = test_dat %>% pull(w), clusters = test_dat %>% pull(cl1), centers = centers1) plot_cl1 plot_cl1_hull
Similarly, cluster the points into $k=10$ with same prior for the cluster weights. Before $L_2^2$ distance metric was used as it is the default option. This time use the standard Euclidean distance $L_2$. In addition, constraint the location of the cluster heads to be one of the data points.
# Number of clusters k <- 10 # Mean pr_mean <- round(sum(test_dat$w) / k) # Cluster size width pr_width <- 250 # Lower und upper limit for cluster size L <- pr_mean - pr_width U <- pr_mean + pr_width # Number of iterations N <- 10 # Alternaring algorithm clust2 <- alt_alg( coords = test_dat %>% select(x, y), k = k, N = N, weights = test_dat %>% pull(w), range = c(L, U), center_init = "kmpp", d = euc_dist, # Euclidean distance place_to_point = TRUE # This is TRUE by default ) # Save the results of the clustering test_dat$cl2 <- clust2$clusters centers2 <- clust2$centers # Plot the clusters plot_cl2 <- plot_clusters( coords = test_dat %>% select(x,y), weights = test_dat %>% pull(w), clusters = test_dat %>% pull(cl2), centers = centers2, title = paste("Capacitated clustering, k = ", k, ", Euclidean distance", sep = ""), subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "") ) plot_cl2
Similarly, cluster the data into $k=10$ clusters. In addition, add outgroup for the data points. Outgroup consists of points that can be seen as outliers from other points and are not allocated to any of the clusters.
# Add some artificial outliers to the data outlier_dat <- tibble(x = c(0.2, 4, -8), y = c(7, 10, -6), w = c(50, 50, 50), orig_group = rep(factor(99), 3)) test_dat_out <- full_join(test_dat, outlier_dat) # Plot the data with outliers plot_dat_out <- ggplot() + geom_point(data = outlier_dat, mapping = aes(x = x, y = y, size = w), color = "red") + geom_point(data = test_dat, mapping = aes(x = x, y = y, size = w)) + ggtitle("New data with outlier points") plot_dat_out
# Number of clusters k <- 10 # Mean pr_mean <- round(sum(test_dat_out$w) / k) # Max radius for prior pr_width <- 300 # Lower und upper limit for cluster size L <- (pr_mean - pr_width) U <- (pr_mean + pr_width) # Outgroup parameter lambda, smaller value --> more outliers lambda1 <- 0.04 # Alternaring algorithm clust3 <- alt_alg( coords = dplyr::select(test_dat_out, x, y), weights = test_dat_out$w, N = 10, k = k, range = c(L, U), lambda = lambda1 ) # Save the clustering test_dat_out$cl3 <- clust3$clusters centers3 <- clust3$centers # Plot the clusters. Outgroup points are labeled as "NA" plot_cl3 <- plot_clusters( coords = test_dat_out %>% select(x,y), weights = test_dat_out %>% pull(w), clusters = test_dat_out %>% pull(cl3), centers = centers3, title = paste("Capacitated clustering, k = ", k, sep = ""), subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "") ) plot_cl3
Again, cluster the points into $k=10$ clusters. This time choose two different ranges for the cluster sizes
# Number of clusters k <- 10 # Mean pr_mean <- round(sum(test_dat$w) / k) # Smaller cluster size range range_small <- c(600, 900) # Larger cluster size range range_large <- c(1200, 1500) # Matrix of ranges ranges <- matrix(c(range_small, range_large), byrow = TRUE, ncol = 2, nrow = 2) # Number of iterations N <- 10 # Alternaring algorithm clust4 <- alt_alg( coords = test_dat %>% select(x, y), k = k, N = N, weights = test_dat %>% pull(w), range = ranges ) # Save the results of the clustering test_dat$cl4 <- clust4$clusters centers4 <- clust4$centers # Plot the clusters plot_cl4 <- plot_clusters( coords = test_dat %>% select(x,y), weights = test_dat %>% pull(w), clusters = test_dat %>% pull(cl4), centers = centers4, title = paste("Capacitated clustering, k = ", k, ", Euclidean distance", sep = ""), subtitle = paste("Ranges [", ranges[1,1], ", ", ranges[1,2], "] and [", ranges[2,1], ", ", ranges[2,2], "] for cluster sizes", sep = "") ) plot_cl4
Some of the cluster heads might be fixed to certain points. This can be typical in facility location where some number of facilities are already placed and the location of new ones should be determined. Predefine 5 of the points as fixed cluster heads and add 5 new cluster heads to the region.
# Choose the fixed centers from the data points fixed_centers <- test_dat[c(3,45,99,105,148),] %>% select(x,y) # Number of fixed centers n_fixed <- nrow(fixed_centers) # Plot the fixed points ggplot() + geom_point(data = test_dat, mapping = aes(x = x, y = y, size = w)) + geom_point(data = fixed_centers, mapping = aes(x = x, y = y), stroke = 3, size = 4, shape = 4, color = "red") # Total number of clusters k <- n_fixed + 5 # Mean pr_mean <- round(sum(test_dat$w) / k) # Cluster size width pr_width <- 300 # Lower und upper limit for cluster size L <- pr_mean - pr_width U <- pr_mean + pr_width # Number of iterations N <- 20 # Alternaring algorithm clust5 <- alt_alg( coords = test_dat %>% select(x, y), k = k, N = N, weights = test_dat %>% pull(w), range = c(L, U), fixed_centers = fixed_centers ) # Save the results of the clustering test_dat$cl5 <- clust5$clusters centers5 <- clust5$centers # Plot the clusters plot_cl5 <- plot_clusters( coords = test_dat %>% select(x,y), weights = test_dat %>% pull(w), clusters = test_dat %>% pull(cl5), centers = centers5, title = paste("Capacitated clustering, k = ", k, ", Euclidean distance", sep = ""), subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "") ) plot_cl5
Cluster the points to $k=10$ clusters. Set some points to be require multiple centers to be assigned to.
# Number of clusters k <- 10 # Mean pr_mean <- round(sum(test_dat$w) / k) # Max radius for prior pr_width <- 250 # Lower und upper limit for cluster size L <- (pr_mean - pr_width) U <- (pr_mean + pr_width) # Ids of the points to be assigned to multiple clusters multip_id <- c(42, 153, 190, 83, 171) # n-length vector to indicate the number of centers a point is assigned to. multip_centers <- rep(1, nrow(test_dat)) # Selected points are assigned to two clusters multip_centers[multip_id] <- 2 # Alternaring algorithm clust6 <- alt_alg( coords = dplyr::select(test_dat, x, y), weights = test_dat$w, N = 10, k = k, range = c(L, U), multip_centers = multip_centers ) test_dat$cl6 <- clust6$clusters centers6 <- clust6$centers
Plot the clusters and the points with multiple centers.
multip_data <- test_dat[multip_id,] multip_clust <- sapply(X = multip_id, FUN = function(x){which(clust6$assign_frac[x,] == 1)}) multip_data$clust1 <- as.factor(multip_clust[1,]) multip_data$clust2 <- as.factor(multip_clust[2,]) plot_clusters( coords = test_dat %>% select(x,y), weights = test_dat %>% pull(w), clusters = clust6$clusters, centers = centers6, title = paste("Capacitated clustering, k = ", k, sep = ""), subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "") ) + geom_point(data = multip_data, mapping = aes(x = x, y = y), size = 6, color = "black", fill = "black", shape = 23, show.legend = FALSE)+ geom_point(data = multip_data, mapping = aes(x = x, y = y, color = clust1, fill = clust2), size = 3, shape = 23, stroke = 2, show.legend = FALSE) + ggplot2::scale_fill_manual( # Color theme for objects and legend title values = c_col[unique(sort(as.numeric(levels(multip_data$clust2))[multip_data$clust2]))], #name = "Cluster sizes:", #labels = cl_sizes )
Previously we allowed cluster heads to be one of the demand points. In this example we give all the possible cluster head locations as an input to the algorithm.
# Pre-determined cluster head locations predet_loc <- dplyr::tibble(x = rep(seq(from = -10, to = 10, by = 4), times = 6), y = rep(seq(from = -10, to = 10, by = 4), each = 6)) # Plotting the demand points. Red points refer to predetermined center locations ggplot() + geom_point(data = test_dat, aes(x = x, y = y, size = w)) + scale_size(range = c(2, 6)) + # Scale objects sizes guides( color = guide_legend( # Point size in legend override.aes = list(size=5) ) ) + labs(x = "x", y = "y", title = "Unclustered data") + theme( legend.position = "right", # Legend position and removing ticks from axis axis.text.x = ggplot2::element_blank(), axis.text.y = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank() ) + geom_point(data = predet_loc, mapping = aes(x = x, y = y), shape = 18, col = "red", size = 5) # Number of clusters k <- 10 # Mean pr_mean <- round(sum(test_dat$w) / k) # Max radius for prior pr_width <- 1000 # Lower und upper limit for cluster size L <- (pr_mean - pr_width) U <- (pr_mean + pr_width) # Distance to pre-determined center location dist_to_centers <- apply( X = predet_loc, MARGIN = 1, FUN = function(x) { apply( X = test_dat %>% select(x,y), MARGIN = 1, FUN = euc_dist, x2 = x ) } ) # Number of columns is equal to number of predefined center locations dim(dist_to_centers) # Alternaring algorithm clust_predet <- alt_alg_predet( dist_to_centers = dist_to_centers, weights = test_dat$w, N = 20, k = k, range = c(L, U), place_to_point = TRUE )
Plot the clusters. Small crosses refer to all the possible cluster center locations.
clust_centers <- predet_loc %>% slice(clust_predet$center_ids) plot_clusters( coords = test_dat %>% select(x,y), weights = test_dat$w, clusters = clust_predet$clusters, centers = clust_centers, title = paste("Capacitated clustering, k = ", k, sep = ""), subtitle = paste("Uniform prior in [", L, ", ", U, "] on cluster sizes", sep = "") ) + geom_point(data = predet_loc, mapping = aes(x = x, y = y), shape = 4)
test_dat <- simulate_normal_mixture(n = 50, k = 5) n <- nrow(test_dat) # First form the distance matrix dis <- matrix(0, ncol = n, nrow = n) dis <- sapply(1:n, FUN = function(x) { apply(test_dat %>% select(x,y), MARGIN = 1, FUN = euc_dist2, x2 = test_dat[x, 2:3] )}) # Mobility is represented via symmetric (n x n)-matrix # - choose n smallest distances (n chosen to be 20) and sample approximately 10 % to have mobility # - diagonal values are zero set.seed(2) mob <- sapply(X = 1:n, FUN = function(x){ # nth smallest #nth_smallest <- sort(dis[,x])[sample(1:8, size = 1)] nth_smallest <- sort(dis[,x])[20] rand <- runif(n) (dis[,x] <= nth_smallest & dis[,x] > 0 & rand < 0.05) }) indices <- (which(mob, arr.ind = TRUE)) # Turn logical values to numerical mob <- mob * runif(n = n*n, min = 1, max = 10) # Transform the row numbers to ids indices[,1] <- test_dat$id[indices[,1]] indices[,2] <- test_dat$id[indices[,2]] # Make a tibble mob_segment_data <- tibble(row = indices[,1], col = indices[,2]) %>% # Remove duplicates filter(col != row) %>% # Add new columns to indicate coordinates of the segments mutate(xstart = test_dat$x[row], ystart = test_dat$y[row], xend = test_dat$x[col], yend = test_dat$y[col]) # Plot points with mobility ggplot() + geom_segment(data = mob_segment_data, mapping = aes(x = xstart, y = ystart, xend = xend, yend = yend), color = "red", size = 1) + geom_point(data = test_dat, mapping = aes(x = x, y = y, size = w))
# Number of clusters k <- 5 # Mean pr_mean <- round(sum(test_dat$w) / k) # Cluster size width pr_width <- 200 # Lower und upper limit for cluster size L <- pr_mean - pr_width U <- pr_mean + pr_width # Alternaring algorithm clust_mob <- alt_alg_mob( coords = test_dat %>% select(x, y), k = k, N = 10, print_output = "steps", weights = test_dat %>% pull(w), range = c(L, U), mob = mob, lambda_mob = 1 )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.