robClustVignette.R
In WeightedCluster: Clustering of Weighted Data and Robust Clustering

## ----include=FALSE------------------------------------------------------------

knitr::knit_hooks$set(time_it = local({
  now <- NULL
  function(before, options) {
    if (before) {
      # record the current time before each chunk
      now <<- Sys.time()
    } else {
      # calculate the time difference after a chunk
      res <- difftime(Sys.time(), now, units = "secs")
      # return a character string to show the time
      paste("Time for this code chunk to run:", round(res,
        2), "seconds")
    }
  }
}))
knitr::opts_chunk$set(fig.width = 6, fig.height = 5, fig.dpi = if (knitr::is_latex_output()) 300 else 96)
# knitr::opts_chunk$set(dev = "png", dev.args = list(type = "cairo-png"), time_it=TRUE) # fait boguer les références des fiugres lorsque compilé en pdf
knitr::opts_chunk$set(dev = "png") # format des figures
knitr::opts_chunk$set(cache = FALSE# If FALSE cache is emptied when knitting  
                      , cache.lazy = FALSE) # knitr n'execute pas le code si il a déjà tourné une fois et est resté dans le cache, il faut vider le cache de temps à autre dans le menu déroulant "Knit" 
#library(magick)


## ----include=FALSE------------------------------------------------------------
# Setting the random seed for reproducibility
set.seed(1234)

## ----seqdefmvd, message=FALSE-------------------------------------------------
# Loading the package
library(WeightedCluster)

# Loading illustrative data
data(mvad)

# Creating state sequence object
mvad.seq <- seqdef(mvad[, 17:86], # The data containing information on trajectories
                   labels = c("Employment", "Further Education", # The states 
                              "Higher Education", "Joblessness", 
                              "School", "Training"), 
                   xtstep = 6)

## ----figSeqMvad, message=FALSE, fig.cap='MVAD trajectories, sorted from start'----

# Plotting the squences
seqIplot(mvad.seq, 
         legend.prop=0.2, 
         sortv = "from.start") # sequences are ordered by state.

## ----seqdist, message=FALSE, time_it=TRUE-------------------------------------

# Compute LCS dissimilarities
diss <- seqdist(mvad.seq, 
                method="LCS")


## ----include= FALSE-----------------------------------------------------------
# Set seed for reproducibility
set.seed(1234)

# Generate well-separated, compact clusters
n <- 100

# Cluster 1: Bottom-left, compact
cluster1_compact <- data.frame(
  x = rnorm(n, mean = 2, sd = 1),
  y = rnorm(n, mean = 2, sd = 1),
  group = "Cluster 1"
)


# Cluster 2: Top-right, compact  
cluster2_compact <- data.frame(
  x = rnorm(n, mean = 8, sd = 1),
  y = rnorm(n, mean = 8, sd = 1),
  group = "Cluster 2"
)

compact_data <- rbind(cluster1_compact, cluster2_compact)


cluster1_dispersed <- data.frame(
  x = rnorm(n, mean = 2, sd = 2),
  y = rnorm(n, mean = 2, sd = 2),
  group = "Cluster 1"
)

cluster2_dispersed <- data.frame(
  x = rnorm(n, mean = 8, sd = 2),
  y = rnorm(n, mean = 8, sd = 2),
  group = "Cluster 2"
)

dispersed_data <- rbind(cluster1_dispersed, cluster2_dispersed)

## ----figClustStr,echo=FALSE , warning=FALSE, message=FALSE, fig.cap = "Clusterings with the same centers but showing Strong or Weak Structure"----

par(mfrow = c(1, 2))
# Get viridis colors
viridis_colors <- viridis::cividis(2)
# Base R plot for compact clusters
plot(compact_data$x, compact_data$y, 
     col = ifelse(compact_data$group == "Cluster 1", viridis_colors[1], viridis_colors[2]),
     pch = 16, cex = 0.8,
     main = "Strong Structure",
     xlab = "",
     ylab = "", 
     xaxt='n',
     yaxt='n',
     ylim = c(-3,13), 
     xlim = c(-3,13), 
     cex.main = 1)
points(x=c(2,8), y=c(2,8), col = "grey", pch = 4, lwd = 3)

# Base R plot for dispersed clusters  
plot(dispersed_data$x, dispersed_data$y,
     col = ifelse(dispersed_data$group == "Cluster 1", viridis_colors[1], viridis_colors[2]),
     pch = 16, cex = 0.8,
     main = "Weak Structure",
     xlab = "",
     ylab = "",
     xaxt='n',
     yaxt='n',
     ylim = c(-3,13), 
     xlim = c(-3,13), 
     cex.main = 1)
points(x=c(2,8), y=c(2,8), col = "grey", pch = 4, lwd = 3)


## ----consClustTypo, message=FALSE, time_it=TRUE-------------------------------
# Setting up parallel computing
library(future)
plan(multisession)
# Creating the typology
set.seed(1234)
pamWardConsClust <- consClust(diss,
                              base.clust = c("pam", "ward.D"), 
                              R = 100, 
                              kvals = 2:15,
                              cons.method = "SE", 
                              membership = "crisp",
                              k.fixed = TRUE,
                              agg.method = "cRand",
                              keep.ensemble = TRUE,
                              parallel = FALSE, 
                              progressbar = FALSE)

## ----consClustCqi, message=FALSE----------------------------------------------
# Showing CQIs
pamWardConsClust

## ----plotConsClustCqi, message=FALSE, fig.cap="PAM and Ward consensus clustering CQIs (normalized)"----
# Plotting CQIs
par(cex = 0.75)
plot(pamWardConsClust,
     legendpos = "topleft",
     stat = "all",
     norm = "zscore") # CQIs are standarize

## ----include=FALSE------------------------------------------------------------
par(mar = c(2,2,2,2))

## ----consClustSeqplot, message=FALSE, fig.cap= "Crisp consensus Clustering in nine groups (PAM and Ward)"----
# Plotting the consensus typology in nine groups
par(mar = c(2,2,2,2))
seqIplot(mvad.seq,
         group = pamWardConsClust$clustering$cluster9, # Specitifing the cluster to use for plotting 
         main = c("Further Ed. - Higher Ed.", "Joblessness", # naming the clusters in plot
                  "Training - Employment", "Training",
                  "School - Higher Ed.", "Further Ed. - Employment", 
                  "Employment", "School - Employment", 
                  "Futher Ed."), 
         cex.legend = 0.8)

## ----consClustTypoFuzzy, message=FALSE, time_it=TRUE--------------------------
# Creating the typology
set.seed(1234)
pamWardConsClustF <- consClust(diss,
                              base.clust = c("pam", "ward.D"),
                              R = 100, 
                              kvals = 2:15, 
                              cons.method = "SE", 
                              membership = "fuzzy", 
                              k.fixed = TRUE,
                              agg.method = "cRand",
                              keep.ensemble = TRUE,
                              progressbar = FALSE)

## ----plotConsFuzzy, message=FALSE, fig.cap="Fuzzy consensus Clustering in nine groups (PAM and Ward), sorted by membership probability"----
par(mar = c(2,2,2,2))
fuzzyseqplot(mvad.seq, # sequences to plot 
             group = pamWardConsClustF$clustering$cluster9, # grouping variable
              main = c("Further Ed. - Higher Ed.", "Joblessness",# naming the clusters
                       "Training - Employment", "Training",
                       "School - Higher Ed.", "Further Ed. - Employment",
                       "Employment", "School - Employment",
                       "Futher Ed."), 
             membership.threshold = 0.4,
             sortv = "membership",
             type = "I", # We plot an index plot
             cex.legend = 0.8) 

## -----------------------------------------------------------------------------
delta08 <- mean(diss) * 0.8 # Calculating the noise distance delta
delta08

## ----noiseTypo, message=FALSE, time_it=TRUE-----------------------------------
# Creating a typology with noise
set.seed(1234)
noiseClust08 <- seqclararange(mvad.seq, 
                              kvals = 2:15,
                              R = 50, # Number of subsamples
                              sample.size = nrow(mvad.seq), 
                              method = "noise",
                              dnoise = delta08, # noise sensitivity
                              seqdist.args = list(method = "LCS"))

## ----noiseClustCqi, message=FALSE---------------------------------------------

# Showing CQIs
noiseClust08


## ----plotNoiseClustCqi, message=FALSE, fig.cap="Fuzzy noise clustering CQIs, lambda = 0.8"----
# Plotting CQIs
plot(noiseClust08, 
     legendpos = "topleft")

## ----include=FALSE------------------------------------------------------------
par(mar = c(2,2,2,2))

## ----noiseClustSeqplot, fig.cap="Fuzzy noise clustering in seven groups, lambda = 0.8, sorted by membership probability"----
## Displaying the resulting clustering with membership threshold of 0.20
par(mar = c(2,2,2,2))
fuzzyseqplot(mvad.seq, 
             group = noiseClust08$clustering$cluster7,  
             main = c("Futher Ed.", "Employment", # naming the clusters
                      "School - Higher Ed.", "Further Ed. - Higher Ed.",
                      "Joblessness", "Training - Employment",
                      "Further Ed. - Employment", "Noise Seq."), 
             membership.threshold = 0.20,
             type = "I", # We plot an index plot
             sortv = "membership", 
             cex.legend = 0.8)

## ----crispNoiseTypo, message=FALSE, fig.width=8, fig.height=5-----------------
crispNoiseClust08 <- as.crisp(noiseClust08)

## ----crispNoiseClustSeqplot, fig.cap="Crisp noise clustering in seven groups, lambda = 0.8"----
par(mar = c(2,2,2,2))
seqIplot(mvad.seq, 
         group = crispNoiseClust08$clustering$cluster7,
         main = c("Futher Ed.", "Employment", # naming the clusters
                  "School - Higher Ed.", "Further Ed. - Higher Ed.",
                  "Joblessness", "Training - Employment",
                  "Further Ed. - Employment", "Noise Seq."), 
        cex.legend = 0.8)


## ----nise06, message=FALSE----------------------------------------------------
# Calculating the noise distance delta
delta06 <- mean(diss) * 0.6 
delta06

## ----include=FALSE------------------------------------------------------------
par(mar = c(2,2,2,2))

## ----noiseTypo06, message=FALSE, time_it=TRUE---------------------------------
# Creating a typology with noise
set.seed(1234)
noiseClust06 <- seqclararange(mvad.seq, 
                              kvals = 2:15,
                              R = 50,
                              sample.size = nrow(mvad.seq),
                              method = "noise", 
                              dnoise = delta06,
                              seqdist.args = list(method = "LCS"))

# Converting the fuzzy partition to crisp
crispNoiseClust06 <- as.crisp(noiseClust06) 

## ----noiseClust06Seqplot, fig.cap= "Crisp noise clustering in seven groups, lambda = 0.6"----
par(mar = c(2,2,2,2))
# Plotting the crisp typology
seqIplot(mvad.seq,
         group = crispNoiseClust06$clustering$cluster7, 
         main = c("Futher Ed.", "Employment", 
                  "School - Higher Ed.", "Further Ed. - Higher Ed.",
                  "Joblessness", "Training - Employment",
                  "Further Ed. - Employment", "Noise Seq."),
        cex.legend = 0.8)  

## -----------------------------------------------------------------------------
delta1 <- mean(diss) * 1 # Calculating the noise distance delta
delta1

## ----noiseTypo1, message=FALSE, fig.width=8, fig.height=5, time_it=TRUE-------
# Creating a typology with noise
set.seed(1234)
noiseClust <- seqclararange(mvad.seq, 
                            kvals = 2:15,
                            R = 50,
                            sample.size = nrow(mvad.seq),
                            method = "noise", 
                            dnoise = delta1,
                            seqdist.args = list(method = "LCS"))

# Converting the fuzzy partition to crisp
crispNoiseClust <- as.crisp(noiseClust)


## ----noiseClust1Seqplot, fig.cap= "Crisp noise clustering in seven groups, lambda = 1"----

# Plotting the crisp typology
par(mar = c(2,2,2,2))
seqIplot(mvad.seq, 
         group = crispNoiseClust$clustering$cluster7,
         main = c("Futher Ed.", "Employment", 
                  "School - Higher Ed.", "Further Ed. - Higher Ed.",
                  "Joblessness", "Training - Employment",
                  "Further Ed. - Employment", "Noise Seq."),
         cex.legend = 0.8)

## ----echo= FALSE--------------------------------------------------------------
set.seed(1234)
crispClust <- suppressMessages(seqclararange(mvad.seq, # computing
                            kvals = 2:15,
                            R = 50,
                            sample.size = nrow(mvad.seq),
                            method = "crisp", 
                            seqdist.args = list(method = "LCS")))

d2m <- list()
for(i in 1:length(crispClust$clustering)){
  d2mC <- list()
  for(j in 1:length(unique(crispClust$clustering[[i]]))){
    d2mC[[j]] <- diss[crispClust$clustering[[i]] == j, crispClust$clara[[i]]$medoids[[j]]]
  }  
  d2m[[i]] <- unlist(d2mC)
}

d2m <- do.call(cbind,d2m)

colnames(d2m) <- paste0("cluster ", c(2:15))

## ----boxplotD2m, echo = FALSE, fig.cap= "Distance to medoid by number of clusters"----
boxplot(d2m, las = 2)

## ----crispNoiseTypo10, echo=FALSE, message=FALSE, fig.cap= "Crisp noise clustering in ten groups, lambda = 0.8 (delta = 68.39)"----
par(mar = c(2,2,2,2))
seqIplot(mvad.seq, 
         group = crispNoiseClust08$clustering$cluster10,
         cex.legend = 0.6,
         main = c("Medium Futher Ed.", "Long Futher Ed.", # naming the clusters
                  "Training - Employment", "Employment", 
                  "School - Higher Ed.", "Further Ed. - Higher Ed.",
                  "Short Training - Employment","Joblessness", 
                  "Short Further Ed. - Employment", "School - Employment",
                  "Noise Seq."))


## ----ngroupDnoiseCalc, echo= FALSE, message=FALSE-----------------------------
crispClust <- seqclararange(mvad.seq, # computing a crisp clustering
                            kvals = 2:10,
                            R = 50,
                            sample.size = nrow(mvad.seq),
                            method = "crisp", 
                            seqdist.args = list(method = "LCS"))

d2m <- list() # calculating distance to medoids summary 
for(i in 1:length(crispClust$clustering)){
  d2mC <- list()
  for(j in 1:length(unique(crispClust$clustering[[i]]))){
    d2mC[[j]] <- diss[crispClust$clustering[[i]] == j, crispClust$clara[[i]]$medoids[[j]]]
  }  
  d2m[[i]] <- unlist(d2mC)
}

d2m <- apply(do.call(cbind,d2m), 2, FUN = fivenum)

delta10k <-  (delta08 / d2m[5,5]) * d2m[5,9] # deltas as share of the maximal distance to medoid
 set.seed(1234)
noiseClust10k  <- seqclararange(mvad.seq, 
                              kvals = 10,
                              R = 50,
                              sample.size = nrow(mvad.seq),
                              method = "noise", 
                              dnoise = delta10k,
                              seqdist.args = list(method = "LCS"))

crispNoiseClust10k <-  as.crisp(noiseClust10k)

## ----ngroupDnoise, message=FALSE, fig.cap= "Crisp noise clustering in ten groups, delta = 62.57"----
par(mar = c(2,2,2,2))
seqIplot(mvad.seq, 
         group = crispNoiseClust10k$clustering$cluster10,
         cex.legend = 0.6, 
         main = c("Medium Futher Ed.", "Long Futher Ed.", # naming the clusters
                  "Training - Employment", "Employment", 
                  "School - Higher Ed.", "Further Ed. - Higher Ed.",
                  "Short Training - Employment","Joblessness", 
                  "Short Further Ed. - Employment", "School - Employment",
                  "Noise Seq."))