Nothing
## ----include=FALSE------------------------------------------------------------
knitr::knit_hooks$set(time_it = local({
now <- NULL
function(before, options) {
if (before) {
# record the current time before each chunk
now <<- Sys.time()
} else {
# calculate the time difference after a chunk
res <- difftime(Sys.time(), now, units = "secs")
# return a character string to show the time
paste("Time for this code chunk to run:", round(res,
2), "seconds")
}
}
}))
knitr::opts_chunk$set(fig.width = 6, fig.height = 5, fig.dpi = if (knitr::is_latex_output()) 300 else 96)
# knitr::opts_chunk$set(dev = "png", dev.args = list(type = "cairo-png"), time_it=TRUE) # fait boguer les références des fiugres lorsque compilé en pdf
knitr::opts_chunk$set(dev = "png") # format des figures
knitr::opts_chunk$set(cache = FALSE# If FALSE cache is emptied when knitting
, cache.lazy = FALSE) # knitr n'execute pas le code si il a déjà tourné une fois et est resté dans le cache, il faut vider le cache de temps à autre dans le menu déroulant "Knit"
#library(magick)
## ----include=FALSE------------------------------------------------------------
# Setting the random seed for reproducibility
set.seed(1234)
## ----seqdefmvd, message=FALSE-------------------------------------------------
# Loading the package
library(WeightedCluster)
# Loading illustrative data
data(mvad)
# Creating state sequence object
mvad.seq <- seqdef(mvad[, 17:86], # The data containing information on trajectories
labels = c("Employment", "Further Education", # The states
"Higher Education", "Joblessness",
"School", "Training"),
xtstep = 6)
## ----figSeqMvad, message=FALSE, fig.cap='MVAD trajectories, sorted from start'----
# Plotting the squences
seqIplot(mvad.seq,
legend.prop=0.2,
sortv = "from.start") # sequences are ordered by state.
## ----seqdist, message=FALSE, time_it=TRUE-------------------------------------
# Compute LCS dissimilarities
diss <- seqdist(mvad.seq,
method="LCS")
## ----include= FALSE-----------------------------------------------------------
# Set seed for reproducibility
set.seed(1234)
# Generate well-separated, compact clusters
n <- 100
# Cluster 1: Bottom-left, compact
cluster1_compact <- data.frame(
x = rnorm(n, mean = 2, sd = 1),
y = rnorm(n, mean = 2, sd = 1),
group = "Cluster 1"
)
# Cluster 2: Top-right, compact
cluster2_compact <- data.frame(
x = rnorm(n, mean = 8, sd = 1),
y = rnorm(n, mean = 8, sd = 1),
group = "Cluster 2"
)
compact_data <- rbind(cluster1_compact, cluster2_compact)
cluster1_dispersed <- data.frame(
x = rnorm(n, mean = 2, sd = 2),
y = rnorm(n, mean = 2, sd = 2),
group = "Cluster 1"
)
cluster2_dispersed <- data.frame(
x = rnorm(n, mean = 8, sd = 2),
y = rnorm(n, mean = 8, sd = 2),
group = "Cluster 2"
)
dispersed_data <- rbind(cluster1_dispersed, cluster2_dispersed)
## ----figClustStr,echo=FALSE , warning=FALSE, message=FALSE, fig.cap = "Clusterings with the same centers but showing Strong or Weak Structure"----
par(mfrow = c(1, 2))
# Get viridis colors
viridis_colors <- viridis::cividis(2)
# Base R plot for compact clusters
plot(compact_data$x, compact_data$y,
col = ifelse(compact_data$group == "Cluster 1", viridis_colors[1], viridis_colors[2]),
pch = 16, cex = 0.8,
main = "Strong Structure",
xlab = "",
ylab = "",
xaxt='n',
yaxt='n',
ylim = c(-3,13),
xlim = c(-3,13),
cex.main = 1)
points(x=c(2,8), y=c(2,8), col = "grey", pch = 4, lwd = 3)
# Base R plot for dispersed clusters
plot(dispersed_data$x, dispersed_data$y,
col = ifelse(dispersed_data$group == "Cluster 1", viridis_colors[1], viridis_colors[2]),
pch = 16, cex = 0.8,
main = "Weak Structure",
xlab = "",
ylab = "",
xaxt='n',
yaxt='n',
ylim = c(-3,13),
xlim = c(-3,13),
cex.main = 1)
points(x=c(2,8), y=c(2,8), col = "grey", pch = 4, lwd = 3)
## ----consClustTypo, message=FALSE, time_it=TRUE-------------------------------
# Setting up parallel computing
library(future)
plan(multisession)
# Creating the typology
set.seed(1234)
pamWardConsClust <- consClust(diss,
base.clust = c("pam", "ward.D"),
R = 100,
kvals = 2:15,
cons.method = "SE",
membership = "crisp",
k.fixed = TRUE,
agg.method = "cRand",
keep.ensemble = TRUE,
parallel = FALSE,
progressbar = FALSE)
## ----consClustCqi, message=FALSE----------------------------------------------
# Showing CQIs
pamWardConsClust
## ----plotConsClustCqi, message=FALSE, fig.cap="PAM and Ward consensus clustering CQIs (normalized)"----
# Plotting CQIs
par(cex = 0.75)
plot(pamWardConsClust,
legendpos = "topleft",
stat = "all",
norm = "zscore") # CQIs are standarize
## ----include=FALSE------------------------------------------------------------
par(mar = c(2,2,2,2))
## ----consClustSeqplot, message=FALSE, fig.cap= "Crisp consensus Clustering in nine groups (PAM and Ward)"----
# Plotting the consensus typology in nine groups
par(mar = c(2,2,2,2))
seqIplot(mvad.seq,
group = pamWardConsClust$clustering$cluster9, # Specitifing the cluster to use for plotting
main = c("Further Ed. - Higher Ed.", "Joblessness", # naming the clusters in plot
"Training - Employment", "Training",
"School - Higher Ed.", "Further Ed. - Employment",
"Employment", "School - Employment",
"Futher Ed."),
cex.legend = 0.8)
## ----consClustTypoFuzzy, message=FALSE, time_it=TRUE--------------------------
# Creating the typology
set.seed(1234)
pamWardConsClustF <- consClust(diss,
base.clust = c("pam", "ward.D"),
R = 100,
kvals = 2:15,
cons.method = "SE",
membership = "fuzzy",
k.fixed = TRUE,
agg.method = "cRand",
keep.ensemble = TRUE,
progressbar = FALSE)
## ----plotConsFuzzy, message=FALSE, fig.cap="Fuzzy consensus Clustering in nine groups (PAM and Ward), sorted by membership probability"----
par(mar = c(2,2,2,2))
fuzzyseqplot(mvad.seq, # sequences to plot
group = pamWardConsClustF$clustering$cluster9, # grouping variable
main = c("Further Ed. - Higher Ed.", "Joblessness",# naming the clusters
"Training - Employment", "Training",
"School - Higher Ed.", "Further Ed. - Employment",
"Employment", "School - Employment",
"Futher Ed."),
membership.threshold = 0.4,
sortv = "membership",
type = "I", # We plot an index plot
cex.legend = 0.8)
## -----------------------------------------------------------------------------
delta08 <- mean(diss) * 0.8 # Calculating the noise distance delta
delta08
## ----noiseTypo, message=FALSE, time_it=TRUE-----------------------------------
# Creating a typology with noise
set.seed(1234)
noiseClust08 <- seqclararange(mvad.seq,
kvals = 2:15,
R = 50, # Number of subsamples
sample.size = nrow(mvad.seq),
method = "noise",
dnoise = delta08, # noise sensitivity
seqdist.args = list(method = "LCS"))
## ----noiseClustCqi, message=FALSE---------------------------------------------
# Showing CQIs
noiseClust08
## ----plotNoiseClustCqi, message=FALSE, fig.cap="Fuzzy noise clustering CQIs, lambda = 0.8"----
# Plotting CQIs
plot(noiseClust08,
legendpos = "topleft")
## ----include=FALSE------------------------------------------------------------
par(mar = c(2,2,2,2))
## ----noiseClustSeqplot, fig.cap="Fuzzy noise clustering in seven groups, lambda = 0.8, sorted by membership probability"----
## Displaying the resulting clustering with membership threshold of 0.20
par(mar = c(2,2,2,2))
fuzzyseqplot(mvad.seq,
group = noiseClust08$clustering$cluster7,
main = c("Futher Ed.", "Employment", # naming the clusters
"School - Higher Ed.", "Further Ed. - Higher Ed.",
"Joblessness", "Training - Employment",
"Further Ed. - Employment", "Noise Seq."),
membership.threshold = 0.20,
type = "I", # We plot an index plot
sortv = "membership",
cex.legend = 0.8)
## ----crispNoiseTypo, message=FALSE, fig.width=8, fig.height=5-----------------
crispNoiseClust08 <- as.crisp(noiseClust08)
## ----crispNoiseClustSeqplot, fig.cap="Crisp noise clustering in seven groups, lambda = 0.8"----
par(mar = c(2,2,2,2))
seqIplot(mvad.seq,
group = crispNoiseClust08$clustering$cluster7,
main = c("Futher Ed.", "Employment", # naming the clusters
"School - Higher Ed.", "Further Ed. - Higher Ed.",
"Joblessness", "Training - Employment",
"Further Ed. - Employment", "Noise Seq."),
cex.legend = 0.8)
## ----nise06, message=FALSE----------------------------------------------------
# Calculating the noise distance delta
delta06 <- mean(diss) * 0.6
delta06
## ----include=FALSE------------------------------------------------------------
par(mar = c(2,2,2,2))
## ----noiseTypo06, message=FALSE, time_it=TRUE---------------------------------
# Creating a typology with noise
set.seed(1234)
noiseClust06 <- seqclararange(mvad.seq,
kvals = 2:15,
R = 50,
sample.size = nrow(mvad.seq),
method = "noise",
dnoise = delta06,
seqdist.args = list(method = "LCS"))
# Converting the fuzzy partition to crisp
crispNoiseClust06 <- as.crisp(noiseClust06)
## ----noiseClust06Seqplot, fig.cap= "Crisp noise clustering in seven groups, lambda = 0.6"----
par(mar = c(2,2,2,2))
# Plotting the crisp typology
seqIplot(mvad.seq,
group = crispNoiseClust06$clustering$cluster7,
main = c("Futher Ed.", "Employment",
"School - Higher Ed.", "Further Ed. - Higher Ed.",
"Joblessness", "Training - Employment",
"Further Ed. - Employment", "Noise Seq."),
cex.legend = 0.8)
## -----------------------------------------------------------------------------
delta1 <- mean(diss) * 1 # Calculating the noise distance delta
delta1
## ----noiseTypo1, message=FALSE, fig.width=8, fig.height=5, time_it=TRUE-------
# Creating a typology with noise
set.seed(1234)
noiseClust <- seqclararange(mvad.seq,
kvals = 2:15,
R = 50,
sample.size = nrow(mvad.seq),
method = "noise",
dnoise = delta1,
seqdist.args = list(method = "LCS"))
# Converting the fuzzy partition to crisp
crispNoiseClust <- as.crisp(noiseClust)
## ----noiseClust1Seqplot, fig.cap= "Crisp noise clustering in seven groups, lambda = 1"----
# Plotting the crisp typology
par(mar = c(2,2,2,2))
seqIplot(mvad.seq,
group = crispNoiseClust$clustering$cluster7,
main = c("Futher Ed.", "Employment",
"School - Higher Ed.", "Further Ed. - Higher Ed.",
"Joblessness", "Training - Employment",
"Further Ed. - Employment", "Noise Seq."),
cex.legend = 0.8)
## ----echo= FALSE--------------------------------------------------------------
set.seed(1234)
crispClust <- suppressMessages(seqclararange(mvad.seq, # computing
kvals = 2:15,
R = 50,
sample.size = nrow(mvad.seq),
method = "crisp",
seqdist.args = list(method = "LCS")))
d2m <- list()
for(i in 1:length(crispClust$clustering)){
d2mC <- list()
for(j in 1:length(unique(crispClust$clustering[[i]]))){
d2mC[[j]] <- diss[crispClust$clustering[[i]] == j, crispClust$clara[[i]]$medoids[[j]]]
}
d2m[[i]] <- unlist(d2mC)
}
d2m <- do.call(cbind,d2m)
colnames(d2m) <- paste0("cluster ", c(2:15))
## ----boxplotD2m, echo = FALSE, fig.cap= "Distance to medoid by number of clusters"----
boxplot(d2m, las = 2)
## ----crispNoiseTypo10, echo=FALSE, message=FALSE, fig.cap= "Crisp noise clustering in ten groups, lambda = 0.8 (delta = 68.39)"----
par(mar = c(2,2,2,2))
seqIplot(mvad.seq,
group = crispNoiseClust08$clustering$cluster10,
cex.legend = 0.6,
main = c("Medium Futher Ed.", "Long Futher Ed.", # naming the clusters
"Training - Employment", "Employment",
"School - Higher Ed.", "Further Ed. - Higher Ed.",
"Short Training - Employment","Joblessness",
"Short Further Ed. - Employment", "School - Employment",
"Noise Seq."))
## ----ngroupDnoiseCalc, echo= FALSE, message=FALSE-----------------------------
crispClust <- seqclararange(mvad.seq, # computing a crisp clustering
kvals = 2:10,
R = 50,
sample.size = nrow(mvad.seq),
method = "crisp",
seqdist.args = list(method = "LCS"))
d2m <- list() # calculating distance to medoids summary
for(i in 1:length(crispClust$clustering)){
d2mC <- list()
for(j in 1:length(unique(crispClust$clustering[[i]]))){
d2mC[[j]] <- diss[crispClust$clustering[[i]] == j, crispClust$clara[[i]]$medoids[[j]]]
}
d2m[[i]] <- unlist(d2mC)
}
d2m <- apply(do.call(cbind,d2m), 2, FUN = fivenum)
delta10k <- (delta08 / d2m[5,5]) * d2m[5,9] # deltas as share of the maximal distance to medoid
set.seed(1234)
noiseClust10k <- seqclararange(mvad.seq,
kvals = 10,
R = 50,
sample.size = nrow(mvad.seq),
method = "noise",
dnoise = delta10k,
seqdist.args = list(method = "LCS"))
crispNoiseClust10k <- as.crisp(noiseClust10k)
## ----ngroupDnoise, message=FALSE, fig.cap= "Crisp noise clustering in ten groups, delta = 62.57"----
par(mar = c(2,2,2,2))
seqIplot(mvad.seq,
group = crispNoiseClust10k$clustering$cluster10,
cex.legend = 0.6,
main = c("Medium Futher Ed.", "Long Futher Ed.", # naming the clusters
"Training - Employment", "Employment",
"School - Higher Ed.", "Further Ed. - Higher Ed.",
"Short Training - Employment","Joblessness",
"Short Further Ed. - Employment", "School - Employment",
"Noise Seq."))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.