Nothing
seqclararange <- function(seqdata, R = 100, sample.size = 40 + 2*max(kvals), kvals = 2:10,
seqdist.args = list(method = "LCS"), method=c("crisp", "fuzzy", "representativeness"), m=1.5,
criteria = c("distance"), stability=FALSE,
parallel=FALSE, progressbar = FALSE, keep.diss=FALSE, max.dist = NULL){
## Setting up and checks
message(" [>] Starting generalized CLARA for sequence analysis.")
if (!inherits(seqdata, "stslist")) {
stop(" [!] seqdata should be a sequence object, see seqdef function to create one")
}
if(max(kvals) > sample.size){
stop(" [!] More clusters than the size of the sample requested.")
}
allmethods <- c("crisp", "fuzzy", "representativeness")
methindex <- pmatch(method[1], allmethods)
if(is.na(methindex)){
stop(" [!] Unknow method ", method, ". Please specify one of the following: ", paste(allmethods, collapse = ", "))
}
method <- allmethods[methindex]
if(method=="representativeness" && is.null(max.dist)){
stop(" [!] You need to set max.dist to the theoretical maximum distance value when using representativeness method")
}
allcriteria <- c("distance", "db", "xb", "pbm", "ams")
critindex <- pmatch(criteria, allcriteria)
if(any(is.na(critindex))){
stop(" [!] At least one unkown criteria among ", paste(criteria, collapse = ", "), ". Please specify at leat one among the following: ", paste(allcriteria, collapse = ", "))
}
criteria <- allcriteria[critindex]
message(" [>] Using ", method, " clustering optimizing the following criterion: ", paste(criteria, collapse = ", "), ".")
## FIXME Add coherance check between method and criteria
start.time <- Sys.time() #debut du processus
# message(" [>] Overall memory requirement estimation...", appendLF=FALSE)
# objsize <- 2*object.size(seqdata)+ R*() ## no. of column * no. of rows * 8
## Aggregation
n <- nrow(seqdata)
message(" [>] Aggregating ", n, " sequences...", appendLF=FALSE)
ac <- wcAggregateCases(seqdata)
agseqdata <- seqdata[ac$aggIndex, ]
ac$probs <- ac$aggWeights/n
message(" OK (", length(ac$aggWeights)," unique cases).")
## Setting up parallel computing
if(parallel){
message(" [>] Initializing parallel processing...", appendLF=FALSE)
oplan <- plan(multisession)
on.exit(plan(oplan), add=TRUE)
message(" OK.")
}
if(progressbar){
if (requireNamespace("progress", quietly = TRUE)) {
old_handlers <- handlers(handler_progress(format = "(:spin) [:bar] :percent | Elapsed: :elapsed | ETA: :eta | :message"))
if(!is.null(old_handlers)){
on.exit(handlers(old_handlers), add = TRUE)
}
}else{
message(" [>] Install the progress package to see estimated remaining time.")
}
oldglobal <- handlers(global=TRUE)
if(!is.null(oldglobal)){
on.exit(handlers(global=oldglobal), add = TRUE)
}
}
p <- progressor(R)
## Memory clean up before parallel computing
gc()
message(" [>] Starting iterations...\n")
## Launch parallel loop
#calc_pam <- foreach(loop=1:R, .export=c("davies_bouldin_internal"), .packages = c('TraMineR', 'cluster', 'WeightedCluster', 'fastcluster')) %dofuture%{#on stocke chaque sample
calc_pam <- foreach(loop=1:R, .options.future = list(seed = TRUE, globals = structure(TRUE, add = c("sample.size", "seqdist.args", "m"))) ) %dofuture%{#on stocke chaque sample
ltime <- Sys.time()
mysample <- sample.int(nrow(agseqdata), size=sample.size, prob=ac$probs, replace=TRUE)
##Re-aggregate!
ac2 <- wcAggregateCases(data.frame(id=mysample))
data_subset <- agseqdata[mysample[ac2$aggIndex], ]
## Compute distances
seqdist.args$refseq <- NULL ## Avoid dependencies between loop
seqdist.args$seqdata <- data_subset
suppressMessages(diss <- do.call(seqdist, seqdist.args))
hc <- fastcluster::hclust(as.dist(diss), method="ward.D", members=ac2$aggWeights)
## For each number of clusters
allclust <- list()
for(k in seq_along(kvals)){
if(method=="fuzzy"){
## Weighted FCMdd clustering on subsample
memb = as.memb(cutree(hc, k = kvals[k]))
fanny <- fanny(diss, kvals[k], diss=TRUE, memb.exp=m, iniMem.p=memb, tol=0.00001)
clustering <- wfcmdd(diss, memb = fanny$membership, weights=ac2$aggWeights, method="FCMdd", m=m) #FCMdd algo sur la matrice de distance
rm(fanny)
##Retrieve medoids
medoids <- mysample[ac2$aggIndex[clustering$mobileCenters]] ## Going back to overall dataset
}else{
## Weighted Pam clustering on subsample
clustering <- wcKMedoids(diss, k=kvals[k], cluster.only=TRUE, initialclust=hc, weights=ac2$aggWeights) #PAM sur la matrice de distance
##Retrieve medoids
medoids <- mysample[ac2$aggIndex[unique(clustering)]] ## Going back to overall dataset
}
rm(clustering)
##Compute distances between all sequence to the medoids
distargs2 <- seqdist.args
distargs2$seqdata <- agseqdata
distargs2$refseq <- list(1:nrow(agseqdata), medoids)
suppressMessages(diss2 <- do.call(seqdist, distargs2))
##Compute two minimal distances are used for silhouette width
## and other criterions
alphabeta <- apply(diss2, 1, function(x) sort(x)[1:2])
#alpha <- alphabeta[1, ]
#beta <- alphabeta[2, ]
sil <- ((alphabeta[2, ]-alphabeta[1, ])/pmax(alphabeta[2, ], alphabeta[1, ]))
if(method=="fuzzy"){
##Allocate to clusters using FCM formulae
memb <- (1/diss2)^(1/(m-1))
memb <- memb/rowSums(memb)
memb[diss2==0] <- 1
##Compute criterion (FCMdd Formulae)
## mean_diss <- sum(rowSums((memb^m)*diss2)*ac$aggWeights)
mean_diss <- sum(rowSums((memb^m)*diss2)*ac$probs)
db <- fuzzy_davies_bouldin_internal(diss2, memb, medoids, weights=ac$aggWeights)$db
alpha <- 1
hightest.memb <- apply(memb, 1, function(x){
y <- sort(x, decreasing = TRUE)[1:2]
return((y[1]-y[2])**alpha)
})
pbm <- ((1/length(medoids)) *(max(diss2[medoids, ])/sum(rowSums((memb)*diss2)*ac$probs)))^2
ams <- sum(hightest.memb*sil*ac$probs)/sum(hightest.memb*ac$probs)
rm(hightest.memb)
}else{
##Allocate to clusters
memb <- apply(diss2, 1, which.min)
mean_diss <- sum(alphabeta[1, ]*ac$probs)
db <- davies_bouldin_internal(diss2, memb, medoids, weights=ac$aggWeights)$db
pbm <- ((1/length(medoids)) *(max(diss2[medoids, ])/mean_diss))^2
ams <- sum(sil*ac$probs)
}
distmed <- as.dist(diss2[medoids, ])
minsep <- min(distmed)
maxsep <- max(distmed)
xb <- mean_diss/minsep
## Memory cleanup
rm(alphabeta, sil, diss2)
rm(distmed, minsep, maxsep)
## Store results
## Do not disagg Now to save memory
allclust[[k]] <- list(mean_diss=mean_diss, db=db, pbm=pbm, ams=ams, xb=xb, clustering= memb, medoids=medoids)
}
##Store the clustering of this iteration to stratify the next one
#previousclusteringstrata <- allclust[[length(kvals)]]$clustering
rm(diss)
gc()
#p(message=paste("Iteration ", loop, " finished. Objective=", round(mean_diss, 2), ". Time=", format(round(Sys.time()-ltime, 2)), ".\n"))
p()
#Return all clusterings
allclust
}
## Stop parallel computing
message("\n [>] Aggregating iterations for each k values...")
flush.console()
## Function to access data from parallel computing
reframeData <- function(ll, name, clust, format="vector"){
xx <- lapply(ll, FUN=function(x)x[[clust]][[name]])
if(format=="list"){
return(xx)
}else if(format=="matrix"){
return(do.call(cbind, xx))
}
return(do.call(c, xx))
}
kvalscriteria <- expand.grid(kvals=seq_along(kvals), criteria=criteria)
## Object to be returned, should match clustrange structure
p <- progressor(nrow(kvalscriteria)*R)
kret <- list()
for(i in 1:nrow(kvalscriteria)){
k <- kvalscriteria$kvals[i]
criteria <- as.character(kvalscriteria$criteria[i])
## Objective criteria
mean_all_diss <- reframeData(calc_pam, "mean_diss", k, "vector")
pbm_all <- reframeData(calc_pam, "pbm", k, "vector")
db_all <- reframeData(calc_pam, "db", k, "vector")
xb_all <- reframeData(calc_pam, "xb", k, "vector")
ams_all <- reframeData(calc_pam, "ams", k, "vector")
##Retrieve all clusterings
clustering_all_diss <- reframeData(calc_pam, "clustering",k, ifelse(method=="fuzzy", "list", "matrix"))
##Retrieve medoids
med_all_diss <- reframeData(calc_pam, "medoids", k, "matrix")
##Find best clustering
objective <- switch(criteria, distance=mean_all_diss, pbm=pbm_all, db=db_all, ams=ams_all, xb=xb_all)
best <- ifelse(criteria %in% c("ams", "pbm"), which.max(objective), which.min(objective))
## Compute clustering stability of the best partition
# ari <- sapply(1:ncol(clustering_all_diss), function(x){
# tab <- xtabs(ac$aggWeights~clustering_all_diss[, best]+clustering_all_diss[, x])
# adjustedRandIndex(tab)
# })
if(stability){
if(method=="fuzzy"){
foplan <- plan(sequential)
}
arilist <- foreach(j=1:R, .options.future = list(seed = TRUE, globals = structure(TRUE, add = c("ac", "clustering_all_diss", "method"))) ) %dofuture%{#on stocke chaque sample
if(method=="fuzzy"){
tab <- as.table(crossmemb(clustering_all_diss[[j]]*ac$aggWeights, clustering_all_diss[[best]]*ac$aggWeights, relativize = FALSE))
}
else{
tab <- as.table(tapply(ac$aggWeights, list(clustering_all_diss[, j], clustering_all_diss[, best]), sum, default=0L))
}
val <- c(adjustedRandIndex(tab), jaccardCoef(tab))
p()
val
}
if(method=="fuzzy"){
plan(foplan)
}
arimatrix <- do.call(rbind, arilist)
colnames(arimatrix) <- c("ARI", "JC")
ari08 <- sum(arimatrix[,1] >= 0.8)
jc08 <- sum(arimatrix[,2] >= 0.8)
}else{
arimatrix <- NA
ari08 <- NA
jc08 <- NA
}
if(method=="fuzzy"){
disagclust <- clustering_all_diss[[best]][ac$disaggIndex, ] ##Disaggregate here
}else{
disagclust <- clustering_all_diss[ac$disaggIndex, best] ##Disaggregate here
}
if(criteria %in% c("ams", "pbm")){
evol.diss <- cummax(objective)
}else{
evol.diss <- cummin(objective)
}
##Store the best solution and evaluations of the others.
bestcluster <- list(
medoids = ac$aggIndex[med_all_diss[, best]], ##Disaggregate here
#clustering = clustering_all_diss[, best],
clustering = disagclust, ##Disaggregate here
evol.diss =evol.diss,
iter.objective=objective,
objective=objective[best],
iteration=best,
#ari=ari,
#iter.ari09=iter.ari,
arimatrix=arimatrix,
criteria = criteria,
method=method,
avg_dist=mean_all_diss[best],
pbm=pbm_all[best],
db=db_all[best],
xb=xb_all[best],
ams=ams_all[best],
ari08=ari08,
jc08=jc08,
R=R,
k=k
)#Creating the object to be returned
####Compute cluster quality David Bouldin if asked for
if(keep.diss || method =="representativeness"){
## Recompute distances (required to avoid memory issue)
seqdist.args$seqdata <- agseqdata
seqdist.args$refseq <- list(1:nrow(agseqdata), ac$disaggIndex[bestcluster$medoids])
suppressMessages(diss2 <- do.call(seqdist, seqdist.args))
bestcluster$diss <- diss2[ac$disaggIndex, ]
bestcluster$representativeness <- 1-bestcluster$diss/max.dist
rm(diss2)
}
## Store computed cluster quality
gc()
class(bestcluster) <- "seqclara"
kresult <- list(k=k, criteria=criteria, stats=c(bestcluster$avg_dist, bestcluster$pbm, bestcluster$db, bestcluster$xb, bestcluster$ams, bestcluster$ari08, bestcluster$jc08, best), bestcluster=bestcluster)
kret[[i]] <- kresult
}
claraObj <- function(kretlines){
if(method=="crisp"){
clustering <- matrix(-1, nrow=nrow(seqdata), ncol=length(kvals))
clustering <- as.data.frame(clustering)
colnames(clustering) <- paste("cluster", kvals, sep="")
} else{
clustering <- list()
}
ret <- list(kvals=kvals,
clara=list(),
clustering =clustering,
stats =matrix(-1, nrow=length(kvals), ncol=8))
for(i in kretlines){
k <- kret[[i]]$k
criteria <- kret[[i]]$criteria
ret$stats[k, ] <- kret[[i]]$stats
ret$clara[[k]] <- kret[[i]]$bestcluster
if(method=="crisp"){
ret$clustering[,k] <- kret[[i]]$bestcluster$clustering
}else if(method=="representativeness"){
ret$clustering[[paste0("cluster", kvals[k])]] <- kret[[i]]$bestcluster$representativeness
}else{
ret$clustering[[paste0("cluster", kvals[k])]] <- kret[[i]]$bestcluster$clustering
}
}
rownames(ret$stats) <- paste("cluster", kvals, sep="")
colnames(ret$stats) <- c("Avg dist", "PBM", "DB", "XB", "AMS", "ARI>0.8", "JC>0.8", "Best iter")
class(ret) <- c("seqclararange", "clustrange")
return(ret)
}
if(length(criteria)>1){
ret <- list(param=list(criteria=criteria, pam.combine=FALSE, all.criterias= criteria, kvals=kvals, method=method, stability=stability))
for(meth in criteria){
ret[[meth]] <- claraObj(which(kvalscriteria$criteria==meth))
}
allstats <- list()
for(meth in criteria){
allstats[[meth]] <- as.data.frame(cbind(ret[[meth]]$stats, ngroup=kvals))
allstats[[meth]]$criteria <- meth
}
ret$allstats <- do.call(rbind, allstats)
class(ret) <- c("seqclarafamily", "clustrangefamily")
}else{
ret <- claraObj(1:nrow(kvalscriteria))
}
## Overall memory management
gc()
message(" \n [>] Overall computation time : ", format(round(Sys.time()-start.time, 2)))
return(ret)
}
#mysample <- strataSampling(precluststrata, sample.size=sample.size, prob=ac$probs, randomize=0)
adjustedRandIndex <- function(x, y=NULL){
if(!is.table(x)){
x <- as.vector(x)
y <- as.vector(y)
if (length(x) != length(y))
stop("arguments must be vectors of the same length")
tab <- table(x, y)
}else{
tab <- x
}
if (all(dim(tab) == c(1, 1)))
return(1)
a <- sum(choose(tab, 2))
b <- sum(choose(rowSums(tab), 2)) - a
c <- sum(choose(colSums(tab), 2)) - a
d <- choose(sum(tab), 2) - a - b - c
ARI <- (a - (a + b) * (a + c)/(a + b + c + d))/((a + b +
a + c)/2 - (a + b) * (a + c)/(a + b + c + d))
return(ARI)
}
jaccardCoef <- function(tab){
if (all(dim(tab) == c(1, 1)))
return(1)
n11 <- sum(tab^2)
n01 <- sum(colSums(tab)^2)
n10 <- sum(rowSums(tab)^2)
return(n11/(n01+n10-n11))
}
daviesBouldinIndex <- function(seqclara, seqdata, seqdist.args = list(method = "LCS"), p=1){
ret <- numeric(length(seqclara$kvals))
for(k in seq_along(seqclara$kvals)){
## Objective criteria
seqdist.args$seqdata <- seqdata
seqdist.args$refseq <- list(1:nrow(seqdata), seqclara$clara[[k]]$medoids)
suppressMessages(diss2 <- do.call(seqdist, seqdist.args))
db <- davies_bouldin_internal(diss2, seqclara$clara[[k]]$clustering, seqclara$clara[[k]]$medoids)$db
rm(diss2)
ret[k] <- db
}
return(ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.