RARCAT.R
In WeightedCluster: Clustering of Weighted Data

## ----setup, include=FALSE-----------------------------------------------------
knitr::knit_hooks$set(time_it = local({
  now <- NULL
  function(before, options) {
    if (before) {
      # record the current time before each chunk
      now <<- Sys.time()
    } else {
      # calculate the time difference after a chunk
      res <- difftime(Sys.time(), now, units = "secs")
      # return a character string to show the time
      paste("Time for this code chunk to run:", round(res,
        2), "seconds")
    }
  }
}))
knitr::opts_chunk$set(dev = "png", dev.args = list(type = "cairo-png"), time_it=TRUE)

## ----message=FALSE------------------------------------------------------------
set.seed(1)

## ----message=FALSE------------------------------------------------------------
## Loading the TraMineR library
library(TraMineR)
## Loading the data
data(mvad)

## State properties
mvad.alphabet <- c("employment", "FE", "HE", "joblessness", "school", "training")
mvad.lab <- c("employment", "further education", "higher education", "joblessness", "school", "training")
mvad.shortlab <- c("EM","FE","HE","JL","SC","TR")

## Creating the state sequence object
mvad.seq <- seqdef(mvad, 17:86, alphabet = mvad.alphabet, states = mvad.shortlab, 
                   labels = mvad.lab, xtstep = 6)

## ----message=FALSE------------------------------------------------------------
## Using fastcluster for hierarchical clustering
library(fastcluster)
## Distance computation
diss <- seqdist(mvad.seq, method="LCS")
## Hierarchical clustering
hc <- hclust(as.dist(diss), method="ward.D")

## ----message=FALSE------------------------------------------------------------
# Loading the WeightedCluster library
library(WeightedCluster)
# Computing cluster quality measures.
clustqual <- as.clustrange(hc, diss=diss, ncluster=10)
clustqual

## ----fig.width=8, fig.height=10-----------------------------------------------
seqdplot(mvad.seq, group=clustqual$clustering$cluster6, border=NA)

## ----message=FALSE------------------------------------------------------------
# Add the clustering solution to the dataset
mvad$clustering <- clustqual$clustering$cluster6
# Formula for the association between clustering and covariates of interest
formula <- clustering ~ funemp + gcse5eq
# Run separate logistic regressions for each cluster and compute the corresponding AMEs
# with their confidence interval
rarcatout <- rarcat(formula, data=mvad, diss=diss, robust=FALSE)
rarcatout$original.analysis

## ----message=FALSE------------------------------------------------------------
# Evaluate the validity of the original analysis and the reliability of its
# findings by applying RARCAT with 50 bootstrap replications.
# As in the original analysis, hierarchical clustering with Ward method is implemented.
# The number of clusters is fixed to 6 here.
rarcatout <- rarcat(formula, data = mvad, diss = diss, robust = TRUE, B = 50, 
                    algo = "hierarchical", method = "ward.D", 
                    fixed = TRUE, kcluster = 6)
rarcatout$robust.analysis

## ----message=FALSE------------------------------------------------------------
# # Loading the parallel library
# library(parallel)
# # Use available cores minus one
# ncpus <- detectCores() - 1
# # Create the parallel cluster
# cl <- makeCluster(ncpus)
# 
# # Parallel run
# rarcatout <- rarcat(formula, data = mvad, diss = diss, robust = TRUE, B = 500,
#                     algo = "hierarchical", method = "ward.D",
#                     fixed = TRUE, kcluster = 6,
#                     parallel = "snow", ncpus = ncpus, cl = cl)
# rarcatout$robust.analysis

## ----fig.width=8, fig.height=4------------------------------------------------
seqdplot(mvad.seq, group=clustqual$clustering$cluster2, border=NA)

## ----message=FALSE------------------------------------------------------------
# Loading the margins library for marginal effects estimation
library(margins)
# Create cluster membership variable
mvad$clustering <- clustqual$clustering$cluster2
mvad$membership <- mvad$clustering == 2
# Run logistic regression model
mod <- glm(membership ~ funemp, mvad, family = "binomial")
# Model results (AME)
summary(margins(mod))

## ----message=FALSE------------------------------------------------------------
# Loading the fpc library
library(fpc)
# Cluster-wise stability assessment by bootstrap
stab <- clusterboot(diss, B = 500, distances = TRUE, clustermethod = disthclustCBI, 
                    method = "ward.D", k = 2, count = FALSE)
round(stab$bootmean, 2)

## ----message=FALSE------------------------------------------------------------
# Bootstrap replicates of the typology and its association with the variable funemp.
# As in the original analysis, hierarchical clustering with Ward method is implemented.
# Also, an optimal clustering solution with n between 2 and 10 is evaluated each time by
# maximizing the CH index.
# bootout <- regressboot(clustering ~ funemp, data = mvad, diss = diss, B = 500,
#                        algo = "hierarchical", method = "ward.D",
#                        fixed = FALSE, kcluster = 10, cqi = "CH",
#                        parallel = "snow", ncpus = ncpus, cl = cl)
# Without parallel computing
bootout <- regressboot(clustering ~ funemp, data = mvad, diss = diss, B = 50,
                       algo = "hierarchical", method = "ward.D",
                       fixed = FALSE, kcluster = 10, cqi = "CH")
table(bootout$optimal.kcluster)

## ----message=FALSE------------------------------------------------------------
# The association with father unemployment status as it appears in the regression output
bootout$covar.name
# The AMEs for the association between father unemployment status and the original clustering
round(bootout$original.ame$funempyes, 4)

## ----fig.width=8, fig.height=5------------------------------------------------
# Histogram of the estimated AMEs for all individuals and all bootstraps
hist(bootout$bootstrap.ame$funempyes, main = NULL, xlab = "AME")
# Histogram for the individuals in the second cluster (high education trajectories)
clustering <- clustqual$clustering$cluster2
hist(bootout$bootstrap.ame$funempyes[clustering == 2,], main = NULL, xlab = "AME")
# Histogram for the individuals in the first cluster (the rest)
hist(bootout$bootstrap.ame$funempyes[clustering == 1,], main = NULL, xlab = "AME")

## ----message=FALSE------------------------------------------------------------
# Robustness assessment for the association between father unemployment status
# and membership to the higher education trajectory group
result <- bootpool(bootout, clustering = mvad$clustering, clusnb = 2, 
                   covar = "funempyes")
round(result$pooled.ame, 4)
round(result$standard.error, 4)
round(result$bootstrap.stddev, 4)

## ----fig.width=8, fig.height=5------------------------------------------------
# Histogram of the fitted random effects for individuals in the second cluster
hist(result$individual.ranef, main = NULL, breaks = 20,
     xlab = "Individual-specific random effects")

## ----fig.width=8, fig.height=4------------------------------------------------
# Most frequent trajectories in the second cluster, separated by their 
# random effects estimated in RARCAT
seqfplot(mvad.seq[clustering == 2,], 
         group=abs(result$individual.ranef) > result$individual.stddev, 
         border=NA, main="Outlier")

## ----fig.width=8, fig.height=10-----------------------------------------------
seqdplot(mvad.seq, group=clustqual$clustering$cluster6, border=NA)

## ----message=FALSE------------------------------------------------------------
# Create cluster membership variable
mvad$clustering <- clustqual$clustering$cluster6
mvad$membership <- mvad$clustering == 6
# Run logistic regression model
mod <- glm(clustering ~ funemp, mvad, family = "binomial")
# Model results
summary(margins(mod))

## ----message=FALSE------------------------------------------------------------
# Evaluate the validity of the original analysis and the reliability of its
# findings by applying RARCAT with all given covariates and constructed types
# rarcat <- rarcat(clustering ~ funemp, mvad, diss = diss, B = 500,
#                  algo = "hierarchical", method = "ward.D",
#                  fixed = TRUE, kcluster = 6,
#                  parallel = "snow", ncpus = ncpus, cl = cl)
rarcat <- rarcat(clustering ~ funemp, mvad, diss = diss, B = 50,
                 algo = "hierarchical", method = "ward.D",
                 fixed = TRUE, kcluster = 6)
rarcat$original.analysis
rarcat$robust.analysis
# Stop the parallel cluster after computation
#stopCluster(cl)