Nothing
#' Functional clustering
#'
#' Functional clustering based on a specified measure.
#' The options of the measures can be found in \code{\link{central_region}}.
#'
#'
#' Functional clustering joins the list of \code{curve_set} objects in one \code{curve_set} with long functions and
#' applies on the differences of all functions the specified measure. This provides a dissimilarity matrix
#' which is used in partitioning around medoids procedure. The resulting clusters can then be shown by plotting
#' the function respectively for each \code{curve_set}. Thus for each \code{curve_set}, the panel with all the medoids
#' is shown followed by all clusters represented by central region, medoid and all curves belonging to it, when
#' the result object is plotted.
#'
#' If there are less than three curves in some of the groups, then the central region is not plotted.
#' This leads to a warning message from ggplot2.
#'
#' @param curve_sets A \code{curve_set} object or a list of \code{curve_set} objects to which
#' the functional clustering is to be applied. If list of \code{curve_set} objects is provided,
#' then the joined functional clustering is applied, which provides an equal weight combination
#' of \code{curve_set} objects, if the \code{curve_set} objects contain the same numbers of elements
#' (same lengths of vector \eqn{r}{r}).
#' @param k The number of clusters.
#' @param type The measure which is used to compute the dissimilarity matrix. The preferred options
#' are \code{"area"} and \code{"st"}, but \code{"erl"} and \code{"cont"} can be also used with caution.
#' @param triangineq Logical. Whether or not to compute the proportion of combinations
#' of functions which satisfies the triangular inequality, see 'Value'.
#' @param ... Additional parameters to be passed to \code{\link{central_region}},
#' which is responsible for calculating the central region (global envelope)
#' on which the functional clustering is based.
#'
#' @return An object having the class \code{fclust}, containing
#' \itemize{
#' \item curve_sets = The set(s) of functions determined for clustering
#' \item k = Number of clusters
#' \item type = Type of clustering method
#' \item triangineq = The proportion of combinations of functions which satisfies the triangular inequality.
#' The triangular inequality must hold to ensure the chosen measure forms a metric. In some weird cases
#' it does not hold for ‘area’ measure, therefore this check is provided to ensure the data forms metric
#' with the ‘area’ measure. The triangineq must be 1 to ensure the inequality holds for all functions.
#' \item dis = The joined dissimilarity matrix
#' \item pam = Results of the partitioning around medoids (pam) method applied on the joined functions
#' with the dissimilarity matrix (dis). See \code{\link[cluster]{pam}}.
#' }
#' @references
#' Dai, W., Athanasiadis, S., Mrkvička, T. (2021) A new functional clustering method with combined dissimilarity sources and graphical interpretation. Intech open, London, UK. DOI: 10.5772/intechopen.100124
#'
#' @export
#' @seealso \code{\link{central_region}}, \code{\link{plot.fclust}}
#' @examples
#' # Read raw data from population growth rdata
#' # with countries over million inhabitants
#' data("popgrowthmillion")
#' \dontshow{
#' popgrowthmillion <- popgrowthmillion[1:10, 1:50]
#' }
#' # Create centred data
#' m <- apply(popgrowthmillion, 2, mean) # Country-wise means
#' cpopgrowthmillion <- popgrowthmillion
#' for(i in 1:dim(popgrowthmillion)[1]) {
#' cpopgrowthmillion[i,] <- popgrowthmillion[i,] - m
#' }
#'
#' # Create scaled data
#' t2 <- function(v) { sqrt(sum(v^2)) }
#' s <- apply(cpopgrowthmillion, 2, t2)
#' spopgrowthmillion <- popgrowthmillion
#' for(i in 1:dim(popgrowthmillion)[1]) {
#' spopgrowthmillion[i,] <- cpopgrowthmillion[i,]/s
#' }
#'
#' # Create curve sets
#' r <- 1951:2015
#' \dontshow{
#' r <- r[1:10]
#' }
#' cset1 <- curve_set(r = r, obs = popgrowthmillion)
#' cset2 <- curve_set(r = r, obs = spopgrowthmillion)
#' csets <- list(Raw = cset1, Shape = cset2)
#'
#' # Functional clustering with respect to joined "st" difference measure
#' # and "joined" central regions of each group
#' res <- fclustering(csets, k=3, type="area")
#' p <- plot(res, plotstyle = "marginal", coverage = 0.5)
#' p[[1]] # Central functions
#' p[[2]] # Groups: central functions and regions
#' # To collect the two figures into one use, e.g., patchwork:
#' if(require("patchwork", quietly=TRUE)) {
#' p[[1]] + p[[2]] + plot_layout(widths = c(1, res$k))
#' }
#' # Silhouette plot of pam
#' plot(res$pam)
#' @importFrom stats dist
#' @importFrom cluster pam
#' @importFrom utils combn
fclustering <- function(curve_sets, k, type = c("area", "st", "erl", "cont"), triangineq = FALSE, ...) {
# Check curve_sets. Note: k is checked by pam
if(is_a_single_curveset(curve_sets)) {
curve_sets <- list(curve_sets) # Make a list of a single curve_set to treat it similarly as several sets of curves
}
# Convert (e.g. fdata) to curve_sets and check the content
curve_sets <- lapply(curve_sets, FUN=as.curve_set)
if(any(sapply(curve_sets, FUN=curve_set_is1obs))) stop("Some or all sets of curves have one observed function.")
# Check consistency of the given curve sets
checkequal <- function(f) {
all(sapply(curve_sets, FUN=function(curve_set) { f(curve_set) == f(curve_sets[[1]]) }))
}
if(!checkequal(curve_set_nfunc))
stop("The numbers of functions in curve sets differ.")
if(!checkequal(curve_set_narg))
warning("The numbers of points where the curves are observed (r) differ in the given curve sets.")
type <- match.arg(type)
if(type %in% c("erl", "cont"))
warning("Type is ", type, ". Check if the triangular inequality is equal to 1. \n",
"If not, the measure does not form a metric. pam should not be used.")
if(type == "st") {
measure <- "max"; scaling <- "st"
}
else {
measure <- type; scaling <- NULL
}
# Number of curve_sets
n <- length(curve_sets)
# Compute the dissimilarity matrix
# lstats = original statistics, jstats = joined statistics
lstats <- vector("list", n)
jr <- NULL
jstats <- NULL
for(j in 1:n) {
nr <- curve_set_narg(curve_sets[[j]])
nfunc <- curve_set_nfunc(curve_sets[[j]])
funcs <- curve_set_funcs(curve_sets[[j]])
co <- combn(nfunc, 2, simplify = FALSE) # All combinations
stats <- array(0, c(2*length(co), nr))
for(i in seq_along(co)) {
stats[i,] <- funcs[, co[[i]][1]]-funcs[, co[[i]][2]]
}
for(i in seq_along(co)) {
stats[length(co)+i, ] <- funcs[, co[[i]][2]]-funcs[, co[[i]][1]]
}
lstats[[j]] <- stats
# Combined
jr <- c(jr, curve_sets[[j]][['r']])
jstats <- cbind(jstats, lstats[[j]]) # j = joint
}
# Add the zero curve
jstats <- rbind(jstats, rep(0, length(jr))) # length(jr) is same as dim(jstats)[2]
data <- create_curve_set(list(r=jr, obs=t(jstats))) # Non-increasing r-values provided, but not used
# dist gives distances between the rows of a data matrix. each row = a function
b <- dist(data_and_sim_curves(curve_sets[[1]])) # Or t(curve_set_funcs(curve_sets[[1]]))
# Fill it in with other dissimilarity matrix
if(measure == "max")
fo <- forder(data, measure="max", scaling=scaling)
else
fo <- 1 - forder(data, measure=measure, scaling=scaling)
b[seq_along(b)] <- fo[seq_along(b)] # for(i in 1:length(b)) { b[i] <- fo[i] }
resultpamF <- pam(b, k=k)
bb <- as.matrix(b)
nfunc <- curve_set_nfunc(curve_sets[[1]])
# Triangular inequality
if(triangineq) {
r <- NULL # Here r is a logical vector for something
for(i in 1:(nfunc-2)) {
for(j in (i+1):(nfunc-1)) {
for(kk in (j+1):(nfunc)) {
r <- c(r, bb[i,j] + bb[j,kk] >= bb[i,kk] - 0.00000001)
}
}
}
}
res <- list(curve_sets=curve_sets, k=k, type=type,
pam=resultpamF, dis=b)
if(triangineq) res$triangineq <- sum(r)/length(r)
class(res) <- c("fclust", class(res))
res
}
#' Print method for the class 'fclust'
#'
#' Print method for the 'fclust' objects returned by \code{\link{fclustering}}.
#' @param x A object of class 'fclust'.
#' @param ... Ignored.
#' @export
print.fclust <- function(x, ...) {
cat("Functional clustering:\n")
cat(" * Based on the measure \'", x$type, "\' at ", x$k, " clusters \n", sep="")
cat(" * Triangular inequality is satisfied in", x$triangineq*100, "% of cases.\n")
cat("The object contains:\n")
cat("$pam - Results of the partitioning around medoids\n")
cat("$pam$clustering - The clustering vector\n")
cat("$dis - The dissimilarity matrix\n")
}
#' Plot method for the class 'fclust'
#'
#' Plot method for the 'fclust' objects returned by \code{\link{fclustering}}.
#'
#'
#' The clusters are shown respectively for each \code{curve_set}. Thus for each \code{curve_set}
#' the panel with all the medoids is shown followed by all clusters represented by central region,
#' medoid and all curves belonging to it.
#'
#' For all sources, the function plots the deepest curves for all clusters and
#' the deepest curve of each cluster together with the desired central region and
#' all the curves of the group.
#' @param x An 'fclust' object.
#' @param ncol The maximum number of columns for the figures determining the dimensions of the graphical output.
#' Default to a one row figure.
#' @param ncol The number of columns in the graphical output, when there is just one set of curves
#' that has been ordered. If not given, \code{c(1, k+1)} is used, which gives all plots in one row.
#' For more sets of curves, the rows are fixed to correspond to the sets (one row for each set).
#' @param plotstyle The resulting central regions of clusters can be plotted by sorting the
#' appropriate \code{curve_set} only 'marginal' or by sorting the joined list of \code{curve_set}
#' objects 'joined'. If 'joined' is used the shown central regions corresponds to the joined
#' ordering used to cluster the functional data. If 'marginal' is used the shown central regions
#' do not correspond to the joined ordering used to cluster the functional data, but better express
#' the shape of cluster with respect to given \code{curve_set}.
#' @param coverage The coverage of central regions to be used to show the clusters.
#' @param ... Ignored.
#' @param nstep 1 or 2 for how to contruct a combined (joined) global envelope
#' if there are more than one sets of curves. Default to 1, if the numbers of points
#' where the curves are observed (r) are the same in each set, and 2 otherwise.
#'
#' @references
#' Dai, W., Athanasiadis, S., Mrkvička, T. (2021) A new functional clustering method with combined dissimilarity sources and graphical interpretation. Intech open, London, UK. DOI: 10.5772/intechopen.100124
#' @export
#' @importFrom ggplot2 ggplot aes_ geom_line guides facet_wrap facet_grid vars
plot.fclust <- function(x, plotstyle = c("marginal", "joined"), coverage = 0.5,
nstep, ncol, ...) {
csets <- x$curve_sets
n <- length(csets)
k <- x$k
type <- x$type
plotstyle <- match.arg(plotstyle)
if(coverage < 0 | coverage > 1) stop("Unreasonable coverage.")
if(missing(ncol)) ncol <- k+1
if(!is.numeric(ncol) || length(ncol) != 1) stop("Unreasonable ncol.")
if(type %in% "st") centralf <- min
else centralf <- max
if(missing(nstep)) {
nstep <- 1
checkequal <- function(f) {
all(sapply(csets, FUN=function(curve_set) { f(curve_set) == f(csets[[1]]) }))
}
if(!checkequal(curve_set_narg))
nstep <- 2
}
if(!(nstep %in% c(1,2))) stop("Invalid value of nstep.")
if(!all(sapply(csets, FUN = curve_set_is1d)))
stop("Plots are valid only for 1-dimensional curve_set objects.")
resultpamF <- x$pam
resultpamT <- x$pam$clustering
nis <- as.numeric(table(resultpamT)) # Numbers of functions in the groups i, i = 1,...,k
#- Calculate central regions (either)
cr_all <- vector("list", n)
if(plotstyle == "marginal") {
for(j in 1:n) { # For all sets of curves
cr <- vector("list", k)
ff <- vector("list", k)
# Calculate the central regions for each cluster
for(i in 1:k) { # For all groups (within each set of curves)
# Calculate the central regions for each cluster
clusteri <- subset(csets[[j]], resultpamT==i)
if(nis[i] > 3 && !IsNfuncTooSmall(Nfunc=nis[i], alpha=1-coverage)) { # At least 3 functions
cr[[i]] <- central_region(clusteri, type=type, coverage=coverage)
}
else {
cr[[i]] <- data.frame(r=clusteri[['r']], central=1, lo=NA, hi=NA)
}
# Redefine the central curve to be the medoid
cr[[i]]$central <- curve_set_funcs(csets[[j]])[, resultpamF$id.med[i]]
}
cr_all[[j]] <- cr
}
}
else { # "joined"
cr <- vector("list", k)
ff <- vector("list", k)
for(i in 1:k) { # Calculate the central regions for each cluster
if(nis[i] > 3 && !IsNfuncTooSmall(Nfunc=nis[i], alpha=1-coverage)) {
clusteri <- lapply(csets, FUN=subset, subset=resultpamT==i) # Curves for the group i
cr[[i]] <- central_region(clusteri, type=type, coverage=coverage, nstep=nstep) # Joint central region for the group i
# Redefine the central curve to be the medoid
if(n > 1) for(z in 1:n) cr[[i]][[z]]$central <- curve_set_funcs(csets[[z]])[, resultpamF$id.med[i]]
else cr[[i]]$central <- curve_set_funcs(csets[[1]])[, resultpamF$id.med[i]]
}
}
for(j in 1:n) cr_all[[j]] <- lapply(cr, FUN=function(z) { z[[j]] })
}
#- Setup for the plotting
alt <- "two.sided"
d <- list(xlab="", ylab="")
labels <- paste("Group", 1:k)
if(n < 2) {
n_of_plots <- k + 1
ncols_of_plots <- min(n_of_plots, ncol)
nrows_of_plots <- ceiling(n_of_plots / ncols_of_plots)
scales <- "fixed"
}
else scales <- "free_y"
if(!is.null(names(csets))) csetnames <- names(csets)
else csetnames <- paste("Curves", 1:n)
#- df: Central functions and regions
df <- NULL
for(i in 1:n) {
df_i <- combined_df_construction(cr_all[[i]], labels=labels)
df_i$cset <- csetnames[i]
df <- rbind(df, df_i)
}
df$cset <- factor(df$cset, levels=csetnames)
#- df: All single functions
funcs <- lapply(csets, FUN=curve_set_funcs)
nams <- lapply(funcs, FUN=function(x) { n <- colnames(x); if(!is.null(n)) n else 1:ncol(x) })
funcs.df <- do.call(rbind, lapply(seq_along(funcs), FUN = function(i) {
data.frame(r = rep(cr_all[[1]][[i]][['r']], times=ncol(funcs[[i]])),
curves = c(funcs[[i]]),
id = factor(rep(nams[[i]], each=length(cr_all[[1]][[i]][['r']])), levels = nams[[i]]),
plotmain = factor(rep(paste("Group", resultpamT), each=length(cr_all[[1]][[i]][['r']])), levels = labels),
cset = csetnames[i])
}))
#- Plot
size <- 0.3
df$Group <- df$plotmain
p1 <- ( ggplot()
+ geom_line(data = df, aes_(x = ~r, y = ~curves, group = ~Group,
linetype = ~Group), size = size, col='black')
+ set_envelope_legend_position()
+ labs(title = "", x = d$xlab, y = d$ylab)
)
if(n > 1) p1 <- ( p1 + facet_grid(cset~type, scales="free_y") )
else p1 <- ( p1 + facet_wrap(~type, scales="free_y") )
p2 <- ( ggplot()
+ geom_line(data = funcs.df, aes_(x = ~r, y = ~curves, group = ~id), col='grey70')
+ basic_stuff_for_fclustplot(df, d$xlab, d$ylab, main="", size=size)
+ guides(linetype = "none")
)
if(n > 1) p2 <- (p2 + facet_grid(cset~plotmain, scales=scales))
else p2 <- (p2 + facet_wrap(~ plotmain, scales=scales,
nrow=nrows_of_plots, ncol=ncols_of_plots))
#- Collect
list(p1, p2)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.