#' @include adproclus_classes.R
NULL
.repmat <- function(a, n, m) {
kronecker(matrix(1, n, m), a)
}
.lossL2 <- function(w) {
m <- sum(w^2)
return(m)
}
.updateA_lf1 <- function(A, PossibA, data) {
B <- A
npos <- nrow(PossibA)
n <- nrow(A)
loss <- matrix(0, 1, npos)
for (row in 1:n) {
for (r in 1:npos) {
Apos <- B
Apos[row, ] <- PossibA[r, ]
if (any(colSums(Apos) == 0) == FALSE) {
Gpos <- NMFN::mpinv(Apos) %*% data
respos <- data - (Apos %*% Gpos)
loss[r] <- .lossL2(respos)
} else {
loss[r] <- NA
}
}
minloss <- min(loss[, !is.na(loss)])
ind <- which.min(loss)
B[row, ] <- PossibA[ind, ]
}
if (any(colSums(B) == 0) == TRUE) {
zerocols <- 0
} else {
zerocols <- 1
}
result <- list(B, minloss, zerocols)
return(result)
}
.updateA_lf2 <- function(n, P, replX, PossibA) {
nA <- nrow(PossibA)
posrec <- PossibA %*% P
loss <- matrix(as.numeric(rowSums((replX - posrec)^2)),
nrow = nA/n, ncol = n)
index <- apply(loss, 2, which.min)
A <- PossibA[index, ]
return(A)
}
.adproclus_lf1 <- function(x, A) {
t <- proc.time()
data <- x
n <- nrow(x)
totvar <- norm(data - mean(data), "f")^2
k <- ncol(A)
PossibA <- gtools::permutations(2, k, v = c(0, 1), repeats.allowed = TRUE)
PossibA <- apply(PossibA, 2, rev)
if (k > 1) {
PossibA <- t(apply(PossibA, 1, rev))
}
G <- NMFN::mpinv(A) %*% data
res <- data - (A %*% G)
f <- .lossL2(res)
fold <- f + 1
iter <- 1
while (fold > f) {
fold = f
Aold = A
update <- .updateA_lf1(Aold, PossibA, data)
A <- update[[1]]
f <- update[[2]]
iter <- iter + 1
}
runs <- iter - 1
Membs <- Aold
Profs <- NMFN::mpinv(Aold) %*% data
sse <- fold
explvar <- 1 - (fold/totvar)
timeruns <- (proc.time() - t)[1]
model <- Membs %*% Profs
result <- list(Model = model, Membs = Membs, Profs = Profs,
sse = sum((model - x)^2), totvar = totvar,
explvar = explvar, alg_iter = runs, timer = as.numeric(timeruns))
return(result)
}
.adproclus_lf2 <- function(x, A, P) {
t <- proc.time()
data <- x
n <- nrow(x)
totvar <- norm(data - mean(data), "f")^2
k <- ncol(A)
npos <- 2^k
G <- NMFN::mpinv(A) %*% data
res <- data - (A %*% G)
f <- .lossL2(res)
fold <- f + 1
iter <- 1
PossibA <- gtools::permutations(2, k, v = c(0, 1), repeats.allowed = TRUE)
PossibA <- apply(PossibA, 2, rev)
if (k > 1) {
PossibA <- t(apply(PossibA, 1, rev))
}
PossibA <- .repmat(PossibA, n, 1)
replX <- data.frame()
for (i in 1:n) {
reps <- matrix(.repmat(data[i, ], npos, 1),
ncol = ncol(data), nrow = npos, byrow = TRUE)
replX <- rbind(replX, reps)
}
replX <- as.matrix(replX)
while (fold > f) {
fold = f
Aold = A
Pold = P
A <- .updateA_lf2(n, Pold, replX, PossibA)
P <- NMFN::mpinv(A) %*% data
f <- .lossL2(x - (A %*% P))
iter <- iter + 1
}
runs <- iter - 1
Membs <- Aold
Profs <- Pold
sse <- fold
explvar <- 1 - (fold/totvar)
timeruns <- (proc.time() - t)[1]
model <- Membs %*% Profs
result <- list(Model = model, Membs = Membs, Profs = Profs,
sse = sum((model - x)^2), totvar = totvar,
explvar = explvar, alg_iter = runs, timer = as.numeric(timeruns))
return(result)
}
.printoutput <- function (k, time, nstart) {
print ("Clustering completed.")
print (paste ("Additive Profile Clustering with",
k, "overlapping clusters."))
print (paste ("Total processing time:",
round (time, digits = 1), "seconds."))
print (paste ("Algorithm starts:", nstart[1], "random and",
gtools::na.replace (nstart[2], 0), "rational."))
}
#' Generate initial random start
#'
#' Generate an initial random start for the Additive Profile Clustering
#' algorithm (see \code{\link{adproclus}}).
#'
#' \code{getRandom} generates an random initial binary membership matrix
#' \strong{A} by drawing entries from a Bernoulli Distribution with \eqn{\pi =
#' 0.5}. A corresponding initial profile matrix \strong{P} is subsequently
#' estimated conditionally upon A (for details, see Depril et al., 2008, and
#' Wilderjans et al., 2010).
#'
#' For programming simplicity, this function provides the option to pass a
#' matrix of initial cluster centers to the \code{centers} argument. In this
#' case, the matrix is dismissed, the number of clusters \emph{k} is set to
#' \code{nrow(centers)} and a new profile matrix is returned, based on a
#' randomly generated membership matrix. For generating an initial start based
#' on a specific set of initial cluster centers, see \code{\link{getRational}}.
#'
#' \strong{Warning:} This function does \emph{not} obtain an ADPRCOLUS model. To
#' perform aditive profile clustering, see \code{\link{adproclus}}.
#'
#' @param data Object-by-variable data matrix of class \code{matrix} or \code{data.frame}.
#' @param centers either the number of clusters \emph{k}, or a matrix of initial
#' (distinct) cluster centres.
#'
#' @return \code{getRandom} returns a list with the following components:
#' \describe{ \item{\code{type}}{A character string denoting the type of start
#' ('Random Start')} \item{\code{A}}{A randomly generated initial Membership
#' matrix} \item{\code{P}}{The corresponding initial Profile matrix} }
#'
#' @export
#'
#' @references Wilderjans, T. F., Ceulemans, E., Van Mechelen, I., & Depril, D.
#' (2010). ADPROCLUS: a graphical user interface for fitting additive profile
#' clustering models to object by variable data matrices. \emph{Behavior
#' Research Methods, 43}(1), 56-65.
#'
#' Depril, D., Van Mechelen, I., & Mirkin, B.
#' (2008). Algorithms for additive clustering of rectangular data tables.
#' \emph{Computational Statistics and Data Analysis, 52,} 4923-4938.
#'
#' @examples
#' getRandom(ADPROCLUS::CGdata, 3)
#'
#' @seealso \code{\link{getRational}} for generating rational starts and
#' \code{\link{adproclus}} for details about membership and profile matrices.
getRandom <- function(data, centers) {
data <- as.matrix(data)
if (length(centers) == 1L) {
k <- centers
} else {
k <- nrow(centers)
}
n <- nrow(data)
A <- (matrix(stats::runif(n * k), n, k) < 0.5) * 1
while (any(colSums(A) == 0) || qr(A)$rank < k) {
A <- (matrix(stats::runif(n * k), n, k) < 0.5) * 1
}
P <- NMFN::mpinv(A) %*% data
return(list(type = "Random Start", A = A, P = P))
}
#' Generate initial rational start
#'
#' Generate an initial rational start for the Additive Profile Clustering
#' algorithm (see \code{\link{adproclus}}).
#'
#' If \code{centers} is a number of clusters \emph{k}, an initial profile matrix
#' \strong{P} is generated by drawing \emph{k} randomly chosen, distinct, rows
#' from \code{data}. Alternatively, you can pass a user selected matrix of size
#' \emph{k} * \code{ncol(data)} with initial cluster profiles to the
#' \code{centers} argument. In both cases, a corresponding membership matrix
#' \strong{A} is subsequently estimated conditionally upon \strong{P} (for
#' details, see Depril et al., 2008, and Wilderjans et al., 2010).
#'
#' \strong{Warning:} This function does \emph{not} obtain an ADPRCOLUS model. To
#' perform aditive profile clustering, see \code{\link{adproclus}}.
#'
#' @param data Object-by-variable data matrix of class \code{matrix} or
#' \code{data.frame}.
#' @param centers either the number of clusters \emph{k}, or a matrix of initial
#' (distinct) cluster centres.
#'
#' @return \code{getRational} returns a list with the following components:
#' \describe{
#' \item{\code{type}}{A character string denoting the type of start
#' ('Rational Start')}
#' \item{\code{A}}{An initial Membership matrix}
#' \item{\code{P}}{The corresponding initial Profile matrix} }
#'
#' @export
#'
#' @references Wilderjans, T. F., Ceulemans, E., Van Mechelen, I., & Depril, D.
#' (2010). ADPROCLUS: a graphical user interface for fitting additive profile
#' clustering models to object by variable data matrices. \emph{Behavior
#' Research Methods, 43}(1), 56-65.
#'
#' Depril, D., Van Mechelen, I., & Mirkin, B. (2008). Algorithms for additive
#' clustering of rectangular data tables. \emph{Computational Statistics and
#' Data Analysis, 52,} 4923-4938.
#'
#' @examples
#' getRational(ADPROCLUS::CGdata, 3)
#'
#' @seealso \code{\link{getRandom}} for generating random starts and
#' \code{\link{adproclus}} for details about membership and profile matrices.
getRational <- function(data, centers) {
data <- as.matrix(data)
n <- nrow(data)
if (length(centers) == 1L) {
k <- centers
Permutation <- sample(n)
P <- data[Permutation[1:k], ]
} else {
k <- nrow(centers)
P <- centers
}
npos <- 2^k
PossibA <- gtools::permutations(2, k, v = c(0, 1), repeats.allowed = TRUE)
PossibA <- apply(PossibA, 2, rev)
if (k > 1) {
PossibA <- t(apply(PossibA, 1, rev))
}
PossibA <- .repmat(PossibA, n, 1)
replX <- data.frame()
for (i in 1:n) {
reps <- matrix(.repmat(data[i, ], npos, 1),
ncol = ncol(data), nrow = npos, byrow = TRUE)
replX <- rbind(replX, reps)
}
replX <- as.matrix(replX)
A <- as.matrix(.updateA_lf2(n, P, replX, PossibA))
result <- list(type = "Rational Start", A = A,
P = P)
return(result)
}
#' Additive profile clustering
#'
#' Perform ADditive PROfile CLUStering (ADPRCOLUS) on object by variable data.
#'
#' In this function, Mirkin's (1987, 1990) Aditive Profile Clustering
#' (ADPROCLUS) method is used to obtain an unrestricted overlapping clustering
#' model of the object by variable data provided by \code{data}.
#'
#' The ADPROCLUS model approximates an \emph{I} \eqn{x} \emph{J} object by
#' variable data matrix \strong{X} by an \emph{I} \eqn{x} \emph{J} model matrix
#' \strong{M} that can be decomposed into an \emph{I} \eqn{x} \emph{K} binary
#' cluster membership matrix \strong{A} and a \emph{K} \eqn{x} \emph{J}
#' real-valued cluster profile matrix \strong{P}, with \emph{K} indicating the
#' number of overlapping clusters. In particular, the aim of an ADPROCLUS
#' analysis is therefore, given a number of clusters \emph{k}, to estimate a
#' model matrix \strong{M} = \strong{AP} that reconstructs data matrix
#' \strong{X} as close as possible in a least squares sense (i.e. sum of squared
#' residuals). For a detailed illustration of the ADPROCLUS model and associated
#' loss function, see Wilderjans et al., 2011.
#'
#' The alternating least squares algorithms ("\code{ALS1}" and "\code{ALS2}")
#' that can be used for minimization of the loss function were proposed by
#' Depril et al. (2008). In "\code{ALS2}", starting from an initial random or
#' rational estimate of \strong{A} (see \code{\link{getRandom}} and
#' \code{\link{getRational}}), \strong{A} and \strong{P} are alternately
#' re-estimated conditionally upon each other until convergence. The
#' "\code{ALS1}" algorithm differs from the one previous one in that each row in
#' \strong{A} is updated independently and that the conditionally optimal
#' \strong{P} is recalculated after each row update, instead of the end of the
#' matrix. For a discussion and comparison of the different algorithms, see
#' Depril et al., 2008.
#'
#' \strong{Warning:} Computation time increases exponentially with increasing
#' number of clusters, \emph{k}! We recommend to determine the computation time
#' of a single start for each specific dataset and \emph{k} before employing a
#' multistart procedure.
#'
#' @param data object-by-variable data matrix of class \code{matrix} or
#' \code{data.frame}.
#' @param centers either the number of clusters \emph{k}, or a matrix of initial
#' (distinct) cluster centres. If a number \emph{k}, a random set of \emph{k}
#' rows in \code{data} is chosen as initial centres.
#' @param nstart if \code{centers} is a number, a vector of length 1 or 2
#' denoting the number of random and rational starts to be performed.
#' @param algorithm character string "\code{ALS1}" (default) or "\code{ALS2}",
#' denoting the type of alternating least squares algorithm.
#' @param SaveAllStarts logical. If \code{TRUE}, the results of all algorithm
#' starts are returned. By default, only the best solution is retained.
#'
#' @return By default, \code{adproclus} returns a list with the following
#' components: (If \code{SaveAllStarts} is \code{TRUE}, a list is returned for
#' each start of the algorithm) \describe{ \item{\code{Model}}{matrix. The
#' obtained overlapping clustering model \strong{M} of the same size as
#' \code{data}.} \item{\code{Membs}}{matrix. The membership matrix \strong{A}
#' of the clustering model.} \item{\code{Profs}}{matrix. The profile matrix
#' \strong{P} of the clustering model.} \item{\code{sse}}{numeric. The
#' residual sum of sqares of the clustering model, which is minimised by the
#' ALS algorithm.} \item{\code{totvar}}{numeric. The total sum of squares
#' of\code{data}.} \item{\code{explvar}}{numeric. The proportion of variance
#' in \code{data} that is accounted for by the clustering model.}
#' \item{\code{alg_iter}}{numeric. The number of iterations of the algorithm.}
#' \item{\code{timer}}{numeric. The amount of time (in seconds) the algorithm
#' ran for.} \item{\code{initialStart}}{list. A list containing initial
#' membership and profile matrices, as well as the type of start that was used
#' to obtain the clustering solution. (as returned by \code{\link{getRandom}}
#' or \code{\link{getRational}})}}
#'
#' @export
#'
#' @references Wilderjans, T. F., Ceulemans, E., Van Mechelen, I., & Depril, D.
#' (2010). ADPROCLUS: a graphical user interface for fitting additive profile
#' clustering models to object by variable data matrices. \emph{Behavior
#' Research Methods, 43}(1), 56-65.
#'
#' Depril, D., Van Mechelen, I., & Mirkin, B. (2008). Algorithms for additive
#' clustering of rectangular data tables. \emph{Computational Statistics and
#' Data Analysis, 52,} 4923-4938.
#'
#' Mirkin, B. G. (1987). The method of principal clusters. \emph{Automation
#' and Remote Control}, 10:131-143.
#'
#' Mirkin, B. G. (1990). A sequential fitting procedure for linear data
#' analysis models. \emph{Journal of Classification}, 7(2):167-195.
#'
#' @examples
#' # Loading a test dataset into the global environment
#' x <- ADPROCLUS::CGdata
#'
#' # Quick clustering with K = 3 clusters
#' clust <- adproclus(x, 3)
#'
#' # Clustering with K = 4 clusters,
#' # using the ALS2 algorithm,
#' # with 5 random and 5 rational starts
#' clust <- adproclus(x, 4, c(5,5), "ALS2")
#'
#' # Saving the results of all starts
#' clust <- adproclus(x, 3, c(2,2), SaveAllStarts = TRUE)
#'
#' # Clustering using a user-defined rational start
#' start <- getRational(x,3)
#' clust <- adproclus(x, start$P)
#'
#' @seealso \code{\link{getRandom}} and \code{\link{getRational}} for generating
#' random and rational starts for ADORCLUS.
adproclus <- function(data, centers, nstart = 1L,
algorithm = "ALS1", SaveAllStarts = FALSE) {
t <- proc.time()
results <- list()
data <- as.matrix(data)
n <- as.integer(nrow(data))
p <- as.integer(ncol(data))
if (is.na(n) || is.na(p))
stop("'invalid data matrix")
if (missing(centers))
stop("'centers' must be a number or a matrix")
alg <- switch(match.arg(algorithm, c("ALS1", "ALS2")),
ALS1 = 1, ALS2 = 2)
if (length(centers) != 1L) {
if (ncol(data) != ncol(centers))
stop("number of columns in 'centers' must match number of columns in 'data'")
k <- nrow(centers)
centers <- as.matrix(centers)
nstart <- c(0,nstart)
if (n < k)
stop("number of clusters cannot exceed number of objects in 'data'")
if (sum(nstart) != 1L) {
nstart <- c(0,1)
warning("'centers' is an initial profile matrix.
Number of starts has been set to one.")
}
initialStart <- getRational(data,
centers)
if (alg == 1) {
results <- .adproclus_lf1(data, initialStart$A)
results$initialStart <- initialStart
}
if (alg == 2) {
results <- .adproclus_lf2(data, initialStart$A,
initialStart$P)
results$initialStart <- initialStart
}
} else {
k <- centers
if (n < k)
stop("number of clusters cannot exceed number of objects in 'data'")
if (length(nstart) > 2) {
stop("'nstart' must be a vector of length 1 or 2")
}
if (sum(nstart) > 50) {
warning("Number of starts is larger than 50. Computation might take a while")
}
BestSol <- list(sse = Inf)
if (alg == 1) {
for (i in 1:nstart[1]) {
initialStart <- getRandom(data,
centers)
res <- .adproclus_lf1(data, initialStart$A)
res$initialStart <- initialStart
if (res$sse < BestSol$sse) {
BestSol <- res
}
if (SaveAllStarts == TRUE) {
results <- append(results, list(res))
}
}
remove(i)
if (!is.na(nstart[2])) {
for (i in 1:nstart[2]) {
initialStart <- getRational(data,
centers)
res <- .adproclus_lf1(data, initialStart$A)
res$initialStart <- initialStart
if (res$sse < BestSol$sse) {
BestSol <- res
}
if (SaveAllStarts == TRUE) {
results <- append(results, list(res))
}
}
remove(i)
}
}
if (alg == 2) {
for (i in 1:nstart[1]) {
initialStart <- getRandom(data,
centers)
res <- .adproclus_lf2(data, initialStart$A,
initialStart$P)
res$initialStart <- initialStart
if (res$sse < BestSol$sse) {
BestSol <- res
}
if (SaveAllStarts == TRUE) {
results <- append(results, list(res))
}
remove(i)
}
if (!is.na(nstart[2])) {
for (i in 1:nstart[2]) {
initialStart <- getRational(data,
centers)
res <- .adproclus_lf2(data, initialStart$A,
initialStart$P)
res$initialStart <- initialStart
if (res$sse < BestSol$sse) {
BestSol <- res
}
if (SaveAllStarts == TRUE) {
results <- append(results, list(res))
}
}
remove(i)
}
}
if (SaveAllStarts == TRUE) {
results <- list(BestSol = BestSol, Runs = results)
names(results$Runs) <- as.character(c(1:length(results$Runs)))
} else {
results <- BestSol
}
}
time <- (proc.time() - t)[1]
.printoutput(k,time, nstart)
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.