# R/clusCA.R In clustrd: Methods for Joint Dimension Reduction and Clustering

```clusCA <- function(data,nclus,ndim,nstart=100,smartStart=NULL,gamma = FALSE, seed=NULL, binary = FALSE){
K = nclus
k = ndim
nrs = nstart
q = ncol(data)
maxiter = 100
maxinert=-1
data = data.frame(data)
if (binary == FALSE) {
data=as.data.frame(lapply(data,as.factor))
Z = tab.disjonctif(data)#dummy.data.frame(data, dummy.classes = "ALL") # The original super indicator
lab1a=names(data)
lab1b=lapply(data,function(z) levels(as.factor(z)))
lab1=rep(lab1a,times=unlist(lapply(lab1b,length)))
lab2=unlist(lab1b)
colnames(Z) = paste(lab1,lab2,sep=".")
}
else
Z = data
n=nrow(Z)
Q=ncol(Z)

Dzh=diag(as.vector(colSums(Z)^(.5)))
Dzhi=pseudoinverse(Dzh)
#Dzhi= chol2inv(chol(Dzh))
MZ=scale(Z,scale=FALSE)
MZD = MZ %*% Dzhi
if(!is.null(seed)) set.seed(seed)
seed <- round(2^31 * runif(nstart, -1, 1))

pb <- txtProgressBar(style = 3)
prog = 1
# Do nrs random starts
fvec=c()
for (rs in 1:nrs){
if (rs > (prog * (nrs/10))) {
prog <- prog + 1
}
setTxtProgressBar(pb, prog * 0.1)

if(is.null(smartStart)){
# myseed=seed+rs
#  set.seed(myseed)
set.seed(seed[rs])
randVec= matrix(ceiling(runif(n)*nclus),n,1)
}else{
randVec=smartStart
}

Zki=dummy(randVec)
Dk=t(Zki) %*% Zki
Dks=Dk^(.5)
#Dksi=pseudoinverse(Dks)
Dksi= chol2inv(chol(Dks))
DZkZD= sqrt(n/q)*Dksi%*% t(Zki) %*% MZD     # equation 6 on the paper
svdDZkZD=svd(DZkZD)
#this is for ndim = 1 to work
if (k != 1) {
Lk=diag(svdDZkZD\$d[1:k])
} else {
Lk = data.matrix(svdDZkZD\$d[1])
}
G = svdDZkZD\$u
Gi = G[,1:k]

Gi=Dksi%*%Gi%*%Lk # CA row coordinates (section 2 in the paper right below formula (1))

Bstar=svdDZkZD\$v
B=sqrt(n*q)*Dzhi %*% Bstar # as in eq. (8)

Bns=B[,1:k]
Bi=Bstar[,1:k]           # attribute quantifications. Orthonormal

#

#  inertia = sum(t(Lk)%*% Lk) # explained inertia in k dimensions

Yi=sqrt((n/q)) * MZD%*%Bi  # The coordinates for the subjects. as in eq. (10).

GDGbef=sum(diag(t(Gi) %*% Dk %*% Gi))   # Objective value before K means This is equivalent to formula (6), but then in the paper

objbef=GDGbef
######## END first fixed C step: Given random C, B and G are optimal
#  obj_improv=c()
#  kmeanobj=1000        # initialize  obj value
improv=10
iter=0

objective=sum(diag((t(Yi)%*%Zki%*%chol2inv(chol(t(Zki)%*%Zki))%*%t(Zki)%*%Yi)))
while ((improv > 0.0001) && (iter<=maxiter)){
iter = iter + 1
outK = try(kmeans(Yi,centers=Gi,nstart=100),silent=T)
#empty clusters
if(is.list(outK) == F){
outK=EmptyKmeans(Yi,centers=Gi)
#  break
}
Zki=outK\$cluster
Gi=outK\$centers

Zki = dummy(Zki)
Dk = t(Zki) %*% Zki
Dks = Dk^(.5)                  # New Dch weights
#Dksi = pseudoinverse(Dks)
Dksi = chol2inv(chol(Dks))
#END Fixed B step: Given B, C is optimal.
#Now: Fix C and recalculate B
DkZkZD=sqrt(n/q)*Dksi%*% t(Zki)%*%MZD # Make the new matrix for the SVD.

outDkZkZD=svd(DkZkZD)       # New SVD, CA analysis

G=outDkZkZD\$u
Gi=G[,1:k]
# this is for ndim = 1 to work
if (k != 1) {
Lk=diag(outDkZkZD\$d[1:k])
} else {
Lk = data.matrix(outDkZkZD\$d[1])
}

Gi = Dksi %*% Gi %*% Lk
Bstar=outDkZkZD\$v
Bi=Bstar[,1:k]
B=sqrt(n*q)*Dzhi%*%Bi
Bns=B[,1:k]
#Attribute quantifications
Yi= sqrt((n/q)) * MZD %*% Bi # Subject coordinates
lambda = outDkZkZD\$d
objective=c(objective,sum(diag(t(Lk)%*% Lk)))

# B rescaled in such a way that B'DzB=nqI.
improv=sum(diag(t(Gi)%*% t(Zki)%*%Zki%*%Gi))-objbef
objbef=sum(diag(t(Gi) %*% t(Zki) %*% Zki %*% Gi))
#    print(improv)
varsi=sum(diag(t(Gi)%*% Dk %*% Gi)) #no need to calc inside the loop

}
#FIX: 16/09/2016, calculated outside the loop
# varsi=sum(diag(t(Gi)%*% Dk %*% Gi))

# fvec = c(fvec, objbef)
#  print(fvec)
#  print(rs)
if (varsi > maxinert){ #gamma
if (gamma == TRUE) {
distB = sum(diag(t(Bns)%*%  Bns))
distG = sum(diag(t(Gi)%*% Gi))
g = ((K/Q)* distB/distG)^.25

Bsol = (1/g)*Bns
Gsol = g*Gi
Ysol = g*Yi
} else {
Bsol = Bns
Gsol = Gi
Ysol = Yi
}
Csol = Zki
maxinert = varsi
}
}

wone = which(Csol==1,arr.ind=T)
cluster = matrix(0,n,1)
cluster[wone[,1]] = wone[,2]

##reorder cluster membership according to cluster size
#csize = round((table(cluster)/sum( table(cluster)))*100,digits=2)
#aa = sort(csize,decreasing = TRUE)
size = table(cluster)
aa = sort(size,decreasing = TRUE)
cluster = mapvalues(cluster, from = as.integer(names(aa)), to = as.integer(names(table(cluster))))
#reorder centroids
Gsol = Gsol[as.integer(names(aa)),]
setTxtProgressBar(pb, 1)
out=list()
out\$obscoord=data.frame(Ysol) # observations coordinates
out\$attcoord=data.frame(Bsol) # attributes coordinates
out\$centroid=data.frame(Gsol) # centroids
cluster = as.integer(cluster)
names(cluster) = rownames(data)
rownames(out\$obscoord) = rownames(data)
rownames(out\$attcoord) = colnames(Z)

out\$cluster=cluster   # cluster membership
out\$criterion=maxinert # criterion
#  out\$iters=iters # number of iterations
out\$size=as.integer(aa) #round((table(cluster)/sum( table(cluster)))*100,digits=1)
out\$odata=data.frame(lapply(data.frame(data),factor))
out\$nstart = nstart
class(out)="clusmca"
return(out)
}

txtProgressBar <- function(min = 0, max = 1, initial = 0, char = "=", width = NA,
title, label, style = 1, file = "")
{
if (!identical(file, "") && !(inherits(file, "connection") &&
isOpen(file)))
stop("'file' must be \"\" or an open connection object")
if (!style %in% 1L:3L)
style <- 1
.val <- initial
.killed <- FALSE
.nb <- 0L
.pc <- -1L
nw <- nchar(char, "w")
if (is.na(width)) {
width <- getOption("width")
if (style == 3L)
width <- width - 10L
width <- trunc(width/nw)
}
if (max <= min)
stop("must have 'max' > 'min'")
up1 <- function(value) {
if (!is.finite(value) || value < min || value > max)
return()
.val <<- value
nb <- round(width * (value - min)/(max - min))
if (.nb < nb) {
cat(strrep(char, nb - .nb), file = file)
flush.console()
}
else if (.nb > nb) {
cat("\r", strrep(" ", .nb * nw), "\r", strrep(char,
nb), sep = "", file = file)
flush.console()
}
.nb <<- nb
}
up2 <- function(value) {
if (!is.finite(value) || value < min || value > max)
return()
.val <<- value
nb <- round(width * (value - min)/(max - min))
if (.nb <= nb) {
cat("\r", strrep(char, nb), sep = "", file = file)
flush.console()
}
else {
cat("\r", strrep(" ", .nb * nw), "\r", strrep(char,
nb), sep = "", file = file)
flush.console()
}
.nb <<- nb
}
up3 <- function(value) {
if (!is.finite(value) || value < min || value > max)
return()
.val <<- value
nb <- round(width * (value - min)/(max - min))
pc <- round(100 * (value - min)/(max - min))
if (nb == .nb && pc == .pc)
return()
cat(paste0("\r  |", strrep(" ", nw * width + 6)), file = file)
cat(paste(c("\r  |", rep.int(char, nb), rep.int(" ",
nw * (width - nb)), sprintf("| %3d%%", pc)), collapse = ""),
file = file)
flush.console()
.nb <<- nb
.pc <<- pc
}
getVal <- function() .val
kill <- function() if (!.killed) {
cat("\n", file = file)
flush.console()
.killed <<- TRUE
}
up <- switch(style, up1, up2, up3)
up(initial)
structure(list(getVal = getVal, up = up, kill = kill), class = "txtProgressBar")
}

setTxtProgressBar <- function (pb, value, title = NULL, label = NULL)
{
if (!inherits(pb, "txtProgressBar"))
stop(gettextf("'pb' is not from class %s", dQuote("txtProgressBar")),
domain = NA)
oldval <- pb\$getVal()
pb\$up(value)
invisible(oldval)
}
```

## Try the clustrd package in your browser

Any scripts or data that you put into this service are public.

clustrd documentation built on May 8, 2019, 5:03 p.m.