# R/clust.R In uhh-lt/senseasim: Get sense vectors and similarities

```#'
#'
#' define some clustering algorithms
#'
#'

clust <- new.env(parent = .GlobalEnv)

with(clust, {

# Some variables for testing ----
fournodes <- { function() {
message('Creating 4-nodes sample data. Access with \'clust\$fournodes\'.')
B <- matrix(c(1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1), nrow = 4)
rownames(B) <- sapply(1:nrow(B), function(i) paste0('node',i))
colnames(B) <- rownames(B)
return(B)
} }()

twentynodes <- { function() {
message('Creating 20-nodes sample data. Access with \'clust\$twentynodes\'.')
B <- matrix(0,20,20)
A <- matrix(runif(100),10,10)
B[01:10,01:10] <- A
B[11:20,11:20] <- A
rownames(B) <- sapply(1:20, function(i) paste0('node',i))
colnames(B) <- rownames(B)
return(B)
} }()

twentynodesundirected <- { function() {
message('Creating 20-undirected-nodes sample data. Access with \'clust\$twentynodesundirected\'.')
A <- twentynodes
B <- matrix(0, dim(A)[[1]], dim(A)[[2]])
for(i in 1:dim(A)[[1]]){
for(j in 1:dim(A)[[2]]){
B[i,j] <- {
if(i < j)
A[i,j]
else
A[j,i]
}
}
}
rownames(B) <- rownames(A)
colnames(B) <- colnames(A)
return(B)
} }()

# Utility methods ----
as.cluster.lists <- function(labels){
labels_ <- unique(labels)
clusterlists <- lapply(labels_, function(lbl) names(labels)[which(labels == lbl)])
names(clusterlists) <- labels_
return(clusterlists)
}

as.cluster.labels <- function(senselists){
labels <- Reduce(
f = function(result, i) {
x <- replicate(length(senselists[[i]]), i)
names(x) <- senselists[[i]]
c(result, x)
},
x = 1:length(senselists),
init = c()
)
return(labels)
}

#'
#' merged singleton clusters
#'
merge_singleton_clusters <- function(labels, newlabel = 'singletons.merged'){
quantities <- table(labels)
labels_with_size_1 <- names(which(quantities == 1))
labels[which(labels %in% labels_with_size_1)] <- newlabel
return(labels)
}

#'
#' prune graph / adjaceny matrix
#'
#' prune graph by keeping top outgoing and top incomnig edges
#'
graph.prune.inout <- function(A, nkeep=3, make.symmetric = F) {
# assert ncol(A) == nrow(A)
# first make sure that selfloops are removed
diag(A) <- 0
# reset nkeep to the min(nkeep, len)
nkeep <- min(nrow(A), nkeep)
idx <- seq_len(nrow(A))
maxidx <- sapply(idx, function(i){
# throw in some randomness (just in case the matrix is too homogenous)
sampledcolidx <- sample(seq_len(ncol(A)))
sampledrowidx <- sample(seq_len(nrow(A)))
c(
sampledcolidx[order(A[i, sampledcolidx], decreasing=T)[1:nkeep]],
sampledrowidx[order(A[sampledrowidx, i], decreasing=T)[1:nkeep]]
)
})
rowidx <- c(rep(idx, each=nkeep), c(maxidx[1:nkeep,]))
colidx <- c(c(maxidx[(nkeep+1):(nkeep*2),]), rep(idx, each=nkeep))
B <- matrix(0, nrow=nrow(A), ncol=ncol(A))
rownames(B) <- rownames(A); colnames(B) <- colnames(A)
B[cbind(rowidx, colidx)] <- A[cbind(rowidx, colidx)]
if(make.symmetric)
B <- (B + t(B)) / 2 # make symmetric
return(B)
}

#'
#' prune graph / adjaceny matrix
#'
#' prune graph by keeping top outgoing edges
#'
graph.prune <- function(A, nkeep=3, make.symmetric = T) {
# assert ncol(A) == nrow(A)
# first make sure that selfloops are removed since they will have the strongest score
diag(A) <- 0
# reset nkeep to the min(nkeep, len)
nkeep <- min(nrow(A), nkeep)
idx <- seq_len(nrow(A))
rowidx <- c(sapply(idx, function(i){
# throw in some randomness (just in case the matrix is too homogenous)
sampledcolidx <- sample(seq_len(ncol(A)))
sampledcolidx[order(A[i, sampledcolidx], decreasing=T)[1:nkeep]]
}))
colidx <- rep(idx, each=nkeep)
B <- matrix(0, nrow=nrow(A), ncol=ncol(A))
rownames(B) <- rownames(A); colnames(B) <- colnames(A)
B[cbind(rowidx, colidx)] <- A[cbind(rowidx, colidx)]
if(make.symmetric)
B <- (B + t(B)) / 2 # make symmetric
return(B)
}

#'
#' visualize graph / adjaceny matrix
#'
graph.viz <- function(A, labels=NULL, labels.as.list=T, label.in.name=T, tkplot=F) {
# ensure that weights are between 0 and 1
miA <- min(A)
if(miA < 0) # shift if needed
A <- A - miA
maA <- max(A)
if(maA > 1)  # scale if needed
A <- A / maA
# produce undirected weighted graph
net <- igraph::graph_from_adjacency_matrix(A, mode = 'undirected', weighted = T)
igraph::V(net)\$color <- 'lightgray'
if(!is.null(labels)){
if(labels.as.list){
labels <- as.cluster.labels(labels)
}

labelfactor <- factor(labels)
labelfactorid <- as.numeric(labelfactor)

# vertex properties
igraph::V(net)[names(labels)]\$color <- c('lightgray', 'yellow', 'magenta', 'red', 'green', 'white', 'cyan', 'lightblue')[(labelfactorid %% 8)+1]
igraph::V(net)[names(labels)]\$frame.color <- c('black', 'red', 'green', 'blue', 'yellow', 'white')[((labelfactorid %/% 8) %% 6)+1]
#vertices\$size <- 14

# vertex labels and label properties
igraph::V(net)[names(labels)]\$label <- if(label.in.name) paste(names(labels), labelfactorid, sep='##') else names(vertices)
igraph::V(net)[names(labels)]\$label.color <- c('black', 'red', 'green', 'blue', 'white')[((labelfactorid %/% 48) %% 5)+1]
# igraph::V(net)\$label.font <- 1   # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
# igraph::V(net)\$label.cex <- 1  # Font size (multiplication factor, device-dependent)
# igraph::V(net)\$label.dist <- 0 # Distance between the label and the vertex
# igraph::V(net)\$label.degree <- 0  # The position of the label in relation to the vertex (use pi)
# igraph::V(net)\$label.family <- 'Times'  # Font family of the label (e.g. 'Times', 'Helvetica')
# igraph::V(net)[names(labels)]\$shape <- c('circle', 'square')[((labelfactorid %/% 240) %% 2)+1] # 'csquare', 'rectangle', 'crectangle', 'vrectangle', 'pie', 'raster', 'sphere', 'none'

# edge properties
igraph::E(net)\$color <- 'gray'
igraph::E(net)\$lty <- 'solid'   # Line type, could be 0 or “blank”, 1 or “solid”, 2 or “dashed”, 3 or “dotted”, 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
igraph::E(net)\$curved <- 0.3 # Edge curvature, range 0-1 (FALSE sets it to 0, TRUE to 0.5)
igraph::E(net)\$width <- igraph::E(net)\$weight
}

if(tkplot){
tkid <- igraph::tkplot(net)
}else{
p <- igraph::plot.igraph(net)
}

return(list(A=A, labels=labels))
}

# Clustering algorithms ----

#' Chinese Whispers
#'
#' cluster a graph using the chinese whispers graph clustering algorithm
#'
#'
#' A <- matrix(runif(25),5,5)  # create random quadratic matrix
#' A[sample(1:25, 5)] <- 0 # make A sparse
#' A <- A * t(A) * (1 - diag(5)) # make matrix symmetrical and fix diagonal
#'
#' test: clust\$cw(twenty_nodes)
#'
#' @return a vector containing a cluster id for each input node
cw <- function(A, max_iter = 20, eps = 1e-5, remove_self_loops = T, allowsingletons = F){
# init
assertthat::are_equal(ncol(A), nrow(A))
N <- nrow(A)
# can nodes vote for their own class?
if(remove_self_loops) {
diag(A) <- 0
}
labels <- seq_len(N)
labels_new <- labels
names(labels_new) <- rownames(A)

for(iter in seq_len(max_iter)){
for(u in sample(seq_len(N), size = N, replace = F)) {
# each neighbor votes for its class
# pick a random class for max vote ties
labels_new[[u]] <- newlabel_u
}
}
if(sqrt(sum((labels-labels_new)^2)) <= eps) {
if(!allowsingletons)
labels_new <- merge_singleton_clusters(labels_new)
message(sprintf('found %d clusters, took %d iterations.', length(unique(labels_new)), iter))
return(labels_new)
}
labels <- labels_new
}

if(!allowsingletons)
labels_new <- merge_singleton_clusters(labels_new)
message(sprintf('found %d clusters, took %d iterations.', length(unique(labels_new)), max_iter))
return(labels_new)
}

#'
#'
#' @param e expansion exponent (A %^% e)
#' @param r inflation exponent (A ^ r)
#' @param ithresh inflation pruning parameter, keep only values equal or greater to ithresh
#' @param eps epsilon threshold to test for convergence
#'
#' test: clust\$mcl(twentynodes, addLoops = !remove_self_loops)\$Cluster
#'
mcl <- function(A, e = 2, r = 2, ithresh = 1e-5, max_iter = 100, eps = 1e-5, remove_self_loops = F, add_dummy = F, allowsingletons = F){
# init
assertthat::are_equal(ncol(A), nrow(A))
N <- nrow(A)

# add an unconnected dummy node
A <- cbind(rbind(A, 0), 0)
A[N+1, N+1] <- 1
}

# remove self loops
if(remove_self_loops) {
diag(A) <- 0
}
# normalize by column
margins <- colSums(A)
A <- t(apply(A, MARGIN = 1, function(row) row / margins))

# helper function to interpret results
interpret <- function(){
# interpret cluster results
labels <- replicate(N, -1)
for(i in seq_len(N)){
# resolve ties
labels[[i]] <- paste0(votes, collapse = '_')
labels[[i]] <- -1
else
}

if (!allowsingletons)
labels <- merge_singleton_clusters(labels)

names(labels) <- rownames(A)
return(labels)
}

mpow <- function(M, exponent){
if(exponent < 2)
return(M)
rM <- M
for(i in 2:exponent)
rM <- rM %*% M
return(rM)
}

for(iter in seq_len(max_iter)){
# expand
newA <- mpow(A, e)
# inflate
newA <- newA ^ r
# prune by threshold but keep the diagonals
newA[which(newA < ithresh)] <- 0
margins <- colSums(newA)
newA <- t(apply(newA, MARGIN = 1, function(row) row / margins))
# test for convergence
if(sqrt(sum((A-newA)^2, na.rm = T)) <= eps){
A <- newA
labels <- interpret()
message(sprintf('found %d clusters, took %d iterations.', length(unique(labels)), iter))
return(labels)
}
A <- newA
}

# interpret results
labels <- interpret()
message(sprintf('found %d clusters, took %d iterations.', length(unique(labels)), max_iter))
return(labels)
}

#'
#' K-Means clusters row vectors
#'
kmeans <- function(A, k = 6, allowsingletons = F){
result <- stats::kmeans(A, k)
labels <- result\$cluster
if(!allowsingletons)
labels <- merge_singleton_clusters(labels)
message(sprintf('found %d clusters.', length(unique(labels))))
return(labels)
}

#'
#' Hierichal clustering clusters by dissimilarities
#'
#'  dist(A)
#'
hclust <- function(A, k=6, h=NULL, allowsingletons = F, isNormalizedSimilarityMatrix = T, debugplot = F){
# convert similarity matrix into disimilarities, otherwise assume A to be a dissimilarity matrix
D <- as.dist({ if(isNormalizedSimilarityMatrix) (1-A) else A })
clusters <- stats::hclust(D)
if(debugplot)
plot(clusters)
labels <- {
if(!is.null(h))
cutree(clusters, h = h)
else
cutree(clusters, k = k)
}
if(!allowsingletons)
labels <- merge_singleton_clusters(labels)
message(sprintf('found %d clusters.', length(unique(labels))))
return(labels)
}

#'
#' (!!!EXPERIMENTAL!!!)
#' Self Organizing Maps clusters by rows
#'
som <- function(A, xdim = 2, ydim = 3, allowsingletons = F){
#require(kohonen) # for selforganizing maps
result <- kohonen::som(A, grid=kohonen::somgrid(xdim = xdim, ydim = ydim))
labels <- result\$unit.classif
names(labels) <- rownames(A)
if(!allowsingletons)
labels <- merge_singleton_clusters(labels)
message(sprintf('found %d clusters.', length(unique(labels))))
return(labels)
}

})
```
uhh-lt/senseasim documentation built on Feb. 28, 2019, 7:38 p.m.