### map-utils.R
# version 4.3
# (c) 2009-2020 Lutz Hamel, Benjamin Ott, Greg Breard, University of Rhode Island
# with Robert Tatoian and Vishakh Gopu
#
# This file constitutes a set of routines which are useful in constructing
# and evaluating self-organizing maps (SOMs).
# The main utilities available in this file are:
# map.fit ----------- constructs a map
# map.embed --------- reports the embedding of the map in terms of modeling the
# underlying data distribution (100% if all feature distributions
# are modeled correctly, 0% if none are)
# map.significance -- graphically reports the significance of each feature with
# respect to the self-organizing map model
# map.starburst ----- displays the starburst representation of the SOM model, the centers of
# starbursts are the centers of clusters
# map.neuron -------- returns the contents of a neuron at (x,y) on the map as a vector
# map.marginal ------ displays a density plot of a training dataframe dimension overlaid
# with the neuron density for that same dimension or index.
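#
# Example (illustrative sketch; the column selection and map dimensions below are
# arbitrary choices, not package defaults):
#   m <- map.fit(iris[,1:4], labels=data.frame(iris$Species), xdim=15, ydim=10)
#   map.embed(m)                      # fraction of data variance captured by the map
#   map.significance(m)               # relative importance of each feature
#   map.starburst(m)                  # starburst/cluster visualization
#   map.marginal(m, "Sepal.Length")   # neuron vs training density for one dimension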
#
#
### License
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#
###
# load libraries
require(class)
require(fields)
require(graphics)
require(ggplot2)
### map.fit -- construct a SOM, returns an object of class 'map'
# parameters:
# - data - a dataframe where each row contains an unlabeled training instance
# - labels - a vector or dataframe with one label for each observation in data
# - xdim,ydim - the dimensions of the map
# - alpha - the learning rate, should be a positive non-zero real number
# - train - number of training iterations
# - normalize - normalize switch
# - seed - set seed for random init of neurons
# returns:
# - an object of type 'map' -- see below
# Hint: if your training data does not have any labels you can construct a
# simple label dataframe as follows: labels <- data.frame(1:nrow(training.data))
# NOTE: the default training algorithm is "vsom"; "som", "experimental", and "batchsom"
#       are also available in the package but are not wired up in this file
map.fit <- function(data,labels=NULL,xdim=10,ydim=5,alpha=.3,train=1000,normalize=TRUE,seed=NULL)
{
#TODO: implement normalize switch, implement seed
neurons <- vsom.f(data,
                  xdim=xdim,
                  ydim=ydim,
                  alpha=alpha,
                  train=train)
# make the neuron data a data frame
neurons <- data.frame(neurons)
names(neurons) <- names(data)
### construct the map object
# note: the map object has to exist before we can compute the visual field below
map <- list(data=data,
            labels=labels,
            xdim=xdim,
            ydim=ydim,
            alpha=alpha,
            train=train,
            algorithm="vsom",   # vsom.f is the only training algorithm wired up in this file
            neurons=neurons)
### add the class name
class(map) <- "map"
### add the visual field to the map
# for each observation i, visual holds the row index of its best matching neuron
visual <- c()
for (i in 1:nrow(data))
{
b <- best.match(map,data[i,])
visual <- c(visual,b)
}
map$visual <- visual
return(map)
}
### map.embed - evaluate the embedding of a map using the F-test and
# a Bayesian estimate of the variance in the training data.
# parameters:
# - map is an object of type 'map'
# - conf.int is the confidence interval of the convergence test (default 95%)
# - verb is a switch that governs the return value: FALSE returns a single
#   embedding value, TRUE returns a vector of individual feature embeddings
# - ks is a switch: TRUE uses the Kolmogorov-Smirnov convergence test,
#   FALSE (default) uses the variance and mean (F-test/t-test) convergence test
#
# - return value is the embedding of the map (variance captured by the map so far)
# Hint: the embedding index is the variance of the training data captured by the map.
map.embed <- function(map,conf.int=.95,verb=FALSE,ks=FALSE)
{
if (ks)
map.embed.ks(map,conf.int,verb)
else
map.embed.vm(map,conf.int,verb)
}
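# Example (sketch, assuming 'm' is a fitted map object returned by map.fit):
#   map.embed(m)             # single value: fraction of data variance captured by the map
#   map.embed(m, verb=TRUE)  # per-feature embedding vector (0 for non-converged features)
#   map.embed(m, ks=TRUE)    # use the Kolmogorov-Smirnov convergence test instead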
### map.starburst - compute and display the starburst representation of clusters
# parameters:
# - map is an object of type 'map'
# - explicit controls the shape of the connected components
# - smoothing controls the smoothing level of the umat (NULL,0,>0)
# - merge.clusters is a switch that controls if the starburst clusters are merged together
# - merge.range - a fraction of the within-cluster distance: two clusters are merged
#   if their average pairwise distance is smaller than (1 + merge.range) times the
#   average distance of either cluster's elements to its own centroid
map.starburst <- function(map,explicit=FALSE,smoothing=2,merge.clusters=FALSE,merge.range=.25)
{
if (class(map) != "map")
stop("map.starburst: first argument is not a map object.")
umat <- compute.umat(map,smoothing=smoothing)
plot.heat(map,umat,explicit=explicit,comp=TRUE,merge=merge.clusters,merge.range=merge.range)
}
### map.significance - compute the relative significance of each feature and plot it
# parameters:
# - map is an object of type 'map'
# - graphics is a switch that controls whether a plot is generated or not
# - feature.labels is a switch to allow the plotting of feature names vs feature indices
# return value:
# - a vector containing the significance for each feature
map.significance <- function(map,graphics=TRUE,feature.labels=TRUE)
{
if (class(map) != "map")
stop("map.significance: first argument is not a map object.")
data.df <- data.frame(map$data)
nfeatures <- ncol(data.df)
# Compute the variance of each feature on the map
var.v <- array(data=1,dim=nfeatures)
for (i in 1:nfeatures)
{
var.v[i] <- var(data.df[[i]]);
}
# we use the variance of a feature as likelihood of
# being an important feature, compute the Bayesian
# probability of significance using uniform priors
var.sum <- sum(var.v)
prob.v <- var.v/var.sum
# plot the significance
if (graphics)
{
par.v <- map.graphics.set()
y <- max(prob.v)
plot.new()
plot.window(xlim=c(1,nfeatures),ylim=c(0,y))
box()
title(xlab="Features",ylab="Significance")
xticks <- seq(1,nfeatures,1)
yticks <- seq(0,y,y/4)
if (feature.labels)
xlabels <- names(data.df)
else
xlabels <- seq(1,nfeatures,1)
ylabels <- formatC(seq(0,y,y/4),digits=2)
axis(1,at=xticks,labels=xlabels)
axis(2,at=yticks,labels=ylabels)
points(1:nfeatures,prob.v,type="h")
map.graphics.reset(par.v)
# return the significance vector invisibly when a plot was produced
invisible(prob.v)
}
else
{
prob.v
}
}
### map.neuron - returns the contents of a neuron at (x,y) on the map as a vector
# parameters:
# map - the neuron map
# x - map x-coordinate of neuron
# y - map y-coordinate of neuron
# return value:
# a vector representing the neuron
map.neuron <- function(map,x,y)
{
if (class(map) != "map")
stop("map.neuron: first argument is not a map object.")
ix <- rowix(map,x,y)
map$neurons[ix,]
}
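# Example (sketch, assuming 'm' is a fitted map object):
#   map.neuron(m, 3, 2)   # weight vector of the neuron at map coordinate x=3, y=2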
### map.marginal - plot that shows the marginal probability distribution of the neurons and data
# parameters:
# - map is an object of type 'map'
# - marginal is the name of a training data frame dimension or index
map.marginal <- function(map,marginal)
{
# ensure that map is a 'map' object
if (class(map) != "map")
stop("map.marginal: first argument is not a map object.")
# if the second argument is a numeric column index rather than a character name
if (!is.character(marginal))
{
train <- data.frame(points = map$data[[marginal]])
neurons <- data.frame(points = map$neurons[[marginal]])
train$legend <- 'training data'
neurons$legend <- 'neurons'
hist <- rbind(train,neurons)
ggplot(hist, aes(points, fill = legend)) + geom_density(alpha = 0.2) + xlab(names(map$data)[marginal])
}
else if (marginal %in% names(map$data))
{
train <- data.frame(points = map$data[names(map$data) == marginal])
colnames(train) <- c("points")
neurons <- data.frame(points = map$neurons[names(map$neurons) == marginal])
colnames(neurons) <- c("points")
train$legend <- 'training data'
neurons$legend <- 'neurons'
hist <- rbind(train,neurons)
ggplot(hist, aes(points, fill = legend)) + geom_density(alpha = 0.2) + xlab(marginal)
}
else
{
stop("map.marginal: second argument is not the name of a training data frame dimension or index")
}
}
############################### local functions #################################
# map.embed using variance and mean tests
map.embed.vm <- function(map,conf.int=.95,verb=FALSE)
{
if (class(map) != "map")
stop("map.embed: first argument is not a map object.")
# map.df is a dataframe that contains the neurons
map.df <- data.frame(map$neurons)
# data.df is a dataframe that contains the training data
# note: map$data is what the 'som' package returns
data.df <- data.frame(map$data)
# do the F-test on a pair of datasets: code vectors/training data
vl <- df.var.test(map.df,data.df,conf=conf.int)
# do the t-test on a pair of datasets: code vectors/training data
ml <- df.mean.test(map.df,data.df,conf=conf.int)
# compute the variance captured by the map -- but only if the means have converged as well.
nfeatures <- ncol(map.df)
prob.v <- map.significance(map,graphics=FALSE)
var.sum <- 0
for (i in 1:nfeatures)
{
#cat("Feature",i,"variance:\t",vl$ratio[i],"\t(",vl$conf.int.lo[i],"-",vl$conf.int.hi[i],")\n")
#cat("Feature",i,"mean:\t",ml$diff[i],"\t(",ml$conf.int.lo[i],"-",ml$conf.int.hi[i],")\n")
if (vl$conf.int.lo[i] <= 1.0 && vl$conf.int.hi[i] >= 1.0 && ml$conf.int.lo[i] <= 0.0 && ml$conf.int.hi[i] >= 0.0) {
var.sum <- var.sum + prob.v[i]
} else {
# not converged - zero out the probability
prob.v[i] <- 0
}
}
# return the variance captured by converged features
if (verb)
prob.v
else
var.sum
}
# map.embed using the Kolmogorov-Smirnov test
map.embed.ks <- function(map,conf.int=.95,verb=FALSE) {
if (class(map) != "map") {
stop("map.embed: first argument is not a map object.")
}
# map.df is a dataframe that contains the neurons
map.df <- data.frame(map$neurons)
# data.df is a dataframe that contains the training data
# note: map$data is what the 'som' package returns
data.df <- data.frame(map$data)
nfeatures <- ncol(map.df)
# use the Kolmogorov-Smirnov Test to test whether the neurons and training data appear
# to come from the same distribution
ks.vector <- NULL
for(i in 1:nfeatures){
# note: suppressWarnings is used to silence the ks.test warning about ties
ks.vector[[i]] <- suppressWarnings(ks.test(map.df[[i]], data.df[[i]]))
}
prob.v <- map.significance(map,graphics=FALSE)
var.sum <- 0
# compute the variance captured by the map
for (i in 1:nfeatures)
{
# the second entry contains the p-value
if (ks.vector[[i]][[2]] > (1 - conf.int)) {
var.sum <- var.sum + prob.v[i]
} else {
# not converged - zero out the probability
prob.v[i] <- 0
}
}
# return the variance captured by converged features
if (verb)
prob.v
else
var.sum
}
# map.normalize -- based on the som::normalize function but preserves the column names
map.normalize <- function (x, byrow = TRUE)
{
if (is.data.frame(x))
{
if (byrow)
df <- data.frame(t(apply(x, 1, scale)))
else
df <- data.frame(apply(x, 2, scale))
names(df) <- names(x)
df
}
else
stop("map.normalize: 'x' is not a dataframe.\n")
}
# best.match -- given observation obs, return the best matching neuron
best.match <- function(map,obs,full=FALSE)
{
# NOTE: replicate obs so that there are nr rows of obs
obs.m <- matrix(as.numeric(obs),nrow(map$neurons),ncol(map$neurons),byrow=TRUE)
diff <- map$neurons - obs.m
squ <- diff * diff
s <- rowSums(squ)
d <- sqrt(s)
o <- order(d)
if (full)
o
else
o[1]
}
# coordinate -- convert from a row index to a map xy-coordinate
coordinate <- function(map,rowix)
{
x <- (rowix-1) %% map$xdim + 1
y <- (rowix-1) %/% map$xdim + 1
c(x,y)
}
#rowix -- convert from a map xy-coordinate to a row index
rowix <- function(map,x,y)
{
rix <- x + (y-1)*map$xdim
rix
}
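# Example: on a map with xdim=10, the neuron at map coordinate (x=3,y=2) lives in
# row 3 + (2-1)*10 = 13 of the neuron matrix, and coordinate(map,13) recovers c(3,2).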
# map.graphics.set -- set the graphics environment for our map utilities
# the return value is the original graphics param vector
map.graphics.set <- function()
{
par.v <- par()
par(ps=6)
par.v
}
# map.graphics.reset -- reset the graphics environment to the original state
# parameter - a vector containing the settings for the original state
map.graphics.reset <- function(par.vector)
{
par(ps=par.vector$ps)
}
### plot.heat - plot a heat map based on a 'map', this plot also contains the connected
# components of the map based on the landscape of the heat map
# parameters:
# - map is an object of type 'map'
# - heat is a 2D heat map of the map as computed by 'compute.umat' or 'compute.heat'
# - explicit controls the shape of the connected components
# - comp controls whether we plot the connected components on the heat map
# - merge controls whether we merge the starbursts together
# - merge.range - a fraction of the within-cluster distance used to decide whether
#   two clusters are close enough to be merged (see combine.decision)
# note: if labels are attached to the map (map$labels) they are printed on the heat map
plot.heat <- function(map,heat,explicit=FALSE,comp=TRUE,merge=FALSE,merge.range)
{
### keep an unaltered copy of the unified distance matrix,
### required for merging the starburst clusters
umat <- heat
x <- map$xdim
y <- map$ydim
nobs <- nrow(map$data)
count <- array(data=0,dim=c(x,y))
### need to make sure the map doesn't have a dimension of 1
if (x <= 1 || y <= 1)
{
stop("plot.heat: map dimensions too small")
}
### bin the heat values into 100 bins used for the 100 heat colors
heat.v <- as.vector(heat)
heat.v <- cut(heat.v,breaks=100,labels=FALSE)
heat <- array(data=heat.v,dim=c(x,y))
colors<- heat.colors(100)
### set up the graphics window
par.v <- map.graphics.set()
plot.new()
plot.window(xlim=c(0,x),ylim=c(0,y))
box()
title(xlab="x",ylab="y")
xticks <- seq(0.5,x-0.5,1)
yticks <- seq(0.5,y-0.5,1)
xlabels <- seq(1,x,1)
ylabels <- seq(1,y,1)
axis(1,at=xticks,labels=xlabels)
axis(3,at=xticks,labels=xlabels)
axis(2,at=yticks,labels=ylabels)
axis(4,at=yticks,labels=ylabels)
### plot the neurons as heat squares on the map
# TODO: vectorize this - rect can operate on vectors of coordinates and values
for (ix in 1:x)
{
for (iy in 1:y)
{
rect(ix-1,iy-1,ix,iy,col=colors[100 - heat[ix,iy] + 1],border=NA)
}
}
### put the connected component lines on the map
if (comp)
{
if(!merge){
# find the centroid for each neuron on the map
centroids <- compute.centroids(map,heat,explicit)
} else {
# find the unique centroids for the neurons on the map
centroids <- compute.combined.clusters(map,umat,explicit,merge.range)
}
# connect each neuron to its centroid
for(ix in 1:x)
{
for (iy in 1:y)
{
cx <- centroids$centroid.x[ix,iy]
cy <- centroids$centroid.y[ix,iy]
points(c(ix,cx)-.5,c(iy,cy)-.5,type="l",col="grey")
}
}
}
### put the labels on the map if available
if (!is.null(map$labels))
{
# count the labels in each map cell
for(i in 1:nobs)
{
nix <- map$visual[i]
c <- coordinate(map,nix)
ix <- c[1]
iy <- c[2]
count[ix,iy] <- count[ix,iy]+1
}
#count.df <- data.frame(count)
#print(count.df)
for(i in 1:nobs)
{
c <- coordinate(map,map$visual[i])
#cat("Coordinate of ",i," is ",c,"\n")
ix <- c[1]
iy <- c[2]
# we only print one label per cell
# TODO: print out majority label
if (count[ix,iy] > 0)
{
count[ix,iy] <- 0
ix <- ix - .5
iy <- iy - .5
l <- map$labels[i,1]
text(ix,iy,labels=l)
}
}
}
map.graphics.reset(par.v)
}
### compute.centroids -- compute the centroid for each point on the map
# parameters:
# - map is an object of type 'map'
# - heat is a matrix representing the heat map representation
# - explicit controls the shape of the connected component
# return value:
# - a list containing the matrices with the same x-y dims as the original map containing the centroid x-y coordinates
compute.centroids <- function(map,heat,explicit=FALSE)
{
xdim <- map$xdim
ydim <- map$ydim
centroid.x <- array(data=-1,dim=c(xdim,ydim))
centroid.y <- array(data=-1,dim=c(xdim,ydim))
max.val <- max(heat)
### recursive function to find the centroid of a point on the map
compute.centroid <- function(ix,iy)
{
# first we check if the current position is already associated
# with a centroid. if so, simply return the coordinates
# of that centroid
if (centroid.x[ix,iy] > -1 && centroid.y[ix,iy] > -1)
{
return(list(bestx=centroid.x[ix,iy],besty=centroid.y[ix,iy]))
}
# try to find a smaller value in the immediate neighborhood
# make our current position the square with the minimum value.
# if a minimum value other than our own current value cannot be
# found then we are at a minimum.
#
# search the neighborhood; three different cases: inner element, corner element, side element
# TODO: there has to be a better way! (a more compact alternative is sketched after this function)
min.val <- heat[ix,iy]
min.x <- ix
min.y <- iy
# (ix,iy) is an inner map element
if (ix > 1 && ix < xdim && iy > 1 && iy < ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is bottom left corner
else if (ix == 1 && iy == 1)
{
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
}
# (ix,iy) is bottom right corner
else if (ix == xdim && iy == 1)
{
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is top right corner
else if (ix == xdim && iy == ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is top left corner
else if (ix == 1 && iy == ydim)
{
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
}
# (ix,iy) is a left side element
else if (ix == 1 && iy > 1 && iy < ydim)
{
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
}
# (ix,iy) is a bottom side element
else if (ix > 1 && ix < xdim && iy == 1 )
{
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is a right side element
else if (ix == xdim && iy > 1 && iy < ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is a top side element
else if (ix > 1 && ix < xdim && iy == ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
#if successful
# move to the square with the smaller value, i.e., call compute.centroid on this new square
# note the RETURNED x-y coords in the centroid.x and centroid.y matrix at the current location
# return the RETURNED x-y coordinates
if (min.x != ix || min.y != iy)
{
r.val <- compute.centroid(min.x,min.y)
# if explicit is set show the exact connected component
# otherwise construct a connected component where all
# nodes are connected to a central node
if (explicit)
{
centroid.x[ix,iy] <<- min.x
centroid.y[ix,iy] <<- min.y
list(bestx=min.x,besty=min.y)
}
else
{
centroid.x[ix,iy] <<- r.val$bestx
centroid.y[ix,iy] <<- r.val$besty
r.val
}
}
#else
# we have found a minimum
# note the current x-y in the centroid.x and centroid.y matrix
# return the current x-y coordinates
else
{
centroid.x[ix,iy] <<- ix
centroid.y[ix,iy] <<- iy
list(bestx=ix,besty=iy)
}
} # end function compute.centroid
### iterate over the map and find the centroid for each element
for (i in 1:xdim)
{
for (j in 1:ydim)
{
compute.centroid(i,j)
}
}
list(centroid.x=centroid.x,centroid.y=centroid.y)
}
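### note: the explicit neighborhood case analysis in compute.centroid above (see the
### TODO) could in principle be replaced by a single scan over the 3x3 neighborhood
### clipped to the map border. The function below is an illustrative sketch of that
### idea only -- it is not called anywhere in this file. Ties keep the current cell
### because the comparison is strict; the tie-breaking order may differ slightly from
### the explicit cases above.
min.neighbor.sketch <- function(heat,ix,iy,xdim,ydim)
{
best.x <- ix
best.y <- iy
best.val <- heat[ix,iy]
for (nx in max(1,ix-1):min(xdim,ix+1))
{
for (ny in max(1,iy-1):min(ydim,iy+1))
{
if (heat[nx,ny] < best.val)
{
best.val <- heat[nx,ny]
best.x <- nx
best.y <- ny
}
}
}
list(min.x=best.x,min.y=best.y,min.val=best.val)
}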
### compute.umat -- compute the unified distance matrix
# parameters:
# - map is an object of type 'map'
# - smoothing is either NULL, 0, or a positive floating point value controlling the
# smoothing of the umat representation
# return value:
# - a matrix with the same x-y dims as the original map containing the umat values
compute.umat <- function(map,smoothing=NULL)
{
d <- dist(data.frame(map$neurons))
umat <- compute.heat(map,d,smoothing)
umat
}
### compute.heat -- compute a heat value map representation of the given distance matrix
# parameters:
# - map is an object of type 'map'
# - d is a distance matrix computed via the 'dist' function
# - smoothing is either NULL, 0, or a positive floating point value controlling the
# smoothing of the umat representation
# return value:
# - a matrix with the same x-y dims as the original map containing the heat
compute.heat <- function(map,d.in,smoothing=NULL)
{
d <- as.matrix(d.in)
x <- map$xdim
y <- map$ydim
heat <- array(data=0,dim=c(x,y))
if (x == 1 || y == 1)
stop("compute.heat: heat map can not be computed for a map with a dimension of 1")
# this function translates our 2-dim map coordinates
# into the 1-dim coordinates of the neurons
xl <- function(ix,iy)
{
#cat("converting (",ix,",",iy,") to row", ix + (iy-1) *xdim,"\n")
ix + (iy-1) * x
}
# check if the map is larger than 2 x 2 (otherwise it is only corners)
if (x > 2 && y > 2)
{
# iterate over the inner nodes and compute their umat values
for (ix in 2:(x-1))
{
for (iy in 2:(y-1))
{
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/8
}
}
# iterate over bottom x axis
for (ix in 2:(x-1))
{
iy <- 1
sum <-
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
# iterate over top x axis
for (ix in 2:(x-1))
{
iy <- y
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
# iterate over the left y-axis
for (iy in 2:(y-1))
{
ix <- 1
sum <-
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)]
heat[ix,iy] <- sum/5
}
# iterate over the right y-axis
for (iy in 2:(y-1))
{
ix <- x
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
} # end if
# compute umat values for corners
if (x >= 2 && y >= 2)
{
# bottom left corner
ix <- 1
iy <- 1
sum <-
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)]
heat[ix,iy] <- sum/3
# bottom right corner
ix <- x
iy <- 1
sum <-
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/3
# top left corner
ix <- 1
iy <- y
sum <-
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)]
heat[ix,iy] <- sum/3
# top right corner
ix <- x
iy <- y
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/3
} # end if
# smooth the heat map
xcoords <- c()
ycoords <- c()
for (i in 1:y) {
for (j in 1:x) {
ycoords <- c(ycoords, i)
xcoords <- c(xcoords, j)
}
}
xycoords <- data.frame(xcoords,ycoords)
if (!is.null(smoothing)) {
if (smoothing == 0)
heat <- smooth.2d(as.vector(heat),x=as.matrix(xycoords),nrow=x,ncol=y,surface=FALSE)
else if (smoothing > 0)
heat <- smooth.2d(as.vector(heat),x=as.matrix(xycoords),nrow=x,ncol=y,surface=FALSE,theta=smoothing)
else
stop("compute.heat: bad value for smoothing parameter")
}
heat
}
### df.var.test -- a function that applies the F-test testing the ratio
# of the variances of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the F-test (default .95)
df.var.test <- function(df1,df2,conf = .95)
{
if (length(df1) != length(df2))
stop("df.var.test: cannot compare variances of data frames")
# init our working arrays
var.ratio.v <- array(data=1,dim=length(df1))
var.confintlo.v <- array(data=1,dim=length(df1))
var.confinthi.v <- array(data=1,dim=length(df1))
# compute the F-test on each feature in our populations
for (i in 1:length(df1))
{
t <- var.test(df1[[i]],df2[[i]],conf.level=conf)
var.ratio.v[i] <- t$estimate
#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
var.confintlo.v[i] <- t$conf.int[1]
var.confinthi.v[i] <- t$conf.int[2]
}
# return a list with the ratios and conf intervals for each feature
list(ratio=var.ratio.v,conf.int.lo=var.confintlo.v,conf.int.hi=var.confinthi.v)
}
### df.mean.test -- a function that applies the t-test testing the difference
# of the means of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the t-test (default .95)
df.mean.test <- function(df1,df2,conf = .95)
{
if (ncol(df1) != ncol(df2))
stop("df.mean.test: cannot compare means of data frames")
# init our working arrays
mean.diff.v <- array(data=1,dim=ncol(df1))
mean.confintlo.v <- array(data=1,dim=ncol(df1))
mean.confinthi.v <- array(data=1,dim=ncol(df1))
# compute the t-test on each feature in our populations
for (i in 1:ncol(df1)) {
t <- t.test(x=df1[[i]],y=df2[[i]],conf.level=conf)
mean.diff.v[i] <- t$estimate[1] - t$estimate[2]
#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
mean.confintlo.v[i] <- t$conf.int[1]
mean.confinthi.v[i] <- t$conf.int[2]
}
# return a list with the mean differences and conf intervals for each feature
list(diff=mean.diff.v,conf.int.lo=mean.confintlo.v,conf.int.hi=mean.confinthi.v)
}
### vsom.f - vectorized and optimized version of the stochastic SOM training algorithm written in Fortran90
vsom.f <- function(data,xdim,ydim,alpha,train)
{
### some constants
dr <- nrow(data)
dc <- ncol(data)
nr <- xdim*ydim
nc <- dc # dim of data and neurons is the same
### build and initialize the matrix holding the neurons
cells <- nr * nc # no. of neurons times number of data dimensions
v <- runif(cells,-1,1) # vector with small init values for all neurons
# NOTE: each row represents a neuron, each column represents a dimension.
neurons <- matrix(v,nrow=nr,ncol=nc) # rearrange the vector as matrix
result <- .Fortran("vsom",
as.single(neurons),
as.single(data.matrix(data)),
as.integer(dr),
as.integer(dc),
as.integer(xdim),
as.integer(ydim),
as.single(alpha),
as.integer(train),
PACKAGE="popsom2")
# unpack the structure and list in result[1]
v <- result[1]
neurons <- matrix(v[[1]],nrow=nr,ncol=nc,byrow=FALSE) # rearrange the result vector as matrix
neurons
}
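### note: vsom.f delegates the actual training loop to Fortran. The function below is an
### illustrative pure-R sketch of a single stochastic training step (it is not used by
### this file): find the best matching unit for one observation and pull every neuron
### within a square neighborhood of radius nsize toward that observation by a factor alpha.
### Assumes 'neurons' is a numeric matrix laid out on the grid following the
### rowix/coordinate conventions above, and 'x.obs' is one observation as a numeric vector.
som.step.sketch <- function(neurons,x.obs,xdim,alpha,nsize)
{
# grid coordinates of every neuron
grid.x <- (seq_len(nrow(neurons))-1) %% xdim + 1
grid.y <- (seq_len(nrow(neurons))-1) %/% xdim + 1
# best matching unit: the neuron closest to the observation (Euclidean distance)
diffs <- sweep(neurons,2,x.obs)
bmu <- which.min(rowSums(diffs*diffs))
# neurons within the neighborhood radius (chessboard distance on the grid)
hood <- pmax(abs(grid.x-grid.x[bmu]),abs(grid.y-grid.y[bmu])) <= nsize
# standard SOM update: w <- w + alpha*(x - w) for all neurons in the neighborhood
neurons[hood,] <- neurons[hood,] - alpha*diffs[hood,]
neurons
}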
### Functions to combine connected components that represent the same cluster
### compute.combined.clusters -- top level for the distance matrix oriented cluster combine
# parameters:
# - map is an object of type 'map'
# - heat is a unified distance matrix
# - explicit controls the shape of the connected components
# - range is the merge fraction passed down from map.starburst (merge.range)
compute.combined.clusters <- function(map,heat,explicit,range) {
# compute the connected components
centroids <- compute.centroids(map,heat,explicit)
#Get unique centroids
unique.centroids <- get.unique.centroids(map, centroids)
#Get distance from centroid to cluster elements for all centroids
within_cluster_dist <- distance.from.centroids(map, centroids, unique.centroids, heat)
#Get average pairwise distance between clusters
between_cluster_dist <- distance.between.clusters(map, centroids, unique.centroids, heat)
#Get a boolean matrix of whether two components should be combined
combine_cluster_bools <- combine.decision(within_cluster_dist, between_cluster_dist, range)
#Create the modified connected components grid
new_centroid <- new.centroid(combine_cluster_bools, centroids, unique.centroids, map)
new_centroid
}
### get.unique.centroids -- a function that computes a list of unique centroids from
# a matrix of centroid locations.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
get.unique.centroids <- function(map, centroids){
# get the dimensions of the map
xdim <- map$xdim
ydim <- map$ydim
xlist <- c()
ylist <- c()
x.centroid <- centroids$centroid.x
y.centroid <- centroids$centroid.y
for(ix in 1:xdim){
for(iy in 1:ydim) {
cx <- x.centroid[ix, iy]
cy <- y.centroid[ix, iy]
# check whether the (cx,cy) centroid pair has already been recorded;
# if not, append both coordinates to the respective lists
if(!any(xlist == cx & ylist == cy)){
xlist <- c(xlist, cx)
ylist <- c(ylist, cy)
}
}
}
# return a list of unique centroid positions
list(position.x=xlist, position.y=ylist)
}
### distance.from.centroids -- A function to get the average distance from
# centroid by cluster.
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - heat is a unified distance matrix
distance.from.centroids <- function(map, centroids, unique.centroids, heat){
xdim <- map$xdim
ydim <- map$ydim
centroids.x.positions <- unique.centroids$position.x
centroids.y.positions <- unique.centroids$position.y
within <- c()
for (i in 1:length(centroids.x.positions)){
cx <- centroids.x.positions[i]
cy <- centroids.y.positions[i]
# compute the average distance
distance <- cluster.spread(cx, cy, heat, centroids, map)
# append the computed distance to the list of distances
within <- c(within, distance)
}
# return the list
within
}
### cluster.spread -- Function to calculate the average distance in
# one cluster given one centroid.
#
# parameters:
# - x - x position of a unique centroid
# - y - y position of a unique centroid
# - umat - a unified distance matrix
# - centroids - a matrix of the centroid locations in the map
# - map is an object of type 'map'
cluster.spread <- function(x, y, umat, centroids, map){
centroid.x <- x
centroid.y <- y
sum <- 0
elements <- 0
xdim <- map$xdim
ydim <- map$ydim
centroid_weight <- umat[centroid.x, centroid.y]
for(xi in 1:xdim){
for(yi in 1:ydim){
cx <- centroids$centroid.x[xi, yi]
cy <- centroids$centroid.y[xi, yi]
if(cx == centroid.x && cy == centroid.y){
cweight <- umat[xi,yi]
sum <- sum + abs(cweight - centroid_weight)
elements <- elements + 1
}
}
}
average <- sum / elements
average
}
### distance.between.clusters -- A function to compute the average pairwise
# distance between clusters.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - umat - a unified distance matrix
distance.between.clusters <- function(map, centroids, unique.centroids, umat){
cluster_elements <- list.clusters(map, centroids, unique.centroids, umat)
cluster_elements <- sapply(cluster_elements,'[',seq(max(sapply(cluster_elements,length))))
columns <- ncol(cluster_elements)
cluster_elements <- matrix(unlist(cluster_elements),ncol = ncol(cluster_elements),byrow = FALSE)
cluster_elements <- apply(combn(ncol(cluster_elements), 2), 2, function(x)
abs(cluster_elements[, x[1]] - cluster_elements[, x[2]]))
mean <- colMeans(cluster_elements, na.rm=TRUE)
index <- 1
mat <- matrix(data=NA, nrow=columns, ncol=columns)
for(xi in 1:(columns-1)){
for (yi in xi:(columns-1)){
mat[xi, yi + 1] <- mean[index]
mat[yi + 1, xi] <- mean[index]
index <- index + 1
}
}
mat
}
### list.clusters -- A function to get the clusters as a list of lists.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - umat - a unified distance matrix
list.clusters <- function(map,centroids,unique.centroids,umat){
centroids.x.positions <- unique.centroids$position.x
centroids.y.positions <- unique.centroids$position.y
componentx <- centroids$centroid.x
componenty <- centroids$centroid.y
cluster_list <- list()
for(i in 1:length(centroids.x.positions)){
cx <- centroids.x.positions[i]
cy <- centroids.y.positions[i]
# get the clusters associated with a unique centroid and store it in a list
cluster_list[i] <- list.from.centroid(map, cx, cy, centroids, umat)
}
cluster_list
}
### list.from.centroid -- A function to get all cluster elements
# associated with one centroid.
#
# parameters:
# - map is an object of type 'map'
# - x - the x position of a centroid
# - y - the y position of a centroid
# - centroids - a matrix of the centroid locations in the map
# - umat - a unified distance matrix
list.from.centroid <- function(map,x,y,centroids,umat){
centroid.x <- x
centroid.y <- y
sum <- 0
xdim <- map$xdim
ydim <- map$ydim
centroid_weight <- umat[centroid.x, centroid.y]
cluster_list <- c()
for(xi in 1:xdim){
for(yi in 1:ydim){
cx <- centroids$centroid.x[xi, yi]
cy <- centroids$centroid.y[xi, yi]
if(cx == centroid.x && cy == centroid.y){
cweight <- umat[xi, yi]
cluster_list <- c(cluster_list, cweight)
}
}
}
list(cluster_list)
}
### combine.decision -- A function that produces a boolean matrix
# representing which clusters should be combined.
#
# parameters:
# - within_cluster_dist - A list of the distances from centroid to cluster elements for all centroids
# - distance_between_clusters - A list of the average pairwise distance between clusters
# - range - the merge fraction: two clusters are merged when their average pairwise
#   distance is less than (1 + range) times either cluster's within-cluster distance
combine.decision <- function(within_cluster_dist,distance_between_clusters,range){
inter_cluster <- distance_between_clusters
centroid_dist <- within_cluster_dist
dim <- dim(inter_cluster)[1]
to_combine <- matrix(data=FALSE, nrow=dim, ncol=dim)
for(xi in 1:dim){
for(yi in 1:dim){
cdist <- inter_cluster[xi,yi]
if(!is.na(cdist)){
rx <- centroid_dist[xi] * range
ry <- centroid_dist[yi] * range
if(cdist < (centroid_dist[xi] + rx) || cdist < (centroid_dist[yi] + ry)){
to_combine[xi, yi] <- TRUE
}
}
}
}
to_combine
}
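# Worked example of the merge criterion above: with range=.25, a cluster whose elements
# lie on average 0.2 umat-units from its centroid is marked for merging with any cluster
# whose average pairwise distance to it is below 0.2*(1+.25) = 0.25.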
### swap.centroids -- A function that changes every instance of a centroid to
# one that it should be combined with.
# parameters:
# - map is an object of type 'map'
# - x1, y1 - the map coordinates of the centroid to be replaced
# - x2, y2 - the map coordinates of the centroid that replaces it
# - centroids - a matrix of the centroid locations in the map
swap.centroids <- function(map, x1, y1, x2, y2, centroids){
xdim <- map$xdim
ydim <- map$ydim
compn_x <- centroids$centroid.x
compn_y <- centroids$centroid.y
for(xi in 1:xdim){
for(yi in 1:ydim){
if(compn_x[xi,yi] == x1 && compn_y[xi,yi] == y1){
compn_x[xi,yi] <- x2
compn_y[xi,yi] <- y2
}
}
}
list(centroid.x=compn_x, centroid.y=compn_y)
}
### new.centroid -- A function to combine centroids based on matrix of booleans.
#
# parameters:
#
# - bmat - a boolean matrix containing the centroids to merge
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - map is an object of type 'map'
new.centroid <- function(bmat, centroids, unique.centroids, map){
bmat.rows <- dim(bmat)[1]
bmat.columns <- dim(bmat)[2]
centroids_x <- unique.centroids$position.x
centroids_y <- unique.centroids$position.y
components <- centroids
for(xi in 1:bmat.rows){
for(yi in 1:bmat.columns){
if(bmat[xi,yi] == TRUE){
x1 <- centroids_x[xi]
y1 <- centroids_y[xi]
x2 <- centroids_x[yi]
y2 <- centroids_y[yi]
components <- swap.centroids(map, x1, y1, x2, y2, components)
}
}
}
components
}
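### avg.homogeneity -- compute the average label homogeneity of the starburst clusters
# parameters:
# - map is an object of type 'map' with attached labels
# - explicit controls the shape of the connected components
# - smoothing controls the smoothing level of the umat (NULL,0,>0)
# - merge/merge.range - see map.starburst
# return value:
# - a list with the average homogeneity (homog) and the number of clusters (nclust)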
avg.homogeneity <- function(map,explicit=FALSE,smoothing=2,merge=FALSE,merge.range=.25)
{
### keep an unaltered copy of the unified distance matrix,
### required for merging the starburst clusters
umat <- compute.umat(map,smoothing=smoothing)
x <- map$xdim
y <- map$ydim
nobs <- nrow(map$data)
centroid.labels <- array(data=list(),dim=c(x,y))
### need to make sure the map doesn't have a dimension of 1
if (x <= 1 || y <= 1)
{
stop("avg.homogeneity: map dimensions too small")
}
if (is.null(map$labels))
{
stop("avg.homogeneity: you need to attach labels to the map")
}
if(!merge)
{
# find the centroid for each neuron on the map
centroids <- compute.centroids(map,umat,explicit)
}
else
{
# find the unique centroids for the neurons on the map
centroids <- compute.combined.clusters(map,umat,explicit,merge.range)
}
### attach labels to centroids
# count the labels in each map cell
for(i in 1:nobs)
{
lab <- as.character(map$labels[i,1])
nix <- map$visual[i]
c <- coordinate(map,nix)
ix <- c[1]
iy <- c[2]
cx <- centroids$centroid.x[ix,iy]
cy <- centroids$centroid.y[ix,iy]
centroid.labels[[cx,cy]] <- append(centroid.labels[[cx,cy]],lab)
}
### compute average homogeneity of the map: h = (1/nobs)*sum_c majority.label_c
sum.majority <- 0
n.centroids <- 0
for (ix in 1:x)
{
for (iy in 1:y)
{
label.v <- centroid.labels[[ix,iy]]
if (length(label.v)!=0)
{
n.centroids <- n.centroids + 1
majority <- data.frame(sort(table(label.v),decreasing=TRUE))
if (nrow(majority) == 1) # only one label
{
m.val <- length(label.v)
}
else
{
m.val <- majority[1,2]
}
sum.majority <- sum.majority + m.val
}
}
}
list(homog=sum.majority/nobs, nclust=n.centroids)
}