### map-utils.R
# (c) University of Rhode Island
# Lutz Hamel, Benjamin Ott, Greg Breard,
# Robert Tatoian, Vishakh Gopu, Michael Eiger
#
# This file constitutes a set of routines which are useful in constructing
# and evaluating self-organizing maps (SOMs).
# The main utilities available in this file are:
# map.build -------- constructs a map
# map.summary ------ computes a summary object
# map.convergence -- details of the convergence index of a map
# map.starburst ---- displays the starburst representation of the SOM model,
#                    the centers of starbursts are the centers of clusters
# map.fitted ------- returns a vector of labels assigned to the observations
# map.predict ------ returns classification labels for points in DF
# map.position ----- returns the position of points on the map
# map.significance - graphically reports the significance of each feature with
#                    respect to the self-organizing map model
# map.marginal ----- displays a density plot of a training dataframe dimension
#                    overlaid with the neuron density for that same dimension
#                    or index.
#
### License
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#
### Helpful docs
# medium.com/@ODSC/unsupervised-learning-evaluating-clusters-bd47eed175ce
# en.wikipedia.org/wiki/Total_sum_of_squares
###
# load libraries
loadNamespace("fields")
loadNamespace("graphics")
loadNamespace("ggplot2")
loadNamespace("hash")
### constructor ###
# map.build -- construct a SOM, returns an object of class 'map'
#
# parameters:
# - data - a dataframe where each row contains an unlabeled training instance
# - labels - a vector or dataframe with one label for each observation in data
# - xdim,ydim - the dimensions of the map
# - alpha - the learning rate, should be a positive non-zero real number
# - train - number of training iterations
# - normalize - normalize the input data by row
# - seed - a seed value for repeatability of random initialization and selection
# - minimal - when true only the trained neurons are returned.
# value:
# - an object of type 'map' -- see below
map.build <- function(data,
labels=NULL,
xdim=10,
ydim=5,
alpha=.3,
train=1000,
normalize=FALSE,
seed=NULL,
minimal=FALSE)
{
if (alpha <= 0 || alpha > 1)
stop("invalid value for alpha")
if (xdim < 5 || ydim < 5)
stop("map is too small.")
if (!is.data.frame(data))
stop("training data has to be a data frame")
if (!all(sapply(data,is.numeric)))
stop("only numeric data can be used for training")
if (!is.null(labels) && !is.data.frame(labels))
labels <- data.frame(labels)
if (normalize)
data <- map.normalize(data)
if (!is.null(seed) && seed <= 0)
stop("seed value has to be a positive integer value")
if (!is.null(seed) && !test.integer(seed))
stop("seed value has to be a positive integer value")
if (!test.integer(train))
stop("train value has to be a positive integer value")
# train the neural network
neurons <- vsom.f(data,
xdim=xdim,
ydim=ydim,
alpha=alpha,
train=train,
seed=seed)
# make the neuron data a data frame
neurons <- data.frame(neurons)
names(neurons) <- names(data)
# construct the map object
map <- list(data=data,
labels=labels,
xdim=xdim,
ydim=ydim,
alpha=alpha,
train=train,
normalize=normalize,
seed=seed,
neurons=neurons)
# add the class name
if (minimal)
{
class(map) <- "map.minimal"
return(map)
}
else
{
class(map) <- "map"
}
# NOTE: do not change the order of the following computations
# add the heat map
map$heat <- compute.heat(map)
# list of indexes of the best matching neuron for each observation.
# each index is a row index into the 'neurons' data frame.
map$fitted.obs <- map.fitted.obs(map)
# map of centroids - this is a map where each cell points
# to the location on the map where the corresponding centroid
# is located. centroids point to themselves.
map$centroids <- compute.centroids(map)
# a list of actual centroid locations on the map
map$unique.centroids <- get.unique.centroids(map)
# this is a map where the locations of centroids have a label associated
# with them. if the data are unlabeled, labels are invented for the centroids.
map$centroid.labels <- majority.labels(map)
# label-to-centroid lookup table
# Note: a label can be associated with multiple centroids
map$label.to.centroid <- compute.label.to.centroid(map)
# a vector of lists of observations per centroid indexed
# by the centroid number from unique.centroids
map$centroid.obs <- compute.centroid.obs(map)
### quality measures of the model ###
# the convergence index of a map
map$convergence <- map.convergence(map,verb=FALSE)
# compute the average within cluster sum of squares (wcss)
# this is the average distance variance within the clusters of the
# underlying cluster model
map$wcss <- compute.wcss(map)
# compute the between cluster sum of squares (bcss)
# this is the distance variance between the cluster centroids of the
# underlying cluster model
map$bcss <- compute.bcss(map)
return(map)
}
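# Example usage (a sketch; assumes the package is loaded and uses the
# built-in 'iris' data frame):
#   m <- map.build(iris[,-5],labels=iris[,5],xdim=15,ydim=10,train=10000,seed=42)
#   m$convergence  # convergence index of the trained map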
# map.summary - compute a summary object
# parameters:
# map - an object of type 'map'
# verb - a switch controlling the output
# value:
# a summary object of type 'summary.map'
map.summary <- function(map, verb=TRUE)
{
if (!inherits(map, "map"))
stop("first argument is not a map object.")
value <- list()
# training parameters
header <- c("xdim",
"ydim",
"alpha",
"train",
"normalize",
"seed",
"instances",
"columns")
v <- c(map$xdim,
map$ydim,
map$alpha,
map$train,
if (map$normalize) "TRUE" else "FALSE",
if (is.null(map$seed)) "NULL" else map$seed,
nrow(map$data),
ncol(map$data))
df <- data.frame(t(v))
names(df) <- header
row.names(df) <- " "
value$training.parameters <- df
# quality assessments
header <- c("convergence","separation","clusters")
v <- c(map$convergence,
1.0 - map$wcss/map$bcss,
length(map$unique.centroids))
df <- data.frame(t(v))
names(df) <- header
row.names(df) <- " "
value$quality.assessments <- df
class(value) <- "summary.map"
if (verb)
{
cat("\n")
cat("Training Parameters:\n")
print(value$training.parameters)
cat("\n")
cat("Quality Assessments:\n")
print(format(value$quality.assessments,digits=2))
cat("\n")
invisible(value)
}
else
{
value
}
}
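# Example usage (a sketch, continuing the map.build example above):
#   map.summary(m)                 # prints the two summary tables
#   s <- map.summary(m,verb=FALSE) # returns the 'summary.map' object instead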
# map.starburst - compute and display the starburst representation of clusters
# parameters:
# - map is an object of type 'map'
map.starburst <- function(map)
{
if (!inherits(map, "map"))
stop("first argument is not a map object.")
plot.heat(map)
}
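# Example usage (a sketch, continuing the example above):
#   map.starburst(m)  # plots the heat map with starburst cluster overlays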
# map.significance - compute the relative significance of each feature and plot it
# parameters:
# - map is an object of type 'map'
# - graphics is a switch that controls whether a plot is generated or not
# - feature.labels is a switch to allow the plotting of feature names vs
# feature indices
# return value:
# - a vector containing the significance for each feature
map.significance <- function(map,graphics=TRUE,feature.labels=TRUE)
{
if (!inherits(map, "map"))
stop("first argument is not a map object.")
data.df <- data.frame(map$data)
nfeatures <- ncol(data.df)
# compute the variance of each feature in the training data
var.v <- array(data=1,dim=nfeatures)
for (i in 1:nfeatures)
{
var.v[i] <- var(data.df[[i]])
}
# we use the variance of a feature as the likelihood of it
# being an important feature; compute the Bayesian
# probability of significance using uniform priors
var.sum <- sum(var.v)
prob.v <- var.v/var.sum
# plot the significance
if (graphics)
{
par.v <- map.graphics.set()
y <- max(prob.v)
plot.new()
plot.window(xlim=c(1,nfeatures),ylim=c(0,y))
box()
title(xlab="Features",ylab="Significance")
xticks <- seq(1,nfeatures,1)
yticks <- seq(0,y,y/4)
if (feature.labels)
xlabels <- names(data.df)
else
xlabels <- seq(1,nfeatures,1)
ylabels <- formatC(seq(0,y,y/4),digits=2)
axis(1,at=xticks,labels=xlabels)
axis(2,at=yticks,labels=ylabels)
points(1:nfeatures,prob.v,type="h")
map.graphics.reset(par.v)
}
else
{
prob.v
}
}
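# Example usage (a sketch; with graphics=FALSE the significance vector is
# returned instead of plotted):
#   sig <- map.significance(m,graphics=FALSE)
#   sig  # one probability per feature, the probabilities sum to 1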
# map.marginal - plot that shows the marginal probability distribution of the
# neurons and data
# parameters:
# - map is an object of type 'map'
# - marginal is the name of a training data frame dimension or index
map.marginal <- function(map,marginal)
{
# ensure that map is a 'map' object
if (!inherits(map, "map"))
stop("first argument is not a map object.")
# if the second argument is not a character string, treat it as a column index
if (!is.character(marginal))
{
train <- data.frame(points = map$data[[marginal]])
neurons <- data.frame(points = map$neurons[[marginal]])
train$legend <- 'training data'
neurons$legend <- 'neurons'
hist <- rbind(train,neurons)
ggplot(hist, aes(points, fill = legend)) +
geom_density(alpha = 0.2) +
xlab(names(map$data)[marginal])
}
else if (marginal %in% names(map$data))
{
train <- data.frame(points = map$data[names(map$data) == marginal])
colnames(train) <- c("points")
neurons <- data.frame(points = map$neurons[names(map$neurons) == marginal])
colnames(neurons) <- c("points")
train$legend <- 'training data'
neurons$legend <- 'neurons'
hist <- rbind(train,neurons)
ggplot(hist, aes(points, fill = legend)) +
geom_density(alpha = 0.2) +
xlab(marginal)
}
else
{
stop("second argument is not a data dimension or index")
}
}
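# Example usage (a sketch; the dimension can be given by name or by index):
#   map.marginal(m,"Sepal.Length")
#   map.marginal(m,1)  # the same dimension, addressed by column index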
# map.fitted -- returns a vector of labels assigned to the observations
# parameters:
# - map is an object of type 'map'
# value:
# - a vector of labels
map.fitted <- function(map)
{
if (!inherits(map, "map"))
stop("first argument is not a map object.")
nobs <- length(map$fitted.obs)
labels <- c()
for (i in 1:nobs)
{
nix <- map$fitted.obs[[i]]
coord <- coordinate(map,nix)
x <- coord$x
y <- coord$y
c.x <- map$centroids[[x,y]]$x
c.y <- map$centroids[[x,y]]$y
l <- map$centroid.labels[[c.x,c.y]]
labels <- c(labels,l)
}
labels
}
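# Example usage (a sketch, continuing the iris example above):
#   fitted.labels <- map.fitted(m)
#   table(fitted.labels,iris[,5])  # cross-tabulate against the true labels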
# map.predict -- returns classification labels for points in DF
# parameters:
# - map -- map object
# - points -- data frame of points to be classified
# value:
# - a data frame with a predicted label and a confidence value for each point
map.predict <- function (map,points)
{
# local function to do the actual prediction
predict.point <- function (x)
{
if (!is.vector(x))
stop("argument has to be a vector.")
if (length(x) != ncol(map$data))
stop("predict vector dimensionality is incompatible")
if (map$normalize)
x <- as.vector(map.normalize(x))
# find best matching neuron
nix <- best.match(map,x)
# find corresponding centroid
coord <- coordinate(map,nix)
ix <- coord$x
iy <- coord$y
c.xy <- map$centroids[[ix,iy]]
c.nix <- rowix(map,c.xy)
label <- map$centroid.labels[[c.xy$x,c.xy$y]]
c.ix <- find.centroidix(map,c.xy)
# compute the confidence of the prediction
# compute x to centroid distance
vectors <- rbind(map$neurons[c.nix,],x)
x.to.c.distance <- max(as.matrix(dist(vectors))[1,])
# compute the max radius of the cluster using the training observations
# assigned to it; the centroid neuron serves only as the reference vector
# in row 1 of the table
vectors <- map$neurons[c.nix,]
for (i in 1:length(map$centroid.obs[[c.ix]]))
{
obs.ix <- map$centroid.obs[[c.ix]][i]
vectors <- rbind(vectors,map$data[obs.ix,])
}
max.o.to.c.distance <- max(as.matrix(dist(vectors))[1,])
# add a little bit of slack so we don't wind up with a 0 confidence value
max.o.to.c.distance <- max.o.to.c.distance + 0.05*max.o.to.c.distance
# compute confidence value
conf <- 1.0 - (x.to.c.distance/max.o.to.c.distance)
return (c(label,conf))
}
if (is.vector(points))
points <- t(data.frame(points))
m <- data.frame(t(apply(points,1,predict.point)))
names(m) <- c("Label", "Confidence")
m
}
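# Example usage (a sketch; each point must have the dimensionality of the
# training data):
#   map.predict(m,c(5.1,3.5,1.4,0.2))  # one row with Label and Confidence
#   map.predict(m,iris[1:3,-5])        # one row per observation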
# map.position -- returns the position of points on the map
# parameters:
# - map -- map object
# - points -- a data frame of points to be mapped
# value:
# - a data frame with the x-y map coordinates of each point
map.position <- function (map,points)
{
# local function to position a point on the map
position.point <- function(x)
{
if (!is.vector(x))
stop("argument has to be a vector.")
if (length(x) != ncol(map$data))
stop("vector dimensionality is incompatible")
if (map$normalize)
x <- as.vector(map.normalize(x))
nix <- best.match(map,x)
coord <- coordinate(map,nix)
return (c(coord$x,coord$y))
}
if (is.vector(points))
points <- t(data.frame(points))
m <- data.frame(t(apply(points,1,position.point)))
names(m) <- c("x-dim", "y-dim")
m
}
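# Example usage (a sketch):
#   map.position(m,iris[1:3,-5])  # x-y map coordinates of the first 3 points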
# map.convergence - details of the convergence index of a map
#
# parameters:
# - map is an object of type 'map'
# - conf.int is the confidence interval of the quality assessment (default 95%)
# - k is the number of samples used for the estimated topographic accuracy
# computation
# - verb if true reports the two convergence components separately, otherwise
# it will report the linear combination of the two
# - ks is a switch, true for ks-test, false for standard var and means test
#
# - return value is the convergence index
map.convergence <- function(map,conf.int=.95,k=50,verb=TRUE,ks = TRUE)
{
if (ks)
embed <- map.embed.ks(map,conf.int,verb=FALSE)
else
embed <- map.embed.vm(map,conf.int,verb=FALSE)
topo <- map.topo(map,k,conf.int,verb=FALSE,interval=FALSE)
if (verb)
return (list(embed=embed,topo=topo))
else
return (0.5*embed + 0.5*topo)
}
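# Example usage (a sketch; verb=TRUE reports the embedding accuracy and the
# estimated topographic accuracy separately rather than their average):
#   map.convergence(m,verb=TRUE)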
#############################################################################
############################# local functions ###############################
#############################################################################
# test.integer -- test to see if x is an integer value
test.integer <- function(x)
{
# all.equal returns a string (not FALSE) on mismatch, so wrap it in isTRUE
isTRUE(all.equal(x, as.integer(x), check.attributes = FALSE))
}
# find.centroidix -- given a coordinate find the ix into the
# unique.centroids table
find.centroidix <- function (map,cd)
{
for (i in 1:length(map$unique.centroids))
{
if (cd$x == map$unique.centroids[[i]]$x &&
cd$y == map$unique.centroids[[i]]$y)
{
return (i)
}
}
stop("coordinate not a centroid")
}
# compute.centroid.obs -- compute the observations that belong to each
# centroid.
compute.centroid.obs <- function (map)
{
centroid.obs <- array(list(),dim=length(map$unique.centroids))
for (cluster.ix in 1:length(map$unique.centroids))
{
c.nix <- rowix(map,map$unique.centroids[[cluster.ix]])
for (i in 1:nrow(map$data))
{
# find the centroid of the current observation's
# best matching neuron
coord <- coordinate(map,map$fitted.obs[i])
# centroid of cluster the neuron belongs to
c.obj.nix <- rowix(map,map$centroids[[coord$x,coord$y]])
# if observation centroid equal current centroid add to vectors
if (c.obj.nix == c.nix)
{
centroid.obs[[cluster.ix]] <- append(centroid.obs[[cluster.ix]],i)
}
}
}
as.vector(centroid.obs)
}
# compute.wcss -- compute the average within cluster sum of squares
# see here:
# medium.com/@ODSC/unsupervised-learning-evaluating-clusters-bd47eed175ce
compute.wcss <- function (map)
{
# for each cluster gather all the point vectors that belong to that
# cluster into table 'vectors' making sure that the centroid vector
# is always the first vector in the table. Then compute the
# sum square distances from the centroid to all the points.
# when computing the average make sure that we ignore the centroid vector.
clusters.ss <- c()
for (cluster.ix in 1:length(map$unique.centroids))
{
c.nix <- rowix(map,map$unique.centroids[[cluster.ix]])
vectors <- map$neurons[c.nix,]
for (i in 1:length(map$centroid.obs[[cluster.ix]]))
{
obs.ix <- map$centroid.obs[[cluster.ix]][i]
vectors <- rbind(vectors,map$data[obs.ix,])
}
distances <- as.matrix(dist(vectors))[1,]
distances.sqd <- sapply(distances,function(x){x*x})
c.ss <- sum(distances.sqd)/(length(distances.sqd)-1)
clusters.ss <- c(clusters.ss,c.ss)
}
wcss <- sum(clusters.ss)/length(clusters.ss)
wcss
}
# compute.bcss -- compute the average between cluster sum of squares
# see here:
# medium.com/@ODSC/unsupervised-learning-evaluating-clusters-bd47eed175ce
compute.bcss <- function (map)
{
all.bc.ss <- c()
# put all cluster vectors into one table
c.nix <- rowix(map,map$unique.centroids[[1]])
cluster.vectors <- map$neurons[c.nix,]
if (length(map$unique.centroids) > 1)
{
for (cluster.ix in 2:length(map$unique.centroids))
{
c.nix <- rowix(map,map$unique.centroids[[cluster.ix]])
c.vector <- map$neurons[c.nix,]
cluster.vectors <- rbind(cluster.vectors,c.vector)
}
}
# put each cluster vector at the beginning of the table in turn
# and compute the distances - row 1 will have our results
# NOTE: at every iteration one of the cluster vectors will
# appear twice in the vector table. we have to adjust for that
# when computing the average.
for (cluster.ix in 1:length(map$unique.centroids))
{
c.nix <- rowix(map,map$unique.centroids[[cluster.ix]])
c.vector <- map$neurons[c.nix,]
compute.vectors <- rbind(c.vector,cluster.vectors)
bc.distances <- as.matrix(dist(compute.vectors))[1,]
bc.distances.sqd <- sapply(bc.distances,function(x){x*x})
# Note: cluster.ix appears twice
bc.ss <- sum(bc.distances.sqd)/(length(bc.distances.sqd)-2)
all.bc.ss <- c(all.bc.ss,bc.ss)
}
# basic sanity check
stopifnot(length(all.bc.ss)==length(map$unique.centroids))
bcss <- sum(all.bc.ss)/length(all.bc.ss)
bcss
}
# compute.label.to.centroid -- compute a label to centroid lookup table
# The returned value is a table of indexes into the unique centroids table
compute.label.to.centroid <- function (map)
{
conv <- hash()
for (i in 1:length(map$unique.centroids))
{
x <- map$unique.centroids[[i]]$x
y <- map$unique.centroids[[i]]$y
l <- map$centroid.labels[[x,y]]
if (is.null(conv[[l]]))
{
conv[[l]] <- list(i)
}
else
{
conv[[l]] <- append(conv[[l]],i)
}
}
conv
}
# map.fitted.obs -- for each observation i, compute the row index of
# its best matching neuron
map.fitted.obs <- function(map)
{
fitted.obs <- c()
for (i in 1:nrow(map$data))
{
b <- best.match(map,map$data[i,])
fitted.obs <- c(fitted.obs,b)
}
fitted.obs
}
# map.topo - measure the topographic accuracy of the map using sampling
#
# parameters:
# - map is an object of type 'map'
# - k is the number of samples used for the accuracy computation
# - conf.int is the confidence interval of the accuracy test (default 95%)
# - verb is a switch that governs the return value, false: a single accuracy
#   value is returned, true: a vector of individual sample accuracies is returned.
# - interval is a switch that controls whether the confidence interval
# is computed.
#
# - return value is the estimated topographic accuracy
map.topo <- function(map,k=50,conf.int=.95,verb=FALSE,interval=TRUE)
{
if (!inherits(map, "map"))
stop("first argument is not a map object.")
# data.df is a matrix that contains the training data
data.df <- as.matrix(map$data)
# sample k observations from map$data (all of them if k > nrow(data.df))
k <- min(k,nrow(data.df))
data.sample.ix <- sample(1:nrow(data.df),size=k,replace=FALSE)
# compute the summed topographic accuracy - the accuracy of a single sample
# is 1 if its two best matching units are neighbors, otherwise it is 0
acc.v <- c()
for (i in 1:k)
{
acc.v <- c(acc.v,
accuracy(map,
data.df[data.sample.ix[i],],
data.sample.ix[i]))
}
# compute the confidence interval values using the bootstrap
if (interval)
bval <- bootstrap(map,conf.int,data.df,k,acc.v)
# scaling the summed topographic accuracy by the number of samples yields
# the estimated topographic accuracy
if (verb)
{
acc.v
}
else
{
val <- sum(acc.v)/k
if (interval)
list(val=val,lo=bval$lo,hi=bval$hi)
else
val
}
}
# map.embed using variance and mean tests
map.embed.vm <- function(map,conf.int=.95,verb=FALSE)
{
if (!inherits(map, "map"))
stop("first argument is not a map object.")
# map.df is a dataframe that contains the neurons
map.df <- data.frame(map$neurons)
# data.df is a dataframe that contains the training data
# note: map$data is what the 'som' package returns
data.df <- data.frame(map$data)
# do the F-test on a pair of datasets: code vectors/training data
vl <- df.var.test(map.df,data.df,conf=conf.int)
# do the t-test on a pair of datasets: code vectors/training data
ml <- df.mean.test(map.df,data.df,conf=conf.int)
# compute the variance captured by the map -- but only if the
# means have converged as well.
nfeatures <- ncol(map.df)
prob.v <- map.significance(map,graphics=FALSE)
var.sum <- 0
for (i in 1:nfeatures)
{
if (vl$conf.int.lo[i] <= 1.0
&& vl$conf.int.hi[i] >= 1.0
&& ml$conf.int.lo[i] <= 0.0
&& ml$conf.int.hi[i] >= 0.0)
{
var.sum <- var.sum + prob.v[i]
}
else
{
# not converged - zero out the probability
prob.v[i] <- 0
}
}
# return the variance captured by converged features
if (verb)
prob.v
else
var.sum
}
# map.embed using the Kolmogorov-Smirnov test
map.embed.ks <- function(map,conf.int=.95,verb=FALSE)
{
if (!inherits(map, "map"))
{
stop("first argument is not a map object.")
}
# map.df is a dataframe that contains the neurons
map.df <- data.frame(map$neurons)
# data.df is a dataframe that contains the training data
# note: map$data is what the 'som' package returns
data.df <- data.frame(map$data)
nfeatures <- ncol(map.df)
# use the Kolmogorov-Smirnov Test to test whether the neurons and
# training data appear to come from the same distribution
ks.vector <- NULL
for(i in 1:nfeatures){
# suppress the warning about ties.
ks.vector[[i]] <- suppressWarnings(ks.test(map.df[[i]], data.df[[i]]))
}
prob.v <- map.significance(map,graphics=FALSE)
var.sum <- 0
# compute the variance captured by the map
for (i in 1:nfeatures)
{
# converged if the p-value of the KS test exceeds the significance level
if (ks.vector[[i]]$p.value > (1 - conf.int)) {
var.sum <- var.sum + prob.v[i]
} else {
# not converged - zero out the probability
prob.v[i] <- 0
}
}
# return the variance captured by converged features
if (verb)
prob.v
else
var.sum
}
# map.normalize -- based on the som::normalize function but preserves names
map.normalize <- function (x)
{
if (is.vector(x))
{
return (scale(x))
}
else if (is.data.frame(x))
{
df <- data.frame(t(apply(x,1,scale)))
names(df) <- names(x)
return (df)
}
else
{
stop("'x' is not a vector or dataframe.\n")
}
}
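# Worked example (a sketch): rows are scaled independently to zero mean and
# unit variance, so
#   map.normalize(data.frame(a=c(1,10),b=c(2,20),c=c(3,30)))
# turns both rows into (-1,0,1).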
# bootstrap -- compute the topographic accuracies for the given
# confidence interval
bootstrap <- function(map,conf.int,data.df,k,sample.acc.v)
{
ix <- as.integer(100 - conf.int*100)
bn <- 200
bootstrap.acc.v <- c(sum(sample.acc.v)/k)
for (i in 2:bn)
{
bs.v <- sample(1:k,size=k,replace=TRUE)
a <- sum(sample.acc.v[bs.v])/k
bootstrap.acc.v <- c(bootstrap.acc.v,a)
}
bootstrap.acc.sort.v <- sort(bootstrap.acc.v)
lo.val <- bootstrap.acc.sort.v[ix]
hi.val <- bootstrap.acc.sort.v[bn-ix]
list(lo=lo.val,hi=hi.val)
}
# best.match -- given observation obs, return the best matching neuron
best.match <- function(map,obs,full=FALSE)
{
# NOTE: replicate obs so that there are nr rows of obs
obs.m <- matrix(as.numeric(obs),
nrow(map$neurons),
ncol(map$neurons),
byrow=TRUE)
diff <- map$neurons - obs.m
squ <- diff * diff
s <- rowSums(squ)
d <- sqrt(s)
o <- order(d)
if (full)
o
else
o[1]
}
# accuracy -- the topographic accuracy of a single sample is 1 if the best
# matching unit and the second best matching unit are neighbors,
# otherwise it is 0
accuracy <- function(map,sample,data.ix)
{
# compute the euclidean distances of the sample from the neurons
# and find the 2 best matching units for the sample
o <- best.match(map,sample,full=TRUE)
best.ix <- o[1]
second.best.ix <- o[2]
# sanity check
coord <- coordinate(map,best.ix)
coord.x <- coord$x
coord.y <- coord$y
map.ix <- map$fitted.obs[data.ix]
coord <- coordinate(map,map.ix)
map.x <- coord$x
map.y <- coord$y
if (coord.x != map.x || coord.y != map.y || best.ix != map.ix)
{
cat("best.ix: ",best.ix," map.rix: ",map.ix,"\n")
stop("problems with coordinates")
}
# determine if the best and second best are neighbors on the map
best.xy <- coordinate(map,best.ix)
second.best.xy <- coordinate(map,second.best.ix)
diff.map <- c(best.xy$x,best.xy$y) - c(second.best.xy$x,second.best.xy$y)
diff.map.sq <- diff.map * diff.map
sum.map <- sum(diff.map.sq)
dist.map <- sqrt(sum.map)
# it is a neighbor if the distance on the map
# between the bmu and 2bmu is less than 2
if (dist.map < 2)
1
else
0
}
# coord -- constructor for a 'coord' object
coord <- function (x=-1,y=-1)
{
l <- list(x=x,y=y)
class(l) <- "coord"
l
}
# coordinate -- convert from a row index to a map xy-coordinate
coordinate <- function(map,rowix)
{
x <- (rowix-1) %% map$xdim + 1
y <- (rowix-1) %/% map$xdim + 1
coord(x,y)
}
# rowix -- convert from a map xy-coordinate to a row index
rowix <- function(map,cd)
{
if (!inherits(cd, "coord"))
stop("expected a coord object")
rix <- cd$x + (cd$y-1)*map$xdim
rix
}
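# Worked example (a sketch): on a map with xdim = 10, row index 12 unpacks as
#   x = (12-1) %% 10 + 1 = 2,   y = (12-1) %/% 10 + 1 = 2
# and rowix(map,coord(2,2)) recovers 2 + (2-1)*10 = 12, so coordinate and
# rowix are inverses of each other.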
# map.graphics.set -- set the graphics environment for our map utilities
# the return value is the original graphics param vector
map.graphics.set <- function()
{
par.v <- par()
par(ps=6)
par.v
}
# map.graphics.reset -- reset the graphics environment to the original state
# parameter - a vector containing the settings for the original state
map.graphics.reset <- function(par.vector)
{
par(ps=par.vector$ps)
}
# plot.heat - plot a heat map based on a 'map', this plot also contains the
# connected components of the map based on the landscape of the
# heat map
plot.heat <- function(map)
{
x <- map$xdim
y <- map$ydim
centroids <- map$centroids
### need to make sure the map doesn't have a dimension of 1
if (x <= 1 || y <= 1)
{
stop("map dimensions too small")
}
### bin the heat values into 100 bins used for the 100 heat colors
heat.v <- as.vector(map$heat)
heat.v <- cut(heat.v,breaks=100,labels=FALSE)
heat <- array(data=heat.v,dim=c(x,y))
colors<- heat.colors(100)
### set up the graphics window
par.v <- map.graphics.set()
plot.new()
plot.window(xlim=c(0,x),ylim=c(0,y))
box()
title(xlab="x",ylab="y")
xticks <- seq(0.5,x-0.5,1)
yticks <- seq(0.5,y-0.5,1)
xlabels <- seq(1,x,1)
ylabels <- seq(1,y,1)
axis(1,at=xticks,labels=xlabels)
axis(3,at=xticks,labels=xlabels)
axis(2,at=yticks,labels=ylabels)
axis(4,at=yticks,labels=ylabels)
### plot the neurons as heat squares on the map
# TODO: vectorize this - rect can operate on vectors of coordinates and values
for (ix in 1:x)
{
for (iy in 1:y)
{
rect(ix-1,iy-1,ix,iy,col=colors[100 - heat[ix,iy] + 1],border=NA)
}
}
# connect each neuron to its centroid
for(ix in 1:x)
{
for (iy in 1:y)
{
cx <- centroids[[ix,iy]]$x
cy <- centroids[[ix,iy]]$y
points(c(ix,cx)-.5,c(iy,cy)-.5,type="l",col="grey")
}
}
# put majority labels on the centroids
# Note: if labels were not given then the function majority.labels
# will compute numerical labels to attach to the centroids.
centroid.labels <- majority.labels(map)
for(ix in 1:x)
{
for (iy in 1:y)
{
lab <- centroid.labels[[ix,iy]]
if (lab != none.label)
{
text(ix-.5,iy-.5,labels=lab)
}
}
}
map.graphics.reset(par.v)
}
# compute.centroids -- compute the centroid for each point on the map
# parameters:
# - map is an object of type 'map'
# return value:
# - a map of the same dimensions as the input map where each cell points
#   to the centroid of that cell.
compute.centroids <- function(map)
{
heat <- map$heat
xdim <- map$xdim
ydim <- map$ydim
max.val <- max(heat)
centroids <- array(data=list(coord()),dim=c(xdim,ydim))
########################################################################
### local recursive function to find the centroid of a point on the map
compute.centroid <- function(ix,iy)
{
# first we check if the current position is already associated
# with a centroid. if so, simply return the coordinates
# of that centroid
if ((centroids[[ix,iy]])$x > -1 && (centroids[[ix,iy]])$y > -1)
{
return (centroids[[ix,iy]])
}
# try to find a smaller value in the immediate neighborhood
# make our current position the square with the minimum value.
# if a minimum value other than our own current value cannot be
# found then we are at a minimum.
#
# search the neighborhood; three different cases: inner element,
# corner element, side element
# TODO: there has to be a better way!
min.val <- heat[ix,iy]
min.x <- ix
min.y <- iy
# (ix,iy) is an inner map element
if (ix > 1 && ix < xdim && iy > 1 && iy < ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is bottom left corner
else if (ix == 1 && iy == 1)
{
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
}
# (ix,iy) is bottom right corner
else if (ix == xdim && iy == 1)
{
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is top right corner
else if (ix == xdim && iy == ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is top left corner
else if (ix == 1 && iy == ydim)
{
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
}
# (ix,iy) is a left side element
else if (ix == 1 && iy > 1 && iy < ydim)
{
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
}
# (ix,iy) is a bottom side element
else if (ix > 1 && ix < xdim && iy == 1 )
{
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is a right side element
else if (ix == xdim && iy > 1 && iy < ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is a top side element
else if (ix > 1 && ix < xdim && iy == ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# if successful
# move to the square with the smaller value and
# call compute.centroid on this new square
if (min.x != ix || min.y != iy)
{
# Note: returns a list of an x and y coordinate
compute.centroid(min.x,min.y)
}
#else
# we have found a minimum -- this is our centroid.
else
{
coord(ix,iy)
}
} # end function compute.centroid
###########################################################################
### iterate over the map and find the centroid for each element
for (i in 1:xdim)
{
for (j in 1:ydim)
{
centroids[[i,j]] <- compute.centroid(i,j)
}
}
centroids
}
# compute.heat -- compute a heat value map representation of the given
# distance matrix
# parameters:
# - map is an object of type 'map'
# return value:
# - a matrix with the same x-y dims as the original map containing the heat
compute.heat <- function(map)
{
d <- as.matrix(dist(data.frame(map$neurons)))
x <- map$xdim
y <- map$ydim
heat <- array(data=0,dim=c(x,y))
if (x == 1 || y == 1)
stop("heat map cannot be computed for a map of dimension 1")
# local function as a shorthand for rowix
xl <- function(ix,iy)
{
#cat("converting (",ix,",",iy,") to row", ix + (iy-1) *xdim,"\n")
#ix + (iy-1) * x
rowix(map,coord(ix,iy))
}
# check if the map is larger than 2 x 2 (otherwise it is only corners)
if (x > 2 && y > 2)
{
# iterate over the inner nodes and compute their umat values
for (ix in 2:(x-1))
{
for (iy in 2:(y-1))
{
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/8
}
}
# iterate over bottom x axis
for (ix in 2:(x-1))
{
iy <- 1
sum <-
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
# iterate over top x axis
for (ix in 2:(x-1))
{
iy <- y
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
# iterate over the left y-axis
for (iy in 2:(y-1))
{
ix <- 1
sum <-
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)]
heat[ix,iy] <- sum/5
}
# iterate over the right y-axis
for (iy in 2:(y-1))
{
ix <- x
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
} # end if
# compute umat values for corners
if (x >= 2 && y >= 2)
{
# bottom left corner
ix <- 1
iy <- 1
sum <-
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)]
heat[ix,iy] <- sum/3
# bottom right corner
ix <- x
iy <- 1
sum <-
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/3
# top left corner
ix <- 1
iy <- y
sum <-
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)]
heat[ix,iy] <- sum/3
# top right corner
ix <- x
iy <- y
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/3
} # end if
# smooth the heat map
xcoords <- c()
ycoords <- c()
for (i in 1:y)
{
for (j in 1:x)
{
ycoords <- c(ycoords, i)
xcoords <- c(xcoords, j)
}
}
xycoords <- data.frame(xcoords,ycoords)
heat <- smooth.2d(as.vector(heat),
x=as.matrix(xycoords),
nrow=x,ncol=y,
surface=FALSE,
theta=2)
heat
}
# df.var.test -- a function that applies the F-test testing the ratio
# of the variances of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the F-test (default .95)
df.var.test <- function(df1,df2,conf = .95)
{
if (length(df1) != length(df2))
stop("cannot compare variances of data frames")
# init our working arrays
var.ratio.v <- array(data=1,dim=length(df1))
var.confintlo.v <- array(data=1,dim=length(df1))
var.confinthi.v <- array(data=1,dim=length(df1))
# compute the F-test on each feature in our populations
for (i in 1:length(df1))
{
t <- var.test(df1[[i]],df2[[i]],conf.level=conf)
var.ratio.v[i] <- t$estimate
#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
var.confintlo.v[i] <- t$conf.int[1]
var.confinthi.v[i] <- t$conf.int[2]
}
# return a list with the ratios and conf intervals for each feature
list(ratio=var.ratio.v,
conf.int.lo=var.confintlo.v,
conf.int.hi=var.confinthi.v)
}
# df.mean.test -- a function that applies the t-test testing the difference
# of the means of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the t-test (default .95)
df.mean.test <- function(df1,df2,conf = .95)
{
if (ncol(df1) != ncol(df2))
stop("cannot compare means of data frames")
# init our working arrays
mean.diff.v <- array(data=1,dim=ncol(df1))
mean.confintlo.v <- array(data=1,dim=ncol(df1))
mean.confinthi.v <- array(data=1,dim=ncol(df1))
# compute the t-test on each feature in our populations
for (i in 1:ncol(df1)) {
t <- t.test(x=df1[[i]],y=df2[[i]],conf.level=conf)
mean.diff.v[i] <- t$estimate[1] - t$estimate[2]
#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
mean.confintlo.v[i] <- t$conf.int[1]
mean.confinthi.v[i] <- t$conf.int[2]
}
# return a list with the mean differences and conf intervals for each feature
list(diff=mean.diff.v,
conf.int.lo=mean.confintlo.v,
conf.int.hi=mean.confinthi.v)
}
# vsom.f - vectorized and optimized version of the stochastic
# SOM training algorithm written in Fortran90
vsom.f <- function(data,xdim,ydim,alpha,train,seed)
{
### some constants
dr <- nrow(data)
dc <- ncol(data)
nr <- xdim*ydim
nc <- dc # dim of data and neurons is the same
### build and initialize the matrix holding the neurons
if (!is.null(seed))
{
set.seed(seed)
}
else
{
# send a -1 to Fortran function to indicate no seed present
seed <- -1
}
cells <- nr * nc # no. of neurons times number of data dimensions
v <- runif(cells,-1,1) # vector with small init values for all neurons
# NOTE: each row represents a neuron, each column represents a dimension.
neurons <- matrix(v,nrow=nr,ncol=nc) # rearrange the vector as matrix
result <- .Fortran("vsom",
as.single(neurons),
as.single(data.matrix(data)),
as.integer(dr),
as.integer(dc),
as.integer(xdim),
as.integer(ydim),
as.single(alpha),
as.integer(train),
as.integer(seed),
PACKAGE="popsom")
# unpack the trained neurons from the first element of the result list
v <- result[1]
# rearrange the result vector as matrix
neurons <- matrix(v[[1]],nrow=nr,ncol=nc,byrow=FALSE)
neurons
}
# get.unique.centroids -- a list of unique centroid locations on the map
get.unique.centroids <- function(map)
{
# get the dimensions of the map
centroids <- map$centroids
xdim <- map$xdim
ydim <- map$ydim
cd.list <- c()
for(ix in 1:xdim)
{
for(iy in 1:ydim)
{
c.xy <- centroids[[ix, iy]]
b <- sapply(cd.list, function (x) {x$x == c.xy$x && x$y == c.xy$y})
if (!any(b))
{
cd.list <- c(cd.list,list(c.xy))
}
}
}
as.vector(cd.list)
}
# majority.labels -- return a map where the position of each centroid
# has the majority label of the corresponding cluster attached to it.
none.label <- "<None>"
majority.labels <- function(map)
{
if (is.null(map$labels))
{
# no labels given, make up some numerical labels
return (numerical.labels(map))
}
x <- map$xdim
y <- map$ydim
centroids <- map$centroids
nobs <- nrow(map$data)
centroid.labels <- array(data=list(),dim=c(x,y))
majority.labels <- array(data=none.label,dim=c(x,y))
# gather the labels from the clusters and record them
# at the centroid position.
for(i in 1:nobs)
{
lab <- as.character(map$labels[i,1])
nix <- map$fitted.obs[i]
c <- coordinate(map,nix)
ix <- c$x
iy <- c$y
cx <- centroids[[ix,iy]]$x
cy <- centroids[[ix,iy]]$y
centroid.labels[[cx,cy]] <- append(centroid.labels[[cx,cy]],lab)
}
# attach majority labels to centroids
for (ix in 1:x)
{
for (iy in 1:y)
{
label.v <- centroid.labels[[ix,iy]]
if (length(label.v)!=0)
{
majority <- data.frame(sort(table(label.v),decreasing=TRUE))
if (nrow(majority) == 1) # only one label
{
# just copy a label from the label vector
majority.labels[[ix,iy]] <- label.v[1]
}
else
{
majority.labels[[ix,iy]] <- levels(majority[1,1])[1]
}
}
}
}
majority.labels
}
# numerical.labels -- create labels for centroids
numerical.labels <- function(map)
{
label_cnt <- 1
centroids <- map$centroids
unique.centroids <- map$unique.centroids
centroid.labels <- array(data=none.label,dim=c(map$xdim,map$ydim))
# set our labels at the centroid locations
for (i in 1:length(unique.centroids))
{
# create a label
label <- paste("centroid",label_cnt,sep=" ")
label_cnt <- label_cnt+1
ix <- unique.centroids[[i]]$x
iy <- unique.centroids[[i]]$y
#cat("coord",ix,iy,label,"\n")
centroid.labels[[ix,iy]] <- label
}
centroid.labels
}
########################## research stuff ###########################
# compute.nwcss -- compute the average within cluster sum of squares
# of neuron clusters
compute.nwcss <- function (map)
{
# for each cluster gather all the point vectors that belong to that
# cluster into table 'vectors' making sure that the centroid vector
# is always the first vector in the table. Then compute the
# sum square distances from the centroid to all the points.
# when computing the average make sure that we ignore the centroid vector.
clusters.ss <- c()
for (cluster.ix in 1:length(map$unique.centroids))
{
c.nix <- rowix(map,map$unique.centroids[[cluster.ix]])
vectors <- map$neurons[c.nix,]
for (i in 1:length(map$centroid.obs[[cluster.ix]]))
{
obs.ix <- map$centroid.obs[[cluster.ix]][i]
obs.nix <- map$fitted[[obs.ix]]
obs.coord <- coordinate(map,obs.nix)
centroid.coord <- map$centroids[[obs.coord$x,obs.coord$y]]
centroid.nix <- rowix(map,centroid.coord)
if (centroid.nix == c.nix){
vectors <- rbind(vectors,map$neurons[obs.nix,])
}
}
distances <- as.matrix(dist(vectors))[1,]
distances.sqd <- sapply(distances,function(x){x*x})
c.ss <- sum(distances.sqd)/(length(distances.sqd)-1)
clusters.ss <- c(clusters.ss,c.ss)
}
wcss <- sum(clusters.ss)/length(clusters.ss)
wcss
}
# avg.homogeneity -- given labels another way to ascertain quality of the map
avg.homogeneity <- function(map)
{
if (is.null(map$labels))
{
stop("you need to attach labels to the map")
}
# need to make sure the map doesn't have a dimension of 1
if (map$xdim <= 1 || map$ydim <= 1)
{
stop("map dimensions too small")
}
x <- map$xdim
y <- map$ydim
centroids <- map$centroids
nobs <- nrow(map$data)
centroid.labels <- array(data=list(),dim=c(x,y))
#attach labels to centroids
# count the labels in each map cell
for(i in 1:nobs)
{
lab <- as.character(map$labels[i,1])
nix <- map$fitted.obs[i]
c <- coordinate(map,nix)
ix <- c$x
iy <- c$y
cx <- centroids[[ix,iy]]$x
cy <- centroids[[ix,iy]]$y
centroid.labels[[cx,cy]] <- append(centroid.labels[[cx,cy]],lab)
}
# compute average homogeneity of the map: h = (1/nobs)*sum_c majority.count_c
sum.majority <- 0
n.centroids <- 0
for (ix in 1:x)
{
for (iy in 1:y)
{
label.v <- centroid.labels[[ix,iy]]
if (length(label.v)!=0)
{
n.centroids <- n.centroids + 1
majority <- data.frame(sort(table(label.v),decreasing=TRUE))
if (nrow(majority) == 1) # only one label
{
m.val <- length(label.v)
}
else
{
m.val <- majority[1,2]
}
sum.majority <- sum.majority + m.val
}
}
}
list(homog=sum.majority/nobs, nclust=n.centroids)
}
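# Example usage (a sketch; requires that labels were attached when the map
# was built):
#   avg.homogeneity(m)  # list with the average homogeneity and cluster count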