
### map-utils.R
# version 4.3
# (c) 2009-2020 Lutz Hamel, Benjamin Ott, Greg Breard, University of Rhode Island
#               with Robert Tatoian and Vishakh Gopu
#
# This file constitutes a set of routines which are useful in constructing
# and evaluating self-organizing maps (SOMs).
# The main utilities available in this file are:
# map.fit ----------- constructs a map
# map.embed --------- reports the embedding of the map in terms of modeling the
#                     underlying data distribution (100% if all feature distributions
#                     are modeled correctly, 0% if none are)
# map.significance -- graphically reports the significance of each feature with
#                     respect to the self-organizing map model
# map.starburst ----- displays the starburst representation of the SOM model, the centers of
#                     starbursts are the centers of clusters
# map.neuron -------- returns the contents of a neuron at (x,y) on the map as a vector
# map.marginal ------ displays a density plot of a training dataframe dimension overlaid
#                     with the neuron density for that same dimension or index.
#
#
### License
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/
#
###

# load libraries
require(class)
require(fields)
require(graphics)
require(ggplot2)

### map.fit -- construct a SOM, returns an object of class 'map'
# parameters:
# - data - a dataframe where each row contains an unlabeled training instance
# - labels - a vector or dataframe with one label for each observation in data
# - xdim,ydim - the dimensions of the map
# - alpha - the learning rate, should be a positive non-zero real number
# - train - number of training iterations
# - normalize - normalize switch
# - seed - set seed for random init of neurons
# returns:
# - an object of type 'map' -- see below

# Hint: if your training data does not have any labels you can construct a
#       simple label dataframe as follows: labels <- data.frame(1:nrow(training.data))

# NOTE: this version of map.fit trains with the stochastic 'vsom' algorithm (see vsom.f below)

map.fit <- function(data,labels=NULL,xdim=10,ydim=5,alpha=.3,train=1000,normalize=TRUE,seed=NULL)
{
  #TODO: implement normalize switch, implement seed
  neurons <- vsom.f(data,
                    xdim=xdim,
                    ydim=ydim,
                    alpha=alpha,
                    train=train)

  # make the neuron data a data frame
  neurons <- data.frame(neurons)
  names(neurons) <- names(data)

  ### construct the map object
  map <- list(data=data,
              labels=labels,
              xdim=xdim,
              ydim=ydim,
              alpha=alpha,
              train=train,
              algorithm="vsom",   # only the vsom algorithm is implemented here
              neurons=neurons,
              visual=NULL)

  ### add the class name
  class(map) <- "map"

  ### add the visual field to the map
  # for each observation i, visual holds the row index of
  # the best matching neuron
  visual <- c()
  for (i in 1:nrow(data))
  {
      b <- best.match(map,data[i,])
      visual <- c(visual,b)
  }
  map$visual <- visual

  return(map)
}
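
# Example (a minimal usage sketch, not from the package itself): fitting a map
# to the iris data.  Assumes the compiled 'vsom' Fortran routine shipped with
# this package is available.
#
#   df <- subset(iris, select = -Species)        # unlabeled training data
#   labels <- subset(iris, select = Species)     # one label per observation
#   m <- map.fit(df, labels, xdim = 15, ydim = 10, train = 10000)
#   map.starburst(m)                             # cluster visualization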

### map.embed - evaluate the embedding of a map using the F-test and
#                     a Bayesian estimate of the variance in the training data.
# parameters:
# - map is an object of type 'map'
# - conf.int is the confidence interval of the convergence test (default 95%)
# - verb is a switch that governs the return value: FALSE returns a single
#   convergence value, TRUE returns a vector of individual feature convergences.
#
# - the return value is the embedding of the map (variance captured by the map so far)

# Hint: the embedding index is the variance of the training data captured by the map.
map.embed <- function(map,conf.int=.95,verb=FALSE,ks=FALSE)
{
    if (ks)
        map.embed.ks(map,conf.int,verb)
    else
        map.embed.vm(map,conf.int,verb)
}
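
# Example (a sketch, assuming 'm' is a map object returned by map.fit):
#
#   map.embed(m)                 # single value: total variance captured
#   map.embed(m, verb = TRUE)    # per-feature convergence vector
#   map.embed(m, ks = TRUE)      # use the Kolmogorov-Smirnov test instead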

### map.starburst - compute and display the starburst representation of clusters
# parameters:
# - map is an object of type 'map'
# - explicit controls the shape of the connected components
# - smoothing controls the smoothing level of the umat (NULL,0,>0)
# - merge.clusters is a switch that controls whether the starburst clusters are merged together
# - merge.range - a fraction of the within-cluster distance used as a tolerance
#                 when deciding whether two connected components are close enough
#                 to be merged into a single cluster (see combine.decision)

map.starburst <- function(map,explicit=FALSE,smoothing=2,merge.clusters=FALSE,merge.range=.25)
{

	if (class(map) != "map")
		stop("map.starburst: first argument is not a map object.")

	umat <- compute.umat(map,smoothing=smoothing)
	plot.heat(map,umat,explicit=explicit,comp=TRUE,merge=merge.clusters,merge.range=merge.range)
}

### map.significance - compute the relative significance of each feature and plot it
# parameters:
# - map is an object of type 'map'
# - graphics is a switch that controls whether a plot is generated or not
# - feature.labels is a switch to allow the plotting of feature names vs feature indices
# return value:
# - a vector containing the significance for each feature

map.significance <- function(map,graphics=TRUE,feature.labels=TRUE)
{
	if (!inherits(map,"map"))
		stop("map.significance: first argument is not a map object.")

	data.df <- data.frame(map$data)
	nfeatures <- ncol(data.df)

	# Compute the variance of each feature on the map
	var.v <- array(data=1,dim=nfeatures)
	for (i in 1:nfeatures)
    {
		var.v[i] <- var(data.df[[i]])
	}

	# we use the variance of a feature as likelihood of
	# being an important feature, compute the Bayesian
	# probability of significance using uniform priors

	var.sum <- sum(var.v)
	prob.v <- var.v/var.sum

	# plot the significance
	if (graphics)
    {
		par.v <- map.graphics.set()

		y <- max(prob.v)
		plot.new()
		plot.window(xlim=c(1,nfeatures),ylim=c(0,y))
		box()

		title(xlab="Features",ylab="Significance")

		xticks <- seq(1,nfeatures,1)
		yticks <- seq(0,y,y/4)
		if (feature.labels)
			xlabels <- names(data.df)
		else
			xlabels <- seq(1,nfeatures,1)
		ylabels <- formatC(seq(0,y,y/4),digits=2)
		axis(1,at=xticks,labels=xlabels)
		axis(2,at=yticks,labels=ylabels)

		points(1:nfeatures,prob.v,type="h")

		map.graphics.reset(par.v)
	}

	# always return the significance vector (the documented return value)
	prob.v
}
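
# Example (a sketch, assuming 'm' is a fitted map): the significances form a
# probability vector over the features.
#
#   sig <- map.significance(m, graphics = FALSE)
#   sum(sig)    # ~1, up to floating point rounding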


### map.neuron - returns the contents of a neuron at (x,y) on the map as a vector
# parameters:
#   map - the neuron map
#   x - map x-coordinate of neuron
#   y - map y-coordinate of neuron
# return value:
#   a vector representing the neuron

map.neuron <- function(map,x,y)
{
    if (!inherits(map,"map"))
        stop("map.neuron: first argument is not a map object.")

    ix <- rowix(map,x,y)
    map$neurons[ix,]
}
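
# Example (a sketch, assuming 'm' is a fitted map):
#
#   map.neuron(m, 3, 2)    # the weight vector of the neuron at x=3, y=2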


### map.marginal - plot that shows the marginal probability distribution of the neurons and data
# parameters:
# - map is an object of type 'map'
# - marginal is the name of a training data frame dimension or index

map.marginal <- function(map,marginal)
{
  # ensure that map is a 'map' object
  if (!inherits(map,"map"))
    stop("map.marginal: first argument is not a map object.")

  # if the second argument is numeric, treat it as a column index
  if (is.numeric(marginal))
  {
    train <- data.frame(points = map$data[[marginal]])
    neurons <- data.frame(points = map$neurons[[marginal]])

    train$legend <- 'training data'
    neurons$legend <- 'neurons'

    hist <- rbind(train,neurons)
    ggplot(hist, aes(points, fill = legend)) + geom_density(alpha = 0.2) + xlab(names(map$data)[marginal])

  }
  else if (marginal %in% names(map$data))
  {

    train <- data.frame(points = map$data[names(map$data) == marginal])
    colnames(train) <- c("points")

    neurons <- data.frame(points = map$neurons[names(map$neurons) == marginal])
    colnames(neurons) <- c("points")

    train$legend <- 'training data'
    neurons$legend <- 'neurons'

    hist <- rbind(train,neurons)
    ggplot(hist, aes(points, fill = legend)) + geom_density(alpha = 0.2) + xlab(marginal)

  }
  else
  {
    stop("map.marginal: second argument is not the name of a training data frame dimension or index")
  }

}
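
# Example (a sketch, assuming 'm' was fit on the iris data):
#
#   map.marginal(m, "Sepal.Length")   # by dimension name
#   map.marginal(m, 1)                # or by column index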

############################### local functions #################################

# map.embed using variance and mean tests

map.embed.vm <- function(map,conf.int=.95,verb=FALSE)
{
    if (!inherits(map,"map"))
        stop("map.embed: first argument is not a map object.")

    # map.df is a dataframe that contains the neurons
    map.df <- data.frame(map$neurons)

    # data.df is a dataframe that contains the training data
    # note: map$data is what the 'som' package returns
    data.df <- data.frame(map$data)

    # do the F-test on a pair of datasets: code vectors/training data
    vl <- df.var.test(map.df,data.df,conf=conf.int)

    # do the t-test on a pair of datasets: code vectors/training data
    ml <- df.mean.test(map.df,data.df,conf=conf.int)

    # compute the variance captured by the map -- but only if the means have converged as well.
    nfeatures <- ncol(map.df)
    prob.v <- map.significance(map,graphics=FALSE)
    var.sum <- 0

    for (i in 1:nfeatures)
    {
        #cat("Feature",i,"variance:\t",vl$ratio[i],"\t(",vl$conf.int.lo[i],"-",vl$conf.int.hi[i],")\n")
        #cat("Feature",i,"mean:\t",ml$diff[i],"\t(",ml$conf.int.lo[i],"-",ml$conf.int.hi[i],")\n")
        if (vl$conf.int.lo[i] <= 1.0 && vl$conf.int.hi[i] >= 1.0 &&  ml$conf.int.lo[i] <= 0.0 && ml$conf.int.hi[i] >= 0.0) {
            var.sum <- var.sum + prob.v[i]
        } else {
            # not converged - zero out the probability
            prob.v[i] <- 0
        }
    }

    # return the variance captured by converged features
    if (verb)
      prob.v
    else
      var.sum
}

# map.embed using the Kolmogorov-Smirnov test

map.embed.ks <- function(map,conf.int=.95,verb=FALSE) {

    if (!inherits(map,"map")) {
        stop("map.embed: first argument is not a map object.")
    }

    # map.df is a dataframe that contains the neurons
    map.df <- data.frame(map$neurons)

    # data.df is a dataframe that contains the training data
    # note: map$data is what the 'som' package returns
    data.df <- data.frame(map$data)

    nfeatures <- ncol(map.df)

    # use the Kolmogorov-Smirnov Test to test whether the neurons and training data appear
    # to come from the same distribution
    ks.vector <- NULL
    for(i in 1:nfeatures){
        # NOTE (rpt): suppressWarnings() silences the ks.test warning about ties.
        ks.vector[[i]] <- suppressWarnings(ks.test(map.df[[i]], data.df[[i]]))
    }

    prob.v <- map.significance(map,graphics=FALSE)
    var.sum <- 0

    # compute the variance captured by the map
    for (i in 1:nfeatures)
    {
        # the second entry contains the p-value
        if (ks.vector[[i]][[2]] > (1 - conf.int)) {
            var.sum <- var.sum + prob.v[i]
        } else {
            # not converged - zero out the probability
            prob.v[i] <- 0
        }
    }

    # return the variance captured by converged features
    if (verb)
      prob.v
    else
      var.sum

}

# map.normalize -- based on the som::normalize function but preserves names

map.normalize <- function (x, byrow = TRUE)
{
    if (is.data.frame(x))
    {
        if (byrow)
            df <- data.frame(t(apply(x, 1, scale)))
        else
            df <- data.frame(apply(x, 2, scale))
        names(df) <- names(x)
        # return the normalized dataframe
        df
    }
    else
        stop("map.normalize: 'x' is not a dataframe.\n")
}



# best.match -- given observation obs, return the best matching neuron

best.match <- function(map,obs,full=FALSE)
{
    # NOTE: replicate obs so that there are nr rows of obs
    obs.m <- matrix(as.numeric(obs),nrow(map$neurons),ncol(map$neurons),byrow=TRUE)
    diff <- map$neurons - obs.m
    squ <- diff * diff
    s <- rowSums(squ)
    d <- sqrt(s)
    o <- order(d)

    if (full)
        o
    else
        o[1]
}
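
# Example (a worked sketch with hypothetical values): for neurons (0,0) and
# (3,4) and observation (0,0) the Euclidean distances are 0 and 5, so the
# first neuron is the best match.
#
#   m <- list(neurons = data.frame(a = c(0,3), b = c(0,4)))
#   best.match(m, c(0,0))    # returns 1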


# coordinate -- convert from a row index to a map xy-coordinate

coordinate <- function(map,rowix)
{
    x <- (rowix-1) %% map$xdim + 1
    y <- (rowix-1) %/% map$xdim + 1
    c(x,y)
}

#rowix -- convert from a map xy-coordinate to a row index

rowix <- function(map,x,y)
{
    rix <- x + (y-1)*map$xdim
    rix
}
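
# Example (a worked check of the row-major indexing, using a hypothetical
# 10 x 5 map): the two functions are inverses of each other.
#
#   m <- list(xdim = 10, ydim = 5)
#   rowix(m, 3, 2)       # 3 + (2-1)*10 = 13
#   coordinate(m, 13)    # c(3, 2)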

# map.graphics.set -- set the graphics environment for our map utilities
#                     the return value is the original graphics param vector

map.graphics.set <- function()
{
	par.v <- par()
	par(ps=6)
	par.v
}

# map.graphics.reset -- reset the graphics environment to the original state
# parameter - a vector containing the settings for the original state

map.graphics.reset <- function(par.vector)
{
	par(ps=par.vector$ps)
}

### plot.heat - plot a heat map based on a 'map', this plot also contains the connected
#               components of the map based on the landscape of the heat map
# parameters:
# - map is an object of type 'map'
# - heat is a 2D heat map of the map returned by 'map'
# - labels is a vector with labels of the original training data set
# - explicit controls the shape of the connected components
# - comp controls whether we plot the connected components on the heat map
# - merge controls whether we merge the starbursts together.
# - merge.range - a fraction of the within-cluster distance used as a tolerance
#                 when deciding whether two connected components are close enough
#                 to be merged into a single cluster (see combine.decision)

plot.heat <- function(map,heat,explicit=FALSE,comp=TRUE,merge=FALSE,merge.range)
{
  ### keep an unaltered copy of the unified distance matrix,
  ### required for merging the starburst clusters
  umat <- heat

	x <- map$xdim
	y <- map$ydim
	nobs <- nrow(map$data)
	count <- array(data=0,dim=c(x,y))

	### need to make sure the map doesn't have a dimension of 1
	if (x <= 1 || y <= 1)
    {
        stop("plot.heat: map dimensions too small")
    }

    ### bin the heat values into 100 bins used for the 100 heat colors
    heat.v <- as.vector(heat)
    heat.v <- cut(heat.v,breaks=100,labels=FALSE)
    heat <- array(data=heat.v,dim=c(x,y))
    colors<- heat.colors(100)

	### set up the graphics window
	par.v <- map.graphics.set()
	plot.new()
	plot.window(xlim=c(0,x),ylim=c(0,y))
	box()

	title(xlab="x",ylab="y")

	xticks <- seq(0.5,x-0.5,1)
	yticks <- seq(0.5,y-0.5,1)
	xlabels <- seq(1,x,1)
	ylabels <- seq(1,y,1)
	axis(1,at=xticks,labels=xlabels)
	axis(3,at=xticks,labels=xlabels)
	axis(2,at=yticks,labels=ylabels)
	axis(4,at=yticks,labels=ylabels)


    ### plot the neurons as heat squares on the map
    # TODO: vectorize this - rect can operate on vectors of coordinates and values
	for (ix in 1:x)
    {
		for (iy in 1:y)
        {
			rect(ix-1,iy-1,ix,iy,col=colors[100 - heat[ix,iy] + 1],border=NA)
		}
	}

	### put the connected component lines on the map
	if (comp)
    {
	  if(!merge){
		  # find the centroid for each neuron on the map
		  centroids <- compute.centroids(map,heat,explicit)
	  } else {
	    # find the unique centroids for the neurons on the map
	    centroids <- compute.combined.clusters(map,umat,explicit,merge.range)
	  }
        # connect each neuron to its centroid
		for(ix in 1:x)
        {
			for (iy in 1:y)
            {
				cx <- centroids$centroid.x[ix,iy]
				cy <- centroids$centroid.y[ix,iy]
				points(c(ix,cx)-.5,c(iy,cy)-.5,type="l",col="grey")
			}
		}
	}

	### put the labels on the map if available
    if (!is.null(map$labels))
    {
        # count the labels in each map cell
        for(i in 1:nobs)
        {
            nix <- map$visual[i]
            c <- coordinate(map,nix)
            ix <- c[1]
            iy <- c[2]

            count[ix,iy] <- count[ix,iy]+1
        }

        #count.df <- data.frame(count)
        #print(count.df)

        for(i in 1:nobs)
        {
            c <- coordinate(map,map$visual[i])
            #cat("Coordinate of ",i," is ",c,"\n")
            ix <- c[1]
            iy <- c[2]
            # we only print one label per cell
            # TODO: print out majority label
            if (count[ix,iy] > 0)
            {
                count[ix,iy] <- 0
                ix <- ix - .5
                iy <- iy - .5
                l <- map$labels[i,1]
                text(ix,iy,labels=l)
            }
        }
    }

	map.graphics.reset(par.v)
}


### compute.centroids -- compute the centroid for each point on the map
# parameters:
# - map is an object of type 'map'
# - heat is a matrix representing the heat map representation
# - explicit controls the shape of the connected component
# return value:
# - a list containing the matrices with the same x-y dims as the original map containing the centroid x-y coordinates

compute.centroids <- function(map,heat,explicit=FALSE)
{
	xdim <- map$xdim
	ydim <- map$ydim
	centroid.x <- array(data=-1,dim=c(xdim,ydim))
	centroid.y <- array(data=-1,dim=c(xdim,ydim))
	max.val <- max(heat)

    ### recursive function to find the centroid of a point on the map
	compute.centroid <- function(ix,iy)
    {
		# first we check if the current position is already associated
		# with a centroid.  if so, simply return the coordinates
		# of that centroid
		if (centroid.x[ix,iy] > -1 && centroid.y[ix,iy] > -1)
        {
			# use return() so the memoized result short-circuits the search below
			return(list(bestx=centroid.x[ix,iy],besty=centroid.y[ix,iy]))
		}

		# try to find a smaller value in the immediate neighborhood
		# make our current position the square with the minimum value.
		# if a minimum value other than our own current value cannot be
		# found then we are at a minimum.
		#
		# search the neighborhood; three different cases: inner element, corner element, side element
        # TODO: there has to be a better way!

		min.val <- heat[ix,iy]
		min.x <- ix
		min.y <- iy

		# (ix,iy) is an inner map element
		if (ix > 1 && ix < xdim && iy > 1 && iy < ydim)
        {
			if (heat[ix-1,iy-1] < min.val)
            {
				min.val <- heat[ix-1,iy-1]
				min.x <- ix-1
				min.y <- iy-1
			}
			if (heat[ix,iy-1] < min.val)
            {
				min.val <- heat[ix,iy-1]
				min.x <- ix
				min.y <- iy-1
			}
			if (heat[ix+1,iy-1] < min.val)
            {
				min.val <- heat[ix+1,iy-1]
				min.x <- ix+1
				min.y <- iy-1
			}
			if (heat[ix+1,iy] < min.val)
            {
				min.val <- heat[ix+1,iy]
				min.x <- ix+1
				min.y <- iy
			}
			if (heat[ix+1,iy+1] < min.val)
            {
				min.val <- heat[ix+1,iy+1]
				min.x <- ix+1
				min.y <- iy+1
			}
			if (heat[ix,iy+1] < min.val)
            {
				min.val <- heat[ix,iy+1]
				min.x <- ix
				min.y <- iy+1
			}
			if (heat[ix-1,iy+1] < min.val)
            {
				min.val <- heat[ix-1,iy+1]
				min.x <- ix-1
				min.y <- iy+1
			}
			if (heat[ix-1,iy] < min.val)
            {
				min.val <- heat[ix-1,iy]
				min.x <- ix-1
				min.y <- iy
			}
		}

		# (ix,iy) is bottom left corner
		else if (ix == 1 && iy == 1)
        {
			if (heat[ix+1,iy] < min.val)
            {
				min.val <- heat[ix+1,iy]
				min.x <- ix+1
				min.y <- iy
			}
			if (heat[ix+1,iy+1] < min.val)
            {
				min.val <- heat[ix+1,iy+1]
				min.x <- ix+1
				min.y <- iy+1
			}
			if (heat[ix,iy+1] < min.val)
            {
				min.val <- heat[ix,iy+1]
				min.x <- ix
				min.y <- iy+1
			}
		}

		# (ix,iy) is bottom right corner
		else if (ix == xdim && iy == 1)
        {
			if (heat[ix,iy+1] < min.val)
            {
				min.val <- heat[ix,iy+1]
				min.x <- ix
				min.y <- iy+1
			}
			if (heat[ix-1,iy+1] < min.val)
            {
				min.val <- heat[ix-1,iy+1]
				min.x <- ix-1
				min.y <- iy+1
			}
			if (heat[ix-1,iy] < min.val)
            {
				min.val <- heat[ix-1,iy]
				min.x <- ix-1
				min.y <- iy
			}
		}

		# (ix,iy) is top right corner
		else if (ix == xdim && iy == ydim)
        {
			if (heat[ix-1,iy-1] < min.val)
            {
				min.val <- heat[ix-1,iy-1]
				min.x <- ix-1
				min.y <- iy-1
			}
			if (heat[ix,iy-1] < min.val)
            {
				min.val <- heat[ix,iy-1]
				min.x <- ix
				min.y <- iy-1
			}
			if (heat[ix-1,iy] < min.val)
            {
				min.val <- heat[ix-1,iy]
				min.x <- ix-1
				min.y <- iy
			}
		}

		# (ix,iy) is top left corner
		else if (ix == 1 && iy == ydim)
        {
			if (heat[ix,iy-1] < min.val)
            {
				min.val <- heat[ix,iy-1]
				min.x <- ix
				min.y <- iy-1
			}
			if (heat[ix+1,iy-1] < min.val)
            {
				min.val <- heat[ix+1,iy-1]
				min.x <- ix+1
				min.y <- iy-1
			}
			if (heat[ix+1,iy] < min.val)
            {
				min.val <- heat[ix+1,iy]
				min.x <- ix+1
				min.y <- iy
			}
		}

		# (ix,iy) is a left side element
		else if (ix == 1  && iy > 1 && iy < ydim)
        {
			if (heat[ix,iy-1] < min.val)
            {
				min.val <- heat[ix,iy-1]
				min.x <- ix
				min.y <- iy-1
			}
			if (heat[ix+1,iy-1] < min.val)
            {
				min.val <- heat[ix+1,iy-1]
				min.x <- ix+1
				min.y <- iy-1
			}
			if (heat[ix+1,iy] < min.val)
            {
				min.val <- heat[ix+1,iy]
				min.x <- ix+1
				min.y <- iy
			}
			if (heat[ix+1,iy+1] < min.val)
            {
				min.val <- heat[ix+1,iy+1]
				min.x <- ix+1
				min.y <- iy+1
			}
			if (heat[ix,iy+1] < min.val)
            {
				min.val <- heat[ix,iy+1]
				min.x <- ix
				min.y <- iy+1
			}
		}

		# (ix,iy) is a bottom side element
		else if (ix > 1 && ix < xdim && iy == 1 )
        {
			if (heat[ix+1,iy] < min.val)
            {
				min.val <- heat[ix+1,iy]
				min.x <- ix+1
				min.y <- iy
			}
			if (heat[ix+1,iy+1] < min.val)
            {
				min.val <- heat[ix+1,iy+1]
				min.x <- ix+1
				min.y <- iy+1
			}
			if (heat[ix,iy+1] < min.val)
            {
				min.val <- heat[ix,iy+1]
				min.x <- ix
				min.y <- iy+1
			}
			if (heat[ix-1,iy+1] < min.val)
            {
				min.val <- heat[ix-1,iy+1]
				min.x <- ix-1
				min.y <- iy+1
			}
			if (heat[ix-1,iy] < min.val)
            {
				min.val <- heat[ix-1,iy]
				min.x <- ix-1
				min.y <- iy
			}
		}

		# (ix,iy) is a right side element
		else if (ix == xdim && iy > 1 && iy < ydim)
        {
			if (heat[ix-1,iy-1] < min.val)
            {
				min.val <- heat[ix-1,iy-1]
				min.x <- ix-1
				min.y <- iy-1
			}
			if (heat[ix,iy-1] < min.val)
            {
				min.val <- heat[ix,iy-1]
				min.x <- ix
				min.y <- iy-1
			}
			if (heat[ix,iy+1] < min.val)
            {
				min.val <- heat[ix,iy+1]
				min.x <- ix
				min.y <- iy+1
			}
			if (heat[ix-1,iy+1] < min.val)
            {
				min.val <- heat[ix-1,iy+1]
				min.x <- ix-1
				min.y <- iy+1
			}
			if (heat[ix-1,iy] < min.val)
            {
				min.val <- heat[ix-1,iy]
				min.x <- ix-1
				min.y <- iy
			}
		}

		# (ix,iy) is a top side element
		else if (ix > 1 && ix < xdim && iy == ydim)
        {
			if (heat[ix-1,iy-1] < min.val)
            {
				min.val <- heat[ix-1,iy-1]
				min.x <- ix-1
				min.y <- iy-1
			}
			if (heat[ix,iy-1] < min.val)
            {
				min.val <- heat[ix,iy-1]
				min.x <- ix
				min.y <- iy-1
			}
			if (heat[ix+1,iy-1] < min.val)
            {
				min.val <- heat[ix+1,iy-1]
				min.x <- ix+1
				min.y <- iy-1
			}
			if (heat[ix+1,iy] < min.val)
            {
				min.val <- heat[ix+1,iy]
				min.x <- ix+1
				min.y <- iy
			}
			if (heat[ix-1,iy] < min.val)
            {
				min.val <- heat[ix-1,iy]
				min.x <- ix-1
				min.y <- iy
			}
		}

		#if successful
		# move to the square with the smaller value, i.e., call compute.centroid on this new square
		# note the RETURNED x-y coords in the centroid.x and centroid.y matrix at the current location
		# return the RETURNED x-y coordinates
		if (min.x != ix || min.y != iy)
        {
			r.val <- compute.centroid(min.x,min.y)

			# if explicit is set show the exact connected component
			# otherwise construct a connected component where all
			# nodes are connected to a central node
			if (explicit)
            {
				centroid.x[ix,iy] <<- min.x
				centroid.y[ix,iy] <<- min.y
				list(bestx=min.x,besty=min.y)
			}
			else
            {
				centroid.x[ix,iy] <<- r.val$bestx
				centroid.y[ix,iy] <<- r.val$besty
				r.val
			}
		}
		#else
		# we have found a minimum
		# note the current x-y in the centroid.x and centroid.y matrix
		# return the current x-y coordinates
		else
        {
			centroid.x[ix,iy] <<- ix
			centroid.y[ix,iy] <<- iy
			list(bestx=ix,besty=iy)
		}
	} # end function compute.centroid

	### iterate over the map and find the centroid for each element
	for (i in 1:xdim)
    {
		for (j in 1:ydim)
        {
			compute.centroid(i,j)
		}
	}

	list(centroid.x=centroid.x,centroid.y=centroid.y)
}


### compute.umat -- compute the unified distance matrix
# parameters:
# - map is an object of type 'map'
# - smoothing is either NULL, 0, or a positive floating point value controlling the
#         smoothing of the umat representation
# return value:
# - a matrix with the same x-y dims as the original map containing the umat values

compute.umat <- function(map,smoothing=NULL)
{
	d <- dist(data.frame(map$neurons))
	umat <- compute.heat(map,d,smoothing)

	umat
}
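
# Example (a sketch of the smoothing settings, assuming 'm' is a fitted map):
#
#   compute.umat(m)                  # no smoothing
#   compute.umat(m, smoothing = 0)   # smooth.2d with its default bandwidth
#   compute.umat(m, smoothing = 2)   # larger theta gives a smoother landscape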

### compute.heat -- compute a heat value map representation of the given distance matrix
# parameters:
# - map is an object of type 'map'
# - d is a distance matrix computed via the 'dist' function
# - smoothing is either NULL, 0, or a positive floating point value controlling the
#         smoothing of the umat representation
# return value:
# - a matrix with the same x-y dims as the original map containing the heat

compute.heat <- function(map,d.in,smoothing=NULL)
{
	d <- as.matrix(d.in)
	x <- map$xdim
	y <- map$ydim
	heat <- array(data=0,dim=c(x,y))

	if (x == 1 || y == 1)
		stop("compute.heat: heat map can not be computed for a map with a dimension of 1")

	# this function translates our 2-dim map coordinates
	# into the 1-dim coordinates of the neurons
	xl <- function(ix,iy)
    {
        #cat("converting (",ix,",",iy,") to row", ix + (iy-1) *xdim,"\n")
		ix + (iy-1) * x
	}

	# check if the map is larger than 2 x 2 (otherwise it is only corners)
	if (x > 2 && y > 2)
    {
		# iterate over the inner nodes and compute their umat values
		for (ix in 2:(x-1))
        {
			for (iy in 2:(y-1))
            {
				sum <-
					   d[xl(ix,iy),xl(ix-1,iy-1)] +
					   d[xl(ix,iy),xl(ix,iy-1)] +
					   d[xl(ix,iy),xl(ix+1,iy-1)] +
					   d[xl(ix,iy),xl(ix+1,iy)] +
					   d[xl(ix,iy),xl(ix+1,iy+1)] +
					   d[xl(ix,iy),xl(ix,iy+1)] +
					   d[xl(ix,iy),xl(ix-1,iy+1)] +
					   d[xl(ix,iy),xl(ix-1,iy)]
				heat[ix,iy] <- sum/8
			}

		}

		# iterate over bottom x axis
		for (ix in 2:(x-1))
        {
			iy <- 1
			sum <-
				   d[xl(ix,iy),xl(ix+1,iy)] +
				   d[xl(ix,iy),xl(ix+1,iy+1)] +
				   d[xl(ix,iy),xl(ix,iy+1)] +
				   d[xl(ix,iy),xl(ix-1,iy+1)] +
				   d[xl(ix,iy),xl(ix-1,iy)]
			heat[ix,iy] <- sum/5
		}

		# iterate over top x axis
		for (ix in 2:(x-1))
        {
			iy <- y
			sum <-
				   d[xl(ix,iy),xl(ix-1,iy-1)] +
				   d[xl(ix,iy),xl(ix,iy-1)] +
				   d[xl(ix,iy),xl(ix+1,iy-1)] +
				   d[xl(ix,iy),xl(ix+1,iy)] +
				   d[xl(ix,iy),xl(ix-1,iy)]
			heat[ix,iy] <- sum/5
		}

		# iterate over the left y-axis
		for (iy in 2:(y-1))
        {
			ix <- 1
			sum <-
				   d[xl(ix,iy),xl(ix,iy-1)] +
				   d[xl(ix,iy),xl(ix+1,iy-1)] +
				   d[xl(ix,iy),xl(ix+1,iy)] +
				   d[xl(ix,iy),xl(ix+1,iy+1)] +
				   d[xl(ix,iy),xl(ix,iy+1)]
			heat[ix,iy] <- sum/5
		}

		# iterate over the right y-axis
		for (iy in 2:(y-1))
        {
			ix <- x
			sum <-
				   d[xl(ix,iy),xl(ix-1,iy-1)] +
				   d[xl(ix,iy),xl(ix,iy-1)] +
				   d[xl(ix,iy),xl(ix,iy+1)] +
				   d[xl(ix,iy),xl(ix-1,iy+1)] +
				   d[xl(ix,iy),xl(ix-1,iy)]
			heat[ix,iy] <- sum/5
		}
	} # end if

	# compute umat values for corners
	if (x >= 2 && y >= 2)
    {
		# bottom left corner
		ix <- 1
		iy <- 1
		sum <-
				d[xl(ix,iy),xl(ix+1,iy)] +
				d[xl(ix,iy),xl(ix+1,iy+1)] +
				d[xl(ix,iy),xl(ix,iy+1)]
		heat[ix,iy] <- sum/3

		# bottom right corner
		ix <- x
		iy <- 1
		sum <-
			   d[xl(ix,iy),xl(ix,iy+1)] +
			   d[xl(ix,iy),xl(ix-1,iy+1)] +
			   d[xl(ix,iy),xl(ix-1,iy)]
		heat[ix,iy] <- sum/3

		# top left corner
		ix <- 1
		iy <- y
		sum <-
				d[xl(ix,iy),xl(ix,iy-1)] +
				d[xl(ix,iy),xl(ix+1,iy-1)] +
				d[xl(ix,iy),xl(ix+1,iy)]
		heat[ix,iy] <- sum/3

		# top right corner
		ix <- x
		iy <- y
		sum <-
				d[xl(ix,iy),xl(ix-1,iy-1)] +
				d[xl(ix,iy),xl(ix,iy-1)] +
				d[xl(ix,iy),xl(ix-1,iy)]
		heat[ix,iy] <- sum/3
	} # end if

	# smooth the heat map
	xcoords <- c()
	ycoords <- c()
	for (i in 1:y) {
		for (j in 1:x) {
			ycoords <- c(ycoords, i)
			xcoords <- c(xcoords, j)
		}
	}
	xycoords <- data.frame(xcoords,ycoords)

	if (!is.null(smoothing)) {
		if (smoothing == 0)
			heat <- smooth.2d(as.vector(heat),x=as.matrix(xycoords),nrow=x,ncol=y,surface=FALSE)
		else if (smoothing > 0)
			heat <- smooth.2d(as.vector(heat),x=as.matrix(xycoords),nrow=x,ncol=y,surface=FALSE,theta=smoothing)
		else
			stop("compute.heat: bad value for smoothing parameter")
	}

	heat
}

### df.var.test -- a function that applies the F-test testing the ratio
#                  of the variances of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the F-test (default .95)

df.var.test <- function(df1,df2,conf = .95)
{
	if (length(df1) != length(df2))
        stop("df.var.test: cannot compare variances of data frames")

	# init our working arrays
	var.ratio.v <- array(data=1,dim=length(df1))
	var.confintlo.v <- array(data=1,dim=length(df1))
	var.confinthi.v <- array(data=1,dim=length(df1))

	# compute the F-test on each feature in our populations
	for (i in 1:length(df1))
    {
		t <- var.test(df1[[i]],df2[[i]],conf.level=conf)
		var.ratio.v[i] <- t$estimate
		#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
		var.confintlo.v[i] <- t$conf.int[1]
		var.confinthi.v[i] <- t$conf.int[2]
	}

	# return a list with the ratios and conf intervals for each feature
	list(ratio=var.ratio.v,conf.int.lo=var.confintlo.v,conf.int.hi=var.confinthi.v)
}

### df.mean.test -- a function that applies the t-test testing the difference
#                   of the means of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the t-test (default .95)

df.mean.test <- function(df1,df2,conf = .95)
{
	if (ncol(df1) != ncol(df2))
        stop("df.mean.test: cannot compare means of data frames")

	# init our working arrays
	mean.diff.v <- array(data=1,dim=ncol(df1))
	mean.confintlo.v <- array(data=1,dim=ncol(df1))
	mean.confinthi.v <- array(data=1,dim=ncol(df1))

	# compute the t-test on each feature in our populations
	for (i in 1:ncol(df1)) {
		t <- t.test(x=df1[[i]],y=df2[[i]],conf.level=conf)
		mean.diff.v[i] <- t$estimate[1] - t$estimate[2]
		#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
		mean.confintlo.v[i] <- t$conf.int[1]
		mean.confinthi.v[i] <- t$conf.int[2]
	}

	# return a list with the mean differences and conf intervals for each feature
	list(diff=mean.diff.v,conf.int.lo=mean.confintlo.v,conf.int.hi=mean.confinthi.v)
}
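
# Example (a sketch of how map.embed.vm uses these tests): a feature counts as
# converged when the F-test interval contains 1 and the t-test interval
# contains 0, i.e. the two samples look alike in variance and mean.
#
#   a <- data.frame(f = rnorm(100))
#   b <- data.frame(f = rnorm(100))
#   vl <- df.var.test(a, b)
#   ml <- df.mean.test(a, b)
#   vl$conf.int.lo[1] <= 1 && vl$conf.int.hi[1] >= 1    # likely TRUE
#   ml$conf.int.lo[1] <= 0 && ml$conf.int.hi[1] >= 0    # likely TRUE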


### vsom.f - vectorized and optimized version of the stochastic SOM training algorithm written in Fortran90
vsom.f <- function(data,xdim,ydim,alpha,train)
{
    ### some constants
    dr <- nrow(data)
    dc <- ncol(data)
    nr <- xdim*ydim
    nc <- dc # dim of data and neurons is the same

    ### build and initialize the matrix holding the neurons
    cells <- nr * nc        # no. of neurons times number of data dimensions
    v <- runif(cells,-1,1)  # vector with small init values for all neurons
    # NOTE: each row represents a neuron, each column represents a dimension.
    neurons <- matrix(v,nrow=nr,ncol=nc)  # rearrange the vector as matrix


    result <- .Fortran("vsom",
                       as.single(neurons),
                       as.single(data.matrix(data)),
                       as.integer(dr),
                       as.integer(dc),
                       as.integer(xdim),
                       as.integer(ydim),
                       as.single(alpha),
                       as.integer(train),
                       PACKAGE="popsom2")

    # unpack the structure and list in result[1]
    v <- result[1]
    neurons <- matrix(v[[1]],nrow=nr,ncol=nc,byrow=FALSE)  # rearrange the result vector as matrix

    neurons
}


### functions to combine connected components that represent the same cluster
### top level for distance-matrix oriented cluster combine

compute.combined.clusters <- function(map,heat,explicit,range) {
  # compute the connected components
  centroids <- compute.centroids(map,heat,explicit)
  #Get unique centroids
  unique.centroids <- get.unique.centroids(map, centroids)
  #Get distance from centroid to cluster elements for all centroids
  within_cluster_dist <- distance.from.centroids(map, centroids, unique.centroids, heat)
  #Get average pairwise distance between clusters
  between_cluster_dist <- distance.between.clusters(map, centroids, unique.centroids, heat)
  #Get a boolean matrix of whether two components should be combined
  combine_cluster_bools <- combine.decision(within_cluster_dist, between_cluster_dist, range)
  #Create the modified connected components grid
  new_centroid <- new.centroid(combine_cluster_bools, centroids, unique.centroids, map)

  new_centroid
}

### get.unique.centroids -- a function that computes a list of unique centroids from
#                           a matrix of centroid locations.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
get.unique.centroids <- function(map, centroids){
  # get the dimensions of the map
  xdim <- map$xdim
  ydim <- map$ydim
  xlist <- c()
  ylist <- c()
  x.centroid <- centroids$centroid.x
  y.centroid <- centroids$centroid.y

  for(ix in 1:xdim){
    for(iy in 1:ydim) {
      cx <- x.centroid[ix, iy]
      cy <- y.centroid[ix, iy]

      # check whether the (x,y) pair of the current centroid has been seen
      # before and if not append both coordinates to the respective lists
      if(!any(xlist == cx & ylist == cy)){
        xlist <- c(xlist, cx)
        ylist <- c(ylist, cy)
      }
    }
  }

  # return a list of unique centroid positions
  list(position.x=xlist, position.y=ylist)
}

### distance.from.centroids -- A function to get the average distance from
#                              centroid by cluster.
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - heat is a unified distance matrix
distance.from.centroids <- function(map, centroids, unique.centroids, heat){
  xdim <- map$xdim
  ydim <- map$ydim
  centroids.x.positions <- unique.centroids$position.x
  centroids.y.positions <- unique.centroids$position.y
  within <- c()

  for (i in 1:length(centroids.x.positions)){
    cx <- centroids.x.positions[i]
    cy <- centroids.y.positions[i]

    # compute the average distance
    distance <- cluster.spread(cx, cy, heat, centroids, map)

    # append the computed distance to the list of distances
    within <- c(within, distance)
  }

  # return the list
  within
}

### cluster.spread -- Function to calculate the average distance in
#                     one cluster given one centroid.
#
# parameters:
# - x - x position of a unique centroid
# - y - y position of a unique centroid
# - umat - a unified distance matrix
# - centroids - a matrix of the centroid locations in the map
# - map is an object of type 'map'
cluster.spread <- function(x, y, umat, centroids, map){
  centroid.x <- x
  centroid.y <- y
  sum <- 0
  elements <- 0
  xdim <- map$xdim
  ydim <- map$ydim
  centroid_weight <- umat[centroid.x, centroid.y]
  for(xi in 1:xdim){
    for(yi in 1:ydim){
      cx <- centroids$centroid.x[xi, yi]
      cy <- centroids$centroid.y[xi, yi]
      if(cx == centroid.x && cy == centroid.y){
        cweight <- umat[xi,yi]
        sum <- sum + abs(cweight - centroid_weight)
        elements <- elements + 1
      }
    }
  }

  average <- sum / elements
  average
}

### distance.between.clusters -- A function to compute the average pairwise
#                                distance between clusters.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - umat - a unified distance matrix
distance.between.clusters <- function(map, centroids, unique.centroids, umat){
  cluster_elements <- list.clusters(map, centroids, unique.centroids, umat)
  cluster_elements <- sapply(cluster_elements,'[',seq(max(sapply(cluster_elements,length))))

  columns <- ncol(cluster_elements)

  cluster_elements <- matrix(unlist(cluster_elements),ncol = ncol(cluster_elements),byrow = FALSE)
  cluster_elements <- apply(combn(ncol(cluster_elements), 2), 2, function(x)
    abs(cluster_elements[, x[1]] - cluster_elements[, x[2]]))

  mean <- colMeans(cluster_elements, na.rm=TRUE)
  index <- 1
  mat <- matrix(data=NA, nrow=columns, ncol=columns)

  for(xi in 1:(columns-1)){
    for (yi in xi:(columns-1)){
      mat[xi, yi + 1] <- mean[index]
      mat[yi + 1, xi] <- mean[index]
      index <- index + 1
    }
  }

  mat
}

### list.clusters -- A function to get the clusters as a list of lists.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - umat - a unified distance matrix
list.clusters <- function(map,centroids,unique.centroids,umat){
  centroids.x.positions <- unique.centroids$position.x
  centroids.y.positions <- unique.centroids$position.y
  componentx <- centroids$centroid.x
  componenty <- centroids$centroid.y
  cluster_list <- list()

  for(i in 1:length(centroids.x.positions)){
    cx <- centroids.x.positions[i]
    cy <- centroids.y.positions[i]

    # get the clusters associated with a unique centroid and store it in a list
    cluster_list[i] <- list.from.centroid(map, cx, cy, centroids, umat)
  }
  cluster_list
}

### list.from.centroid -- A function to get all cluster elements
#                         associated to one centroid.
#
# parameters:
# - map is an object of type 'map'
# - x - the x position of a centroid
# - y - the y position of a centroid
# - centroids - a matrix of the centroid locations in the map
# - umat - a unified distance matrix
list.from.centroid <- function(map,x,y,centroids,umat){
  centroid.x <- x
  centroid.y <- y
  sum <- 0
  xdim <- map$xdim
  ydim <- map$ydim
  centroid_weight <- umat[centroid.x, centroid.y]
  cluster_list <- c()
  for(xi in 1:xdim){
    for(yi in 1:ydim){
      cx <- centroids$centroid.x[xi, yi]
      cy <- centroids$centroid.y[xi, yi]

      if(cx == centroid.x && cy == centroid.y){
        cweight <- umat[xi, yi]
        cluster_list <- c(cluster_list, cweight)
      }
    }
  }
  list(cluster_list)
}

### combine.decision -- A function that produces a boolean matrix
#                       representing which clusters should be combined.
#
# parameters:
# - within_cluster_dist - A list of the distances from centroid to cluster elements for all centroids
# - distance_between_clusters - A list of the average pairwise distance between clusters
# - range is the distance where the clusters are merged together.
combine.decision <- function(within_cluster_dist,distance_between_clusters,range){
  inter_cluster <- distance_between_clusters
  centroid_dist <- within_cluster_dist
  dim <- dim(inter_cluster)[1]
  to_combine <- matrix(data=FALSE, nrow=dim, ncol=dim)
  for(xi in 1:dim){
    for(yi in 1:dim){
      cdist <- inter_cluster[xi,yi]
      if(!is.na(cdist)){
        rx <- centroid_dist[xi] * range
        ry <- centroid_dist[yi] * range
        if(cdist < (centroid_dist[xi] + rx) || cdist < (centroid_dist[yi] + ry)){
          to_combine[xi, yi] <- TRUE
        }
      }
    }
  }
  to_combine
}
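
# Example (a worked sketch with hypothetical distances): with range = .25 two
# clusters are flagged for combination when their between-cluster distance is
# within 125% of either cluster's within-cluster distance (the test is an OR).
#
#   within  <- c(1.0, 3.0)
#   between <- matrix(c(NA, 2.4, 2.4, NA), nrow = 2)
#   combine.decision(within, between, .25)
#   # 2.4 < 1.0 * 1.25 is FALSE, but 2.4 < 3.0 * 1.25 is TRUE,
#   # so both off-diagonal entries come out TRUE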

### swap.centroids -- A function that changes every instance of a centroid to
#                     one that it should be combined with.
# parameters:
# - map is an object of type 'map'
# - x1, y1 - the map coordinates of the centroid to be replaced
# - x2, y2 - the map coordinates of the replacement centroid
# - centroids - a matrix of the centroid locations in the map
swap.centroids <- function(map, x1, y1, x2, y2, centroids){
  xdim <- map$xdim
  ydim <- map$ydim
  compn_x <- centroids$centroid.x
  compn_y <- centroids$centroid.y
  for(xi in 1:xdim){
    for(yi in 1:ydim){
      # centroid.x and centroid.y are xdim x ydim matrices, so index with [xi,yi]
      if(compn_x[xi,yi] == x1 && compn_y[xi,yi] == y1){
        compn_x[xi,yi] <- x2
        compn_y[xi,yi] <- y2
      }
    }
  }

  list(centroid.x=compn_x, centroid.y=compn_y)
}


### new.centroid -- A function to combine centroids based on matrix of booleans.
#
# parameters:
#
# - bmat - a boolean matrix containing the centroids to merge
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - map is an object of type 'map'
new.centroid <- function(bmat, centroids, unique.centroids, map){
  bmat.rows <- dim(bmat)[1]
  bmat.columns <- dim(bmat)[2]
  centroids_x <- unique.centroids$position.x
  centroids_y <- unique.centroids$position.y
  components <- centroids
  for(xi in 1:bmat.rows){
    for(yi in 1:bmat.columns){
      if(bmat[xi,yi] == TRUE){
        x1 <- centroids_x[xi]
        y1 <- centroids_y[xi]
        x2 <- centroids_x[yi]
        y2 <- centroids_y[yi]
        components <- swap.centroids(map, x1, y1, x2, y2, components)
      }
    }
  }
  components
}


avg.homogeneity <- function(map,explicit=FALSE,smoothing=2,merge=FALSE,merge.range=.25)
{
    ### keep an unaltered copy of the unified distance matrix,
    ### required for merging the starburst clusters
    umat <- compute.umat(map,smoothing=smoothing)

    x <- map$xdim
    y <- map$ydim
    nobs <- nrow(map$data)
    centroid.labels <- array(data=list(),dim=c(x,y))

    ### need to make sure the map doesn't have a dimension of 1
    if (x <= 1 || y <= 1)
    {
        stop("avg.homogeneity: map dimensions too small")
    }

    if (is.null(map$labels))
    {
	stop("avg.homogeneity: you need to attach labels to the map")
    }

   if(!merge)
   {
     # find the centroid for each neuron on the map
     centroids <- compute.centroids(map,umat,explicit)
   }
   else
   {
     # find the unique centroids for the neurons on the map
      centroids <- compute.combined.clusters(map,umat,explicit,merge.range)
   }


    ### attach labels to centroids
    # count the labels in each map cell
    for(i in 1:nobs)
    {
        lab <- as.character(map$labels[i,1])
        nix <- map$visual[i]
        c <- coordinate(map,nix)
        ix <- c[1]
        iy <- c[2]
        cx <- centroids$centroid.x[ix,iy]
        cy <- centroids$centroid.y[ix,iy]
        centroid.labels[[cx,cy]] <- append(centroid.labels[[cx,cy]],lab)
     }

    ### compute average homogeneity of the map: h = (1/nobs)*sum_c majority.label_c
    sum.majority <- 0
    n.centroids <- 0

    for (ix in 1:x)
    {
	for (iy in 1:y)
        {
	    label.v <- centroid.labels[[ix,iy]]
	    if (length(label.v)!=0)
	    {
		n.centroids <- n.centroids + 1
		majority <- data.frame(sort(table(label.v),decreasing=TRUE))

		if (nrow(majority) == 1) # only one label
		{
		   m.val <- length(label.v)
		}
		else
		{
		   m.val <- majority[1,2]
		}
		sum.majority <- sum.majority + m.val
	    }
	}
    }
    list(homog=sum.majority/nobs, nclust=n.centroids)
}
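
# Example (a sketch, assuming 'm' is a fitted map with labels attached):
#
#   h <- avg.homogeneity(m)
#   h$homog     # fraction of observations matching their cluster's majority label
#   h$nclust    # number of non-empty clusters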