### map-utils.R
# version 4.3
# (c) 2009-2020 Lutz Hamel, Benjamin Ott, Greg Breard, University of Rhode Island
# with Robert Tatoian and Vishakh Gopu
#
# This file constitutes a set of routines which are useful in constructing
# and evaluating self-organizing maps (SOMs).
# The main utilities available in this file are:
# map.fit ----------- constructs a map
# map.embed --------- reports the embedding of the map in terms of modeling the
# underlying data distribution (100% if all feature distributions
# are modeled correctly, 0% if none are)
# map.significance -- graphically reports the significance of each feature with
# respect to the self-organizing map model
# map.starburst ----- displays the starburst representation of the SOM model, the centers of
# starbursts are the centers of clusters
# map.neuron -------- returns the contents of a neuron at (x,y) on the map as a vector
# map.marginal ------ displays a density plot of a training dataframe dimension overlaid
# with the neuron density for that same dimension or index.
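#
# Example (illustrative sketch; the column selection and map dimensions below are
# arbitrary choices, not package defaults):
#   m <- map.fit(iris[,1:4], labels=data.frame(iris$Species), xdim=15, ydim=10)
#   map.embed(m)                      # fraction of data variance captured by the map
#   map.significance(m)               # relative importance of each feature
#   map.starburst(m)                  # starburst/cluster visualization
#   map.marginal(m, "Sepal.Length")   # neuron vs training density for one dimension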
#
#
### License
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#
###
# load libraries
require(class)
require(fields)
require(graphics)
require(ggplot2)
### map.fit -- construct a SOM, returns an object of class 'map'
# parameters:
# - data - a dataframe where each row contains an unlabeled training instance
# - labels - a vector or dataframe with one label for each observation in data
# - xdim,ydim - the dimensions of the map
# - alpha - the learning rate, should be a positive non-zero real number
# - train - number of training iterations
# - normalize - normalize switch
# - seed - set seed for random init of neurons
# returns:
# - an object of type 'map' -- see below
# Hint: if your training data does not have any labels you can construct a
# simple label dataframe as follows: labels <- data.frame(1:nrow(training.data))
# NOTE: the default training algorithm is "vsom"; "som", "experimental", and "batchsom"
#       are also available in the package but are not wired up in this file
map.fit <- function(data,labels=NULL,xdim=10,ydim=5,alpha=.3,train=1000,normalize=TRUE,seed=NULL)
{
#TODO: implement normalize switch, implement seed
neurons <- vsom.f(data,
                  xdim=xdim,
                  ydim=ydim,
                  alpha=alpha,
                  train=train)
# make the neuron data a data frame
neurons <- data.frame(neurons)
names(neurons) <- names(data)
### construct the map object
# note: the map object has to exist before we can compute the visual field below
map <- list(data=data,
            labels=labels,
            xdim=xdim,
            ydim=ydim,
            alpha=alpha,
            train=train,
            algorithm="vsom",   # vsom.f is the only training algorithm wired up in this file
            neurons=neurons)
### add the class name
class(map) <- "map"
### add the visual field to the map
# for each observation i, visual holds the row index of its best matching neuron
visual <- c()
for (i in 1:nrow(data))
{
b <- best.match(map,data[i,])
visual <- c(visual,b)
}
map$visual <- visual
return(map)
}
### map.embed - evaluate the embedding of a map using the F-test and
# a Bayesian estimate of the variance in the training data.
# parameters:
# - map is an object of type 'map'
# - conf.int is the confidence interval of the convergence test (default 95%)
# - verb is a switch that governs the return value: FALSE returns a single
#   embedding value, TRUE returns a vector of individual feature embeddings
# - ks is a switch: TRUE uses the Kolmogorov-Smirnov convergence test,
#   FALSE (default) uses the variance and mean (F-test/t-test) convergence test
#
# - return value is the embedding of the map (variance captured by the map so far)
# Hint: the embedding index is the variance of the training data captured by the map.
map.embed <- function(map,conf.int=.95,verb=FALSE,ks=FALSE)
{
if (ks)
map.embed.ks(map,conf.int,verb)
else
map.embed.vm(map,conf.int,verb)
}
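# Example (sketch, assuming 'm' is a fitted map object returned by map.fit):
#   map.embed(m)             # single value: fraction of data variance captured by the map
#   map.embed(m, verb=TRUE)  # per-feature embedding vector (0 for non-converged features)
#   map.embed(m, ks=TRUE)    # use the Kolmogorov-Smirnov convergence test instead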
### map.starburst - compute and display the starburst representation of clusters
# parameters:
# - map is an object of type 'map'
# - explicit controls the shape of the connected components
# - smoothing controls the smoothing level of the umat (NULL,0,>0)
# - merge.clusters is a switch that controls if the starburst clusters are merged together
# - merge.range - a fraction of the within-cluster distance: two clusters are merged
#   if their average pairwise distance is smaller than (1 + merge.range) times the
#   average distance of either cluster's elements to its own centroid
map.starburst <- function(map,explicit=FALSE,smoothing=2,merge.clusters=FALSE,merge.range=.25)
{
if (class(map) != "map")
stop("map.starburst: first argument is not a map object.")
umat <- compute.umat(map,smoothing=smoothing)
plot.heat(map,umat,explicit=explicit,comp=TRUE,merge=merge.clusters,merge.range=merge.range)
}
### map.significance - compute the relative significance of each feature and plot it
# parameters:
# - map is an object of type 'map'
# - graphics is a switch that controls whether a plot is generated or not
# - feature.labels is a switch to allow the plotting of feature names vs feature indices
# return value:
# - a vector containing the significance for each feature
map.significance <- function(map,graphics=TRUE,feature.labels=TRUE)
{
if (class(map) != "map")
stop("map.significance: first argument is not a map object.")
data.df <- data.frame(map$data)
nfeatures <- ncol(data.df)
# Compute the variance of each feature on the map
var.v <- array(data=1,dim=nfeatures)
for (i in 1:nfeatures)
{
var.v[i] <- var(data.df[[i]]);
}
# we use the variance of a feature as likelihood of
# being an important feature, compute the Bayesian
# probability of significance using uniform priors
var.sum <- sum(var.v)
prob.v <- var.v/var.sum
# plot the significance
if (graphics)
{
par.v <- map.graphics.set()
y <- max(prob.v)
plot.new()
plot.window(xlim=c(1,nfeatures),ylim=c(0,y))
box()
title(xlab="Features",ylab="Significance")
xticks <- seq(1,nfeatures,1)
yticks <- seq(0,y,y/4)
if (feature.labels)
xlabels <- names(data.df)
else
xlabels <- seq(1,nfeatures,1)
ylabels <- formatC(seq(0,y,y/4),digits=2)
axis(1,at=xticks,labels=xlabels)
axis(2,at=yticks,labels=ylabels)
points(1:nfeatures,prob.v,type="h")
map.graphics.reset(par.v)
# return the significance vector invisibly when a plot was produced
invisible(prob.v)
}
else
{
prob.v
}
}
### map.neuron - returns the contents of a neuron at (x,y) on the map as a vector
# parameters:
# map - the neuron map
# x - map x-coordinate of neuron
# y - map y-coordinate of neuron
# return value:
# a vector representing the neuron
map.neuron <- function(map,x,y)
{
if (class(map) != "map")
stop("map.neuron: first argument is not a map object.")
ix <- rowix(map,x,y)
map$neurons[ix,]
}
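# Example (sketch, assuming 'm' is a fitted map object):
#   map.neuron(m, 3, 2)   # weight vector of the neuron at map coordinate x=3, y=2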
### map.marginal - plot that shows the marginal probability distribution of the neurons and data
# parameters:
# - map is an object of type 'map'
# - marginal is the name of a training data frame dimension or index
map.marginal <- function(map,marginal)
{
# ensure that map is a 'map' object
if (class(map) != "map")
stop("map.marginal: first argument is not a map object.")
# if the second argument is a numeric column index rather than a character name
if (!is.character(marginal))
{
train <- data.frame(points = map$data[[marginal]])
neurons <- data.frame(points = map$neurons[[marginal]])
train$legend <- 'training data'
neurons$legend <- 'neurons'
hist <- rbind(train,neurons)
ggplot(hist, aes(points, fill = legend)) + geom_density(alpha = 0.2) + xlab(names(map$data)[marginal])
}
else if (marginal %in% names(map$data))
{
train <- data.frame(points = map$data[names(map$data) == marginal])
colnames(train) <- c("points")
neurons <- data.frame(points = map$neurons[names(map$neurons) == marginal])
colnames(neurons) <- c("points")
train$legend <- 'training data'
neurons$legend <- 'neurons'
hist <- rbind(train,neurons)
ggplot(hist, aes(points, fill = legend)) + geom_density(alpha = 0.2) + xlab(marginal)
}
else
{
stop("map.marginal: second argument is not the name of a training data frame dimension or index")
}
}
############################### local functions #################################
# map.embed using variance and mean tests
map.embed.vm <- function(map,conf.int=.95,verb=FALSE)
{
if (class(map) != "map")
stop("map.embed: first argument is not a map object.")
# map.df is a dataframe that contains the neurons
map.df <- data.frame(map$neurons)
# data.df is a dataframe that contains the training data
# note: map$data is what the 'som' package returns
data.df <- data.frame(map$data)
# do the F-test on a pair of datasets: code vectors/training data
vl <- df.var.test(map.df,data.df,conf=conf.int)
# do the t-test on a pair of datasets: code vectors/training data
ml <- df.mean.test(map.df,data.df,conf=conf.int)
# compute the variance captured by the map -- but only if the means have converged as well.
nfeatures <- ncol(map.df)
prob.v <- map.significance(map,graphics=FALSE)
var.sum <- 0
for (i in 1:nfeatures)
{
#cat("Feature",i,"variance:\t",vl$ratio[i],"\t(",vl$conf.int.lo[i],"-",vl$conf.int.hi[i],")\n")
#cat("Feature",i,"mean:\t",ml$diff[i],"\t(",ml$conf.int.lo[i],"-",ml$conf.int.hi[i],")\n")
if (vl$conf.int.lo[i] <= 1.0 && vl$conf.int.hi[i] >= 1.0 && ml$conf.int.lo[i] <= 0.0 && ml$conf.int.hi[i] >= 0.0) {
var.sum <- var.sum + prob.v[i]
} else {
# not converged - zero out the probability
prob.v[i] <- 0
}
}
# return the variance captured by converged features
if (verb)
prob.v
else
var.sum
}
# map.embed using the Kolmogorov-Smirnov test
map.embed.ks <- function(map,conf.int=.95,verb=FALSE) {
if (class(map) != "map") {
stop("map.embed: first argument is not a map object.")
}
# map.df is a dataframe that contains the neurons
map.df <- data.frame(map$neurons)
# data.df is a dataframe that contains the training data
# note: map$data is what the 'som' package returns
data.df <- data.frame(map$data)
nfeatures <- ncol(map.df)
# use the Kolmogorov-Smirnov Test to test whether the neurons and training data appear
# to come from the same distribution
ks.vector <- NULL
for(i in 1:nfeatures){
# note: suppressWarnings is used to silence the ks.test warning about ties
ks.vector[[i]] <- suppressWarnings(ks.test(map.df[[i]], data.df[[i]]))
}
prob.v <- map.significance(map,graphics=FALSE)
var.sum <- 0
# compute the variance captured by the map
for (i in 1:nfeatures)
{
# the second entry contains the p-value
if (ks.vector[[i]][[2]] > (1 - conf.int)) {
var.sum <- var.sum + prob.v[i]
} else {
# not converged - zero out the probability
prob.v[i] <- 0
}
}
# return the variance captured by converged features
if (verb)
prob.v
else
var.sum
}
# map.normalize -- based on the som::normalize function but preserves the column names
map.normalize <- function (x, byrow = TRUE)
{
if (is.data.frame(x))
{
if (byrow)
df <- data.frame(t(apply(x, 1, scale)))
else
df <- data.frame(apply(x, 2, scale))
names(df) <- names(x)
df
}
else
stop("map.normalize: 'x' is not a dataframe.\n")
}
# best.match -- given observation obs, return the best matching neuron
best.match <- function(map,obs,full=FALSE)
{
# NOTE: replicate obs so that there are nr rows of obs
obs.m <- matrix(as.numeric(obs),nrow(map$neurons),ncol(map$neurons),byrow=TRUE)
diff <- map$neurons - obs.m
squ <- diff * diff
s <- rowSums(squ)
d <- sqrt(s)
o <- order(d)
if (full)
o
else
o[1]
}
# coordinate -- convert from a row index to a map xy-coordinate
coordinate <- function(map,rowix)
{
x <- (rowix-1) %% map$xdim + 1
y <- (rowix-1) %/% map$xdim + 1
c(x,y)
}
#rowix -- convert from a map xy-coordinate to a row index
rowix <- function(map,x,y)
{
rix <- x + (y-1)*map$xdim
rix
}
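# Example: on a map with xdim=10, the neuron at map coordinate (x=3,y=2) lives in
# row 3 + (2-1)*10 = 13 of the neuron matrix, and coordinate(map,13) recovers c(3,2).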
# map.graphics.set -- set the graphics environment for our map utilities
# the return value is the original graphics param vector
map.graphics.set <- function()
{
par.v <- par()
par(ps=6)
par.v
}
# map.graphics.reset -- reset the graphics environment to the original state
# parameter - a vector containing the settings for the original state
map.graphics.reset <- function(par.vector)
{
par(ps=par.vector$ps)
}
### plot.heat - plot a heat map based on a 'map', this plot also contains the connected
# components of the map based on the landscape of the heat map
# parameters:
# - map is an object of type 'map'
# - heat is a 2D heat map of the map as computed by 'compute.umat' or 'compute.heat'
# - explicit controls the shape of the connected components
# - comp controls whether we plot the connected components on the heat map
# - merge controls whether we merge the starbursts together
# - merge.range - a fraction of the within-cluster distance used to decide whether
#   two clusters are close enough to be merged (see combine.decision)
# note: if labels are attached to the map (map$labels) they are printed on the heat map
plot.heat <- function(map,heat,explicit=FALSE,comp=TRUE,merge=FALSE,merge.range)
{
### keep an unaltered copy of the unified distance matrix,
### required for merging the starburst clusters
umat <- heat
x <- map$xdim
y <- map$ydim
nobs <- nrow(map$data)
count <- array(data=0,dim=c(x,y))
### need to make sure the map doesn't have a dimension of 1
if (x <= 1 || y <= 1)
{
stop("plot.heat: map dimensions too small")
}
### bin the heat values into 100 bins used for the 100 heat colors
heat.v <- as.vector(heat)
heat.v <- cut(heat.v,breaks=100,labels=FALSE)
heat <- array(data=heat.v,dim=c(x,y))
colors<- heat.colors(100)
### set up the graphics window
par.v <- map.graphics.set()
plot.new()
plot.window(xlim=c(0,x),ylim=c(0,y))
box()
title(xlab="x",ylab="y")
xticks <- seq(0.5,x-0.5,1)
yticks <- seq(0.5,y-0.5,1)
xlabels <- seq(1,x,1)
ylabels <- seq(1,y,1)
axis(1,at=xticks,labels=xlabels)
axis(3,at=xticks,labels=xlabels)
axis(2,at=yticks,labels=ylabels)
axis(4,at=yticks,labels=ylabels)
### plot the neurons as heat squares on the map
# TODO: vectorize this - rect can operate on vectors of coordinates and values
for (ix in 1:x)
{
for (iy in 1:y)
{
rect(ix-1,iy-1,ix,iy,col=colors[100 - heat[ix,iy] + 1],border=NA)
}
}
### put the connected component lines on the map
if (comp)
{
if(!merge){
# find the centroid for each neuron on the map
centroids <- compute.centroids(map,heat,explicit)
} else {
# find the unique centroids for the neurons on the map
centroids <- compute.combined.clusters(map,umat,explicit,merge.range)
}
# connect each neuron to its centroid
for(ix in 1:x)
{
for (iy in 1:y)
{
cx <- centroids$centroid.x[ix,iy]
cy <- centroids$centroid.y[ix,iy]
points(c(ix,cx)-.5,c(iy,cy)-.5,type="l",col="grey")
}
}
}
### put the labels on the map if available
if (!is.null(map$labels))
{
# count the labels in each map cell
for(i in 1:nobs)
{
nix <- map$visual[i]
c <- coordinate(map,nix)
ix <- c[1]
iy <- c[2]
count[ix,iy] <- count[ix,iy]+1
}
#count.df <- data.frame(count)
#print(count.df)
for(i in 1:nobs)
{
c <- coordinate(map,map$visual[i])
#cat("Coordinate of ",i," is ",c,"\n")
ix <- c[1]
iy <- c[2]
# we only print one label per cell
# TODO: print out majority label
if (count[ix,iy] > 0)
{
count[ix,iy] <- 0
ix <- ix - .5
iy <- iy - .5
l <- map$labels[i,1]
text(ix,iy,labels=l)
}
}
}
map.graphics.reset(par.v)
}
### compute.centroids -- compute the centroid for each point on the map
# parameters:
# - map is an object of type 'map'
# - heat is a matrix representing the heat map representation
# - explicit controls the shape of the connected component
# return value:
# - a list containing the matrices with the same x-y dims as the original map containing the centroid x-y coordinates
compute.centroids <- function(map,heat,explicit=FALSE)
{
xdim <- map$xdim
ydim <- map$ydim
centroid.x <- array(data=-1,dim=c(xdim,ydim))
centroid.y <- array(data=-1,dim=c(xdim,ydim))
max.val <- max(heat)
### recursive function to find the centroid of a point on the map
compute.centroid <- function(ix,iy)
{
# first we check if the current position is already associated
# with a centroid. if so, simply return the coordinates
# of that centroid
if (centroid.x[ix,iy] > -1 && centroid.y[ix,iy] > -1)
{
return(list(bestx=centroid.x[ix,iy],besty=centroid.y[ix,iy]))
}
# try to find a smaller value in the immediate neighborhood
# make our current position the square with the minimum value.
# if a minimum value other than our own current value cannot be
# found then we are at a minimum.
#
# search the neighborhood; three different cases: inner element, corner element, side element
# TODO: there has to be a better way! (a more compact alternative is sketched after this function)
min.val <- heat[ix,iy]
min.x <- ix
min.y <- iy
# (ix,iy) is an inner map element
if (ix > 1 && ix < xdim && iy > 1 && iy < ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is bottom left corner
else if (ix == 1 && iy == 1)
{
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
}
# (ix,iy) is bottom right corner
else if (ix == xdim && iy == 1)
{
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is top right corner
else if (ix == xdim && iy == ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is top left corner
else if (ix == 1 && iy == ydim)
{
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
}
# (ix,iy) is a left side element
else if (ix == 1 && iy > 1 && iy < ydim)
{
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
}
# (ix,iy) is a bottom side element
else if (ix > 1 && ix < xdim && iy == 1 )
{
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix+1,iy+1] < min.val)
{
min.val <- heat[ix+1,iy+1]
min.x <- ix+1
min.y <- iy+1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is a right side element
else if (ix == xdim && iy > 1 && iy < ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix,iy+1] < min.val)
{
min.val <- heat[ix,iy+1]
min.x <- ix
min.y <- iy+1
}
if (heat[ix-1,iy+1] < min.val)
{
min.val <- heat[ix-1,iy+1]
min.x <- ix-1
min.y <- iy+1
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
# (ix,iy) is a top side element
else if (ix > 1 && ix < xdim && iy == ydim)
{
if (heat[ix-1,iy-1] < min.val)
{
min.val <- heat[ix-1,iy-1]
min.x <- ix-1
min.y <- iy-1
}
if (heat[ix,iy-1] < min.val)
{
min.val <- heat[ix,iy-1]
min.x <- ix
min.y <- iy-1
}
if (heat[ix+1,iy-1] < min.val)
{
min.val <- heat[ix+1,iy-1]
min.x <- ix+1
min.y <- iy-1
}
if (heat[ix+1,iy] < min.val)
{
min.val <- heat[ix+1,iy]
min.x <- ix+1
min.y <- iy
}
if (heat[ix-1,iy] < min.val)
{
min.val <- heat[ix-1,iy]
min.x <- ix-1
min.y <- iy
}
}
#if successful
# move to the square with the smaller value, i.e., call compute.centroid on this new square
# note the RETURNED x-y coords in the centroid.x and centroid.y matrix at the current location
# return the RETURNED x-y coordinates
if (min.x != ix || min.y != iy)
{
r.val <- compute.centroid(min.x,min.y)
# if explicit is set show the exact connected component
# otherwise construct a connected component where all
# nodes are connected to a central node
if (explicit)
{
centroid.x[ix,iy] <<- min.x
centroid.y[ix,iy] <<- min.y
list(bestx=min.x,besty=min.y)
}
else
{
centroid.x[ix,iy] <<- r.val$bestx
centroid.y[ix,iy] <<- r.val$besty
r.val
}
}
#else
# we have found a minimum
# note the current x-y in the centroid.x and centroid.y matrix
# return the current x-y coordinates
else
{
centroid.x[ix,iy] <<- ix
centroid.y[ix,iy] <<- iy
list(bestx=ix,besty=iy)
}
} # end function compute.centroid
### iterate over the map and find the centroid for each element
for (i in 1:xdim)
{
for (j in 1:ydim)
{
compute.centroid(i,j)
}
}
list(centroid.x=centroid.x,centroid.y=centroid.y)
}
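### note: the explicit neighborhood case analysis in compute.centroid above (see the
### TODO) could in principle be replaced by a single scan over the 3x3 neighborhood
### clipped to the map border. The function below is an illustrative sketch of that
### idea only -- it is not called anywhere in this file. Ties keep the current cell
### because the comparison is strict; the tie-breaking order may differ slightly from
### the explicit cases above.
min.neighbor.sketch <- function(heat,ix,iy,xdim,ydim)
{
best.x <- ix
best.y <- iy
best.val <- heat[ix,iy]
for (nx in max(1,ix-1):min(xdim,ix+1))
{
for (ny in max(1,iy-1):min(ydim,iy+1))
{
if (heat[nx,ny] < best.val)
{
best.val <- heat[nx,ny]
best.x <- nx
best.y <- ny
}
}
}
list(min.x=best.x,min.y=best.y,min.val=best.val)
}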
### compute.umat -- compute the unified distance matrix
# parameters:
# - map is an object of type 'map'
# - smoothing is either NULL, 0, or a positive floating point value controlling the
# smoothing of the umat representation
# return value:
# - a matrix with the same x-y dims as the original map containing the umat values
compute.umat <- function(map,smoothing=NULL)
{
d <- dist(data.frame(map$neurons))
umat <- compute.heat(map,d,smoothing)
umat
}
### compute.heat -- compute a heat value map representation of the given distance matrix
# parameters:
# - map is an object of type 'map'
# - d is a distance matrix computed via the 'dist' function
# - smoothing is either NULL, 0, or a positive floating point value controlling the
# smoothing of the umat representation
# return value:
# - a matrix with the same x-y dims as the original map containing the heat
compute.heat <- function(map,d.in,smoothing=NULL)
{
d <- as.matrix(d.in)
x <- map$xdim
y <- map$ydim
heat <- array(data=0,dim=c(x,y))
if (x == 1 || y == 1)
stop("compute.heat: heat map can not be computed for a map with a dimension of 1")
# this function translates our 2-dim map coordinates
# into the 1-dim coordinates of the neurons
xl <- function(ix,iy)
{
#cat("converting (",ix,",",iy,") to row", ix + (iy-1) *xdim,"\n")
ix + (iy-1) * x
}
# check if the map is larger than 2 x 2 (otherwise it is only corners)
if (x > 2 && y > 2)
{
# iterate over the inner nodes and compute their umat values
for (ix in 2:(x-1))
{
for (iy in 2:(y-1))
{
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/8
}
}
# iterate over bottom x axis
for (ix in 2:(x-1))
{
iy <- 1
sum <-
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
# iterate over top x axis
for (ix in 2:(x-1))
{
iy <- y
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
# iterate over the left y-axis
for (iy in 2:(y-1))
{
ix <- 1
sum <-
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)]
heat[ix,iy] <- sum/5
}
# iterate over the right y-axis
for (iy in 2:(y-1))
{
ix <- x
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/5
}
} # end if
# compute umat values for corners
if (x >= 2 && y >= 2)
{
# bottom left corner
ix <- 1
iy <- 1
sum <-
d[xl(ix,iy),xl(ix+1,iy)] +
d[xl(ix,iy),xl(ix+1,iy+1)] +
d[xl(ix,iy),xl(ix,iy+1)]
heat[ix,iy] <- sum/3
# bottom right corner
ix <- x
iy <- 1
sum <-
d[xl(ix,iy),xl(ix,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy+1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/3
# top left corner
ix <- 1
iy <- y
sum <-
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy-1)] +
d[xl(ix,iy),xl(ix+1,iy)]
heat[ix,iy] <- sum/3
# top right corner
ix <- x
iy <- y
sum <-
d[xl(ix,iy),xl(ix-1,iy-1)] +
d[xl(ix,iy),xl(ix,iy-1)] +
d[xl(ix,iy),xl(ix-1,iy)]
heat[ix,iy] <- sum/3
} # end if
# smooth the heat map
xcoords <- c()
ycoords <- c()
for (i in 1:y) {
for (j in 1:x) {
ycoords <- c(ycoords, i)
xcoords <- c(xcoords, j)
}
}
xycoords <- data.frame(xcoords,ycoords)
if (!is.null(smoothing)) {
if (smoothing == 0)
heat <- smooth.2d(as.vector(heat),x=as.matrix(xycoords),nrow=x,ncol=y,surface=FALSE)
else if (smoothing > 0)
heat <- smooth.2d(as.vector(heat),x=as.matrix(xycoords),nrow=x,ncol=y,surface=FALSE,theta=smoothing)
else
stop("compute.heat: bad value for smoothing parameter")
}
heat
}
### df.var.test -- a function that applies the F-test testing the ratio
# of the variances of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the F-test (default .95)
df.var.test <- function(df1,df2,conf = .95)
{
if (length(df1) != length(df2))
stop("df.var.test: cannot compare variances of data frames")
# init our working arrays
var.ratio.v <- array(data=1,dim=length(df1))
var.confintlo.v <- array(data=1,dim=length(df1))
var.confinthi.v <- array(data=1,dim=length(df1))
# compute the F-test on each feature in our populations
for (i in 1:length(df1))
{
t <- var.test(df1[[i]],df2[[i]],conf.level=conf)
var.ratio.v[i] <- t$estimate
#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
var.confintlo.v[i] <- t$conf.int[1]
var.confinthi.v[i] <- t$conf.int[2]
}
# return a list with the ratios and conf intervals for each feature
list(ratio=var.ratio.v,conf.int.lo=var.confintlo.v,conf.int.hi=var.confinthi.v)
}
### df.mean.test -- a function that applies the t-test testing the difference
# of the means of the two data frames
# parameters:
# - df1,df2 - data frames with the same number of columns
# - conf - confidence level for the t-test (default .95)
df.mean.test <- function(df1,df2,conf = .95)
{
if (ncol(df1) != ncol(df2))
stop("df.mean.test: cannot compare means of data frames")
# init our working arrays
mean.diff.v <- array(data=1,dim=ncol(df1))
mean.confintlo.v <- array(data=1,dim=ncol(df1))
mean.confinthi.v <- array(data=1,dim=ncol(df1))
# compute the t-test on each feature in our populations
for (i in 1:ncol(df1)) {
t <- t.test(x=df1[[i]],y=df2[[i]],conf.level=conf)
mean.diff.v[i] <- t$estimate[1] - t$estimate[2]
#cat("Feature",i,"confidence interval =",t$conf.int,"\n")
mean.confintlo.v[i] <- t$conf.int[1]
mean.confinthi.v[i] <- t$conf.int[2]
}
# return a list with the mean differences and conf intervals for each feature
list(diff=mean.diff.v,conf.int.lo=mean.confintlo.v,conf.int.hi=mean.confinthi.v)
}
### vsom.f - vectorized and optimized version of the stochastic SOM training algorithm written in Fortran90
vsom.f <- function(data,xdim,ydim,alpha,train)
{
### some constants
dr <- nrow(data)
dc <- ncol(data)
nr <- xdim*ydim
nc <- dc # dim of data and neurons is the same
### build and initialize the matrix holding the neurons
cells <- nr * nc # no. of neurons times number of data dimensions
v <- runif(cells,-1,1) # vector with small init values for all neurons
# NOTE: each row represents a neuron, each column represents a dimension.
neurons <- matrix(v,nrow=nr,ncol=nc) # rearrange the vector as matrix
result <- .Fortran("vsom",
as.single(neurons),
as.single(data.matrix(data)),
as.integer(dr),
as.integer(dc),
as.integer(xdim),
as.integer(ydim),
as.single(alpha),
as.integer(train),
PACKAGE="popsom2")
# unpack the structure and list in result[1]
v <- result[1]
neurons <- matrix(v[[1]],nrow=nr,ncol=nc,byrow=FALSE) # rearrange the result vector as matrix
neurons
}
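### note: vsom.f delegates the actual training loop to Fortran. The function below is an
### illustrative pure-R sketch of a single stochastic training step (it is not used by
### this file): find the best matching unit for one observation and pull every neuron
### within a square neighborhood of radius nsize toward that observation by a factor alpha.
### Assumes 'neurons' is a numeric matrix laid out on the grid following the
### rowix/coordinate conventions above, and 'x.obs' is one observation as a numeric vector.
som.step.sketch <- function(neurons,x.obs,xdim,alpha,nsize)
{
# grid coordinates of every neuron
grid.x <- (seq_len(nrow(neurons))-1) %% xdim + 1
grid.y <- (seq_len(nrow(neurons))-1) %/% xdim + 1
# best matching unit: the neuron closest to the observation (Euclidean distance)
diffs <- sweep(neurons,2,x.obs)
bmu <- which.min(rowSums(diffs*diffs))
# neurons within the neighborhood radius (chessboard distance on the grid)
hood <- pmax(abs(grid.x-grid.x[bmu]),abs(grid.y-grid.y[bmu])) <= nsize
# standard SOM update: w <- w + alpha*(x - w) for all neurons in the neighborhood
neurons[hood,] <- neurons[hood,] - alpha*diffs[hood,]
neurons
}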
### Functions to combine connected components that represent the same cluster
### compute.combined.clusters -- top level for the distance matrix oriented cluster combine
# parameters:
# - map is an object of type 'map'
# - heat is a unified distance matrix
# - explicit controls the shape of the connected components
# - range is the merge fraction passed down from map.starburst (merge.range)
compute.combined.clusters <- function(map,heat,explicit,range) {
# compute the connected components
centroids <- compute.centroids(map,heat,explicit)
#Get unique centroids
unique.centroids <- get.unique.centroids(map, centroids)
#Get distance from centroid to cluster elements for all centroids
within_cluster_dist <- distance.from.centroids(map, centroids, unique.centroids, heat)
#Get average pairwise distance between clusters
between_cluster_dist <- distance.between.clusters(map, centroids, unique.centroids, heat)
#Get a boolean matrix of whether two components should be combined
combine_cluster_bools <- combine.decision(within_cluster_dist, between_cluster_dist, range)
#Create the modified connected components grid
new_centroid <- new.centroid(combine_cluster_bools, centroids, unique.centroids, map)
new_centroid
}
### get.unique.centroids -- a function that computes a list of unique centroids from
# a matrix of centroid locations.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
get.unique.centroids <- function(map, centroids){
# get the dimensions of the map
xdim <- map$xdim
ydim <- map$ydim
xlist <- c()
ylist <- c()
x.centroid <- centroids$centroid.x
y.centroid <- centroids$centroid.y
for(ix in 1:xdim){
for(iy in 1:ydim) {
cx <- x.centroid[ix, iy]
cy <- y.centroid[ix, iy]
# check whether the (cx,cy) centroid pair has already been recorded;
# if not, append both coordinates to the respective lists
if(!any(xlist == cx & ylist == cy)){
xlist <- c(xlist, cx)
ylist <- c(ylist, cy)
}
}
}
# return a list of unique centroid positions
list(position.x=xlist, position.y=ylist)
}
### distance.from.centroids -- A function to get the average distance from
# centroid by cluster.
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - heat is a unified distance matrix
distance.from.centroids <- function(map, centroids, unique.centroids, heat){
xdim <- map$xdim
ydim <- map$ydim
centroids.x.positions <- unique.centroids$position.x
centroids.y.positions <- unique.centroids$position.y
within <- c()
for (i in 1:length(centroids.x.positions)){
cx <- centroids.x.positions[i]
cy <- centroids.y.positions[i]
# compute the average distance
distance <- cluster.spread(cx, cy, heat, centroids, map)
# append the computed distance to the list of distances
within <- c(within, distance)
}
# return the list
within
}
### cluster.spread -- Function to calculate the average distance in
# one cluster given one centroid.
#
# parameters:
# - x - x position of a unique centroid
# - y - y position of a unique centroid
# - umat - a unified distance matrix
# - centroids - a matrix of the centroid locations in the map
# - map is an object of type 'map'
cluster.spread <- function(x, y, umat, centroids, map){
centroid.x <- x
centroid.y <- y
sum <- 0
elements <- 0
xdim <- map$xdim
ydim <- map$ydim
centroid_weight <- umat[centroid.x, centroid.y]
for(xi in 1:xdim){
for(yi in 1:ydim){
cx <- centroids$centroid.x[xi, yi]
cy <- centroids$centroid.y[xi, yi]
if(cx == centroid.x && cy == centroid.y){
cweight <- umat[xi,yi]
sum <- sum + abs(cweight - centroid_weight)
elements <- elements + 1
}
}
}
average <- sum / elements
average
}
### distance.between.clusters -- A function to compute the average pairwise
# distance between clusters.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - umat - a unified distance matrix
distance.between.clusters <- function(map, centroids, unique.centroids, umat){
cluster_elements <- list.clusters(map, centroids, unique.centroids, umat)
cluster_elements <- sapply(cluster_elements,'[',seq(max(sapply(cluster_elements,length))))
columns <- ncol(cluster_elements)
cluster_elements <- matrix(unlist(cluster_elements),ncol = ncol(cluster_elements),byrow = FALSE)
cluster_elements <- apply(combn(ncol(cluster_elements), 2), 2, function(x)
abs(cluster_elements[, x[1]] - cluster_elements[, x[2]]))
mean <- colMeans(cluster_elements, na.rm=TRUE)
index <- 1
mat <- matrix(data=NA, nrow=columns, ncol=columns)
for(xi in 1:(columns-1)){
for (yi in xi:(columns-1)){
mat[xi, yi + 1] <- mean[index]
mat[yi + 1, xi] <- mean[index]
index <- index + 1
}
}
mat
}
### list.clusters -- A function to get the clusters as a list of lists.
#
# parameters:
# - map is an object of type 'map'
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - umat - a unified distance matrix
list.clusters <- function(map,centroids,unique.centroids,umat){
centroids.x.positions <- unique.centroids$position.x
centroids.y.positions <- unique.centroids$position.y
componentx <- centroids$centroid.x
componenty <- centroids$centroid.y
cluster_list <- list()
for(i in 1:length(centroids.x.positions)){
cx <- centroids.x.positions[i]
cy <- centroids.y.positions[i]
# get the clusters associated with a unique centroid and store it in a list
cluster_list[i] <- list.from.centroid(map, cx, cy, centroids, umat)
}
cluster_list
}
### list.from.centroid -- A function to get all cluster elements
# associated with one centroid.
#
# parameters:
# - map is an object of type 'map'
# - x - the x position of a centroid
# - y - the y position of a centroid
# - centroids - a matrix of the centroid locations in the map
# - umat - a unified distance matrix
list.from.centroid <- function(map,x,y,centroids,umat){
centroid.x <- x
centroid.y <- y
sum <- 0
xdim <- map$xdim
ydim <- map$ydim
centroid_weight <- umat[centroid.x, centroid.y]
cluster_list <- c()
for(xi in 1:xdim){
for(yi in 1:ydim){
cx <- centroids$centroid.x[xi, yi]
cy <- centroids$centroid.y[xi, yi]
if(cx == centroid.x && cy == centroid.y){
cweight <- umat[xi, yi]
cluster_list <- c(cluster_list, cweight)
}
}
}
list(cluster_list)
}
### combine.decision -- A function that produces a boolean matrix
# representing which clusters should be combined.
#
# parameters:
# - within_cluster_dist - A list of the distances from centroid to cluster elements for all centroids
# - distance_between_clusters - A list of the average pairwise distance between clusters
# - range - the merge fraction: two clusters are merged when their average pairwise
#   distance is less than (1 + range) times either cluster's within-cluster distance
combine.decision <- function(within_cluster_dist,distance_between_clusters,range){
inter_cluster <- distance_between_clusters
centroid_dist <- within_cluster_dist
dim <- dim(inter_cluster)[1]
to_combine <- matrix(data=FALSE, nrow=dim, ncol=dim)
for(xi in 1:dim){
for(yi in 1:dim){
cdist <- inter_cluster[xi,yi]
if(!is.na(cdist)){
rx <- centroid_dist[xi] * range
ry <- centroid_dist[yi] * range
if(cdist < (centroid_dist[xi] + rx) || cdist < (centroid_dist[yi] + ry)){
to_combine[xi, yi] <- TRUE
}
}
}
}
to_combine
}
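# Worked example of the merge criterion above: with range=.25, a cluster whose elements
# lie on average 0.2 umat-units from its centroid is marked for merging with any cluster
# whose average pairwise distance to it is below 0.2*(1+.25) = 0.25.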
### swap.centroids -- A function that changes every instance of a centroid to
# one that it should be combined with.
# parameters:
# - map is an object of type 'map'
# - x1, y1 - the map coordinates of the centroid to be replaced
# - x2, y2 - the map coordinates of the centroid that replaces it
# - centroids - a matrix of the centroid locations in the map
swap.centroids <- function(map, x1, y1, x2, y2, centroids){
xdim <- map$xdim
ydim <- map$ydim
compn_x <- centroids$centroid.x
compn_y <- centroids$centroid.y
for(xi in 1:xdim){
for(yi in 1:ydim){
if(compn_x[xi,yi] == x1 && compn_y[xi,yi] == y1){
compn_x[xi,yi] <- x2
compn_y[xi,yi] <- y2
}
}
}
list(centroid.x=compn_x, centroid.y=compn_y)
}
### new.centroid -- A function to combine centroids based on matrix of booleans.
#
# parameters:
#
# - bmat - a boolean matrix containing the centroids to merge
# - centroids - a matrix of the centroid locations in the map
# - unique.centroids - a list of unique centroid locations
# - map is an object of type 'map'
new.centroid <- function(bmat, centroids, unique.centroids, map){
bmat.rows <- dim(bmat)[1]
bmat.columns <- dim(bmat)[2]
centroids_x <- unique.centroids$position.x
centroids_y <- unique.centroids$position.y
components <- centroids
for(xi in 1:bmat.rows){
for(yi in 1:bmat.columns){
if(bmat[xi,yi] == TRUE){
x1 <- centroids_x[xi]
y1 <- centroids_y[xi]
x2 <- centroids_x[yi]
y2 <- centroids_y[yi]
components <- swap.centroids(map, x1, y1, x2, y2, components)
}
}
}
components
}
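### avg.homogeneity -- compute the average label homogeneity of the starburst clusters
# parameters:
# - map is an object of type 'map' with attached labels
# - explicit controls the shape of the connected components
# - smoothing controls the smoothing level of the umat (NULL,0,>0)
# - merge/merge.range - see map.starburst
# return value:
# - a list with the average homogeneity (homog) and the number of clusters (nclust)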
avg.homogeneity <- function(map,explicit=FALSE,smoothing=2,merge=FALSE,merge.range=.25)
{
### keep an unaltered copy of the unified distance matrix,
### required for merging the starburst clusters
umat <- compute.umat(map,smoothing=smoothing)
x <- map$xdim
y <- map$ydim
nobs <- nrow(map$data)
centroid.labels <- array(data=list(),dim=c(x,y))
### need to make sure the map doesn't have a dimension of 1
if (x <= 1 || y <= 1)
{
stop("avg.homogeneity: map dimensions too small")
}
if (is.null(map$labels))
{
stop("avg.homogeneity: you need to attach labels to the map")
}
if(!merge)
{
# find the centroid for each neuron on the map
centroids <- compute.centroids(map,umat,explicit)
}
else
{
# find the unique centroids for the neurons on the map
centroids <- compute.combined.clusters(map,umat,explicit,merge.range)
}
### attach labels to centroids
# count the labels in each map cell
for(i in 1:nobs)
{
lab <- as.character(map$labels[i,1])
nix <- map$visual[i]
c <- coordinate(map,nix)
ix <- c[1]
iy <- c[2]
cx <- centroids$centroid.x[ix,iy]
cy <- centroids$centroid.y[ix,iy]
centroid.labels[[cx,cy]] <- append(centroid.labels[[cx,cy]],lab)
}
### compute average homogeneity of the map: h = (1/nobs)*sum_c majority.label_c
sum.majority <- 0
n.centroids <- 0
for (ix in 1:x)
{
for (iy in 1:y)
{
label.v <- centroid.labels[[ix,iy]]
if (length(label.v)!=0)
{
n.centroids <- n.centroids + 1
majority <- data.frame(sort(table(label.v),decreasing=TRUE))
if (nrow(majority) == 1) # only one label
{
m.val <- length(label.v)
}
else
{
m.val <- majority[1,2]
}
sum.majority <- sum.majority + m.val
}
}
}
list(homog=sum.majority/nobs, nclust=n.centroids)
}