library('sets')
#' Implementation of the mapper algorithm described in:
#'
#' G. Singh, F. Memoli, G. Carlsson (2007). Topological Methods for the Analysis of High Dimensional Data Sets and 3D Object Recognition, Point Based Graphics 2007, Prague, September 2007.
#' @author Felipe González-Casabianca
#' mapperkD function
#'
#' NOTES:
#' 1. Uses closed inetervals over the filter space
#'
#' This function uses a filter function f: X -> R^k on a data set X that has n rows (observations) and m columns (variables).
#' @param k a positive integer indicating the number of dimensions of the filter space
#' @param distance an n x n matrix of pairwise distances or dissimilarities. If the parameter \code{local_distance} is set to \code{TRUE}, this parameter is ignored since the clustering algorithm will receive a subset of the data.
#' @param filter an n x k matrix of the filter space values, where n corresponds to the number of observations and k to the filter space dimension.
#' @param intervals a vector of k positive integers, the number of intervals for each correspong dimension of the filter space.
#' @param overlap a vector of k numbers between 0 and 100, specifying how much adjacent intervals should overlap for each dimension of the filter space.
#' @param clustering_method clustering method to be used for each pre-image. If the parameter \code{local_distance} is set to \code{TRUE}, the given funtion must receive as a parameter a distance matrix. If the parameter \code{local_distance} is set to \code{FALSE}, the given funtion must receive as a parameter a data.frame. In any case, it mus return an array with the corresiponding number cluster for each of the given points. Clusters numbers must start with 1 and have no gaps between them. Default is: hierarchical_clustering
#' @param local_distance a boolean indicating if the algorithm should construct the distance function based on the data at every pre-image. Usefull for low RAM enviorments or specific clustering. Default value is \code{FALSE}
#' @param data a data frame containing the information necessary to excecute the clustering procedure. This parameter will only be used if \code{local_distance} is set to \code{TRUE}.
#'
#' @return An object of class \code{one_squeleton} which is composed of the following items:
#'
#' \code{adjacency_matrix} (adjacency matrix for the nodes),
#' \code{degree_matrix} (degree matrix for the nodes (the number of points in the intersection of nodes)),
#' \code{num_nodes} (integer number of vertices),
#' \code{points_in_nodes} (list with the points inside each node),
#' \code{nodes_per_interval} (list of lists, accesible through the vector id of the interval that contains the nodes that where constructucted inside of it)
#' \code{points_per_interval} (list of lists, accesible through the vector id of the interval that contains the points that are contained inside of it)
#'
#' @examples
#' library('mapperKD')
#' # Example
#' # ----------------
#' # Construct the data set
#' data_points = data.frame( x=c(cos(1:50) - 1, cos(1:50) + 1), y=sin(1:100) )
#' plot(data_points)
#' # Executes mapper
#' one_squeleton_result = mapperKD(k = 1,
#' distance = as.matrix(dist(data_points)),
#' filter = data_points$x,
#' intervals = c(12),
#' overlap = c(50),
#' clustering_method = hierarchical_clustering,
#' local_distance = FALSE,
#' data = NA)
#' # Visualize the result
#' g = convert_to_graph(one_squeleton_result)
#' V(g)$size = sqrt(get_1_esqueleton_node_sizes(one_squeleton_result)*30)
#' splot(g)
#'
#' @export
#'
#' @export
#'
mapperKD = function(k,
distance,
filter,
intervals,
overlap,
clustering_method = hierarchical_clustering,
local_distance = FALSE,
data = NA)
{
# Parameter consistency check
# --------------------
# Filter
if(is.null(filter) || is.na(filter))
stop('Filter cannot be NULL')
if(!is.matrix(filter))
{
#If supplied filter space is a one dimensional vector, it's transformed into a n x 1 matrix
if(k == 1)
filter = matrix(filter, nrow = length(filter), ncol = 1)
else
stop('The supplied filter parameter must by an n x k matrix')
}
if(dim(filter)[2] != k)
stop(paste('Expected a matrix with',k,'columns for the filter parameter but got a matrix with',dim(filter)[2],'columns instead'))
if(dim(filter)[1] == 1)
stop('Sample needs at least two points to work.')
# Intervals
if(is.null(intervals) || is.na(intervals))
stop('Intervals cannot be NULL')
if(k != 1 && length(intervals) == 1)
{
print('The vector of intervals has dimension 1. Assuming its value for all dimensions of the filter space')
intervals = rep(intervals, k)
}
if(length(intervals) != k )
stop(paste('Expected a',k,'dimensional vector for the intervals parameter but got a',length(intervals),'dimensional vector instead'))
if(any(intervals <= 0))
stop('All interval values must be positive integers')
# Overlap
if(is.null(overlap) || is.na(overlap))
stop('Overlap cannot be NULL')
if(k != 1 && length(overlap) == 1)
{
print('The vector of overlap has dimension 1. Assuming its value for all dimensions of the filter space')
overlap = rep(overlap, k)
}
if(length(overlap) != k )
stop(paste('Expected a',k,'dimensional vector for the overlap parameter but got a',length(intervals),'dimensional vector instead'))
if(any(intervals < 0) || any(intervals > 100))
stop('All interval values must have values between 0 and a 100.')
# Distance
if(is.null(distance) || is.na(distance))
stop('Distance cannot be NULL')
if(local_distance && missing(data))
stop('local_distance is set to TRUE, so the data parameter must be supplied.')
# Sets up the iteration process through intervals and overlap values
# ----------------
# Minimum and maximum filter values
if(k == 1)
{
filter_min = min(filter)
filter_max = max(filter)
}
else
{
filter_min = apply(filter,2, min)
filter_max = apply(filter,2, max)
}
# Finds overlaped window size
window_size = (filter_max - filter_min)/(intervals - (intervals - 1)*overlap/100)
# Step size
step_size = window_size*(1 - overlap/100)
# Initializes the output parameters
# Initializes the nodes_per_interval variable
# A list of list of the number of dimensions in the filter space
nodes_per_interval = sapply(1:intervals[length(intervals)], function(x) c())
for(num_inte in rev(intervals)[-1])
nodes_per_interval = sapply(1:num_inte, function(x) list(nodes_per_interval))
# Initializes the point_per_interval variable
# A list of list of the number of dimensions in the filter space
points_per_interval = sapply(1:intervals[length(intervals)], function(x) c())
for(num_inte in rev(intervals)[-1])
points_per_interval = sapply(1:num_inte, function(x) list(points_per_interval))
# Starts the elements_in_node variable
elements_in_node = list()
# Starts looping throught the different intervals
# ------------------
# Iteration is done advancing a k vector of coordinates for each of the intervals
# Coordinates range in each dimension from 1 to the number of intervals
coordinates = rep(1,k)
# Loop ends when all coordinates get to the last value (the number of intervals)
repeat
{
# Sets up the current interval
# ------------------------
# Extracts the interval's current max and min values
interval_min = filter_min + (coordinates - 1)*step_size
interval_max = interval_min + window_size
# For of numeric precision, the max filter value for the las intervals in each dimension are forced to be the filter's max value on the correponding dimension
interval_max[coordinates == intervals] = filter_max[coordinates == intervals]
# Gets the indices of the samples that fall on the current interval
# NOTE: this implemenatation uses closed intervals
# Currently uses the sweep function to build two matrices for the min and max
above_matrix = sign(sweep(filter, 2, interval_min, FUN = ">="))
below_matrix = sign(sweep(filter, 2, interval_max, FUN = "<="))
# Elementwise and operation
inside_matrix = above_matrix*below_matrix
# Interval indices
logical_interval_indices = rowSums(inside_matrix) == k
interval_indices = which(logical_interval_indices)
# Excecutes the clustering scheme (if there is more than one element in the interval)
# ------------------------
if(length(interval_indices) > 0)
{
if(length(interval_indices) == 1) # Single point
{
clusters_of_interval = 1
}
else if(!local_distance) # Global distance
{
# -Extracts the current matrix
current_matrix = distance[interval_indices,interval_indices]
# Excecutes the clustering algorithm
clusters_of_interval = clustering_method(current_matrix)
}
else # Local distance
{
# Filters the data
current_data = data[interval_indices,]
# Excecutes the clustering algorithm
clusters_of_interval = clustering_method(current_data)
}
# Constructs the nodes and adds them to the nodes_per_interval array
# Adjust the indices of the clusters to correpond with the nodes
clusters_of_interval = clusters_of_interval + length(elements_in_node)
# Adds them to the nodes_per_interval variable
nodes_per_interval[[coordinates]] = unique(clusters_of_interval)
# Adds the indices to the elements_in_node variable
for(node in clusters_of_interval)
elements_in_node[[node]] = interval_indices[clusters_of_interval == node]
# Adds all the points to the npoint_per_interval
points_per_interval[[coordinates]] = interval_indices
}
# Stop criteria
# Finished the last interval
if(all(coordinates == intervals))
break
# Advances the coordinates
# ------------------------
coordinates = advance_coordinates(coordinates, intervals)
}# Finish interval loop
num_nodes = length(elements_in_node)
# Builds adjacency and degree matrix
# ------------------------
# Iterates over all intervals again and checks only intersection with possible intervals
# Starts the degree matrix
degree_matrix = matrix(0, nrow = num_nodes, ncol = num_nodes)
# Starts the coordinates
coordinates = rep(1,k)
# Constructs the search possibilities. Only checks intersection with further intervals (Adjacency matrix is symmetric)
# Extracts the step grid
step_grid = construct_step_grid(filter_min, filter_max, intervals, overlap)
# Converts all the node lists to sets for efficiency
elements_in_node_as_set = lapply(elements_in_node, as.set)
# Loop ends when all coordinates get to the last value (the number of intervals)
while(any(coordinates != intervals))
{
# Gets current nodes
current_nodes = nodes_per_interval[[coordinates]]
# If empty, continunes
if(length(current_nodes) == 0)
{
coordinates = advance_coordinates(coordinates, intervals)
next
}
# Calculates possible coordinates
possible_coordinates = sweep(step_grid, 2, coordinates, "+")
possible_indices = rowSums(sweep(possible_coordinates, 2, intervals, FUN = "<=")) == k
possible_coordinates = matrix(possible_coordinates[possible_indices,], nrow = sum(possible_indices), ncol = k)
# If empty, continunes
if(length(possible_coordinates) == 0)
{
coordinates = advance_coordinates(coordinates, intervals)
next
}
# Iterates over the possible adjacent intervals
for(i in 1:nrow(possible_coordinates))
{
# Gets neighbor interval
neighbor_coordinates = possible_coordinates[i,]
# Continues if outside interval grid
#if(any(neighbor_coordinates > intervals))
# next
neighbor_nodes = nodes_per_interval[[neighbor_coordinates]]
# If empty, continunes
if(length(neighbor_nodes) == 0)
next
for(v in current_nodes)
{
for(l in neighbor_nodes)
{
degree_matrix[l,v] = length(set_intersection(elements_in_node_as_set[[v]],elements_in_node_as_set[[l]]))
degree_matrix[v,l] = degree_matrix[l,v]
}
}
}
# Advances the coordinates
# ------------------------
coordinates = advance_coordinates(coordinates, intervals)
}# Finish adjacency and degrre loop
# Adjacency matrix is simply the sign of the degree matrix
adjacency_matrix = sign(degree_matrix)
# Constructs the response
# Object of type 1_squeleton
response = list()
response$adjacency_matrix = sign(degree_matrix)
response$degree_matrix = degree_matrix
response$num_nodes = num_nodes
response$points_in_nodes = elements_in_node
response$nodes_per_interval = nodes_per_interval
response$points_per_interval = points_per_interval
#Return
return(response)
}
#' Function that calculates the forward grid search space.
#' This method should return all the values that should be added to interval to obtain all the possible places
#' where an intersection might occur. This functionality is isolated inside a method to test it individually.
construct_step_grid = function(filter_min, filter_max, intervals, overlap)
{
# Constructs the search possibilities. Only checks intersection with further intervals (Adjacency matrix is symmetric)
# Finds overlaped window size
window_size = (filter_max - filter_min)/(intervals - (intervals - 1)*overlap/100)
# Step size
step_size = window_size*(1 - overlap/100)
# Depending on overlap, non adjacent intersection is possible
max_search_posibilities = ceiling(1/(1 - overlap/100)) - 1
max_search_posibilities = pmin(max_search_posibilities, intervals) # Should not check further than the total amount of intervals
# Creates the search possibilities
search_possibilities = lapply(1:length(intervals), function(i){0:max_search_posibilities[i]})
# The steps grid (structure to detect levels where possible intersection might occur)
step_grid = as.matrix(expand.grid(search_possibilities))
colnames(step_grid) <- NULL
# Removes itself from the possible steps
step_grid = as.matrix(step_grid[rowSums(step_grid) > 0,])
return(step_grid)
}
#' Function for advancing coordinates
advance_coordinates = function(coordinates, max_values)
{
current_pos = 1
carry = TRUE
k = length(max_values)
while(carry && current_pos <= k)
{
coordinates[current_pos] = coordinates[current_pos] + 1
if(coordinates[current_pos] > max_values[current_pos])
{
coordinates[current_pos] = 1
carry = TRUE
current_pos = current_pos + 1
}
else
carry = FALSE
}
return(coordinates)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.