R/baskin_robbins.R

Defines functions create_clusterball_mapper_object create_ball_mapper_object convert_to_clusters create_1D_mapper_object

Documented in create_1D_mapper_object create_ball_mapper_object create_clusterball_mapper_object

###########################################################################
# BASKIN-ROBBINS
# Mapper flavors
###########################################################################


# 1D Mapper ---------------------------------------------------------------
#
# a flavor of Mapper based on projection to a single coordinate

#' One-Dimensional Mapper
#'
#' Run Mapper using a one-dimensional filter, a cover of the codomain of intervals, and a clusterer.
#'
#' @param data A data frame.
#' @param dists A distance matrix associated to the data frame. Can be a `dist` object or `matrix`. The names of the rows of the data points in the distance matrix need to match the names of the data points in `data`.
#' @param lens The result of a function applied to the rows of `data` (a `matrix`, `data.frame`, `list`, or `vector`), or a function which accepts a data point and outputs a result. If the former, there should be one value per observation in the original data frame, and, if the values are not named, they should be in the same order as their inputs in the original data frame.
#' @param cover An \eqn{n \times 2} `matrix` of interval left and right endpoints; rows should be intervals and columns left and right endpoints (in that order).
#' @param clusterer A function which accepts a list of distance matrices as input, and returns the results of clustering done on each distance matrix;
#' that is, it should return a list of named vectors, whose name are the names of data points and whose values are cluster assignments (integers).
#' If this value is omitted, then trivial clustering will be done.
#'
#' @return A `list` of two data frames, `nodes` and `edges`, which contain information about the Mapper graph constructed from the given parameters.
#'
#' The node data frame consists of:
#'
#' - `id`: vertex ID
#' - `cluster_size`: number of data points in cluster
#' - `medoid`: the name of the medoid of the vertex
#' - `mean_dist_to_medoid`: mean distance to medoid of cluster
#' - `max_dist_to_medoid`: max distance to medoid of cluster
#' - `cluster_width`: maximum pairwise distance within cluster
#' - `wcss`: sum of squares of distances to cluster medoid
#' - `data`: names of data points in cluster
#' - `patch`: level set ID
#'
#' The `edge` data frame contains consists of:
#'
#' - `source`: vertex ID of edge source
#' - `target`: vertex ID of edge target
#' - `weight`: Jaccard index of edge; this is the size of the intersection between the vertices divided by the union
#' - `overlap_data`: names of data points in overlap
#' - `overlap_size`: number of data points overlap
#'
#' @export
#' @examples
#' # Create noisy circle data
#' data = data.frame(x = sapply(1:1000, function(x) cos(x)) + runif(1000, 0, .25),
#'  y = sapply(1:1000, function(x) sin(x)) + runif(1000, 0, .25))
#'
#' # Project to horizontal axis as lens
#' projx = data$x
#'
#' # Create a one-dimensional cover
#' num_bins = 5
#' percent_overlap = 25
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' # Build Mapper object
#' create_1D_mapper_object(data, dist(data), projx, cover)
create_1D_mapper_object <- function(data,
                                    dists,
                                    lens,
                                    cover,
                                    clusterer = global_hierarchical_clusterer("single", dists)) {
  if (!all(cover[, 1] - cover[, 2] <= 0)) {
    stop("Left endpoints in the cover must be less than or equal to right endpoints.")
  }

  cover = apply(cover, 1, check_in_interval)

  return(create_mapper_object(data, dists, lens, cover, clusterer = clusterer))
}

# Ball Mapper --------------------------------------------------------------
#
# a flavor of Mapper all about the balls

#' Convert Balls to Clusters
#'
#' Perform trivial clustering on a set of balled data.
#'
#' @param bins A `list` of bins, each containing a named vector of data points.
#'
#' @return A named vector whose names are data point names and whose values are cluster labels (`integer`s).
#' @noRd
convert_to_clusters <- function(bins) {
  ball_sizes = lapply(bins, length)

  # repeat the cluster id for as many data points belonging to that bin
  ballball_data = unlist(mapply(function(x, y)
    rep(x, y), 1:length(ball_sizes), ball_sizes))

  # make sure names match up
  names(ballball_data) = unlist(bins)

  return(ballball_data)
}

#' Ball Mapper
#'
#' Run Mapper using the identity function as a lens and an \eqn{\varepsilon}-net cover, greedily generated using a distance matrix.
#'
#' @param data A data frame.
#' @param dists A distance matrix for the data frame. Can be a `dist` object or a `matrix`.
#' @param eps A positive real number for the desired ball radius.
#' @return A `list` of two data frames, `nodes` and `edges`, which contain information about the Mapper graph constructed from the given parameters.
#'
#' The node data frame consists of:
#'
#' - `id`: vertex ID
#' - `cluster_size`: number of data points in cluster
#' - `medoid`: the name of the medoid of the vertex
#' - `mean_dist_to_medoid`: mean distance to medoid of cluster
#' - `max_dist_to_medoid`: max distance to medoid of cluster
#' - `cluster_width`: maximum pairwise distance within cluster
#' - `wcss`: sum of squares of distances to cluster medoid
#' - `data`: names of data points in cluster
#'
#' The `edge` data frame contains consists of:
#'
#' - `source`: vertex ID of edge source
#' - `target`: vertex ID of edge target
#' - `weight`: Jaccard index of edge; this is the size of the intersection between the vertices divided by the union
#' - `overlap_data`: names of data points in overlap
#' - `overlap_size`: number of data points overlap
#'
#' @export
#' @examples
#' # Create noisy cirle data set
#' data = data.frame(x = sapply(1:1000, function(x) cos(x)) + runif(1000, 0, .25),
#' y = sapply(1:1000, function(x) sin(x)) + runif(1000, 0, .25))
#'
#' # Set ball radius
#' eps = .25
#'
#' # Create Mapper object
#' create_ball_mapper_object(data, dist(data), eps)
create_ball_mapper_object <- function(data, dists, eps) {
  if (!is.data.frame(data)) {
    stop("Input data needs to be a data frame.")
  } else if (any(is.na(data))) {
    stop("Data cannot have NA values.")
  } else if (any(is.na(dists))) {
    stop("No distance value can be NA.")
  } else if (!is.numeric(eps)) {
    stop("Epsilon parameter needs to be numeric.")
  } else if (eps <= 0) {
    stop("Epsilon parameter needs to be positive.")
  }

  if (nrow(data) != dim(as.matrix(dists))[1]) {
    stop("Your distance matrix dimensions are not correct for your data.")
  } else if (dim(as.matrix(dists))[1] != dim(as.matrix(dists))[2]) {
    stop("Your distance matrix is not square!")
  } else if (any(!is.numeric(dists))) {
    stop("Your distance matrix has non-numeric entries!")
  }

  if (length(data) == 0) {
    stop("Your data is missing!")
  } else if (length(dists) == 0) {
    stop("Your distance matrix is missing!")
  }

  if (length(setdiff(union(row.names(as.matrix(dists)), row.names(data)), intersect(row.names(as.matrix(dists)), row.names(data)))) != 0) {
    stop("Names of points in distance matrix need to match names in data frame!")
  }

  balls = create_balls(data, dists, eps)

  projection = row.names(data)

  return(create_mapper_object(
    data,
    dists,
    projection,
    lapply(balls, is_in_ball)
  ))
}


# clusterball Mapper ------------------------------------------------------
#
# a flavor of Mapper that's just clustering in the balls of ball Mapper

#' ClusterBall Mapper
#'
#' Run Ball Mapper, but non-trivially cluster within the balls. You can use two different distance matrices to for the balling and clustering.
#'
#' @param data A data frame.
#' @param dist1 A distance matrix for the data frame; this will be used to ball the data. It can be a `dist` object or a `matrix`.
#' @param dist2 Another distance matrix for the data frame; this will be used to cluster the data after balling. It can be a `dist` object or a `matrix`.
#' @param eps A positive real number for the desired ball radius.
#' @param clusterer A function which accepts a list of distance matrices as input, and returns the results of clustering done on each distance matrix;
#' that is, it should return a list of named vectors, whose name are the names of data points and whose values are cluster assignments (integers).
#' If this value is omitted, then single-linkage clustering will be done (and cutting heights will be decided for you).
#' @return A `list` of two data frames, `nodes` and `edges`, which contain information about the Mapper graph constructed from the given parameters.
#'
#' The node data frame consists of:
#'
#' - `id`: vertex ID
#' - `cluster_size`: number of data points in cluster
#' - `medoid`: the name of the medoid of the vertex
#' - `mean_dist_to_medoid`: mean distance to medoid of cluster
#' - `max_dist_to_medoid`: max distance to medoid of cluster
#' - `cluster_width`: maximum pairwise distance within cluster
#' - `wcss`: sum of squares of distances to cluster medoid
#' - `data`: names of data points in cluster
#' - `patch`: level set ID
#'
#' The `edge` data frame contains consists of:
#'
#' - `source`: vertex ID of edge source
#' - `target`: vertex ID of edge target
#' - `weight`: Jaccard index of edge; this is the size of the intersection between the vertices divided by the union
#' - `overlap_data`: names of data points in overlap
#' - `overlap_size`: number of data points overlap
#'
#' @export
#' @examples
#' # Create noisy circle data set
#' data = data.frame(x = sapply(1:1000, function(x) cos(x)) + runif(1000, 0, .25),
#' y = sapply(1:1000, function(x) sin(x)) + runif(1000, 0, .25))
#' data.dists = dist(data)
#'
#' # Set ball radius
#' eps = 1
#'
#' # Do single-linkage clustering in the balls to produce Mapper graph
#' create_clusterball_mapper_object(data, data.dists, data.dists, eps)
create_clusterball_mapper_object <- function(data, dist1, dist2, eps, clusterer = local_hierarchical_clusterer("single")) {
  if (!is.data.frame(data)) {
    stop("Input data needs to be a data frame.")
  } else if (any(is.na(data))) {
    stop("Data cannot have NA values.")
  } else if (any(is.na(dist1)) | any(is.na(dist2))) {
    stop("No distance value can be NA.")
  } else if (!is.numeric(eps)) {
    stop("Epsilon parameter needs to be numeric.")
  } else if (eps <= 0) {
    stop("Epsilon parameter needs to be positive.")
  }

  if (nrow(data) != dim(as.matrix(dist1))[1] | nrow(data) != dim(as.matrix(dist2))[1]) {
    stop("Your distance matrix dimensions are not correct for your data.")
  } else if (dim(as.matrix(dist1))[1] != dim(as.matrix(dist1))[2] | dim(as.matrix(dist2))[1] != dim(as.matrix(dist2))[2]) {
    stop("Your distance matrices are not square!")
  } else if (any(!is.numeric(dist1)) | any(!is.numeric(dist2))) {
    stop("Your distance matrices have non-numeric entries!")
  }

  if (length(data) == 0) {
    stop("Your data is missing!")
  } else if (length(dist1) == 0 | length(dist2) == 0) {
    stop("Your distance matrix is missing!")
  }

  if (length(setdiff(union(row.names(as.matrix(dist1)), row.names(data)), intersect(row.names(as.matrix(dist1)), row.names(data)))) != 0 | length(setdiff(union(row.names(as.matrix(dist2)), row.names(data)), intersect(row.names(as.matrix(dist2)), row.names(data)))) != 0) {
    stop("Names of points in distance matrices need to match names in data frame!")
  }

  balls = create_balls(data, dist1, eps)

  projection = row.names(data)

  return(create_mapper_object(
    data,
    dist2,
    projection,
    lapply(balls, is_in_ball),
    clusterer = clusterer
  ))
}

Try the mappeR package in your browser

Any scripts or data that you put into this service are public.

mappeR documentation built on Jan. 27, 2026, 9:06 a.m.