Nothing
# Copyright (c) 2020-2023 Nuno Fachada
# Distributed under the MIT License (http://opensource.org/licenses/MIT)
#' Generate multidimensional clusters
#'
#' @description
#' \loadmathjax
#' This is the main function of **clugenr**, and possibly the only function most
#' users will need.
#'
#' @details
#' If a custom function was given in the `clusizes_fn` parameter, it is
#' possible that `num_points` may have a different value than what was
#' specified in the `num_points` parameter.
#'
#' The terms "average" and "dispersion" refer to measures of central
#' tendency and statistical dispersion, respectively. Their exact meaning
#' depends on the optional arguments.
#'
#' @note This function is stochastic. For reproducibility set a PRNG seed with
#' [set.seed].
#'
#' @param num_dims Number of dimensions.
#' @param num_clusters Number of clusters to generate.
#' @param num_points Total number of points to generate.
#' @param direction Average direction of the cluster-supporting lines. Can be
#' a vector of length `num_dims` (same direction for all clusters) or a
#' matrix of size `num_clusters` x `num_dims` (one direction per cluster).
#' @param angle_disp Angle dispersion of cluster-supporting lines (radians).
#' @param cluster_sep Average cluster separation in each dimension (vector of
#' length `num_dims`).
#' @param llength Average length of cluster-supporting lines.
#' @param llength_disp Length dispersion of cluster-supporting lines.
#' @param lateral_disp Cluster lateral dispersion, i.e., dispersion of points
#' from their projection on the cluster-supporting line.
#' @param allow_empty Allow empty clusters? `FALSE` by default.
#' @param cluster_offset Offset to add to all cluster centers (vector of length
#' `num_dims`). By default there will be no offset.
#' @param proj_dist_fn Distribution of point projections along
#' cluster-supporting lines, with three possible values:
#' - `"norm"` (default): Distribute point projections along lines using a normal
#' distribution (\mjeqn{\mu=}{μ=} _line_center_,
#' \mjeqn{\sigma=}{σ=} `llength/6` ).
#' - `"unif"`: Distribute points uniformly along the line.
#' - User-defined function, which accepts two parameters, line length (`double`)
#' and number of points (`integer`), and returns a vector containing the
#' distance of each point projection to the center of the line. For example,
#' the `"norm"` option roughly corresponds to
#' `function(l, n) stats::rnorm(n, sd = l / 6)`.
#' @param point_dist_fn Controls how the final points are created from their
#' projections on the cluster-supporting lines, with three possible values:
#' - `"n-1"` (default): Final points are placed on a hyperplane orthogonal to
#' the cluster-supporting line, centered at each point's projection, using the
#' normal distribution (\mjeqn{\mu=0}{μ=0},
#' \mjeqn{\sigma=}{σ=} `lateral_disp` ). This is done by the [clupoints_n_1]
#' function.
#' - `"n"`: Final points are placed around their projection on the
#' cluster-supporting line using the normal distribution (\mjeqn{\mu=0}{μ=0},
#' \mjeqn{\sigma=}{σ=} `lateral_disp` ). This is done by the [clupoints_n]
#' function.
#' - User-defined function: The user can specify a custom point placement
#' strategy by passing a function with the same signature as [clupoints_n_1]
#' and [clupoints_n].
#' @param clusizes_fn Distribution of cluster sizes. By default, cluster sizes
#' are determined by the [clusizes] function, which uses the normal distribution
#' (\mjeqn{\mu=}{μ=} `num_points`/`num_clusters`, \mjeqn{\sigma=\mu/3}{σ=μ/3}),
#' and assures that the final cluster sizes add up to `num_points`. This
#' parameter allows the user to specify a custom function for this purpose,
#' which must follow [clusizes] signature. Note that custom functions are not
#' required to strictly obey the `num_points` parameter. Alternatively, the user
#' can specify a vector of cluster sizes directly.
#' @param clucenters_fn Distribution of cluster centers. By default, cluster
#' centers are determined by the [clucenters] function, which uses the uniform
#' distribution, and takes into account the `num_clusters` and `cluster_sep`
#' parameters for generating well-distributed cluster centers. This parameter
#' allows the user to specify a custom function for this purpose, which must
#' follow [clucenters] signature. Alternatively, the user can specify a matrix
#' of size `num_clusters` x `num_dims` with the exact cluster centers.
#' @param llengths_fn Distribution of line lengths. By default, the lengths of
#' cluster-supporting lines are determined by the [llengths] function, which
#' uses the folded normal distribution (\mjeqn{\mu=}{μ=} `llength`,
#' \mjeqn{\sigma=}{σ=} `llength_disp` ). This parameter allows the user to
#' specify a custom function for this purpose, which must follow [llengths]
#' signature. Alternatively, the user can specify a vector of line lengths
#' directly.
#' @param angle_deltas_fn Distribution of line angle differences with respect to
#' `direction`. By default, the angles between the main `direction` of each
#' cluster and the final directions of their cluster-supporting lines are
#' determined by the [angle_deltas] function, which uses the wrapped normal
#' distribution (\mjeqn{\mu=0}{μ=0}, \mjeqn{\sigma=}{σ=} `angle_disp` ) with
#' support in the interval \mjeqn{\left[-\pi/2,\pi/2\right]}{[-π/2, π/2]}. This
#' parameter allows the user to specify a custom function for this purpose,
#' which must follow [angle_deltas] signature. Alternatively, the user can
#' specify a vector of angle deltas directly.
#' @param seed An integer used to initialize the PRNG, allowing for reproducible
#' results. If specified, `seed` is simply passed to [set.seed].
#' @return A named list with the following elements:
#' - `points`: A `num_points` x `num_dims` matrix with the generated points for
#' all clusters.
#' - `clusters`: A `num_points` factor vector indicating which cluster
#' each point in `points` belongs to.
#' - `projections`: A `num_points` x `num_dims` matrix with the point
#' projections on the cluster-supporting lines.
#' - `sizes`: A `num_clusters` x 1 vector with the number of points in
#' each cluster.
#' - `centers`: A `num_clusters` x `num_dims` matrix with the
#' coordinates of the cluster centers.
#' - `directions`: A `num_clusters` x `num_dims` matrix with the final
#' direction of each cluster-supporting line.
#' - `angles`: A `num_clusters` x 1 vector with the angles between the
#' cluster-supporting lines and the main direction.
#' - `lengths`: A `num_clusters` x 1 vector with the lengths of the
#' cluster-supporting lines.
#'
#' @export
#' @examples
#' # 2D example
#' x <- clugen(2, 5, 1000, c(1, 3), 0.5, c(10, 10), 8, 1.5, 2)
#' graphics::plot(x$points, col = x$clusters, xlab = "x", ylab = "y", asp = 1)
#' # 3D example
#' x <- clugen(3, 5, 1000, c(2, 3, 4), 0.5, c(15, 13, 14), 7, 1, 2)
clugen <- function(num_dims, num_clusters, num_points, direction, angle_disp,
cluster_sep, llength, llength_disp, lateral_disp,
allow_empty = FALSE, cluster_offset = NA, proj_dist_fn = "norm",
point_dist_fn = "n-1", clusizes_fn = clusizes,
clucenters_fn = clucenters, llengths_fn = llengths,
angle_deltas_fn = angle_deltas, seed = NA) {
# ############### #
# Validate inputs #
# ############### #
# Check that number of dimensions is > 0
if (num_dims < 1) {
stop("Number of dimensions, `num_dims`, must be > 0")
}
# Check that number of clusters is > 0
if (num_clusters < 1) {
stop("Number of clusters, `num_clust`, must be > 0")
}
# Is direction a vector or a matrix?
if (is.vector(direction)
|| (is.array(direction) && length(dim(direction)) == 1)) {
# If a main direction vector was given, transpose it, so we can treat it
# like a matrix later
direction <- matrix(direction, ncol = length(direction))
} else if (is.matrix(direction)) {
# If a matrix was given (i.e. a main direction is given for each cluster),
# check if the number of directions is the same as the number of clusters
dir_size_1 <- dim(direction)[1]
if (dir_size_1 != num_clusters) {
stop("Number of rows in `direction` must be the same as the number of ",
"clusters (", dir_size_1, " != ", num_clusters, ")")
}
} else {
# The `direction` array must be a vector or a matrix, so if we get here
# it means we have invalid arguments
stop("`direction` must be a vector (1D array) or a matrix (2D array)")
}
# Check that `direction` has num_dims dimensions
dir_size_2 <- dim(direction)[2]
if (dir_size_2 != num_dims) {
stop("Length of directions in `direction` must be equal to `num_dims` (",
dir_size_2, " != ", num_dims, ")")
}
# Check that all directions in `direction` have magnitude > 0
if (isTRUE(all.equal(min(apply(direction, 1, norm, type = "2")), 0))) {
stop("`direction` must have magnitude > 0")
}
# If allow_empty is false, make sure there are enough points to distribute
# by the clusters
if (!allow_empty && num_points < num_clusters) {
stop("A total of ", num_points, " points is not enough for ",
num_clusters, " non-empty clusters")
}
# Check that cluster_sep has num_dims dimensions
if (length(cluster_sep) != num_dims) {
stop("Length of `cluster_sep` must be equal to `num_dims` (",
length(cluster_sep), " != ", num_dims, ")")
}
# If given, cluster_offset must have the correct number of dimensions,
# if not given then it will be a num_dims x 1 vector of zeros
if (length(cluster_offset) == 1 && is.na(cluster_offset)) {
cluster_offset <- vector(mode = "integer", length = num_dims)
} else if (length(cluster_offset) != num_dims) {
stop("Length of `cluster_offset` must be equal to `num_dims` (",
length(cluster_offset), " != ", num_dims, ")")
}
# Check that proj_dist_fn specifies a valid way for projecting points along
# cluster-supporting lines i.e., either "norm" (default), "unif" or a
# user-defined function
if (is.function(proj_dist_fn)) {
# Use user-defined distribution; assume function accepts length of line
# and number of points, and returns a number of points x 1 vector
pointproj_fn <- proj_dist_fn
} else if (is.character(proj_dist_fn) && proj_dist_fn == "unif") {
# Point projections will be uniformly placed along cluster-supporting lines
pointproj_fn <- function(l, n) stats::runif(n, min = -l / 2, max = l / 2)
} else if (is.character(proj_dist_fn) && proj_dist_fn == "norm") {
# Use normal distribution for placing point projections along
# cluster-supporting lines, mean equal to line center, standard deviation
# equal to 1/6 of line length such that the line length contains ≈99.73% of
# the points
pointproj_fn <- function(l, n) stats::rnorm(n, sd = (1.0 / 6.0) * l)
} else {
stop("`proj_dist_fn` has to be either \"norm\", \"unif\" or ",
"user-defined function")
}
# Check that point_dist_fn specifies a valid way for generating points given
# their projections along cluster-supporting lines, i.e., either "n-1"
# (default), "n" or a user-defined function
if (num_dims == 1) {
# If 1D was specified, point projections are the points themselves
pt_from_proj_fn <- function(projs, lat_disp, len, clu_dir, clu_ctr) projs
} else if (is.function(point_dist_fn)) {
# Use user-defined distribution; assume function accepts point projections
# on the line, lateral disp., cluster direction and cluster center, and
# returns a num_points x num_dims matrix containing the final points
# for the current cluster
pt_from_proj_fn <- point_dist_fn
} else if (is.character(point_dist_fn) && point_dist_fn == "n-1") {
# Points will be placed on a hyperplane orthogonal to the cluster-supporting
# line using a normal distribution centered at their intersection
pt_from_proj_fn <- clupoints_n_1
} else if (is.character(point_dist_fn) && point_dist_fn == "n") {
# Points will be placed using a multivariate normal distribution
# centered at the point projection
pt_from_proj_fn <- clupoints_n
} else {
stop("point_dist_fn has to be either \"n-1\", \"n\" or a user-defined ",
"function")
}
# If seed was given, set it
if (!is.na(seed)) {
set.seed(seed)
}
# ############################ #
# Determine cluster properties #
# ############################ #
# Normalize main direction(s)
direction <- matrix(apply(direction, 1, function(x) x / norm(x, "2")),
ncol = num_dims,
byrow = TRUE)
# If only one main direction was given, expand it for all clusters
if (dim(direction)[1] == 1) {
direction <- matrix(direction,
nrow = num_clusters,
ncol = length(direction),
byrow = TRUE)
}
# Determine cluster sizes
cluster_sizes <- if (is.function(clusizes_fn)) {
clusizes_fn(num_clusters, num_points, allow_empty)
} else if (is.vector(clusizes_fn) && length(clusizes_fn) == num_clusters) {
clusizes_fn
} else {
stop("`clusizes_fn` has to be either a function or a ",
"`num_clusters`-sized vector")
}
# Custom clusizes_fn's are not required to obey num_points, so we update
# it here just in case it's different from what the user specified
num_points <- sum(cluster_sizes)
# Determine cluster centers
cluster_centers <- if (is.function(clucenters_fn)) {
clucenters_fn(num_clusters, cluster_sep, cluster_offset)
} else if (is.matrix(clucenters_fn)
&& all(dim(clucenters_fn) == c(num_clusters, num_dims))) {
clucenters_fn
} else {
stop("clucenters_fn has to be either a function or a matrix of size ",
"`num_clusters` x `num_dims`")
}
# Determine length of lines supporting clusters
cluster_lengths <- if (is.function(llengths_fn)) {
llengths_fn(num_clusters, llength, llength_disp)
} else if (is.vector(llengths_fn) && length(llengths_fn) == num_clusters) {
llengths_fn
} else {
stop("`llengths_fn` has to be either a function or a `num_clusters`-sized",
" vector")
}
# Obtain angles between main direction and cluster-supporting lines
cluster_angles <- if (is.function(angle_deltas_fn)) {
angle_deltas_fn(num_clusters, angle_disp)
} else if (is.vector(angle_deltas_fn) && length(angle_deltas_fn) == num_clusters) {
angle_deltas_fn
} else {
stop("`angle_deltas_fn` has to be either a function or a ",
"`num_clusters`-sized vector")
}
# Determine normalized cluster directions
cluster_directions <- matrix(mapply(rand_vector_at_angle,
asplit(direction, 1),
cluster_angles),
nrow = num_clusters,
byrow = TRUE)
# ################################# #
# Determine points for each cluster #
# ################################# #
# Aux. vector with cumulative sum of number of points in each cluster
cumsum_points <- c(0, cumsum(cluster_sizes))
# Pre-allocate data structures for holding cluster info and points
# Cluster indices of each point
point_clusters <- vector(mode = "integer", length = num_points)
# Point projections on cluster-supporting lines
point_projections <- matrix(data = 0, nrow = num_points, ncol = num_dims)
# Final points to be generated
points <- matrix(data = 0, nrow = num_points, ncol = num_dims)
# Loop through clusters and create points for each one
for (i in 1:num_clusters) {
# Skip iteration if this cluster is empty
if (cluster_sizes[i] == 0) {
next
}
# Start and end indexes for points in current cluster
idx_start <- cumsum_points[i] + 1
idx_end <- cumsum_points[i + 1]
# Update cluster indices of each point
point_clusters[idx_start:idx_end] <- i
# Determine distance of point projections from the center of the line
ptproj_dist_fn_center <- pointproj_fn(cluster_lengths[i], cluster_sizes[i])
# Determine coordinates of point projections on the line using the
# parametric line equation (this works since cluster direction is
# normalized)
proj <- points_on_line(cluster_centers[i, ],
cluster_directions[i, ],
ptproj_dist_fn_center)
point_projections[idx_start:idx_end, ] <- proj
# Determine points from their projections on the line
points[idx_start:idx_end, ] <- pt_from_proj_fn(proj,
lateral_disp,
cluster_lengths[i],
cluster_directions[i, ],
cluster_centers[i, ])
}
# Return result
list(points = points,
clusters = as.factor(point_clusters),
projections = point_projections,
sizes = cluster_sizes,
centers = cluster_centers,
directions = cluster_directions,
angles = cluster_angles,
lengths = cluster_lengths)
}
#' Generality of types, by increasing order
#' @keywords internal
tprom <- list("logical" = 1,
"integer" = 2,
"double" = 3,
"complex" = 4,
"character" = 5,
"raw" = 6,
"list" = 7)
#' Returns the actual dimensions of the input
#'
#' Gets the dimensions of the input `a`, returning `c(length(a), 1)` if
#' `dim(a) == NULL`.
#'
#' @keywords internal
gdim <- function(a) {
d <- dim(a)
if (is.null(d)) {
d <- c(length(a), 1)
}
d
}
#' Merges the fields (specified in `fields`) of two or more data sets
#'
#' @description
#' Merges the fields (specified in `fields`) of two or more data sets (passed as
#' lists). The fields to be merged need to have the same number of columns. The
#' corresponding merged field will contain the rows of the fields to be merged,
#' and will have a common "supertype".
#'
#' @details
#' The `clusters_field` parameter specifies a field containing integers that
#' identify the cluster to which the respective points belongs to. If
#' `clusters_field` is specified (by default it's specified as `"clusters"`),
#' cluster assignments in individual datasets will be updated in the merged
#' dataset so that clusters are considered separate. This parameter can be set
#' to `NA`, in which case no field will be considered as a special cluster
#' assignments field.
#'
#' This function can be used to merge data sets generated with the [clugen]
#' function, by default merging the `points` and `clusters` fields in those data
#' sets. It also works with arbitrary data by specifying alternative fields in
#' the `fields` parameter. It can be used, for example, to merge third-party
#' data with [clugen]-generated data.
#'
#' @param ... One or more cluster data sets (in the form of lists) whose
#' `fields` are to be merged.
#' @param fields Fields to be merged, which must exist in the data sets given in
#' `...`.
#' @param clusters_field Field containing the integer cluster labels. If
#' specified, cluster assignments in individual datasets will be updated in the
#' merged dataset so that clusters are considered separate.
#' @return A list whose fields consist of the merged fields in the original
#' data sets.
#'
#' @export
#' @examples
#' a <- clugen(2, 5, 100, c(1, 3), 0.5, c(10, 10), 8, 1.5, 2)
#' b <- clugen(2, 3, 250, c(-1, 3), 0.5, c(13, 14), 7, 1, 2)
#' ab <- clumerge(a, b)
clumerge <- function(...,
fields = c("points", "clusters"),
clusters_field = "clusters") {
# Number of elements in each array the merged dataset
numel <- 0
# Contains information about each field
fields_info <- list()
# Merged dataset to output, initially empty
output <- list()
# If a clusters field is given, add it
if (!is.na(clusters_field)) {
fields <- union(fields, clusters_field)
}
# Place variable input into list form
data <- list(...)
# Cycle through data items
for (dt in data) {
# Number of elements in the current item
numel_i <- NA
# Cycle through fields for the current item
for (field in fields) {
if (!(field %in% names(dt))) {
stop(paste0("Data item does not contain required field `", field, "`"))
}
# Get the field value
value <- dt[[field]]
# Number of elements in field value
numel_tmp <- gdim(value)[1]
# Check the number of elements in the field value
if (is.na(numel_i)) {
# First field: get number of elements in value (must be the same
# for the remaining field values)
numel_i <- numel_tmp
} else if (numel_tmp != numel_i) {
# Fields values after the first must have the same number of elements
stop(paste0("Data item contains fields with different sizes (",
numel_tmp, " != ", numel_i, ")"))
}
# Get/check info about the field value type
field_cols <- gdim(value)[2]
field_type <- typeof(value)
field_fact <- is.factor(value)
if (!(field %in% names(fields_info))) {
# If it's the first time this field appears, just get the info
fields_info[[field]] <- list(type = field_type,
ncol = field_cols,
fact = field_fact)
# If it's the clusters field, it needs to have no dimensionality
if (!is.na(clusters_field) && field == clusters_field) {
if (!is.null(dim(value))) {
stop(paste0("Clusters field `",
clusters_field,
"` has more than one dimension"))
}
if (typeof(value) != "integer") {
stop(paste0("`", clusters_field, "` must contain integer types"))
}
}
} else {
# If this field already appeared in previous data items...
# ...check that the number of columns is the same than that of previous
# data items
if (field_cols != fields_info[[field]]$ncol) {
stop(paste0("Dimension mismatch in field `", field, "`"))
}
# ...check if previous items were factors, this item must be too (or
# vice-versa)
if (field_fact != fields_info[[field]]$fact) {
stop(paste0("Factor mismatch in field `", field, "`"))
}
# Determine most broad type between values
if (tprom[[field_type]] > fields_info[[field]]$type) {
fields_info[[field]]$type <- tprom[[field_type]]
}
}
}
# Update total number of elements
numel <- numel + numel_i
}
# Initialize output dictionary fields with room for all items
for (field in names(fields_info)) {
output[[field]] <- vector(mode = fields_info[[field]]$type,
length = numel * fields_info[[field]]$ncol)
if (fields_info[[field]]$ncol > 1) {
dim(output[[field]]) <- c(numel, fields_info[[field]]$ncol)
}
}
# Copy items from input data to output dictionary, field-wise
copied <- 0
last_cluster <- 0
# Create merged output
for (dt in data) {
# How many elements to copy for the current data item?
tocopy <- gdim(dt[[fields[1]]])[1]
# Cycle through each field and its information
for (field in names(fields_info)) {
ncol <- fields_info[[field]]$ncol
# Copy elements
if (ncol == 1) {
output[[field]][(copied + 1):(copied + tocopy)] <-
if (!is.na(clusters_field) && field == clusters_field) {
# If this is a clusters field, update the cluster IDs
old_clusters <- unique(dt[[clusters_field]])
new_clusters <- (last_cluster + 1):(last_cluster + length(old_clusters))
last_cluster <- new_clusters[length(new_clusters)]
new_clusters[match(dt[[field]], old_clusters)]
} else if (fields_info[[field]]$fact) {
# If not, are they factors?
as.numeric(levels(dt[[field]]))[dt[[field]]]
} else {
# Otherwise just copy the elements
dt[[field]]
}
} else {
output[[field]][(copied + 1):(copied + tocopy), ] <- dt[[field]]
}
}
# Update how many were copied so far
copied <- copied + tocopy
}
# Set factor according to original data
for (field in names(fields_info)) {
fi <- fields_info[[field]]
if (fi$fact) {
output[[field]] <- factor(output[[field]])
}
}
output
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.