# simulation data used in "How to Use t-SNE Effectively"
# http://distill.pub/2016/misread-tsne/
# https://github.com/distillpub/post--misread-tsne
#' 2D Grid
#'
#' 2D grid data from "How to Use t-SNE Effectively".
#'
#' Creates a two-dimensional square grid with equal spacing between points.
#' Point color goes from dark to light with increasing \code{x} coordinate and
#' from green to blue with increasing \code{y}.
#'
#' @param n Number of points per side.
#' @return Data frame with coordinates in the \code{x} and \code{y} columns,
#' and a per-point color in the \code{color} column.
#' @examples
#' df <- grid_data(n = 10)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
grid_data <- function(n = 10) {
df <- expand.grid(1:n, 1:n)
colnames(df) <- c("x", "y")
g <- floor(linear_map(df$x, 0, 255))
b <- floor(linear_map(df$y, 0, 255))
df$color <- grDevices::rgb(red = 20, green = g, blue = b, maxColorValue = 255)
df
}
#' Symmetric Gaussian
#'
#' Symmetric gaussian data from "How to Use t-SNE Effectively".
#'
#' Creates a randomly-sampled symmetric gaussian dataset of the specified
#' dimension.
#'
#' @param n Number of points.
#' @param dim Dimension of the gaussian.
#' @param sd Standard deviation.
#' @param color Color to assign to the data points.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- gaussian_data(n = 50, dim = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
gaussian_data <- function(n, dim, sd = 1, color = NULL) {
df <- data.frame(matrix(stats::rnorm(n * dim, sd = sd), ncol = dim))
if (!is.null(color)) {
df$color <- color
}
df
}
#' Elongated Gaussian Ellipsoid
#'
#' Elongated gaussian data from "How to Use t-SNE Effectively".
#'
#' Creates a randomly-sampled gaussian dataset of the specified
#' dimension, with standard deviation following the harmonic series: the first
#' column has standard deviation 1, the second 1/2, the third 1/3 and so on.
#'
#' @param n Number of points.
#' @param dim Dimension of the gaussian.
#' @param color Color to assign to the data points.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- long_gaussian_data(n = 50, dim = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
long_gaussian_data <- function(n, dim, color = NULL) {
df <- gaussian_data(n, dim, color = color)
for (j in 1:dim) {
df[, j] <- df[, j] / j
}
df
}
# Helper function for circle_data and random_circle_data.
# Creates a data frame where each point has x,y coordinates generated by the
# parametric equation of a circle with unit radius, generated from theta, a
# vector angles in radians. Each point also has a color value generated
# by linearly mapping theta to a rainbow palette.
theta_to_circle_df <- function(theta) {
data.frame(
x = cos(theta), y = sin(theta),
color = linear_color_map(theta),
stringsAsFactors = FALSE
)
}
#' Uniform 2D Circle
#'
#' 2D circle data with uniform spacing from "How to Use t-SNE Effectively".
#'
#' Creates a dataset where the points are located on the circumference of a
#' circle of unit radius, where the angle subtended by the point at the center
#' of the circle has been uniformly sampled. Each point has a color linearly
#' mapped from the angle to a rainbow color scheme.
#'
#' @param n Number of points per side.
#' @return Data frame with x and y coordinates in the columns \code{x} and
#' \code{y} respectively, and color in the \code{color} column.
#' @examples
#' df <- circle_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
circle_data <- function(n) {
theta_to_circle_df(theta_unif(n))
}
#' Random 2D Circle
#'
#' Random points on a 2D circle from "How to Use t-SNE Effectively".
#'
#' Creates a dataset where the points are located on the circumference of a
#' circle of unit radius, where the angle subtended by the point at the center
#' of the circle has been randomly sampled. Each point has a color linearly
#' mapped from the angle to a rainbow color scheme.
#'
#' @param n Number of points.
#' @return Data frame with x and y coordinates in the columns \code{x} and
#' \code{y} respectively, and color in the \code{color} column.
#' @examples
#' df <- random_circle_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_circle_data <- function(n) {
theta_to_circle_df(stats::runif(n = n, max = 2 * pi))
}
#' Fuzzy 2D Circle
#'
#' Clusters arranged in a circle from "How to Use t-SNE Effectively".
#'
#' Creates a dataset where the points are located on the circumference of a
#' circle of unit radius via \code{\link{circle_data}}, where the
#' angle subtended by the point at the center of the circle has been uniformly
#' sampled.
#'
#' Each point is then replicated 20 times, with a slight perturbation to the
#' original coordinate (using a normal distribution with a standard deviation
#' of 0.1). Only the twenty jittered points are added to the data frame. Each
#' point has a color linearly mapped from the angle of the original unjittered
#' point to a rainbow color scheme.
#'
#' @param n Number of points to define the circle. These are then jittered
#' as described in 'Details', so the total number of observations in the
#' data frame will be \code{20 * n}.
#' @return Data frame with x and y coordinates in the columns \code{x} and
#' \code{y} respectively, and color in the \code{color} column.
#' @examples
#' df <- random_circle_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_circle_cluster_data <- function(n) {
df <- replicate_rows(circle_data(n), n = 20)
df$x <- df$x + stats::rnorm(n = nrow(df), sd = 0.1)
df$y <- df$y + stats::rnorm(n = nrow(df), sd = 0.1)
df
}
#' Two Equal Size Clusters
#'
#' Two gaussians with equal size and bandwidth, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of two symmetric gaussian distributions with
#' equal number of points and standard deviation 1, separated by a distance
#' of 10 units. Points are colored depending on which cluster they belong to.
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians. You may pass a vector of length 2 to
#' create clusters of different dimensionalities, with the smaller cluster
#' having zeros in the extra dimensions.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#'
#' @examples
#' df <- two_clusters_data(n = 50, dim = 2)
#' # two clusters with 10 members each, first 10 sampled from a 3D gaussian,
#' # second 10 are sampled from a 4D gaussian
#' df <- two_clusters_data(n = 10, dim = c(3, 4))
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
two_clusters_data <- function(n, dim = 50) {
if (length(dim) != 2) {
dim <- rep(dim, 2)
}
cluster1 <- gaussian_data(n = n, dim = dim[1], color = "#003399")
cluster2 <- gaussian_data(n = n, dim = dim[2], color = "#FF9900")
cluster2[, 1] <- cluster2[, 1] + 10
merge_by_row(cluster1, cluster2)
}
#' Two Gaussian Clusters With Unequal Standard Deviations
#'
#' Two gaussians with equal size but unequal bandwidths, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of two symmetric gaussian distributions with
#' equal number of points, but different standard deviations: the standard
#' deviations of the second cluster will be \code{1/scale} of the other.
#' Clusters are separated by 20 units. Points are colored depending on which
#' cluster they belong to.
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians.
#' @param scale Amount to reduce the standard deviation of the second cluster,
#' relative to the first.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#'
#' @examples
#' df <- two_different_clusters_data(n = 50, dim = 2, scale = 5)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
two_different_clusters_data <- function(n, dim = 50, scale = 10) {
cluster1 <- gaussian_data(n = n, dim = dim, color = "#003399")
cluster2 <- gaussian_data(n = n, dim = dim, sd = 1 / scale, color = "#FF9900")
cluster2[, 1] <- cluster2[, 1] + 20
rbind(cluster1, cluster2)
}
#' Three Gaussian Clusters With Equal Standard Deviations
#'
#' Three gaussian clusters with equal size and bandwidth from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of three symmetric gaussian distributions with
#' equal number of points and standard deviation 1. The clusters are arranged
#' in a line, with the second cluster 10 units from the first, and the third
#' cluster 50 units from the first. Points are colored depending on which
#' cluster they belong to.
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- three_clusters_data(n = 50, dim = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
three_clusters_data <- function(n, dim = 50) {
cluster1 <- gaussian_data(n = n, dim = dim, color = "#003399")
cluster2 <- gaussian_data(n = n, dim = dim, color = "#FF9900")
cluster2[, 1] <- cluster2[, 1] + 10
cluster3 <- gaussian_data(n = n, dim = dim, color = "#66AA33")
cluster3[, 1] <- cluster3[, 1] + 50
rbind(cluster1, cluster2, cluster3)
}
#' Subset Cluster Data
#'
#' One tiny gaussian cluster inside of a big cluster from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of two gaussians with the same center, but
#' with the first cluster having a standard deviation of 1, and the second
#' having a standard deviation of \code{big_sdev} (default 50). Points are
#' colored depending on which cluster they belong to (small cluster is dark
#' powder blue, large is light orange).
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians.
#' @param big_sdev Standard deviation of the bigger cluster, default 50. The
#' smaller cluster has a standard deviation of 1.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- subset_clusters_data(n = 50, dim = 2)
#'
#' # 10D example where the big cluster is only twice the standard deviation of
#' # the small cluster
#' df <- subset_clusters_data(n = 50, dim = 10, big_sdev = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
subset_clusters_data <- function(n, dim = 2, big_sdev = 50) {
if (!is.numeric(big_sdev) || length(big_sdev) != 1 || big_sdev <= 0) {
stop("big_sdev should be a scalar positive value")
}
cluster1 <- gaussian_data(n = n, dim = dim, color = "#003399")
cluster2 <- gaussian_data(n = n, dim = dim, sd = big_sdev, color = "#FF9900")
rbind(cluster1, cluster2)
}
#' Simplex Data
#'
#' Data in a rough simplex from "How to Use t-SNE Effectively".
#'
#' Create a dataset of the specified number of points arranged in a rough
#' simplex, which also determines the dimensionality of the dataset. Each
#' point has a single non-zero coordinate in a different column to all the
#' other points.
#'
#' @param n Number of points (and hence dimensionality) of the simplex.
#' @param noise Degree by which to perturb the points from their original
#' locations. A normal distribution with the specified standard deviation is
#' applied to the non-zero coordinate.
#' @param color Color to apply to each point.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- simplex_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
simplex_data <- function(n, noise = 0.5, color = "#003399") {
m <- diag(n)
diag(m) <- 1 + noise * stats::rnorm(n)
data.frame(m, color = color, stringsAsFactors = FALSE)
}
#' Cube Data
#'
#' Random points sampled from a cube, from "How to Use t-SNE Effectively".
#'
#' Create a dataset of the of points randomly sampled from a unit cube
#' of the specified dimension.
#'
#' @param n Number of points to sample.
#' @param dim Dimension of the cube.
#' @param color Color to apply to each point.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- cube_data(n = 50, dim = 3)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
cube_data <- function(n, dim, color = "#003399") {
data.frame(matrix(stats::runif(n * dim), nrow = n),
color = color,
stringsAsFactors = FALSE
)
}
# Helper function for unlink_data and link_data
rotate <- function(df) {
cs <- cos(.4)
sn <- sin(.4)
y <- df$y
z <- df$z
df$y <- cs * y + sn * z
df$z <- -sn * y + cs * z
df
}
#' Two Unlinked Rings
#'
#' Points sampled from two unlinked circles, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a 3D dataset where points are uniformly sampled from two (2D)
#' circles of unit radius, separated by 3 units. The two circles are not
#' co-planar. Points are colored depending from which circle they are sampled
#' from.
#'
#' For the linked version, see \code{\link{link_data}}.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{x}, \code{y}, \code{z}
#' columns, and color in the \code{color} column.
#' @examples
#' df <- unlink_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
unlink_data <- function(n) {
theta <- theta_unif(n)
ring1 <- rotate(data.frame(
x = cos(theta), y = sin(theta), z = 0,
color = "#ff9900", stringsAsFactors = FALSE
))
ring2 <- rotate(data.frame(
x = 3 + cos(theta), y = 0, z = sin(theta),
color = "#003399", stringsAsFactors = FALSE
))
rbind(ring1, ring2)
}
#' Two Linked Rings
#'
#' Points sampled from two linked circles, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a 3D dataset where points are uniformly sampled from two (2D)
#' circles of unit radius, separated by 1 unit, and hence are overlapping.
#' The two circles are not co-planar. Points are colored depending from which
#' circle they are sampled from.
#'
#' For the unlinked version, see \code{\link{unlink_data}}.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{x}, \code{y}, \code{z}
#' columns, and color in the \code{color} column.
#' @examples
#' df <- link_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
link_data <- function(n) {
theta <- theta_unif(n)
ring1 <- rotate(data.frame(
x = cos(theta), y = sin(theta), z = 0,
color = "#ff9900", stringsAsFactors = FALSE
))
ring2 <- rotate(data.frame(
x = 1 + cos(theta), y = 0, z = sin(theta),
color = "#003399", stringsAsFactors = FALSE
))
rbind(ring1, ring2)
}
#' Trefoil Knot
#'
#' Points sampled from a trefoil knot shape, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a 3D dataset where points are uniformly sampled from a trefoil knot.
#' Points are colored based on mapping the angle parameter values used to
#' generate the knot coordinates to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{x}, \code{y}, \code{z}
#' columns, and color in the \code{color} column.
#' @examples
#' df <- trefoil_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
trefoil_data <- function(n) {
theta <- theta_unif(n)
data.frame(
x = sin(theta) + 2 * sin(2 * theta),
y = cos(theta) - 2 * cos(2 * theta),
z = -sin(3 * theta),
color = linear_color_map(theta),
stringsAsFactors = FALSE
)
}
#' Long Cluster Data
#'
#' Two long, linear clusters in 2D from "How to Use t-SNE Effectively".
#'
#' Creates a 2D dataset where points are sampled from two long, linear,
#' closely-separated clusters. Points are colored depending from which cluster
#' they are sampled from.
#'
#' @param n Number of points to sample per cluster.
#' @return Data frame with coordinates in the \code{x}, \code{y} columns, and
#' color in the \code{color} column.
#' @examples
#' df <- long_cluster_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
long_cluster_data <- function(n) {
s <- .03 * n
cluster1 <- data.frame(
x = 0:(n - 1) + s * stats::rnorm(n),
y = 0:(n - 1) + s * stats::rnorm(n),
color = "#003399", stringsAsFactors = FALSE
)
cluster2 <- data.frame(
x = 0:(n - 1) + s * stats::rnorm(n) + n / 5,
y = 0:(n - 1) + s * stats::rnorm(n) - n / 5,
color = "#ff9900", stringsAsFactors = FALSE
)
rbind(cluster1, cluster2)
}
#' Mutually Orthogonal Step Data
#'
#' Points related by mutually orthogonal steps, from "How to Use t-SNE
#' Effectively".
#'
#' Creates N points from an N-dimensional space, where each point is related
#' by a mutually orthogonal step. Coordinates from a triangular matrix, where
#' the coordinates in the triangle are one, and are zero everywhere else.
#' Points are linearly mapped from their row id to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- ortho_curve(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
ortho_curve <- function(n) {
m <- matrix(0, nrow = n, ncol = n)
m[lower.tri(m)] <- 1
df <- data.frame(m)
df$color <- linear_color_map(1.5 * pi * 0:(n - 1) / n)
df
}
#' Random Walk Data
#'
#' Simulation data from a random walk process, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset where the coordinate of each point is related to the
#' previous point by a gaussian perturbation. Points are linearly mapped from
#' their position in the walk to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @param dim Number of dimensions of the dataset.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- random_walk(n = 100, dim = 100)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_walk <- function(n, dim) {
current <- rep(0, dim)
df <- data.frame(matrix(nrow = n, ncol = dim))
for (i in 0:n - 1) {
step <- stats::rnorm(dim)
next_step <- current
for (j in 0:dim - 1) {
next_step[j + 1] <- current[j + 1] + step[j + 1]
}
df[i + 1, ] <- next_step
current <- next_step
}
df$color <- linear_color_map(1.5 * pi * 0:(n - 1) / n)
df
}
#' Random Jump Data
#'
#' Simulation data from a random jump process, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset similar to \code{\link{random_walk}} but with
#' additional gaussian noise added at each step. Points are linearly mapped from
#' their position in the walk to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @param dim Number of dimensions of the dataset.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#' \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- random_jump(n = 100, dim = 100)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_jump <- function(n, dim) {
df <- data.frame(matrix(nrow = n, ncol = dim))
current <- rep(0, dim)
for (i in 0:n - 1) {
step <- stats::rnorm(dim)
next_step <- step + current
r <- stats::rnorm(dim)
r <- r * sqrt(dim)
df[i + 1, ] <- r + next_step
current <- next_step
}
df$color <- linear_color_map(1.5 * pi * 0:(n - 1) / n)
df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.