R/misread-tsne.R

Defines functions random_jump random_walk ortho_curve long_cluster_data trefoil_data link_data unlink_data rotate cube_data simplex_data subset_clusters_data three_clusters_data two_different_clusters_data two_clusters_data random_circle_cluster_data random_circle_data circle_data theta_to_circle_df long_gaussian_data gaussian_data grid_data

Documented in circle_data cube_data gaussian_data grid_data link_data long_cluster_data long_gaussian_data ortho_curve random_circle_cluster_data random_circle_data random_jump random_walk simplex_data subset_clusters_data three_clusters_data trefoil_data two_clusters_data two_different_clusters_data unlink_data

# simulation data used in "How to Use t-SNE Effectively"
# http://distill.pub/2016/misread-tsne/
# https://github.com/distillpub/post--misread-tsne

#' 2D Grid
#'
#' 2D grid data from "How to Use t-SNE Effectively".
#'
#' Creates a two-dimensional square grid with equal spacing between points.
#' Point color goes from dark to light with increasing \code{x} coordinate and
#' from green to blue with increasing \code{y}.
#'
#' @param n Number of points per side.
#' @return Data frame with coordinates in the \code{x} and \code{y} columns,
#'  and a per-point color in the \code{color} column.
#' @examples
#' df <- grid_data(n = 10)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
grid_data <- function(n = 10) {
  df <- expand.grid(1:n, 1:n)
  colnames(df) <- c("x", "y")
  g <- floor(linear_map(df$x, 0, 255))
  b <- floor(linear_map(df$y, 0, 255))
  df$color <- grDevices::rgb(red = 20, green = g, blue = b, maxColorValue = 255)
  df
}

#' Symmetric Gaussian
#'
#' Symmetric gaussian data from "How to Use t-SNE Effectively".
#'
#' Creates a randomly-sampled symmetric gaussian dataset of the specified
#' dimension.
#'
#' @param n Number of points.
#' @param dim Dimension of the gaussian.
#' @param sd Standard deviation.
#' @param color Color to assign to the data points.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- gaussian_data(n = 50, dim = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
gaussian_data <- function(n, dim, sd = 1, color = NULL) {
  df <- data.frame(matrix(stats::rnorm(n * dim, sd = sd), ncol = dim))
  if (!is.null(color)) {
    df$color <- color
  }
  df
}

#' Elongated Gaussian Ellipsoid
#'
#' Elongated gaussian data from "How to Use t-SNE Effectively".
#'
#' Creates a randomly-sampled gaussian dataset of the specified
#' dimension, with standard deviation following the harmonic series: the first
#' column has standard deviation 1, the second 1/2, the third 1/3 and so on.
#'
#' @param n Number of points.
#' @param dim Dimension of the gaussian.
#' @param color Color to assign to the data points.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- long_gaussian_data(n = 50, dim = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
long_gaussian_data <- function(n, dim, color = NULL) {
  df <- gaussian_data(n, dim, color = color)
  for (j in 1:dim) {
    df[, j] <- df[, j] / j
  }
  df
}

# Helper function for circle_data and random_circle_data.
# Creates a data frame where each point has x,y coordinates generated by the
# parametric equation of a circle with unit radius, generated from theta, a
# vector angles in radians. Each point also has a color value generated
# by linearly mapping theta to a rainbow palette.
theta_to_circle_df <- function(theta) {
  data.frame(
    x = cos(theta), y = sin(theta),
    color = linear_color_map(theta),
    stringsAsFactors = FALSE
  )
}

#' Uniform 2D Circle
#'
#' 2D circle data with uniform spacing from "How to Use t-SNE Effectively".
#'
#' Creates a dataset where the points are located on the circumference of a
#' circle of unit radius, where the angle subtended by the point at the center
#' of the circle has been uniformly sampled. Each point has a color linearly
#' mapped from the angle to a rainbow color scheme.
#'
#' @param n Number of points per side.
#' @return Data frame with x and y coordinates in the columns \code{x} and
#'  \code{y} respectively, and color in the \code{color} column.
#' @examples
#' df <- circle_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
circle_data <- function(n) {
  theta_to_circle_df(theta_unif(n))
}

#' Random 2D Circle
#'
#' Random points on a 2D circle from "How to Use t-SNE Effectively".
#'
#' Creates a dataset where the points are located on the circumference of a
#' circle of unit radius, where the angle subtended by the point at the center
#' of the circle has been randomly sampled. Each point has a color linearly
#' mapped from the angle to a rainbow color scheme.
#'
#' @param n Number of points.
#' @return Data frame with x and y coordinates in the columns \code{x} and
#'  \code{y} respectively, and color in the \code{color} column.
#' @examples
#' df <- random_circle_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_circle_data <- function(n) {
  theta_to_circle_df(stats::runif(n = n, max = 2 * pi))
}

#' Fuzzy 2D Circle
#'
#' Clusters arranged in a circle from "How to Use t-SNE Effectively".
#'
#' Creates a dataset where the points are located on the circumference of a
#' circle of unit radius via \code{\link{circle_data}}, where the
#' angle subtended by the point at the center of the circle has been uniformly
#' sampled.
#'
#' Each point is then replicated 20 times, with a slight perturbation to the
#' original coordinate (using a normal distribution with a standard deviation
#' of 0.1). Only the twenty jittered points are added to the data frame. Each
#' point has a color linearly mapped from the angle of the original unjittered
#' point to a rainbow color scheme.
#'
#' @param n Number of points to define the circle. These are then jittered
#'  as described in 'Details', so the total number of observations in the
#'  data frame will be \code{20 * n}.
#' @return Data frame with x and y coordinates in the columns \code{x} and
#'  \code{y} respectively, and color in the \code{color} column.
#' @examples
#' df <- random_circle_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_circle_cluster_data <- function(n) {
  df <- replicate_rows(circle_data(n), n = 20)
  df$x <- df$x + stats::rnorm(n = nrow(df), sd = 0.1)
  df$y <- df$y + stats::rnorm(n = nrow(df), sd = 0.1)
  df
}

#' Two Equal Size Clusters
#'
#' Two gaussians with equal size and bandwidth, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of two symmetric gaussian distributions with
#' equal number of points and standard deviation 1, separated by a distance
#' of 10 units. Points are colored depending on which cluster they belong to.
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians. You may pass a vector of length 2 to
#' create clusters of different dimensionalities, with the smaller cluster
#' having zeros in the extra dimensions.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#'
#' @examples
#' df <- two_clusters_data(n = 50, dim = 2)
#' # two clusters with 10 members each, first 10 sampled from a 3D gaussian,
#' # second 10 are sampled from a 4D gaussian
#' df <- two_clusters_data(n = 10, dim = c(3, 4))
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
two_clusters_data <- function(n, dim = 50) {
  if (length(dim) != 2) {
    dim <- rep(dim, 2)
  }
  cluster1 <- gaussian_data(n = n, dim = dim[1], color = "#003399")

  cluster2 <- gaussian_data(n = n, dim = dim[2], color = "#FF9900")
  cluster2[, 1] <- cluster2[, 1] + 10

  merge_by_row(cluster1, cluster2)
}

#' Two Gaussian Clusters With Unequal Standard Deviations
#'
#' Two gaussians with equal size but unequal bandwidths, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of two symmetric gaussian distributions with
#' equal number of points, but different standard deviations: the standard
#' deviations of the second cluster will be \code{1/scale} of the other.
#' Clusters are separated by 20 units. Points are colored depending on which
#' cluster they belong to.
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians.
#' @param scale Amount to reduce the standard deviation of the second cluster,
#' relative to the first.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#'
#' @examples
#' df <- two_different_clusters_data(n = 50, dim = 2, scale = 5)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
two_different_clusters_data <- function(n, dim = 50, scale = 10) {
  cluster1 <- gaussian_data(n = n, dim = dim, color = "#003399")

  cluster2 <- gaussian_data(n = n, dim = dim, sd = 1 / scale, color = "#FF9900")
  cluster2[, 1] <- cluster2[, 1] + 20

  rbind(cluster1, cluster2)
}


#' Three Gaussian Clusters With Equal Standard Deviations
#'
#' Three gaussian clusters with equal size and bandwidth from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of three symmetric gaussian distributions with
#' equal number of points and standard deviation 1. The clusters are arranged
#' in a line, with the second cluster 10 units from the first, and the third
#' cluster 50 units from the first. Points are colored depending on which
#' cluster they belong to.
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- three_clusters_data(n = 50, dim = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
three_clusters_data <- function(n, dim = 50) {
  cluster1 <- gaussian_data(n = n, dim = dim, color = "#003399")

  cluster2 <- gaussian_data(n = n, dim = dim, color = "#FF9900")
  cluster2[, 1] <- cluster2[, 1] + 10

  cluster3 <- gaussian_data(n = n, dim = dim, color = "#66AA33")
  cluster3[, 1] <- cluster3[, 1] + 50

  rbind(cluster1, cluster2, cluster3)
}

#' Subset Cluster Data
#'
#' One tiny gaussian cluster inside of a big cluster from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset consisting of two gaussians with the same center, but
#' with the first cluster having a standard deviation of 1, and the second
#' having a standard deviation of \code{big_sdev} (default 50). Points are
#' colored depending on which cluster they belong to (small cluster is dark
#' powder blue, large is light orange).
#'
#' @param n Number of points per gaussian.
#' @param dim Dimension of the gaussians.
#' @param big_sdev Standard deviation of the bigger cluster, default 50. The
#' smaller cluster has a standard deviation of 1.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- subset_clusters_data(n = 50, dim = 2)
#'
#' # 10D example where the big cluster is only twice the standard deviation of
#' # the small cluster
#' df <- subset_clusters_data(n = 50, dim = 10, big_sdev = 2)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
subset_clusters_data <- function(n, dim = 2, big_sdev = 50) {
  if (!is.numeric(big_sdev) || length(big_sdev) != 1 || big_sdev <= 0) {
    stop("big_sdev should be a scalar positive value")
  }
  cluster1 <- gaussian_data(n = n, dim = dim, color = "#003399")
  cluster2 <- gaussian_data(n = n, dim = dim, sd = big_sdev, color = "#FF9900")
  rbind(cluster1, cluster2)
}

#' Simplex Data
#'
#' Data in a rough simplex from "How to Use t-SNE Effectively".
#'
#' Create a dataset of the specified number of points arranged in a rough
#' simplex, which also determines the dimensionality of the dataset. Each
#' point has a single non-zero coordinate in a different column to all the
#' other points.
#'
#' @param n Number of points (and hence dimensionality) of the simplex.
#' @param noise Degree by which to perturb the points from their original
#'  locations. A normal distribution with the specified standard deviation is
#'  applied to the non-zero coordinate.
#' @param color Color to apply to each point.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- simplex_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
simplex_data <- function(n, noise = 0.5, color = "#003399") {
  m <- diag(n)
  diag(m) <- 1 + noise * stats::rnorm(n)
  data.frame(m, color = color, stringsAsFactors = FALSE)
}

#' Cube Data
#'
#' Random points sampled from a cube, from "How to Use t-SNE Effectively".
#'
#' Create a dataset of the of points randomly sampled from a unit cube
#' of the specified dimension.
#'
#' @param n Number of points to sample.
#' @param dim Dimension of the cube.
#' @param color Color to apply to each point.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xdim} columns, and color in the \code{color} column.
#' @examples
#' df <- cube_data(n = 50, dim = 3)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
cube_data <- function(n, dim, color = "#003399") {
  data.frame(matrix(stats::runif(n * dim), nrow = n),
    color = color,
    stringsAsFactors = FALSE
  )
}

# Helper function for unlink_data and link_data
rotate <- function(df) {
  cs <- cos(.4)
  sn <- sin(.4)
  y <- df$y
  z <- df$z
  df$y <- cs * y + sn * z
  df$z <- -sn * y + cs * z
  df
}

#' Two Unlinked Rings
#'
#' Points sampled from two unlinked circles, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a 3D dataset where points are uniformly sampled from two (2D)
#' circles of unit radius, separated by 3 units. The two circles are not
#' co-planar. Points are colored depending from which circle they are sampled
#' from.
#'
#' For the linked version, see \code{\link{link_data}}.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{x}, \code{y}, \code{z}
#' columns, and color in the \code{color} column.
#' @examples
#' df <- unlink_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
unlink_data <- function(n) {
  theta <- theta_unif(n)
  ring1 <- rotate(data.frame(
    x = cos(theta), y = sin(theta), z = 0,
    color = "#ff9900", stringsAsFactors = FALSE
  ))
  ring2 <- rotate(data.frame(
    x = 3 + cos(theta), y = 0, z = sin(theta),
    color = "#003399", stringsAsFactors = FALSE
  ))
  rbind(ring1, ring2)
}

#' Two Linked Rings
#'
#' Points sampled from two linked circles, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a 3D dataset where points are uniformly sampled from two (2D)
#' circles of unit radius, separated by 1 unit, and hence are overlapping.
#' The two circles are not co-planar. Points are colored depending from which
#' circle they are sampled from.
#'
#' For the unlinked version, see \code{\link{unlink_data}}.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{x}, \code{y}, \code{z}
#' columns, and color in the \code{color} column.
#' @examples
#' df <- link_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
link_data <- function(n) {
  theta <- theta_unif(n)
  ring1 <- rotate(data.frame(
    x = cos(theta), y = sin(theta), z = 0,
    color = "#ff9900", stringsAsFactors = FALSE
  ))
  ring2 <- rotate(data.frame(
    x = 1 + cos(theta), y = 0, z = sin(theta),
    color = "#003399", stringsAsFactors = FALSE
  ))
  rbind(ring1, ring2)
}

#' Trefoil Knot
#'
#' Points sampled from a trefoil knot shape, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a 3D dataset where points are uniformly sampled from a trefoil knot.
#' Points are colored based on mapping the angle parameter values used to
#' generate the knot coordinates to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{x}, \code{y}, \code{z}
#' columns, and color in the \code{color} column.
#' @examples
#' df <- trefoil_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
trefoil_data <- function(n) {
  theta <- theta_unif(n)
  data.frame(
    x = sin(theta) + 2 * sin(2 * theta),
    y = cos(theta) - 2 * cos(2 * theta),
    z = -sin(3 * theta),
    color = linear_color_map(theta),
    stringsAsFactors = FALSE
  )
}

#' Long Cluster Data
#'
#' Two long, linear clusters in 2D from "How to Use t-SNE Effectively".
#'
#' Creates a 2D dataset where points are sampled from two long, linear,
#' closely-separated clusters. Points are colored depending from which cluster
#' they are sampled from.
#'
#' @param n Number of points to sample per cluster.
#' @return Data frame with coordinates in the \code{x}, \code{y} columns, and
#' color in the \code{color} column.
#' @examples
#' df <- long_cluster_data(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
long_cluster_data <- function(n) {
  s <- .03 * n

  cluster1 <- data.frame(
    x = 0:(n - 1) + s * stats::rnorm(n),
    y = 0:(n - 1) + s * stats::rnorm(n),
    color = "#003399", stringsAsFactors = FALSE
  )

  cluster2 <- data.frame(
    x = 0:(n - 1) + s * stats::rnorm(n) + n / 5,
    y = 0:(n - 1) + s * stats::rnorm(n) - n / 5,
    color = "#ff9900", stringsAsFactors = FALSE
  )

  rbind(cluster1, cluster2)
}

#' Mutually Orthogonal Step Data
#'
#' Points related by mutually orthogonal steps, from "How to Use t-SNE
#' Effectively".
#'
#' Creates N points from an N-dimensional space, where each point is related
#' by a mutually orthogonal step. Coordinates from a triangular matrix, where
#' the coordinates in the triangle are one, and are zero everywhere else.
#' Points are linearly mapped from their row id to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- ortho_curve(n = 50)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
ortho_curve <- function(n) {
  m <- matrix(0, nrow = n, ncol = n)
  m[lower.tri(m)] <- 1

  df <- data.frame(m)
  df$color <- linear_color_map(1.5 * pi * 0:(n - 1) / n)
  df
}

#' Random Walk Data
#'
#' Simulation data from a random walk process, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset where the coordinate of each point is related to the
#' previous point by a gaussian perturbation. Points are linearly mapped from
#' their position in the walk to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @param dim Number of dimensions of the dataset.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- random_walk(n = 100, dim = 100)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_walk <- function(n, dim) {
  current <- rep(0, dim)
  df <- data.frame(matrix(nrow = n, ncol = dim))
  for (i in 0:n - 1) {
    step <- stats::rnorm(dim)
    next_step <- current
    for (j in 0:dim - 1) {
      next_step[j + 1] <- current[j + 1] + step[j + 1]
    }
    df[i + 1, ] <- next_step
    current <- next_step
  }

  df$color <- linear_color_map(1.5 * pi * 0:(n - 1) / n)
  df
}

#' Random Jump Data
#'
#' Simulation data from a random jump process, from "How to Use t-SNE
#' Effectively".
#'
#' Creates a dataset similar to \code{\link{random_walk}} but with
#' additional gaussian noise added at each step. Points are linearly mapped from
#' their position in the walk to a rainbow color scheme.
#'
#' @param n Number of points to sample.
#' @param dim Number of dimensions of the dataset.
#' @return Data frame with coordinates in the \code{X1}, \code{X2} ...
#'  \code{Xn} columns, and color in the \code{color} column.
#' @examples
#' df <- random_jump(n = 100, dim = 100)
#' @family distill functions
#' @references \url{http://distill.pub/2016/misread-tsne/}
#' @export
random_jump <- function(n, dim) {
  df <- data.frame(matrix(nrow = n, ncol = dim))
  current <- rep(0, dim)
  for (i in 0:n - 1) {
    step <- stats::rnorm(dim)
    next_step <- step + current
    r <- stats::rnorm(dim)
    r <- r * sqrt(dim)
    df[i + 1, ] <- r + next_step
    current <- next_step
  }

  df$color <- linear_color_map(1.5 * pi * 0:(n - 1) / n)
  df
}
jlmelville/snedata documentation built on Jan. 13, 2024, 2:06 a.m.