
SNE Dataset Functions for R

This package provides functions for generating simple simulation datasets for use in Stochastic Neighbor Embedding and related dimensionality reduction methods, most obviously the very popular t-SNE.


November 25 2023. Added the 20 Newsgroups text dataset. You'll need to look at packages like tm or tidytext to process this into a form suitable for embedding.

May 14 2022. Added the taspheres function to generate the "spheres" dataset of high dimensional spheres nested inside a larger sphere, as used in the Topological Autoencoders paper.

December 24 2021. Added the S-curve with a hole and 2D curve datasets used in the PaCMAP paper.

December 21 2021. Added the mammoth datasets from Understanding UMAP.

June 29 2019. Added QMNIST.

March 6 2019. Added CIFAR-10.

February 23 2019. Added Small NORB.

December 14 2018. Added Kuzushiji-MNIST.





# Browse the package documentation from an interactive R session.
package?snedata # package-level help topic; lists all the functions
?snedata::gaussian_data # help page that contains links to all the other functions



# Synthetic datasets. In every call below, `n` is the number of points to
# sample; the remaining arguments are specific to each generator.

# 3000 points sampled from the surface of a sphere
sphere3000 <- sphere(n = 3000)

# 1500 points sampled from a toroidal helix with 30 coils (`nwinds = 30`):
helix1500 <- helix(n = 1500, nwinds = 30)

# 1500 points from a filled sphere:
ball1500 <- ball(n = 1500)

# 1000 points from a "Swiss Roll" distribution:
swiss1000 <- swiss_roll(n = 1000)

# 1000 points from a five-dimensional gaussian (`dim` sets the dimensionality):
gauss1000 <- gaussian_data(n = 1000, dim = 5)

# Load RnavGraphImageData
# NOTE(review): no code follows the comment above in this file. Presumably the
# face datasets below need the RnavGraphImageData package to be installed
# first -- confirm and restore the missing install/load line.

# Load the Frey faces dataset with each image as a row
frey <- frey_faces()
# Display the first pose
show_frey_face(frey, 1)

# PCA scores plot, with color indicating the frame index
# Column 561 is dropped before the PCA -- presumably it is a non-pixel column
# (TODO confirm against the frey_faces documentation). Only the first two
# principal components are kept (rank. = 2).
frey_pca <- prcomp(frey[, -561], retx = TRUE, rank. = 2)
plot(frey_pca$x, col = frey$color, pch = 16, cex = 0.75)

# Load the Olivetti faces dataset with each image as a row
olivetti <- olivetti_faces()
# Show the second pose of the first face
show_olivetti_face(olivetti, 1, 2)

# Generate datasets similar to those used in the main text of
# "How to Use t-SNE Effectively". The result is a named list with one dataset
# per entry; each name describes the shape of the data it holds. In every
# call `n` is a point count and `dim` (where given) is the dimensionality of
# the generated data.
misread_tsne <- list(
  two_clusters = two_clusters_data(n = 50, dim = 2),
  two_different_sized_clusters = two_different_clusters_data(n = 75, dim = 2),
  # the same three-cluster generator at two different sample sizes
  three_clusters_50 = three_clusters_data(n = 50, dim = 2),
  three_clusters_200 = three_clusters_data(n = 200, dim = 2),
  gaussian_cloud = gaussian_data(n = 500, dim = 100),
  ellipsoidal_gaussian_cloud = long_gaussian_data(n = 100, dim = 50),
  two_long_linear_clusters = long_cluster_data(n = 75),
  cluster_in_cluster = subset_clusters_data(n = 75, dim = 50),
  linked_rings = link_data(n = 100),
  trefoil_knot = trefoil_data(n = 150)
)

# fetch the MNIST data set from the MNIST website
# (needs network access; the result is kept in `mnist` for the steps below)
mnist <- download_mnist()

# view the fifth digit
show_mnist_digit(mnist, 5)

# Split by row position: head() keeps the first 60,000 rows and tail() the
# last 10,000.
# first 60,000 instances are the training set
mnist_train <- head(mnist, 60000)
# the remaining 10,000 are the test set
mnist_test <- tail(mnist, 10000)

# Draw 1000 training rows at random, then run a two-component PCA on the
# first 784 columns of the sample.
sampled_rows <- sample(nrow(mnist_train), 1000)
mnist_r1000 <- mnist_train[sampled_rows, ]
pca <- prcomp(mnist_r1000[, 1:784], retx = TRUE, rank. = 2)

# Scores plot of the first two components: set up empty axes, then draw each
# example as its label, using one rainbow color per level of `Label`.
scores <- pca$x[, 1:2]
label_palette <- rainbow(nlevels(mnist_r1000$Label))
plot(scores, type = "n")
text(
  scores,
  labels = mnist_r1000$Label,
  cex = 0.5,
  col = label_palette[mnist_r1000$Label]
)

# save to disk
# (writes the `mnist` object to "mnist.Rda" in the current working directory;
# it can be restored later with load("mnist.Rda"))
save(mnist, file = "mnist.Rda")

# fetch the Fashion-MNIST dataset
fashion <- download_fashion_mnist()

# Works as a drop-in replacement for the MNIST digits, can repeat the above
# view the fifth item etc.
show_mnist_digit(fashion, 5)

# similarly for Kuzushiji-MNIST dataset of Japanese cursive characters
# (set verbose flag to see download progress)
kuzushiji <- download_kuzushiji_mnist(verbose = TRUE)
# View the tenth character
show_mnist_digit(kuzushiji, 10)

# Download the small NORB dataset
# (verbose = TRUE is passed here as in the Kuzushiji-MNIST call, where it
# reports download progress)
norb <- download_norb_small(verbose = TRUE)
# View a single image, selected by category, instance, elevation, azimuth
# and lighting.
# NOTE(review): the original comment ended "compare with example at" with no
# target; the reference it pointed to is missing from this file -- restore it.
show_norb_object(norb, category = 2, instance = 6, elevation = 6, azimuth = 24, lighting = 2)

This package is licensed under the MIT License.

