In elbamos/clusteringdatasets: Datasets useful for testing clustering algorithms

Clustering Datasets

This vignette provides a simple overview of the datasets included in the package.

# Chunk options applied to every code chunk in this vignette.
knitr::opts_chunk$set(
    comment = NA,
    echo = FALSE,
    fig.width = 6,
    fig.height = 6
)

Birch

# Birch: load the three datasets and draw them next to each other.
library(clusteringdatasets)
data(birch1, birch2, birch3)

# One row of three panels; only the top margin is kept (room for the title).
par(mfrow = c(1, 3), mar = c(0, 0, 1, 0))
plot(
    birch1,
    main = "birch1", cex = 0.0000005,
    xlab = "", ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    birch2,
    main = "birch2", cex = 0.0000005,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    birch3,
    main = "birch3", cex = 0.0000005,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)

S Sets

The S-sets are useful for testing how an algorithm handles cluster overlap.

# S-sets: load all four datasets, then plot the first two columns of each,
# colored by that dataset's own `labels` column.
data(s1, s2, s3, s4)

# 2 x 2 grid of panels; only the top margin is kept (room for the title).
par(mfrow = c(2, 2), mar = c(0, 0, 1, 0))
plot(
    s1[, 1:2],
    main = "s1", col = s1$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    s2[, 1:2],
    main = "s2", col = s2$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    s3[, 1:2],
    main = "s3", col = s3$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    s4[, 1:2],
    main = "s4", col = s4$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)

A Sets

# A-sets: load the three datasets and plot the first two columns of each.
data(a1, a2, a3)

# One row of three panels; only the top margin is kept (room for the title).
par(mfrow = c(1, 3), mar = c(0, 0, 1, 0))

# Color each panel by the labels of the dataset being plotted. The previous
# version passed s1/s2/s3 labels here, which belong to the S-sets.
# NOTE(review): assumes a1/a2/a3 carry a `labels` column like the S-sets do;
# confirm the column name against the package data.
plot(
    a1[, 1:2],
    main = "a1", col = a1$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    a2[, 1:2],
    main = "a2", col = a2$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    a3[, 1:2],
    main = "a3", col = a3$labels, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)

Shapesets

# Shapesets: load the eight datasets, then plot the first two columns of each,
# colored by that dataset's `label` column.
data(Aggregation, spiral, D31, Compound, pathbased, jain, flame, R15)

# 3 x 3 grid of panels (eight are used); only the top margin is kept.
par(mfrow = c(3, 3), mar = c(0, 0, 1, 0))
plot(
    Aggregation[, 1:2],
    main = "Aggregation", col = Aggregation$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    spiral[, 1:2],
    main = "spiral", col = spiral$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    D31[, 1:2],
    main = "D31", col = D31$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    Compound[, 1:2],
    main = "Compound", col = Compound$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    pathbased[, 1:2],
    main = "pathbased", col = pathbased$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    jain[, 1:2],
    main = "jain", col = jain$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    flame[, 1:2],
    main = "flame", col = flame$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    R15[, 1:2],
    main = "R15", col = R15$label, cex = 0.1,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)

Chameleon

# Chameleon: load the four datasets and draw each one in its own panel.
data(t48k, t58k, t710k, t88k)

# 2 x 2 grid of panels; only the top margin is kept (room for the title).
par(mfrow = c(2, 2), mar = c(0, 0, 1, 0))
plot(
    t48k,
    main = "t48k", cex = 0.001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    t58k,
    main = "t58k", cex = 0.001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    t710k,
    main = "t710k", cex = 0.001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
    t88k,
    main = "t88k", cex = 0.001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)

Neural Gas

# Neural Gas: names of the datasets to display. "Ring" is not included (it was
# commented out of the original list as well).
neural_gas_sets <- c(
    "Circle", "Complex1", "Complex2", "Complex3", "Complex4", "Discrete",
    "HiLoDensity", "JumpingRectangle", "MovingJumpingRectangle",
    "MovingRectangle", "Rectangle", "RMouseRectangle"
)
data(list = neural_gas_sets)

# 3 x 5 grid of panels (twelve are used); only the top margin is kept.
par(mfrow = c(3, 5), mar = c(0, 0, 1, 0))
for (nm in neural_gas_sets) {
    # Fetch the loaded dataset by name with get() rather than parsing and
    # evaluating the name as code.
    plot(
        get(nm),
        main = nm, cex = 0.01,
        xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
    )
}

Non-Convex

# Non-Convex: names of the datasets to display.
non_convex_sets <- c("cross", "d4", "face", "pie", "ring2", "sincos")
data(list = non_convex_sets)

# 2 x 3 grid of panels; only the top margin is kept (room for the title).
par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
for (nm in non_convex_sets) {
    # Fetch the loaded dataset by name with get() rather than parsing and
    # evaluating the name as code.
    plot(
        get(nm),
        main = nm, cex = 0.1,
        xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
    )
}

Locations

# Locations: load both datasets and plot the first two columns of each.
data(mopsifinland, mopsijoensu)

# One row of two panels; only the top margin is kept (room for the title).
par(mfrow = c(1, 2), mar = c(0, 0, 1, 0))
plot(
    mopsifinland[, 1:2],
    main = "mopsifinland", cex = 0.01,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
# The title now names the dataset actually shown; it previously repeated
# "mopsifinland".
plot(
    mopsijoensu[, 1:2],
    main = "mopsijoensu", cex = 0.05,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)

High Dimensional Datasets

The package contains three sets of high-dimensional data. The visualizations below were made by reducing each dataset to two dimensions with my largeVis package; the colors show the clusters found by that package's hdbscan function.

UCI Datasets

# UCI datasets: compute a largeVis embedding and an hdbscan clustering for
# each dataset, collecting the results in `vises` and `clusters`.
library(largeVis)
library(clusteringdatasets)
library(ggplot2)
data(glass)
data(wdbc)
data(breast)
data(yeast)
data(wine)
data(thyroid)
toproc <- list(glass, wdbc, breast, yeast, wine, thyroid)
vises <- list()
clusters <- list()
# seq_along() is safe even if `toproc` were empty (1:length() would yield 1, 0).
for (i in seq_along(toproc)) {
    # Scale the columns, then transpose the resulting matrix.
    dat <- t(scale(as.matrix(toproc[[i]])))
    # Use a larger K when the transposed matrix has 50000 or more columns.
    if (ncol(dat) < 50000) {
        vis <- largeVis(dat, K = 50, verbose = TRUE)
    } else {
        vis <- largeVis(dat, K = 100, verbose = TRUE)
    }
    neighbors <- randomProjectionTreeSearch(dat, K = 50)
    edges <- buildEdgeMatrix(data = dat, neighbors = neighbors)
    # str() prints the structure itself and returns NULL invisibly, so wrapping
    # it in print() only added a stray "NULL" line to the output.
    str(edges)
    cluster <- hdbscan(
        edges = edges, neighbors = neighbors,
        K = 5, minPts = 10, verbose = TRUE
    )
    vises[[i]] <- vis
    clusters[[i]] <- cluster
}

# UCI datasets: plot the precomputed embeddings shipped with the package,
# colored by the precomputed cluster assignments.
library(ggplot2)
load(system.file("extdata/vises.Rda", package = "clusteringdatasets"))
load(system.file("extdata/clusters.Rda", package = "clusteringdatasets"))
dataset_names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid")

# 2 x 3 grid of panels; only the top margin is kept (room for the title).
par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
for (i in seq_along(dataset_names)) {
    # `df` stays defined after the loop; the KDDCUP04Bio plot further down
    # reads df$label, so the variable name is kept.
    df <- data.frame(t(vises[[i]]$coords))
    colnames(df) <- c("x", "y")
    df$label <- clusters[[i]]$clusters
    # The former if/else on the number of distinct labels had two identical
    # branches, so a single plot() call is equivalent.
    plot(
        df[, 1:2],
        main = dataset_names[i], col = df$label, cex = 0.005,
        xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
    )
}

KDDCUP04Bio

# KDDCUP04Bio: build the scaled, transposed matrix and run largeVis on it.
data("kddcup04bio")
library(largeVis)
# NOTE(review): this loads a file from the working directory before `vis` is
# recomputed below; what the file contains is not visible here.
load("./kddvis.Rda")
dat <- as.matrix(kddcup04bio)
dat <- scale(dat)
dat <- t(dat)
vis <- largeVis(
    dat,
    K = 50,
    n_trees = 50,
    tree_threshold = 50,
    max_iter = 2,
    verbose = TRUE
)

# KDDCUP04Bio: plot the precomputed two-dimensional coordinates shipped with
# the package in a single panel.
load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets"))
par(mfrow = c(1, 1), mar = c(0, 0, 1, 0))
# NOTE(review): `df` is not created in this chunk; it appears to be the data
# frame left over from the UCI plotting loop, so these colors may not belong
# to kddcup04bio. Confirm the intended label source.
plot(
    kdvis[, 1:2],
    main = "kddcup04bio", col = df$label, cex = 0.0001,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n",
    xlim = c(-20, 18), ylim = c(-20, 30)
)

elbamos/clusteringdatasets documentation built on May 16, 2019, 2:58 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

elbamos/clusteringdatasets
Datasets useful for testing clustering algorithms

In elbamos/clusteringdatasets: Datasets useful for testing clustering algorithms

Clustering Datasets

Birch

S Sets

A Sets

Shapesets

Chameleon

Neural Gas

Non-Convex

Locations

High Dimensional Datasets

UCI Datasets

KDDCUP04Bio

R Package Documentation

Browse R Packages

We want your feedback!

elbamos/clusteringdatasets Datasets useful for testing clustering algorithms

In elbamos/clusteringdatasets: Datasets useful for testing clustering algorithms

Clustering Datasets

Birch

S Sets

A Sets

Shapesets

Chameleon

Neural Gas

Non-Convex

Locations

High Dimensional Datasets

UCI Datasets

KDDCUP04Bio

R Package Documentation

Browse R Packages

We want your feedback!

elbamos/clusteringdatasets
Datasets useful for testing clustering algorithms