# Document-wide knitr configuration.
# Knit-level options: report progress verbosely and resolve paths relative to
# the current directory.
knitr::opts_knit$set(
  progress = TRUE,
  verbose = TRUE,
  root.dir = "."
)

# Chunk defaults: show code and output together, no comment prefix on output,
# keep messages, hide warnings.
knitr::opts_chunk$set(
  echo = TRUE,
  include = TRUE,
  collapse = TRUE,
  comment = "",
  message = TRUE,
  warning = FALSE
)

# Seed the RNG once for the whole document.
set.seed(1)
# Install the packages this analysis needs, but only those that are not already
# available. The original re-installed all four on every run, which is slow,
# needs network access, and can silently change package versions.
required_pkgs <- c("Rtsne", "umap", "dbscan", "ggrepel")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}
# Attach the packages used below. library() is used instead of require() so
# that a missing package stops the run here with a clear error, rather than
# returning FALSE and failing later at the first use of a missing function.
library(tidyverse)
library(magrittr)
library(Rtsne)
library(umap)
library(ggdendro)
library(ggrepel)
# require(ggridges)
# require(ggumap)
# NOTE(review): in the flattened source both calls above sit after the `#`,
# so they are kept disabled; confirm whether `ggumap` was meant to be loaded.
# Re-seed so the random subsample below is reproducible.
set.seed(1)

# Restore the saved workspace objects; the file is expected to provide `mnist`.
load("../data/mnist.rda")
str(mnist, 0)

# Keep a random 30% of the rows to reduce the cost of the embedding.
mnist.s <- sample_frac(mnist, 0.3)

# The first column is kept as `labels`; all remaining columns form `.data`.
labels <- mnist.s[, 1]
.data <- mnist.s[, -1]
n <- NROW(.data)

# Quick look at the sampled data and the label frequencies.
str(.data, 0)
labels %>% table()
According to:
# Show the default UMAP configuration shipped with the `umap` package.
umap.defaults

# Start from the defaults and override only `min_dist`.
custom.config <- umap.defaults
custom.config$min_dist <- 0.01
# Attach reticulate (R <-> Python bridge). library() fails immediately if the
# package is missing, whereas require() would only return FALSE.
library(reticulate)

# One-time installation of Python modules (left disabled):
# py_install("pandas")
# py_install("scikit-learn")
# py_install("matplotlib")
# py_install("umap-learn")
# Embed the sampled data in two dimensions with UMAP (Python `umap-learn`
# backend through reticulate) and plot the embedding coloured by `labels`.
library(reticulate)

# Report which Python installation reticulate is bound to.
reticulate::py_config()

# Time the embedding. `custom.config` is the umap configuration object and
# `.data` the feature table, both defined earlier in this document.
# Alternative that was tried:
# uwot::umap(metric = "cosine", n_neighbors=10, min_dist=0.001)
system.time(
  mnist.umap <- umap::umap(
    .data,
    config = custom.config,
    method = "umap-learn"
  )
)
mnist.umap %>% str(2)

# One row per embedded observation: row id, the two layout coordinates and the
# label. seq_len() is used instead of 1:n so an empty layout yields zero ids
# rather than c(1, 0).
mapping.umap <- data.frame(
  id = seq_len(NROW(mnist.umap$layout)),
  dim1 = mnist.umap$layout[, 1],
  dim2 = mnist.umap$layout[, 2],
  label = labels
)
# mapping.umap %>% str

# Mean position of each label, used to place the text labels. An explicit
# summarise() replaces select() + summarize_all(): select() on a grouped frame
# re-adds the grouping column with a message, and summarize_all() is superseded.
labels.cent <- mapping.umap %>%
  dplyr::group_by(label) %>%
  dplyr::summarise(dim1 = mean(dim1), dim2 = mean(dim2))

# Scatter plot of the embedding with one repelled label per label centroid.
# guides(colour = "none") replaces the deprecated guides(colour = FALSE).
ggp.umap <- mapping.umap %>%
  ggplot(aes(x = dim1, y = dim2, colour = as.factor(label))) +
  geom_point(alpha = 0.3) +
  theme_bw() +
  ggrepel::geom_label_repel(
    data = labels.cent,
    aes(label = label),
    label.size = 0.1
  ) +
  guides(colour = "none") +
  labs(title = "UMAP (with TRUE labels)")
# scale_color_gradient2(midpoint=0.5, low="blue", mid="gray", high="red") +
ggp.umap
Reference:
According to the dbscan HDBSCAN vignette (https://cran.r-project.org/web/packages/dbscan/vignettes/hdbscan.html), `minPts` not only acts as a minimum cluster size to detect, but also as a "smoothing" factor of the density estimates implicitly computed from HDBSCAN.
# Cluster the 2-D UMAP coordinates with HDBSCAN.
# install.packages("dbscan", dependencies = TRUE)
# library() instead of require(): stop here if dbscan is not installed.
library(dbscan)
# mapping.umap %>% str

# Only the two embedding coordinates are passed to the clustering.
cl.hdbscan <- mapping.umap %>%
  dplyr::select(dim1, dim2) %>%
  hdbscan(minPts = 15)
print(cl.hdbscan)

# Use the plot() generic rather than reaching into the package namespace with
# `:::`; it dispatches to dbscan's plot method for "hdbscan" objects.
plot(cl.hdbscan, show_flat = TRUE)
# Plot the HDBSCAN cluster assignment on the UMAP embedding and show it next to
# the plot coloured by the true labels (`ggp.umap`, built earlier).
# install.packages("ggrepel", dependencies = TRUE)
# library() instead of require(): stop here if ggrepel is not installed.
library(ggrepel)

# Store the cluster id of every observation as a factor column.
mapping.umap$hdbscan <- factor(cl.hdbscan$cluster)

# Mean position of each cluster, excluding cluster id 0. An explicit
# summarise() replaces select() + summarize_all(): select() on a grouped frame
# re-adds the grouping column with a message, and summarize_all() is superseded.
hdbscan.cent <- mapping.umap %>%
  filter(hdbscan != 0) %>%
  dplyr::group_by(hdbscan) %>%
  dplyr::summarise(dim1 = mean(dim1), dim2 = mean(dim2))

# guides(colour = "none") replaces the deprecated guides(colour = FALSE).
ggp.umap.clust <- mapping.umap %>%
  ggplot(aes(x = dim1, y = dim2, colour = hdbscan)) +
  geom_point(alpha = 0.3) +
  theme_bw() +
  ggrepel::geom_label_repel(
    data = hdbscan.cent,
    aes(label = hdbscan),
    label.size = 0.1
  ) +
  guides(colour = "none") +
  labs(title = "UMAP + HDBSCAN (estimated cluster)")

# Draw both plots side by side.
ggp.compare <- gridExtra::grid.arrange(
  grobs = list(ggp.umap, ggp.umap.clust),
  ncol = 2
)
# NOTE(review): grid.arrange() above should already have drawn the figure; this
# line prints the returned object as well - confirm that is wanted.
ggp.compare
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.