# Package-level knitr options: report progress, be verbose, and use "." as
# the root directory.
knitr::opts_knit$set(
  progress = TRUE,
  verbose = TRUE,
  root.dir = "."
)

# Chunk defaults: collapsed output with no comment prefix; messages are
# shown, warnings are hidden, and code is both included and echoed.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "",
  message = TRUE,
  warning = FALSE,
  include = TRUE,
  echo = TRUE
)

# Fix the RNG seed for reproducibility.
set.seed(1)

# Install the CRAN dependencies only when they are not already available.
# Unconditional install.packages() calls would hit the network and reinstall
# the packages on every render.
cran.pkgs <- c("Rtsne", "umap", "dbscan", "ggrepel")
is.installed <- vapply(cran.pkgs, requireNamespace, logical(1), quietly = TRUE)
if (!all(is.installed)) {
  install.packages(cran.pkgs[!is.installed], dependencies = TRUE)
}

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less helpful
# message.
library(tidyverse)
library(magrittr)

library(Rtsne)
library(umap)
library(ggdendro)
library(ggrepel)

library(ggridges)
library(ggumap)

Preparation

# Reset the RNG seed before sampling.
set.seed(1)

# Load the data file (expected to provide `mnist`) and show its top level.
load("../data/mnist.rda")
str(mnist, 0)

# Work on a random 30% of the rows.
mnist.s <- sample_frac(mnist, 0.3)

# Column 1 is kept as the labels; all remaining columns are the features.
# NOTE(review): `.data` is also the name of the rlang/dplyr pronoun --
# confirm the shadowing is intentional.
labels <- mnist.s[, 1]
.data <- mnist.s[, -1]

n <- NROW(.data)
str(.data, 0)
labels %>% table()

Dimension reduction using UMAP (unsupervised)

According to:

Configuration objects

# Print the package defaults for reference, then derive a custom
# configuration that only lowers `min_dist`.
umap.defaults
custom.config <- umap.defaults
custom.config$min_dist <- 0.01

# The "umap-learn" method used for the embedding is reached through Python,
# so reticulate is a hard requirement here: library() fails immediately if
# it is missing. One-time Python setup, if the modules are not installed:
# py_install("pandas")
# py_install("scikit-learn")
# py_install("matplotlib")
# py_install("umap-learn")
library(reticulate)

# Show which Python installation reticulate has picked up.
reticulate::py_config()

# Compute the embedding with the custom configuration and report the elapsed
# time. Alternative implementation kept for reference:
# uwot::umap(metric = "cosine", n_neighbors=10, min_dist=0.001)
system.time(
  mnist.umap <- umap::umap(
    .data,
    config = custom.config,
    method = "umap-learn"
  )
)

# Inspect the result two levels deep.
str(mnist.umap, 2)

# Collect the first two layout columns and the true labels in one data frame.
# seq_len() stays correct for an empty layout, where 1:0 would give c(1, 0).
mapping.umap <- data.frame(
  id    = seq_len(NROW(mnist.umap$layout)),
  dim1  = mnist.umap$layout[, 1],
  dim2  = mnist.umap$layout[, 2],
  label = labels)
# mapping.umap %>% str

# Mean position of every label, used to anchor the text labels in the plot.
# Summarising the two columns by name avoids the superseded summarize_all()
# and the "Adding missing grouping variables" message that select() emits on
# grouped data.
labels.cent <- mapping.umap %>%
  dplyr::group_by(label) %>%
  dplyr::summarise(dim1 = mean(dim1), dim2 = mean(dim2)) %>%
  dplyr::ungroup()

# Scatter plot of the embedding coloured by true label, with one repelled
# text label per centroid. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggp.umap <- mapping.umap %>%
  ggplot(aes(x = dim1, y = dim2, colour = as.factor(label))) +
  geom_point(alpha = 0.3) +
  theme_bw() +
  ggrepel::geom_label_repel(data = labels.cent,
                            aes(label = label),
                            label.size = 0.1) +
  guides(colour = "none") +
  labs(title = "UMAP (with TRUE labels)")
# scale_color_gradient2(midpoint=0.5, low="blue", mid="gray", high="red") +


ggp.umap

Hierarchical Density-based spatial clustering of applications with noise (HDBSCAN)

Reference:

according to:

https://cran.r-project.org/web/packages/dbscan/vignettes/hdbscan.html

`minPts` acts not only as the minimum cluster size to detect, but also as a "smoothing" factor for the density estimates implicitly computed by HDBSCAN.

# install.packages("dbscan", dependencies = TRUE)
library(dbscan)

# mapping.umap %>% str
# Run HDBSCAN on the two embedding coordinates only.
cl.hdbscan <- mapping.umap %>%
  dplyr::select(dim1, dim2) %>%
  hdbscan(minPts = 15)
print(cl.hdbscan)

# Dispatch through the plot() generic instead of reaching into the package's
# unexported namespace with `:::`.
plot(cl.hdbscan, show_flat = TRUE)
# install.packages("ggrepel", dependencies = TRUE)
library(ggrepel)

# Store the cluster assignment as a factor so it maps to a discrete colour.
mapping.umap$hdbscan <- factor(cl.hdbscan$cluster)

# Mean position of every cluster except id 0.
# NOTE(review): id 0 is presumably the HDBSCAN noise group -- confirm against
# the dbscan documentation.
hdbscan.cent <- mapping.umap %>%
  dplyr::filter(hdbscan != 0) %>%
  dplyr::group_by(hdbscan) %>%
  dplyr::summarise(dim1 = mean(dim1), dim2 = mean(dim2)) %>%
  dplyr::ungroup()

# Same scatter plot as for the true labels, coloured by estimated cluster.
# The colour legend is suppressed with "none"; passing FALSE to guides() is
# deprecated in current ggplot2.
ggp.umap.clust <- mapping.umap %>%
  ggplot(aes(x = dim1, y = dim2, colour = hdbscan)) +
  geom_point(alpha = 0.3) +
  theme_bw() +
  ggrepel::geom_label_repel(data = hdbscan.cent,
                            aes(label = hdbscan),
                            label.size = 0.1) +
  guides(colour = "none") +
  labs(title = "UMAP + HDBSCAN (estimated cluster)")

# Build the side-by-side layout without drawing it, then draw it once.
# grid.arrange() draws as a side effect, and printing its return value only
# shows a TableGrob text summary rather than the figure.
ggp.compare <- gridExtra::arrangeGrob(
  grobs = list(ggp.umap, ggp.umap.clust),
  ncol = 2)

grid::grid.newpage()
grid::grid.draw(ggp.compare)


katokohaku/ggumap documentation built on Nov. 4, 2019, 3:32 p.m.