In Sumidu/gghilbertstrings: A Fast 'ggplot2'-Based Implementation of Hilbert Curves

# Global knitr chunk defaults used while rendering this README.
knitr::opts_chunk$set(
  fig.path = "man/figures/README-", # path prefix for generated figures
  fig.retina = 2,
  out.width = "75%",
  comment = "#>",                   # prefix for printed output lines
  collapse = TRUE
)

library(tidyverse)
library(patchwork)
library(tibble)
library(magrittr)
library(ggplot2)
library(gghilbertstrings)
library(glue)

# Number of repetitions per benchmarked expression; passed to microbenchmark()
# as `times` and quoted in the benchmark plot's subtitle.
reps <- 10 # takes about 50 seconds on iMac i9 3.5GHz
# NOTE(review): size_exponent is not referenced in the visible code - confirm
# whether it is still needed.
size_exponent <- 11

gghilbertstrings

A Hilbert curve (also known as a Hilbert space-filling curve) is a continuous fractal space-filling curve first described by the German mathematician David Hilbert in 1891, as a variant of the space-filling Peano curves discovered by Giuseppe Peano in 1890 (from Wikipedia).

This package provides easy access to Hilbert curves in ggplot2.

Installation

You can install the released version of gghilbertstrings from CRAN with:

# Released version from CRAN.
install.packages("gghilbertstrings")

You can install the development version from GitHub with:

# install.packages("remotes") # run only if remotes is not installed yet
# Development version from GitHub.
remotes::install_github("Sumidu/gghilbertstrings")

Usage

The gghilbertstrings package comes with functions for fast plotting of Hilbert curves in ggplot. At its core is a fast Rcpp implementation that maps a 1D vector to a 2D position.

The gghilbertplot function creates a Hilbert curve and plots individual data points to the corners of this plot. It automatically rescales the used ID-variable to the full range of the Hilbert curve. The method also automatically picks a suitable level of detail able to represent all values of ID.

The following figure shows different Hilbert curves for different maximum IDs.

# One plot per maximum ID (4, 16, 64, 256). Each tibble holds only the first
# and the last ID; the curve itself is drawn via add_curve = TRUE.
p1 <- tibble(id = c(1, 4)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 4") +
  theme_void()
p2 <- tibble(id = c(1, 16)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 16") +
  theme_void()
p3 <- tibble(id = c(1, 64)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 64") +
  theme_void()
p4 <- tibble(id = c(1, 256)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 256") +
  theme_void()

# Combine the four panels (patchwork) under a shared title.
p1 + p2 + p3 + p4 +
  plot_annotation(title = "Different depths of Hilbert curves")

Plotting random data

The simplest way to plot data is to generate an id column that ranges from 1 to n, where n is the largest value to use in the Hilbert curve. Beware: The ids are rounded to integers.

library(gghilbertstrings)

# val is the ID column used here
df <- tibble(
  val = 1:256,
  size = runif(256, 1, 5),        # random point sizes between 1 and 5
  color = rep(c(1, 2, 3, 4), 64)  # colour groups 1-4, repeated cyclically
)

gghilbertplot(
  df, val,
  color = factor(color), # render color as a factor
  size = size,
  add_curve = TRUE       # also render the curves
)

Performance

We run the creation of a coordinate system `r reps` times. This means creating 1 entry for every possible corner in the Hilbert Curve.

library(microbenchmark)
library(HilbertCurve)
library(tidyverse)
library(gghilbertstrings)
# One microbenchmark result per curve order (1 to 10).
mb <- vector("list", 10)
for (i in seq_len(10)) {
  # One entry for every possible corner of the order-i curve.
  df <- tibble(
    val = 1:4^i,
    size = runif(4^i, 1, 5), # create random sizes
    color = rep(c(1, 2, 3, 4), times = 4^(i - 1))
  )
  values <- df$val
  mb[[i]] <- microbenchmark(
    HilbertCurve = {
      hc <- HilbertCurve(1, 4^i, level = i, newpage = FALSE)
    },
    gghilbertstrings = {
      ggh <- hilbertd2xy(n = 4^i, values)
    },
    times = reps
  )
}

# Performance benchmark
library(microbenchmark)
# HilbertCurve is installed from GitHub. Only hit the network when the package
# is missing, so re-running this chunk does not trigger a reinstall.
if (!requireNamespace("HilbertCurve", quietly = TRUE)) {
  remotes::install_github("jokergoo/HilbertCurve")
}
library(HilbertCurve)
library(tidyverse)
library(gghilbertstrings)

# One microbenchmark result per curve order (1 to 10).
mb <- vector("list", 10)
for (i in seq_len(10)) {
  # One entry for every possible corner of the order-i curve.
  df <- tibble(
    val = 1:4^i,
    size = runif(4^i, 1, 5), # create random sizes
    color = rep(c(1, 2, 3, 4), 4^(i - 1))
  )
  values <- df$val
  mb[[i]] <- microbenchmark(
    times = reps,
    HilbertCurve = {
      hc <- HilbertCurve(1, 4^i, level = i, newpage = FALSE)
    },
    gghilbertstrings = {
      ggh <- hilbertd2xy(n = 4^i, values)
    }
  )
}

# Stack the per-order timings into one table, tagging every row with the
# order it was measured at. The pieces are built as a list and bound once,
# instead of growing `res` inside a loop; seq_along() is safe for an empty list.
res <- bind_rows(
  lapply(seq_along(mb), function(i) {
    mb[[i]] %>%
      as_tibble() %>%
      mutate(depth = i)
  })
)

library(scales)
# microbenchmark reports `time` in nanoseconds, so dividing by 1000 yields
# microseconds; the y-axis label names that unit.
res %>%
  mutate(time = time / 1000) %>%
  ggplot() +
  aes(x = depth, y = time, color = expr) +
  # Jitter horizontally only, so the measured times are not distorted.
  geom_jitter(width = 0.1, height = 0, alpha = 0.2) +
  geom_smooth() +
  scale_y_log10(labels = comma) +
  scale_x_continuous(breaks = 3:14, minor_breaks = NULL) +
  scale_color_viridis_d(begin = 0.5, end = 0.8, option = "D") +
  labs(
    x = "Order of Hilbert Curve",
    y = "Time in microseconds (log-scale)",
    color = "Package",
    title = "This package is two orders of magnitude faster",
    subtitle = glue("Comparison of {reps} repetitions across all orders."),
    # NOTE(review): the benchmark loop covers orders 1-10, while this caption
    # and the x-axis breaks refer to order 14 - confirm which is intended.
    caption = "Order 14 means 268,435,456 coordinates"
  )

Useful example

We use the eliasdabbas/search-engine-results-flights-tickets-keywords data set on Kaggle as an example for a simple analysis. We map the full search URLs to the Hilbert curve and then add points when the URL was present for a specific search term. By comparing the resulting facets we can see systematic differences in which providers show up for which search term.

# Optional download of the example data via the Kaggle API. The calls are left
# commented out; the CSV files can also be placed in the folder by hand.
#library(kaggler)
#kgl_auth() # requires a kaggle token.

# Kaggle identifier of the example data set.
data_set <- "eliasdabbas/search-engine-results-flights-tickets-keywords"
#refs <- kgl_datasets_list(owner_dataset = data_set)
#refs$datasetFiles
#kgl_datasets_download(owner_dataset = data_set, 
#                      fileName = refs$datasetFiles$name[1], datasetVersionNumber = 14)

# Read one CSV file and record where each row came from.
#
# flnm: path of the CSV file to read.
# Returns the parsed table with an extra `filename` column holding `flnm`.
read_plus <- function(flnm) {
    parsed <- read_csv(flnm)
    mutate(parsed, filename = flnm)
}

# files must be added in the my_tests folder for this section to work.
# `pattern` is a regular expression, not a glob: match names ending in ".csv".
df_with_sources <-
    list.files(path = "my_tests",
               pattern = "\\.csv$",
               full.names = TRUE) %>%
    map_df(read_plus)


# Restrict the comparison to the first four distinct search terms.
comparison <- df_with_sources$searchTerms %>% unique() %>% head(4)


# Map every link to an ID on the curve (create_id_column() supplies the
# `gghid` column used below), keep the selected search terms, and draw one
# facet per search term.
p <- df_with_sources %>%
  create_id_column(link) %>%
  filter(searchTerms %in% comparison) %>%
  gghilbertplot(gghid,
                color = displayLink,
                size = rank,
                label = displayLink,
                alpha = 0.1,
                jitter = 0.1,
                add_curve = FALSE,
                curve_alpha = 0.1) +
  aes(group = displayLink) +
  facet_wrap(~searchTerms) +
  labs(title = "Comparison of domains shown for different search queries") +
  guides(color = "none") # hide the colour legend

p
#plotly::ggplotly(p)