# Chunk defaults used when knitting the README: collapsed output, "#>" as the
# output prefix, and figures written below man/figures/.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "75%",
  fig.retina = 2
)

library(tidyverse)
library(patchwork)
library(tibble)
library(magrittr)
library(ggplot2)
library(gghilbertstrings)
library(glue)

# Repetitions per benchmark; takes about 50 seconds on iMac i9 3.5GHz.
reps <- 10
size_exponent <- 11
A Hilbert curve (also known as a Hilbert space-filling curve) is a continuous fractal space-filling curve first described by the German mathematician David Hilbert in 1891, as a variant of the space-filling Peano curves discovered by Giuseppe Peano in 1890 (from Wikipedia).
This package provides easy access to Hilbert curves in `ggplot2`.
You can install the released version of gghilbertstrings from CRAN with:
# Install the released version of the package from CRAN.
install.packages("gghilbertstrings")
You can install the development version from GitHub with:
# The remotes package is required; run the next line only if it is not
# installed yet.
# install.packages("remotes")
remotes::install_github("Sumidu/gghilbertstrings")
The `gghilbertstrings` package comes with functions for fast plotting of Hilbert curves in ggplot. At its core is a fast Rcpp implementation that maps a 1D vector to a 2D position.
The `gghilbertplot` function creates a Hilbert curve and plots individual data points at the corners of this curve. It automatically rescales the `ID` variable to the full range of the Hilbert curve. The method also automatically picks a suitable level of detail that is able to represent all values of `ID`.
The following figure shows different Hilbert curves for different maximum `ID`s.
# Each panel plots only the first and the last id, so the curve drawn by
# add_curve spans the full range 1..n for that panel.
p1 <- tibble(id = c(1, 4)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 4") +
  theme_void()

p2 <- tibble(id = c(1, 16)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 16") +
  theme_void()

p3 <- tibble(id = c(1, 64)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 64") +
  theme_void()

p4 <- tibble(id = c(1, 256)) %>%
  gghilbertplot(id, add_curve = TRUE) +
  ggtitle("n = 256") +
  theme_void()

# Combine the four panels with patchwork and add a shared title.
p1 + p2 + p3 + p4 +
  plot_annotation(title = "Different depths of Hilbert curves")
The simplest way to plot data is to generate an `id` column that ranges from 1 to n, where n is the largest value to use in the Hilbert curve. Beware: the `id`s are rounded to integers.
library(gghilbertstrings)

# val is the ID column used here
df <- tibble(
  val = 1:256,
  size = runif(256, 1, 5),        # create random sizes
  color = rep(c(1, 2, 3, 4), 64)  # cycle through four colour groups
)

gghilbertplot(
  df, val,
  color = factor(color),  # render color as a factor
  size = size,
  add_curve = TRUE        # also render the curves
)
We run the creation of a coordinate system `r reps` times. This means creating one entry for every possible corner in the Hilbert curve.
library(microbenchmark)
library(HilbertCurve)
library(tidyverse)
library(gghilbertstrings)

# Benchmark both packages for Hilbert curve orders 1 to max_order.
# An order-i curve has 4^i positions, and every position is mapped once.
max_order <- 10
mb <- vector("list", max_order)  # preallocate one slot per order

for (i in seq_len(max_order)) {
  n <- 4^i
  values <- seq_len(n)  # ids 1..n; the only input the benchmark needs

  mb[[i]] <- microbenchmark(
    times = reps,
    HilbertCurve = {
      hc <- HilbertCurve(1, n, level = i, newpage = FALSE)
    },
    gghilbertstrings = {
      ggh <- hilbertd2xy(n = n, values)
    }
  )
}
# Performance benchmark
library(microbenchmark)

# Install HilbertCurve from GitHub only when it is missing, so that knitting
# does not trigger a reinstall on every run.
if (!requireNamespace("HilbertCurve", quietly = TRUE)) {
  remotes::install_github("jokergoo/HilbertCurve")
}

library(HilbertCurve)
library(tidyverse)
library(gghilbertstrings)

# Benchmark both packages for Hilbert curve orders 1 to max_order.
# An order-i curve has 4^i positions, and every position is mapped once.
max_order <- 10
mb <- vector("list", max_order)  # preallocate one slot per order

for (i in seq_len(max_order)) {
  n <- 4^i
  values <- seq_len(n)  # ids 1..n; the only input the benchmark needs

  mb[[i]] <- microbenchmark(
    times = reps,
    HilbertCurve = {
      hc <- HilbertCurve(1, n, level = i, newpage = FALSE)
    },
    gghilbertstrings = {
      ggh <- hilbertd2xy(n = n, values)
    }
  )
}

# Stack the per-order timings into one table, tagging every row with the
# order (depth) it was measured at. Built in one bind_rows() call instead of
# growing a data frame inside a loop.
res <- bind_rows(lapply(seq_along(mb), function(i) {
  mb[[i]] %>%
    as_tibble() %>%
    mutate(depth = i)
}))
library(scales)

# Orders that were actually benchmarked. Axis breaks and the caption are
# derived from the data so they cannot drift from the benchmark loop.
depths <- sort(unique(res$depth))
max_depth <- max(depths)

res %>%
  # microbenchmark reports time in nanoseconds; convert to milliseconds so
  # the values match the axis label.
  mutate(time = time / 1e6) %>%
  ggplot() +
  aes(x = depth, y = time, color = expr) +
  geom_jitter(width = 0.1, height = 0, alpha = 0.2) +
  geom_smooth() +
  scale_y_log10(labels = comma) +
  scale_x_continuous(breaks = depths, minor_breaks = NULL) +
  scale_color_viridis_d(begin = 0.5, end = 0.8, option = "D") +
  labs(
    x = "Order of Hilbert Curve",
    y = "Time in ms (log-scale)",
    color = "Package",
    title = "This package is two orders of magnitude faster",
    subtitle = glue("Comparison of {reps} repetitions across all orders."),
    caption = glue("Order {max_depth} means {comma(4^max_depth)} coordinates")
  )
We use the `eliasdabbas/search-engine-results-flights-tickets-keywords` data set on Kaggle as an example for a simple analysis. We map the full search URLs to the Hilbert curve and then add points when the URL was present for a specific search term. By comparing the resulting facets we can see systematic differences in which providers show up for which search term.
# Optional: download the data set with the kaggler package.
# library(kaggler)
# kgl_auth() # requires a kaggle token.
data_set <- "eliasdabbas/search-engine-results-flights-tickets-keywords"
# refs <- kgl_datasets_list(owner_dataset = data_set)
# refs$datasetFiles
# kgl_datasets_download(owner_dataset = data_set,
#   fileName = refs$datasetFiles$name[1], datasetVersionNumber = 14)

# Read one CSV file and record in `filename` which file each row came from.
read_plus <- function(flnm) {
  read_csv(flnm) %>%
    mutate(filename = flnm)
}

# files must be added in the my_tests folder for this section to work.
# `pattern` is a regular expression (not a glob): match names ending in ".csv".
df_with_sources <- list.files(
  path = "my_tests",
  pattern = "\\.csv$",
  full.names = TRUE
) %>%
  map_df(read_plus)

# Compare the first four distinct search terms.
comparison <- df_with_sources$searchTerms %>%
  unique() %>%
  head(4)

p <- df_with_sources %>%
  create_id_column(link) %>%
  filter(searchTerms %in% comparison) %>%
  gghilbertplot(
    gghid,
    color = displayLink,
    size = rank,
    label = displayLink,
    alpha = 0.1,
    jitter = 0.1,
    add_curve = FALSE,
    curve_alpha = 0.1
  ) +
  aes(group = displayLink) +
  facet_wrap(~searchTerms) +
  labs(title = "Comparison of domains shown for different search queries") +
  guides(color = "none")  # hide the colour legend

p
# plotly::ggplotly(p)
Link: https://www.kaggle.com/eliasdabbas/search-engine-results-flights-tickets-keywords under License CC0
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.