In mkearney/funique: A Faster Unique Function

## Default knitr chunk options for this README: collapse source and output
## into one block, prefix output lines with "#>", and write figures under
## man/figures/ with a "README-" file-name prefix.
do.call(
  knitr::opts_chunk$set,
  list(collapse = TRUE, comment = "#>", fig.path = "man/figures/README-")
)
## Drop the `n` lowest and `n` highest timings of every expression.
##
## x: a data frame (e.g. a microbenchmark result) with an `expr` column
##    identifying the expression and a `time` column holding its timings.
## n: number of observations to remove from each end of every expression's
##    sorted timings; a single non-negative number.
##
## Returns the remaining rows, sorted by `expr` and then `time`, as a data
## frame with class c("hlmb", "tbl_df", "tbl", "data.frame"). An expression
## with 2 * n or fewer observations contributes no rows.
drop_hl <- function(x, n = 1) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)
  x <- as.data.frame(x)
  x <- x[order(x$expr, x$time), , drop = FALSE]
  ## Rank of each row inside its expression group and the size of that group.
  ## Trimming per group avoids assuming every expression has the same number
  ## of rows, and never yields a descending index range when `n` is large.
  idx <- seq_len(nrow(x))
  pos <- ave(idx, x$expr, FUN = seq_along)
  size <- ave(idx, x$expr, FUN = length)
  x <- x[pos > n & pos <= size - n, , drop = FALSE]
  ## Reset row names so the result carries plain 1..k row names.
  rownames(x) <- NULL
  structure(x, class = c("hlmb", "tbl_df", "tbl", "data.frame"))
}

## Plot method for "hlmb" objects (trimmed benchmark timings).
##
## Draws one boxplot per expression with the individual timings jittered on
## top; the axes are flipped so the expressions are listed vertically.
##
## x:   a data frame with an `expr` column and a `time` column. `time` is
##      divided by 1000 before plotting and the axis is labelled
##      "microseconds" (presumably the input is in nanoseconds, as produced
##      by microbenchmark -- confirm for other inputs).
## ...: ignored; accepted so the signature is compatible with the plot()
##      generic.
##
## Returns a ggplot object.
plot.hlmb <- function(x, ...) {
  x$time <- x$time / 1000
  ## Start the time axis at zero and leave 5% headroom above the slowest run.
  time_limits <- c(0, max(x$time, na.rm = TRUE) * 1.05)
  x$expr <- as.character(x$expr)
  ## Two fill colours are used for up to two expressions; with more
  ## expressions the pair is recycled so the manual scale never runs short.
  n_expr <- length(unique(x$expr))
  fill_values <- rep_len(c("greenyellow", "gray"), max(2L, n_expr))
  ggplot2::ggplot(x, ggplot2::aes(x = expr, y = time, fill = expr)) +
    ggplot2::geom_boxplot(outlier.shape = NA, alpha = .6) +
    ggplot2::geom_jitter(shape = 21, size = ggplot2::rel(3), alpha = .6) +
    ggplot2::theme_minimal(base_size = 11, base_family = "Roboto Condensed") +
    ggplot2::theme(legend.position = "none",
      text = ggplot2::element_text(colour = "#444444"),
      axis.title = ggplot2::element_text(size = ggplot2::rel(1.0),
        hjust = 0.95, face = "italic", colour = "black"),
      axis.text.x = ggplot2::element_text(size = ggplot2::rel(0.9),
        colour = "black"),
      axis.text.y = ggplot2::element_text(size = ggplot2::rel(1.1),
        colour = "black", angle = 90, hjust = .5),
      plot.title = ggplot2::element_text(size = ggplot2::rel(1.4),
        colour = "black", face = "bold"),
      plot.subtitle = ggplot2::element_text(size = ggplot2::rel(1.1),
        colour = "black"),
      plot.caption = ggplot2::element_text(hjust = 0, size = ggplot2::rel(.95)),
      panel.grid.minor.x = ggplot2::element_blank(),
      panel.grid.major.x = ggplot2::element_line(linetype = "dashed"),
      panel.grid.major.y = ggplot2::element_line(linetype = "dashed"),
      axis.line.x = ggplot2::element_line(colour = "#44444422")) +
    ggplot2::labs(y = "Time (microseconds)", x = "Expression",
      title = "Benchmarking expression evaluation times",
      subtitle = "Boxplots overlayed with jittered replication times",
      caption = "Estimates from the {microbenchmark} pkg") +
    ggplot2::scale_y_continuous(limits = time_limits) +
    ggplot2::coord_flip() +
    ggplot2::scale_fill_manual(values = fill_values)
}
library(funique)

funique

⌚️ A faster unique() function

Installation

You can install the released version of funique from GitHub with:

## make sure the remotes package is available before using it
remotes_missing <- !requireNamespace("remotes", quietly = TRUE)
if (remotes_missing) {
  install.packages("remotes")
}

## install funique straight from its GitHub repository
remotes::install_github("mkearney/funique")

Usage

There's one function, funique(), which behaves the same as base::unique() but is optimized to be faster when the data contain date-time variables.

Speed test: `funique()` vs. `base::unique()`

The code below creates a data frame with several duplicate rows and then compares performance (in time) of funique() versus base::unique().

## fix the RNG state so the generated data are reproducible
set.seed(20180812)

## generate data: 1000 random numbers next to 10 evenly spaced date-times
## (data.frame() recycles the 10 date-times to fill all 1000 rows)
d <- data.frame(
  x = rnorm(1000),
  y = seq.POSIXt(
    from = as.POSIXct("2018-01-01"),
    to = as.POSIXct("2018-12-31"),
    length.out = 10
  )
)

## append 500 rows drawn (with replacement) from the original 1000 so the
## data frame contains duplicate rows, then reset the row names
d <- d[c(seq_len(1000), sample.int(1000, 500, replace = TRUE)), ]
rownames(d) <- NULL

## confirm funique() returns exactly what base::unique() returns
identical(unique(d), funique(d))

## benchmark both functions (200 evaluations each, printed on a relative scale)
m <- microbenchmark::microbenchmark(
  unique(d),
  funique(d),
  times = 200,
  unit = "relative"
)
m

## plot the timings after dropping the 4 lowest and 4 highest per expression
p <- plot(drop_hl(m, n = 4))

## save the figure; the plot is passed explicitly rather than adding
## ggsave() to it with `+`, which depends on ggsave() picking up an implicit
## "last plot" and is not supported by current ggplot2 releases
ggplot2::ggsave("man/figures/r1.png", plot = p, width = 8, height = 4.5,
  units = "in")

## display the plot
p