hutils.R
In hutils: Miscellaneous R Functions and Aliases

## ----knitrOpts----------------------------------------------------------------
library(knitr)
suggested_packages <- c("geosphere", "nycflights13", "dplyr", "ggplot2", "microbenchmark")
opts_chunk$set(eval = all(vapply(suggested_packages, requireNamespace, quietly = TRUE, FUN.VALUE = FALSE)))

## ----loadPackages-------------------------------------------------------------
tryCatch({
  library(geosphere)
  library(nycflights13)
  library(dplyr, warn.conflicts = FALSE)
  library(ggplot2)
  library(microbenchmark)
  library(data.table, warn.conflicts = FALSE)
  library(magrittr)
  library(hutils, warn.conflicts = FALSE)
}, 
# requireNamespace does not detect errors like
# package ‘dplyr’ was installed by an R version with different internals; it needs to be reinstalled for use with this R version
error = function(e) {
  opts_chunk$set(eval = FALSE)
})
options(digits = 4)

## ----aliases------------------------------------------------------------------
OR(OR(TRUE,
      stop("Never happens")),  ## short-circuits
   AND(FALSE,
       stop("Never happens")))  

## ----compare_if_else----------------------------------------------------------
my_check <- function(values) {
  all(vapply(values[-1], function(x) identical(values[[1]], x), logical(1)))
}

set.seed(2)
cnd <- sample(c(TRUE, FALSE, NA), size = 100e3, replace = TRUE)
yes <- sample(letters, size = 100e3, replace = TRUE)
no <- sample(letters, size = 100e3, replace = TRUE)
na <- sample(letters, size = 100e3, replace = TRUE)

microbenchmark(dplyr =  dplyr::if_else(cnd, yes, no, na),
               hutils = hutils::if_else(cnd, yes, no, na),
               check = my_check) %>%
  print

cnd <- sample(c(TRUE, FALSE, NA), size = 100e3, replace = TRUE)
yes <- sample(letters, size = 1, replace = TRUE)
no <- sample(letters, size = 100e3, replace = TRUE)
na <- sample(letters, size = 1, replace = TRUE)

microbenchmark(dplyr =  dplyr::if_else(cnd, yes, no, na),
               hutils = hutils::if_else(cnd, yes, no, na),
               check = my_check) %>%
  print

## ----compare_coalesce---------------------------------------------------------
x <- sample(c(letters, NA), size = 100e3, replace = TRUE)
A <- sample(c(letters, NA), size = 100e3, replace = TRUE)
B <- sample(c(letters, NA), size = 100e3, replace = TRUE)
C <- sample(c(letters, NA), size = 100e3, replace = TRUE)

microbenchmark(dplyr =  dplyr::coalesce(x, A, B, C),
               hutils = hutils::coalesce(x, A, B, C),
               check = my_check) %>%
  print

## ----compare_coalesce_short_circuit_x-----------------------------------------
x <- sample(c(letters), size = 100e3, replace = TRUE)

microbenchmark(dplyr =  dplyr::coalesce(x, A, B, C),
               hutils = hutils::coalesce(x, A, B, C),
               check = my_check) %>%
  print

## ----compare_coalesce_short_circuit_A-----------------------------------------
x <- sample(c(letters, NA), size = 100e3, replace = TRUE)
A <- sample(c(letters), size = 100e3, replace = TRUE)

microbenchmark(dplyr =  dplyr::coalesce(x, A, B, C),
               hutils = hutils::coalesce(x, A, B, C),
               check = my_check) %>%
  print

## ----canonical_drop_DT--------------------------------------------------------
DT <- data.table(A = 1:5, B = 1:5, C = 1:5)
DT[, A := NULL]

## ----drop_col_hutils----------------------------------------------------------
DT <- data.table(A = 1:5, B = 1:5, C = 1:5)
DT %>%
  drop_col("A") %>%
  drop_col("B")

# or
DT <- data.table(A = 1:5, B = 1:5, C = 1:5)
DT %>%
  drop_cols(c("A", "B"))

## ----drop_colr----------------------------------------------------------------
flights <- as.data.table(flights)

flights %>%
  drop_colr("time") %>%
  drop_colr("arr(?!_delay)", perl = TRUE)

## ----drop_constant_cols-------------------------------------------------------
flights %>%
  .[origin == "JFK"] %>%
  drop_constant_cols

## ----drop_empty_cols----------------------------------------------------------
planes %>% 
  as.data.table %>% 
  .[!complete.cases(.)]

planes %>% 
  as.data.table %>% 
  .[!complete.cases(.)] %>% 
  # drops speed
  drop_empty_cols

## ----duplicated_rows----------------------------------------------------------
flights %>%
  # only the 'second' of the duplicates is returned
  .[duplicated(., by = c("origin", "dest"))]  

flights %>%
  # Both rows are returned and (by default)
  # duplicates are presented adjacently
  duplicated_rows(by = c("origin", "dest"))

## ----haversine_distance-------------------------------------------------------
DT1 <- data.table(lat_orig = runif(1e5, -80, 80),
                  lon_orig = runif(1e5, -179, 179),
                  lat_dest = runif(1e5, -80, 80),
                  lon_dest = runif(1e5, -179, 179))

DT2 <- copy(DT1)

microbenchmark(DT1[, distance := haversine_distance(lat_orig, lon_orig,
                                                    lat_dest, lon_dest)],
               
               DT2[, distance := distHaversine(cbind(lon_orig, lat_orig),
                                               cbind(lon_orig, lat_orig))])
rm(DT1, DT2)

## ----mutate-other, results='asis'---------------------------------------------
set.seed(1)
DT <- data.table(Fruit = sample(c("apple", "pear", "orange", "tomato", "eggplant"),
                                size = 20,
                                prob = c(0.45, 0.25, 0.15, 0.1, 0.05),
                                replace = TRUE),
                 Price = rpois(20, 10))

kable(mutate_other(DT, "Fruit", n = 3)[])

## ----iris-veriscolor----------------------------------------------------------
iris <- as.data.table(iris)
iris[Species %in% c("setosa", "versicolour")] %$%
  mean(Sepal.Length / Sepal.Width)

## ----iris-versicolor, error=TRUE----------------------------------------------
iris <- as.data.table(iris)
iris[Species %ein% c("setosa", "versicolour")] %$%
  mean(Sepal.Length / Sepal.Width)

## ----pin----------------------------------------------------------------------
identical(iris[grep("v", Species)],
          iris[Species %pin% "v"])

## ----pin-multi----------------------------------------------------------------
iris[Species %pin% c("ver", "vir")] %>%
  head

## -----------------------------------------------------------------------------
DT <- data.table(x = 1:5,
                 y = letters[1:5],
                 AB = c(NA, TRUE, FALSE, TRUE, FALSE))
select_which(DT, anyNA, .and.dots = "y")

## -----------------------------------------------------------------------------
dt <- data.table(y = !sample(0:1, size = 100, replace = TRUE), 
                 x = runif(100))
dt[, pred := predict(lm(y ~ x, data = .SD), newdata = .SD)]

dt[, auc(y, pred)]

## ----select_grep--------------------------------------------------------------
flights %>%
  select_grep("arr")

## ----select_grep-and----------------------------------------------------------
flights %>%
  select_grep("arr", .and = "year", .but.not = "arr_time")

## -----------------------------------------------------------------------------
RQ(dplyr, "dplyr must be installed")
RQ("dplyr", "dplyr needs installing", "dplyr installed.")

## ----ahull-1------------------------------------------------------------------
if (!identical(Sys.info()[["sysname"]], "Darwin"))
  ggplot(data.table(x = c(0, 1, 2, 3, 4), y = c(0, 1, 2, 0.1, 0))) +
  geom_area(aes(x, y)) +
  geom_rect(data = ahull(, c(0, 1, 2, 3, 4), c(0, 1, 2, 0.1, 0)),
            aes(xmin = xmin,
                xmax = xmax,
                ymin = ymin,
                ymax = ymax),
            color = "red") 

## ----ahull-demos, fig.width = 8, fig.height = 6-------------------------------
set.seed(101)
ahull_dt <-
  data.table(x = c(0:100) / 100,
             y = cumsum(rnorm(101, 0.05)))
if (!identical(Sys.info()[["sysname"]], "Darwin"))
ggplot(ahull_dt) +
  geom_area(aes(x, y)) + 
  geom_rect(data = ahull(ahull_dt), 
            aes(xmin = xmin, 
                xmax = xmax, 
                ymin = ymin, 
                ymax = ymax), 
            color = "red") + 
  geom_rect(data = ahull(ahull_dt,
                         incl_negative = TRUE), 
            aes(xmin = xmin, 
                xmax = xmax, 
                ymin = ymin, 
                ymax = ymax), 
            color = "blue") + 
  geom_rect(data = ahull(ahull_dt,
                         incl_negative = TRUE,
                         minH = 4), 
            aes(xmin = xmin, 
                xmax = xmax, 
                ymin = ymin, 
                ymax = ymax), 
            color = "green") + 
  geom_rect(data = ahull(ahull_dt,
                         incl_negative = TRUE,
                         minW = 0.25), 
            aes(xmin = xmin, 
                xmax = xmax, 
                ymin = ymin, 
                ymax = ymax), 
            color = "white",
            fill = NA)



## ----weighted_quantile-ex-----------------------------------------------------
x <- 1:10
w <- c(rep(1, 5), rep(2, 5))
quantile(x, prob = c(0.25, 0.75), names = FALSE)

weighted_quantile(x, w, p = c(0.25, 0.75))

## ----mutate_ntile-ex----------------------------------------------------------
flights %>%
  as.data.table %>%
  .[, .(year, month, day, origin, dest, distance)] %>%
  mutate_ntile(distance, n = 5L)


## ----mutate_ntile-ex-charonly-------------------------------------------------
flights %>%
  as.data.table %>%
  .[, .(year, month, day, origin, dest, distance)] %>%
  mutate_ntile(distance, n = 5L)

## ----mutate_ntile-ex-2--------------------------------------------------------
flights %>%
  as.data.table %>%
  mutate_ntile("distance",
               n = 5L,
               character.only = TRUE) %>%
  .[, dep_delay := coalesce(dep_delay, 0)] %>%
  .[, .(avgDelay = mean(dep_delay)), keyby = "distanceQuintile"]


## ----longest-affix------------------------------------------------------------
trim_common_affixes(c("CurrentHousingCosts(weekly)",
                      "CurrentFuelCosts(weekly)"))

## ----swap---------------------------------------------------------------------
a <- 1
b <- 2
a %<->% b
identical(c(a, b), c(2, 1))

## ----average-bearing----------------------------------------------------------
average_bearing(0, 270)  # NW
mean(c(0, 270))          # SE (i.e. wrong)

## ----Mode-eg------------------------------------------------------------------
Mode(c(1, 1, 1, 2, 3))

## ----samp-eg------------------------------------------------------------------
DT <- data.table(x = c(5, 2, 3),
                 y = c(5, 3, 4))
DT[, .(Base = sample(.BY[["x"]]:.BY[["y"]])), keyby = .(x, y)]
DT[, .(Base = samp(.BY[["x"]]:.BY[["y"]])), keyby = .(x, y)]