knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  paged.print = FALSE
)

Speed Comparisons

A few notes:

pacman::p_load(tidytable, dtplyr, data.table)

setDTthreads(4)

set.seed(123)

random_strings <- c('OoVt', 'wCbu', 'cXxX', 'jdFu', 'MCRx',
                    'ukhz', 'ikce', 'PHyu', 'jpBY', 'nLQM')

bench_mark <- function(..., .fn = NULL) {
  bench::mark(...,
              check = FALSE, iterations = 11,
              memory = FALSE, time_unit = 'ms') %>%
    suppressWarnings() %>%
    mutate(expression = as.character(expression),
            median = round(median, 1),
            func_tested = .env$.fn) %>%
    select(func_tested, expression, median) %>%
    pivot_wider(names_from = expression, values_from = median)
}

test_data <- function(.size) {
  tidytable(a = sample(1:20, .size, TRUE),
            b = sample(1:20, .size, TRUE),
            c = sample(random_strings, .size, TRUE),
            d = sample(random_strings, .size, TRUE))
}

data_size <- 10000000

test_dt <- test_data(data_size)

test_tbl <- dplyr::as_tibble(test_dt)
inv_gc(); inv_gc();

filter_marks <- bench_mark(
  tidyverse = dplyr::filter(test_tbl, a <= 7, c == 'OoVt'),
  dtplyr = as.data.table(dplyr::filter(lazy_dt(test_dt), a <= 7, c == 'OoVt')),
  tidytable = filter(test_dt, a <= 7, c == 'OoVt'),
  data.table = test_dt[a <= 7 & c == 'OoVt'],
  .fn = "filter"
)

inv_gc(); inv_gc();

summarize_marks <- bench_mark(
  tidyverse = test_tbl %>%
    dplyr::group_by(c, d) %>%
    dplyr::summarize(avg_a = mean(a), med_b = median(b), .groups = "drop"),
  dtplyr = lazy_dt(test_dt) %>%
    dplyr::group_by(c, d) %>%
    dplyr::summarize(avg_a = mean(a), med_b = median(b), .groups = "drop") %>%
    as.data.table(),
  tidytable = summarize(test_dt, avg_a = mean(a), med_b = median(b), .by = c(c, d)),
  data.table = test_dt[, list(avg_a = mean(a), med_b = median(b)), by = .(c, d)],
  .fn = "summarize"
)

inv_gc(); inv_gc();

mut_dt <- copy(test_dt)

mutate_marks <- bench_mark(
  tidyverse = dplyr::mutate(test_tbl, double_a = a * 2, a_plus_b = a + b),
  dtplyr = as.data.table(dplyr::mutate(lazy_dt(test_dt), double_a = a * 2, a_plus_b = a + b)),
  tidytable = mutate(test_dt, double_a = a * 2, a_plus_b = a + b),
  data.table = mut_dt[, ':='(double_a = a * 2, a_b = a + b)][],
  .fn = "mutate"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

test_dt <- test_data(data_size * .1)

test_tbl <- dplyr::as_tibble(test_dt)

arrange_marks <- bench_mark(
  tidyverse = dplyr::arrange(test_tbl, c, a),
  dtplyr = as.data.table(dplyr::arrange(lazy_dt(test_dt), c, a)),
  tidytable = arrange(test_dt, c, a),
  data.table = test_dt[order(c, a)],
  .fn = "arrange"
)

inv_gc(); inv_gc();

distinct_marks <- bench_mark(
  tidyverse = dplyr::distinct(test_tbl),
  dtplyr = as.data.table(dplyr::distinct(lazy_dt(test_dt))),
  tidytable = distinct(test_dt),
  data.table = unique(test_dt),
  .fn = "distinct"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

fill_size <- data_size * .2
fill_dt <- tidytable(
  id = sample(1:3, fill_size, replace = TRUE),
  int1 = sample(c(1:5, NA), fill_size, replace = TRUE),
  int2 = sample(c(1:5, NA), fill_size, replace = TRUE)
)

fill_tbl <- dplyr::as_tibble(fill_dt)

fill_marks <- bench_mark(
  tidyverse = tidyr::fill(group_by(fill_tbl, id), int1, int2),
  dtplyr = as.data.table(tidyr::fill(dplyr::group_by(lazy_dt(fill_dt), id), int1, int2)),
  tidytable = fill(fill_dt, int1, int2, .by = id),
  # needs a copy or subsequent runs won't have NAs to fill
  data.table = tidytable:::shallow(fill_dt)[
    ,
    c("new1", "new2") := lapply(.SD, nafill, type = "locf"),
    .SDcols = c("int1", "int2"),
    by = id],
  .fn = "fill"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

case_size <- data_size * .3
x <- runif(case_size * .3)

case_marks <- bench_mark(
  tidyverse = dplyr::case_when(x < .5 ~ 1,
                        x >= .5 ~ 2,
                        TRUE ~ 3),
  tidytable = case(x < .5, 1,
                   x >= .5, 2,
                   default = 3),
  data.table = fcase(x < .5, 1,
                     x >= .5, 2,
                     rep(TRUE, length(x)), 3),
  .fn = "case_when"
)
remove(x)

remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

wider_dt <- 1:3 %>%
  map_dfr(
    ~ expand_grid(a = letters, b = LETTERS, c = paste0(letters, LETTERS)) %>%
    unite("name") %>%
    mutate(value = sample(as.double(1:n()), n())) %>%
    mutate(id = .x)
  ) %>%
  relocate(id)

wider_tbl <- dplyr::as_tibble(wider_dt)

wider_marks <- bench_mark(
  tidyverse = tidyr::pivot_wider(wider_tbl, names_from = name, values_from = value),
  dtplyr = lazy_dt(wider_dt) %>%
    tidyr::pivot_wider(names_from = name, values_from = value) %>%
    as.data.table(),
  tidytable = pivot_wider(wider_dt, names_from = name, values_from = value),
  data.table = data.table::dcast.data.table(wider_dt, id ~ name),
  .fn = "pivot_wider"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

longer_size <- 100000
longer_dt <-
  tidytable(id = 1:longer_size) %>%
  bind_cols(
    map_dfc(letters, ~ tidytable(!!.x := sample(c(letters, LETTERS), longer_size, TRUE)))
  )

longer_tbl <- dplyr::as_tibble(longer_dt)

longer_marks <- bench_mark(
  tidyverse = tidyr::pivot_longer(longer_tbl, cols = -id),
  dtplyr = as.data.table(tidyr::pivot_longer(lazy_dt(longer_dt), cols = -id)),
  tidytable = pivot_longer(longer_dt, cols = -id, fast_pivot = TRUE),
  data.table = melt(longer_dt,
                    measure.vars = names(longer_dt)[names(longer_dt) != "id"],
                    variable.name = "name",
                    variable.factor = FALSE),
  .fn = "pivot_longer"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

left_dt <- test_data(data_size * .1)

right_dt <- left_dt %>%
  distinct(c, d) %>%
  mutate(e = row_number())

left_tbl <- dplyr::as_tibble(left_dt)
right_tbl <- dplyr::as_tibble(right_dt)

left_join_marks <- bench_mark(
  tidyverse = dplyr::left_join(left_tbl, right_tbl, by = c("c", "d")),
  dtplyr = as.data.table(dplyr::left_join(lazy_dt(left_dt), lazy_dt(right_dt), by = c("c", "d"))),
  tidytable = left_join(left_dt, right_dt, by = c("c", "d")),
  data.table = right_dt[left_dt, on = c("c", "d"), allow.cartesian = TRUE],
  .fn = "left_join"
)

inv_gc(); inv_gc();

inner_join_marks <- bench_mark(
  tidyverse = dplyr::inner_join(left_tbl, right_tbl, by = c("c", "d")),
  dtplyr = as.data.table(dplyr::inner_join(lazy_dt(left_dt), lazy_dt(right_dt), by = c("c", "d"))),
  tidytable = inner_join(left_dt, right_dt, c("c", "d")),
  data.table = left_dt[right_dt, on = c("c", "d"), allow.cartesian = TRUE, nomatch = 0],
  .fn = "inner_join"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

unnest_dt <- test_data(data_size * .3) %>%
  nest_by(c, d, .key = "list_column")

unnest_tbl <- unnest_dt %>%
  mutate(list_column = map(list_column, dplyr::as_tibble)) %>%
  dplyr::as_tibble()

unnest_marks <- bench_mark(
  tidyverse = tidyr::unnest(unnest_tbl, list_column),
  tidytable = unnest(unnest_dt, list_column),
  data.table = unnest_dt[, unlist(list_column, recursive = FALSE), by = .(c, d)],
  .fn = "unnest"
)
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

nest_dt <- test_data(data_size * .1)

nest_tbl <- dplyr::as_tibble(nest_dt)

nest_marks <- bench_mark(
  tidyverse = dplyr::group_nest(nest_tbl, c, d),
  dtplyr = as.data.table(tidyr::nest(lazy_dt(nest_dt), data = c(c, d))),
  tidytable = nest_by(nest_dt, c, d),
  data.table = nest_dt[, list(data = list(.SD)), by = .(c, d)],
  .fn = "nest"
)
# # pandas code (run in a .ipynb in vscode)
# import pandas as pd
# import numpy as np
# import gc
# from timeit import timeit
# 
# num_runs = 5
# def run_benchmarks(_dict):
#     return {key: np.round(timeit(value, number = num_runs) * 1000/num_runs, 1) for key, value in _dict.items()}
# 
# np.random.seed(123)
# random_strings = np.array(['OoVt', 'wCbu', 'cXxX', 'jdFu', 'MCRx', 'ukhz', 'ikce', 'PHyu', 'jpBY', 'nLQM'])
# 
# initial_data_size = 10000000
# data_size = initial_data_size
# 
# test_df = (
#     pd.DataFrame({
#         'a': np.random.choice(np.arange(20), data_size),
#         'b': np.random.choice(np.arange(20), data_size),
#         'c': np.random.choice(random_strings, data_size),
#         'd': np.random.choice(random_strings, data_size)
#     })
#     .convert_dtypes()
# )
# 
# bench_fns1 = dict(
#     filter = lambda: test_df.loc[(test_df.a <= 7) & (test_df.c == 'OoVt')],
#     summarize = lambda: test_df.groupby(['c', 'd']).agg(avg_a = ('a', np.mean), med_b = ('b', np.median)),
#     mutate = lambda: test_df.assign(double_a = test_df.a * 2, a_plus_b = test_df.a + test_df.b)
# )
# 
# bench_marks1 = run_benchmarks(bench_fns1)
# 
# data_size = initial_data_size * .1
# test_df = test_df.head(int(data_size))
# gc.collect()
# 
# bench_fns2 = dict(
#     arrange = lambda: test_df.sort_values(['c', 'a']),
#     distinct = lambda: test_df.drop_duplicates()
# )
# 
# bench_marks2 = run_benchmarks(bench_fns2)
# 
# del test_df
# gc.collect()
# 
# x = np.random.normal(size = 3000000)
# median_x = np.median(x)
# 
# bench_fns3 = dict(
#     case_when = lambda: np.select(
#         [x < median_x, x <= median_x],
#         [1, 2],
#         default = 3
#     )
# )
# 
# bench_marks3 = run_benchmarks(bench_fns3)
# 
# del x, median_x
# gc.collect()
# 
# data_size = int(initial_data_size * .2)
# fill_df = (
#     pd.DataFrame(dict(
#         id = np.random.choice(np.arange(3), data_size),
#         int1 = np.random.choice(np.array([1, 2, 3, 4, 5, np.nan]), data_size),
#         int2 = np.random.choice(np.array([1, 2, 3, 4, 5, np.nan]), data_size)
#     ))
#     .convert_dtypes()
# )
# 
# bench_fns4 = dict(
#     fill = lambda: fill_df.groupby('id')[['int1','int2']].fillna(method = 'ffill')
# )
# 
# bench_marks4 = run_benchmarks(bench_fns4)
# 
# del fill_df
# gc.collect()
# 
# bench_marks = bench_marks1 | bench_marks2 | bench_marks3 | bench_marks4
# 
# dict(sorted(bench_marks.items()))
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$"));
inv_gc(); inv_gc();

marks_names <- setdiff(
  unlist(ls(pattern="marks$")),
  c("all_marks", "pandas_marks")
)

all_marks <- marks_names %>%
  map_dfr(get) %>%
  arrange(func_tested)
pandas_marks <-
  tidytable(
    arrange = 715.8,
    case_when = 64.4,
    distinct = 308.7,
    fill = 724,
    filter = 903.8,
    mutate = 780.1,
    summarize = 3079.8
  ) %>%
  pivot_longer(everything(), names_to = "func_tested", values_to = "pandas")

all_marks <- all_marks %>%
  left_join(pandas_marks) %>%
  select(func_tested, data.table, tidytable, dtplyr, tidyverse, pandas)
print(glue::glue("Date last run: {lubridate::today()}"))
all_marks


mtfairbanks/gdt documentation built on April 12, 2024, 6:51 p.m.