knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = FALSE, paged.print = FALSE )
A few notes:
summarize()
tests were performed on a different dataset from case_when()
.setDTthreads(4)
was used for data.table
& tidytable
timings.data.table
when being compared to mutate.()
& dplyr::mutate()
fill.()
& tidyr::fill()
both work with character/factor/logical columns, whereas data.table::nafill()
does not. Testing only included numeric columns due to this constraint.dtplyr
is missing timings for functions that are not yet implemented in the package.pandas
comparisons are in the process of being added - more will be added soon.tidytable
functions faster than their data.table
counterpart?data.table
in the background.tidytable
runs were slightly shorter on those specific functions on this
iteration of the tests. However one goal of these tests is to show that the "time cost" of
translating tidyverse
syntax to data.table
is very negligible to the
user (especially on medium-to-large datasets).pacman::p_load(tidytable, dtplyr, data.table) setDTthreads(4) set.seed(123) random_strings <- c('OoVt', 'wCbu', 'cXxX', 'jdFu', 'MCRx', 'ukhz', 'ikce', 'PHyu', 'jpBY', 'nLQM') bench_mark <- function(..., .fn = NULL) { bench::mark(..., check = FALSE, iterations = 11, memory = FALSE, time_unit = 'ms') %>% suppressWarnings() %>% mutate(expression = as.character(expression), median = round(median, 1), func_tested = .env$.fn) %>% select(func_tested, expression, median) %>% pivot_wider(names_from = expression, values_from = median) } test_data <- function(.size) { tidytable(a = sample(1:20, .size, TRUE), b = sample(1:20, .size, TRUE), c = sample(random_strings, .size, TRUE), d = sample(random_strings, .size, TRUE)) } data_size <- 10000000 test_dt <- test_data(data_size) test_tbl <- dplyr::as_tibble(test_dt)
inv_gc(); inv_gc(); filter_marks <- bench_mark( tidyverse = dplyr::filter(test_tbl, a <= 7, c == 'OoVt'), dtplyr = as.data.table(dplyr::filter(lazy_dt(test_dt), a <= 7, c == 'OoVt')), tidytable = filter(test_dt, a <= 7, c == 'OoVt'), data.table = test_dt[a <= 7 & c == 'OoVt'], .fn = "filter" ) inv_gc(); inv_gc(); summarize_marks <- bench_mark( tidyverse = test_tbl %>% dplyr::group_by(c, d) %>% dplyr::summarize(avg_a = mean(a), med_b = median(b), .groups = "drop"), dtplyr = lazy_dt(test_dt) %>% dplyr::group_by(c, d) %>% dplyr::summarize(avg_a = mean(a), med_b = median(b), .groups = "drop") %>% as.data.table(), tidytable = summarize(test_dt, avg_a = mean(a), med_b = median(b), .by = c(c, d)), data.table = test_dt[, list(avg_a = mean(a), med_b = median(b)), by = .(c, d)], .fn = "summarize" ) inv_gc(); inv_gc(); mut_dt <- copy(test_dt) mutate_marks <- bench_mark( tidyverse = dplyr::mutate(test_tbl, double_a = a * 2, a_plus_b = a + b), dtplyr = as.data.table(dplyr::mutate(lazy_dt(test_dt), double_a = a * 2, a_plus_b = a + b)), tidytable = mutate(test_dt, double_a = a * 2, a_plus_b = a + b), data.table = mut_dt[, ':='(double_a = a * 2, a_b = a + b)][], .fn = "mutate" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); test_dt <- test_data(data_size * .1) test_tbl <- dplyr::as_tibble(test_dt) arrange_marks <- bench_mark( tidyverse = dplyr::arrange(test_tbl, c, a), dtplyr = as.data.table(dplyr::arrange(lazy_dt(test_dt), c, a)), tidytable = arrange(test_dt, c, a), data.table = test_dt[order(c, a)], .fn = "arrange" ) inv_gc(); inv_gc(); distinct_marks <- bench_mark( tidyverse = dplyr::distinct(test_tbl), dtplyr = as.data.table(dplyr::distinct(lazy_dt(test_dt))), tidytable = distinct(test_dt), data.table = unique(test_dt), .fn = "distinct" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); fill_size <- data_size * .2 fill_dt <- tidytable( id = sample(1:3, fill_size, replace = TRUE), int1 = sample(c(1:5, NA), fill_size, replace = TRUE), int2 = sample(c(1:5, NA), fill_size, replace = TRUE) ) fill_tbl <- dplyr::as_tibble(fill_dt) fill_marks <- bench_mark( tidyverse = tidyr::fill(group_by(fill_tbl, id), int1, int2), dtplyr = as.data.table(tidyr::fill(dplyr::group_by(lazy_dt(fill_dt), id), int1, int2)), tidytable = fill(fill_dt, int1, int2, .by = id), # needs a copy or subsequent runs won't have NAs to fill data.table = tidytable:::shallow(fill_dt)[ , c("new1", "new2") := lapply(.SD, nafill, type = "locf"), .SDcols = c("int1", "int2"), by = id], .fn = "fill" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); case_size <- data_size * .3 x <- runif(case_size * .3) case_marks <- bench_mark( tidyverse = dplyr::case_when(x < .5 ~ 1, x >= .5 ~ 2, TRUE ~ 3), tidytable = case(x < .5, 1, x >= .5, 2, default = 3), data.table = fcase(x < .5, 1, x >= .5, 2, rep(TRUE, length(x)), 3), .fn = "case_when" )
remove(x) remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); wider_dt <- 1:3 %>% map_dfr( ~ expand_grid(a = letters, b = LETTERS, c = paste0(letters, LETTERS)) %>% unite("name") %>% mutate(value = sample(as.double(1:n()), n())) %>% mutate(id = .x) ) %>% relocate(id) wider_tbl <- dplyr::as_tibble(wider_dt) wider_marks <- bench_mark( tidyverse = tidyr::pivot_wider(wider_tbl, names_from = name, values_from = value), dtplyr = lazy_dt(wider_dt) %>% tidyr::pivot_wider(names_from = name, values_from = value) %>% as.data.table(), tidytable = pivot_wider(wider_dt, names_from = name, values_from = value), data.table = data.table::dcast.data.table(wider_dt, id ~ name), .fn = "pivot_wider" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); longer_size <- 100000 longer_dt <- tidytable(id = 1:longer_size) %>% bind_cols( map_dfc(letters, ~ tidytable(!!.x := sample(c(letters, LETTERS), longer_size, TRUE))) ) longer_tbl <- dplyr::as_tibble(longer_dt) longer_marks <- bench_mark( tidyverse = tidyr::pivot_longer(longer_tbl, cols = -id), dtplyr = as.data.table(tidyr::pivot_longer(lazy_dt(longer_dt), cols = -id)), tidytable = pivot_longer(longer_dt, cols = -id, fast_pivot = TRUE), data.table = melt(longer_dt, measure.vars = names(longer_dt)[names(longer_dt) != "id"], variable.name = "name", variable.factor = FALSE), .fn = "pivot_longer" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); left_dt <- test_data(data_size * .1) right_dt <- left_dt %>% distinct(c, d) %>% mutate(e = row_number()) left_tbl <- dplyr::as_tibble(left_dt) right_tbl <- dplyr::as_tibble(right_dt) left_join_marks <- bench_mark( tidyverse = dplyr::left_join(left_tbl, right_tbl, by = c("c", "d")), dtplyr = as.data.table(dplyr::left_join(lazy_dt(left_dt), lazy_dt(right_dt), by = c("c", "d"))), tidytable = left_join(left_dt, right_dt, by = c("c", "d")), data.table = right_dt[left_dt, on = c("c", "d"), allow.cartesian = TRUE], .fn = "left_join" ) inv_gc(); inv_gc(); inner_join_marks <- bench_mark( tidyverse = dplyr::inner_join(left_tbl, right_tbl, by = c("c", "d")), dtplyr = as.data.table(dplyr::inner_join(lazy_dt(left_dt), lazy_dt(right_dt), by = c("c", "d"))), tidytable = inner_join(left_dt, right_dt, c("c", "d")), data.table = left_dt[right_dt, on = c("c", "d"), allow.cartesian = TRUE, nomatch = 0], .fn = "inner_join" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); unnest_dt <- test_data(data_size * .3) %>% nest_by(c, d, .key = "list_column") unnest_tbl <- unnest_dt %>% mutate(list_column = map(list_column, dplyr::as_tibble)) %>% dplyr::as_tibble() unnest_marks <- bench_mark( tidyverse = tidyr::unnest(unnest_tbl, list_column), tidytable = unnest(unnest_dt, list_column), data.table = unnest_dt[, unlist(list_column, recursive = FALSE), by = .(c, d)], .fn = "unnest" )
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); nest_dt <- test_data(data_size * .1) nest_tbl <- dplyr::as_tibble(nest_dt) nest_marks <- bench_mark( tidyverse = dplyr::group_nest(nest_tbl, c, d), dtplyr = as.data.table(tidyr::nest(lazy_dt(nest_dt), data = c(c, d))), tidytable = nest_by(nest_dt, c, d), data.table = nest_dt[, list(data = list(.SD)), by = .(c, d)], .fn = "nest" )
# # pandas code (run in a .ipynb in vscode) # import pandas as pd # import numpy as np # import gc # from timeit import timeit # # num_runs = 5 # def run_benchmarks(_dict): # return {key: np.round(timeit(value, number = num_runs) * 1000/num_runs, 1) for key, value in _dict.items()} # # np.random.seed(123) # random_strings = np.array(['OoVt', 'wCbu', 'cXxX', 'jdFu', 'MCRx', 'ukhz', 'ikce', 'PHyu', 'jpBY', 'nLQM']) # # initial_data_size = 10000000 # data_size = initial_data_size # # test_df = ( # pd.DataFrame({ # 'a': np.random.choice(np.arange(20), data_size), # 'b': np.random.choice(np.arange(20), data_size), # 'c': np.random.choice(random_strings, data_size), # 'd': np.random.choice(random_strings, data_size) # }) # .convert_dtypes() # ) # # bench_fns1 = dict( # filter = lambda: test_df.loc[(test_df.a <= 7) & (test_df.c == 'OoVt')], # summarize = lambda: test_df.groupby(['c', 'd']).agg(avg_a = ('a', np.mean), med_b = ('b', np.median)), # mutate = lambda: test_df.assign(double_a = test_df.a * 2, a_plus_b = test_df.a + test_df.b) # ) # # bench_marks1 = run_benchmarks(bench_fns1) # # data_size = initial_data_size * .1 # test_df = test_df.head(int(data_size)) # gc.collect() # # bench_fns2 = dict( # arrange = lambda: test_df.sort_values(['c', 'a']), # distinct = lambda: test_df.drop_duplicates() # ) # # bench_marks2 = run_benchmarks(bench_fns2) # # del test_df # gc.collect() # # x = np.random.normal(size = 3000000) # median_x = np.median(x) # # bench_fns3 = dict( # case_when = lambda: np.select( # [x < median_x, x <= median_x], # [1, 2], # default = 3 # ) # ) # # bench_marks3 = run_benchmarks(bench_fns3) # # del x, median_x # gc.collect() # # data_size = int(initial_data_size * .2) # fill_df = ( # pd.DataFrame(dict( # id = np.random.choice(np.arange(3), data_size), # int1 = np.random.choice(np.array([1, 2, 3, 4, 5, np.nan]), data_size), # int2 = np.random.choice(np.array([1, 2, 3, 4, 5, np.nan]), data_size) # )) # .convert_dtypes() # ) # # bench_fns4 = dict( # fill = lambda: fill_df.groupby('id')[['int1','int2']].fillna(method = 'ffill') # ) # # bench_marks4 = run_benchmarks(bench_fns4) # # del fill_df # gc.collect() # # bench_marks = bench_marks1 | bench_marks2 | bench_marks3 | bench_marks4 # # dict(sorted(bench_marks.items()))
remove(list = ls(pattern = "tbl$")); remove(list = ls(pattern = "dt$")); inv_gc(); inv_gc(); marks_names <- setdiff( unlist(ls(pattern="marks$")), c("all_marks", "pandas_marks") ) all_marks <- marks_names %>% map_dfr(get) %>% arrange(func_tested)
pandas_marks <- tidytable( arrange = 715.8, case_when = 64.4, distinct = 308.7, fill = 724, filter = 903.8, mutate = 780.1, summarize = 3079.8 ) %>% pivot_longer(everything(), names_to = "func_tested", values_to = "pandas") all_marks <- all_marks %>% left_join(pandas_marks) %>% select(func_tested, data.table, tidytable, dtplyr, tidyverse, pandas)
print(glue::glue("Date last run: {lubridate::today()}"))
all_marks
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.