# Global knitr chunk options used by every chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
Timings for the hydrorecipes package are prefixed with an "h". The first few comparisons include the R6 interface in hydrorecipes to check whether there is a loss of speed compared to the standard API. Most users are likely to use the standard API, so the remaining benchmarks present only that. Typical speed improvements are between 2-10x, and memory consumption is typically half that of the recipes package.
#| echo: false
#| warning: false
#| message: false
# Packages attached for the benchmarks (attach order kept as in the original).
library(collapse)
library(data.table)
library(hydrorecipes)
library(bench)
library(tibble)
library(RcppRoll) # for recipes::step_window
#| echo: true
#| warning: false
# Settings reused by the chunks that follow: `relative` is forwarded to
# bench::mark() and `n` supplies the row counts given to bench::press().
relative <- TRUE
n <- c(1e2, 1e4, 5e6)
formula <- as.formula(y ~ x)

# Time recipe construction only: hydrorecipes R6 class (hrec1),
# hydrorecipes standard API (hrec2) and recipes (rec).
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)
    bench::mark(
      hrec1 = hydrorecipes:::Recipe$new(formula = formula, data = dat),
      hrec2 = recipe(formula = formula, data = dat),
      rec = recipes::recipe(formula = formula, data = dat),
      check = FALSE,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Time recipe construction plus adding a centering step (no prep or bake).
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)
    bench::mark(
      hrec1 = hydrorecipes:::Recipe$new(formula = formula, data = dat)$
        add_step(hydrorecipes:::StepCenter$new(x)),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_center(x),
      rec = {
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_center(x)
      },
      check = FALSE,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Time prep() only; the recipes are built outside bench::mark() so their
# construction is not part of the measurement.
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)

    hrec1 <- hydrorecipes:::Recipe$new(formula = formula, data = dat)$
      add_step(hydrorecipes:::StepCenter$new(x))
    hrec2 <- recipe(formula = formula, data = dat) |>
      step_center(x)
    rec <- recipes::recipe(formula = formula, data = dat) |>
      recipes::step_center(x)

    bench::mark(
      hrec1$prep(),
      hrec2 |> prep(),
      rec |> recipes::prep(),
      check = FALSE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Time prep() followed by bake(); recipe construction happens beforehand and
# is excluded from the measurement.
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)

    hrec1 <- hydrorecipes:::Recipe$new(formula = formula, data = dat)$
      add_step(hydrorecipes:::StepCenter$new(x))
    hrec2 <- recipe(formula = formula, data = dat) |>
      step_center(x)
    rec <- recipes::recipe(formula = formula, data = dat) |>
      recipes::step_center(x)

    bench::mark(
      hrec1$prep()$bake(),
      hrec2 |> prep() |> bake(),
      rec |> recipes::prep() |> recipes::bake(new_data = NULL),
      check = FALSE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Full pipeline for centering; the centered `x` column from both packages is
# compared (check = TRUE).
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = 1:rows,
      z = rnorm(rows)
    )
    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_center(x) |>
          plate()
      )[["x"]],
      rec = (
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_center(x) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[["x"]],
      check = TRUE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Full pipeline for scaling `x`; the scaled column from both packages is
# compared (check = TRUE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = 1:rows,
      z = rnorm(rows)
    )
    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_scale(x, fun = fsd, n_sd = 2L) |>
          plate()
      )[["x"]],
      rec = (
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_scale(x, factor = 2L) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[["x"]],
      check = TRUE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results
#| echo: true
#| warning: false
# Full pipeline for adding an intercept column; the `intercept` column from
# both packages is compared (check = TRUE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = rnorm(rows))
    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_intercept() |>
          plate("tbl")
      )[["intercept"]],
      rec = (
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_intercept() |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[["intercept"]],
      check = TRUE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results
#| echo: true
#| warning: false
# Normalization: a single normalize step (hrec1), center followed by scale
# (hrec2), and recipes::step_normalize (rec). Results are compared
# (check = TRUE).
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_normalize(c(x, z, y)) |>
          plate("tbl")
      )[, c("x", "z", "y")],
      hrec2 = (
        recipe(formula = formula, data = dat) |>
          step_center(c(x, z, y)) |>
          step_scale(c(x, z, y)) |>
          plate("tbl")
      )[, c("x", "z", "y")],
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_normalize(x, y, z) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      relative = relative,
      min_iterations = 1L,
      check = TRUE
    )
  }
)
results
#| echo: true
#| warning: false
# Thirty lags of `x`; column names are dropped with unname() before the
# results are compared (check = TRUE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = as.numeric(1:rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec1 = unname(
        recipe(formula = formula, data = dat) |>
          step_lead_lag(x, lag = 1:30) |>
          plate("tbl")
      ),
      rec = unname(
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_lag(x, lag = 1:30) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      ),
      check = TRUE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results
#| echo: true
#| warning: false
# Distributed lag step (hydrorecipes only), so absolute timings are reported
# (relative = FALSE). This chunk uses its own, larger row counts.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = c(5e5, 5e6, 1e7),
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = 1:rows,
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_distributed_lag(x, knots = log_lags(5, 86401)) |>
        prep() |>
        bake(),
      check = FALSE,
      relative = FALSE,
      min_iterations = 1L
    )
  }
)
results
#| echo: true
#| warning: false
# Harmonic (sin/cos) terms of `x` at three frequencies.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = 1:rows,
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_harmonic(
          x,
          frequency = c(1.0, 2.0, 3.0),
          cycle_size = 0.1,
          starting_value = 0.0
        ) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_harmonic(
          x,
          frequency = c(1.0, 2.0, 3.0),
          cycle_size = 0.1,
          starting_val = 0.0,
          keep_original_cols = TRUE
        ) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      # sin and cos terms order is different
      check = FALSE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results

# Disabled scratch benchmark, kept for reference.
# rows <- 1e6
# dat <- data.frame(x = rnorm(rows),
#                   y = 1:rows,
#                   z = rnorm(rows))
# bench::mark(
#
#   {hrec = recipe(formula = formula, data = dat) |>
#     step_harmonic(x,
#                   frequency = c(1.0, 2.0, 3.0),
#                   cycle_size = 0.1,
#                   starting_value = 0.0,
#                   varying = "cycle_size") |>
#     step_harmonic(x,
#                   frequency = c(1.0, 2.0, 3.0),
#                   cycle_size = 0.1,
#                   starting_value = 0.0) |>
#     step_intercept() |>
#     step_center(x) |>
#     prep() |>
#     bake()},
#
#   {hrec$steps[[2]]$update_step("cycle_size", 0.2)
#     hrec$bake()
#   },
#   check = FALSE
# )
#| echo: true
#| warning: false
# PCA on twelve predictors. hrec1/hrec2 use the step_pca defaults with 10 and
# 5 components; hrec3/hrec4 disable centering and scaling. rec1/rec2 request
# centering and scaling through `options`; rec3/rec4 pass no `options`.
set.seed(1)
formula <- as.formula(x ~ a + b + c + d + e + f + g + h + i + j + k + l)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      a = rnorm(rows),
      b = rnorm(rows),
      c = rnorm(rows),
      d = rnorm(rows),
      e = rnorm(rows),
      f = rnorm(rows),
      g = rnorm(rows),
      h = rnorm(rows),
      i = rnorm(rows),
      j = rnorm(rows),
      k = rnorm(rows),
      l = rnorm(rows)
    )
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_pca(c(a, b, c, d, e, f, g, h, i, j, k, l), n_comp = 10L) |>
        plate(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_pca(c(a, b, c, d, e, f, g, h, i, j, k, l), n_comp = 5L) |>
        plate(),
      hrec3 = recipe(formula = formula, data = dat) |>
        step_pca(
          c(a, b, c, d, e, f, g, h, i, j, k, l),
          n_comp = 10L,
          center = FALSE,
          scale = FALSE
        ) |>
        plate(),
      hrec4 = recipe(formula = formula, data = dat) |>
        step_pca(
          c(a, b, c, d, e, f, g, h, i, j, k, l),
          n_comp = 5L,
          center = FALSE,
          scale = FALSE
        ) |>
        plate(),
      rec1 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(
          recipes::all_predictors(),
          num_comp = 10L,
          options = list(center = TRUE, scale. = TRUE)
        ) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      rec2 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(
          recipes::all_predictors(),
          num_comp = 5L,
          options = list(center = TRUE, scale. = TRUE)
        ) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      rec3 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(recipes::all_predictors(), num_comp = 10L) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      rec4 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(recipes::all_predictors(), num_comp = 5L) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      check = FALSE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
print(results, n = 100)
#| echo: true
#| warning: false
# Dummy encoding of a ten-level factor `y`; columns 3 to 11 of the unnamed
# results are compared (check = TRUE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = qF(sample(1:10, rows, replace = TRUE)),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = unname(
        recipe(formula = formula, data = dat) |>
          step_dummy(y) |>
          plate("tbl")
      )[, 3:11],
      rec = unname(
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_dummy(y, keep_original_cols = TRUE) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[, 3:11],
      check = TRUE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results
#| echo: true
#| warning: false
# Binning `x` at fixed break points: step_find_interval versus
# recipes::step_cut. Outputs are not compared (check = FALSE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = 1:rows,
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_find_interval(x, vec = c(-0.1, 0, 0.1)) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_cut(x, breaks = c(-0.1, 0, 0.1)) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      check = FALSE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results
#| echo: true
#| warning: false
# step_varying versus recipes::step_zv; `x` is constant in the test data.
# Results are compared (check = TRUE).
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rep(1, rows),
      y = 1:rows,
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_varying(c(x, y, z)) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_zv(x, y, z) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      check = TRUE,
      relative = relative,
      min_iterations = 1L
    )
  }
)
results
step_kernel_filter uses a Fast Fourier Transform (FFT) based convolution instead of an explicit sliding window. This should be much faster for large datasets, particularly when the kernel size is also large.
#| echo: true
#| warning: false
# 5001-point moving average of `z`: step_kernel_filter versus
# recipes::step_window. Only row 10000 of each result is compared
# (check = TRUE).
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = c(2e4, 2e5),
  {
    dat <- data.frame(
      x = rep(1, rows),
      y = 1:rows,
      z = cumsum(rnorm(rows))
    )
    bench::mark(
      hrec = unname(
        (
          recipe(formula = formula, data = dat) |>
            step_kernel_filter(
              z,
              kernel = list(rep(1, 5001L) / 5001L),
              align = "center"
            ) |>
            plate("tbl")
        )[10000, "kernel_filter_z"]
      ),
      # Unnamed on purpose: bench labels this entry by its deparsed code, so
      # the `=` assignment is left untouched to keep that label unchanged.
      {
        rec = recipes::recipe(formula = formula, data = dat) |>
          recipes::step_window(z, size = 5001L, statistic = "mean") |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
        unname(rec[10000, "z"])
      },
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
results
#| echo: true
#| warning: false
# Gamma-kernel convolution of `z` (hydrorecipes only).
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = c(2e4, 2e6),
  {
    dat <- data.frame(
      x = rep(1, rows),
      y = 1:rows,
      z = cumsum(rnorm(rows))
    )
    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_convolve_gamma(z, amplitude = 1, k = 1, theta = 1) |>
          plate("tbl")
      ),
      min_iterations = 1,
      relative = relative,
      check = TRUE
    )
  }
)
results
step_harmonic dominates these results.
#| echo: true
#| warning: false
# Three steps chained together: lag, harmonic and center.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = as.numeric(1:rows), y = 1:rows)
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_lead_lag(x, lag = 1:10) |>
        step_harmonic(
          x,
          frequency = c(1, 2, 3),
          cycle_size = 0.1,
          starting_value = 0
        ) |>
        step_center(x) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_lag(x, lag = 1:10, keep_original_cols = TRUE) |>
        recipes::step_harmonic(
          x,
          frequency = c(1, 2, 3),
          cycle_size = 0.1,
          starting_val = 0,
          keep_original_cols = TRUE
        ) |>
        recipes::step_center(x) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      check = FALSE,
      relative = relative,
      min_iterations = 1
    )
  }
)
results
#| echo: true
#| warning: false
# B-spline basis of `x` with 13 degrees of freedom; unnamed results are
# compared (check = TRUE). `n` is (re)assigned here.
formula <- as.formula(y ~ x)
n <- c(100, 1e4, 5e6)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)
    bench::mark(
      hrec = unname(
        recipe(formula = formula, data = dat) |>
          step_spline_b(x, df = 13) |>
          plate("tbl")
      ),
      rec = unname(
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_spline_b(
            x,
            deg_free = 13,
            keep_original_cols = TRUE
          ) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      ),
      check = TRUE,
      relative = relative,
      min_iterations = 2
    )
  }
)
results
#| echo: true
#| warning: false
# Natural spline basis of `x` with 11 degrees of freedom; unnamed results are
# compared (check = TRUE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)
    bench::mark(
      hrec = unname(
        recipe(formula = formula, data = dat) |>
          step_spline_n(x, df = 11L) |>
          plate("tbl")
      ),
      rec = unname(
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_spline_natural(
            x,
            deg_free = 11L,
            keep_original_cols = TRUE
          ) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      ),
      check = TRUE,
      relative = relative,
      min_iterations = 2
    )
  }
)
results
#| echo: true
#| warning: false
# Aquifer response steps: grf versus theis. check = TRUE requires both to
# return equal results for this input.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = as.numeric(1:rows),
      y = rep(0.01, rows)
    )
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_aquifer_grf(time = x, flow_rate = y) |>
        plate("dt"),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_aquifer_theis(time = x, flow_rate = y) |>
        plate("dt"),
      check = TRUE,
      relative = relative
    )
  }
)
results
The Theis solution is a subset of the grf solution.
#| echo: true
#| warning: false
# Adding noise to `y` (hydrorecipes only).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = as.numeric(1:rows),
      y = rep(0.01, rows)
    )
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_add_noise(y) |>
        plate("dt"),
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Aquifer response steps: leaky (with leakage = 1e8) versus theis. Unnamed
# results are compared (check = TRUE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = as.numeric(1:rows),
      y = rep(0.01, rows)
    )
    bench::mark(
      hrec1 = unname(
        recipe(formula = formula, data = dat) |>
          step_aquifer_leaky(time = x, flow_rate = y, leakage = 100000000) |>
          plate("dt")
      ),
      hrec2 = unname(
        recipe(formula = formula, data = dat) |>
          step_aquifer_theis(time = x, flow_rate = y) |>
          plate("dt")
      ),
      check = TRUE,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# Aquifer response steps: grf versus patch, at a single row count. Outputs
# are not compared (check = FALSE).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = c(1e5),
  {
    dat <- data.frame(
      x = as.numeric(1:rows),
      y = rep(0.01, rows)
    )
    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_aquifer_grf(time = x, flow_rate = y) |>
          plate("dt")
      ),
      hrec3 = (
        recipe(formula = formula, data = dat) |>
          step_aquifer_patch(
            time = x,
            flow_rate = 0.01,
            thickness = 1.0,
            radius = 100.0,
            radius_patch = 200.0,
            specific_storage_inner = 1e-6,
            specific_storage_outer = 1e-6,
            hydraulic_conductivity_inner = 1e-4,
            hydraulic_conductivity_outer = 1e-4,
            n_stehfest = 8L
          ) |>
          plate("dt")
      ),
      check = FALSE,
      relative = relative
    )
  }
)
results
#| echo: true
#| warning: false
# step_vadose_weeks (hydrorecipes only).
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = as.numeric(1:rows),
      y = as.numeric(1:rows)
    )
    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_vadose_weeks(
            time = x,
            air_diffusivity = 0.8,
            thickness = 5,
            precision = 1e-12
          ) |>
          plate("dt")
      ),
      check = FALSE,
      min_iterations = 2
    )
  }
)
results
#| echo: true
#| warning: false
# step_transport_ogata_banks (hydrorecipes only) on a grid of `rows` values
# of `x` crossed with ten values of `y`.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      expand.grid(as.numeric(1:rows), as.numeric(1:10))
    )
    names(dat) <- c("x", "y")
    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_transport_ogata_banks(time = x, distance = y) |>
          plate("dt")
      ),
      check = FALSE,
      min_iterations = 2
    )
  }
)
results
#| echo: true
#| warning: false
# step_transport_fractures_solute (hydrorecipes only) on a grid of times,
# fracture distances and matrix distances.
formula <- as.formula(~ time + z + x)
dat <- setDT(expand.grid(10^(3:8), seq(0.0, 10, 1), c(0.0)))
# Rename by reference: `names(dat) <- ...` is not the by-reference idiom for
# a data.table, setnames() is.
setnames(dat, c("time", "z", "x"))
results <- bench::mark(
  hrec1 = recipe(formula = formula, data = dat) |>
    step_transport_fractures_solute(
      time = time,
      distance_fracture = z,
      distance_matrix = x
    ) |>
    plate("dt"),
  check = FALSE,
  min_iterations = 2
)
results
#| echo: true
#| warning: false
# step_transport_fractures_heat (hydrorecipes only) on a grid of times,
# fracture distances and two matrix distances.
formula <- as.formula(~ time + z + x)
dat <- setDT(expand.grid(10^(3:8), seq(0.0, 100, 1), c(0.0, 0.05)))
# Rename by reference: `names(dat) <- ...` is not the by-reference idiom for
# a data.table, setnames() is.
setnames(dat, c("time", "z", "x"))
results <- bench::mark(
  hrec1 = recipe(formula = formula, data = dat) |>
    step_transport_fractures_heat(
      time = time,
      distance_fracture = z,
      distance_matrix = x
    ) |>
    plate("dt"),
  check = FALSE,
  min_iterations = 2
)
results
#| echo: true
#| warning: false
# Spectral steps (hydrorecipes only): two step_fft_pgram variants that differ
# in their fifth positional argument, and step_fft_welch with a Nuttall
# window one tenth the length of the data.
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows),
      z = rnorm(rows),
      q = rnorm(rows),
      r = rnorm(rows),
      s = rnorm(rows)
    )
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_fft_pgram(c(x, y), 3, TRUE, TRUE, FALSE, 0.1, time_step = 1) |>
        prep() |>
        bake(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_fft_pgram(c(x, y), 3, TRUE, TRUE, TRUE, 0.1, time_step = 1) |>
        prep() |>
        bake(),
      hrec3 = recipe(formula = formula, data = dat) |>
        step_fft_welch(
          c(x, y),
          length_subset = nrow(dat) / 10,
          overlap = 0.60,
          window = window_nuttall(nrow(dat) / 10),
          time_step = 1
        ) |>
        prep() |>
        bake(),
      check = FALSE,
      min_iterations = 1
    )
  }
)
results
#| echo: true
#| warning: false
# Transfer-function steps (hydrorecipes only): periodogram, Welch and
# experimental variants. This chunk uses its own row counts.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = c(1e5, 1e6, 1e7),
  {
    dat <- data.frame(x = rnorm(rows), y = rnorm(rows))
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_fft_transfer_pgram(c(x, y), 3, TRUE, TRUE, 0.1, time_step = 1) |>
        prep() |>
        bake(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_fft_transfer_welch(
          c(x, y),
          length_subset = nrow(dat) / 10,
          overlap = 0.60,
          window = window_nuttall(nrow(dat) / 10),
          time_step = 1
        ) |>
        prep() |>
        bake(),
      # Fixed: this entry was written `hrec3 <- ...`, which passes an unnamed
      # assignment expression to bench::mark() instead of naming the
      # benchmark. `=` makes it a named argument like hrec1 and hrec2.
      hrec3 = recipe(formula = formula, data = dat) |>
        step_fft_transfer_experimental(
          c(x, y),
          spans = 3,
          taper = 0.1,
          n_groups = 300,
          time_step = 1
        ) |>
        prep() |>
        bake(),
      check = FALSE,
      min_iterations = 1
    )
  }
)
results
#| echo: true
#| warning: false
# Regression steps. First benchmark: step_ols after a distributed lag,
# alongside stats::lm() and stats::lm.fit() on the raw columns. Absolute
# timings are reported (relative = FALSE).
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows),
      z = rnorm(rows)
    )
    # Matrix copy of `dat`; its columns are x, y, z in that order.
    m <- qM(dat)
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        # step_center(x) |>
        step_distributed_lag(z, knots = log_lags(10, 50)) |>
        # step_intercept() |>
        step_ols(formula = as.formula(y ~ .), do_response = FALSE) |>
        prep() |>
        bake(),
      lm = lm(formula, dat),
      # Fixed: the original indexed by position (x = m[, c(2, 3)],
      # y = m[, 1]), which regressed x on (y, z). Selecting by name matches
      # the formula y ~ x + z. The entry is also named now.
      # NOTE(review): unlike lm(), no intercept column is supplied here.
      lm_fit = lm.fit(x = m[, c("x", "z")], y = m[, "y"]),
      check = FALSE,
      relative = FALSE
    )
  }
)
results

# Second benchmark: step_nls after adding an intercept.
# NOTE(review): this result overwrites `results` and is never printed in the
# original; left that way here.
formula <- as.formula(y ~ x + z)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_intercept() |>
        step_nls(formula = as.formula(y ~ .)) |>
        prep() |>
        bake(),
      check = FALSE,
      relative = FALSE
    )
  }
)
#| echo: true
#| warning: false
# Synthetic series for a gap-filling example: a random walk with a step
# change halfway through and a sinusoidal component added.
# NOTE(review): this reassigns `n` to a single value; chunks further down
# that pass `rows = n` will therefore run at 100000 rows only. Confirm that
# this is intended.
set.seed(123)
n <- 100000
frm <- formula(x ~ y + z)
x <- cumsum(rnorm(n))
dat <- data.table(
  x = x,
  y = x,
  z = as.numeric(1:n)
)
dat[, x := x + c(rep(20, n / 2), rep(0, n / 2))]
dat[, x := x + 3.0 * sin(z * 1 / n)]
tmp <- copy(dat$x)

# Set value to NA. These values will be estimated.
dat[60000:70000, x := NA_real_]
dat <- unclass(dat)

# Disabled benchmark, kept for reference.
# bench::mark(
#   {h = recipe(formula = frm, data = dat) |>
#     step_find_interval(z, vec = c(0, n/2, n)) |>
#     step_intercept() |>
#     step_spline_b(z, df = 4) |>
#     step_drop_columns(z)
#
#   hrec = recipe(formula = frm, data = dat) |>
#     step_ols_gap_fill(c(x, y, z), recipe = h) |>
#     prep() |>
#     bake()},
# )
# Reset the shared formula; everything below it in this chunk is a disabled
# benchmark kept for reference.
# NOTE(review): both entries in the disabled bench::mark() call are named
# `hrec1` and are identical.
formula <- as.formula(y~x)
# results <- bench::press(
#   rows = n,
#   {
#     dat <- data.frame(x = rnorm(rows),
#                       y = 1:rows,
#                       z = rnorm(rows))
#     bench::mark(
#       hrec1 = unname(recipe(formula = formula, data = dat) |>
#         step_lead_lag(x, lag = 1:30) |>
#         plate("tbl")),
#       hrec1 = unname(recipe(formula = formula, data = dat) |>
#         step_lead_lag(x, lag = 1:30) |>
#         plate("tbl")),
#       check = TRUE,
#       relative = relative,
#       min_iterations = 1
#     )
#   }
# )
#
# results
#| echo: true
#| warning: false
# Check steps (hydrorecipes only): spacing and NA checks on `y` (hrec1) and
# on `x` (hrec2). Row 9 of both columns is set to NA beforehand.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows), y = 1:rows)
    dat[9, "x"] <- NA
    dat[9, "y"] <- NA
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_check_spacing(y) |>
        step_check_na(y) |>
        prep() |>
        bake(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_check_spacing(x) |>
        step_check_na(x) |>
        prep() |>
        bake(),
      check = FALSE,
      relative = relative,
      min_iterations = 2
    )
  }
)
results
# Print the R session details (R version, platform and packages) under which
# these benchmarks were run.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.