So the price for using these abstractions reduces mainly to the overhead generated by casting formulas into functions, and additionally to the overhead generated by S4 dispatch. This is obviously less of a problem as the time spent applying the function itself increases -- either through computation time or through the number of iterations.
```{R eval = FALSE}
library("rbenchmark")
library("dat")
# Small input (three elements), where fixed per-call costs make up a large
# share of the timing. Compares flatmap with a formula, sapply with a plain
# function, sapply with a formula converted through as.function, and flatmap
# with a plain function.
benchmark(
  flatmap(1:3, x ~ x^2),
  sapply(1:3, function(x) x^2),
  sapply(1:3, as.function(x ~ x^2)),
  flatmap(1:3, function(x) x^2)
)
# Larger input (1e4 elements): the formula-based flatmap against sapply with a
# plain function, now with many more iterations per call.
benchmark(
  flatmap(1:1e4, x ~ x^2),
  sapply(1:1e4, function(x) x^2)
)
```

The same can be seen for the multivariate map implementation:

```r
benchmark(
  flatmap(1:3 ~ 1:3, f(x, y) ~ x + y),
  mapply(function(x, y) x + y, 1:3, 1:3),
  mapply(as.function(f(x, y) ~ x + y), 1:3, 1:3)
)
```
There is some overhead generated by this package. To some extent this is because we try to preserve the class we operate on. However, in most tests it seems that it is mostly because dplyr is used to communicate with data.table, which has a price.
# Grouped-aggregation timings on a large table, comparing three interfaces:
#   * plain data.table syntax on `DT`,
#   * the formula syntax of the S4 class "DataTable" on `DT4`, whose `[`
#     method is `mutar`,
#   * dplyr (group_by + summarise) on `DT`.
# NOTE(review): `mutar` and `FL` are not defined in this chunk; presumably they
# come from the dat package attached earlier in the document -- confirm.
library("data.table")
library("dplyr")

# Switch off the dplyr code path inside dat for the DataTable timings.
options("dat.use.dplyr" = FALSE)

N <- 2e7 # more is not possible with small laptop
K <- 100
set.seed(1)

# The order of the sample() calls matters for reproducibility under set.seed.
DT <- data.table(
  id1 = sample(sprintf("id%03d", 1:K), N, TRUE),          # large groups (char)
  id2 = sample(sprintf("id%03d", 1:K), N, TRUE),          # large groups (char)
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),   # small groups (char)
  id4 = sample(K, N, TRUE),                               # large groups (int)
  id5 = sample(K, N, TRUE),                               # large groups (int)
  id6 = sample(N / K, N, TRUE),                           # small groups (int)
  v1 = sample(5, N, TRUE),                                # int in range [1,5]
  v2 = sample(5, N, TRUE),                                # int in range [1,5]
  v3 = sample(round(runif(100, max = 100), 4), N, TRUE)   # numeric e.g. 23.5749
)

# S4 class extending data.table; `contains =` names the superclass explicitly
# instead of relying on the legacy positional `representation` argument.
setClass("DataTable", contains = "data.table")
setMethod("[", "DataTable", mutar)
DT4 <- new("DataTable", DT)

# Report memory use (second column of gc(), summed, divided by 1024) and the
# size of both objects.
cat("GB =", round(sum(gc()[, 2]) / 1024, 3), "\n")
format(object.size(DT), units = "MB")
format(object.size(DT4), units = "MB")

# Every timing below is executed twice in a row, so a first and a repeated run
# can be compared.

# --- sum(v1) by id1 ---------------------------------------------------------
system.time(DT[, sum(v1), keyby = id1])
system.time(DT[, sum(v1), keyby = id1])
system.time(DT4[V1 ~ sum(v1), sby = "id1"])
system.time(DT4[V1 ~ sum(v1), sby = "id1"])
system.time(group_by(DT, id1) %>% summarise(V1 = sum(v1)))
system.time(group_by(DT, id1) %>% summarise(V1 = sum(v1)))

# --- sum(v1) by id1 and id2 -------------------------------------------------
system.time(DT[, sum(v1), keyby = "id1,id2"])
system.time(DT[, sum(v1), keyby = "id1,id2"])
system.time(DT4[V1 ~ sum(v1), sby = c("id1", "id2")])
system.time(DT4[V1 ~ sum(v1), sby = c("id1", "id2")])
system.time(group_by(DT, id1, id2) %>% summarise(V1 = sum(v1)))
system.time(group_by(DT, id1, id2) %>% summarise(V1 = sum(v1)))

# --- sum(v1) and mean(v3) by id3 --------------------------------------------
system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])
system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])
system.time(DT4[V1 ~ sum(v1), V3 ~ mean(v3), sby = "id3"])
system.time(DT4[V1 ~ sum(v1), V3 ~ mean(v3), sby = "id3"])
system.time(group_by(DT, id3) %>% summarise(V1 = sum(v1), V3 = mean(v3)))
system.time(group_by(DT, id3) %>% summarise(V1 = sum(v1), V3 = mean(v3)))

# --- mean of v1, v2, v3 (columns 7:9) by id4 --------------------------------
system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])
system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])
system.time(DT4[FL(.n ~ mean(.n), .n = "^v[1-3]"), sby = "id4"])
system.time(DT4[FL(.n ~ mean(.n), .n = "^v[1-3]"), sby = "id4"])
system.time(
  group_by(DT, id4) %>%
    summarise(V1 = mean(v1), V2 = mean(v2), V3 = mean(v3))
)
system.time(
  group_by(DT, id4) %>%
    summarise(V1 = mean(v1), V2 = mean(v2), V3 = mean(v3))
)

# --- sum of v1, v2, v3 (columns 7:9) by id6 ---------------------------------
system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])
system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])
system.time(DT4[FL(.n ~ sum(.n), .n = "v1:v3"), sby = "id6"])
system.time(DT4[FL(.n ~ sum(.n), .n = "v1:v3"), sby = "id6"])
system.time(
  group_by(DT, id6) %>%
    summarise(V1 = sum(v1), V2 = sum(v2), V3 = sum(v3))
)
system.time(
  group_by(DT, id6) %>%
    summarise(V1 = sum(v1), V2 = sum(v2), V3 = sum(v3))
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.