collapse: Advanced and Fast Data Transformation

#####################################################
### Replication Script for the JSS Article:
### collapse: Advanced and Fast Statistical Computing
###           and Data Transformation in R

### By: Sebastian Krantz, IfW Kiel
### E-Mail: sebastian.krantz@ifw-kiel.de
#####################################################

###################################################
### code chunk number 1: preliminaries
###################################################

rm(list = ls()); gc()

##           used (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 1510201 80.7    3842590 205.3         NA  3842590 205.3
## Vcells 9683701 73.9   36617092 279.4      49152 81718535 623.5

r_opts <- options(prompt = "R> ", continue = "+  ", width = 77, digits = 4, useFancyQuotes = FALSE, warn = 1)

# Loading libraries and installing if unavailable
if(!requireNamespace("fastverse", quietly = TRUE)) install.packages("fastverse")

if(!requireNamespace("collapse", quietly = TRUE) || packageVersion("collapse") < "2.1.2") {
  install.packages("collapse_2.1.2.tar.gz", type = "source", repos = NULL)
}

options(fastverse.styling = FALSE)
library(fastverse) # loads data.table, collapse, magrittr and kit (not used)

# Package versions used in the article:
# fastverse 0.3.4, collapse 2.1.2, data.table 1.17.0, nycflights23 0.2.0,
# bench 1.1.4, dplyr 1.1.4, tidyr 1.3.1, matrixStats 1.5.0, janitor 2.2.0
pkg <- c("bench", "dplyr", "tidyr", "matrixStats", "janitor", "nycflights23")
if(!all(avail <- is_installed(pkg))) install.packages(pkg[!avail])

# Reset collapse options (if set)
oldopts <- set_collapse(nthreads = 1L, remove = NULL, stable.algo = TRUE, sort = TRUE,
             digits = 2L, stub = TRUE, verbose = 1L, mask = NULL, na.rm = TRUE)

# For presentation: removing whitespace around labels
vlabels(GGDC10S) <- trimws(vlabels(GGDC10S))

# This is the benchmark function
bmark <- function(...) {
  bench::mark(..., min_time = 3, check = FALSE) |> janitor::clean_names() |>
  fselect(expression, min, median, mem_alloc, n_itr, n_gc, total_time) |>
  fmutate(expression = names(expression)) |> dapply(as.character) |> qDF()
}


###################################################
### code chunk number 2: collapse_topics
###################################################
.COLLAPSE_TOPICS

##  [1] "collapse-documentation"     "fast-statistical-functions"
##  [3] "fast-grouping-ordering"     "fast-data-manipulation"    
##  [5] "quick-conversion"           "advanced-aggregation"      
##  [7] "data-transformations"       "time-series-panel-series"  
##  [9] "list-processing"            "summary-statistics"        
## [11] "recode-replace"             "efficient-programming"     
## [13] "small-helpers"              "collapse-options"

help("collapse-documentation")


###################################################
### code chunk number 3: faststatfun
###################################################
fmean(iris$Sepal.Length)

## [1] 5.843

fmean(num_vars(iris))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        5.843        3.057        3.758        1.199

identical(fmean(num_vars(iris)), fmean(as.matrix(num_vars(iris))))

## [1] TRUE

fmean(iris$Sepal.Length, g = iris$Species)

##     setosa versicolor  virginica 
##      5.006      5.936      6.588

fmean(num_vars(iris), g = iris$Species)

##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026

###################################################
### code chunk number 4: TRA
###################################################
fmean(iris$Sepal.Length, g = iris$Species, TRA = "fill")[c(1:5, 51:55)]

##  [1] 5.006 5.006 5.006 5.006 5.006 5.936 5.936 5.936 5.936 5.936

###################################################
### code chunk number 5: BY
###################################################
BY(num_vars(iris), g = iris$Species, FUN = mad) |> head(1)

##        Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa       0.2965      0.3706       0.1483           0

###################################################
### code chunk number 6: fscale
###################################################
fscale(iris$Sepal.Length, g = iris$Species) |> fsd(g = iris$Species)

##     setosa versicolor  virginica 
##          1          1          1

###################################################
### code chunk number 7: wlddev_distinct
###################################################
fndistinct(wlddev)

## country   iso3c    date    year  decade  region  income    OECD   PCGDP 
##     216     216      61      61       7       7       4       2    9470 
##  LIFEEX    GINI     ODA     POP 
##   10548     368    7832   12877

###################################################
### code chunk number 8: fsummarise
###################################################
wlddev |> fgroup_by(country, income) |>
  fsummarise(n = fnobs(LIFEEX),
             diff = fmax(LIFEEX) %-=% fmin(LIFEEX),
             mean_diff = fmean(fdiff(LIFEEX)),
             cor_PCGDP = pwcor(Dlog(LIFEEX), Dlog(PCGDP)),
             across(c(LIFEEX, PCGDP, POP), flast)) |>
  collap( ~ income, w = ~ POP, keep.w = FALSE) |> print(digits = 2)

##         country              income  n diff mean_diff cor_PCGDP LIFEEX PCGDP
## 1 United States         High income 60   13      0.23  -0.00828     81 44599
## 2      Ethiopia          Low income 60   26      0.43   0.27153     64   691
## 3         India Lower middle income 60   24      0.41  -0.15366     69  2324
## 4         China Upper middle income 60   27      0.45   0.00077     76  8749

###################################################
### code chunk number 9: fsummarise_2
###################################################
wlddev |> fsubset(year >= 2015, income, PCGDP:POP) |>
  fgroup_by(income) |> fmean(POP)

##                income   sum.POP PCGDP LIFEEX  GINI        ODA
## 1         High income 5.901e+09 43340  80.70 36.14   81398948
## 2          Low income 3.296e+09   663  63.05 39.13 2466732035
## 3 Lower middle income 1.490e+10  2177  68.31 36.48 2234540565
## 4 Upper middle income 1.318e+10  8168  75.51 41.68  -86630117

###################################################
### code chunk number 10: glinreg
###################################################
wlddev |> collap(POP ~ region + year, FUN = fsum) |>
  fmutate(POP = POP / 1e6) |> fgroup_by(region) |>
  fmutate(dmy = fmean(year, TRA = "-")) |>
  fsummarise(beta = fsum(POP, dmy) %/=% fsum(dmy, dmy),
             POP20 = flast(POP)) |>
  fmutate(POP21 = POP20 + beta, POP22 = POP21 + beta,
          POP23 = POP22 + beta, POP24 = POP23 + beta)

##                       region   beta  POP20  POP21  POP22  POP23  POP24
## 1        East Asia & Pacific 18.731 2291.4 2310.2 2328.9 2347.6 2366.4
## 2      Europe & Central Asia  2.463  921.2  923.7  926.1  928.6  931.0
## 3  Latin America & Caribbean  6.460  646.4  652.9  659.4  665.8  672.3
## 4 Middle East & North Africa  5.409  456.7  462.1  467.5  472.9  478.3
## 5              North America  2.301  365.9  368.2  370.5  372.8  375.1
## 6                 South Asia 19.640 1835.8 1855.4 1875.1 1894.7 1914.3
## 7         Sub-Saharan Africa 12.852 1103.5 1116.3 1129.2 1142.0 1154.9

###################################################
### code chunk number 11: wstats
###################################################
wlddev |> fsubset(is.finite(POP)) |> fgroup_by(region) |>
  fmutate(o = radixorder(GRPid(), LIFEEX)) |>
  fsummarise(min = fmin(LIFEEX),
             Q1 = fnth(LIFEEX, 0.25, POP, o = o, ties = "q8"),
             mean = fmean(LIFEEX, POP),
             median = fmedian(LIFEEX, POP, o = o),
             Q3 = fnth(LIFEEX, 0.75, POP, o = o, ties = "q8"),
             max = fmax(LIFEEX))

##                       region   min    Q1  mean median    Q3   max
## 1        East Asia & Pacific 18.91 65.28 68.45  69.67 73.86 85.08
## 2      Europe & Central Asia 45.37 68.68 72.30  71.58 76.67 85.42
## 3  Latin America & Caribbean 41.76 65.17 69.16  70.87 74.48 82.19
## 4 Middle East & North Africa 29.92 61.96 66.65  69.12 72.64 82.80
## 5              North America 68.90 73.57 75.54  75.62 78.38 82.05
## 6                 South Asia 32.45 55.08 60.19  62.00 66.67 78.92
## 7         Sub-Saharan Africa 26.17 46.51 52.53  52.23 58.32 74.51

###################################################
### code chunk number 12: GRP
###################################################
str(g <- GRP(wlddev, ~ income + OECD))

## Class 'GRP'  hidden list of 9
##  $ N.groups    : int 6
##  $ group.id    : int [1:13176] 3 3 3 3 3 3 3 3 3 3 ...
##  $ group.sizes : int [1:6] 2745 2074 1830 2867 3538 122
##  $ groups      :'data.frame':    6 obs. of  2 variables:
##   ..$ income: Factor w/ 4 levels "High income",..: 1 1 2 3 4 4
##   .. ..- attr(*, "label")= chr "Income Level"
##   ..$ OECD  : logi [1:6] FALSE TRUE FALSE FALSE FALSE TRUE
##   .. ..- attr(*, "label")= chr "Is OECD Member Country?"
##  $ group.vars  : chr [1:2] "income" "OECD"
##  $ ordered     : Named logi [1:2] TRUE FALSE
##   ..- attr(*, "names")= chr [1:2] "ordered" "sorted"
##  $ order       : int [1:13176] 245 246 247 248 249 250 251 252 253 254 ...
##   ..- attr(*, "starts")= int [1:6] 1 2746 4820 6650 9517 13055
##   ..- attr(*, "maxgrpn")= int 3538
##   ..- attr(*, "sorted")= logi FALSE
##  $ group.starts: int [1:6] 245 611 1 306 62 7687
##  $ call        : language GRP.default(X = wlddev, by = ~income + OECD)

###################################################
### code chunk number 13: gcomp
###################################################
add_vars(g$groups,
         get_vars(wlddev, "country") |> fmode(g, wlddev$POP, use = FALSE),
         get_vars(wlddev, c("PCGDP", "LIFEEX")) |> fmean(g, wlddev$POP, use = FALSE),
         get_vars(wlddev, "POP") |> fsum(g, use = FALSE))

##                income  OECD       country   PCGDP LIFEEX       POP
## 1         High income FALSE  Saudi Arabia 22426.7  73.00 3.114e+09
## 2         High income  TRUE United States 31749.6  75.84 5.573e+10
## 3          Low income FALSE      Ethiopia   557.1  53.51 2.095e+10
## 4 Lower middle income FALSE         India  1238.8  60.59 1.138e+11
## 5 Upper middle income FALSE         China  3820.6  68.21 1.114e+11
## 6 Upper middle income  TRUE        Mexico  8311.2  69.06 8.162e+09

###################################################
### code chunk number 14: gtrans
###################################################
add_vars(wlddev) <- get_vars(wlddev, c("PCGDP", "LIFEEX")) |>
  fwithin(g, mean = "overall.mean") |> add_stub("center_")


###################################################
### code chunk number 15: fsumm_bench
###################################################
bmark(collapse = collapse::fsummarise(mtcars, mean = fmean(mpg)),
      dplyr = dplyr::summarise(mtcars, mean = fmean(mpg)))

##   expression      min   median mem_alloc n_itr n_gc total_time
## 1   collapse   4.67µs   5.17µs        0B  9999    1    54.15ms
## 2      dplyr 199.14µs 220.87µs    1.92KB  9970   30      2.35s

###################################################
### code chunk number 16: ts_example
###################################################
fgrowth(airmiles, n = 10, power = 1/10) |> na.omit() |> round(1)

## Time Series:
## Start = 1947 
## End = 1960 
## Frequency = 1 
##  [1] 31.0 28.7 25.7 22.5 22.5 24.3 24.6 22.6 19.4 14.2 15.3 15.5 15.8 14.3

###################################################
### code chunk number 17: exports
###################################################
set.seed(101)
exports <- expand.grid(y = 2001:2010, c = paste0("c", 1:10),
                       s = paste0("s", 1:10)) |> fmutate(v = abs(rnorm(1e3))) |>
  colorder(c, s) |> fsubset(-sample.int(1e3, 500))


###################################################
### code chunk number 18: ts_example_2
###################################################
.c(y, v) %=% fsubset(exports, c == "c1" & s == "s7", -c, -s)
print(y)

## [1] 2001 2002 2004 2005 2007 2008 2009 2010

fgrowth(v, t = y) |> round(2)

## [1]     NA 175.52     NA -22.37     NA 624.27 -79.01 534.56

fgrowth(v, -1:3, t = y) |> head(4)

##         FG1     --     G1   L2G1   L3G1
## [1,] -63.71 0.3893     NA     NA     NA
## [2,]     NA 1.0726 175.52     NA     NA
## [3,]  28.82 0.8450     NA -21.22 117.05
## [4,]     NA 0.6559 -22.37     NA -38.85

###################################################
### code chunk number 19: example_growth
###################################################
G(exports, -1:2, by = v ~ c + s, t = ~ y) |> head(3)

##    c  s    y  FG1.v      v   G1.v L2G1.v
## 1 c1 s1 2002 -18.15 0.5525     NA     NA
## 2 c1 s1 2003 214.87 0.6749  22.17     NA
## 3 c1 s1 2004 -31.02 0.2144 -68.24  -61.2

tfm(exports, fgrowth(list(v = v), -1:2, g = list(c, s), t = y)) |> head(3)

##    c  s    y      v  FG1.v   G1.v L2G1.v
## 1 c1 s1 2002 0.5525 -18.15     NA     NA
## 2 c1 s1 2003 0.6749 214.87  22.17     NA
## 3 c1 s1 2004 0.2144 -31.02 -68.24  -61.2

###################################################
### code chunk number 20: example_growth_continued
###################################################
A <- exports |> fgroup_by(c, s) |> fmutate(gv = G(v, t = y)) |> fungroup()
head(B <- exports |> fmutate(gv = G(v, g = list(c, s), t = y)), 4)

##    c  s    y      v     gv
## 1 c1 s1 2002 0.5525     NA
## 2 c1 s1 2003 0.6749  22.17
## 3 c1 s1 2004 0.2144 -68.24
## 4 c1 s1 2005 0.3108  44.98

identical(A, B)

## [1] TRUE

###################################################
### code chunk number 21: indexing
###################################################
exportsi <- exports |> findex_by(c, s, y)
exportsi |> G(0:1) |> head(5)

##    c  s    y      v   G1.v
## 1 c1 s1 2002 0.5525     NA
## 2 c1 s1 2003 0.6749  22.17
## 3 c1 s1 2004 0.2144 -68.24
## 4 c1 s1 2005 0.3108  44.98
## 5 c1 s1 2006 1.1740 277.76
## 
## Indexed by:  c.s [1] | y [5 (10)]

exportsi |> findex() |> print(2)

##     c.s    y
## 1 c1.s1 2002
## 2 c1.s1 2003
## ---                
## 499 c10.s10 2007
## 500 c10.s10 2009
## 
## c.s [100] | y [10]

###################################################
### code chunk number 22: indexing_2
###################################################
vi <- exportsi$v; str(vi, width = 70, strict = "cut")

##  'indexed_series' num [1:500] 0.552 0.675 0.214 0.311 1.174 ...
##  - attr(*, "index_df")=Classes 'index_df', 'pindex' and 'data.frame'..
##   ..$ c.s: Factor w/ 100 levels "c1.s1","c2.s1",..: 1 1 1 1 1 1 1 1 ..
##   ..$ y  : Ord.factor w/ 10 levels "2001"<"2002"<..: 2 3 4 5 6 7 8 9..
##   ..- attr(*, "nam")= chr [1:3] "c" "s" "y"

is_irregular(vi)

## [1] TRUE

vi |> psmat() |> head(3)

##       2001  2002  2003  2004  2005 2006  2007   2008  2009  2010
## c1.s1   NA 0.552 0.675 0.214 0.311 1.17 0.619 0.1127 0.917 0.223
## c2.s1   NA 0.795    NA    NA 0.237   NA    NA 0.0585 0.818    NA
## c3.s1   NA 0.709 0.268 1.464    NA   NA 0.467 0.1193 0.467    NA

fdiff(vi) |> psmat() |> head(3)

##       2001 2002   2003   2004   2005  2006   2007   2008  2009   2010
## c1.s1   NA   NA  0.122 -0.461 0.0964 0.863 -0.555 -0.506 0.804 -0.694
## c2.s1   NA   NA     NA     NA     NA    NA     NA     NA 0.759     NA
## c3.s1   NA   NA -0.441  1.196     NA    NA     NA -0.348 0.348     NA

###################################################
### code chunk number 23: indexing_3
###################################################
settransform(exportsi, v_ld = Dlog(v))
lm(v_ld ~ L(v_ld, 1:2), exportsi) |> summary() |> coef() |> round(3)

##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)      -0.008      0.141  -0.058    0.954
## L(v_ld, 1:2)L1   -0.349      0.115  -3.042    0.004
## L(v_ld, 1:2)L2   -0.033      0.154  -0.215    0.831

###################################################
### code chunk number 24: rollmean
###################################################
BY(vi, findex(vi)$c.s, data.table::frollmean, 5) |> head(10)

##  [1]     NA     NA     NA     NA 0.5853 0.5986 0.4861 0.6267 0.6092     NA
## 
## Indexed by:  c.s [2] | y [9 (10)]

###################################################
### code chunk number 25: joins
###################################################
teacher <- data.frame(id = 1:4, names = c("John", "Jane", "Bob", "Carl"),
                      age = c(35, 32, 42, 67), subject = c("Math", "Econ", "Stats", "Trade"))
course <- data.frame(id = c(1, 2, 2, 3, 5), semester = c(1, 1, 2, 1, 2),
                     course = c("Math I", "Microecon", "Macroecon", "Stats I", "History"))
join(teacher, course, on = "id")

## left join: teacher[id] 3/4 (75%) <1:1st> course[id] 3/5 (60%)

##   id names age subject semester    course
## 1  1  John  35    Math        1    Math I
## 2  2  Jane  32    Econ        1 Microecon
## 3  3   Bob  42   Stats        1   Stats I
## 4  4  Carl  67   Trade       NA      <NA>

###################################################
### code chunk number 26: joins_2
###################################################
join(teacher, course, how = "full", multiple = TRUE, column = TRUE)

## full join: teacher[id] 3/4 (75%) <1:1.33> course[id] 4/5 (80%)

##   id names age subject semester    course   .join
## 1  1  John  35    Math        1    Math I matched
## 2  2  Jane  32    Econ        1 Microecon matched
## 3  2  Jane  32    Econ        2 Macroecon matched
## 4  3   Bob  42   Stats        1   Stats I matched
## 5  4  Carl  67   Trade       NA      <NA> teacher
## 6  5  <NA>  NA    <NA>        2   History  course

###################################################
### code chunk number 27: joins_3
###################################################
join(teacher, course, multiple = TRUE, attr = "jn") |> attr("jn") |>
  str(strict.width = "cut", width = 70)

## left join: teacher[id] 3/4 (75%) <1:1.33> course[id] 4/5 (80%)
## List of 3
##  $ call   : language join(x = teacher, y = course, multiple = TRUE,"..
##  $ on.cols:List of 2
##   ..$ x: chr "id"
##   ..$ y: chr "id"
##  $ match  : 'qG' int [1:5] 1 2 3 4 NA
##   ..- attr(*, "N.nomatch")= int 1
##   ..- attr(*, "N.groups")= int 5
##   ..- attr(*, "N.distinct")= int 4

###################################################
### code chunk number 28: joins_4
###################################################
join(teacher, course, on = "id", validate = "1:1") |>
  tryCatch(error = function(e) strwrap(e) |> cat(sep = "\n"))

## Error in join(teacher, course, on = "id", validate = "1:1"): Join is
## not 1:1: teacher (x) is unique on the join columns; course (y) is
## not unique on the join columns

###################################################
### code chunk number 29: joins_4.5
###################################################
join(teacher, course, on = "id", require = list(x = 0.8)) |>
  tryCatch(error = function(e) substr(e, 102, 200) |> cat())

## Matched 75.0% of records in table teacher (x), but 80.0% is required

###################################################
### code chunk number 30: joins_5
###################################################
for (h in c("semi", "anti")) join(teacher, course, how = h) |> print()

## semi join: teacher[id] 3/4 (75%) <1:1st> course[id] 3/5 (60%)
##   id names age subject
## 1  1  John  35    Math
## 2  2  Jane  32    Econ
## 3  3   Bob  42   Stats
## anti join: teacher[id] 3/4 (75%) <1:1st> course[id] 3/5 (60%)
##   id names age subject
## 1  4  Carl  67   Trade

###################################################
### code chunk number 31: joins_6
###################################################
course$names <- teacher$names[course$id]
join(teacher, course, on = "id", how = "inner", multiple = TRUE)

## inner join: teacher[id] 3/4 (75%) <1:1.33> course[id] 4/5 (80%)
## duplicate columns: names => renamed using suffix '_course' for y

##   id names age subject semester    course names_course
## 1  1  John  35    Math        1    Math I         John
## 2  2  Jane  32    Econ        1 Microecon         Jane
## 3  2  Jane  32    Econ        2 Macroecon         Jane
## 4  3   Bob  42   Stats        1   Stats I          Bob

###################################################
### code chunk number 32: joins_7
###################################################
join(teacher, course, on = "id", multiple = TRUE, drop.dup.cols = "y")

## left join: teacher[id] 3/4 (75%) <1:1.33> course[id] 4/5 (80%)
## duplicate columns: names => dropped from y

##   id names age subject semester    course
## 1  1  John  35    Math        1    Math I
## 2  2  Jane  32    Econ        1 Microecon
## 3  2  Jane  32    Econ        2 Macroecon
## 4  3   Bob  42   Stats        1   Stats I
## 5  4  Carl  67   Trade       NA      <NA>

###################################################
### code chunk number 33: pivots
###################################################
data <- GGDC10S |>
  fmutate(Label = ifelse(Variable == "VA", "Value Added", "Employment")) |>
  fsubset(is.finite(AGR), Country, Variable, Label, Year, AGR:MAN)
namlab(data, N = TRUE, Ndistinct = TRUE, class = TRUE)

##   Variable     Class    N Ndist         Label
## 1  Country character 4364    43       Country
## 2 Variable character 4364     2      Variable
## 3    Label character 4364     2          <NA>
## 4     Year   numeric 4364    67          Year
## 5      AGR   numeric 4364  4353   Agriculture
## 6      MIN   numeric 4355  4224        Mining
## 7      MAN   numeric 4355  4353 Manufacturing

###################################################
### code chunk number 34: pivot_longer
###################################################
head(dl <- pivot(data, ids = 1:4, names = list("Sectorcode", "Value"),
                 labels = "Sector", how = "longer"))

##   Country Variable       Label Year Sectorcode      Sector Value
## 1     BWA       VA Value Added 1964        AGR Agriculture 16.30
## 2     BWA       VA Value Added 1965        AGR Agriculture 15.73
## 3     BWA       VA Value Added 1966        AGR Agriculture 17.68
## 4     BWA       VA Value Added 1967        AGR Agriculture 19.15
## 5     BWA       VA Value Added 1968        AGR Agriculture 21.10
## 6     BWA       VA Value Added 1969        AGR Agriculture 21.86

###################################################
### code chunk number 35: pivot_wider
###################################################
head(dw <- pivot(data, c("Country", "Year"), names = "Variable",
                 labels = "Label", how = "wider"))

##   Country Year AGR_VA AGR_EMP MIN_VA MIN_EMP MAN_VA MAN_EMP
## 1     BWA 1964  16.30   152.1  3.494  1.9400 0.7366   2.420
## 2     BWA 1965  15.73   153.3  2.496  1.3263 1.0182   2.330
## 3     BWA 1966  17.68   153.9  1.970  1.0022 0.8038   1.282
## 4     BWA 1967  19.15   155.1  2.299  1.1192 0.9378   1.042
## 5     BWA 1968  21.10   156.2  1.839  0.7855 0.7503   1.069
## 6     BWA 1969  21.86   157.4  5.245  2.0314 2.1396   2.124

namlab(dw)

##   Variable                       Label
## 1  Country                     Country
## 2     Year                        Year
## 3   AGR_VA   Agriculture - Value Added
## 4  AGR_EMP    Agriculture - Employment
## 5   MIN_VA        Mining - Value Added
## 6  MIN_EMP         Mining - Employment
## 7   MAN_VA Manufacturing - Value Added
## 8  MAN_EMP  Manufacturing - Employment

###################################################
### code chunk number 36: pivot_recast
###################################################
head(dr <- pivot(data, c("Country", "Year"),
                 names = list(from = "Variable", to = "Sectorcode"),
                 labels = list(from = "Label", to = "Sector"), how = "recast"))

##   Country Year Sectorcode      Sector    VA   EMP
## 1     BWA 1964        AGR Agriculture 16.30 152.1
## 2     BWA 1965        AGR Agriculture 15.73 153.3
## 3     BWA 1966        AGR Agriculture 17.68 153.9
## 4     BWA 1967        AGR Agriculture 19.15 155.1
## 5     BWA 1968        AGR Agriculture 21.10 156.2
## 6     BWA 1969        AGR Agriculture 21.86 157.4

vlabels(dr)[3:6]

##    Sectorcode        Sector            VA           EMP 
##            NA            NA "Value Added"  "Employment"

###################################################
### code chunk number 37: pivot_agg
###################################################
head(dr_agg <- pivot(data, "Country", c("AGR", "MIN", "MAN"), how = "recast",
                     names = list(from = "Variable", to = "Sectorcode"),
                     labels = list(from = "Label", to = "Sector"), FUN = "mean"))

##   Country Sectorcode      Sector       VA      EMP
## 1     BWA        AGR Agriculture    462.2   188.06
## 2     ETH        AGR Agriculture  34389.9 17624.34
## 3     GHA        AGR Agriculture   1549.4  3016.04
## 4     KEN        AGR Agriculture 139705.9  5348.91
## 5     MWI        AGR Agriculture  28512.6  2762.62
## 6     MUS        AGR Agriculture   3819.6    59.34

###################################################
### code chunk number 38: list_proc
###################################################
d_list <- GGDC10S |> rsplit( ~ Country + Variable)
d_list$ARG$EMP$AGR[1:12]

##  [1] 1800 1835 1731 2030 1889 1843 1789 1724 1678 1725 1650 1553

###################################################
### code chunk number 39: list_proc_2
###################################################
results <- list()
for (country in c("ARG", "BRA", "CHL")) {
  for (variable in c("EMP", "VA")) {
    m <- lm(log(AGR+1) ~ log(MIN+1) + log(MAN+1) + Year,
            data = d_list[[country]][[variable]])
    results[[country]][[variable]] <- list(model = m, BIC = BIC(m),
                                           summary = summary(m))
  }
}


###################################################
### code chunk number 40: list_proc_3
###################################################
str(r_sq <- results |> get_elem("r.squared"))

## List of 3
##  $ ARG:List of 2
##   ..$ EMP: num 0.907
##   ..$ VA : num 1
##  $ BRA:List of 2
##   ..$ EMP: num 0.789
##   ..$ VA : num 0.999
##  $ CHL:List of 2
##   ..$ EMP: num 0.106
##   ..$ VA : num 0.999

rowbind(r_sq, idcol = "Country", return = "data.frame")

##   Country    EMP     VA
## 1     ARG 0.9068 0.9996
## 2     BRA 0.7888 0.9988
## 3     CHL 0.1058 0.9991

###################################################
### code chunk number 41: list_proc_3.5
###################################################
r_sq |> t_list() |> rowbind(idcol = "Variable", return = "data.frame")

##   Variable    ARG    BRA    CHL
## 1      EMP 0.9068 0.7888 0.1058
## 2       VA 0.9996 0.9988 0.9991

###################################################
### code chunk number 42: list_proc_4
###################################################
results$ARG$EMP$summary$coefficients

##               Estimate Std. Error  t value  Pr(>|t|)
## (Intercept)  26.583617  1.2832583  20.7157 1.747e-28
## log(MIN + 1)  0.083168  0.0352493   2.3594 2.169e-02
## log(MAN + 1) -0.064413  0.0767614  -0.8391 4.048e-01
## Year         -0.009683  0.0005556 -17.4278 1.003e-24

###################################################
### code chunk number 43: list_proc_5
###################################################
results |> get_elem("coefficients") |> get_elem(is.matrix) |>
  unlist2d(idcols = c("Country", "Variable"),
           row.names = "Covariate") |> head(3)

##   Country Variable    Covariate Estimate Std. Error t value  Pr(>|t|)
## 1     ARG      EMP  (Intercept) 26.58362    1.28326 20.7157 1.747e-28
## 2     ARG      EMP log(MIN + 1)  0.08317    0.03525  2.3594 2.169e-02
## 3     ARG      EMP log(MAN + 1) -0.06441    0.07676 -0.8391 4.048e-01

###################################################
### code chunk number 44: varying
###################################################
varying(wlddev, ~ iso3c)

##       country          date          year        decade        region 
##         FALSE          TRUE          TRUE          TRUE         FALSE 
##        income          OECD         PCGDP        LIFEEX          GINI 
##         FALSE         FALSE          TRUE          TRUE          TRUE 
##           ODA           POP  center_PCGDP center_LIFEEX 
##          TRUE          TRUE          TRUE          TRUE

###################################################
### code chunk number 45: pdec
###################################################
LIFEEXi <- reindex(wlddev$LIFEEX, wlddev$iso3c)
all.equal(fvar(LIFEEXi), fvar(fbetween(LIFEEXi)) + fvar(fwithin(LIFEEXi)))

## [1] TRUE

###################################################
### code chunk number 46: qsu_1
###################################################
qsu(LIFEEXi)

##              N/T     Mean       SD      Min      Max
## Overall    11670  64.2963  11.4764   18.907  85.4171
## Between      207  64.9537   9.8936  40.9663  85.4171
## Within   56.3768  64.2963   6.0842  32.9068  84.4198

###################################################
### code chunk number 47: qsu_2
###################################################
wlda15 <- wlddev |> fsubset(year >= 2015) |> fgroup_by(iso3c) |> flast()
qsu(wlda15, by = LIFEEX ~ income, w = ~ POP)

##                       N       WeightSum     Mean      SD      Min     Max
## High income          68  1.19122607e+09   80.879   2.441  70.6224  85.078
## Low income           29      694'893773  63.8061  3.9266   53.283  72.697
## Lower middle income  47  3.06353648e+09  68.7599  4.7055   54.331  76.699
## Upper middle income  55  2.67050662e+09  75.9476  2.3895   58.735  80.279

###################################################
### code chunk number 48: descr
###################################################
descr(wlda15, by = income + LIFEEX ~ OECD, w = ~ replace_na(POP / 1e6))

## Dataset: wlda15, 2 Variables, N = 216, WeightSum = 7620.902563
## Grouped by: OECD [2]
##          N   Perc  WeightSum   Perc
## FALSE  180  83.33    6311.15  82.81
## TRUE    36  16.67    1309.75  17.19
## -----------------------------------------------------------------------------
## income (factor): Income Level
## Statistics (WeightSum = 7621, 0% NAs)
##        WeightSum   Perc  Ndist
## FALSE    6311.15  82.81      4
## TRUE     1309.75  17.19      2
## 
## Table (WeightSum Perc)
##                          FALSE       TRUE      Total
## Lower middle income  3064 48.5     0  0.0  3064 40.2
## Upper middle income  2460 39.0   211 16.1  2671 35.0
## High income            93  1.5  1099 83.9  1192 15.6
## Low income            695 11.0     0  0.0   695  9.1
## -----------------------------------------------------------------------------
## LIFEEX (numeric): Life expectancy at birth, total (years)
## Statistics (N = 199, 7.87% NAs)
##          N  Ndist  WeightSum   Perc   Mean    SD    Min    Max   Skew  Kurt
## FALSE  163    164    6310.41  82.81  71.14  5.77  53.28  85.08  -0.99  3.76
## TRUE    36     36    1309.75  17.19  80.32  2.77  75.05  84.36  -0.29  2.11
## 
## Quantiles
##           1%     5%    10%    25%    50%    75%    90%    95%    99%
## FALSE  54.69  59.38  63.67  69.64  71.71  76.89  76.91  76.91  77.94
## TRUE   75.07  75.14  76.03  78.77  80.86  82.86  83.61  84.05   84.3
## -----------------------------------------------------------------------------

###################################################
### code chunk number 49: qtab
###################################################
wlda15 |> with(qtab(OECD, income))

##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE          45         30                  47                  58
##   TRUE           34          0                   0                   2

###################################################
### code chunk number 50: qtab_2
###################################################
wlda15 |> with(qtab(OECD, income, w = POP / 1e6))

##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE       93.01     694.89             3063.54             2459.71
##   TRUE      1098.75       0.00                0.00              211.01

###################################################
### code chunk number 51: qtab_3
###################################################
wlda15 |> with(qtab(OECD, income, w = LIFEEX, wFUN = fmean))

##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE       78.75      62.81               68.30               73.81
##   TRUE        81.09                                              76.37

###################################################
### code chunk number 52: qtab_4
###################################################
wlda15 |> with(qtab(OECD, income, w = LIFEEX, wFUN = fmean,
                    wFUN.args = list(w = POP)))

##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE       77.91      63.81               68.76               75.93
##   TRUE        81.13                                              76.10

###################################################
### code chunk number 53: bench_1
###################################################
rm(list = setdiff(ls(), c("bmark", "vars", "r_opts", "oldopts")))
gc()

##           used (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 1512011 80.8    3842590 205.3         NA  3842590 205.3
## Vcells 9687339 74.0   36617092 279.4      49152 81718535 623.5

set.seed(101);
int <- 1:1000; g_int <- sample.int(1000, 1e7, replace = TRUE)
char <- c(letters, LETTERS, month.abb, month.name)
g_char <- sample(char <- outer(char, char, paste0), 1e7, TRUE)
bmark(base = unique(g_int), collapse = funique(g_int))

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##   expression     min  median mem_alloc n_itr n_gc total_time
## 1       base 44.09ms 46.54ms   166.2MB    63   63         3s
## 2   collapse  6.15ms  9.23ms    38.2MB   333   48      3.01s

bmark(base = unique(g_char), collapse = funique(g_char))

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##   expression    min median mem_alloc n_itr n_gc total_time
## 1       base 69.8ms 71.8ms   166.2MB    42   42      3.01s
## 2   collapse   14ms 17.1ms    38.2MB   169   24         3s

bmark(base = match(g_int, int), collapse = fmatch(g_int, int))

##   expression     min  median mem_alloc n_itr n_gc total_time
## 1       base 16.26ms 17.18ms    76.3MB    99   61      1.71s
## 2   collapse  6.96ms  7.58ms    38.2MB   302   50       2.3s

bmark(base = match(g_char, char), data.table =
        chmatch(g_char, char), collapse = fmatch(g_char, char))

##   expression    min median mem_alloc n_itr n_gc total_time
## 1       base 49.4ms 51.1ms   114.5MB    22   34      1.12s
## 2 data.table   32ms 33.1ms    38.1MB    74   15      2.45s
## 3   collapse 14.7ms 15.7ms    38.1MB   148   30      2.32s

###################################################
### code chunk number 54: bench_2
###################################################
set_collapse(na.rm = FALSE, sort = FALSE, nthreads = 4)
m <- matrix(rnorm(1e7), ncol = 1000)
bmark(base = colSums(m), collapse = fsum(m))

##   expression      min   median mem_alloc n_itr n_gc total_time
## 1       base   5.42ms   5.74ms    7.86KB   520    0         3s
## 2   collapse 294.05µs 362.28µs    7.86KB  8182    0         3s

bmark(base = colMeans(m), collapse = fmean(m))

##   expression      min   median mem_alloc n_itr n_gc total_time
## 1       base   5.38ms   5.43ms    7.86KB   545    0         3s
## 2   collapse 305.98µs 367.11µs    7.86KB  7804    1      2.99s

bmark(matrixStats = matrixStats::colMedians(m), collapse = fmedian(m))

##    expression    min median mem_alloc n_itr n_gc total_time
## 1 matrixStats   72ms 73.5ms   86.04KB    41    0      3.03s
## 2    collapse 19.2ms 19.6ms    7.86KB   153    0         3s

###################################################
### code chunk number 55: bench_2.1
###################################################
g <- sample.int(1e3, 1e4, TRUE)
bmark(base = rowsum(m, g), collapse = fsum(m, g))

##   expression      min median mem_alloc n_itr n_gc total_time
## 1       base   5.09ms 5.57ms    7.85MB   490   20      2.75s
## 2   collapse 908.48µs 1.17ms    7.67MB  1905   76      2.36s

###################################################
### code chunk number 56: bench_flights_setup
###################################################
fastverse_extend(nycflights23, dplyr, data.table); setDTthreads(4)
list(flights, airports, airlines, planes, weather) |> sapply(nrow)

## [1] 435352   1255     14   4840  26207

flights |> fselect(month, day, origin, dest) |> fnunique()

## [1] 75899

###################################################
### code chunk number 57: bench_flights_summarise
###################################################
vars <- .c(dep_delay, arr_delay, air_time, distance, hour, minute)
bmark(dplyr = flights |> group_by(month, day, origin, dest) |>
        summarise(across(all_of(vars), sum), .groups = "drop"),
      data.table = qDT(flights)[, lapply(.SD, sum), .SDcols = vars,
                                by = .(month, day, origin, dest)],
      collapse = flights |> fgroup_by(month, day, origin, dest) |>
        get_vars(vars) |> fsum())

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##   expression      min   median mem_alloc n_itr n_gc total_time
## 1      dplyr 357.25ms  484.3ms   48.56MB     7   69      3.21s
## 2 data.table   7.67ms   9.59ms   18.93MB   284   40      3.01s
## 3   collapse   3.14ms   3.84ms    9.11MB   663   42         3s

###################################################
### code chunk number 58: bench_flights_summarise_2
###################################################
bmark(dplyr_mean = flights |> group_by(month, day, origin, dest) |>
        summarise(across(all_of(vars), mean), .groups = "drop"),
      data.table_mean = qDT(flights)[, lapply(.SD, mean), .SDcols = vars,
                                     by = .(month, day, origin, dest)],
      collapse_mean = flights |> fgroup_by(month, day, origin, dest) |>
        get_vars(vars) |> fmean())

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##        expression    min median mem_alloc n_itr n_gc total_time
## 1      dplyr_mean  1.16s  1.19s   48.56MB     3   70      3.65s
## 2 data.table_mean 7.59ms 9.27ms   18.93MB   284   39         3s
## 3   collapse_mean 3.07ms 3.84ms    9.11MB   658   41         3s

bmark(dplyr_median = flights |> group_by(month, day, origin, dest) |>
        summarise(across(all_of(vars), median), .groups = "drop"),
      data.table_median = qDT(flights)[, lapply(.SD, median), .SDcols = vars,
                                       by = .(month, day, origin, dest)],
      collapse_median = flights |> fgroup_by(month, day, origin, dest) |>
        get_vars(vars) |> fmedian())

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##          expression     min  median mem_alloc n_itr n_gc total_time
## 1      dplyr_median   4.32s   4.32s    52.8MB     1  102      4.32s
## 2 data.table_median 21.73ms 23.17ms    18.9MB   117   16      3.02s
## 3   collapse_median  9.49ms 10.55ms    11.1MB   267   21      3.01s

###################################################
### code chunk number 59: bench_flights_summarise_3
###################################################
bmark(dplyr = flights |> group_by(month, day, origin, dest) |>
        summarise(rng = max(arr_delay) - min(arr_delay), .groups = "drop"),
      data.table = qDT(flights)[, .(rng = max(arr_delay) - min(arr_delay)),
                                by = .(month, day, origin, dest)],
      collapse = flights |> fgroup_by(month, day, origin, dest) |>
        fsummarise(rng = fmax(arr_delay) - fmin(arr_delay)))

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##   expression     min   median mem_alloc n_itr n_gc total_time
## 1      dplyr 89.84ms 105.25ms   20.18MB    26   59      3.17s
## 2 data.table  55.4ms  58.58ms    5.77MB    51   26      3.01s
## 3   collapse  5.34ms   5.86ms     6.8MB   476   22         3s

###################################################
### code chunk number 60: bench_flights_join_setup
###################################################
flights |> join(weather, on = c("origin", "time_hour")) |>
  join(planes, on = "tailnum") |> join(airports, on = c(dest = "faa")) |>
  join(airlines, on = "carrier") |> dim() |>
  capture.output() |> substr(1, 76) |> cat(sep = "\n")

## left join: flights[origin, time_hour] 434526/435352 (99.8%) <21.94:1st> weat
## duplicate columns: year, month, day, hour => renamed using suffix '_weather'
## left join: x[tailnum] 424068/435352 (97.4%) <87.62:1st> planes[tailnum] 4840
## duplicate columns: year => renamed using suffix '_planes' for y
## left join: x[dest] 435352/435352 (100%) <3689.42:1st> airports[faa] 118/1255
## left join: x[carrier] 435352/435352 (100%) <31096.57:1st> airlines[carrier] 
## duplicate columns: name => renamed using suffix '_airlines' for y
## [1] 435352     48

###################################################
### code chunk number 61: bench_flights_join
###################################################
rm(list = setdiff(ls(), c("bmark", "vars", "r_opts", "oldopts")))
gc()

##           used (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 1512387 80.8    3842590 205.3         NA  3842590 205.3
## Vcells 9688147 74.0   49854626 380.4      49152 81718535 623.5

bmark(
  dplyr_joins = flights |>
    left_join(weather, by = c("origin", "time_hour"), multiple = "first") |>
    left_join(planes, by = "tailnum", multiple = "first") |>
    left_join(airports, by = c(dest = "faa"), multiple = "first") |>
    left_join(airlines, by = "carrier", multiple = "first"),

  data.table_joins = qDT(airlines)[qDT(airports)[qDT(planes)[qDT(weather)[qDT(flights), on = c("origin", "time_hour"), mult = "first"], on = "tailnum", mult = "first"], on = c("faa" = "dest"), mult = "first"], on = "carrier", mult = "first"],

  collapse_joins = flights |>
    join(weather, on = c("origin", "time_hour"), verbose = 0) |>
    join(planes, on = "tailnum", verbose = 0) |>
    join(airports, on = c("dest" = "faa"), verbose = 0) |>
    join(airlines, on = "carrier", verbose = 0)
)

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##         expression     min median mem_alloc n_itr n_gc total_time
## 1      dplyr_joins   149ms  222ms   558.9MB    15   52      3.04s
## 2 data.table_joins 245.9ms  258ms   490.4MB    12   49       3.1s
## 3   collapse_joins  10.1ms   13ms    89.7MB   183   99      3.01s

###################################################
### code chunk number 62: bench_pivot_long
###################################################
bmark(tidyr = tidyr::pivot_longer(flights, cols = vars),
      data.table = qDT(flights) |> melt(measure = vars),
      collapse = pivot(flights, values = vars))

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##   expression     min  median mem_alloc n_itr n_gc total_time
## 1      tidyr 132.8ms   140ms     251MB    21   41      3.02s
## 2 data.table  37.4ms  48.8ms     209MB    47   35      3.01s
## 3   collapse  13.9ms  27.7ms     209MB    72   87      3.09s

###################################################
### code chunk number 63: bench_pivot_wide
###################################################
bmark(tidyr = tidyr::pivot_wider(flights, id_cols = .c(month, day, dest),
                                 names_from = "origin", values_from = vars, values_fn = sum),
      data.table = dcast(qDT(flights), month + day + dest ~ origin,
                         value.var = vars, fun = sum),
      collapse_fsum = pivot(flights, .c(month, day, dest), vars,
                            "origin", how = "wider", FUN = fsum),
      collapse_itnl = pivot(flights, .c(month, day, dest), vars,
                            "origin", how = "wider", FUN = "sum"))

## Warning: Some expressions had a GC in every iteration; so filtering is
## disabled.

##      expression      min   median mem_alloc n_itr n_gc total_time
## 1         tidyr 349.02ms 394.22ms   142.4MB     8   63      3.28s
## 2    data.table 226.07ms 231.09ms      21MB    13   57      3.02s
## 3 collapse_fsum   5.92ms   7.48ms    39.1MB   311   58         3s
## 4 collapse_itnl   3.77ms   4.71ms    12.4MB   564   37         3s

###################################################
### code chunk number 64: reset_options
###################################################
options(r_opts)
set_collapse(oldopts)
rm(list = ls()); gc()

##           used (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 1512525 80.8    3842590 205.3         NA  3842590 205.3
## Vcells 9688354 74.0   42341140 323.1      49152 87703454 669.2

###################################################
### Print Session Information
###################################################

sessionInfo()

## R version 4.4.3 (2025-02-28)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.1.4        nycflights23_0.2.0 collapse_2.1.2    
## [4] kit_0.0.19         magrittr_2.0.3     data.table_1.17.0 
## [7] fastverse_0.3.4   
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.4.3    crayon_1.5.3      tidyselect_1.2.1  Rcpp_1.0.14      
##  [5] stringr_1.5.1     parallel_4.4.3    snakecase_0.11.1  tidyr_1.3.1      
##  [9] fastmap_1.2.0     R6_2.6.1          commonmark_1.9.5  generics_0.1.3   
## [13] knitr_1.50        tibble_3.2.1      janitor_2.2.1     lubridate_1.9.4  
## [17] pillar_1.10.1     rlang_1.1.6       stringi_1.8.7     litedown_0.7     
## [21] xfun_0.52         timechange_0.3.0  cli_3.6.5         withr_3.0.2      
## [25] digest_0.6.37     rstudioapi_0.17.1 markdown_2.0      lifecycle_1.0.4  
## [29] vctrs_0.6.5       bench_1.1.4       evaluate_1.0.3    glue_1.8.0       
## [33] profmem_0.6.0     rmarkdown_2.29    purrr_1.0.4       htmltools_0.5.8.1
## [37] matrixStats_1.5.0 tools_4.4.3       pkgconfig_2.0.3

SebKrantz/collapse documentation built on June 12, 2025, 1:44 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

SebKrantz/collapse
Advanced and Fast Data Transformation

misc/JSS article/final/jss5278/jss5278.md
In SebKrantz/collapse: Advanced and Fast Data Transformation

R Package Documentation

Browse R Packages

We want your feedback!

SebKrantz/collapse Advanced and Fast Data Transformation

misc/JSS article/final/jss5278/jss5278.md In SebKrantz/collapse: Advanced and Fast Data Transformation

R Package Documentation

Browse R Packages

We want your feedback!

SebKrantz/collapse
Advanced and Fast Data Transformation

misc/JSS article/final/jss5278/jss5278.md
In SebKrantz/collapse: Advanced and Fast Data Transformation