Use data.table the tidy way: An ultimate tutorial of tidyfst"

knitr::opts_chunk$set(
  collapse = TRUE,
  #eval = FALSE,
  comment = "#>"
)

I love the tidy syntax of dplyr and the ultimate speed of data.table. Why not take the both? That is why I have started the work of tidyfst, bridging the tidy syntax and computation performance via translation. This tool is especially friendly for dplyr users who want to learn some data.table, but data.table could also benefit from it (more or less).

A great comparison of data.table and dplyr was displayed at https://atrebas.github.io/post/2019-03-03-datatable-dplyr/ (thanks to Atrebas). I love this tutorial very much because it dig rather deep into many features from both packages. Here I'll try to implement all operations from that tutorial, and the potential users could find why they would prefer tidyfst for some (if not most) tasks.

The below examples have all been checked with tidyfst. Now let's begin.

Create example data

library(tidyfst)

set.seed(1L)

## Create a data table
DF <- data.table(V1 = rep(c(1L, 2L), 5)[-10],
                V2 = 1:9,
                V3 = c(0.5, 1.0, 1.5),
                V4 = rep(LETTERS[1:3], 3))
copy(DF) -> DT

class(DF)
DF

Basic operations

Filter rows

### Filter rows using indices
slice_dt(DF, 3:4)

### Discard rows using negative indices
slice_dt(DF, -(3:7))

### Filter rows using a logical expression
filter_dt(DF, V2 > 5)
filter_dt(DF, V4 %in% c("A", "C"))
filter_dt(DF, V4 %chin% c("A", "C")) # fast %in% for character

### Filter rows using multiple conditions
filter_dt(DF, V1 == 1, V4 == "A")
# equals to
filter_dt(DF, V1 == 1 & V4 == "A")

### Filter unique rows
distinct_dt(DF) # unique(DF)
distinct_dt(DF, V1,V4)

### Discard rows with missing values
drop_na_dt(DF) # na.omit(DF)

### Other filters
sample_n_dt(DF, 3)      # n random rows
sample_frac_dt(DF, 0.5) # fraction of random rows
slice_max_dt(DF, V1,1)     # top n entries (includes equals)

filter_dt(DT,V4 %like% "^B")
filter_dt(DT,V2 %between% c(3, 5))
filter_dt(DT,between(V2, 3, 5, incbounds = FALSE))
filter_dt(DT,V2 %inrange% list(-1:1, 1:3))  # see also ?inrange

Sort rows

### Sort rows by column
arrange_dt(DF, V3)

### Sort rows in decreasing order
arrange_dt(DF, -V3)

### Sort rows based on several columns
arrange_dt(DF, V1, -V2)

Select columns

### Select one column using an index (not recommended)
pull_dt(DT,3) # returns a vector
select_dt(DT,3) # returns a data.table

### Select one column using column name
select_dt(DF, V2) # returns a data.table
pull_dt(DF, V2)   # returns a vector

### Select several columns
select_dt(DF, V2, V3, V4)
select_dt(DF, V2:V4) # select columns between V2 and V4

### Exclude columns
select_dt(DF, -V2, -V3)

### Select/Exclude columns using a character vector
cols <- c("V2", "V3")
select_dt(DF,cols = cols)
select_dt(DF,cols = cols,negate = TRUE)

### Other selections
select_dt(DF, cols = paste0("V", 1:2))
relocate_dt(DF, V4) # reorder columns
select_dt(DF, "V")
select_dt(DF, "3$")
select_dt(DF, ".2")
select_dt(DF, "V1")
select_dt(DF, -"^V2")
# remove variables using "-" prior to function

Summarise data

### Summarise one column
summarise_dt(DF, sum(V1)) # returns a data.table
summarise_dt(DF, sumV1 = sum(V1)) # returns a data.table

### Summarise several columns
summarise_dt(DF, sum(V1), sd(V3))

### Summarise several columns and assign column names
DF %>%
  summarise_dt(sumv1 = sum(V1),
              sdv3  = sd(V3))

### Summarise a subset of rows
DT[1:4, sum(V1)]
DF %>%
  slice_dt(1:4) %>%
  summarise_dt(sum(V1))

### Misc
summarise_dt(DF, nth(V3,1))
summarise_dt(DF, nth(V3,-1))
summarise_dt(DF, nth(V3, 5))
summarise_dt(DF, uniqueN(V4))
uniqueN(DF)

group computation (by)

### By group
# not recommended
DF %>%
  group_dt(
    by = V4,
    summarise_dt(sumV2 = sum(V2))
  )
# recommended
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = V4)


### By several groups
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = .(V1,V4))

### Calling function in by
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = tolower(V4))


### Assigning column name in by
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = .(abc = tolower(V4)))


### Using a condition in by
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = V4 == "A")



### By on a subset of rows
DF %>% 
  slice_dt(1:5) %>% 
  summarise_dt(sumV1 = sum(V1),by = V4)

### Count number of observations for each group
count_dt(DF, V4)

### Add a column with number of observations for each group
add_count_dt(DF, V1)

### Retrieve the first/last/nth observation for each group
DF %>% summarise_dt(by = V4,nth(V2,1))
DF %>% summarise_dt(by = V4,nth(V2,-1))
DF %>% summarise_dt(by = V4,nth(V2,2))

Going further

Advanced columns manipulation

### Summarise all the columns
summarise_vars(DT,.func = max)

### Summarise several columns
summarise_vars(DT,c("V1", "V2"),mean)

### Summarise several columns by group
DT %>% 
    summarise_vars(c("V1", "V2"),mean,by = V4)

## using patterns (regex)
DT %>% 
    summarise_vars("V1|V2",mean,by = V4)

## Summarise with more than one function by group
# when you can't find a way, you can always use `in_dt` to use data.table
DT %>% 
  in_dt(, by = V4, 
     c(lapply(.SD, sum),
       lapply(.SD, mean)))

### Summarise using a condition
summarise_vars(DF, is.numeric, mean)

### Modify all the columns
mutate_vars(DF, .func = rev)

### Modify several columns (dropping the others)
DF %>% 
  select_dt(cols = c("V1", "V2")) %>% 
  mutate_vars(.func = sqrt)
DF %>% 
  select_dt(-V4) %>% 
  mutate_vars(.func = exp)

### Modify several columns (keeping the others)
DF %>% 
  mutate_vars(c("V1", "V2"), sqrt)
DF %>% 
  mutate_vars(-"V4",  exp)

### Modify columns using a condition (dropping the others)
select_dt(DT,is.numeric)

### Modify columns using a condition (keeping the others)
mutate_vars(DT,is.numeric,as.integer)

### Use a complex expression
DF %>% 
  group_dt(
    by = V4,
    slice_dt(1:2) %>% 
      transmute_dt(V1 = V1,
            V2 = "X")
  )

### Use multiple expressions (with DT[,{j}])
DT %>% 
  in_dt(,{
      print(V1) #  comments here!
      print(summary(V1))
      x <- V1 + sum(V2)
     .(A = 1:.N, B = x) # last list returned as a data.table
     }
  )

Advanced use of by

### Select first/last/… row by group
DT %>% 
  group_dt(
    by = V4,
    head(1)
  )

DT %>% 
  group_dt(
    by = V4,
    tail(2)
  )

DT %>% 
  group_dt(
    by = V4,
    slice_dt(1,.N)
  )

### Select rows using a nested query
DF %>% 
  group_dt(
    by = V4,
    arrange_dt(V2) %>% 
      slice_dt(1)
  )

### Add a group counter column
DT %>% 
  mutate_dt(Grp = .GRP,by = .(V4, V1))

### Get row number of first (and last) observation by group
DT %>% summarise_dt(I = .I,by = V4)
DT %>% summarise_dt(I = .I[1],by = V4)
DT %>% summarise_dt(I = .I[c(1,.N)],by = V4)

### Handle list-columns by group

DT %>% 
  select_dt(V1,V4) %>% 
  chop_dt(V1) # return V1 as a list
DT %>% nest_dt(V4) # subsets of the data

### Grouping sets (multiple by at once)
# use data.table directly, tidyfst does not provide new methods for it yet
data.table::rollup(DT,
       .(SumV2 = sum(V2)),
       by = c("V1", "V4"))

data.table::rollup(DT,
       .(SumV2 = sum(V2), .N),
       by = c("V1", "V4"),
       id = TRUE)

data.table::cube(DT,
     .(SumV2 = sum(V2), .N),
     by = c("V1", "V4"),
     id = TRUE)

data.table::groupingsets(DT,
             .(SumV2 = sum(V2), .N),
             by   = c("V1", "V4"),
             sets = list("V1", c("V1", "V4")),
             id   = TRUE)

Miscellaneous

Read / Write data

tidyfst exports data.table::fread and data.table::fwrite directly.

### Write data to a csv file
fwrite(DT, "DT.csv")

### Write data to a tab-delimited file
fwrite(DT, "DT.txt", sep = "\t")

### Write list-column data to a csv file
fwrite(setDT(list(0, list(1:5))), "DT2.csv")
#
### Read a csv / tab-delimited file
fread("DT.csv")
# fread("DT.csv", verbose = TRUE) # full details
fread("DT.txt", sep = "\t")

### Read a csv file selecting / droping columns
fread("DT.csv", select = c("V1", "V4"))
fread("DT.csv", drop = "V4")
# NA
### Read and rbind several files
rbindlist(lapply(c("DT.csv", "DT.csv"), fread))
# c("DT.csv", "DT.csv") %>% lapply(fread) %>% rbindlist

Reshape data

### Melt data (from wide to long)
fsetequal(DT,DF)

mDT = DT %>% longer_dt(V3,V4)
mDF = DF %>% longer_dt(-"V1|V2")

fsetequal(mDT,mDF)
mDT

### Cast data (from long to wide)
mDT %>% 
  wider_dt(V4,name = "name",value = "value")
# below is a special case and could only be done in tidyfst
mDT %>% 
  wider_dt(V4,name = "name",value = "value",fun = list)

mDT %>% 
  wider_dt(V4,name = "name",value = "value",fun = sum)

### Split
split(DT, by = "V4")

Other

### Lead/Lag
lag_dt(1:10,n = 1)
lag_dt(1:10,n = 1:2)
lead_dt(1:10,n = 1)

Join/Bind data sets

Join

x <- data.table(Id  = c("A", "B", "C", "C"),
                X1  = c(1L, 3L, 5L, 7L),
                XY  = c("x2", "x4", "x6", "x8"),
                key = "Id")

y <- data.table(Id  = c("A", "B", "B", "D"),
                Y1  = c(1L, 3L, 5L, 7L),
                XY  = c("y1", "y3", "y5", "y7"),
                key = "Id")

### left join
left_join_dt(x, y, by = "Id")

### right join
right_join_dt(x, y, by = "Id")

### inner join
inner_join_dt(x, y, by = "Id")

### full join
full_join_dt(x, y, by = "Id")

### semi join
semi_join_dt(x, y, by = "Id")

### anti join
anti_join_dt(x, y, by = "Id")

Bind

x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
### Bind rows
rbind(x, y)
rbind(x, z, fill = TRUE)

### Bind rows using a list
rbindlist(list(x, y), idcol = TRUE)

### Bind columns
cbind(x, y)

Set operations

x <- data.table(c(1, 2, 2, 3, 3))
y <- data.table(c(2, 2, 3, 4, 4))
### Intersection
fintersect(x, y)
fintersect(x, y, all = TRUE)

### Difference
fsetdiff(x, y)
fsetdiff(x, y, all = TRUE)

### Union
funion(x, y)
funion(x, y, all = TRUE)

### Equality
fsetequal(x, x[order(-V1),])
all.equal(x, x) # S3 method
setequal(x, x[order(-V1),])

Summary

To break all these codes through, tidyfst has improved bit by bit. If you are using tidyfst frequently, you'll find that while it enjoys a tidy syntax, it is more like you are using data.table in another style. Compared with many other packages with similar goals, tidyfst sticks to many principles of data.table (and is more like data.table in many ways). However, the ultimate goal is still clear: providing users with state-of-the-art data manipulation tools with least pain. Therefore, keep it simple and make it fast. Enjoy tidyfst~



Try the tidyfst package in your browser

Any scripts or data that you put into this service are public.

tidyfst documentation built on Sept. 16, 2024, 9:06 a.m.