english_tutorial.R
In tidyfst: Tidy Verbs for Fast Data Manipulation

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  #eval = FALSE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(tidyfst)

set.seed(1L)

## Create a data table
DF <- data.table(V1 = rep(c(1L, 2L), 5)[-10],
                V2 = 1:9,
                V3 = c(0.5, 1.0, 1.5),
                V4 = rep(LETTERS[1:3], 3))
copy(DF) -> DT

class(DF)
DF

## -----------------------------------------------------------------------------
### Filter rows using indices
slice_dt(DF, 3:4)

### Discard rows using negative indices
slice_dt(DF, -(3:7))

### Filter rows using a logical expression
filter_dt(DF, V2 > 5)
filter_dt(DF, V4 %in% c("A", "C"))
filter_dt(DF, V4 %chin% c("A", "C")) # fast %in% for character

### Filter rows using multiple conditions
filter_dt(DF, V1 == 1, V4 == "A")
# equals to
filter_dt(DF, V1 == 1 & V4 == "A")

### Filter unique rows
distinct_dt(DF) # unique(DF)
distinct_dt(DF, V1,V4)

### Discard rows with missing values
drop_na_dt(DF) # na.omit(DF)

### Other filters
sample_n_dt(DF, 3)      # n random rows
sample_frac_dt(DF, 0.5) # fraction of random rows
slice_max_dt(DF, V1,1)     # top n entries (includes equals)

filter_dt(DT,V4 %like% "^B")
filter_dt(DT,V2 %between% c(3, 5))
filter_dt(DT,between(V2, 3, 5, incbounds = FALSE))
filter_dt(DT,V2 %inrange% list(-1:1, 1:3))  # see also ?inrange

## -----------------------------------------------------------------------------
### Sort rows by column
arrange_dt(DF, V3)

### Sort rows in decreasing order
arrange_dt(DF, -V3)

### Sort rows based on several columns
arrange_dt(DF, V1, -V2)

## -----------------------------------------------------------------------------
### Select one column using an index (not recommended)
pull_dt(DT,3) # returns a vector
select_dt(DT,3) # returns a data.table

### Select one column using column name
select_dt(DF, V2) # returns a data.table
pull_dt(DF, V2)   # returns a vector

### Select several columns
select_dt(DF, V2, V3, V4)
select_dt(DF, V2:V4) # select columns between V2 and V4

### Exclude columns
select_dt(DF, -V2, -V3)

### Select/Exclude columns using a character vector
cols <- c("V2", "V3")
select_dt(DF,cols = cols)
select_dt(DF,cols = cols,negate = TRUE)

### Other selections
select_dt(DF, cols = paste0("V", 1:2))
relocate_dt(DF, V4) # reorder columns
select_dt(DF, "V")
select_dt(DF, "3$")
select_dt(DF, ".2")
select_dt(DF, "V1")
select_dt(DF, -"^V2")
# remove variables using "-" prior to function

## -----------------------------------------------------------------------------
### Summarise one column
summarise_dt(DF, sum(V1)) # returns a data.table
summarise_dt(DF, sumV1 = sum(V1)) # returns a data.table

### Summarise several columns
summarise_dt(DF, sum(V1), sd(V3))

### Summarise several columns and assign column names
DF %>%
  summarise_dt(sumv1 = sum(V1),
              sdv3  = sd(V3))

### Summarise a subset of rows
DT[1:4, sum(V1)]
DF %>%
  slice_dt(1:4) %>%
  summarise_dt(sum(V1))

### Misc
summarise_dt(DF, nth(V3,1))
summarise_dt(DF, nth(V3,-1))
summarise_dt(DF, nth(V3, 5))
summarise_dt(DF, uniqueN(V4))
uniqueN(DF)

## -----------------------------------------------------------------------------
### By group
# not recommended
DF %>%
  group_dt(
    by = V4,
    summarise_dt(sumV2 = sum(V2))
  )
# recommended
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = V4)


### By several groups
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = .(V1,V4))

### Calling function in by
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = tolower(V4))


### Assigning column name in by
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = .(abc = tolower(V4)))


### Using a condition in by
DF %>% 
  summarise_dt(sumV2 = sum(V2),by = V4 == "A")



### By on a subset of rows
DF %>% 
  slice_dt(1:5) %>% 
  summarise_dt(sumV1 = sum(V1),by = V4)

### Count number of observations for each group
count_dt(DF, V4)

### Add a column with number of observations for each group
add_count_dt(DF, V1)

### Retrieve the first/last/nth observation for each group
DF %>% summarise_dt(by = V4,nth(V2,1))
DF %>% summarise_dt(by = V4,nth(V2,-1))
DF %>% summarise_dt(by = V4,nth(V2,2))


## -----------------------------------------------------------------------------
### Summarise all the columns
summarise_vars(DT,.func = max)

### Summarise several columns
summarise_vars(DT,c("V1", "V2"),mean)

### Summarise several columns by group
DT %>% 
    summarise_vars(c("V1", "V2"),mean,by = V4)

## using patterns (regex)
DT %>% 
    summarise_vars("V1|V2",mean,by = V4)

## Summarise with more than one function by group
# when you can't find a way, you can always use `in_dt` to use data.table
DT %>% 
  in_dt(, by = V4, 
     c(lapply(.SD, sum),
       lapply(.SD, mean)))

### Summarise using a condition
summarise_vars(DF, is.numeric, mean)

### Modify all the columns
mutate_vars(DF, .func = rev)

### Modify several columns (dropping the others)
DF %>% 
  select_dt(cols = c("V1", "V2")) %>% 
  mutate_vars(.func = sqrt)
DF %>% 
  select_dt(-V4) %>% 
  mutate_vars(.func = exp)

### Modify several columns (keeping the others)
DF %>% 
  mutate_vars(c("V1", "V2"), sqrt)
DF %>% 
  mutate_vars(-"V4",  exp)

### Modify columns using a condition (dropping the others)
select_dt(DT,is.numeric)

### Modify columns using a condition (keeping the others)
mutate_vars(DT,is.numeric,as.integer)

### Use a complex expression
DF %>% 
  group_dt(
    by = V4,
    slice_dt(1:2) %>% 
      transmute_dt(V1 = V1,
            V2 = "X")
  )

### Use multiple expressions (with DT[,{j}])
DT %>% 
  in_dt(,{
      print(V1) #  comments here!
      print(summary(V1))
      x <- V1 + sum(V2)
     .(A = 1:.N, B = x) # last list returned as a data.table
     }
  )


## -----------------------------------------------------------------------------
### Select first/last/… row by group
DT %>% 
  group_dt(
    by = V4,
    head(1)
  )

DT %>% 
  group_dt(
    by = V4,
    tail(2)
  )

DT %>% 
  group_dt(
    by = V4,
    slice_dt(1,.N)
  )

### Select rows using a nested query
DF %>% 
  group_dt(
    by = V4,
    arrange_dt(V2) %>% 
      slice_dt(1)
  )

### Add a group counter column
DT %>% 
  mutate_dt(Grp = .GRP,by = .(V4, V1))

### Get row number of first (and last) observation by group
DT %>% summarise_dt(I = .I,by = V4)
DT %>% summarise_dt(I = .I[1],by = V4)
DT %>% summarise_dt(I = .I[c(1,.N)],by = V4)

### Handle list-columns by group

DT %>% 
  select_dt(V1,V4) %>% 
  chop_dt(V1) # return V1 as a list
DT %>% nest_dt(V4) # subsets of the data

### Grouping sets (multiple by at once)
# use data.table directly, tidyfst does not provide new methods for it yet
data.table::rollup(DT,
       .(SumV2 = sum(V2)),
       by = c("V1", "V4"))

data.table::rollup(DT,
       .(SumV2 = sum(V2), .N),
       by = c("V1", "V4"),
       id = TRUE)

data.table::cube(DT,
     .(SumV2 = sum(V2), .N),
     by = c("V1", "V4"),
     id = TRUE)

data.table::groupingsets(DT,
             .(SumV2 = sum(V2), .N),
             by   = c("V1", "V4"),
             sets = list("V1", c("V1", "V4")),
             id   = TRUE)


## ----eval=FALSE---------------------------------------------------------------
#  ### Write data to a csv file
#  fwrite(DT, "DT.csv")
#  
#  ### Write data to a tab-delimited file
#  fwrite(DT, "DT.txt", sep = "\t")
#  
#  ### Write list-column data to a csv file
#  fwrite(setDT(list(0, list(1:5))), "DT2.csv")
#  #
#  ### Read a csv / tab-delimited file
#  fread("DT.csv")
#  # fread("DT.csv", verbose = TRUE) # full details
#  fread("DT.txt", sep = "\t")
#  
#  ### Read a csv file selecting / droping columns
#  fread("DT.csv", select = c("V1", "V4"))
#  fread("DT.csv", drop = "V4")
#  # NA
#  ### Read and rbind several files
#  rbindlist(lapply(c("DT.csv", "DT.csv"), fread))
#  # c("DT.csv", "DT.csv") %>% lapply(fread) %>% rbindlist
#  

## -----------------------------------------------------------------------------
### Melt data (from wide to long)
fsetequal(DT,DF)

mDT = DT %>% longer_dt(V3,V4)
mDF = DF %>% longer_dt(-"V1|V2")

fsetequal(mDT,mDF)
mDT

### Cast data (from long to wide)
mDT %>% 
  wider_dt(V4,name = "name",value = "value")
# below is a special case and could only be done in tidyfst
mDT %>% 
  wider_dt(V4,name = "name",value = "value",fun = list)

mDT %>% 
  wider_dt(V4,name = "name",value = "value",fun = sum)

### Split
split(DT, by = "V4")


## -----------------------------------------------------------------------------
### Lead/Lag
lag_dt(1:10,n = 1)
lag_dt(1:10,n = 1:2)
lead_dt(1:10,n = 1)


## -----------------------------------------------------------------------------
x <- data.table(Id  = c("A", "B", "C", "C"),
                X1  = c(1L, 3L, 5L, 7L),
                XY  = c("x2", "x4", "x6", "x8"),
                key = "Id")

y <- data.table(Id  = c("A", "B", "B", "D"),
                Y1  = c(1L, 3L, 5L, 7L),
                XY  = c("y1", "y3", "y5", "y7"),
                key = "Id")

### left join
left_join_dt(x, y, by = "Id")

### right join
right_join_dt(x, y, by = "Id")

### inner join
inner_join_dt(x, y, by = "Id")

### full join
full_join_dt(x, y, by = "Id")

### semi join
semi_join_dt(x, y, by = "Id")

### anti join
anti_join_dt(x, y, by = "Id")


## -----------------------------------------------------------------------------
x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
### Bind rows
rbind(x, y)
rbind(x, z, fill = TRUE)

### Bind rows using a list
rbindlist(list(x, y), idcol = TRUE)

### Bind columns
cbind(x, y)


## -----------------------------------------------------------------------------
x <- data.table(c(1, 2, 2, 3, 3))
y <- data.table(c(2, 2, 3, 4, 4))
### Intersection
fintersect(x, y)
fintersect(x, y, all = TRUE)

### Difference
fsetdiff(x, y)
fsetdiff(x, y, all = TRUE)

### Union
funion(x, y)
funion(x, y, all = TRUE)

### Equality
fsetequal(x, x[order(-V1),])
all.equal(x, x) # S3 method
setequal(x, x[order(-V1),])