Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
#eval = FALSE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(tidyfst)
set.seed(1L)
## Create a data table
DF <- data.table(V1 = rep(c(1L, 2L), 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3))
copy(DF) -> DT
class(DF)
DF
## -----------------------------------------------------------------------------
### Filter rows using indices
slice_dt(DF, 3:4)
### Discard rows using negative indices
slice_dt(DF, -(3:7))
### Filter rows using a logical expression
filter_dt(DF, V2 > 5)
filter_dt(DF, V4 %in% c("A", "C"))
filter_dt(DF, V4 %chin% c("A", "C")) # fast %in% for character
### Filter rows using multiple conditions
filter_dt(DF, V1 == 1, V4 == "A")
# equals to
filter_dt(DF, V1 == 1 & V4 == "A")
### Filter unique rows
distinct_dt(DF) # unique(DF)
distinct_dt(DF, V1,V4)
### Discard rows with missing values
drop_na_dt(DF) # na.omit(DF)
### Other filters
sample_n_dt(DF, 3) # n random rows
sample_frac_dt(DF, 0.5) # fraction of random rows
slice_max_dt(DF, V1,1) # top n entries (includes equals)
filter_dt(DT,V4 %like% "^B")
filter_dt(DT,V2 %between% c(3, 5))
filter_dt(DT,between(V2, 3, 5, incbounds = FALSE))
filter_dt(DT,V2 %inrange% list(-1:1, 1:3)) # see also ?inrange
## -----------------------------------------------------------------------------
### Sort rows by column
arrange_dt(DF, V3)
### Sort rows in decreasing order
arrange_dt(DF, -V3)
### Sort rows based on several columns
arrange_dt(DF, V1, -V2)
## -----------------------------------------------------------------------------
### Select one column using an index (not recommended)
pull_dt(DT,3) # returns a vector
select_dt(DT,3) # returns a data.table
### Select one column using column name
select_dt(DF, V2) # returns a data.table
pull_dt(DF, V2) # returns a vector
### Select several columns
select_dt(DF, V2, V3, V4)
select_dt(DF, V2:V4) # select columns between V2 and V4
### Exclude columns
select_dt(DF, -V2, -V3)
### Select/Exclude columns using a character vector
cols <- c("V2", "V3")
select_dt(DF,cols = cols)
select_dt(DF,cols = cols,negate = TRUE)
### Other selections
select_dt(DF, cols = paste0("V", 1:2))
relocate_dt(DF, V4) # reorder columns
select_dt(DF, "V")
select_dt(DF, "3$")
select_dt(DF, ".2")
select_dt(DF, "V1")
select_dt(DF, -"^V2")
# remove variables using "-" prior to function
## -----------------------------------------------------------------------------
### Summarise one column
summarise_dt(DF, sum(V1)) # returns a data.table
summarise_dt(DF, sumV1 = sum(V1)) # returns a data.table
### Summarise several columns
summarise_dt(DF, sum(V1), sd(V3))
### Summarise several columns and assign column names
DF %>%
summarise_dt(sumv1 = sum(V1),
sdv3 = sd(V3))
### Summarise a subset of rows
DT[1:4, sum(V1)]
DF %>%
slice_dt(1:4) %>%
summarise_dt(sum(V1))
### Misc
summarise_dt(DF, nth(V3,1))
summarise_dt(DF, nth(V3,-1))
summarise_dt(DF, nth(V3, 5))
summarise_dt(DF, uniqueN(V4))
uniqueN(DF)
## -----------------------------------------------------------------------------
### By group
# not recommended
DF %>%
group_dt(
by = V4,
summarise_dt(sumV2 = sum(V2))
)
# recommended
DF %>%
summarise_dt(sumV2 = sum(V2),by = V4)
### By several groups
DF %>%
summarise_dt(sumV2 = sum(V2),by = .(V1,V4))
### Calling function in by
DF %>%
summarise_dt(sumV2 = sum(V2),by = tolower(V4))
### Assigning column name in by
DF %>%
summarise_dt(sumV2 = sum(V2),by = .(abc = tolower(V4)))
### Using a condition in by
DF %>%
summarise_dt(sumV2 = sum(V2),by = V4 == "A")
### By on a subset of rows
DF %>%
slice_dt(1:5) %>%
summarise_dt(sumV1 = sum(V1),by = V4)
### Count number of observations for each group
count_dt(DF, V4)
### Add a column with number of observations for each group
add_count_dt(DF, V1)
### Retrieve the first/last/nth observation for each group
DF %>% summarise_dt(by = V4,nth(V2,1))
DF %>% summarise_dt(by = V4,nth(V2,-1))
DF %>% summarise_dt(by = V4,nth(V2,2))
## -----------------------------------------------------------------------------
### Summarise all the columns
summarise_vars(DT,.func = max)
### Summarise several columns
summarise_vars(DT,c("V1", "V2"),mean)
### Summarise several columns by group
DT %>%
summarise_vars(c("V1", "V2"),mean,by = V4)
## using patterns (regex)
DT %>%
summarise_vars("V1|V2",mean,by = V4)
## Summarise with more than one function by group
# when you can't find a way, you can always use `in_dt` to use data.table
DT %>%
in_dt(, by = V4,
c(lapply(.SD, sum),
lapply(.SD, mean)))
### Summarise using a condition
summarise_vars(DF, is.numeric, mean)
### Modify all the columns
mutate_vars(DF, .func = rev)
### Modify several columns (dropping the others)
DF %>%
select_dt(cols = c("V1", "V2")) %>%
mutate_vars(.func = sqrt)
DF %>%
select_dt(-V4) %>%
mutate_vars(.func = exp)
### Modify several columns (keeping the others)
DF %>%
mutate_vars(c("V1", "V2"), sqrt)
DF %>%
mutate_vars(-"V4", exp)
### Modify columns using a condition (dropping the others)
select_dt(DT,is.numeric)
### Modify columns using a condition (keeping the others)
mutate_vars(DT,is.numeric,as.integer)
### Use a complex expression
DF %>%
group_dt(
by = V4,
slice_dt(1:2) %>%
transmute_dt(V1 = V1,
V2 = "X")
)
### Use multiple expressions (with DT[,{j}])
DT %>%
in_dt(,{
print(V1) # comments here!
print(summary(V1))
x <- V1 + sum(V2)
.(A = 1:.N, B = x) # last list returned as a data.table
}
)
## -----------------------------------------------------------------------------
### Select first/last/… row by group
DT %>%
group_dt(
by = V4,
head(1)
)
DT %>%
group_dt(
by = V4,
tail(2)
)
DT %>%
group_dt(
by = V4,
slice_dt(1,.N)
)
### Select rows using a nested query
DF %>%
group_dt(
by = V4,
arrange_dt(V2) %>%
slice_dt(1)
)
### Add a group counter column
DT %>%
mutate_dt(Grp = .GRP,by = .(V4, V1))
### Get row number of first (and last) observation by group
DT %>% summarise_dt(I = .I,by = V4)
DT %>% summarise_dt(I = .I[1],by = V4)
DT %>% summarise_dt(I = .I[c(1,.N)],by = V4)
### Handle list-columns by group
DT %>%
select_dt(V1,V4) %>%
chop_dt(V1) # return V1 as a list
DT %>% nest_dt(V4) # subsets of the data
### Grouping sets (multiple by at once)
# use data.table directly, tidyfst does not provide new methods for it yet
data.table::rollup(DT,
.(SumV2 = sum(V2)),
by = c("V1", "V4"))
data.table::rollup(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
id = TRUE)
data.table::cube(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
id = TRUE)
data.table::groupingsets(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
sets = list("V1", c("V1", "V4")),
id = TRUE)
## ----eval=FALSE---------------------------------------------------------------
# ### Write data to a csv file
# fwrite(DT, "DT.csv")
#
# ### Write data to a tab-delimited file
# fwrite(DT, "DT.txt", sep = "\t")
#
# ### Write list-column data to a csv file
# fwrite(setDT(list(0, list(1:5))), "DT2.csv")
# #
# ### Read a csv / tab-delimited file
# fread("DT.csv")
# # fread("DT.csv", verbose = TRUE) # full details
# fread("DT.txt", sep = "\t")
#
# ### Read a csv file selecting / droping columns
# fread("DT.csv", select = c("V1", "V4"))
# fread("DT.csv", drop = "V4")
# # NA
# ### Read and rbind several files
# rbindlist(lapply(c("DT.csv", "DT.csv"), fread))
# # c("DT.csv", "DT.csv") %>% lapply(fread) %>% rbindlist
#
## -----------------------------------------------------------------------------
### Melt data (from wide to long)
fsetequal(DT,DF)
mDT = DT %>% longer_dt(V3,V4)
mDF = DF %>% longer_dt(-"V1|V2")
fsetequal(mDT,mDF)
mDT
### Cast data (from long to wide)
mDT %>%
wider_dt(V4,name = "name",value = "value")
# below is a special case and could only be done in tidyfst
mDT %>%
wider_dt(V4,name = "name",value = "value",fun = list)
mDT %>%
wider_dt(V4,name = "name",value = "value",fun = sum)
### Split
split(DT, by = "V4")
## -----------------------------------------------------------------------------
### Lead/Lag
lag_dt(1:10,n = 1)
lag_dt(1:10,n = 1:2)
lead_dt(1:10,n = 1)
## -----------------------------------------------------------------------------
x <- data.table(Id = c("A", "B", "C", "C"),
X1 = c(1L, 3L, 5L, 7L),
XY = c("x2", "x4", "x6", "x8"),
key = "Id")
y <- data.table(Id = c("A", "B", "B", "D"),
Y1 = c(1L, 3L, 5L, 7L),
XY = c("y1", "y3", "y5", "y7"),
key = "Id")
### left join
left_join_dt(x, y, by = "Id")
### right join
right_join_dt(x, y, by = "Id")
### inner join
inner_join_dt(x, y, by = "Id")
### full join
full_join_dt(x, y, by = "Id")
### semi join
semi_join_dt(x, y, by = "Id")
### anti join
anti_join_dt(x, y, by = "Id")
## -----------------------------------------------------------------------------
x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
### Bind rows
rbind(x, y)
rbind(x, z, fill = TRUE)
### Bind rows using a list
rbindlist(list(x, y), idcol = TRUE)
### Bind columns
cbind(x, y)
## -----------------------------------------------------------------------------
x <- data.table(c(1, 2, 2, 3, 3))
y <- data.table(c(2, 2, 3, 4, 4))
### Intersection
fintersect(x, y)
fintersect(x, y, all = TRUE)
### Difference
fsetdiff(x, y)
fsetdiff(x, y, all = TRUE)
### Union
funion(x, y)
funion(x, y, all = TRUE)
### Equality
fsetequal(x, x[order(-V1),])
all.equal(x, x) # S3 method
setequal(x, x[order(-V1),])
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.