inst/doc/Introduction.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(tidyft)

# make copies
copy(iris) -> a
copy(mtcars) -> b

# before
class(a)
class(b)

# convert codes
lapply(ls(),get) %>%
  lapply(setDT) %>%
  invisible()

# after
class(a)
class(b)

## -----------------------------------------------------------------------------
rm(list = ls())

library(tidyft)
# make a large data.frame
iris[rep(1:nrow(iris),1e4),] -> dt
# size: 1500000 rows, 5 columns
dim(dt)
# save as fst table
as_fst(dt) -> ft
# remove the data.frame from RAM
rm(dt)

# inspect the fst table of large iris
ft
summary_fst(ft)

# list the variables in the environment
ls() # only the ft exists

## -----------------------------------------------------------------------------
ft %>%
  slice_fst(5555:6666)  # get 5555 to 6666 row

## -----------------------------------------------------------------------------

sys_time_print({
  res =  ft %>%
   select_fst(Species,Sepal.Length,Sepal.Width) %>%
   rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
   arrange(group,sl) %>%
   filter(sl > 5) %>%
   distinct(sl,.keep_all = TRUE) %>%
   summarise(sw = max(sw),by = group)
})

res


## -----------------------------------------------------------------------------

rm(list = ls())

library(data.table)
library(dplyr)
library(dtplyr)
library(tidyft)


# make a large data.frame
iris[rep(1:nrow(iris),1e4),] -> dt
# size: 1500000 rows, 5 columns
dim(dt)
# save as fst table
as_fst(dt) -> ft
# remove the data.frame from RAM
rm(dt)


bench::mark(

  dplyr = ft %>%
    select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
    dplyr::select(-Petal.Length) %>%
    dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
    dplyr::arrange(group,sl) %>%
    dplyr::filter(sl > 5) %>%
    dplyr::distinct(sl,.keep_all = TRUE) %>%
    dplyr::group_by(group) %>%
    dplyr::summarise(sw = max(sw)),

  dtplyr = ft %>%
    select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
    lazy_dt() %>%
    dplyr::select(-Petal.Length) %>%
    dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
    dplyr::arrange(group,sl) %>%
    dplyr::filter(sl > 5) %>%
    dplyr::distinct(sl,.keep_all = TRUE) %>%
    dplyr::group_by(group) %>%
    dplyr::summarise(sw = max(sw)) %>%
    as.data.table(),

  data.table = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>%
    setDT() %>%
    .[,.SD,.SDcols = -"Petal.Length"] %>%
    setnames(old =c("Species","Sepal.Length","Sepal.Width"),
             new = c("group","sl","sw")) %>%
    setorder(group,sl) %>%
    .[sl>5] %>% unique(by = "sl") %>%
    .[,.(sw = max(sw)),by = group],


  tidyft =  ft %>%
    tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
    tidyft::select(-Petal.Length) %>%
    tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
    tidyft::arrange(group,sl) %>%
    tidyft::filter(sl > 5) %>%
    tidyft::distinct(sl,.keep_all = TRUE) %>%
    tidyft::summarise(sw = max(sw),by = group),

  check = setequal
)


## -----------------------------------------------------------------------------
sessionInfo()

Try the tidyft package in your browser

Any scripts or data that you put into this service are public.

tidyft documentation built on Sept. 30, 2024, 9:40 a.m.