Simple tasks related to R Tip: Break up Function Nesting for Legibility. Most remote data systems start and end with data remote, so we are materializing tables when showing database timings.

library("dplyr")
library("microbenchmark")
library("ggplot2")
library("data.table")
library("rquery")


db <- DBI::dbConnect(RPostgreSQL::PostgreSQL(),
                     host = 'localhost',
                     port = 5432,
                     user = 'johnmount',
                     password = '')

# PostgreSQL dbExistsTable() does not work
dbopts <- dbi_connection_preferences(db)
print(dbopts)
options(dbopts)


nrows <- 100000

Simple problem (subset rows and columns).

mtcarsb <- mtcars[rep(seq_len(nrow(mtcars)), nrows), ,]
print(dim(mtcarsb))

mtcarsd <- as.data.table(mtcarsb)

mtcarsdb <- dbi_copy_to(db, "mtcarsdb", mtcarsb,
                        overwrite = TRUE,
                        temporary = TRUE)
# DBI::dbGetQuery(db, "CREATE INDEX mtcarsdb_cyl ON mtcarsdb (cyl)")
rquery_sql <- mtcarsdb %.>% 
  select_rows_nse(., cyl == 8) %.>% 
  select_columns(., qc(mpg, cyl, wt)) %.>%
  to_sql(., db)

cat(rquery_sql)

DBI::dbGetQuery(db, paste("EXPLAIN", rquery_sql))

mtcarst <- dplyr::tbl(db, "mtcarsdb")

dplyr_sql <- mtcarst %>%
  filter(cyl == 8) %>%
  select(mpg, cyl, wt) %>%
  dbplyr::remote_query(.)

cat(dplyr_sql)

DBI::dbGetQuery(db, paste("EXPLAIN", dplyr_sql))


timings <- microbenchmark(
  base_stepped = {
    . <- mtcarsb
    . <- subset(., cyl == 8)
    . <- .[, c("mpg", "cyl", "wt")]
    nrow(.)
  },
  base_nested = {
    nrow(mtcarsb[mtcarsb$cyl == 8, c("mpg", "cyl", "wt")])
  },
  dplyr = {
    mtcarsb                %>%
      filter(cyl == 8)     %>%
      select(mpg, cyl, wt) %>%
      nrow
  },
  dplyr_database = {
    res <- mtcarst         %>%
      filter(cyl == 8)     %>%
      select(mpg, cyl, wt) %>%
      compute()
    as.numeric(as.data.frame(tally(res))[[1]][[1]])
  },
  data.table_nested = {
    nrow(mtcarsd[cyl==8, c("mpg", "cyl", "wt")])
  },
  data.table_stepped = {
   mtcarsd[cyl==8,        ][,
    c("mpg", "cyl", "wt") ][, 
    .N                    ]
  },
  rquery_database = {
    res <- mtcarsdb                       %.>% 
      select_rows_nse(., cyl == 8)        %.>% 
      select_columns(., qc(mpg, cyl, wt)) %.>%
      materialize(db, ., 
                  table_name = "restab",
                  overwrite = TRUE,
                  temporary = TRUE)
     dbi_nrow(db, res$table_name)
  }
)

print(timings)

autoplot(timings)

Want to add the query to query folding feature that dbplyr's optimizer has.

DBI::dbDisconnect(db)


WinVector/rquery documentation built on Aug. 24, 2023, 11:12 a.m.