
dwtools is an R package focused on Data Warehouse-style data processing. It relies heavily on data.table and on DBI/RJDBC/RODBC, which serve as the transformation and extraction/loading engines, respectively.


Features

Below are the most basic examples of dwtools functions.

library(devtools)
library(magrittr)
library(dwtools)

Populate sample data in a star schema model. dw.populate generates a list of five dimension tables and one fact table,
which we will use in the examples below.

X <- dw.populate(N=1e3, scenario="star")
sapply(X,nrow) # volume of each
lapply(X,head,2) # preview of data

Data processing related

db: simple db interface

A simple database interface covering extraction and loading processes, designed for use inside data.table chaining.
It unifies database connectors for DBI, JDBC, ODBC and csv files.
It supports managing multiple db connections; below we define one sqlite db, one h2 db and one csv file.

library(RSQLite) # install.packages("RSQLite")
library(RH2) # install.packages("RH2")
sqlite1 <- list(drvName = "SQLite", conn = DBI::dbConnect(SQLite(), ":memory:"))
h21 <- list(drvName = "JDBC", conn = RJDBC::dbConnect(H2(), "jdbc:h2:mem:"))
csv1 <- list(drvName = "csv")
options("dwtools.db.conns" = list(sqlite1=sqlite1, h21=h21, csv1=csv1))

The db function accepts a vector of SQL statements / table names to allow batch processing.
All db function calls can be timed with the timing=TRUE argument, or automatically by setting options("dwtools.timing"=TRUE).
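
For example, to time every db call in a session without passing the argument each time (a minimal sketch; timings can be inspected later, see the timing section below):

options("dwtools.timing" = TRUE)  # every subsequent db call is timed
# ... db calls to be timed ...
options("dwtools.timing" = FALSE) # back to per-call control via timing=TRUE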

SALES <- X$SALES
GEOGRAPHY <- X$GEOGRAPHY
CURRENCY <- X$CURRENCY

### Basic db syntax

## read, query, send:
# db( statement | tablename, connname )

## write to db:
# db( DT, tablename, connname )

# write to db - default connection is the first defined
db(SALES,"sales_fact")

# read from db
db("sales_fact") %>% head(2)

# query from db
db("SELECT * FROM sales_fact") %>% tail(2)

# send to db
db("DROP TABLE sales_fact")

### Vectorized input - recycling on table name and connection name

# write into two databases: sqlite and h2
db(SALES,"sales_fact",c("sqlite1","h21"))

# write into two tables with different names in two connections
db(GEOGRAPHY,c("geography_dim","h2_geography"),c("sqlite1","h21"))

# read data from multiple connections, return list
l <- db("sales_fact",c("sqlite1","h21"))
sapply(l, nrow) # count of each

# read data from multiple tables, return list
l <- db(c("SELECT * FROM geography_dim","SELECT geog_code, geog_region_name FROM geography_dim"))
sapply(l, ncol) # number of columns

### Mirror-copy a table from h2 into two connections, with different table names

# chaining: save a copy to h2
db("geography_dim")[, db(.SD, "h2_geography_alt", "h21")] %>% invisible
# piping: save copies to sqlite1 and csv1
db("h2_geography_alt","h21") %>% db(., c("geography_dim_alt","geography"), c("sqlite1","csv1"))

# or use dbCopy wrapper
dbCopy("h2_geography", "h21", "geography_alt_alt", "sqlite1")

### Join across databases

# join sqlite fact table to h2 geography dimension
db("sales_fact", "sqlite1", key="geog_code")[
  db("h2_geography", "h21", key="geog_code")] %>% head(2)

# join h2 fact table to csv geography dimension
db("sales_fact", "h21", key="geog_code")[
  db("geography", "csv1", key="geog_code")] %>% head(2)

### Database-side joins are of course fully supported

db("SELECT * FROM sales_fact sales JOIN geography_dim geog ON sales.geog_code = geog.geog_code") %>% head(2)

### ETL: fetch from various databases, join, aggregate and save to db

# Extract data from databases
geog <- db("h2_geography", "h21", key="geog_code")
sales <- db("sales_fact", "sqlite1", key="geog_code")

# Make transformations
sales_by_division_vs_region <- 
  geog[sales
       ][, region_value := sum(value,na.rm=TRUE), .(geog_region_name)
         ][, .(value = sum(value,na.rm=TRUE),
               value_to_region = sum(value,na.rm=TRUE) / region_value[1L]),
           .(geog_region_name,geog_division_name)]

# Load data to sqlite
db(sales_by_division_vs_region,"sales_by_division_vs_region")

# Verify data
db("SELECT * FROM sales_by_division_vs_region") %>% head(4)

### The same ETL process but in one data.table chain

db("h2_geography", "h21", key="geog_code"
   )[db("sales_fact", "sqlite1", key="geog_code")
     ][, region_value := sum(value,na.rm=TRUE), .(geog_region_name)
       ][, .(value = sum(value,na.rm=TRUE),
             value_to_region = sum(value,na.rm=TRUE) / region_value[1L]),
         .(geog_region_name,geog_division_name)
         ][, db(.SD,"sales_by_division_vs_region_2")] %>% invisible

data.equal.data.table: data equality check

Databases store data with no guaranteed row order, so to check equality of data in tables we must ignore row order.
SQLite does not support analytic functions, so for tidiness I will use views.

# aggregate with count to handle duplicate rows
sql <- "CREATE VIEW v_division_by_region AS 
        SELECT geog_region_name, geog_division_name, value, value_to_region, COUNT(*) cnt
        FROM sales_by_division_vs_region 
        GROUP BY geog_region_name, geog_division_name, value, value_to_region"
sql <- c(sql, "CREATE VIEW v_division_by_region_2 AS
               SELECT geog_region_name, geog_division_name, value, value_to_region, COUNT(*) cnt
               FROM sales_by_division_vs_region_2 
               GROUP BY geog_region_name, geog_division_name, value, value_to_region")

# except statements
sql <- c(sql, "CREATE VIEW v_1_minus_2 AS
               SELECT * FROM v_division_by_region
               EXCEPT
               SELECT * FROM v_division_by_region_2")
sql <- c(sql, "CREATE VIEW v_2_minus_1 AS
               SELECT * FROM v_division_by_region_2
               EXCEPT 
               SELECT * FROM v_division_by_region")

# create all 4 views, turn on timing
db(sql, timing=TRUE)

# for an exact data match the query below should return 0 rows
db("SELECT * FROM v_1_minus_2 UNION ALL SELECT * FROM v_2_minus_1", timing=TRUE)

# drop all existing views
db("SELECT name FROM sqlite_master WHERE type = 'view'")[,paste("DROP VIEW",name)] %>% db

# alternatively, run the whole process on the R side, wrapped with timing
timing(
  data.equal.data.table(
    db("sales_by_division_vs_region"),
    db("sales_by_division_vs_region_2")
  )
)

Maintenance helpers

timing

Precisely measure timing, plus metadata such as in/out row counts and tags; optionally log to a db.

timing({
  Sys.sleep(1.123)
  "my result"
})
get.timing(40)[,.SD,.SDcols=-c("dwtools_session","in_n","user_self","sys_self","tag")] # recent timings

For an extended logging solution, see the logR package.


pkgsVersion: manage package versions across libraries

A handy wrapper to compare package versions across libraries.
It makes managing multiple environments easy.

lib.dev <- "lib_dev"
dir.create(lib.dev,FALSE)
lib.prod <- "lib_prod"
dir.create(lib.prod,FALSE)

# install to production
install.packages("R6", lib = lib.prod, repos="http://cran.stat.ucla.edu")
# install to dev
with_lib(paste(getwd(), lib.dev, sep="/"), install_github("wch/R6"))

# load from different environments
library("R6", lib.loc = lib.prod) # for dev use: lib.loc = lib.dev

# manage versions
pkgs <- c("DBI","data.table","devtools","dwtools","logR","R6")
pkgsVersion(pkgs, libs = list(user = .libPaths()[1L],
                              dev = lib.dev,
                              prod = lib.prod))
#           pkg    user        dev  prod
# 1:        DBI   0.3.1         NA    NA
# 2: data.table   1.9.5         NA    NA
# 3:   devtools   1.7.0         NA    NA
# 4:    dwtools 0.8.3.6         NA    NA
# 5:       logR   1.9.9         NA    NA
# 6:         R6   2.0.1 2.0.0.9000 2.0.1

Data modelling related

CUBE: cube class object and MDX queries

Forms a CUBE object from fact and dimension tables, and queries the cube using MDX-like queries.

cube <- CUBE$new(
  fact = list(sales = X$SALES),
  dim = list(customer = X$CUSTOMER,
             product = X$PRODUCT,
             geography = X$GEOGRAPHY,
             time = X$TIME,
             currency = X$CURRENCY),
  ref = list("sales-time" = "time_code",
             "sales-currency" = "curr_code",
             "sales-geography" = "geog_code",
             "sales-customer" = "cust_code",
             "sales-product" = "prod_code")
)
print(cube)
cube$MDX(
  rows = list(time = c("time_year_code","time_month_code","time_month_name")),
  cols = list(sales = "value", geography = "geog_region_name"),
  from = "sales",
  where = list(currency = quote(curr_type == "fiat"))
) %>% head(6L)

joinbyv: batch join

Batch join multiple tables into one master table: denormalization of a star schema or a snowflake schema into a flat fact table.

DT <- joinbyv(
  master = X$SALES,
  join = list(customer = X$CUSTOMER,
              product = X$PRODUCT,
              geography = X$GEOGRAPHY,
              time = X$TIME,
              currency = X$CURRENCY),
  col.subset = list(c("cust_active"),
                    c("prod_group_name","prod_family_name"),
                    c("geog_region_name"),
                    c("time_month_name"),
                    NULL)
)
print(names(DT)) # all columns in result

build_hierarchy: detect basic hierarchy

Takes a single dataset as input and detects hierarchies from the cardinality of unique groupings of all possible variable pairs. Returns a star schema: one fact table and multiple dimension tables. It handles only basic cases.

DT <- joinbyv(X$SALES, join=list(X$CURRENCY, X$GEOGRAPHY))
names(DT)
dw <- build_hierarchy(DT, factname="fact_sales")
sapply(dw$tables,ncol) # number of columns in dimension tables or fact table, including keys
sapply(dw$tables,nrow) # number of rows in dimension tables or fact table, including keys
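
To list the detected tables by name (assuming dw$tables is a named list, as the sapply calls above suggest):

names(dw$tables)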

eav: Entity-Attribute-Value manipulation

Calculations on EAV-modeled data; an intelligent wrapper around dcast %>% eval(j) %>% melt.

DT <- X$SALES[,.(prod_code,time_code,amount,value)
              ][,melt(.SD, id=1:2,variable.name='measure',value.name='value')
                ] # prepare EAV
setkey(DT,prod_code,time_code,measure)
DT %>% head(4)
eav(DT, quote(avg_price:=value/amount)) %>% head(6)
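
Conceptually, the call above performs roughly the following steps (an illustrative sketch, not the package internals):

# cast EAV to wide format, one column per measure
W <- dcast(DT, prod_code + time_code ~ measure, value.var = "value")
# evaluate the j expression on the wide data
W[, avg_price := value / amount]
# melt back to EAV format
melt(W, id = 1:2, variable.name = "measure", value.name = "value") %>% head(6)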

Reporting related

shinyBI

An early version of a BI app for hierarchical data. You can preview a live demo of the app (100k rows, slow public hosting).

shiny::runApp(system.file("shinyBI", package="dwtools"))

idxv: user-defined pre-calculated indices

data.table binary search on multiple keys, also known as Nth setkey.
Creates custom indices for a data.table object; this may require a lot of memory.
It makes sense for heavy data.table querying with filtering on various columns.
Possibly to be removed once data.table#1067 is resolved.

DT <- X$SALES
# create some particular indices
Idx <- list(
  c("cust_code", "prod_code", "geog_code"),
  c("cust_code", "geog_code", "curr_code"),
  c(2:3)
)
IDX <- idxv(DT, Idx)

# binary search on first index # DT[cust_code=="id014" & prod_code==2L & geog_code=="VA"]
DT[CJI(IDX,"id014",2L,"VA")] %>% head(2)
# binary search on second index # DT[cust_code=="id012" & geog_code=="WV" & curr_code=="ARS"]
DT[CJI(IDX,"id012",TRUE,"WV",TRUE,"ARS")] %>% head(2)
# binary search on third index # DT[prod_code==5L & geog_code=="NV"]
DT[CJI(IDX,TRUE,5L,"NV")] %>% head(2)
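
Note: judging from the examples above, passing TRUE in a CJI argument position matches all values of the corresponding index column.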

Others

A brief comment on the other functions in the package.

# disconnect the example database connections and remove the csv file created by the csv1 connection
dbDisconnect(conn = sqlite1$conn)
dbDisconnect(conn = h21$conn)
file.remove("geography.csv")

