dwtools is an R package focused on Data Warehouse-like data processing. It relies heavily on data.table and DBI/RJDBC/RODBC, which serve as the transformation and extraction/loading engines.
Below are the most basic examples of dwtools functions.
library(devtools)
library(magrittr)
library(dwtools)
Populate data in a star schema model. dw.populate generates a list of 5 dimension tables and 1 fact table, which we will use in the examples that follow.
X <- dw.populate(N=1e3, scenario="star")
sapply(X, nrow) # volume of each table
lapply(X, head, 2) # preview of data
db: simple db interface
A simple database interface for extraction and loading processes, designed to be used in data.table chaining. It unifies database connectors for DBI, JDBC, ODBC, or csv files, and supports managing multiple db connections. Below we define three: one SQLite db, one H2 db, and one csv file.
library(RSQLite) # install.packages("RSQLite")
library(RH2) # install.packages("RH2")
sqlite1 <- list(drvName = "SQLite", conn = DBI::dbConnect(SQLite(), ":memory:"))
h21 <- list(drvName = "JDBC", conn = RJDBC::dbConnect(H2(), "jdbc:h2:mem:"))
csv1 <- list(drvName = "csv")
options("dwtools.db.conns" = list(sqlite1=sqlite1, h21=h21, csv1=csv1))
The db function accepts a vector of sql statements / table names to allow batch processing. All db function calls can be logged with the argument timing=TRUE, or automatically by setting options("dwtools.timing"=TRUE).
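For example, timing can be enabled per call or for the whole session. A minimal sketch using the default sqlite1 connection defined above; "SELECT 1" is just a dummy statement and this assumes db recognizes it as a query:

# log a single call via the timing argument
db("SELECT 1", timing=TRUE)
# or log all db calls automatically for the session
options("dwtools.timing" = TRUE)
db("SELECT 1")
options("dwtools.timing" = FALSE)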
SALES <- X$SALES
GEOGRAPHY <- X$GEOGRAPHY
CURRENCY <- X$CURRENCY

### Basic db syntax
## read, query, send:
# db( statement | tablename, connname )
## write to db:
# db( DT, tablename, connname )

# write to db - default connection is the first defined
db(SALES,"sales_fact")
# read from db
db("sales_fact") %>% head(2)
# query from db
db("SELECT * FROM sales_fact") %>% tail(2)
# send to db
db("DROP TABLE sales_fact")

### Vectorized input - recycling on table name and connection name
# write into two databases: sqlite and h2
db(SALES,"sales_fact",c("sqlite1","h21"))
# write into two tables with different names in two connections
db(GEOGRAPHY,c("geography_dim","h2_geography"),c("sqlite1","h21"))
# read data from multiple connections, return a list
l <- db("sales_fact",c("sqlite1","h21"))
sapply(l, nrow) # row count of each
# read data from multiple tables, return a list
l <- db(c("SELECT * FROM geography_dim","SELECT geog_code, geog_region_name FROM geography_dim"))
sapply(l, ncol) # number of columns

### Mirror copy table from h2 into two connections, various table names
# chaining, save to h2
db("geography_dim")[, db(.SD, "h2_geography_alt", "h21")] %>% invisible
# piping, save to sqlite1 and csv
db("h2_geography_alt","h21") %>% db(., c("geography_dim_alt","geography"), c("sqlite1","csv1"))
# or use the dbCopy wrapper
dbCopy("h2_geography", "h21", "geography_alt_alt", "sqlite1")

### Join across databases
# join sqlite fact table to h2 geography dimension
db("sales_fact", "sqlite1", key="geog_code")[
  db("h2_geography", "h21", key="geog_code")] %>% head(2)
# join h2 fact table to csv geography dimension
db("sales_fact", "h21", key="geog_code")[
  db("geography", "csv1", key="geog_code")] %>% head(2)

### Joins on the database side are of course fully supported
db("SELECT * FROM sales_fact sales JOIN geography_dim geog ON sales.geog_code = geog.geog_code") %>% head(2)

### ETL: fetch from various databases, join, aggregate and save to db
# Extract data from databases
geog <- db("h2_geography", "h21", key="geog_code")
sales <- db("sales_fact", "sqlite1", key="geog_code")
# Make transformations
sales_by_division_vs_region <- geog[sales
  ][, region_value := sum(value,na.rm=TRUE), .(geog_region_name)
  ][, .(value = sum(value,na.rm=TRUE),
        value_to_region = sum(value,na.rm=TRUE) / region_value[1L]),
    .(geog_region_name,geog_division_name)]
# Load data to sqlite
db(sales_by_division_vs_region,"sales_by_division_vs_region")
# Verify data
db("SELECT * FROM sales_by_division_vs_region") %>% head(4)

### The same ETL process but in one data.table chain
db("h2_geography", "h21", key="geog_code"
  )[db("sales_fact", "sqlite1", key="geog_code")
  ][, region_value := sum(value,na.rm=TRUE), .(geog_region_name)
  ][, .(value = sum(value,na.rm=TRUE),
        value_to_region = sum(value,na.rm=TRUE) / region_value[1L]),
    .(geog_region_name,geog_division_name)
  ][, db(.SD,"sales_by_division_vs_region_2")] %>% invisible
data.equal.data.table: data equality check
Databases store data with no guaranteed row order, so to check equality of data in tables we must ignore row order. SQLite does not support analytic functions, so for tidiness we use views.
# aggregate with count to handle duplicate rows
sql <- "CREATE VIEW v_division_by_region AS SELECT geog_region_name, geog_division_name, value, value_to_region, COUNT(*) cnt FROM sales_by_division_vs_region GROUP BY geog_region_name, geog_division_name, value, value_to_region"
sql <- c(sql, "CREATE VIEW v_division_by_region_2 AS SELECT geog_region_name, geog_division_name, value, value_to_region, COUNT(*) cnt FROM sales_by_division_vs_region_2 GROUP BY geog_region_name, geog_division_name, value, value_to_region")
# except statements
sql <- c(sql, "CREATE VIEW v_1_minus_2 AS SELECT * FROM v_division_by_region EXCEPT SELECT * FROM v_division_by_region_2")
sql <- c(sql, "CREATE VIEW v_2_minus_1 AS SELECT * FROM v_division_by_region_2 EXCEPT SELECT * FROM v_division_by_region")
# create all 4 views, turn on timing
db(sql, timing=TRUE)
# for an exact data match the query below should return 0 rows
db("SELECT * FROM v_1_minus_2 UNION ALL SELECT * FROM v_2_minus_1", timing=TRUE)
# drop all existing views
db("SELECT name FROM sqlite_master WHERE type = 'view'")[,paste("DROP VIEW",name)] %>% db
# alternatively run the whole process on the R side, wrapped with timing
timing(
  data.equal.data.table(
    db("sales_by_division_vs_region"),
    db("sales_by_division_vs_region_2")
  )
)
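As a minimal, self-contained illustration of the row-order-insensitive comparison (a sketch based on the description above; the toy columns are made up for this example):

library(data.table)
a <- data.table(id = 1:3, value = c(10, 20, 30))
b <- a[c(3L, 1L, 2L)] # same rows, different order
data.equal.data.table(a, b)  # expected TRUE: only row order differs
b2 <- copy(b)[1L, value := 99]
data.equal.data.table(a, b2) # expected FALSE: a value differs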
timing
Precise timing measurement plus metadata such as in/out row counts and tags, optionally logged to a db.
timing({
  Sys.sleep(1.123)
  "my result"
})
# recent timings
get.timing(40)[,.SD,.SDcols=-c("dwtools_session","in_n","user_self","sys_self","tag")]
For an extended logging solution, e.g. logging of dwtools::db calls, see the logR package.
pkgsVersion: managing multiple package versions
A handy wrapper to compare package versions across libraries; it makes it easy to manage multiple environments.
lib.dev <- "lib_dev"
dir.create(lib.dev, FALSE)
lib.prod <- "lib_prod"
dir.create(lib.prod, FALSE)
# install to production
install.packages("R6", lib = lib.prod, repos="http://cran.stat.ucla.edu")
# install to dev
with_lib(paste(getwd(), lib.dev, sep="/"), install_github("wch/R6"))
# load from different environments
library("R6", lib.loc = lib.prod) # for dev use: lib.loc = lib.dev
# manage versions
pkgs <- c("DBI","data.table","devtools","dwtools","logR","R6")
pkgsVersion(pkgs, libs = list(user = .libPaths()[1L], dev = lib.dev, prod = lib.prod))
#           pkg    user        dev  prod
# 1:        DBI   0.3.1         NA    NA
# 2: data.table   1.9.5         NA    NA
# 3:   devtools   1.7.0         NA    NA
# 4:    dwtools 0.8.3.6         NA    NA
# 5:       logR   1.9.9         NA    NA
# 6:         R6   2.0.1 2.0.0.9000 2.0.1
CUBE: cube class object and MDX queries
Forms a CUBE object from fact and dimension tables and queries the cube using MDX-like queries.
cube <- CUBE$new(
  fact = list(sales = X$SALES),
  dim = list(customer = X$CUSTOMER, product = X$PRODUCT, geography = X$GEOGRAPHY, time = X$TIME, currency = X$CURRENCY),
  ref = list("sales-time"="time_code", "sales-currency"="curr_code", "sales-geography"="geog_code", "sales-customer"="cust_code", "sales-product"="prod_code")
)
print(cube)
cube$MDX(
  rows = list(time = c("time_year_code","time_month_code","time_month_name")),
  cols = list(sales = "value", geography = "geog_region_name"),
  from = "sales",
  where = list(currency = quote(curr_type == "fiat"))
) %>% head(6L)
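As a further sketch, reusing only column names that already appear above (the exact semantics of the MDX() arguments are documented in ?CUBE; this variation is illustrative only), a query putting currency codes on rows:

cube$MDX(
  rows = list(currency = "curr_code"),
  cols = list(sales = "value", geography = "geog_region_name"),
  from = "sales",
  where = list(currency = quote(curr_type == "fiat"))
) %>% head(3L)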
joinbyv: batch join
Batch joins multiple tables into one master table, denormalizing star schema and snowflake schema models into a flat fact table.
DT <- joinbyv(
  master = X$SALES,
  join = list(customer = X$CUSTOMER, product = X$PRODUCT, geography = X$GEOGRAPHY, time = X$TIME, currency = X$CURRENCY),
  col.subset = list(c("cust_active"), c("prod_group_name","prod_family_name"), c("geog_region_name"), c("time_month_name"), NULL)
)
print(names(DT)) # all columns in result
build_hierarchy: detect basic hierarchy
Takes a single dataset on input and detects hierarchies by the cardinality of unique groupings of all possible variable pairs. Returns a star schema: one fact table and multiple dimension tables. It handles only basic cases.
DT <- joinbyv(X$SALES, join=list(X$CURRENCY, X$GEOGRAPHY))
names(DT)
dw <- build_hierarchy(DT, factname="fact_sales")
sapply(dw$tables, ncol) # number of columns in dimension tables and fact table, including keys
sapply(dw$tables, nrow) # number of rows in dimension tables and fact table, including keys
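To inspect the detected star schema, a small follow-up that uses only objects created above:

names(dw$tables)           # fact table and detected dimension tables
lapply(dw$tables, head, 2) # preview of each table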
eav: Entity-Attribute-Value manipulation
Calculation on EAV-modeled data, an intelligent wrapper for dcast %>% eval(j) %>% melt.
# prepare EAV
DT <- X$SALES[,.(prod_code,time_code,amount,value)
  ][,melt(.SD, id=1:2, variable.name='measure', value.name='value')]
setkey(DT,prod_code,time_code,measure)
DT %>% head(4)
eav(DT, quote(avg_price:=value/amount)) %>% head(6)
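Roughly the manual equivalent of the eav call above, written out as a dcast / compute / melt pipeline (a hedged sketch for intuition, not the package's internal code):

# wide format: one column per measure (amount, value)
W <- dcast(DT, prod_code + time_code ~ measure, value.var = "value")
# evaluate the j expression on the wide data
W[, avg_price := value / amount]
# back to the EAV layout
melt(W, id = c("prod_code","time_code"), variable.name = "measure", value.name = "value") %>% head(6)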
shinyBI: hierarchical data BI app
An early version of a hierarchical data BI app. You can preview a live demo of the app (100k rows, slow public hosting).
shiny::runApp(system.file("shinyBI", package="dwtools"))
idxv: user defined pre-calculated indices
data.table binary search on multiple keys, also known as Nth setkey. Creates custom indices for a data.table object; this may require a lot of memory. It makes sense for heavy data.table querying with filtering on various columns. Possibly to be removed after data.table#1067 is resolved.
DT <- X$SALES
# create some particular indices
Idx <- list(
  c("cust_code", "prod_code", "geog_code"),
  c("cust_code", "geog_code", "curr_code"),
  c(2:3)
)
IDX <- idxv(DT, Idx)
# binary search on first index
# DT[cust_code=="id014" & prod_code==2L & geog_code=="VA"]
DT[CJI(IDX,"id014",2L,"VA")] %>% head(2)
# binary search on second index
# DT[cust_code=="id012" & geog_code=="WV" & curr_code=="ARS"]
DT[CJI(IDX,"id012",TRUE,"WV",TRUE,"ARS")] %>% head(2)
# binary search on third index
# DT[prod_code==5L & geog_code=="NV"]
DT[CJI(IDX,TRUE,5L,"NV")] %>% head(2)
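As a quick sanity check, the pre-computed index can be compared against a plain vector scan using the timing helper introduced earlier (with N=1e3 the difference will be negligible; the benefit shows on large tables):

# vector scan
timing(DT[cust_code=="id014" & prod_code==2L & geog_code=="VA"])
# binary search on the pre-computed index
timing(DT[CJI(IDX,"id014",2L,"VA")])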
A brief comment on other functions in the package:
?dbCopy: mirror copy tables
?vwap: aggregate tick trade data to OHLC, including VWAP
?anonymize: anonymize sensitive data by hashing
?sql.dt: query a data.table using data.table syntax but with SQL-style arguments and their ordering

# cleanup: close connections and remove the csv file created above
dbDisconnect(conn=sqlite1$conn)
dbDisconnect(conn=h21$conn)
file.remove("geography.csv")