R/dwtools.R

Defines functions POSIXct as.POSIXct is.int.POSIXct nrowDT nrowDTlengthVec pkgsVersion data.equal.data.table is.unique n_unique

Documented in as.POSIXct data.equal.data.table is.int.POSIXct is.unique nrowDT nrowDTlengthVec n_unique pkgsVersion POSIXct

#' @title Data Warehouse tools
#' @description Extension for \link{data.table} package for Data Warehouse related functionalities.
#' @details The core functions includes:
#' \itemize{
#' \item \link{db} as Extracting and Loading tool in ETL terms.
#' \item \link{CUBE} cube class object and MDX like queries on it.
#' \item \link{joinbyv} a denormalization of star schema and snowflake schema to single table.
#' \item \link{dw.populate} populate star schema data.
#' \item \link{timing} measure timing and rows in-out, including logging to db and verbose messages.
#' \item \link{build_hierarchy} transform single dataset to star schema, columns allocation based on cardinalities in unique groupings of each pair.
#' \item \link{idxv} custom indices for in-memory processing.
#' }
#' @note
#' All dot prefixed arguments are designed to be taken from the options, use them only in special cases, they may be removed from functions input args in future.\cr
#' The base option \emph{digits.secs} will be changed to 3 on package load.
#' @docType package
#' @import devtools digest data.table R6
#' @name dwtools
NULL

# dwtools cache, timing logs, etc.
dwtools.cache <- new.env()

# technical ---------------------------------------------------------------

#' @title POSIXct
#' @description 0 length POSIXct wrapper
#' @keywords internal
POSIXct <- function(length = 0, tz = "UTC"){
  structure(numeric(length), class = c("POSIXct","POSIXt"), tzone = tz)
}

#' @title as.POSIXct
#' @description Setting default for UTC and 1970.
#' @keywords internal
as.POSIXct <- function(x,tz="UTC",origin="1970-01-01"){
  base::as.POSIXct(x,tz=tz,origin=origin)
}

#' @title is.int.POSIXct
#' @description Check if is integer and could be POSIX between 1970 and 2100.
#' @keywords internal
is.int.POSIXct <- function(x, date_from = as.POSIXct("1970-01-01"), date_to = as.POSIXct("2100-01-01")){
  is.integer(x) && all(as.POSIXct(x) %between% c(date_from,date_to))
}

#' @title nrowDT
#' @description Return nrow if DT else NA.
#' @keywords internal
nrowDT <- function(x){
  if(any(c("data.frame","data.table") %in% class(x))) nrow(x) else NA_integer_
}

#' @title nrowDTlengthVec
#' @description Return *nrow* if DT, *length* if Vec else *NA*.
#' @keywords internal
nrowDTlengthVec <- function(x){
  if(any(c("data.frame","data.table") %in% class(x))) nrow(x)
  else if(is.list(x) || is.vector(x)) length(x)
  else NA_integer_
}

# maintanance -------------------------------------------------------------

#' @title pkgsVersion
#' @description Batch package version compare between libraries.
#' @param pkgs character vector of packages names.
#' @param libs character vector of libraries paths to compare, vector names will be column names.
#' @export
#' @example tests/example-pkgs_version.R
pkgsVersion <- function(pkgs, libs = .libPaths()){
  # TO DO optimize code below
  l = lapply(libs, function(lib){
    l = lapply(pkgs, function(pkg, lib){
      tryCatch(as.character(packageVersion(pkg, lib.loc = lib)),
               error = function(e) NA_character_)
    }, lib)
    setNames(l,pkgs)
  })
  if(length(names(libs)) > 0){
    l = setNames(l,names(libs))
  }
  else l = setNames(l,libs)
  setDT(l)[,pkg:=pkgs]
  setcolorder(l,c("pkg",names(l)[names(l)!="pkg"]))[]
}

# data.table helpers ------------------------------------------------------

#' @title Data equal in two data.tables
#' @description Test if data equal in two data.tables, can ignore order of rows or columns.
#' @param DT1 data.table.
#' @param DT2 data.table.
#' @param ignore_row_order logical.
#' @param ignore_col_order logical.
#' @param check.attributes logical, only \emph{FALSE} supported.
#' @note Duplicate names in DTs were not tested. All attributes all ignored.
#' @export
#' @example tests/example-data_equal_data_table.R
data.equal.data.table <- function(DT1, DT2, ignore_row_order=TRUE, ignore_col_order=FALSE, check.attributes=FALSE){
  if(check.attributes) stop("check.attributes TRUE is not supported, function test only the data.")
  if(!is.data.table(DT1)) DT1 <- as.data.table(DT1) else DT1 <- copy(DT1)
  if(!is.data.table(DT2)) DT2 <- as.data.table(DT2) else DT2 <- copy(DT2)
  if(!identical(length(DT1),length(DT2))) return(FALSE)
  if(!ignore_row_order && !ignore_col_order) return(identical(DT1,DT2))
  if(ignore_col_order){
    if(!identical(names(DT1),names(DT2))){
      if(!identical(names(DT1)[order(names(DT1))],names(DT2)[order(names(DT2))])) return(FALSE) # check if identical after sorting
      else{
        setcolorder(DT1,names(DT1)[order(names(DT1))])
        setcolorder(DT2,names(DT2)[order(names(DT2))])
      }
    }
  }
  DT1[,`__dwtools_N`:=.N,by=c(names(DT1))]
  DT2[,`__dwtools_N`:=.N,by=c(names(DT2))]
  setkeyv(DT1,names(DT1))
  setkeyv(DT2,names(DT2))
  if(nrow(DT2[!DT1])>0L) return(FALSE)
  if(nrow(DT1[!DT2])>0L) return(FALSE)
  return(TRUE)
}

#' @title Fast check is vector unique
#' @description Test if vector unique.
#' @param x vector
#' @export
#' @example tests/example-is_unique.R
is.unique <- function(x){
  if(is.null(NULL)) return(TRUE)
  identical(attr(data.table:::forderv(x, retGrp=TRUE),"maxgrpn",TRUE),1L)
}

#' @title Fast N unique
#' @description Fast length of unique elements in a vector
#' @param x vector
#' @export
n_unique <- function(x){
  if(is.null(NULL)) return(0L)
  length(attr(data.table:::forderv(x, retGrp=TRUE),"starts",TRUE))
}
jangorecki/dwtools documentation built on May 18, 2019, 12:24 p.m.