R/cube.R

Defines functions selfNames mb.size parse.each.i dimnames.attributes build.each.i initialize print dapply fapply denormalize parse.i subset

# easy operate on names results from *apply functions
selfNames = function(x) setNames(x, x)

mb.size = function(x) as.numeric(object.size(x))/(1024^2)

# handling various types of input to [.cube `...` argument and [[.cube `i` argument.
parse.each.i = function(int, i, keys){
    # preprocessing of `...` arg of `[.cube` and `i` arg of `[[.cube`
    stopifnot(is.integer(int), is.pairlist(i), is.character(keys))
    x = if(int > length(i)) list() else i[[int]] # fill missing args: x[,] to x[,,.()] in case of 3 dimensions
    if(missing(x)) x = list() # empty args: x[,,"asd"] to x[.(),.(),"asd"]
    else if(is.null(x)) x = setNames(list(NULL), keys[[int]]) # null args: x[NULL,NULL,"asd"] to x[.(keycol=NULL),.(keycol=NULL),"asd"]
    if(is.call(x) && x[[1L]]==as.symbol(".")) x[[1L]] = quote(list) # decode x[.(y)] to x[list(y)]
    x = eval.parent(x) # x[,,var] to x[,,.(keycol=c("val1","val2"))], x[,,c("asd","asd2")] to x[,,.(keycol=c("asd","asd2"))]
    if(is.atomic(x)) x = setNames(list(x), keys[[int]]) # x[,,"asd"] to x[,,.(keycol="asd")]
    stopifnot(is.list(x))
    if(length(x)==1L && is.null(x[[1L]]) && is.null(names(x[1L]))) x = setNames(x, keys[[int]]) # x[.(NULL)] to x[.(keycol=NULL)]
    if(length(x)) stopifnot(length(unique(names(x)))==length(names(x))) # unique names
    x
}

# dimension names for attributes
dimnames.attributes = function(x, cb){
    stopifnot(is.character(x), is.cube(cb))
    if(!length(x)) return(character())
    # get all dimension columns and use to detect dimensions used in `by`
    dimcolnames = cb$dapply(colnames)
    # column name match in 2 dimensions, possibility the same dim
    x.dims = character()
    for(b in x){
        x.dim = sapply(dimcolnames, function(colnames) b %in% colnames)
        x.dim = names(x.dim)[x.dim]
        if(length(x.dim) > 1L) stop(sprintf("Column name used in `by`/`MARGIN`/etc. match to columns in mutliple dimensions."))
        x.dims[b] = x.dim
    }
    x.order = unique(x.dims)
    dims.x = lapply(dimcolnames, function(colnames) x[x %in% colnames])
    dims.x[x.order]
}

# building filter query on denormalized dataset
build.each.i = function(dim.i){
    build.each.i.attr = function(attr) if(is.null(dim.i[[attr]])) 0L else as.call(list(quote(`%in%`), as.name(attr), dim.i[[attr]]))
    Reduce(function(a, b) bquote(.(a) & .(b)), lapply(names(dim.i), build.each.i.attr))
}

# cube --------------------------------------------------------------------

#' @title OLAP cube class
#' @docType class
#' @format An R6 class object.
#' @name cube
#' @details Generates cube class objects.
#' @note This class will be deprecated, use \code{\link{data.cube}} class instead.
cube = R6Class(
    classname = "cube",
    public = list(
        env = NULL,
        initialize = function(x){
            self$env = as.environment(x)
            invisible(self)
        },
        print = function(){
            prnt = character()
            prnt["head"] = "<cube>"
            fact.size = self$fapply(mb.size, simplify = TRUE)
            prnt["fact"] = sprintf("fact:\n  %s %s rows x %s cols (%.2f MB)", self$fact, self$fapply(nrow, simplify = TRUE), self$fapply(ncol, simplify = TRUE), fact.size)
            if(length(self$dims)){
                dims.size = self$dapply(mb.size, simplify = TRUE)
                prnt["dims"] = paste0("dims:\n", paste(sprintf("  %s %s rows x %s cols (%.2f MB)", self$dims, self$dapply(nrow, simplify = TRUE), self$dapply(ncol, simplify = TRUE), dims.size), collapse="\n"))
            } else dims.size = 0
            prnt["size"] = sprintf("total size: %.2f MB", sum(c(fact.size, dims.size)))
            cat(prnt, sep="\n")
        },
        # dim apply
        dapply = function(FUN, ..., simplify = FALSE, USE.NAMES = TRUE, dims = self$dims){
            FUN = match.fun(FUN)
            sapply(X = lapply(selfNames(dims), function(x) self$env$dims[[x]]),
                   FUN = FUN, ...,
                   simplify = simplify, USE.NAMES = USE.NAMES)
        },
        # fact apply
        fapply = function(FUN, ..., simplify = FALSE, USE.NAMES = TRUE){
            FUN = match.fun(FUN)
            sapply(X = self$env$fact,
                   FUN = FUN, ...,
                   simplify = simplify, USE.NAMES = USE.NAMES)
        },
        # methods
        denormalize = function(dims = self$dims, na.fill = FALSE){
            all_cols = self$dapply(names, dims = dims)
            key_cols = sapply(all_cols, `[`, 1L)
            lkp_cols = lapply(all_cols, `[`, -1L)
            if(anyDuplicated(unlist(lkp_cols))) stop("Cannot lookup dimension attributes due to the column names duplicated between dimensions.")
            r = if(!na.fill | length(dims)==0L){
                copy(self$env$fact[[self$fact]])
            } else {
                # `nomatch` to be extended after data.table#857 resolved
                self$env$fact[[self$fact]][i = do.call(CJ, self$dapply(`[[`,1L, dims=dims)), nomatch=NA, on = swap.on(key_cols)]
            }
            sapply(dims[as.logical(sapply(lkp_cols, length))], function(dim) lookup(r, self$env$dims[[dim]], lkp_cols[[dim]]))
            if(length(dims)) setkeyv(r, unname(key_cols[dims]))[] else r[]
        },
        parse.i = function(i){
            keys = self$dapply(key, simplify = TRUE)
            i = lapply(setNames(seq_along(keys), names(keys)), parse.each.i, i, keys)
            # - [x] check if all cols exists in dims
            cols_missing = sapply(names(i), function(dim) !all(names(i[[dim]]) %in% names(self$env$dims[[dim]])))
            if(any(cols_missing)) stop(sprintf("Field used in query does not exists in dimensions '%s'.", paste(names(cols_missing)[cols_missing], collapse=", ")))
            i
        },
        # [.cube
        subset = function(..., .dots){
            # - [x] catch dots, preprocess, evaluate
            if(missing(.dots)) .dots = match.call(expand.dots = FALSE)$`...`
            i = self$parse.i(.dots)
            dims.filter = lapply(i, build.each.i)
            # - [x] iterate over dimensions - cleaner than mapply
            r = new.env()
            r$fact = list()
            r$dims = list()
            keys = self$dapply(key, simplify = TRUE)
            for(dim in self$dims){
                if(getOption("datacube.verbose", FALSE)) cat(sprintf("data.cube: processing dimension '%s'.\n", dim))
                r$dims[[dim]] = if(is.null(dims.filter[[dim]])) copy(self$env$dims[[dim]]) else self$env$dims[[dim]][eval(dims.filter[[dim]])]
                setkeyv(r$dims[[dim]], keys[[dim]])
            }
            # - [x] NULL subset returns empty fact
            if(any(sapply(dims.filter, identical, 0L))){
                r$fact[[self$fact]] = self$env$fact[[self$fact]][0L]
                return(as.cube(r))
            }
            fact_filter = !sapply(dims.filter, is.null)
            # - [x]  no filters returns copy of fact
            if(all(!fact_filter)){
                r$fact[[self$fact]] = copy(self$env$fact[[self$fact]])
                return(as.cube(r))
            }
            # - [x] check if binary search possible, only leading fact filters
            if(isTRUE(fact_filter[[1L]])){
                fact_filter2 = copy(fact_filter) # data.table#1419 rleid workaround as `copy()`, fixed in 1.9.7
                binarysearch_dims = self$dims[rleid(fact_filter2)==1L]
                if(getOption("datacube.verbose", FALSE)) cat(sprintf("data.cube: filter facts using binary search on '%s'.\n", paste(binarysearch_dims, collapse=", ")))
                r$fact[[self$fact]] = self$env$fact[[self$fact]][i = do.call(CJ, lapply(selfNames(binarysearch_dims), function(dim) r$dims[[dim]][[1L]])), nomatch = NA]
            } else binarysearch_dims = character()
            # - [x] fact-filter after the gap as vector scans
            vectorscan_dims = setdiff(self$dims[fact_filter], binarysearch_dims)
            if(length(vectorscan_dims)){
                if(getOption("datacube.verbose", FALSE)) cat(sprintf("data.cube: filter facts using vector scan on '%s'.\n", paste(vectorscan_dims, collapse=", ")))
                dims.keys = lapply(setNames(vectorscan_dims, keys[vectorscan_dims]), function(dim) r$dims[[dim]][[1L]])
                qi = Reduce(function(a, b) bquote(.(a) & .(b)), lapply(names(dims.keys), function(col) as.call(list(quote(`%in%`), as.name(col), dims.keys[[col]]))))
                r$fact[[self$fact]] = if(!length(r$fact[[self$fact]])) self$env$fact[[self$fact]][eval(qi)] else r$fact[[self$fact]][eval(qi)]
            }
            # - [x] return cube with all dimensions filtered and fact filtered
            return(as.cube(r))
        },
        # [[.cube
        extract = function(i, j, by, .call){
            if(!missing(.call)){
                i = .call[["i"]]
                j = .call[["j"]]
                #by = .call[["by"]]
            } else {
                #stop("direct access to 'extract' method not yet supported")
                i = substitute(i)
            }
            # parse i
            if(!is.null(i)){
                if(!(i[[1L]]==as.symbol(".") | i[[1L]]==quote(list))) stop("Argument `i` to `[[.cube` must be a call `list()` or `.()`.")
                i = as.list(i)[-1L]
                keep_dims = names(i)
                stopifnot(keep_dims %in% self$dims)
                all.i = sapply(self$dims, list, simplify = FALSE)
                all.i[keep_dims] = i
                i = self$parse.i(as.pairlist(all.i))[keep_dims]
            }
            # get dimensions by attributes
            dims.by = dimnames.attributes(by, self)
            # filter
            dims.filter = lapply(i, build.each.i)
            # processing dims
            r = new.env()
            r$fact = list()
            r$dims = list()
            keys = self$dapply(key, simplify = TRUE)
            copy.dims = unique(c(names(dims.by), names(dims.filter)))
            # copy only id column and the one used in `by`
            for(dim in copy.dims){
                r$dims[[dim]] = if(is.null(dims.filter[[dim]])){
                    #if(!dim %in% names(keys)) browser() # was issue for multiple columns from same dimension
                    self$env$dims[[dim]][, .SD, .SDcols = unique(c(keys[[dim]], dims.by[[dim]]))]
                } else {
                    self$env$dims[[dim]][eval(dims.filter[[dim]]), .SD, .SDcols = unique(c(keys[[dim]], dims.by[[dim]]))]
                    setkeyv(r$dims[[dim]], keys[[dim]])
                }
            }
            # keep only required dimensions, including those for filtering
            measures = setdiff(names(self$env$fact[[self$fact]]), keys)
            rm.dim.keys = keys[!names(keys) %in% copy.dims]
            r$fact[[self$fact]] = if(length(rm.dim.keys)) self$env$fact[[self$fact]][, -rm.dim.keys, with=FALSE] else copy(self$env$fact[[self$fact]])
            # join and filter fact
            for(dim in copy.dims){
                # this doesn't make sense for no-hierarchy cases so should be skipped
                r$fact[[self$fact]] = r$dims[[dim]][r$fact[[self$fact]], nomatch = 0L, on = c(keys[[dim]])]
            }
            # remove dimensions used only in filter
            rm.filter.dims = setdiff(names(dims.filter), names(dims.by))
            if(length(rm.filter.dims)) r$dims[rm.filter.dims] = NULL
            # rollup dimensions
            for(dim in names(r$dims)){
                if(!keys[[dim]] %in% dims.by[[dim]]){
                    r$dims[[dim]][, c(keys[[dim]]) := NULL]
                    new_key = names(r$dims[[dim]])[1L]
                    r$dims[[dim]] = unique(r$dims[[dim]], by = new_key)
                    setkeyv(r$dims[[dim]], new_key)
                }
            }
            # keep only required dimensions, only those used in `by`
            keep.dim.keys = unlist(sapply(r$dims, key))
            #if(any(!keep.dim.keys %in% by) | any(!by %in% keep.dim.keys)) browser()
            r$fact[[self$fact]] = r$fact[[self$fact]][, unique(c(keep.dim.keys, by, measures)), with=FALSE]
            # aggregate facts
            r$fact[[self$fact]] = if(length(by)) r$fact[[self$fact]][, eval(j), by = by] else r$fact[[self$fact]][, eval(j)]
            # setkey
            if(length(by)) setkeyv(r$fact[[self$fact]], by)
            # return cube
            return(as.cube(r))
        },
        # drop used in [.cube
        drop = function(drop=1L){
            keys = self$dapply(key, simplify = TRUE)
            # Direct access to cube object method by `$drop()` should not be used on cubes that shares dimensions, you can use drop arg in `[.cube` safely
            # - [x] drop dimensions where cardinality = 1
            if(nrow(self$env$fact[[self$fact]])){
                cardinality = self$env$fact[[self$fact]][, lapply(.SD, uniqueN), .SDcols = c(unname(keys))]
                dims_to_drop = sapply(setNames(cardinality, self$dims), `==`, 1L)
            } else {
                dims_to_drop = self$dapply(function(dim) nrow(dim)==1L, simplify = TRUE)
            }
            if(any(dims_to_drop)){
                dims_to_drop = names(dims_to_drop)[dims_to_drop]
                keys_to_drop = unname(self$dapply(key, dims = dims_to_drop, simplify = TRUE))
                self$env$dims[dims_to_drop] = NULL
                self$env$fact[[self$fact]][, c(keys_to_drop) := NULL]
                keys_to_keep = unname(setdiff(keys, keys_to_drop))
                if(length(keys_to_keep)) setkeyv(self$env$fact[[self$fact]], keys_to_keep)
            }
            self
        }
    ),
    active = list(
        fact = function() names(self$env$fact), # scalar character
        dims = function() names(self$env$dims) # character vector
    )
)

# *.cube ----------------------------------------------------------------

#' @title Subset cube
#' @param x cube object
#' @param ... values to subset on corresponding dimensions, when wrapping in list it will refer to dimension hierarchy
#' @param drop logical, default TRUE, drop dimensions same as *drop* argument in `[.array`.
#' @note Consider to use \code{\link{data.cube}} class objects instead. \code{data.cube} is a set of R6 classes for cube/fact/dimension/etc and will be more actively developed than \code{cube} class.
#' @return Cube class object
"[.cube" = function(x, ..., drop = TRUE){
    if(!is.logical(drop)) stop("`drop` argument to cube subset must be logical. If argument name conflicts with your dimension name then provide it without name, elements in ... are matched by positions - as in array method - not names.")
    r = x$subset(.dots = match.call(expand.dots = FALSE)$`...`)
    if(isTRUE(drop)) r$drop() else r
    r
}

#' @title Extract cube
#' @param x cube object
#' @param i list of values used to slice and dice on cube
#' @param j expression to evaluate on fact
#' @param by expression/character vector to aggregate measures accroding to *j* argument.
#' @note Consider to use \code{\link{data.cube}} class objects instead. \code{data.cube} is a set of R6 classes for cube/fact/dimension/etc and will be more actively developed than \code{cube} class.
#' @return Cube class object
"[[.cube" = function(x, i, j, by){
    r = x$extract(by = by, .call = match.call())
    r
}

#' @title Test if cube class
#' @param x object to tests.
is.cube = function(x) inherits(x, "cube")

dim.cube = function(x){
    unname(unlist(x$dapply(nrow, simplify = TRUE)))
}

dimnames.cube = function(x){
    r = x$dapply(`[[`,1L)
    if(!length(r)) return(NULL)
    r
}

str.cube = function(object, ...){
    cat("Classes ",paste(paste0("'",class(object),"'"), collapse=", "),".\n",sep="")
    cat("cube$env$fact: ")
    str(object$env$fact, max.level = 1L, give.attr = FALSE)
    cat("cube$env$dims: ")
    str(object$env$dims, max.level = 1L, give.attr = FALSE)
    invisible()
}

format.cube = function(x, measure.format = list(), dots.format = list(), dcast = FALSE, ...){
    stopifnot(is.list(measure.format))
    keys = x$dapply(key, simplify = TRUE)
    measures = setdiff(names(x$env$fact[[x$fact]]), keys)
    if(length(measure.format)) stopifnot(
        sapply(measure.format, is.function),
        length(names(measure.format))==length(measure.format), 
        names(measure.format) %in% measures
    )
    if(length(keys)) r = setorderv(copy(x$env$fact[[x$fact]]), cols = keys, order=1L, na.last=TRUE) else {
        stopifnot(nrow(x$env$fact[[x$fact]])==1L) # grant total
        r = copy(x$env$fact[[x$fact]])
    }
    if(length(measure.format)){
        for(mf in names(measure.format)){
            FUN = measure.format[[mf]]
            DOTS = dots.format[[mf]]
            set(r, i = NULL, j = mf, value = FUN(r[[mf]], ... = DOTS))
        }
    }
    if(isTRUE(dcast)) r = dcast.data.table(r, ...)
    r[]
}

# capply ------------------------------------------------------------------

#' @title Apply function on measures while aggregate on cube dimensions
#' @param x cube object
#' @param MARGIN character or list
#' @param FUN function
#' @param ... arguments passed to *FUN*
#' @description Wrapper around `[[.cube` and `j`, `by` arg.
#' @note Consider to use \code{\link{data.cube}} class objects instead. \code{data.cube} is a set of R6 classes for cube/fact/dimension/etc and will be more actively developed than \code{cube} class.
capply = aggregate.cube = function(x, MARGIN, FUN, ...){
    `.` = NULL
    stopifnot(is.cube(x), !missing(MARGIN), !missing(FUN), is.function(FUN))
    j = as.call(list(quote(lapply), X = quote(.SD), FUN = substitute(FUN), "..." = ...))
    x$extract(i = .(), j = j, by = MARGIN)
}
jangorecki/data.cube documentation built on Aug. 22, 2019, 4:15 p.m.