R/orderby.R

#' Order By
#'
#' Orders the data based on simple comparison between rows.
#'
#' Given a list of expressions and directions, the data is sorted by evaluating
#' the expressions for each row and placing the rows according to the given
#' directions. The expressions are evaluated in order, with earlier expressions
#' given more precdence. If there is a tie for all expressions for two distinct
#' rows, then the order in which they appear in the result is random.
#'
#' The direction for each expression is given as either 'asc' or 'dsc'. The
#' combination of the direction and a corresponding expression is specified as a
#' call, with the direction being the function and the only argument being the
#' expression.
#'
#' For example, if the comparison operators are specified as
#' \code{dsc(a), asc(b)}, then rows with larger values for \code{a} appear
#' before those with smaller values. In the case of a tie, rows with smaller
#' values for \code{b} appear before those with larger values.
#'
#' @param data A \code{\link{waypoint}}.
#' @param \dots Specification of ordering. See \sQuote{details} for more
#'   information.
#' @param limit The maximum number of tuples to include. If this is given as a
#'   positive integer, then only the first \code{limit} rows are kept.
#'
#'   Because ties are randomly broken, some rows may be discarded while rows
#'   that tie with them are kept.
#' @param rank Either NULL or an attribute name. In the latter case, a column is
#'   included in the result whose value is the numeric rank of that row, e.g. 1,
#'   2, ...
#' @param inputs A named list of expressions, with the names being used as the
#'   corresponding outputs. These expressions are outputted in addition to those
#'   used to specify the sorting.
#'
#'   If no name is given and the corresponding  expression is simply an
#'   attribute, then said attribute is used as the name. Otherwise an error is
#'   thrown, as there is no reason to include an extra input if corresponding
#'   output column cannot be referenced later.
#'
#'   If this is not given, then each attribute of \code{data} that is not used
#'   exactly as an expression for comparison is included.
#' @param outputs The usual way to specify the outputs. If both this and names
#'   for the \code{inputs} are given, a warning is given and \code{outputs} is
#'   used.
#' @return A \code{\link{waypoint}} with the designated columns and rows.
#' @author Jon Claus, <jonterainsights@@gmail.com>, Tera Insights, LLC.
#' @seealso \code{\link{ExtremeTuples}} for a similarly functioning GLA.

OrderBy <- function(data, ..., inputs, outputs) {
  constructor <- OrderByMake(..., data = data)
  exprs <- get.exprs(constructor$inputs)
  atts <- as.character(exprs[is.symbols(exprs)])

  if (missing(inputs)) {
    inputs <- convert.schema(subtract(names(data$schema), atts))
  } else {
    inputs <- substitute(inputs)
    check.exprs(inputs)
    inputs <- convert.exprs(inputs)
  }

  if (missing(outputs)) {
    outputs <- convert.names(inputs)
    missing <- which(outputs == "")
    exprs <- grokit$expressions[inputs[missing]]
    if (all(is.symbols(exprs)))
      outputs[missing] <- as.character(exprs)
    else
      stop("no name given for complex inputs:",
           paste("\n\t", lapply(exprs, deparse), collapse = ""))
  } else {
    if (!is.null(attr(inputs, "names")))
      warning("both outputs and named inputs given. outputs used.")
    outputs <- convert.atts(substitute(outputs))
  }

  inputs <- c(constructor$inputs, inputs)
  outputs <- c(constructor$outputs, outputs)

  if (length(inputs) != length(outputs) - constructor$rank)
    stop("The number of inputs and outputs must be the same.")

  Aggregate(data, constructor$GLA, inputs, outputs)
}

#' @rdname OrderBy
OrderByMake <- function(..., limit = 0, rank = NULL, keep.ties = FALSE, data) {
  args <- as.list(substitute(list(...)))[-1]
  names <- names(args)
  ordering <- lapply(args, function(arg) {
    if (length(arg) != 2 || !is.symbol(arg[[1]]))
      stop("Error in ", deparse(arg), "\n",
           "Each input must be a call of asc or dsc on an expression.")
    order <- arg[[1]]
    expr <- arg[[2]]
    check.exprs(expr)
    list(order, expr)
  })
  exprs <- lapply(ordering, `[[`, 2)
  directions <- as.character(lapply(ordering, `[[`, 1))

  if (!all(directions %in% c("asc", "dsc")))
    stop("Directional specifiers must be either asc or dsc.",
         "The following are erroneous:\n",
         paste0("\t", subtract(directions, c("asc", "dsc")), collapse = "\n"))

  if (!(is.logical(keep.ties) && length(keep.ties) == 1))
    stop("keep.ties should be a single boolean value.")

  atts <- unlist(lapply(exprs, convert.exprs, data = data))
  directions <- as.list(directions)
  names(directions) <- atts

  indices <-
    if (is.null(names))
      1:length(exprs)
    else
      which(names == "")
  for (i in indices)
    names[[i]] <-
      if (is.symbol(grokit$expressions[[atts[[i]]]]))
        as.character(grokit$expressions[[atts[[i]]]])
      else
        paste0("_orderAtt", i)

  rank <- substitute(rank)
  if (!is.null(rank)) {
    check.atts(rank)
    rank <- convert.atts(rank)
    names <- c(rank, names)
    class(rank) <- "attribute"
  }

  GLA <- GLA(OrderBy, order = directions, limit, rank, keep.ties)

  list(GLA = GLA,
       inputs = atts,
       outputs = names,
       rank = as.integer(!is.null(rank)))
}
tera-insights/gtBase documentation built on May 31, 2019, 8:35 a.m.