R/setkey.R

Defines functions CJ SJ getNumericRounding setNumericRounding binary setorderv setorder fsort forder forderv maybe_reset_index is.sorted setreordervec haskey getindex indices key setkeyv setindexv setindex setkey

Documented in CJ forder forderv fsort getNumericRounding haskey indices key setindex setindexv setkey setkeyv setNumericRounding setorder setorderv SJ

setkey = function(x, ..., verbose=getOption("datatable.verbose"), physical=TRUE)
{
  if (is.character(x)) stopf("x may no longer be the character name of the data.table. The possibility was undocumented and has been removed.")
  cols = as.character(substitute(list(...))[-1L])
  if (!length(cols)) { cols=colnames(x) }
  else if (identical(cols,"NULL")) cols=NULL
  setkeyv(x, cols, verbose=verbose, physical=physical)
}

# FR #1442
setindex = function(...) setkey(..., physical=FALSE)
setindexv = function(x, cols, verbose=getOption("datatable.verbose")) {
  if (is.list(cols)) {
    sapply(cols, setkeyv, x=x, verbose=verbose, physical=FALSE)
    return(invisible(x))
  } else {
    setkeyv(x, cols, verbose=verbose, physical=FALSE)
  }
}

# Has been warning since 2012, with stronger warning in Mar 2019 (note in news for 1.12.2); #3399
"key<-" = function(x,value) {
  stopf("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012.")
}

setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE)
{
  if (is.null(cols)) {   # this is done on a data.frame when !cedta at top of [.data.table
    if (physical) setattr(x,"sorted",NULL)
    setattr(x,"index",NULL)  # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys.
    return(invisible(x))
  }
  if (!missing(verbose)) {
    stopifnot(isTRUEorFALSE(verbose))
    # set the global verbose option because that is fetched from C code without having to pass it through
    oldverbose = options(datatable.verbose=verbose)
    on.exit(options(oldverbose))
  }
  if (!is.data.table(x)) stopf("x is not a data.table")
  if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?setkey.")
  if (physical && .Call(C_islocked, x)) stopf("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.")
  if (!length(cols)) {
    warningf("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.")
    setattr(x,"sorted",NULL)
    return(invisible(x))
  }
  if (identical(cols,"")) stopf("cols is the empty string. Use NULL to remove the key.")
  if (!all(nzchar(cols))) stopf("cols contains some blanks.")
  cols = gsub("`", "", cols, fixed = TRUE)
  miss = !(cols %chin% colnames(x))
  if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss]))

  if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387
    ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
    setattr(x,"sorted",cols)
    return(invisible(x))
  }

  if (".xi" %chin% names(x)) stopf("x contains a column called '.xi'. Conflicts with internal use by data.table.")
  for (i in cols) {
    .xi = x[[i]]  # [[ is copy on write, otherwise checking type would be copying each column
    if (!typeof(.xi) %chin% ORDERING_TYPES) stopf("Column '%s' is type '%s' which is not supported as a key column type, currently.", i, typeof(.xi))
  }
  if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov

  if (verbose) {
    # we now also retGrp=TRUE #4387 for !physical
    tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
    # suppress needed for tests 644 and 645 in verbose mode
    catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"])
  } else {
    o = forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE)
  }
  if (!physical) { # index COULD BE saved from C forderReuseSorting already, but disabled for now
    maybe_reset_index(x, o, cols)
    return(invisible(x))
  }
  if (length(o)) {
    setattr(x,"index",NULL)   # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. Only when order changes.
    if (verbose) { last.started.at = proc.time() }
    .Call(Creorder,x,o)
    if (verbose) { catf("reorder took %s\n", timetaken(last.started.at)); flush.console() }
  } else {
    if (verbose) catf("x is already ordered by these columns, no need to call reorder\n")
  } # else empty integer() from forderv means x is already ordered by those cols, nothing to do.
  setattr(x,"sorted",cols)
  invisible(x)
}

key = function(x) attr(x, "sorted", exact=TRUE)

indices = function(x, vectors = FALSE) {
  ans = names(attributes(attr(x, "index", exact=TRUE)))
  if (is.null(ans)) return(ans) # otherwise character() gets returned by next line
  ans = gsub("^__","",ans)     # the leading __ is internal only, so remove that in result
  if (isTRUE(vectors))
    ans = strsplit(ans, "__", fixed = TRUE)
  ans
}

getindex = function(x, name) {
  # name can be "col", or "col1__col2", or c("col1","col2")
  ans = attr(attr(x, 'index', exact=TRUE), paste0("__",name,collapse=""), exact=TRUE)
  if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) {
    stopf("Internal error: index '%s' exists but is invalid", name)   # nocov
  }
  c(ans) ## drop starts and maxgrpn attributes
}

haskey = function(x) !is.null(key(x))

# reorder a vector based on 'order' (integer)
# to be used in fastorder instead of x[o], but in general, it's better to replace vector subsetting with this..?
# Basic checks that all items of order are in range 1:n with no NAs are now made inside Creorder.
# FOR INTERNAL USE ONLY
setreordervec = function(x, order) .Call(Creorder, x, order)

# sort = sort.int = sort.list = order = is.unsorted = function(...)
#    stopf("Should never be called by data.table internals. Use is.sorted() on vectors, or forder() for lists and vectors.")
# Nice idea, but users might use these in i or j e.g. blocking order caused tests 304 to fail.
# Maybe just a grep through *.R for use of these function internally would be better (TO DO).

# Don't use base::is.unsorted internally, because :
#    1) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
#       "sorted" attribute will need an extra attribute "locale" so we can check if key's locale is the current locale)
#    2) wrapper needed, used to be :
#       identical(FALSE,is.unsorted(x)) && !(length(x)==1 && is.na(x))
#       where the && was needed to maintain backwards compatibility after r-devel's change of is.unsorted(NA) to FALSE (was NA) [May 2013].
# The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for
# consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors.

is.sorted = function(x, by=NULL) {
  if (is.list(x)) {
    if (missing(by)) by = seq_along(x)   # wouldn't make sense when x is a vector; hence by=seq_along(x) is not the argument default
    if (is.character(by)) by = chmatch(by, names(x))
  } else {
    if (!missing(by)) stopf("x is vector but 'by' is supplied")
  }
  .Call(Cissorted, x, as.integer(by))
  # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1)
}

maybe_reset_index = function(x, idx, cols) {
  if (isTRUE(getOption("datatable.forder.auto.index"))) return(invisible())
  if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
  setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), idx)
  invisible(x)
}

ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character')
forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, reuseSorting=getOption("datatable.reuse.sorting", NA)) {
  if (is.atomic(x) || is.null(x)) {  # including forderv(NULL) which returns error consistent with base::order(NULL),
    if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense")
    by = NULL
  } else {
    if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L))
    by = colnamesInt(x, by, check_dups=FALSE)
  }
  order = as.integer(order) # length and contents of order being +1/-1 is checked at C level
  .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting)  # returns integer() if already sorted, regardless of sort=TRUE|FALSE
}

forder = function(..., na.last=TRUE, decreasing=FALSE)
{
  sub = substitute(list(...))
  tt = vapply_1b(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x)))
  if (any(tt)) sub[tt] = NULL  # remove any NULL or empty arguments; e.g. test 1962.052: forder(DT, NULL) and forder(DT, )
  if (length(sub)<2L) return(NULL)  # forder() with no arguments returns NULL consistent with base::order
  asc = rep.int(1L, length(sub)-1L)  # ascending (1) or descending (-1) per column
  # the idea here is to intercept - (and unusual --+ deriving from built expressions) before vectors in forder(DT, -colA, colB) so that :
  # 1) - on character vector works; ordinarily in R that fails with type error
  # 2) each column/expression can have its own +/- more easily that having to use a separate decreasing=TRUE/FALSE
  # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and apply - to every element first
  # We intercept the unevaluated expressions and massage them before evaluating in with(DT) scope or not depending on the first item.
  for (i in seq.int(2L, length(sub))) {
    v = sub[[i]]
    while (v %iscall% c('-', '+') && length(v)==2L) {
      if (v[[1L]] == "-") asc[i-1L] = -asc[i-1L]
      sub[[i]] = v = v[[2L]]  # remove the leading +/- which is the 2nd item since length(v)==2; i.e. monadic +/-
    }
  }
  x = eval(sub[[2L]], parent.frame(), parent.frame())
  if (is.list(x)) {
    if (length(x)==0L && is.data.frame(x)) stopf("Attempting to order a 0-column data.table or data.frame.")
    sub[2L] = NULL  # change list(DT, ...) to list(...)
    if (length(sub)==1L) {
      data = x
    } else {
      if (!is.data.frame(x)) stopf("The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.")
      asc = asc[-1L]
      data = eval(sub, x, parent.frame())
    }
  } else {
    data = eval(sub, parent.frame(), parent.frame())
  }
  stopifnot(isTRUEorFALSE(decreasing))
  o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last)
  if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o
  o
}

fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...)
{
  containsNAs = FALSE
  if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) {
      if (internal) stopf("Internal code should not be being called on type double")
      return(.Call(Cfsort, x, verbose))
  }
  else {
    # fsort is now exported for testing. Trying to head off complaints "it's slow on integer"
    # The only places internally we use fsort internally (3 calls, all on integer) have had internal=TRUE added for now.
    # TODO: implement integer and character in Cfsort and remove this branch and warning
    if (!internal){
      if (typeof(x)!="double") warningf("Input is not a vector of type double. New parallel sort has only been done for double vectors so far. Using one thread.")
      if (decreasing)  warningf("New parallel sort has not been implemented for decreasing=TRUE so far. Using one thread.")
      if (containsNAs) warningf("New parallel sort has not been implemented for vectors containing NA values so far. Using one thread.")
    }
    orderArg = if (decreasing) -1 else 1
    o = forderv(x, order=orderArg, na.last=na.last)
    return( if (length(o)) x[o] else x )
  }
}

setorder = function(x, ..., na.last=FALSE)
# na.last=FALSE here, to be consistent with data.table's default
# as opposed to DT[order(.)] where na.last=TRUE, to be consistent with base
{
  if (!is.data.frame(x)) stopf("x must be a data.frame or data.table")
  cols = substitute(list(...))[-1L]
  if (identical(as.character(cols),"NULL")) return(x)
  if (length(cols)) {
    cols=as.list(cols)
    order=rep(1L, length(cols))
    for (i in seq_along(cols)) {
      v=as.list(cols[[i]])
      if (length(v) > 1L && v[[1L]] == "+") v=v[[-1L]]
      else if (length(v) > 1L && v[[1L]] == "-") {
        v=v[[-1L]]
        order[i] = -1L
      }
      cols[[i]]=as.character(v)
    }
    cols=unlist(cols, use.names=FALSE)
  } else {
    cols=colnames(x)
    order=rep(1L, length(cols))
  }
  setorderv(x, cols, order, na.last)
}

setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE)
{
  if (is.null(cols)) return(x)
  if (!is.data.frame(x)) stopf("x must be a data.frame or data.table")
  na.last = as.logical(na.last)
  if (is.na(na.last) || !length(na.last)) stopf('na.last must be logical TRUE/FALSE')
  if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?setorder.")
  if (!length(cols)) {
    warningf("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.")
    return(x)
  }
  if (!all(nzchar(cols))) stopf("cols contains some blanks.")     # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well
  # remove backticks from cols
  cols = gsub("`", "", cols, fixed = TRUE)
  miss = !(cols %chin% colnames(x))
  if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss]))
  if (".xi" %chin% colnames(x)) stopf("x contains a column called '.xi'. Conflicts with internal use by data.table.")
  for (i in cols) {
    .xi = x[[i]]  # [[ is copy on write, otherwise checking type would be copying each column
    if (!typeof(.xi) %chin% ORDERING_TYPES) stopf("Column '%s' is type '%s' which is not supported for ordering currently.", i, typeof(.xi))
  }
  if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov

  o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last)
  if (length(o)) {
    .Call(Creorder, x, o)
    if (is.data.frame(x) && !is.data.table(x)) {
      setattr(x, 'row.names', rownames(x)[o])
    }
    k = key(x)
    if (!identical(head(cols, length(k)), k) || any(head(order, length(k)) < 0L))
      setattr(x, 'sorted', NULL) # if 'forderv' is not 0-length & key is not a same-ordered subset of cols, it means order has changed. So, set key to NULL, else retain key.
    setattr(x, 'index', NULL)  # remove secondary keys too. These could be reordered and retained, but simpler and faster to remove
  }
  invisible(x)
}

binary = function(x) .Call(Cbinary, x)

setNumericRounding = function(x) invisible(.Call(CsetNumericRounding, as.integer(x)))
getNumericRounding = function() .Call(CgetNumericRounding)

SJ = function(...) {
  JDT = as.data.table(list(...))
  setkey(JDT)
}
# S for Sorted, usually used in i to sort the i table

# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid setalloccol

CJ = function(..., sorted = TRUE, unique = FALSE)
{
  # Pass in a list of unique values, e.g. ids and dates
  # Cross Join will then produce a join table with the combination of all values (cross product).
  # The last vector is varied the quickest in the table, so dates should be last for roll for example
  l = list(...)
  vnames = name_dots(...)$vnames
  if (any(tt <- !nzchar(vnames))) vnames[tt] = paste0("V", which(tt))
  dups = FALSE # fix for #1513
  for (i in seq_along(l)) {
    y = l[[i]]
    if (!length(y)) next
    if (sorted) {
      if (!is.atomic(y)) stopf("'sorted' is TRUE but element %d is non-atomic, which can't be sorted; try setting sorted = FALSE", i)
      o = forderv(y, retGrp=TRUE)
      thisdups = attr(o, 'maxgrpn', exact=TRUE)>1L
      if (thisdups) {
        dups = TRUE
        if (length(o)) l[[i]] = if (unique) y[o[attr(o, "starts", exact=TRUE)]] else y[o]
        else if (unique) l[[i]] = y[attr(o, "starts", exact=TRUE)]  # test 1525.5
      } else {
        if (length(o)) l[[i]] = y[o]
      }
    } else {
      if (unique) l[[i]] = unique(y)
    }
  }
  nrow = prod(lengths(l))
  if (nrow > .Machine$integer.max) stopf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max)
  l = .Call(Ccj, l)
  setDT(l)
  l = setalloccol(l)  # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since
                    # it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation
  setnames(l, vnames)
  if (sorted) {
    if (!dups) setattr(l, 'sorted', names(l))
    else setkey(l) # fix #1513
  }
  l
}

Try the data.table package in your browser

Any scripts or data that you put into this service are public.

data.table documentation built on Sept. 11, 2024, 6:25 p.m.