Nothing
cbindlist_impl_ = function(l, copy) {
ans = .Call(Ccbindlist, l, copy)
if (anyDuplicated(names(ans))) { ## invalidate key and index
setattr(ans, "sorted", NULL)
setattr(ans, "index", NULL)
}
setDT(ans)
ans
}
cbindlist = function(l) cbindlist_impl_(l, copy=TRUE)
setcbindlist = function(l) invisible(cbindlist_impl_(l, copy=FALSE))
# when 'on' is missing then use keys, used only for inner and full join
onkeys = function(x, y) {
if (is.null(x) && !is.null(y)) return(y)
if (!is.null(x) && is.null(y)) return(x)
if (!is.null(x) && !is.null(y)) {
if (length(x) >= length(y))
return(intersect(y, x)) ## align order to shorter|rhs key
else
return(intersect(x, y))
}
NULL # nocov. Internal error is being called later in mergepair
}
# column index selection helper
someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
keep = colnamesInt(x, keep)
drop = colnamesInt(x, drop)
cols = colnamesInt(x, cols)
ans = union(keep, setdiff(cols, drop))
if (!retain.order) return(ans)
sort(ans)
}
hasindex = function(x, by, retGrp=FALSE) {
index = attr(x, "index", TRUE)
if (is.null(index)) return(FALSE)
idx_name = paste0("__", by, collapse="")
idx = attr(index, idx_name, TRUE)
if (is.null(idx)) return(FALSE)
if (!retGrp) return(TRUE)
!is.null(attr(idx, "starts", TRUE))
}
# fdistinct applies mult='first|last'
# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
# it may not copy when copy=FALSE and x is unique by 'on'
fdistinct = function(x, on=key(x), mult=c("first", "last"), cols=seq_along(x), copy=TRUE) {
if (!perhaps.data.table(x))
stopf("'x' must be data.table")
if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x)))
stopf("'on' must be character column names of 'x' argument")
mult = match.arg(mult)
if (is.null(cols))
cols = seq_along(x)
else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
if (!isTRUEorFALSE(copy))
stopf("'%s' must be TRUE or FALSE", "copy")
## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
## this short circuit will work after #4386 because it requires retGrp=T
#### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
sort = TRUE ## above line does not work for the moment, test 302.02
o = forderv(x, by=on, sort=sort, retGrp=TRUE)
if (attr(o, "maxgrpn", TRUE) <= 1L) {
ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
if (copy) ans = copy(ans)
return(ans)
}
f = attr(o, "starts", exact=TRUE)
if (mult == "last") {
if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
f = c(f[-1L] - 1L, nrow(x)) ## last of each group
}
if (length(o)) f = o[f]
if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order
.Call(CsubsetDT, x, f, someCols(x, cols, keep=on))
}
# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow)
# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT
dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
nomatch = switch(how,
inner=, semi=, anti=, cross= 0L,
left=, right=, full=NA_integer_)
nomatch0 = identical(nomatch, 0L)
if (is.null(mult))
mult = switch(how,
semi=, anti="last",
cross="all",
inner=, left=, right=, full="error")
if (void && mult != "error")
internal_error("'void' must be used with mult='error'") # nocov
if (how == "cross") { ## short-circuit bmerge results only for cross join
if (length(on) || mult != "all" || !join.many)
stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
if (void)
internal_error("cross join must be used with void=FALSE") # nocov
ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer())
} else {
if (!length(on))
stopf("'on' must be non-zero length character vector")
if (mult == "all" && (how == "semi" || how == "anti"))
stopf("semi and anti joins must be used with mult!='all'")
icols = colnamesInt(i, on, check_dups=TRUE)
xcols = colnamesInt(x, on, check_dups=TRUE)
ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line
return(invisible(NULL))
} else if (how == "semi" || how == "anti") { ## semi and anti short-circuit
## we will subset i rather than x, thus assign to irows, not to xrows
if (how == "semi")
irows = which(ans$lens != 0L)
else
irows = which(ans$lens == 0L)
if (length(irows) == length(ans$lens)) irows = NULL
return(list(ans=ans, irows=irows))
} else if (mult == "all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
!(length(ans$starts) == 1L && ans$lens == nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
anyDuplicated(ans$starts, incomparables=c(0L, NA_integer_))
)
stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
}
## xrows, join-to
xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)]
len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here
## irows, join-from
irows = if (!(ans$allLen1 && (!nomatch0 || len.x == length(ans$starts)))) seqexp(ans$lens)
len.i = if (is.null(irows)) nrow(i) else length(irows)
if (length(ans$xo) && length(xrows))
xrows = ans$xo[xrows]
len.x = length(xrows)
if (len.i != len.x)
internal_error("dtmerge out len.i != len.x") # nocov
list(ans=ans, irows=irows, xrows=xrows)
}
# atomic join between two tables
mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=names(rhs), copy=TRUE, join.many=TRUE, verbose=FALSE) {
semi_or_anti = how == "semi" || how == "anti"
inner_or_full = how == "inner" || how == "full"
if (how != "cross") {
if (is.null(on)) {
if (how == "left" || semi_or_anti) on = key(rhs)
else if (how == "right") on = key(lhs)
else if (inner_or_full) on = onkeys(key(lhs), key(rhs))
if (is.null(on))
stopf("'on' is missing and necessary key is not present")
}
if (any(bad.on <- !on %chin% names(lhs)))
stopf("'on' argument specifies columns to join %s that are not present in %s table %s", brackify(on[bad.on]), "LHS", brackify(names(lhs)))
if (any(bad.on <- !on %chin% names(rhs)))
stopf("'on' argument specifies columns to join %s that are not present in %s table %s", brackify(on[bad.on]), "RHS", brackify(names(rhs)))
} else if (is.null(on)) {
on = character() ## cross join only
}
## join-to and join-from tables and columns (right outer join-->swap)
if (how != "right") {
join_from = lhs; from_cols = lhs.cols; join_to = rhs; to_cols = rhs.cols
} else {
join_from = rhs; from_cols = rhs.cols; join_to = lhs; to_cols = lhs.cols
}
## ensure symmetric join for inner|full join, apply mult on both tables, bmerge do only 'x' table
copy_i = FALSE ## copy marker of out.i
if (inner_or_full && !is.null(mult) && (mult == "first" || mult == "last")) {
join_from = fdistinct(join_from, on=on, mult=mult, cols=from_cols, copy=FALSE) ## might not copy when already unique by 'on'
copy_i = nrow(join_from) != nrow(lhs) ## nrow(lhs) bc how='inner|full' so join_from=lhs
} else if (how == "inner" && (is.null(mult) || mult == "error")) { ## we do this branch only to raise error from bmerge, we cannot use forder to just find duplicates because those duplicates might not have matching rows in another table, full join checks mult='error' during two non-void bmerges
dtmerge(x=join_from, i=join_to, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many, void=TRUE)
}
## binary merge
ans = dtmerge(x=join_to, i=join_from, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many)
## make i side; avoid subsetting if possible
cols_i = someCols(join_from, from_cols, keep=on, retain.order=semi_or_anti)
if (is.null(ans$irows)) {
out.i = .shallow(join_from, cols=cols_i, retain.key=TRUE)
} else {
out.i = .Call(CsubsetDT, join_from, ans$irows, cols_i)
copy_i = TRUE
}
## make x side
copy_x = TRUE
if (semi_or_anti) {
out.x = list()
} else {
if (is.null(ans$xrows)) ## as of now xrows cannot be NULL #4409 thus nocov below
internal_error("dtmerge()$xrows returned NULL, #4409 been resolved but related code has not been updated?") # nocov
out.x = .Call(CsubsetDT, join_to, ans$xrows, someCols(join_to, to_cols, drop=on))
copy_x = TRUE
## ensure no duplicated column names in merge results
if (any(dup.i <- names(out.i) %chin% names(out.x)))
stopf("merge result has duplicated column names %s, use 'cols' argument or rename columns in 'l' tables", brackify(names(out.i)[dup.i]))
}
## stack i and x
if (how != "full") {
if (!copy_i && copy) out.i = copy(out.i)
#if (!copy_x && copy) out.x = copy(out.x) ## as of now copy_x always TRUE, search for #4409 here
out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
if (how == "right") setcolorder(out, neworder=c(on, names(out.x))) ## arrange columns: i.on, x.cols, i.cols
} else { # how=="full"
## we made left join side above, proceed to right join side, so swap tbls
join_from = rhs; from_cols = rhs.cols; join_to = lhs; to_cols = lhs.cols
copy_r = FALSE
if (!is.null(mult) && (mult == "first" || mult == "last")) {
join_from = fdistinct(join_from, on=on, mult=mult, cols=from_cols, copy=FALSE)
copy_r = nrow(join_from) != nrow(rhs) ## nrow(rhs) bc join_from=rhs
} ## mult=="error" check was made on one side already, below we do on the second side, test 101.43
## binary merge anti join; only need to keep 'irows'
mult = if (!is.null(mult) && mult != "all") mult
supplement_rows = dtmerge(x=join_to, i=join_from, on=on, how="anti", mult=mult, verbose=verbose, join.many=join.many)$irows
## make anti join side
cols_r = someCols(join_from, from_cols, keep=on)
if (is.null(supplement_rows)) {
out.r = .shallow(join_from, cols=cols_r, retain.key=TRUE) ## retain.key is used only in the edge case when !nrow(out.i)
} else {
out.r = .Call(CsubsetDT, join_from, supplement_rows, cols_r)
copy_r = TRUE
}
## short circuit to avoid rbindlist to empty sets and retains keys
if (!nrow(out.r)) { ## possibly also !nrow(out.i)
if (!copy_i && copy) out.i = copy(out.i)
#if (!copy_x && copy) out.x = copy(out.x) ## as of now copy_x always TRUE, search for #4409 here
out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
} else if (!nrow(out.i)) { ## but not !nrow(out.r)
if (!copy_r && copy) out.r = copy(out.r)
if (length(add <- setdiff(names(out.i), names(out.r)))) { ## add missing columns of proper types NA
neworder = copy(names(out.i)) #set(out.r, NULL, add, lapply(unclass(out.i)[add], `[`, 1L)) ## 291.04 overalloc exceed fail during set()
out.i = lapply(unclass(out.i)[add], `[`, seq_len(nrow(out.r))) ## could eventually remove this when cbindlist recycle 0 rows up, note that we need out.r not to be copied
out.r = .Call(Ccbindlist, list(out.r, out.i), FALSE)
setcolorder(out.r, neworder=neworder)
}
out = out.r
} else { ## all might have not been copied yet, rbindlist will copy
out.l = .Call(Ccbindlist, list(out.i, out.x), FALSE)
out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE)
}
}
setDT(out)
}
mergelist_impl_ = function(l, on, cols, how, mult, join.many, copy) {
verbose = getOption("datatable.verbose")
if (verbose)
p = proc.time()[[3L]]
if (!is.list(l) || is.data.frame(l))
stopf("'%s' must be a list", "l")
if (!all(vapply_1b(l, is.data.table)))
stopf("Every element of 'l' list must be data.table objects")
if (!all(idx <- lengths(l) > 0L))
stopf("Tables in 'l' must all have columns, but these entries have 0: %s", brackify(which(!idx)))
if (any(idx <- vapply_1i(l, function(x) anyDuplicated(names(x))) > 0L))
stopf("Column names in individual 'l' entries must be unique, but these have some duplicates: %s", brackify(which(idx)))
n = length(l)
if (n < 2L) {
out = if (n) l[[1L]] else as.data.table(l)
if (copy) out = copy(out)
if (verbose)
catf("mergelist: merging %d table(s), took %.3fs\n", n, proc.time()[[3L]]-p)
return(out)
}
if (!is.list(join.many))
join.many = rep(list(join.many), n - 1L)
if (length(join.many) != n - 1L || !all(vapply_1b(join.many, isTRUEorFALSE)))
stopf("'join.many' must be TRUE or FALSE, or a list of such whose length must be length(l)-1L")
if (missing(mult))
mult = NULL
if (!is.list(mult))
mult = rep(list(mult), n - 1L)
if (length(mult) != n - 1L || !all(vapply_1b(mult, function(x) is.null(x) || (is.character(x) && length(x) == 1L && !anyNA(x) && x %chin% c("error", "all", "first", "last")))))
stopf("'mult' must be one of [error, all, first, last] or NULL, or a list of such whose length must be length(l)-1L")
if (!is.list(how))
how = rep(list(how), n-1L)
if (length(how)!=n-1L || !all(vapply_1b(how, function(x) is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("left", "inner", "full", "right", "semi", "anti", "cross"))))
stopf("'how' must be one of [left, inner, full, right, semi, anti, cross], or a list of such whose length must be length(l)-1L")
if (is.null(cols)) {
cols = vector("list", n)
} else {
if (!is.list(cols))
stopf("'%s' must be a list", "cols")
if (length(cols) != n)
stopf("'cols' must be same length as 'l' (%d != %d)", length(cols), n)
skip = vapply_1b(cols, is.null)
if (!all(vapply_1b(cols[!skip], function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x))))
stopf("'cols' must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULLs (all columns)")
if (any(mapply(function(x, icols) !all(icols %chin% names(x)), l[!skip], cols[!skip])))
stopf("'cols' specify columns not present in corresponding table")
}
if (missing(on) || is.null(on)) {
on = vector("list", n - 1L)
} else {
if (!is.list(on))
on = rep(list(on), n - 1L)
if (length(on) != n-1L || !all(vapply_1b(on, function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) ## length checked in dtmerge
stopf("'on' must be non-NA, non-duplicated, character vector, or a list of such which length must be length(l)-1L")
}
l.mem = lapply(l, vapply, address, "")
out = l[[1L]]
out.cols = cols[[1L]]
for (join.i in seq_len(n - 1L)) {
rhs.i = join.i + 1L
out = mergepair(
lhs = out, rhs = l[[rhs.i]],
on = on[[join.i]],
how = how[[join.i]], mult = mult[[join.i]],
lhs.cols = out.cols, rhs.cols = cols[[rhs.i]],
copy = FALSE, ## avoid any copies inside, will copy once below
join.many = join.many[[join.i]],
verbose = verbose
)
out.cols = copy(names(out))
}
out.mem = vapply_1c(out, address)
if (copy)
.Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))]))
if (verbose)
catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]] - p)
out
}
mergelist = function(l, on, cols=NULL, how=c("left", "inner", "full", "right", "semi", "anti", "cross"), mult, join.many=getOption("datatable.join.many")) {
if (missing(how) || is.null(how))
how = match.arg(how)
mergelist_impl_(l, on, cols, how, mult, join.many, copy=TRUE)
}
setmergelist = function(l, on, cols=NULL, how=c("left", "inner", "full", "right", "semi", "anti", "cross"), mult, join.many=getOption("datatable.join.many")) {
if (missing(how) || is.null(how))
how = match.arg(how)
invisible(mergelist_impl_(l, on, cols, how, mult, join.many, copy=FALSE))
}
# Previously, we had a custom C implementation here, which is ~2x faster,
# but this is fast enough we don't bother maintaining a new routine.
# Hopefully in the future rep() can recognize the ALTREP and use that, too.
seqexp = function(x) rep(seq_along(x), x)
perhaps.data.table = function(x) .Call(CperhapsDataTableR, x)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.