#' Random field of points
#'
#' Test and explore the functionality of \code{\link{coalesce}} with a
#' random field of weighted points generated by this function.
#'
#' @param n
#' integer [1]; number of points
#' @param TwoD
#' logical [1];
#' should the points be distributed in only two dimensions?
#'
#' @return
#' \link[data.table]{data.table} with columns:\cr
#' \code{..$x,y,z} (num): position (\code{z} only if \code{ThreeD})\cr
#' \code{..$m} (num): mass (or weight) of points
#'
#' @export
#'
#' @examples
#' test.points(5L)
#' test.points(150L, FALSE)
#' test.points(150L)
#' \dontrun{test.points(1e5)}
#'
test.points <- function(n = 100L, TwoD = FALSE){
as.data.table(list(x = sample(0:50000, n, TRUE)/500,
y = sample(0:50000, n, TRUE)/500,
z = if(!TwoD) sample(0:50000, n, TRUE)/50000,
m = sample(0:100, n, TRUE)/100))
}
#' Coalesce
#'
#' Organise a field of weighted particles by grouping clusters.
#'
#' @param xyzm
#' \link[data.table]{data.table}, or object coercible to one;
#' 4- (or 3- if \code{TwoD}) columns, with rows representing individual
#' particles and columns giving the x, y and z (if not \code{TwoD})
#' co-ordinates and the mass (or weight) of the particles (as with
#' return value from \code{\link{test.points}})
#' @param cdh,cdv
#' numeric [1];
#' horizontal and vertical search radii for finding nearby particles; the
#' search region around each particle is a circle (\code{TwoD}) or an
#' ellipsoid with circular horizontal section
#' @param mm
#' numeric [1];
#' minimum mass for resulting particles: particles below this mass are
#' coalesced with the nearest particle even if outside the search radius
#' @param subregions
#' logical [1];
#' use subregions? Highly recommended for vast performance increases: will
#' sort particles into rectangular subregions before coalescing so that
#' particle distances will only be calculated between particles in the same
#' subregion, with the sacrifice that the occasional grouping will be
#' missed for particle pairs straddling subregion boundaries.
#' Automatically switches off for data sets that are too small to benefit.
#' @param nppsr
#' integer [1];
#' if using subregions, how many particles should there be in each
#' subregion, at least; 256 highly recommended as generally the best
#' compromise
#' @param TwoD
#' logical [1];
#' set to \code{TRUE} if data set is 2D (i.e. no z column)
#' @param na.rep
#' logical [1];
#' rows which contain NAs are removed prior to coalescing; set
#' \code{na.rep = TRUE} if these rows should be replaced after coalescing
#' @param maxnp
#' integer [1] or \code{Inf};
#' maximum number of particles in the result; the lightest particles that
#' exceed this limit are forced to coalesce with their nearest neighbours,
#' irrespective of the distance
#' @param maxbymm
#' not used
#' @param print.timings
#' logical [1];
#' put \code{TRUE} if you would like to see a print out of the timings of
#' the various sections of this function
#'
#' @return
#' a \link[data.table]{data.table} with columns:\cr
#' \code{..$x,y,z} (num): \code{z} only if not \code{TwoD}\cr
#' \code{..$m} (num): mass/ weight
#'
#' @import data.table
#' @export
#'
#' @examples
#' library(graphics)
#' library(data.table)
#'
#' tps <- test.points(100L, TRUE)
#'
#' # total mass and centre of mass
#' sum(tps$m)
#' tps[, lapply(list(x = x, y = y), weighted.mean, m)]
#'
#' # plot original points - size represents mass
#' plot(tps[, .(x, y)], cex = tps$m*2, pch = 16L)
#'
#' cps1 <- coalesce(tps, 5, TwoD = TRUE)
#' cps2 <- coalesce(tps, 10, TwoD = TRUE)
#'
#' # total mass and centre of mass should be unchanged
#' sum(cps1$m)
#' cps1[, lapply(list(x = x, y = y), weighted.mean, m)]
#' sum(cps2$m)
#' cps2[, lapply(list(x = x, y = y), weighted.mean, m)]
#'
#' points(cps1[, .(x, y)], cex = cps1$m*2, pch = 16L, col = "red")
#' points(cps2[, .(x, y)], cex = cps2$m*2, pch = 16L, col = "blue")
#' points(tps[, .(x, y)], cex = tps$m*2)
#'
#' legend("bottom", legend = c("original", "cdh = 5", "cdh = 10"),
#' ncol = 3L, col = c("black", "red", "blue"), pch = 16L,
#' bg = "white")
#'
coalesce <- function(xyzm, cdh, cdv = cdh, mm = 0,
subregions = TRUE, nppsr = 256L,
TwoD = FALSE, na.rep = FALSE,
maxnp = Inf, maxbymm = TRUE,
print.timings = FALSE){
if(print.timings) st.time <- Sys.time()
fe <- environment()
# copy makes a deep copy, removing the by reference relationship to the
# xyzm data table, which would cause it to retain a link with xyzm and
# therefore be modified along with it
cns <- copy(colnames(xyzm))
xyzm <- as.data.table(xyzm)
# Remove rows containing NA for the calculation. Optionally save them for
# reattachment at the end.
nas <- Reduce(`|`, lapply(xyzm, is.na))
if(na.rep && any(nas)) xyzmna <- xyzm[nas,]
if(any(nas)) xyzm <- xyzm[!nas,]
if(!is.finite(cdv)) cdv <- 0
if(cdh == 0){
# because otherwise get 0/0: doesn't change result in this case
cdv <- 1
}
nan.flag <- cdv == 0
setnames(xyzm, c("x", "y", if(!TwoD) "z", "m"))
# remove zero-mass particles
xyzm <- xyzm[xyzm$m != 0,]
np <- nrow(xyzm)
# number of subregions: the largest square number that will allow at
# least 256 (or nppsr) points per subregion
if(subregions){
sqrtnsr <- as.integer(sqrt(np)%/%sqrt(nppsr))
nsr <- as.integer(sqrtnsr^2L)
# if not enough particles to justify subregions (or effectively one
# subregion)
if(nsr < 4L) subregions <- FALSE
}
# groups points into 16 subgroups to reduce the number of distance
# calculations that must be performed (no point if small number of
# particles, and should not be done if cd is large compared to bounding
# box)
# - grouping is by x and y only
if(subregions){
# divide into roughly equally populated sub-y-sections
setkey(xyzm, y)
xyzm[, sry := as.integer(.I%/%ceiling((np + 1L)/sqrtnsr) + 1L)]
# divide the sub-y-sections into roughly equally populated
# sub-x-sections
setkey(xyzm, sry, x)
xyzm[, srx := {
as.integer((.I - .I[1L] + 1L)%/%ceiling((.N + 1L)/sqrtnsr) + 1L)
}, by = sry]
# give each subregion a single unique identifier
xyzm[, sr := .GRP, by = c("sry", "srx")]
xyzm[, c("srx", "sry") := NULL]
}
if(print.timings) sr.time <- Sys.time()
if(cdh > 0){
setkey(xyzm, m)
# group to which assigned
grlabs <- integer(np)
xyzm[, group := {
# initialise
# - integer group labels
grvec <- integer(.N)
#
# - logical flags changed to TRUE when points are assigned to groups,
# so that particles are not checked when they are already assigned
# -- saves time
# -- avoids long group chains
asd <- logical(.N)
# those particle numbers within particle p's subregion to which p is
# close, including self
cl <- lapply(.N:1, function(p){
# if this particle is already assigned to a group, then skip
if(asd[p]) return(integer(0L))
# find all particles (within this subregion) that are within the
# coalescing radius of this particle
wcl <- which({
zsepsq <- if(TwoD) 0 else ((z[p] - z)^2)*(cdh/cdv)^2
# 0*Inf = NaN, but would like 0 here (when cdv = 0 and equal z
# values are compared)
if(nan.flag && !TwoD) zsepsq[is.nan(zsepsq)] <- 0
(x[p] - x)^2 + (y[p] - y)^2 + zsepsq <= cdh^2
})
# mark the close-by particles as assigned
asd[wcl] <<- TRUE
# assign group labels - doesn't matter what the labels are as long
# as they are distinct only need to be distinct within subregion
# because by = c("group", "sr") is used for the grouping
grvec[wcl] <<- p
NULL
})
# return value
grvec
}, by = if(subregions) sr]
}
if(print.timings) cl.time <- Sys.time()
# aggregate with data table by group label
# - don't bother if no coalescing actually requested
if(cdh > 0){
xyzm <- if(TwoD){
xyzm[, if(identical(.N, 1L)){
list(x = x, y = y, m = m)
}else{
xy <- lapply(list(x = x, y = y), weighted.mean, m)
c(xy, list(m = sum(m)))
}, by = c("group", if(subregions) "sr")]
}else{
xyzm[, if(identical(.N, 1L)){
list(x = x, y = y, z = z, m = m)
}else{
xyz <- lapply(list(x = x, y = y, z = z), weighted.mean, m)
c(xyz, list(m = sum(m)))
}, by = c("group", if(subregions) "sr")]
}
}
if(print.timings) gr.time <- Sys.time()
np <- nrow(xyzm)
# which particles are smaller than mm, or which are the smallest particles
# if there is a set maximum count
if(maxbymm && mm <= 0) mm <- min(xyzm$m)
# when maxnp != Inf or mm > 0 and subregions = TRUE, there is the
# possibility of subregions with one very small particle
# - in this case, an infinite loop occurs because there is no particle for
# these small particles to merge with
# - in order to avoid this, a variable called mnp.modifier is used that
# will modify the used value for maxnp
# - it is updated with each loop to be the number of isolated small
# particles which are the sole occupants of their subregions
# - the result is that maxnp is effectively reduced when finding which
# particles to flag as small, so that other subregions have reduced
# particle counts
# - mm > 0 is dealt with in a different way: groups with one small
# particle are given a special -1 group label which are then ignored
# - thus it is possible for some particles with m < mm to be retained
# - this will be reset with each loop
assign("mnp.modifier", 0L, fe)
smallf <- expression({
if(maxbymm && np > maxnp){
pflags <- logical(np)
msrt <- sort.list(xyzm$m, method = "quick", na.last = NA)
# the lightest np - maxnp particles are labelled as small
pflags[msrt[1:(np - (maxnp - mnp.modifier))]] <- TRUE; pflags
}else{
smtmp <- xyzm$m < mm
# no point trying to group a subregion with only one particle - ignore
smtmp[xyzm$group == -1L] <- FALSE
smtmp
}
})
# coalesce particles with mass less than mm with nearest particles until
# no particles have mass less than mm
# - this is a limited while loop, avoiding any chance of infinite loops
for(attempts in 1:20) if(mm > 0 && any(sm <- eval(smallf))){
if(identical(np, 1L)) break #infinite loop would be possible here
if(all(sm)){
# all particles are small - return one super-particle
xyzm <- xyzm[, c(list(weighted.mean(x, m), weighted.mean(y, m)),
if(!TwoD) list(weighted.mean(z, m)), list(sum(m)))]
break
}
if(subregions && is.finite(maxnp)) mnp.modifier <- 0L # reset
xyzm[, small := sm]; rm(sm)
# make groups to coalesce each small particle with one or more others
bys <- if(subregions) "sr"
xyzm[, group := {
if(all(small)){
# this step avoids infinite loops - see above
if(subregions && is.finite(maxnp) && .N == 1L){
assign("mnp.modifier", mnp.modifier + 1L, fe)
}
# all small: all to be coalesced together
-1L
}else{
grvec <- 1:.N
asd <- logical(.N)
grvec[small] <- vapply(which(small), function(p){
if(asd[p]) return(grvec[p])
# find closest particle (within subregion) to this small particle
# the closest particle's group label is assigned as this small
# particle's group label
zsepsq <- if(TwoD) 0 else ((z[p] - z)^2)*(cdh/cdv)^2
if(nan.flag && !TwoD) zsepsq[is.nan(zsepsq)] <- 0
dsq <- (x[p] - x)^2 + (y[p] - y)^2 + zsepsq
dsq[p] <- NA # don't want it to find itself; which.min ignores NAs
closest <- which.min(dsq)
asd[closest] <<- TRUE # mark as assigned to group already
grvec[closest]
}, integer(1L))
grvec
}
}, by = bys]
# coalesce groups due to smallness
bys <- if(subregions) "group,sr" else "group"
xyzm <- if(TwoD){
xyzm[, if(identical(.N, 1L)){
list(x = x, y = y, m = m)
}else{
xy <- lapply(list(x = x, y = y), weighted.mean, m)
c(xy, list(m = sum(m)))
}, by = bys]
}else{
xyzm[, if(identical(.N, 1L)){
list(x = x, y = y, z = z, m = m)
}else{
xyz <- lapply(list(x = x, y = y, z = z), weighted.mean, m)
c(xyz, list(m = sum(m)))
}, by = bys]
}
# small particles are now moved to new homes, so to speak, so can delete
# their original entries
# mass and centre of mass are conserved
np <- nrow(xyzm)
}else break
if(print.timings) mm.time <- Sys.time()
if(print.timings) print(diff(c(st.time,
subregions = sr.time,
close = cl.time,
coalescing = gr.time,
minmass = mm.time)))
suppressWarnings(xyzm[, c("group", "sr", "small") := NULL])
if(!is.null(cns)) setnames(xyzm, cns)
if(na.rep && any(nas)) xyzm <- rbind(xyzm, xyzmna)
return(xyzm)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.