coalesce: Coalesce a field of weighted points

Documented in coalesce test.points

#' Random field of points
#'
#' Test and explore the functionality of \code{\link{coalesce}} with a
#'  random field of weighted points generated by this function.
#'
#' @param n
#' integer [1]; number of points
#' @param TwoD
#' logical [1];
#' should the points be distributed in only two dimensions?
#'
#' @return
#' \link[data.table]{data.table} with columns:\cr
#' \code{..$x,y,z} (num): position (\code{z} only if \code{ThreeD})\cr
#' \code{..$m} (num): mass (or weight) of points
#'
#' @export
#'
#' @examples
#' test.points(5L)
#' test.points(150L, FALSE)
#' test.points(150L)
#' \dontrun{test.points(1e5)}
#'
test.points <- function(n = 100L, TwoD = FALSE){
  as.data.table(list(x = sample(0:50000, n, TRUE)/500,
                     y = sample(0:50000, n, TRUE)/500,
                     z = if(!TwoD) sample(0:50000, n, TRUE)/50000,
                     m = sample(0:100, n, TRUE)/100))
}

#' Coalesce
#'
#' Organise a field of weighted particles by grouping clusters.
#'
#' @param xyzm
#' \link[data.table]{data.table}, or object coercible to one;
#' 4- (or 3- if \code{TwoD}) columns, with rows representing individual
#'  particles and columns giving the x, y and z (if not \code{TwoD})
#'  co-ordinates and the mass (or weight) of the particles (as with
#'  return value from \code{\link{test.points}})
#' @param cdh,cdv
#' numeric [1];
#' horizontal and vertical search radii for finding nearby particles; the
#'  search region around each particle is a circle (\code{TwoD}) or an
#'  ellipsoid with circular horizontal section
#' @param mm
#' numeric [1];
#' minimum mass for resulting particles: particles below this mass are
#'  coalesced with the nearest particle even if outside the search radius
#' @param subregions
#' logical [1];
#' use subregions?  Highly recommended for vast performance increases: will
#'  sort particles into rectangular subregions before coalescing so that
#'  particle distances will only be calculated between particles in the same
#'  subregion, with the sacrifice that the occasional grouping will be
#'  missed for particle pairs straddling subregion boundaries.
#'  Automatically switches off for data sets that are too small to benefit.
#' @param nppsr
#' integer [1];
#' if using subregions, how many particles should there be in each
#'  subregion, at least; 256 highly recommended as generally the best
#'  compromise
#' @param TwoD
#' logical [1];
#' set to \code{TRUE} if data set is 2D (i.e. no z column)
#' @param na.rep
#' logical [1];
#' rows which contain NAs are removed prior to coalescing; set
#'  \code{na.rep = TRUE} if these rows should be replaced after coalescing
#' @param maxnp
#' integer [1] or \code{Inf};
#' maximum number of particles in the result; the lightest particles that
#'  exceed this limit are forced to coalesce with their nearest neighbours,
#'  irrespective of the distance
#' @param maxbymm
#' not used
#' @param print.timings
#' logical [1];
#' put \code{TRUE} if you would like to see a print out of the timings of
#'  the various sections of this function
#'
#' @return
#' a \link[data.table]{data.table} with columns:\cr
#' \code{..$x,y,z} (num): \code{z} only if not \code{TwoD}\cr
#' \code{..$m} (num): mass/ weight
#'
#' @import data.table
#' @export
#'
#' @examples
#' library(graphics)
#' library(data.table)
#'
#' tps <- test.points(100L, TRUE)
#'
#' # total mass and centre of mass
#' sum(tps$m)
#' tps[, lapply(list(x = x, y = y), weighted.mean, m)]
#'
#' # plot original points - size represents mass
#' plot(tps[, .(x, y)], cex = tps$m*2, pch = 16L)
#'
#' cps1 <- coalesce(tps, 5, TwoD = TRUE)
#' cps2 <- coalesce(tps, 10, TwoD = TRUE)
#'
#' # total mass and centre of mass should be unchanged
#' sum(cps1$m)
#' cps1[, lapply(list(x = x, y = y), weighted.mean, m)]
#' sum(cps2$m)
#' cps2[, lapply(list(x = x, y = y), weighted.mean, m)]
#'
#' points(cps1[, .(x, y)], cex = cps1$m*2, pch = 16L, col = "red")
#' points(cps2[, .(x, y)], cex = cps2$m*2, pch = 16L, col = "blue")
#' points(tps[, .(x, y)], cex = tps$m*2)
#'
#' legend("bottom", legend = c("original", "cdh = 5", "cdh = 10"),
#'        ncol = 3L, col = c("black", "red", "blue"), pch = 16L,
#'        bg = "white")
#'
coalesce <- function(xyzm, cdh, cdv = cdh, mm = 0,
                     subregions = TRUE, nppsr = 256L,
                     TwoD = FALSE, na.rep = FALSE,
                     maxnp = Inf, maxbymm = TRUE,
                     print.timings = FALSE){
  if(print.timings) st.time <- Sys.time()
  fe <- environment()

  # copy makes a deep copy, removing the by reference relationship to the
  #  xyzm data table, which would cause it to retain a link with xyzm and
  #  therefore be modified along with it
  cns <- copy(colnames(xyzm))
  xyzm <- as.data.table(xyzm)

  # Remove rows containing NA for the calculation. Optionally save them for
  #  reattachment at the end.
  nas <- Reduce(`|`, lapply(xyzm, is.na))
  if(na.rep && any(nas)) xyzmna <- xyzm[nas,]
  if(any(nas)) xyzm <- xyzm[!nas,]

  if(!is.finite(cdv)) cdv <- 0
  if(cdh == 0){
    # because otherwise get 0/0: doesn't change result in this case
    cdv <- 1
  }
  nan.flag <- cdv == 0

  setnames(xyzm, c("x", "y", if(!TwoD) "z", "m"))

  # remove zero-mass particles
  xyzm <- xyzm[xyzm$m != 0,]
  np <- nrow(xyzm)

  # number of subregions: the largest square number that will allow at
  #  least 256 (or nppsr) points per subregion
  if(subregions){
    sqrtnsr <- as.integer(sqrt(np)%/%sqrt(nppsr))
    nsr <- as.integer(sqrtnsr^2L)
    # if not enough particles to justify subregions (or effectively one
    #  subregion)
    if(nsr < 4L) subregions <- FALSE
  }

  # groups points into 16 subgroups to reduce the number of distance
  #  calculations that must be performed (no point if small number of
  #  particles, and should not be done if cd is large compared to bounding
  #  box)
  # - grouping is by x and y only
  if(subregions){
    # divide into roughly equally populated sub-y-sections
    setkey(xyzm, y)
    xyzm[, sry := as.integer(.I%/%ceiling((np + 1L)/sqrtnsr) + 1L)]

    # divide the sub-y-sections into roughly equally populated
    #  sub-x-sections
    setkey(xyzm, sry, x)
    xyzm[, srx := {
      as.integer((.I - .I[1L] + 1L)%/%ceiling((.N + 1L)/sqrtnsr) + 1L)
    }, by = sry]

    # give each subregion a single unique identifier
    xyzm[, sr := .GRP, by = c("sry", "srx")]
    xyzm[, c("srx", "sry") := NULL]
  }

  if(print.timings) sr.time <- Sys.time()

  if(cdh > 0){
    setkey(xyzm, m)

    # group to which assigned
    grlabs <- integer(np)

    xyzm[, group := {
      # initialise
      # - integer group labels
      grvec <- integer(.N)
      #
      # - logical flags changed to TRUE when points are assigned to groups,
      #    so that particles are not checked when they are already assigned
      #  -- saves time
      #  -- avoids long group chains
      asd <- logical(.N)

      # those particle numbers within particle p's subregion to which p is
      #  close, including self
      cl <- lapply(.N:1, function(p){
        # if this particle is already assigned to a group, then skip
        if(asd[p]) return(integer(0L))

        # find all particles (within this subregion) that are within the
        #  coalescing radius of this particle
        wcl <- which({
          zsepsq <- if(TwoD) 0 else ((z[p] - z)^2)*(cdh/cdv)^2

          # 0*Inf = NaN, but would like 0 here (when cdv = 0 and equal z
          #  values are compared)
          if(nan.flag && !TwoD) zsepsq[is.nan(zsepsq)] <- 0
          (x[p] - x)^2 + (y[p] - y)^2 + zsepsq <= cdh^2
        })

        # mark the close-by particles as assigned
        asd[wcl] <<- TRUE

        # assign group labels - doesn't matter what the labels are as long
        #  as they are distinct only need to be distinct within subregion
        #  because by = c("group", "sr") is used for the grouping
        grvec[wcl] <<- p

        NULL
      })

      # return value
      grvec
    }, by = if(subregions) sr]
  }

  if(print.timings) cl.time <- Sys.time()

  # aggregate with data table by group label
  # - don't bother if no coalescing actually requested
  if(cdh > 0){
    xyzm <- if(TwoD){
      xyzm[, if(identical(.N, 1L)){
        list(x = x, y = y, m = m)
      }else{
        xy <- lapply(list(x = x, y = y), weighted.mean, m)
        c(xy, list(m = sum(m)))
      }, by = c("group", if(subregions) "sr")]
    }else{
      xyzm[, if(identical(.N, 1L)){
        list(x = x, y = y, z = z, m = m)
      }else{
        xyz <- lapply(list(x = x, y = y, z = z), weighted.mean, m)
        c(xyz, list(m = sum(m)))
      }, by = c("group", if(subregions) "sr")]
    }
  }

  if(print.timings) gr.time <- Sys.time()

  np <- nrow(xyzm)

  # which particles are smaller than mm, or which are the smallest particles
  #  if there is a set maximum count
  if(maxbymm && mm <= 0) mm <- min(xyzm$m)

  # when maxnp != Inf or mm > 0 and subregions = TRUE, there is the
  #  possibility of subregions with one very small particle
  # - in this case, an infinite loop occurs because there is no particle for
  #    these small particles to merge with
  # - in order to avoid this, a variable called mnp.modifier is used that
  #    will modify the used value for maxnp
  # - it is updated with each loop to be the number of isolated small
  #    particles which are the sole occupants of their subregions
  # - the result is that maxnp is effectively reduced when finding which
  #    particles to flag as small, so that other subregions have reduced
  #    particle counts
  # - mm > 0 is dealt with in a different way: groups with one small
  #    particle are given a special -1 group label which are then ignored
  # - thus it is possible for some particles with m < mm to be retained
  # - this will be reset with each loop
  assign("mnp.modifier", 0L, fe)

  smallf <- expression({
    if(maxbymm && np > maxnp){
      pflags <- logical(np)
      msrt <- sort.list(xyzm$m, method = "quick", na.last = NA)

      # the lightest np - maxnp particles are labelled as small
      pflags[msrt[1:(np - (maxnp - mnp.modifier))]] <- TRUE; pflags
    }else{
      smtmp <- xyzm$m < mm

      # no point trying to group a subregion with only one particle - ignore
      smtmp[xyzm$group == -1L] <- FALSE
      smtmp
    }
  })

  # coalesce particles with mass less than mm with nearest particles until
  #  no particles have mass less than mm
  # - this is a limited while loop, avoiding any chance of infinite loops
  for(attempts in 1:20) if(mm > 0 && any(sm <- eval(smallf))){
    if(identical(np, 1L)) break #infinite loop would be possible here
    if(all(sm)){
      # all particles are small - return one super-particle
      xyzm <- xyzm[, c(list(weighted.mean(x, m), weighted.mean(y, m)),
                       if(!TwoD) list(weighted.mean(z, m)), list(sum(m)))]
      break
    }
    if(subregions && is.finite(maxnp)) mnp.modifier <- 0L # reset

    xyzm[, small := sm]; rm(sm)

    # make groups to coalesce each small particle with one or more others
    bys <- if(subregions) "sr"
    xyzm[, group := {
      if(all(small)){
        # this step avoids infinite loops - see above
        if(subregions && is.finite(maxnp) && .N == 1L){
          assign("mnp.modifier", mnp.modifier + 1L, fe)
        }

        # all small: all to be coalesced together
        -1L
      }else{
        grvec <- 1:.N
        asd <- logical(.N)

        grvec[small] <- vapply(which(small), function(p){
          if(asd[p]) return(grvec[p])

          # find closest particle (within subregion) to this small particle
          # the closest particle's group label is assigned as this small
          #  particle's group label
          zsepsq <- if(TwoD) 0 else ((z[p] - z)^2)*(cdh/cdv)^2
          if(nan.flag && !TwoD) zsepsq[is.nan(zsepsq)] <- 0
          dsq <- (x[p] - x)^2 + (y[p] - y)^2 + zsepsq
          dsq[p] <- NA # don't want it to find itself; which.min ignores NAs
          closest <- which.min(dsq)

          asd[closest] <<- TRUE # mark as assigned to group already
          grvec[closest]
        }, integer(1L))

        grvec
      }
    }, by = bys]

    # coalesce groups due to smallness
    bys <- if(subregions) "group,sr" else "group"
    xyzm <- if(TwoD){
      xyzm[, if(identical(.N, 1L)){
        list(x = x, y = y, m = m)
      }else{
        xy <- lapply(list(x = x, y = y), weighted.mean, m)
        c(xy, list(m = sum(m)))
      }, by = bys]
    }else{
      xyzm[, if(identical(.N, 1L)){
        list(x = x, y = y, z = z, m = m)
      }else{
        xyz <- lapply(list(x = x, y = y, z = z), weighted.mean, m)
        c(xyz, list(m = sum(m)))
      }, by = bys]
    }

    # small particles are now moved to new homes, so to speak, so can delete
    #  their original entries
    # mass and centre of mass are conserved
    np <- nrow(xyzm)
  }else break

  if(print.timings) mm.time <- Sys.time()

  if(print.timings) print(diff(c(st.time,
                                 subregions = sr.time,
                                 close = cl.time,
                                 coalescing = gr.time,
                                 minmass = mm.time)))

  suppressWarnings(xyzm[, c("group", "sr", "small") := NULL])
  if(!is.null(cns)) setnames(xyzm, cns)
  if(na.rep && any(nas)) xyzm <- rbind(xyzm, xyzmna)

  return(xyzm)
}