R/geoThin.r

Defines functions geoThin

Documented in geoThin

#' Thin geographic points deterministically or randomly
#'
#' This function thins geographic points such that none have nearest neighbors closer than some user-specified distance. For a given set of points that fall within this distance, thinning can be conducted in two ways.  Both begin by first calculating all pairwise distances between points. Then, clusters of points are found based on proximity using the "single-linkage" method (i.e., based on minimum distance between groups). Then, either a deterministic or random method is used to select the retained points:
#' \itemize{
#'	\item Deterministic: For each cluster, distances between each point in the cluster and all points outside of the cluster are calculated. The point retained in each cluster is the one with the greatest minimum pairwise distance to any points in any other cluster. This point will this be maximally isolated from any other point.
#'  \item Random: For each cluster, a random point is chosen.
#' }
#'
#' @param x A "spatial points" object of class \code{SpatVector}, \code{sf}, \code{data.frame}, or \code{matrix}. If \code{x} is a \code{data.frame} or \code{matrix}, then the points will be assumed to have the WGS84 coordinate system (i.e., unprojected).
#' @param minDist Minimum distance (in meters) needed between points to retain them. Points falling closer than this distance will be candidates for being discarded.
#' @param random Logical: If \code{FALSE} (default), then use the deterministic method for thinning. If \code{TRUE}, then use the random method.
#' @param longLat Numeric or integer vector: This is ignored if \code{x} is a \code{SpatVector} or \code{sf} object. However, if \code{x} is a \code{data.frame} or \code{matrix}, then this should be a character or integer vector specifying the columns in \code{x} corresponding to longitude and latitude (in that order). For example, \code{c('long', 'lat')} or \code{c(1, 2)}. The default is to assume that the first two columns in \code{x} represent coordinates.
#' @param method Character: Method used by \code{\link[stats]{hclust}} to cluster points. By default, this is \code{'single'}, but in some cases this may result in strange clustering (especially when there is a large number of points). The \code{'complete'} method (or others) may give more reasonable results in these cases.
#' @param ... Additional arguments. Not used.
#'
#' @return Object of class \code{x}.
#'
#' @examples
#' 
#' library(sf)
#' 
#' # lemur occurrence data
#' data(mad0)
#' data(lemurs)
#' crs <- getCRS('WGS84')
#' occs <- lemurs[lemurs$species == 'Eulemur fulvus', ]
#' ll <- c('longitude', 'latitude')
#' occs <- st_as_sf(occs, coords = ll, crs = getCRS('WGS84'))
#' 
#' # deterministically thin
#' det <- geoThin(x = occs, minDist = 30000)
#' 
#' # randomly thin
#' set.seed(123)
#' rand <- geoThin(x = occs, minDist = 30000, random = TRUE)
#' 
#' # map
#' oldPar <- par(mfrow = c(1, 2))
#'
#' plot(st_geometry(occs), cex = 1.4, main = 'Deterministic')
#' plot(st_geometry(det), pch = 21, cex = 1.4, bg = 1:nrow(det), add = TRUE)
#' plot(st_geometry(mad0), add = TRUE)
#' 
#' plot(st_geometry(occs), cex = 1.4, main = 'Random')
#' plot(st_geometry(rand), pch = 21, cex = 1.4, bg = 1:nrow(rand), add = TRUE)
#' plot(st_geometry(mad0), add = TRUE)
#' 
#' par(oldPar)
#'
#' @export
geoThin <- function(
	x,
	minDist,
	random = FALSE,
	longLat = 1:2,
	method = 'single',
	...
) {

	if (!inherits(x, c('SpatVector', 'sf'))) {
		x <- sf::st_as_sf(x, coords = longLat, remove = FALSE, crs = getCRS('WGS84'))
		input <- 'data.frame'
	} else if (inherits(x, 'SpatVector')) {
		x <- sf::st_as_sf(x)
		input <- 'SpatVector'
	} else if (inherits(x, 'sf')) {
		x <- sf::st_cast(x, 'POINT')
		input <- 'sf'
	} else {
		x <- sf::st_as_sf(x)
		input <- 'unknown'
	}

	# cluster based on distances
	dists <- sf::st_distance(x)
	diag(dists) <- NA
	distsDists <- stats::as.dist(dists)
	clust <- stats::hclust(distsDists, method = method)

	# define groups
	groups <- stats::cutree(clust, h = minDist)
	uniqueGroups <- sort(unique(groups))

	if (length(uniqueGroups) == 1L) {
	
		warning(paste('All points are less than', minDist, 'm from one another.\nOnly the point farthest from its nearest neighbor\nwillbe returned. Try increasing "minDist".'))
		
		minDist <- apply(dists, 1, min, na.rm = TRUE)
		out <- x[which.max(minDist), ]
		
	# points in > 1 group
	} else {

		# within each group, find point that is least close to point in any other group
		if (exists('out', inherits = FALSE)) rm(out)
		for (i in seq_along(uniqueGroups)) {

			group <- uniqueGroups[i]
			inPoints <- x[groups == group, , drop=FALSE]

			if (random) {
			
				keep <- sample(1L:nrow(inPoints), 1)
				keep <- inPoints[keep, ]
			
			} else {

				outPoints <- x[groups != group, , drop=FALSE]

				distsToOtherPoints <- sf::st_distance(inPoints, outPoints)
				minDistToOtherPoints <- apply(distsToOtherPoints, 1, min)
				whichMaxDistToOtherPoints <- which.max(minDistToOtherPoints)
				
				keep <- inPoints[whichMaxDistToOtherPoints, ]
				
			}
				
			out <- if (exists('out', inherits = FALSE)) {
				rbind(out, keep)
			} else {
				keep
			}

		}
		
	}
		
	if (input == 'data.frame') {
		out <- as.data.frame(out)
		out$geometry <- NULL
	} else if (input == 'SpatVector') {
		out <- terra::vect(out)
	}
	
	out

}

Try the enmSdmX package in your browser

Any scripts or data that you put into this service are public.

enmSdmX documentation built on April 3, 2025, 7:50 p.m.