R/get.nearest.R

Defines functions get.nearest

Documented in get.nearest

#' @title Find distance from each point in a set to the nearest of a second set of points (by lat/lon).
#'
#' @description
#' \code{get.nearest} returns the distance from each point in a set to the nearest of a second set of points (by lat/lon).
#' 
#' @details
#' This function returns a vector of distances, 
#' which are the distances from one set of points to the nearest single member (if any) of another set of points.
#' Points are specified using latitude and longitude in decimal degrees.
#' Relies on the \pkg{sp} package for the \code{\link[sp]{spDistsN1}} and \code{\link[sp]{SpatialPoints}} functions. \cr \cr
#' 
#' A future version may use get.distances.all() but for performance only use it for distance pairs (pairs of points) that have been initially 
#' quickly filtered using lat/lon to be not too far, in an attempt to go much faster in an initial pass.
#' *** old get.nearest with loops takes 42 seconds vs 3 seconds for this version, for 100k frompoints and 100 topoints:  Sys.time(); x=get.nearest(t100k, t100); Sys.time()
#' > Sys.time(); x=get.nearest(testpoints(1e6), testpoints(100)); Sys.time() \cr
#' [1] 14:33:05 EDT \cr
#' [1] 14:33:33 EDT  <30 seconds from 1 mill to 100 points, as in finding nearest of 100 sites for 9% of the US Census blocks.
#' But R hung/crashed on 11mill frompoints -- Probably out of memory. *** Need to break it up into batches of maybe 1 to 100 million distances at a time? 
#' There are 11,078,297 blocks according to \url{http://www.census.gov/geo/maps-data/data/tallies/national_geo_tallies.html}
#' @param frompoints A matrix or data.frame with two cols, 'lat' and 'lon' (or only 2 cols that are lat and lon in that order) with datum=WGS84 assumed.
#' @param topoints A matrix or data.frame with two cols, 'lat' and 'lon' (or only 2 cols that are lat and lon in that order) with datum=WGS84 assumed.
#' @param radius Optional number, default is Inf. Distance within which search should be limited, or max distance that will be returned.
#' @param ignore0 A logical, default is FALSE, specifying whether to ignore distances that are zero and report only the minimum nonzero distance. 
#'   Useful if nearest point other than self, where frompoints=topoints, for example.
#' @param return.rownums Logical value, TRUE by default. If TRUE, value returned also includes these 2 columns:
#'   a col named fromrow of index numbers starting at 1 specifying the frompoint and a similar col named n specifying the row of the nearest topoint.
#' @param return.latlons Logical value, FALSE by default. If TRUE, value returned also includes four extra columns,
#'   showing fromlat, fromlon, tolat, tolon.
#' @param units A string that is 'miles' by default, or 'km' for kilometers, specifying units for distances returned.
#' @return By default, returns a vector of distances, but can return a matrix of numbers, with columns that can include fromrow and torow indexing 
#'   which is nearest (first if >1 match) of topoints, fromlat, fromlon, tolat, tolon, and d (distance).
#'   ** Returns Inf when no topoints are found within the radius, and also when a distance to nearest is zero but ignore0=TRUE. 
#'   Distance returned is in miles by default, but with option to set units='km' to get kilometers.
#'   See parameters for details on other formats that may be returned if specified.
#' @seealso \code{\link{get.distances}} which gets distances between all points (within an optional search radius),
#'   \code{\link{get.distances.all}} which allows you to get distances between all points,
#'   \code{\link{get.distances.prepaired}} for finding distances when data are already formatted as pairs of points, and
#'   \code{\link{proxistat}} which calculates a proximity score for each spatial unit based on distances to nearby points.
#' @concept proximity
#' @import sp
#' @examples
#' set.seed(999)
#' t1=testpoints(1)
#' t10=testpoints(10)
#' t100=testpoints(100)
#' t1k=testpoints(1e3)
#' t10k=testpoints(1e4)
#' t100k=testpoints(1e5)
#' t1m=testpoints(1e6)
#' #t10m=testpoints(1e7)
#' 
#' get.nearest(t1, t1)
#' get.nearest(t1, t10[2, ,drop=FALSE])
#' get.nearest(t10, t1k)
#' get.nearest(t10, t1k, radius=500, units='km')
#' get.nearest(t10, t1k, radius=10, units='km')
#' 
#' test.from <- structure(list(fromlat = c(38.9567309094, 38.9507043428), 
#'  fromlon = c(-77.0896572305, -77.0896199948)), .Names = c("lat", "lon"), 
#'  row.names = c("6054762", "6054764"), class = "data.frame")
#' test.to <- structure(list(tolat = c(38.9575019287, 38.9507043428, 38.9514152435), 
#'  tolon = c(-77.0892818598, -77.0896199948, -77.0972395245)), .Names = c("lat", "lon"),
#'  class = "data.frame", row.names = c("6054762", "6054763", "6054764"))
#' get.nearest(test.from, test.to)
#' get.nearest(testpoints(10), testpoints(30))
#' @export
get.nearest <- function(frompoints, topoints, units='miles', ignore0=FALSE, 
                         return.rownums=FALSE, return.latlons=FALSE, radius=Inf) {
  
  # handle cases where an input is only one row (one point)
  if (is.vector(frompoints)) {mycols <- names(frompoints); frompoints <- matrix(frompoints, nrow=1); dimnames(frompoints)[[2]] = mycols }
  if (is.vector(topoints)) {mycols <- names(topoints); topoints <- matrix(topoints, nrow=1); dimnames(topoints)[[2]] = mycols }
  
  colnames(frompoints) <- latlon.colnames.check(frompoints)
  colnames(topoints)   <- latlon.colnames.check(topoints)
  
  n.from <- length(frompoints[,1])
  n.to <- length(topoints[,1])
  #frompoints and topoints each must be a matrix with 'lat' and 'lon' colnames (or only 2 cols that are lat and lon in that order)
  
  # ***** Notes on performance/ speed: ******
  # It might be easy to speed this up (for very large numbers of topoints) by using a box to search within (as get.distances does) and only enlarge the search box if no topoints are found in it.
  # One could assume topoints are uniformly distributed in extent defined by range(c(frompoints$lat, topoints$lat)) and range(c(frompoints$lon, topoints$lon))
  # Then do initial search for nearest by using get.distances(frompoints, topoints, max.miles=x) 
  # where max.miles is chosen so there is a 95% chance that at least one topoint will be within max.miles ***
  # *** BUT doesn't that assume frompoint is in the center of all the topoints? and that box is close to square not long rectangle?
  # So full extent of A sqkm for N topoints means avg of N/A topoints per sqkm, or A/N sqkms have 1 topoint on avg, and 
  # Each of N topoints has box/A chance of being in box, and 
  # chance of at least one being in box is 1-(1-box/A)^N = 0.95, so box/A= 1 - (1-0.95)^(1/N)
  #
  #   searchRadiuskm <- function(frompoints, topoints, prob=0.95) {
  #     meanlat = mean(topoints$lat)
  #     latrangekm = meters.per.degree.lat(meanlat ) * abs(diff(range(c(frompoints$lat, topoints$lat))))  / 1000
  #     lonrangekm = meters.per.degree.lon(meanlat ) * abs(diff(range(c(frompoints$lon, topoints$lon))))  / 1000
  #     allPointsSqkm = latrangekm * lonrangekm 
  #     n.to=length(topoints$lat)
  #     searchboxarea = allPointsSqkm * (1-(1-prob)^(1/n.to)) 
  #     searchRadius1 = sqrt(searchboxarea) / 2
  #     return(searchRadius1)
  #   }
  #   searchRadius1 <- searchRadiuskm(frompoints, topoints, 0.95)
  #   searchRadiusLat <- searchRadius1 / ( meters.per.degree.lat(mean(topoints$lat))/1000 )
  #   searchRadiusLon <- searchRadius1 / ( meters.per.degree.lon(mean(topoints$lat))/1000 )
  #   # There is a high probability that within ± this search radius you will find
  #   #   the nearest (at least one of) the topoints.
  #   if (usebox) {
  #     # would want to limit search for a given frompoint, using box, but that is slower than just getting ALL distances in sp package.
  #   }
  
  frompoints <- sp::SpatialPoints(coords = data.frame(x = frompoints[,'lon'], y = frompoints[,'lat']), proj4string=sp::CRS("+proj=longlat +datum=WGS84"))
  topoints   <- sp::SpatialPoints(coords = data.frame(x = topoints[  ,'lon'], y = topoints[  ,'lat']), proj4string=sp::CRS("+proj=longlat +datum=WGS84"))
  
  distances <- sp::spDists(frompoints, topoints, longlat=TRUE)
  
  if (ignore0) {
    # This could be improved to consider machine precision to handle zero vs nearly zero values correctly.
    distances[distances==0] <- Inf
  }
  
  d = analyze.stuff::rowMins( distances, na.rm = TRUE)
  
  if (units=='miles') {
    miles.per.km <- convert( 1, 'km', 'miles')
    d <- d * miles.per.km
  }
  
  d[d > radius] <- Inf
  
  if (return.rownums || return.latlons) {
    #torow <- apply( distances, 1, which.min ) # this was slightly slower than line below approach
    torow <- max.col(distances)
  }
  
  if (return.rownums & return.latlons) { 
    nearest <- matrix(nrow=n.from, ncol=7)
    nearest[ , ] <- c(d, 1:n.from, torow, as.vector(frompoints@coords[ , c('y','x')]), as.vector(topoints@coords[ torow, c('y','x')]))
    colnames(nearest) <- c('d', 'fromrow', 'torow', 'fromlat', 'fromlon', 'tolat', 'tolon')
  }
  
  if (!return.rownums & return.latlons) { 
    nearest <- matrix(nrow=n.from, ncol=5)
    nearest[ , ] <- c(d, as.vector(frompoints@coords[ , c('y','x')]), as.vector(topoints@coords[ torow, c('y','x')]))
    colnames(nearest) <- c('d', 'fromlat', 'fromlon', 'tolat', 'tolon')
  }
  
  if (return.rownums & !return.latlons) { 
    nearest <- matrix(nrow=n.from, ncol=3)
    nearest[ , ] <- c(d, 1:n.from, torow)
    colnames(nearest) <- c('d', 'fromrow', 'torow')
  }
  
  if (!return.rownums & !return.latlons) { 
    nearest <- matrix(d, ncol = 1)
    colnames(nearest) <- 'd'
  }
  
  return(nearest)
}
ejanalysis/proxistat documentation built on April 2, 2024, 10:13 a.m.