R/testpoints.R

Defines functions testpoints

Documented in testpoints

#' @title Generate a number of randomly placed points, as latitude/longitude values.
#'
#' @description
#' Generate a number of randomly placed points, as latitude/longitude values.
#' 
#' @details
#' This function returns n points at random location latitude and longitude values, 
#' weighted by area or by population count in block group, to be roughly representative of 
#' either a random location (like throwing a dart, equal weight per square mile of land area
#'  - does not include water area in weighting but 
#'  can return a point in water if bg has internal point in water?!)
#' or where a randomly selected US resident lives (population weighted).
#' Points are returned as latitude and longitude in decimal degrees. 
#' 
#' For people weighted, it picks from the Census blockgroup internal points, 
#'  which are much more representative of where people live than a random square mile would be.
#'  
#' @param n Numeric value, 1 by default. Specifies how many testpoints to return. 
#'   Must be an integer between zero and 50 million, and not NA.
#' @param weighting area or people or geo or degrees   text indicating what type of weighting, 
#'   by area or people or geo (each block or block group has equal chance).
#'   Default is area - a random square mile not a random residence or random Census geometry. 
#' @param useblocks logical, whether to use the several million Census blocks if TRUE 
#'   (2020 Census as of 11/2022)
#'   or just the approx 220k block groups (from github package ejscreen) if FALSE which is faster
#' @param ST optional vector of 2 letter state abbreviations, to limit it to some state(s). 
#'   Default is all USA (including PR I think).
#' @param minlat the minimum latitude in decimal degrees to use for generating 
#'   random points within some range.
#' @param maxlat the maximum latitude in decimal degrees to use for generating 
#'   random points within some range.
#' @param minlon the minimum longitude in decimal degrees to use for generating 
#'   random points within some range.
#' @param maxlon the maximum longitude in decimal degrees to use for generating 
#'   random points within some range.
#' @param as.df Logical, default is TRUE, in which case returns a data.frame, 
#'   otherwise a matrix.
#' 
#' @return By default, returns a data.frame (or matrix if as.df=FALSE) that has 2 columns: 
#'   lat and lon, in decimal degrees, with 1 row per point.
#' @seealso \code{\link{get.distances.all}} which allows you to get distances between all points,
#'   \code{\link{get.distances.prepaired}} for finding distances 
#'     when data are already formatted as pairs of points,
#'   \code{\link{get.nearest}} which finds the distance to the single nearest point 
#'   within a specified search radius instead of all topoints, and 
#'   \code{\link{proxistat}} which calculates a proximity score for each spatial unit 
#'    based on distances to nearby points.
#' @concept proximity
#' @examples
#' testpoints(19,minlat=47,maxlat=48)
#' get.distances(testpoints(100),testpoints[10],radius=999,return.rownums=TRUE,return.latlons=TRUE)
#' 
#' @export
testpoints <- function(n=1, weighting='area', ST, as.df=TRUE, 
                       minlat=17.9, maxlat=71.25, minlon=-175.89, maxlon=178.34, useblocks=FALSE) { 
  # box for default bounds is 
  # summary(bg22[,c('lat', 'lon')])
  # lat             lon         
  # Min.   :17.90   Min.   :-175.86  
  # Max.   :71.25   Max.   : 178.34  
  # NA's   :13      NA's   :13      
  
  is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)  abs(x - round(x)) < tol
  if ( NROW(n) > 1  || !is.numeric(n) || is.na(n) || !is.wholenumber(n) || is.infinite(n) || n < 0 || n > 5e7) {
    if (n == 1e6) {warning('a million is a lot yikes')}
    if (n > 1e6) {warning('more than a million is more than a lot FYI')}
    if (n > 1e7) {warning('that is ridiculous OMG')}
    if (n == 1) {warning('you just want one point? whatever.')}
    if (n == 0) {warning('you want zero points. srsly?')}
    stop('n must be a single whole number, where <0 is too hard to fathom and >50 million is too hard to keep track of')
  }
  if ( minlat < -90  || maxlat < -90 || minlat > 90  || maxlat > 90 )   {stop('minlat and maxlat must be from -90 through 90 decimal degrees')}
  if ( minlon < -180 || maxlon < -180 || minlon > 180 || maxlon > 180 ) {stop('minlon and maxlon must be from -180 through 180 decimal degrees')}
  if (!(weighting %in% c('area', 'people', 'geo', 'degrees'))) {stop('weighting must be a valid option. see help.')}
  
  colsneeded <- c('lat', 'lon')
  
  if (weighting == 'people') {
    colsneeded <- c(colsneeded, 'pop')
  }
  if (weighting == 'area') {
    colsneeded <- c(colsneeded, 'area')
  }
  if (weighting == 'degrees') {
    # ?
  }
  
  if (useblocks) {
    
    # use about 11 million Census block points - 2010 was EXTREMELY slow the way it was coded and stored
    # census 2020 in EJAMblockdata package has 8.174955 million rows.
    if (!missing(ST)) {
      colsneeded <- c(colsneeded, 'fips')
    }
     places <- proxistat::blockpoints_area_pop
    
    if (!missing(ST)) {
      # if ('ST' %in% colnames(places)) {
        places$ST <- get.state.info( get.fips.st(places$blockfips), fields = 'ST')[,'ST']
      # }
      # places$FIPS <- NULL
    }
    
  } else {
    # DO NOT USE BLOCKS - USE BLOCK GROUPS (UNLESS weighting is degrees)
    # use about 220,000 Census block GROUP points
    # summary(bg22[,c('lat', 'lon')])
    # lat             lon         
    # Min.   :17.90   Min.   :-175.86  
    # Max.   :71.25   Max.   : 178.34  
    # NA's   :13      NA's   :13      
    if (!missing(ST)) {
      colsneeded <- c(colsneeded, 'ST')
    }
    if (isTRUE(require('ejscreen'))) {
      # if you have bg22, use it
      places <- ejscreen::bg22[ , colsneeded]
      # places <- bg.pts # did not have population counts, only areas. 
    } else {
      # bg22 not available so warn and treat as if weighting is degrees, so use box min/max lat/lon
      warning('blockgroups dataset not found so picking from rectangle via min and max lat and lon')
      weighting <- 'degrees'
    }
  }
  
  if (missing(ST)) {
    # all states in US are selected from
  } else {
    # limit to specified states assuming that column is not available
    if ('ST' %in% colnames(places)) {
      places <- places[places$ST %in% ST, ]
    } else {
      warning('you specified states but there is no column called ST found')
    }
  }
  if (weighting == 'degrees') {
    rownum <- 1:n
    places <- data.frame(lat=runif(min = minlat, max = maxlat), lon=runif(min = minlon, max = maxlon))
  }
  
  # it is slower, but drop invalid lat lon BEFORE draw sample to ensure we get n not n minus invalid ones.
  # places <- places[!is.na(rowSums(places)), ]
  
  ########  latlon_is.valid()  FUNCTION IS IN EJAM AND RELATED PACKAGES BUT A COPY IS IN PROXISTAT 

  places <- places[ latlon_is.valid(lat = places$lat, lon = places$lon), ] # this validates lat lon values, not only for NA values.
  
  if (weighting == 'geo') {
    rownum <- sample(x=1:NROW(places), size = n, replace = FALSE) 
  }
  if (weighting == 'area') {
    rownum <- sample(x=1:NROW(places), size = n, replace = FALSE, prob = places$area)
  }
  if (weighting == 'pop') {
    rownum <- sample(x=1:NROW(places), size = n, replace = FALSE, prob = places$pop)
  }
  
  # MIGHT WANT TO RETURN FIPS ALSO? 
  
  x <- structure( c(places[rownum, 'lat'], places[rownum, 'lon']),
                  .Dim = c(n, 2L), .Dimnames = list( NULL, c("lat", "lon")))
  
  # x <- x[ which(!is.na(rowSums(df))), ] 
  
  if (as.df) {
    return(as.data.frame(x))
  } else {
    return(x)
  }
}
ejanalysis/proxistat documentation built on April 2, 2024, 10:13 a.m.