spsurvey: Spatial Sampling Design and Analysis

Documented in grts

###############################################################################
# Function: grts (exported)
# Programmers: Tony Olsen, Tom Kincaid
# Date: January 22, 2021
#' Select a generalized random tessellation stratified (GRTS) sample
#'
#' Select a spatially balanced sample from a point (finite), linear / linestring (infinite),
#' or areal / polygon (infinite) sampling frame using the Generalized Random Tessellation
#' Stratified (GRTS) algorithm. The GRTS algorithm accommodates unstratified and
#' stratified sampling designs and allows for equal inclusion probabilities, unequal
#' inclusion probabilities according to a categorical variable, and inclusion
#' probabilities proportional to a positive auxiliary variable. Several additional
#' sampling options are included, such as including legacy (historical) sites,
#' requiring a minimum distance between sites, and selecting replacement sites.
#' For technical details, see Stevens and Olsen (2004).
#'
#'
#' @param sframe A sampling frame as an \code{sf} object. The coordinate
#'   system for \code{sframe} must projected (not geographic). If m or z values
#'   are in \code{sframe}'s geometry, they are silently dropped (i.e., only x-coordinates
#'   and y-coordinates are preserved).
#'
#' @param n_base The base sample size required. If the sampling design is unstratified,
#'   this is a single numeric value. If the sampling design is stratified, this is a named
#'   vector or list whose names represent each stratum and whose values represent each
#'   stratum's sample size. These names must match the values of the stratification
#'   variable represented by \code{stratum_var}. Legacy sites are considered part
#'   of the base sample, so the value for \code{n_base} should be equal to the number
#'   of legacy sites plus the number of desired non-legacy sites.
#'
#' @param stratum_var A character string containing the name of the column from
#'   \code{sframe} that identifies stratum membership for each element in \code{sframe}.
#'   If stratum equals \code{NULL}, the sampling design is unstratified and all elements in \code{sframe}
#'   are eligible to be selected in the sample. The default is \code{NULL}.
#'
#' @param seltype A character string or vector indicating the inclusion probability type,
#'   which must be one of following: \code{"equal"} for equal inclusion probabilities;
#'   \code{"unequal"} for unequal inclusion probabilities according to a categorical
#'   variable specified by \code{caty_var}; and \code{"proportional"} for inclusion
#'   probabilities proportional to a positive auxiliary variable specified by
#'   \code{aux_var}. If the sampling design is unstratified, \code{seltype} is a single
#'   character vector. If the sampling design is stratified, \code{seltype} is a named vector
#'   whose names represent each stratum and whose values represent each stratum's
#'   inclusion probability type. \code{seltype}'s default value tries to match the
#'   intended inclusion probability type: If \code{caty_var} and \code{aux_var} are
#'   not specified, \code{seltype} is \code{"equal"}; if \code{caty_var} is specified,
#'   \code{seltype} is \code{"unequal"}; and if \code{aux_var} is specified, \code{seltype}
#'   is \code{"proportional"}.
#'
#' @param caty_var A character string containing the name of the column from
#'   \code{sframe} that represents the unequal probability variable.
#'
#' @param caty_n A character vector indicating the expected sample size for each
#'   level of \code{caty_var}, the unequal probability variable. If the sampling design
#'   is unstratified, \code{caty_n} is a named vector whose names represent each
#'   level of \code{caty_var} and whose values represent each level's expected
#'   sample size. The sum of \code{caty_n} must equal \code{n_base}. If the sampling design
#'   is stratified and the expected sample sizes are the same among strata, \code{caty_n} is
#'   a named vector whose names represent represent each
#'   level of \code{caty_var} and whose values represent each level's expected
#'   sample size -- these expected sample sizes are applied to all strata. The sum of
#'   \code{caty_n} must equal each stratum's value in \code{n_base}.
#'   If the sampling design is stratified and the expected sample sizes differ among strata,
#'   \code{caty_n} is a list where each element  is named as a stratum in \code{n_base}.
#'   Each stratum's list element is a named vector whose
#'   names represent each level of \code{caty_var} and whose values represent each
#'   level's expected sample size (within the stratum). The sum of the values in each stratum's
#'   list element must equal that stratum's value in \code{n_base}.
#'
#' @param aux_var A character string containing the name of the column from
#'   \code{sframe} that represents the proportional (to size) inclusion probability
#'   variable (auxiliary variable). This auxiliary variable must be positive, and the resulting
#'   inclusion probabilities are proportional to the values of the auxiliary variable.
#'   Larger values of the auxiliary variable result in higher inclusion probabilities.
#'
#' @param legacy_sites An sf object with a \code{POINT} or \code{MULTIPOINT}
#'   geometry representing the legacy sites. spsurvey assumes that
#'   the legacy sites were selected from a previous sampling design that
#'   incorporated randomness into site selection and that the legacy sites
#'   are elements of the current sampling frame. If \code{sframe} has a
#'   \code{POINT} or \code{MULTIPOINT} geometry, the observations in \code{legacy_sites}
#'   should not also be in \code{sframe} (i.e., duplicates are not removed). Thus, \code{sframe}
#'   and \code{legacy_sites} together compose the current sampling frame. If m or z values
#'   are in \code{legacy_sites}' geometry, they are silently dropped (i.e., only x-coordinates
#'   and y-coordinates are preserved).
#'
#' @param legacy_stratum_var A character string containing the name of the column from
#'   \code{legacy_sites} that identifies stratum membership for each element of \code{legacy_sites}.
#'   This argument is required when the sampling design is stratified and its levels
#'   must be contained in the levels of the \code{stratum_var} variable. The default value of \code{legacy_stratum_var}
#'   is \code{stratum_var}, so \code{legacy_stratum_var} need only be specified explicitly when
#'   the name of the stratification variable in \code{legacy_sites} differs from \code{stratum_var}.
#'
#' @param legacy_caty_var A character string containing the name of the column from
#'   \code{legacy_sites} that identifies the unequal probability variable for each element of \code{legacy_sites}.
#'   This argument is required when the sampling design uses unequal selection probabilities and its categories
#'   must be contained in the levels of the \code{caty_var} variable. The default value of \code{legacy_caty_var}
#'   is \code{caty_var}, so \code{legacy_caty_var} need only be specified explicitly when
#'   the name of the unequal probability variable in \code{legacy_sites} differs from \code{caty_var}.
#'
#' @param legacy_aux_var A character string containing the name of the column from
#'   \code{legacy_sites} that identifies the proportional probability variable for each element of \code{legacy_sites}.
#'   This argument is required when the sampling design uses proportional selection probabilities and the values of the
#'   \code{legacy_aux_var} variable must be positive. The
#'   default value of \code{legacy_aux_var} is \code{aux_var}, so \code{legacy_aux_var} need only be specified explicitly
#'   when the name of the proportional probability variable in \code{legacy_sites} differs from \code{aux_var}.
#'
#' @param legacy_var This argument can be used instead of \code{legacy_sites}
#'   when \code{sframe} is a \code{POINT} or \code{MULTIPOINT}
#'   geometry (i.e. a finite sampling frame),
#'   When \code{legacy_var} is used, it is a character string containing the name of the column
#'   from \code{sframe} that represents whether each site is a legacy site. For
#'   legacy sites, the values of the \code{legacy_var} must contain character strings that
#'   act as a legacy site identifier. For non-legacy sites, the values of the
#'   \code{legacy_var} column must be \code{NA}. Using this approach,
#'   \code{legacy_stratum_var}, \code{legacy_caty_var}, and \code{legacy_aux_var}
#'   are not required and should not be used (because \code{legacy_var} represents a column
#'   in \code{sframe}). \code{spsurvey} assumes that the legacy sites were selected from
#'   a previous sampling design that incorporated randomness into site selection
#'   and that the legacy sites are elements of the current sampling frame.
#'
#' @param mindis A numeric value indicating the desired minimum distance between sampled
#'   sites. If the sampling design is stratified and \code{mindis} is an numeric value, the minimum
#'   distance is applied to all strata. If the sampling design is stratified and different minimum distances
#'   are desired among strata, then \code{mindis}
#'   is a list whose names match the names of \code{n_base} and whose and values
#'   are the minimum distance for the corresponding stratum.  If a minimum distance is not desired
#'   for a particular stratum, then the corresponding value in \code{mindis} should be \code{0} or
#'    \code{NULL} (which is equivalent to \code{0}).
#'   The units of \code{mindis} must represent the units in \code{sframe}. A warning is returned if the
#'   minimum distance could not be reached after \code{maxtry} attempts. If legacy sites are used, the minimum distance
#'   requirement (and subsequent warning if \code{maxtry} attempts are reached) is enforced for all base sites
#'   that are not legacy sites (i.e., the minimum distance is enforced for these sites
#'   by comparing distances against all base sites (legacy and non-legacy)).
#'
#' @param maxtry The number of maximum attempts to apply the minimum distance algorithm to obtain
#'   the desired minimum distance between sites. Each iteration takes roughly as long as the
#'   standard GRTS algorithm. Successive iterations will always contain at least as many
#'   sites satisfying the minimum distance requirement as the previous iteration. The algorithm stops
#'   when the minimum distance requirement is met or there are \code{maxtry} iterations.
#'   The default number of maximum iterations is \code{10}.
#'
#' @param n_over The number of reverse hierarchically ordered (rho) replacement sites.
#'   If the sampling design is unstratified, then
#'   \code{n_over} is an integer specifying the number of rho replacement sites desired.
#'  If the sampling design is stratified,
#'   then \code{n_over} is a vector (or list) whose names match the names of \code{n_base} and
#'   whose values indicate the number of rho replacement sites for each stratum.
#'   If replacement sites are not desired for a particular stratum, then the corresponding
#'   value in \code{n_over} should be \code{0} or \code{NULL} (which is equivalent to \code{0}).
#'   If the sampling design is stratified but the number of \code{n_over} sites is the same in each
#'   stratum, \code{n_over} can be a vector which is used for each stratum.
#'   If \code{n_over} is an unnamed, length-one vector, it's value is recycled
#'   and used for each stratum. Note that if the
#'   sampling design has unequal selection probabilities (\code{seltype = "unequal"}), then \code{n_over} sites
#'   are given the same proportion of \code{caty_n} values as \code{n_base}.
#'
#'
#' @param n_near The number of nearest neighbor (nn) replacement sites.
#'   If the sampling design is unstratified, \code{n_near} is integer from \code{1}
#'   to \code{10} specifying the number of
#'   nn replacement sites to be selected for each base site. If the sampling design
#'   is stratified but the same number of nn replacement sites is desired
#'   for each stratum,  \code{n_near} is integer from \code{1}
#'   to \code{10} specifying the number of
#'   nn replacement sites to be selected for each base site. If the sampling design is
#'   unstratified and a different number of nn replacement sites is
#'   desired for each stratum, \code{n_near} is a vector (or list) whose names represent strata and whose
#'   values is integer from \code{1}
#'   to \code{10} specifying the number of
#'   nn replacement sites to be selected for each base site in the stratum. If replacement sites
#'   are not desired for a particular stratum, then the corresponding value in \code{n_over}
#'   should be \code{0} or \code{NULL} (which is equivalent to \code{0}). For
#'   infinite sampling frames, the distance between a site and its nn
#'   depends on \code{pt_density}. The larger \code{pt_density}, the closer the nn neighbors.
#'
#' @param wgt_units The units used to compute the design weights. These
#'   units must be standard units as defined by the \code{set_units()} function in
#'   the units package. The default units match the units of the sf object.
#'
#' @param pt_density A positive integer controlling the density of the GRTS approximation
#'   for infinite sampling frames. The GRTS approximation for infinite sample
#'   frames vastly improves computational efficiency by generating many finite points and
#'   selecting a sample from the points. \code{pt_density} represents the density
#'   of finite points per unit to use in the approximation. More specifically,
#'   for each stratum, the number of points used in the approximation equals
#'   \code{pt_density * (n_base + n_over)}. A larger value of \code{pt_density}
#'   means a closer approximation to the infinite sampling frame but less
#'   computational efficiency. The default value of \code{pt_density} is \code{10}. Note that
#'   when used with \code{caty_n}, the unequal inclusion probabilities generated from
#'   this approach are also approximations.
#'
#' @param DesignID A character string indicating the naming structure for each
#'   site's identifier selected in the sample, which is matched with \code{SiteBegin} and
#'   included as a variable in the
#'   sf object in the function's output. Default is "Site".
#'
#' @param SiteBegin A character string indicating the first number to use to match
#'   with \code{DesignID} while creating each site's identifier selected in the sample.
#'   Successive sites are given successive integers. The default starting number
#'   is \code{1} and the number of digits is equal to number of digits in
#'   \code{nbase + nover}.
#'   For example, if \code{nbase} is 50 and \code{nover} is 0, then the default
#'   site identifiers are \code{Site-01} to \code{Site-50}
#'
#' @param sep A character string that acts as a separator between
#'  \code{DesignID} and \code{SiteBegin}. The default is \code{"-"}.
#'
#' @param projcrs_check A check for whether the coordinates are projected. If \code{TRUE},
#'    an error is returned if coordinates are not projected  (i.e., they are geographic or NA). If \code{FALSE}, the
#'    check is not performed, which means that the crs in \code{sframe} (and \code{legacy_sites} if provided) can be projected, geographic, or NA.
#'
#' @details \code{n_base} is the number of sites used to calculate
#'   the design weights, which is typically the number of sites used in an analysis. When a panel sampling design is implemented, \code{n_base} is typically the
#'   number of sites in all panels that will be sampled in the same temporal period --
#'   \code{n_base} is not the total number of sites in all panels. The sum of \code{n_base} and
#'   \code{n_over} is equal to the total number of sites to be visited for all panels plus
#'   any replacement sites that may be required.
#'
#' @return The sampling design sites and additional information about the sampling design. More specifically, it is, a list with five elements:
#'   \itemize{
#'     \item \code{sites_legacy} An sf object containing legacy sites. This is
#'       \code{NULL} if legacy sites were not included in the sample.
#'     \item \code{sites_base} An sf object containing the base sites. This is \code{NULL}
#'     if \code{n_base} equals the number of legacy sites.
#'     \item \code{sites_over} An sf object containing the reverse hierarchically
#'       ordered replacement sites. This is \code{NULL} if no reverse hierarchically
#'       ordered replacement sites were included in the sample.
#'     \item \code{sites_near} An sf object containing the nearest neighbor
#'       replacement sites. This is \code{NULL} if no nearest neighbor replacement
#'       sites were included in the sample.
#'     \item \code{design} A list documenting the specifications of this sampling design.
#'       This can be checked to verify your sampling design ran as intended.
#'       \itemize{
#'         \item \code{call} The original function call.
#'         \item \code{stratum_var} The name of the stratification variable in \code{sframe}.
#'           This equals \code{NULL} if no stratification is used.
#'         \item \code{stratum} The unique strata. This equals \code{"None"} if
#'           the sampling design is unstratified.
#'         \item \code{n_base} The base sample size per stratum.
#'         \item \code{seltype} The selection type per stratum.
#'         \item \code{caty_var} The name of the unequal probability variable in \code{sframe}.
#'           This equals \code{NULL} if no unequal probability variable is used.
#'         \item \code{caty_n} The expected sample sizes for each level of the
#'           unequal probability grouping variable per stratum. This equals
#'           \code{NULL} when \code{seltype} is not \code{"unequal"}.
#'         \item \code{aux_var} The name of the proportional probability (auxiliary) variable in \code{sframe}.
#'           This equals \code{NULL} if no proportional probability variable is used.
#'         \item \code{legacy} A logical variable indicating whether legacy sites
#'           were included in the sample.
#'         \item \code{legacy_stratum_var} The name of the stratification variable in \code{legacy_sites}.
#'           Omitted if legacy sites are not used. This equals \code{NULL} if legacy sites were used but
#'           no stratification variable is used.
#'         \item \code{legacy_caty_var} The name of the unequal probability variable in \code{legacy_sites}.
#'           Omitted if legacy sites are not used. This equals \code{NULL} if legacy sites were used but
#'           no unequal probability variable is used.
#'         \item \code{legacy_aux_var} The name of the proportional probability (auxiliary)
#'           variable in \code{legacy_sites}.
#'           Omitted if legacy sites are not used. This equals \code{NULL} if legacy sites
#'           were used but no proportional probability variable is used.
#'         \item \code{mindis} The minimum distance requirement desired. This
#'           is \code{NULL} when no minimum distance requirement was applied.
#'         \item \code{n_over} The reverse hierarchically ordered replacement
#'           site sample sizes per stratum. If \code{seltype} is \code{unequal},
#'           this represents the expected sample sizes. This is \code{NULL}
#'           when no reverse hierarchically ordered replacement sites were selected.
#'         \item \code{n_near} The number of nearest neighbor replacement sites
#'           desired. This is \code{NULL} when no nearest neighbor replacement
#'           sites were selected.
#'       }
#'   }
#'   When non-\code{NULL}, the \code{sites_legacy}, \code{sites_base},
#'   \code{sites_over}, and \code{sites_near} objects contain the original columns
#'   in \code{sframe} and include a few additional columns. These additional columns
#'   are
#'   \itemize{
#'     \item \code{siteID} A site identifier (as named using the \code{DesignID}
#'       and \code{SiteBegin} arguments to \code{grts()}).
#'     \item \code{siteuse} Whether the site is a legacy site (\code{Legacy}), base
#'       site (\code{Base}), reverse hierarchically ordered replacement site
#'       (\code{Over}), or nearest neighbor replacement site (\code{Near}).
#'     \item \code{replsite} The replacement site ordering. \code{replsite} is
#'       \code{None} if the site is not a replacement site, \code{Next} if it is
#'       the next reverse hierarchically ordered replacement site to use, or
#'       \code{Near_}, where the word following \code{_} indicates the ordering of sites closest to
#'       the originally sampled site.
#'     \item \code{lon_WGS84} Longitude coordinates using the WGS84 coordinate
#'       system (EPSG:4326). Only given if coordinates are projected.
#'     \item \code{lat_WGS84} Latitude coordinates using the WGS84 coordinate
#'       system (EPSG:4326). Only given if coordinates are projected.
#'     \item \code{X} Longitude coordinates using the provided coordinate
#'       system. Only given if coordinates are not projected (i.e., they are geographic or NA).
#'     \item \code{Y} Latitude coordinates using the provided coordinate
#'       system. Only given if coordinates are not projected (i.e., they are geographic or NA).
#'     \item \code{stratum} A stratum indicator. \code{stratum} is \code{None}
#'       if the sampling design was unstratified. If the sampling design was \code{stratified},
#'       \code{stratum} indicates the stratum.
#'     \item \code{wgt} The design weight.
#'     \item \code{ip} The site's original inclusion probability (the reciprocal)
#'       of (\code{wgt}).
#'     \item \code{caty} An unequal probability grouping indicator. \code{caty}
#'       is \code{None} if the sampling design did not use unequal inclusion probabilities.
#'       If the sampling design did use unequal inclusion probabilities, \code{caty}
#'       indicates the unequal probability level.
#'     \item \code{aux} The auxiliary proportional probability variable. This
#'       column is only returned if \code{seltype} was \code{proportional} in the
#'       original sampling design.
#'   }
#'   If any columns in \code{sframe} contain these names, those columns
#'   from \code{sframe} will be automatically prefixed with \code{sframe_}
#'   in the \code{sites} object. When output is printed, a summary of site counts by
#'   the levels in \code{stratum_var} and \code{caty_var} is shown.
#'
#' @author Tony Olsen \email{olsen.tony@@epa.gov}
#'
#' @seealso
#'   \describe{
#'     \item{\code{\link{irs}}}{ to select a sample that is not spatially balanced}
#'  }
#'
#' @references
#' Stevens Jr., Don L. and Olsen, Anthony R. (2004). Spatially balanced sampling
#' of natural resources. \emph{Journal of the American Statistical Association}, 99(465), 262-278.
#'
#' @examples
#' \dontrun{
#' samp <- grts(NE_Lakes, n_base = 100)
#' print(samp)
#' strata_n <- c(low = 25, high = 30)
#' samp_strat <- grts(NE_Lakes, n_base = strata_n, stratum_var = "ELEV_CAT")
#' print(samp_strat)
#' samp_over <- grts(NE_Lakes, n_base = 30, n_over = 5)
#' print(samp_over)
#' }
#' @export
################################################################################
grts <- function(sframe, n_base, stratum_var = NULL, seltype = NULL, caty_var = NULL,
                 caty_n = NULL, aux_var = NULL, legacy_var = NULL,
                 legacy_sites = NULL, legacy_stratum_var = NULL,
                 legacy_caty_var = NULL, legacy_aux_var = NULL, mindis = NULL,
                 maxtry = 10, n_over = NULL, n_near = NULL, wgt_units = NULL,
                 pt_density = NULL, DesignID = "Site", SiteBegin = 1, sep = "-", projcrs_check = TRUE) {

  if (inherits(sframe, c("tbl_df", "tbl"))) { # identify if tibble class elements are present
    class(sframe) <- setdiff(class(sframe), c("tbl_df", "tbl"))
    # remove tibble class for rownames warning
  }

  if (!is.null(legacy_sites) & inherits(legacy_sites, c("tbl_df", "tbl"))) { # identify if tibble class elements are present
    class(legacy_sites) <- setdiff(class(legacy_sites), c("tbl_df", "tbl"))
    # remove tibble class for rownames warning
  }

  geom_col_name <- attr(sframe, "sf_column")
  st_geometry(sframe) <- "geometry"
  if (!is.null(legacy_sites)) {
    st_geometry(legacy_sites) <- "geometry"
  }

  # save initial variable specifications for the design list later
  initial_stratum_var <- stratum_var
  initial_caty_var <- caty_var
  initial_aux_var <- aux_var

  # Create warning indicator and data frame to collect all potential issues during
  # sample selection
  warn_ind <- FALSE
  warn_df <- data.frame(stratum = "Stratum", func = "Calling Function", warn = "Message")

  # Ensure that the geometry types for sframe are consistent

  temp <- st_geometry_type(sframe)
  tst <- all(temp %in% c("POINT", "MULTIPOINT")) |
    all(temp %in% c("LINESTRING", "MULTILINESTRING")) |
    all(temp %in% c("POLYGON", "MULTIPOLYGON"))
  if (!tst) {
    stop(paste("\nThe geometry types for the survey frame object passed to function grts: \n\"",
      unique(st_geometry_type(sframe)), "\" are not consistent.",
      sep = ""
    ))
  }

  # Drop m and z values to ensure no issues with grts functionality with sf object
  if (!is.null(st_m_range(sframe)) | !is.null(st_z_range(sframe))) {
    # warn_ind <- TRUE
    # warn_df$warn <- "\nThe survey frame object passed to function grts contains m or z values - they are being dropped to ensure functionality in grts."
    sframe <- st_zm(sframe)
  }

  # Determine type of sampling frame: point, line, polygon
  if (all(temp %in% c("POINT", "MULTIPOINT"))) sf_type <- "sf_point"
  if (all(temp %in% c("LINESTRING", "MULTILINESTRING"))) sf_type <- "sf_linear"
  if (all(temp %in% c("POLYGON", "MULTIPOLYGON"))) sf_type <- "sf_area"

  if (all(is.null(legacy_sites), is.null(legacy_var))) {
    legacy_option <- FALSE
  } else {
    legacy_option <- TRUE
  }

  if (is.null(stratum_var)) {
    stratum <- NULL
  } else {
    stratum <- names(n_base)
  }

  # set default seltype if not provided (based on specification of other variables)
  if (is.null(seltype)) {
    if (is.null(caty_var) & is.null(aux_var)) {
      seltype <- "equal"
    } else if (!is.null(caty_var)) {
      seltype <- "unequal"
    } else {
      seltype <- "proportional"
    }
  }


  # check input. If errors, dsgn_check will stop grtspts and report errors.
  dsgn_check(
    sframe = sframe, sf_type = sf_type, legacy_sites = legacy_sites,
    legacy_option = legacy_option, stratum = stratum, seltype = seltype,
    n_base = n_base, caty_n = caty_n, n_over = n_over, n_near = n_near,
    stratum_var = stratum_var, caty_var = caty_var, aux_var = aux_var,
    legacy_stratum_var = legacy_stratum_var, legacy_caty_var = legacy_caty_var,
    legacy_aux_var = legacy_aux_var,
    legacy_var = legacy_var, mindis = mindis, DesignID = DesignID,
    SiteBegin = SiteBegin, maxtry = maxtry, projcrs_check = projcrs_check
  )

  # preserve original sframe names
  sframe_names <- names(sframe)

  # preserve original legacy_sites names if needed
  if (!is.null(legacy_sites)) {
    legacy_sites_names <- names(legacy_sites)
  }

  ## Create variables in sampling frame if needed.
  # Create unique sampling frame ID values
  sframe$id <- 1:nrow(sframe)

  # Assign stratum variable or create it if design not stratified and variable not provided.
  if (is.null(stratum_var)) {
    stratum_var <- "stratum"
    sframe$stratum <- "None"
    stratum <- c("None") # names of all strata
  } else {
    # ensure class for stratum variable is character and assign to stratum
    sframe$stratum <- as.character(sframe[[stratum_var]])
  }

  # set caty, aux and legacy variables in sampling frame if needed
  if (!is.null(caty_var)) sframe$caty <- as.character(sframe[[caty_var]])
  if (!is.null(aux_var)) sframe$aux <- sframe[[aux_var]]
  if (!is.null(legacy_var)) sframe$legacy <- sframe[[legacy_var]]

  # set stratum, caty, aux and legacy variables in legacy_sites if needed
  # add idpts to legacy_sites
  if (legacy_option == TRUE & (sf_type != "sf_point" | ((sf_type == "sf_point") & !is.null(legacy_sites)))) {
    legacy_names <- names(legacy_sites)
    legacy_sites$idpts <- 1:nrow(legacy_sites)
    if (stratum[1] == "None") {
      legacy_sites$stratum <- "None"
    } else {
      if (is.null(legacy_stratum_var)) {
        legacy_stratum_var <- stratum_var
      }
      legacy_sites$stratum <- as.character(legacy_sites[[legacy_stratum_var]])
    }
    if (!is.null(caty_var)) {
      if (is.null(legacy_caty_var)) {
        legacy_caty_var <- caty_var
      }
      legacy_sites$caty <- as.character(legacy_sites[[legacy_caty_var]])
    }
    if (!is.null(aux_var)) {
      if (is.null(legacy_aux_var)) {
        legacy_aux_var <- aux_var
      }
      legacy_sites$aux <- legacy_sites[[legacy_aux_var]]
    }
    if (is.null(legacy_var)) {
      legacy_sites$legacy <- TRUE
      legacy_var <- "legacy"
    } else {
      legacy_sites$legacy <- legacy_sites[[legacy_var]]
    }
  }

  # save initial variable specifications for the design list later
  initial_legacy_stratum_var <- legacy_stratum_var
  initial_legacy_caty_var <- legacy_caty_var
  initial_legacy_aux_var <- legacy_aux_var

  ## Create a dsgn list object
  # variable assignments to dsgn list object
  dsgn <- list(
    stratum_var = stratum_var, caty_var = caty_var, aux_var = aux_var,
    legacy_option = legacy_option, legacy_var = legacy_var, stratum = stratum,
    wgt_units = wgt_units, seltype = NULL, n_base = NULL, caty_n = NULL,
    n_over = NULL, n_near = NULL, mindis = mindis
  )

  # seltype
  if (length(seltype) == length(stratum)) {
    dsgn$seltype <- seltype
    names(dsgn$seltype) <- stratum
  } else {
    tmp <- sapply(stratum, function(x, seltype) {
      x <- seltype
    }, seltype)
    names(tmp) <- stratum
    dsgn$seltype <- tmp
  }

  # n_base
  if (length(n_base) == length(stratum)) {
    dsgn$n_base <- n_base
    names(dsgn$n_base) <- stratum
  } else {
    tmp <- sapply(stratum, function(x, n_base) {
      x <- n_base
    }, n_base)
    names(tmp) <- stratum
    dsgn$n_base <- tmp
  }

  # caty_n
  if (is.list(caty_n)) {
    dsgn$caty_n <- caty_n
  } else {
    tmp <- lapply(stratum, function(x, caty_n) {
      x <- caty_n
    }, caty_n)
    names(tmp) <- stratum
    dsgn$caty_n <- tmp
  }

  # n_over
  if (!is.null(n_over)) {
    if (is.list(n_over)) {
      n_over <- lapply(n_over, function(x) if (all(x == 0)) NULL else x)
      dsgn$n_over <- n_over
    } else if (!is.null(names(n_over)) && all(sort(stratum) == sort(names(n_over)))) {
      tmp <- lapply(stratum, function(x, n_over) {
        x <- n_over[[x]]
      }, n_over)
      names(tmp) <- stratum
      dsgn$n_over <- tmp
    } else {
      tmp <- lapply(stratum, function(x, n_over) {
        x <- n_over
      }, n_over)
      names(tmp) <- stratum
      dsgn$n_over <- tmp
    }
  }

  # n_near
  if (!is.null(n_near)) {
    if (is.list(n_near)) {
      n_near <- lapply(n_near, function(x) if (all(x == 0)) NULL else x)
      dsgn$n_near <- n_near
    } else if (!is.null(names(n_near)) && all(sort(stratum) == sort(names(n_near)))) {
      tmp <- lapply(stratum, function(x, n_near) {
        x <- n_near[[x]]
      }, n_near)
      names(tmp) <- stratum
      dsgn$n_near <- tmp
    } else {
      tmp <- lapply(stratum, function(x, n_near) {
        x <- n_near
      }, n_near)
      names(tmp) <- stratum
      dsgn$n_near <- tmp
    }
  }

  # mindis
  if (!is.null(mindis)) {
    if (is.list(mindis)) {
      mindis <- lapply(mindis, function(x) if (all(x == 0)) NULL else x)
      dsgn$mindis <- mindis
    } else {
      tmp <- lapply(stratum, function(x, mindis) {
        x <- mindis
      }, mindis)
      names(tmp) <- stratum
      dsgn$mindis <- tmp
    }
  }

  # legacy_option
  if (legacy_option == TRUE) {
    tmp <- sapply(stratum, function(x, legacy_option) {
      x <- legacy_option
    }, legacy_option)
    names(tmp) <- stratum
    dsgn$legacy_option <- tmp
  }

  ## select sites for each stratum
  rslts <- lapply(dsgn$stratum, grts_stratum,
    dsgn = dsgn, sframe = sframe, sf_type = sf_type, wgt_units = wgt_units,
    pt_density = pt_density, legacy_option = legacy_option,
    legacy_sites = legacy_sites, maxtry = maxtry, warn_ind = warn_ind, warn_df = warn_df
  )
  names(rslts) <- stratum


  # combine across strata
  sites_legacy <- NULL
  sites_base <- NULL
  sites_over <- NULL
  sites_near <- NULL
  warn_ind <- FALSE
  warn_df <- NULL
  for (i in 1:length(rslts)) {
    sites_legacy <- rbind(sites_legacy, rslts[[i]]$sites_legacy)
    sites_base <- rbind(sites_base, rslts[[i]]$sites_base)
    sites_over <- rbind(sites_over, rslts[[i]]$sites_over)
    sites_near <- rbind(sites_near, rslts[[i]]$sites_near)
    if (rslts[[i]]$warn_ind) {
      warn_ind <- TRUE
      warn_df <- rbind(warn_df, rslts[[i]]$warn_df)
    }
  }

  # spelling change to avoid clash with analysis warnings
  if (!is.null(warn_df) && "warning" %in% names(warn_df)) {
    names(warn_df)[which(names(warn_df) == "warning")] <- "Warning"
  }


  # Create a siteID for all sites
  ntot <- NROW(sites_legacy) + NROW(sites_base) + NROW(sites_over) + NROW(sites_near)
  siteID <- gsub(" ", "0", paste0(DesignID, sep, format(SiteBegin - 1 + 1:ntot, sep = "")))
  nlast <- 0

  # Create siteID for legacy sites if present using DesignID and SiteBegin
  if (!is.null(sites_legacy)) {
    row.names(sites_legacy) <- 1:nrow(sites_legacy)
    sites_legacy$siteID <- siteID[1:nrow(sites_legacy)]
    nlast <- nrow(sites_legacy)
    # set siteuse and replsite
    sites_legacy$siteuse <- "Legacy"
    sites_legacy$replsite <- "None"
  }

  # Create siteID for base sites using DesignID and SiteBegin
  if (!is.null(sites_base)) {
    row.names(sites_base) <- 1:nrow(sites_base)
    sites_base$siteID <- siteID[(nlast + 1):(nlast + nrow(sites_base))]
    nlast <- nlast + nrow(sites_base)
    # set siteuse and replsite for base sites
    sites_base$siteuse <- "Base"
    sites_base$replsite <- "None"
  }

  # create siteID for n_over sites if any
  if (!is.null(n_over)) {
    row.names(sites_over) <- 1:nrow(sites_over)
    sites_over$siteID <- siteID[(nlast + 1):(nlast + nrow(sites_over))]
    nlast <- nlast + nrow(sites_over)
    # set siteuse and replsite for n_over sites
    sites_over$siteuse <- "Over"
    sites_over$replsite <- "Next"
  }

  # if n_near sample sites, assign base ids to the replacement sites. then add siteIDs
  if (!is.null(n_near)) {
    tst <- match(paste(sites_near$stratum, sites_near$replsite, sep = "_"),
      paste(sites_legacy$stratum, sites_legacy$idpts, sep = "_"),
      nomatch = 0
    )
    sites_near$replsite[tst > 0] <- sites_legacy$siteID[tst]

    tst <- match(paste(sites_near$stratum, sites_near$replsite, sep = "_"),
      paste(sites_base$stratum, sites_base$idpts, sep = "_"),
      nomatch = 0
    )
    sites_near$replsite[tst > 0] <- sites_base$siteID[tst]
    tst <- match(paste(sites_near$stratum, sites_near$replsite, sep = "_"),
      paste(sites_over$stratum, sites_over$idpts, sep = "_"),
      nomatch = 0
    )
    sites_near$replsite[tst > 0] <- sites_over$siteID[tst]

    # sort by id so that sites_near in same order as sites in sites_base and sites_over
    sites_near <- sites_near[order(sites_near$replsite, sites_near$siteuse), ]
    row.names(sites_near) <- 1:nrow(sites_near)
    # assign siteIDs
    sites_near$siteID <- siteID[(nlast + 1):(nlast + nrow(sites_near))]
  }

  # Add lat/lon in WGS84 or X/Y coordinates
  if ((!is.null(sites_base) && is.na(st_crs(sites_base))) | (!is.null(sites_base) && st_is_longlat(st_crs(sites_base))) | (!is.null(sites_legacy) && is.na(st_crs(sites_legacy))) | (!is.null(sites_legacy) && st_is_longlat(st_crs(sites_legacy)))) {
    if (!is.null(sites_legacy)) {
      sites_legacy$X <- st_coordinates(sites_legacy)[, "X"]
      sites_legacy$Y <- st_coordinates(sites_legacy)[, "Y"]
    }

    if (!is.null(sites_base)) {
      sites_base$X <- st_coordinates(sites_base)[, "X"]
      sites_base$Y <- st_coordinates(sites_base)[, "Y"]
    }

    if (!is.null(sites_over)) {
      sites_over$X <- st_coordinates(sites_over)[, "X"]
      sites_over$Y <- st_coordinates(sites_over)[, "Y"]
    }

    if (!is.null(sites_near)) {
      sites_near$X <- st_coordinates(sites_near)[, "X"]
      sites_near$Y <- st_coordinates(sites_near)[, "Y"]
    }

    # reorder sf object variables by first specifying design names excluding unique
    # feature ID id and idpts as they are internal
    dsgn_names <- c(
      "siteID", "siteuse", "replsite", "X", "Y",
      "stratum", "wgt", "ip", "caty", "aux"
    )
  } else {
    if (!is.null(sites_legacy)) {
      sites_legacy$lon_WGS84 <- st_coordinates(st_transform(sites_legacy, crs = 4326))[, "X"]
      sites_legacy$lat_WGS84 <- st_coordinates(st_transform(sites_legacy, crs = 4326))[, "Y"]
    }

    if (!is.null(sites_base)) {
      sites_base$lon_WGS84 <- st_coordinates(st_transform(sites_base, crs = 4326))[, "X"]
      sites_base$lat_WGS84 <- st_coordinates(st_transform(sites_base, crs = 4326))[, "Y"]
    }

    if (!is.null(sites_over)) {
      sites_over$lon_WGS84 <- st_coordinates(st_transform(sites_over, crs = 4326))[, "X"]
      sites_over$lat_WGS84 <- st_coordinates(st_transform(sites_over, crs = 4326))[, "Y"]
    }

    if (!is.null(sites_near)) {
      sites_near$lon_WGS84 <- st_coordinates(st_transform(sites_near, crs = 4326))[, "X"]
      sites_near$lat_WGS84 <- st_coordinates(st_transform(sites_near, crs = 4326))[, "Y"]
    }

    # reorder sf object variables by first specifying design names excluding unique
    # feature ID id and idpts as they are internal
    dsgn_names <- c(
      "siteID", "siteuse", "replsite", "lon_WGS84", "lat_WGS84",
      "stratum", "wgt", "ip", "caty", "aux"
    )
  }

  dsgn_names_extra <- c(dsgn_names, "xcoord", "ycoord", "idpts")

  if (geom_col_name != "geometry") {
    # sframe prefix if necessary
    if (geom_col_name %in% dsgn_names_extra) {
      geom_col_name <- paste("sframe", geom_col_name, sep = "_")
    }

    # restore original column names
    if (!is.null(sites_legacy)) {
      st_geometry(sites_legacy) <- geom_col_name
      legacy_sites_names[legacy_sites_names == "geometry"] <- geom_col_name
      st_geometry(legacy_sites) <- geom_col_name
    }

    if (!is.null(sites_base)) {
      sframe_names[sframe_names == "geometry"] <- geom_col_name
      st_geometry(sites_base) <- geom_col_name
    }

    if (!is.null(sites_over)) {
      st_geometry(sites_over) <- geom_col_name
    }

    if (!is.null(sites_near)) {
      st_geometry(sites_near) <- geom_col_name
    }
  }

  # sites_legacy
  if (!is.null(sites_legacy)) {
    if (sf_type != "sf_point") {
      add_names <- dsgn_names[dsgn_names %in% names(sites_legacy)]
      legacy_sites_names_good <- legacy_sites_names[!legacy_sites_names %in% dsgn_names_extra]
      if (all(legacy_sites_names %in% legacy_sites_names_good)) {
        sites_legacy <- subset(sites_legacy, select = c(add_names, legacy_sites_names))
      } else {
        legacy_sites_names_bad <- legacy_sites_names[legacy_sites_names %in% dsgn_names_extra]
        legacy_sites_temp <- legacy_sites[, legacy_sites_names_bad, drop = FALSE]
        temp_geometry_col <- which(names(legacy_sites_temp) == attr(legacy_sites_temp, "sf_column"))
        legacy_sites_geometry_col <- which(names(legacy_sites) == attr(legacy_sites, "sf_column"))
        names(legacy_sites_temp)[-temp_geometry_col] <- paste("legacy_sites", names(legacy_sites_temp)[-temp_geometry_col], sep = "_")
        sites_legacy <- st_join(sites_legacy, legacy_sites_temp, join = st_nearest_feature)
        sites_legacy <- subset(sites_legacy, select = c(add_names, legacy_sites_names_good[-legacy_sites_geometry_col], names(legacy_sites_temp)[-temp_geometry_col]))
        for (i in names(sites_legacy)) {
          if (i %in% c("legacy_sites_xcoord", "legacy_sites_ycoord", "legacy_sites_idpts")) {
            names(sites_legacy)[which(names(sites_legacy) == i)] <- substring(i, first = 14)
          }
        }
      }
    }
    if (sf_type == "sf_point") {
      add_names <- dsgn_names[dsgn_names %in% names(sites_legacy)]
      sframe_names_good <- sframe_names[!sframe_names %in% dsgn_names_extra]
      if (all(sframe_names %in% sframe_names_good)) {
        sites_legacy <- subset(sites_legacy, select = c(add_names, sframe_names))
      } else {
        sframe_names_bad <- sframe_names[sframe_names %in% dsgn_names_extra]
        sframe_temp <- sframe[, sframe_names_bad, drop = FALSE]
        temp_geometry_col <- which(names(sframe_temp) == attr(sframe_temp, "sf_column"))
        sframe_geometry_col <- which(names(sframe) == attr(sframe, "sf_column"))
        names(sframe_temp)[-temp_geometry_col] <- paste("sframe", names(sframe_temp)[-temp_geometry_col], sep = "_")
        sites_legacy <- st_join(sites_legacy, sframe_temp, join = st_nearest_feature)
        sites_legacy <- subset(sites_legacy,
          select = c(add_names, sframe_names_good[-sframe_geometry_col], names(sframe_temp)[-temp_geometry_col])
        )
        for (i in names(sites_legacy)) {
          if (i %in% c("sframe_xcoord", "sframe_ycoord", "sframe_idpts")) {
            names(sites_legacy)[which(names(sites_legacy) == i)] <- substring(i, first = 8)
          }
        }
      }
    }
  }

  # sites_base
  # check what design variables are present in sf objects and add if missing
  if (!is.null(sites_base)) {
    add_names <- dsgn_names[dsgn_names %in% names(sites_base)]
    sframe_names_good <- sframe_names[!sframe_names %in% dsgn_names_extra]
    if (all(sframe_names %in% sframe_names_good)) {
      sites_base <- subset(sites_base, select = c(add_names, sframe_names))
    } else {
      sframe_names_bad <- sframe_names[sframe_names %in% dsgn_names_extra]
      sframe_temp <- sframe[, sframe_names_bad, drop = FALSE]
      temp_geometry_col <- which(names(sframe_temp) == attr(sframe_temp, "sf_column"))
      sframe_geometry_col <- which(names(sframe) == attr(sframe, "sf_column"))
      names(sframe_temp)[-temp_geometry_col] <- paste("sframe", names(sframe_temp)[-temp_geometry_col], sep = "_")
      sites_base <- st_join(sites_base, sframe_temp, join = st_nearest_feature)
      sites_base <- subset(sites_base, select = c(add_names, sframe_names_good[-sframe_geometry_col], names(sframe_temp)[-temp_geometry_col]))
      for (i in names(sites_base)) {
        if (i %in% c("sframe_xcoord", "sframe_ycoord", "sframe_idpts")) {
          names(sites_base)[which(names(sites_base) == i)] <- substring(i, first = 8)
        }
      }
    }
  }

  # sites_over
  if (!is.null(sites_over)) {
    add_names <- dsgn_names[dsgn_names %in% names(sites_over)]
    sframe_names_good <- sframe_names[!sframe_names %in% dsgn_names_extra]
    if (all(sframe_names %in% sframe_names_good)) {
      sites_over <- subset(sites_over, select = c(add_names, sframe_names))
    } else {
      sframe_names_bad <- sframe_names[sframe_names %in% dsgn_names_extra]
      sframe_temp <- sframe[, sframe_names_bad, drop = FALSE]
      temp_geometry_col <- which(names(sframe_temp) == attr(sframe_temp, "sf_column"))
      sframe_geometry_col <- which(names(sframe) == attr(sframe, "sf_column"))
      names(sframe_temp)[-temp_geometry_col] <- paste("sframe", names(sframe_temp)[-temp_geometry_col], sep = "_")
      sites_over <- st_join(sites_over, sframe_temp, join = st_nearest_feature)
      sites_over <- subset(sites_over, select = c(add_names, sframe_names_good[-sframe_geometry_col], names(sframe_temp)[-temp_geometry_col]))
      for (i in names(sites_over)) {
        if (i %in% c("sframe_xcoord", "sframe_ycoord", "sframe_idpts")) {
          names(sites_over)[which(names(sites_over) == i)] <- substring(i, first = 8)
        }
      }
    }
  }

  # sites_near
  if (!is.null(sites_near)) {
    add_names <- dsgn_names[dsgn_names %in% names(sites_near)]
    sframe_names_good <- sframe_names[!sframe_names %in% dsgn_names_extra]
    if (all(sframe_names %in% sframe_names_good)) {
      sites_near <- subset(sites_near, select = c(add_names, sframe_names))
    } else {
      sframe_names_bad <- sframe_names[sframe_names %in% dsgn_names_extra]
      sframe_temp <- sframe[, sframe_names_bad, drop = FALSE]
      temp_geometry_col <- which(names(sframe_temp) == attr(sframe_temp, "sf_column"))
      sframe_geometry_col <- which(names(sframe) == attr(sframe, "sf_column"))
      names(sframe_temp)[-temp_geometry_col] <- paste("sframe", names(sframe_temp)[-temp_geometry_col], sep = "_")
      sites_near <- st_join(sites_near, sframe_temp, join = st_nearest_feature)
      sites_near <- subset(sites_near, select = c(add_names, sframe_names_good[-sframe_geometry_col], names(sframe_temp)[-temp_geometry_col]))
      for (i in names(sites_near)) {
        if (i %in% c("sframe_xcoord", "sframe_ycoord", "sframe_idpts")) {
          names(sites_near)[which(names(sites_near) == i)] <- substring(i, first = 8)
        }
      }
    }
  }

  # add function call to dsgn list
  # dsgn <- c(list(Call = match.call()), dsgn)
  dsgn <- list(
    call = match.call(), stratum_var = initial_stratum_var, stratum = dsgn$stratum, n_base = dsgn$n_base,
    seltype = dsgn$seltype, caty_var = initial_caty_var, caty_n = dsgn$caty_n, aux_var = initial_aux_var, legacy = dsgn$legacy_option,
    mindis = dsgn$mindis, n_over = dsgn$n_over, n_near = dsgn$n_near
  )

  if (any(dsgn$legacy)) {
    dsgn <- c(dsgn, list(legacy_stratum_var = initial_legacy_stratum_var, legacy_caty_var = initial_legacy_caty_var,
                         legacy_aux_var = initial_legacy_aux_var))
    dsgn <- dsgn[c("call", "stratum_var", "stratum", "n_base", "seltype", "caty_var",
                   "caty_n", "aux_var", "legacy", "legacy_stratum_var", "legacy_caty_var", "legacy_aux_var",
                   "mindis", "n_over", "n_near")]
  }

  # create output list
  sites <- list(
    sites_legacy = sites_legacy, sites_base = sites_base,
    sites_over = sites_over, sites_near = sites_near,
    design = dsgn
  )

  # As necessary, output a message indicating that warning messages were generated
  # during execution of the program

  if (warn_ind) {
    warn_df <<- warn_df
    if (nrow(warn_df) == 1) {
      message("During execution of the program, a warning message was generated. The warning \nmessage is stored in a data frame named 'warn_df'.  Enter the following command \nto view the warning message: warnprnt()\n")
    } else {
      message(paste("During execution of the program,", nrow(warn_df), "warning messages were generated.  The warning \nmessages are stored in a data frame named 'warn_df'.  Enter the following \ncommand to view the warning messages: warnprnt() \nTo view a subset of the warning messages (say, messages number 1, 3, and 5), \nenter the following command: warnprnt(m=c(1,3,5))\n"))
    }
  }

  # constructor for design class
  sites <- structure(sites, class = "sp_design")

  # return the survey design sf object
  invisible(sites)
}