Nothing
#' Spatiotemporal grid sampling of observation data
#'
#' Sample observation data on a spacetime grid to reduce spatiotemporal bias.
#'
#' @param x data frame; observations to sample, including at least the columns
#' defining the location in space and time. Additional columns can be included
#' such as features that will later be used in model training.
#' @param coords character; names of the spatial and temporal coordinates. By
#' default the spatial spatial coordinates should be `longitude` and
#' `latitude`, and temporal coordinate should be `day_of_year`.
#' @param is_lonlat logical; if the points are in unprojected, lon-lat
#' coordinates. In this case, the points will be projected to an equal area
#' sinusoidal CRS prior to grid assignment.
#' @param res numeric; resolution of the spatiotemporal grid in the x, y, and
#' time dimensions. Unprojected locations are projected to an equal area
#' coordinate system prior to sampling, and resolution should therefore be
#' provided in units of meters. The temporal resolution should be in the
#' native units of the time coordinate in the input data frame, typically it
#' will be a number of days.
#' @param jitter_grid logical; whether to jitter the location of the origin of
#' the grid to introduce some randomness.
#' @param sample_size_per_cell integer; number of observations to sample from
#' each grid cell.
#' @param cell_sample_prop proportion `(0-1]`; if less than 1, only this
#' proportion of cells will be randomly selected for sampling.
#' @param keep_cell_id logical; whether to retain a unique cell identifier,
#' stored in column named `.cell_id`.
#' @param grid_definition list defining the spatiotemporal sampling grid as
#' returned by [assign_to_grid()] in the form of an attribute of the returned
#' data frame.
#'
#' @details
#' [grid_sample_stratified()] performs stratified case control sampling,
#' independently sampling from strata defined by, for example, year and
#' detection/non-detection. Within each stratum, [grid_sample()] is used to
#' sample the observations on a spatiotemporal grid. In addition, if case
#' control sampling is turned on, the detections are oversampled to increase the
#' frequecy of detections in the dataset.
#'
#' The sampling grid is defined, and assignment of locations to cells occurs, in
#' [assign_to_grid()]. Consult the help for that function for further details on
#' how the grid is generated and locations are assigned. Note that by providing
#' 2-element vectors to both `coords` and `res` the time component of the grid
#' can be ignored and spatial-only subsampling is performed.
#'
#' @return A data frame of the spatiotemporally sampled data.
#' @export
#'
#' @examples
#' set.seed(1)
#'
#' # generate some example observations
#' n_obs <- 10000
#' checklists <- data.frame(longitude = rnorm(n_obs, sd = 0.1),
#' latitude = rnorm(n_obs, sd = 0.1),
#' day_of_year = sample.int(28, n_obs, replace = TRUE),
#' year = NA_integer_,
#' obs = rpois(n_obs, lambda = 0.1),
#' forest_cover = runif(n_obs),
#' island = as.integer(runif(n_obs) > 0.95))
#' # add a year column, giving more data to recent years
#' checklists$year <- sample(seq(2016, 2020), size = n_obs, replace = TRUE,
#' prob = seq(0.3, 0.7, length.out = 5))
#' # create several rare islands
#' checklists$island[sample.int(nrow(checklists), 9)] <- 2:10
#'
#' # basic spatiotemporal grid sampling
#' sampled <- grid_sample(checklists)
#'
#' # plot original data and grid sampled data
#' par(mar = c(0, 0, 0, 0))
#' plot(checklists[, c("longitude", "latitude")],
#' pch = 19, cex = 0.3, col = "#00000033",
#' axes = FALSE)
#' points(sampled[, c("longitude", "latitude")],
#' pch = 19, cex = 0.3, col = "red")
#'
#' # case control sampling stratified by year and island
#' # return a maximum of 1000 checklists
#' sampled_cc <- grid_sample_stratified(checklists, sample_by = "island",
#' maximum_ss = 1000)
#'
#' # case control sampling increases the prevalence of detections
#' mean(checklists$obs > 0)
#' mean(sampled$obs > 0)
#' mean(sampled_cc$obs > 0)
#'
#' # stratifying by island ensures all levels are retained, even rare ones
#' table(checklists$island)
#' # normal grid sampling loses rare island levels
#' table(sampled$island)
#' # stratified grid sampling retain at least one observation from each level
#' table(sampled_cc$island)
grid_sample <- function(x,
coords = c("longitude", "latitude", "day_of_year"),
is_lonlat = TRUE,
res = c(3000, 3000, 7), jitter_grid = TRUE,
sample_size_per_cell = 1,
cell_sample_prop = 0.75,
keep_cell_id = FALSE,
grid_definition = NULL) {
# input checks
stopifnot(is.data.frame(x))
stopifnot(is.character(coords), length(coords) == 3, all(!is.na(coords)),
all(coords %in% names(x)))
stopifnot(is_flag(is_lonlat))
stopifnot(is.numeric(res), length(res) %in% c(2, 3), all(res > 0))
stopifnot(is_flag(jitter_grid))
stopifnot(is_count(sample_size_per_cell), sample_size_per_cell > 0)
stopifnot(is.numeric(cell_sample_prop), length(cell_sample_prop) == 1,
cell_sample_prop > 0, cell_sample_prop <=1)
stopifnot(is_flag(keep_cell_id))
# handle edge case of no observations
if (nrow(x) == 0) {
return(x)
}
# assign the observations to a grid
if (!is.null(grid_definition)) {
cells <- assign_to_grid(points = x,
grid_definition = grid_definition)
} else {
cells <- assign_to_grid(points = x, res = res,
coords = coords,
jitter_grid = jitter_grid,
is_lonlat = is_lonlat,
grid_definition = NULL)
}
# was this a spacetime grid or just a spatial grid
if ("cell_xyt" %in% names(cells)) {
cells <- cells[, "cell_xyt", drop = FALSE]
} else if ("cell_xy" %in% names(cells)) {
cells <- cells[, "cell_xy", drop = FALSE]
} else {
stop("Invalid cells generated by assign_to_grid()")
}
names(cells) <- "cell_id"
cells[["row_number"]] <- seq_len(nrow(cells))
if (any(is.na(cells[["cell_id"]]))) {
stop("Sampling grid did not capture all points.")
}
# only select a subset of cell, keep at least 1
if (cell_sample_prop < 1) {
ids <- unique(cells[["cell_id"]])
n_cells <- max(round(cell_sample_prop * length(ids)), 1)
keed_ids <- safe_sample(ids, size = n_cells, replace = FALSE)
cells <- cells[cells[["cell_id"]] %in% keed_ids, ]
rm(ids, keed_ids)
}
# sample from each cell
sampled <- tapply(cells[["row_number"]],
INDEX = cells[["cell_id"]],
FUN = safe_sample, size = sample_size_per_cell,
simplify = FALSE)
sampled <- unique(do.call(c, sampled))
# extract just the sampled rows of the original dataset
x_sampled <- dplyr::tibble(x[sampled, ])
if (keep_cell_id) {
lookup <- match(sampled, cells[["row_number"]])
x_sampled[[".cell_id"]] <- cells[["cell_id"]][lookup]
}
return(x_sampled)
}
#' @param unified_grid logical; whether a single, unified spatiotemporal
#' sampling grid should be defined and used for all observations in `x` or a
#' different grid should be used for each stratum.
#' @param by_year logical; whether the sampling should be done by year, i.e.
#' sampling N observations per grid cell per year, rather than across years,
#' i.e. N observations per grid cell regardless of year. If using sampling by
#' year, the input data frame `x` must have a `year` column.
#' @param case_control logical; whether to apply case control sampling whereby
#' presence and absence are sampled independently.
#' @param obs_column character; if `case_control = TRUE`, this is the name of
#' the column in `x` that defines detection (`obs_column > 0`) and
#' non-detection (`obs_column == 0`).
#' @param sample_by character; additional columns in `x` to stratify sampling
#' by. For example, if a landscape has many small islands (defined by an
#' `island` variable) and we wish to sample from each independently, use
#' `sample_by = "island"`.
#' @param min_detection_probability proportion `[0-1)`; the minimum detection
#' probability in the final dataset. If `case_control = TRUE`, and the
#' proportion of detections in the grid sampled dataset is below this level,
#' then additional detections will be added via grid sampling the detections
#' from the input dataset until at least this proportion of detections appears
#' in the final dataset. This will typically result in duplication of some
#' observations in the final dataset. To turn this off this feature use
#' `min_detection_probability = 0`.
#' @param maximum_ss integer; the maximum sample size in the final dataset. If
#' the grid sampling yields more than this number of observations,
#' `maximum_ss` observations will be selected randomly from the full set. Note
#' that this subsampling will be performed in such a way that all levels of
#' each strata will have at least one observation within the final dataset,
#' and therefore it is not truly randomly sampling.
#' @param jitter_columns character; if detections are oversampled to achieve the
#' minimum detection probability, some observations will be duplicated, and it
#' can be desirable to slightly "jitter" the values of model training features
#' for these duplicated observations. This argument defines the column names
#' in `x` that will be jittered.
#' @param jitter_sd numeric; strength of the jittering in units of standard
#' deviations, see `jitter_columns`.
#' @param ... additional arguments defining the spatiotemporal grid; passed to
#' [grid_sample()].
#'
#' @rdname grid_sample
#' @export
grid_sample_stratified <- function(
x,
coords = c("longitude", "latitude", "day_of_year"),
is_lonlat = TRUE,
unified_grid = FALSE,
keep_cell_id = FALSE,
by_year = TRUE,
case_control = TRUE, obs_column = "obs",
sample_by = NULL,
min_detection_probability = 0,
maximum_ss = NULL,
jitter_columns = NULL,
jitter_sd = 0.1,
...) {
# input checks
stopifnot(is.data.frame(x))
stopifnot(is_flag(is_lonlat), is_flag(unified_grid), is_flag(keep_cell_id))
stopifnot(is_flag(case_control))
if (case_control) {
stopifnot(is.character(obs_column), length(obs_column) == 1,
obs_column %in% names(x))
}
if (is.null(sample_by)) {
sample_by <- character()
}
stopifnot(is.character(sample_by), all(sample_by %in% names(x)))
stopifnot(is_flag(by_year))
if (by_year) {
stopifnot("year" %in% names(x))
sample_by <- unique(c("year", sample_by))
}
if (!is.null(maximum_ss)) {
stopifnot(is_count(maximum_ss), maximum_ss > 0)
}
stopifnot(is.numeric(min_detection_probability),
length(min_detection_probability) == 1,
min_detection_probability >= 0,
min_detection_probability < 1)
if (keep_cell_id && !unified_grid) {
warning("Cell IDs can only be returned if unified_grid = TRUE since ",
"different grids are used for each stratum otherwise. Setting ",
"keep_cell_id = FALSE")
keep_cell_id <- FALSE
}
if (!is.null(jitter_columns)) {
stopifnot(is.character(jitter_columns),
all(jitter_columns %in% names(x)))
stopifnot(is.numeric(jitter_sd), length(jitter_sd) == 1, jitter_sd >= 0)
# ensure columns to jitter are numeric
col_types <- vapply(x, class, character(1))
col_types <- col_types[jitter_columns]
not_numeric <- names(col_types)[!col_types %in% c("numeric", "integer")]
if (length(not_numeric) > 0) {
stop("The following jitter columns must be of type numeric or integer: ",
paste(not_numeric, collapse = ", "))
}
}
# handle edge case of no observations
if (nrow(x) == 0) {
return(x)
}
# no strata defined
if (!by_year && !case_control &&
(is.null(sample_by) || length(sample_by) == 0)) {
return(grid_sample(x, keep_cell_id = keep_cell_id, ...))
}
# subset to just the location and strata columns
locs <- x[, coords, drop = FALSE]
# project once now to avoid having to do it for every stratum
if (is_lonlat) {
xy <- project_equal_area(locs, coords = coords[1:2])
# add time dimension
if (length(coords) == 3) {
xy[["t"]] <- locs[[coords[3]]]
}
locs <- xy
coords <- c("x", "y", "t")
is_lonlat <- FALSE
rm(xy)
}
locs[[".row"]] <- seq_len(nrow(locs))
# subset to just the location and strata columns
locs <- dplyr::bind_cols(locs, x[, sample_by, drop = FALSE])
# add presence/absence columns for case control sampling
# also stratify by this new column
if (case_control) {
locs[[".detected"]] <- x[[obs_column]] > 0
# turn off case control sampling if there is only one class
if (all(locs[[".detected"]])) {
message("No non-detections in dataset, setting case_control = FALSE")
case_control <- FALSE
} else if (all(!locs[[".detected"]])) {
message("No detections in dataset, setting case_control = FALSE")
case_control <- FALSE
} else {
sample_by <- c(sample_by, ".detected")
}
}
# split all locs into strata
locs_split <- split(locs, f = locs[, sample_by, drop = FALSE], drop = TRUE)
# define the grid once for all observations
grid_definition <- NULL
if (unified_grid) {
cells <- assign_to_grid(points = locs,
coords = coords,
is_lonlat = is_lonlat,
res = c(3000, 3000, 7),
jitter_grid = TRUE)
grid_definition <- attr(cells, "grid_definition")
rm(cells)
}
# sample from each stratum
sampled <- lapply(locs_split, FUN = grid_sample,
coords = coords, is_lonlat = is_lonlat,
keep_cell_id = keep_cell_id,
grid_definition = grid_definition,
...)
rm(locs_split)
sampled <- dplyr::bind_rows(sampled)
# subsample to decrease sample size to maximum
# TODO consider adding && nrow(sampled) > maximum_ss here
if (!is.null(maximum_ss)) {
sample_prop <- maximum_ss / nrow(sampled)
if (case_control) {
# case control sampling on: sample preserving detections probability
# calculate detection probability for the grid sampled data
det_prob <- mean(sampled[[".detected"]], na.rm = TRUE)
# bias to detections if detection probability is below the minimum
# otherwise preserve the detection probability
target_det_prob <- max(det_prob, min_detection_probability)
target_prop_det <- sample_prop * target_det_prob / det_prob
target_prop_non <- sample_prop * (1 - target_det_prob) / (1 - det_prob)
# sample detections
s <- split(sampled, f = sampled[[".detected"]], drop = TRUE)
sample_by_nodet <- setdiff(sample_by, ".detected")
if (target_prop_det < 1 && "TRUE" %in% names(s)) {
s[["TRUE"]] <- sample_stratify(s[["TRUE"]],
prop = target_prop_det,
sample_by = sample_by_nodet)
}
# sample non-detections
if (target_prop_non < 1 && "FALSE" %in% names(s)) {
s[["FALSE"]] <- sample_stratify(s[["FALSE"]],
prop = target_prop_non,
sample_by = sample_by_nodet)
}
sampled <- dplyr::bind_rows(s)
rm(s)
} else if (nrow(sampled) > maximum_ss) {
sampled <- sample_stratify(sampled,
prop = sample_prop,
sample_by = sample_by)
}
}
# if case control sampling is off, return here
if (!case_control) {
x_sampled <- x[sampled[[".row"]], ]
if (keep_cell_id) {
x_sampled[[".cell_id"]] <- sampled[[".cell_id"]]
}
return(x_sampled)
}
# oversample detections if detection probability is below minimum
det_prob <- mean(sampled[[".detected"]], na.rm = TRUE)
if (det_prob < min_detection_probability && det_prob > 0) {
# just the detections
locs_det <- locs[locs[[".detected"]], ]
# split detections by year, but ignore other strata at this stage
if (by_year) {
locs_split <- split(locs_det, f = locs_det[["year"]], drop = TRUE)
} else {
locs_split <- list(locs_det)
}
rm(locs_det)
# target probability
p <- min_detection_probability
n_non <- sum(!sampled[[".detected"]], na.rm = TRUE)
# repeatedly add additional grid sampled positives, up to 25 times
for (i in seq_len(25)) {
# number of detections to add to reach the target
n_det <- sum(sampled[[".detected"]], na.rm = TRUE)
n_det_add <- round(n_non * p / (1 - p) - n_det)
if (n_det_add <= 0) {
break
}
# grid sample to add new detections
add_dets <- lapply(locs_split, FUN = grid_sample,
coords = coords, is_lonlat = is_lonlat,
keep_cell_id = keep_cell_id,
grid_definition = grid_definition,
...)
add_dets <- dplyr::bind_rows(add_dets)
# only take as many as required to reach the target
if (nrow(add_dets) > n_det_add) {
add_dets <- dplyr::slice_sample(add_dets, n = n_det_add)
}
sampled <- dplyr::bind_rows(sampled, add_dets)
}
}
# if covariate jittering doesn't need to be performed, return here
has_duplicates <- anyDuplicated(sampled[[".row"]]) > 0
if (is.null(jitter_columns) || jitter_sd == 0 || !has_duplicates) {
x_sampled <- x[sampled[[".row"]], ]
if (keep_cell_id) {
x_sampled[[".cell_id"]] <- sampled[[".cell_id"]]
}
return(x_sampled)
}
# identify duplicate records
dupes <- duplicated(sampled[[".row"]])
x_dupes <- x[sampled[[".row"]][dupes], ]
x_sampled <- x[sampled[[".row"]][!dupes], ]
if (keep_cell_id) {
x_dupes[[".cell_id"]] <- sampled[[".cell_id"]][dupes]
x_sampled[[".cell_id"]] <- sampled[[".cell_id"]][!dupes]
}
# estimate standard deviation from full dataset
col_sd <- apply(x[, jitter_columns, drop = FALSE],
MARGIN = 2,
FUN = stats::sd, na.rm = TRUE)
# scale standard deviation
col_sd <- col_sd * jitter_sd
# apply jittering to each column
for (col in jitter_columns) {
jitter_amt <- col_sd[[col]] * stats::runif(nrow(x_dupes), min = -1, max = 1)
x_dupes[[col]] <- x_dupes[[col]] + jitter_amt
}
# combine and return jittered and unjittered data
return(dplyr::tibble(dplyr::bind_rows(x_sampled, x_dupes)))
}
#' Assign points to a spacetime grid
#'
#' Given a set of points in space and (optionally) time, define a regular grid
#' with given dimensions, and return the grid cell index for each point.
#'
#' @param points data frame; points with spatial coordinates `x` and `y`, and an
#' optional time coordinate `t`.
#' @param coords character; names of the spatial and temporal coordinates in the
#' input dataframe. Only provide these names if you want to overwrite the
#' default coordinate names: `c("x", "y", "t")` or `c("longitude", "latitude",
#' "t")` if `is_lonlat = TRUE`.
#' @param is_lonlat logical; if the points are in unprojected, lon-lat
#' coordinates. In this case, the input data frame should have columns
#' `"longitude"` and `"latitude"` and the points will be projected to an equal
#' area sinusoidal CRS prior to grid assignment.
#' @param res numeric; resolution of the grid in the `x`, `y`, and `t`
#' dimensions, respectively. If only 2 dimensions are provided, a space only
#' grid will be generated. The units of `res` are the same as the coordinates
#' in the input data unless `is_lonlat` is true in which case the `x` and `y`
#' resolution should be provided in meters.
#' @param jitter_grid logical; whether to jitter the location of the origin of
#' the grid to introduce some randomness.
#' @param grid_definition list; object defining the grid via the `origin` and
#' `resolution` components. To assign multiple sets of points to exactly the
#' same grid, [assign_to_grid()] returns a data frame with a `grid_definition`
#' attribute that can be passed to subsequent calls to [assign_to_grid()].
#' `res` and `jitter` are ignored if `grid_definition` is provided.
#'
#' @return Data frame with the indices of the space-only and spacetime grid
#' cells. This data frame will have a `grid_definition` attribute that can be
#' used to reconstruct the grid.
#' @export
#'
#' @examples
#' set.seed(1)
#'
#' # generate some example points
#' points_xyt <- data.frame(x = runif(100), y = runif(100), t = rnorm(100))
#' # assign to grid
#' cells <- assign_to_grid(points_xyt, res = c(0.1, 0.1, 0.5))
#'
#' # assign a second set of points to the same grid
#' assign_to_grid(points_xyt, grid_definition = attr(cells, "grid_definition"))
#'
#' # assign lon-lat points to a 10km space-only grid
#' points_ll <- data.frame(longitude = runif(100, min = -180, max = 180),
#' latitude = runif(100, min = -90, max = 90))
#' assign_to_grid(points_ll, res = c(10000, 10000), is_lonlat = TRUE)
#'
#' # overwrite default coordinate names, 5km by 1 week grid
#' points_names <- data.frame(lon = runif(100, min = -180, max = 180),
#' lat = runif(100, min = -90, max = 90),
#' day = sample.int(365, size = 100))
#' assign_to_grid(points_names,
#' res = c(5000, 5000, 7),
#' coords = c("lon", "lat", "day"),
#' is_lonlat = TRUE)
assign_to_grid <- function(points,
coords = NULL, is_lonlat = FALSE,
res, jitter_grid = TRUE,
grid_definition = NULL) {
if (!is.null(grid_definition)) {
stopifnot(all(c("res", "origin") %in% names(grid_definition)))
res <- grid_definition[["res"]]
} else {
stopifnot(is_flag(jitter_grid))
}
stopifnot(is.numeric(res), length(res) %in% c(2, 3), all(res > 0))
stopifnot(is_flag(is_lonlat))
if (!is.null(coords)) {
stopifnot(is.character(coords), length(coords) == length(res),
all(!is.na(coords)), all(coords %in% names(points)))
if (is_lonlat) {
default_names <- c("longitude", "latitude", "t")
} else {
default_names <- c("x", "y", "t")
}
default_names <- default_names[seq_len(length(coords))]
names(points)[match(coords, names(points))] <- default_names
}
# project to sinusoidal if unprojected
if (is_lonlat) {
stopifnot(all(c("longitude", "latitude") %in% names(points)))
coords <- terra::vect(points[, c("longitude", "latitude")],
geom = c("longitude", "latitude"),
crs = "epsg:4326")
coords <- terra::project(coords, y = "+proj=sinu")
coords <- as.data.frame(terra::crds(coords))
# add temporal columns
if ("t" %in% names(points)) {
coords[["t"]] <- points[["t"]]
}
points <- coords
rm(coords)
}
# 2d or 3d grid?
if (length(res) == 2) {
stopifnot(all(c("x", "y") %in% names(points)))
points <- points[, c("x", "y")]
time_dim <- FALSE
} else {
stopifnot(all(c("x", "y", "t") %in% names(points)))
points <- points[, c("x", "y", "t")]
time_dim <- TRUE
}
# define grid based on lower left corner of grid
if (is.null(grid_definition)) {
ll <- apply(points, 2, min, na.rm = TRUE)
# jitter
if (jitter_grid) {
jitter_amount <- stats::runif(length(res)) * res
ll <- ll - jitter_amount
}
} else {
ll <- grid_definition[["origin"]]
}
# assign indexes for space dimensions
x_cell <- 1 + (points[["x"]] - ll[1]) %/% res[1]
y_cell <- 1 + (points[["y"]] - ll[2]) %/% res[2]
if (is.null(grid_definition)) {
grid_dim <- c(x = max(x_cell), y = max(y_cell))
} else {
grid_dim <- grid_definition[["dim"]]
}
stopifnot(!is.na(x_cell), !is.na(y_cell))
cells <- dplyr::tibble(cell_xy = paste0(x_cell, "-", y_cell))
# assign spacetime index
if (time_dim) {
t_cell <- 1 + (points[["t"]] - ll[3]) %/% res[3]
if (is.null(grid_definition)) {
grid_dim <- c(grid_dim, t = max(t_cell))
}
stopifnot(!is.na(t_cell))
cells[["cell_xyt"]] <- paste0(cells[["cell_xy"]], "-", t_cell)
}
# check that none of the points were outside the grid
if (!is.null(grid_definition)) {
if (any(x_cell > grid_dim[["x"]]) || any(y_cell > grid_dim[["y"]])) {
stop("some points fall outside the provided spatial grid")
}
if (time_dim && any(t_cell > grid_dim[["t"]])) {
stop("some points fall outside the provided temporal grid")
}
}
# attach grid definition to return object
attr(cells, "grid_definition") <- list(res = res, dim = grid_dim, origin = ll)
return(cells)
}
# internal ----
# project to equal area
project_equal_area <- function(points, coords = c("longitude", "latitude")) {
stopifnot(all(coords %in% names(points)))
locs <- terra::vect(points[, coords], geom = coords, crs = "epsg:4326")
locs <- terra::project(locs, y = "+proj=sinu")
locs <- as.data.frame(terra::crds(locs))
return(stats::setNames(locs, c("x", "y")))
}
# safe version of sample() that works when length(x) == 1
safe_sample <- function(x, size, ...) {
if (length(x) <= size || length(x) == 1) {
return(x)
}
sample(x, size = size, ...)
}
sample_stratify <- function(x, prop, sample_by) {
if (length(sample_by) == 0) {
x <- list(x)
} else {
x <- split(x, f = x[, sample_by, drop = FALSE], drop = TRUE)
}
n <- vapply(x, FUN = nrow, FUN.VALUE = integer(1))
# ensure at least one row is retained from each stratum
size <- pmax(1, round(n * prop))
# sample by group
sampled <- mapply(FUN = dplyr::slice_sample, .data = x, n = size,
SIMPLIFY = FALSE)
dplyr::bind_rows(sampled)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.