#' Nearest neighbour based edge lists
#'
#'
#' \code{edge_nn} returns edge lists defined by the nearest neighbour. The
#' function accepts a \code{data.table} with relocation data, individual
#' identifiers and a threshold argument. The threshold argument is used to
#' specify the criteria for distance between points which defines a group.
#' Relocation data should be in two columns representing the X and Y
#' coordinates.
#'
#'
#' The \code{DT} must be a \code{data.table}. If your data is a
#' \code{data.frame}, you can convert it by reference using
#' \code{\link[data.table:setDT]{data.table::setDT}}.
#'
#' The \code{id}, \code{coords}, \code{timegroup} (and optional \code{splitBy})
#' arguments expect the names of a column in \code{DT} which correspond to the
#' individual identifier, X and Y coordinates, timegroup (generated by
#' \code{group_times}) and additional grouping columns.
#'
#' The \code{threshold} must be provided in the units of the coordinates. The
#' \code{threshold} must be larger than 0. The coordinates must be planar
#' coordinates (e.g.: UTM). In the case of UTM, a \code{threshold} = 50 would
#' indicate a 50m distance threshold.
#'
#' The \code{timegroup} argument is required to define the temporal groups
#' within which edge nearest neighbours are calculated. The intended framework
#' is to group rows temporally with \code{\link{group_times}} then spatially
#' with \code{edge_nn}. If you have already calculated temporal groups without
#' \code{\link{group_times}}, you can pass this column to the \code{timegroup}
#' argument. Note that the expectation is that each individual will be observed
#' only once per timegroup. Caution that accidentally including huge numbers of
#' rows within timegroups can overload your machine since all pairwise distances
#' are calculated within each timegroup.
#'
#' The \code{splitBy} argument offers further control over grouping. If within
#' your \code{DT}, you have multiple populations, subgroups or other distinct
#' parts, you can provide the name of the column which identifies them to
#' \code{splitBy}. \code{edge_nn} will only consider rows within each
#' \code{splitBy} subgroup.
#'
#' @param threshold (optional) spatial distance threshold to set maximum
#' distance between an individual and their neighbour.
#' @param returnDist boolean indicating if the distance between individuals
#' should be returned. If FALSE (default), only ID, NN columns (and timegroup,
#' splitBy columns if provided) are returned. If TRUE, another column
#' "distance" is returned indicating the distance between ID and NN.
#' @inheritParams group_pts
#'
#' @return \code{edge_nn} returns a \code{data.table} with three columns:
#' timegroup, ID and NN. If 'returnDist' is TRUE, column 'distance' is
#' returned indicating the distance between ID and NN.
#'
#' The ID and NN columns represent the edges defined by the nearest neighbours
#' (and temporal thresholds with \code{group_times}).
#'
#' If an individual was alone in a timegroup or splitBy, or did not have any
#' neighbours within the threshold distance, they are assigned NA for nearest
#' neighbour.
#'
#' @export
#'
#' @family Edge-list generation
#'
#' @examples
#' # Load data.table
#' library(data.table)
#' \dontshow{data.table::setDTthreads(1)}
#'
#' # Read example data
#' DT <- fread(system.file("extdata", "DT.csv", package = "spatsoc"))
#'
#' # Select only individuals A, B, C for this example
#' DT <- DT[ID %in% c('A', 'B', 'C')]
#'
#' # Cast the character column to POSIXct
#' DT[, datetime := as.POSIXct(datetime, tz = 'UTC')]
#'
#' # Temporal grouping
#' group_times(DT, datetime = 'datetime', threshold = '20 minutes')
#'
#' # Edge list generation
#' edges <- edge_nn(DT, id = 'ID', coords = c('X', 'Y'),
#' timegroup = 'timegroup')
#'
#' # Edge list generation using maximum distance threshold
#' edges <- edge_nn(DT, id = 'ID', coords = c('X', 'Y'),
#' timegroup = 'timegroup', threshold = 100)
#'
#' # Edge list generation, returning distance between nearest neighbours
#' edge_nn(DT, id = 'ID', coords = c('X', 'Y'),
#' timegroup = 'timegroup', threshold = 100,
#' returnDist = TRUE)
#'
edge_nn <- function(DT = NULL,
id = NULL,
coords = NULL,
timegroup,
splitBy = NULL,
threshold = NULL,
returnDist = FALSE) {
# NSE
N <- NULL
if (is.null(DT)) {
stop('input DT required')
}
if (!is.null(threshold)) {
if (!is.numeric(threshold)) {
stop('threshold must be numeric')
}
if (threshold <= 0) {
stop('threshold must be greater than 0')
}
}
if (is.null(id)) {
stop('ID field required')
}
if (length(coords) != 2) {
stop('coords requires a vector of column names for coordinates X and Y')
}
if (missing(timegroup)) {
stop('timegroup required')
}
if (any(!(
c(timegroup, id, coords, splitBy) %in% colnames(DT)
))) {
stop(paste0(
as.character(paste(setdiff(
c(timegroup, id, coords, splitBy),
colnames(DT)
), collapse = ', ')),
' field(s) provided are not present in input DT'
))
}
if (any(!(DT[, vapply(.SD, is.numeric, TRUE), .SDcols = coords]))) {
stop('coords must be numeric')
}
if (!is.null(timegroup)) {
if (any(unlist(lapply(DT[, .SD, .SDcols = timegroup], class)) %in%
c('POSIXct', 'POSIXlt', 'Date', 'IDate', 'ITime', 'character'))) {
warning(
strwrap(
prefix = " ",
initial = "",
x = 'timegroup provided is a date/time
or character type, did you use group_times?'
)
)
}
}
if ('splitBy' %in% colnames(DT)) {
warning(
strwrap(x = 'a column named "splitBy" was found in your data.table,
renamed to "split_by" to avoid confusion with the argument
"splitBy"')
)
setnames(DT, 'splitBy', 'split_by')
}
if (is.null(timegroup) && is.null(splitBy)) {
splitBy <- NULL
} else {
splitBy <- c(splitBy, timegroup)
if (DT[, .N, by = c(id, splitBy, timegroup)][N > 1, sum(N)] != 0) {
warning(
strwrap(
prefix = " ",
initial = "",
x = 'found duplicate id in a
timegroup and/or splitBy -
does your group_times threshold match the fix rate?'
)
)
}
}
DT[, {
distMatrix <-
as.matrix(stats::dist(.SD[, 2:3], method = 'euclidean'))
diag(distMatrix) <- NA
if (is.null(threshold)) {
wm <- apply(distMatrix, MARGIN = 2, which.min)
} else {
distMatrix[distMatrix > threshold] <- NA
wm <- apply(distMatrix, MARGIN = 2,
function(x) ifelse(sum(!is.na(x)) > 0, which.min(x), NA))
}
if (returnDist) {
w <- wm + (length(wm) * (as.numeric(names(wm)) - 1))
l <- list(ID = .SD[[1]][as.numeric(names(wm))],
NN = .SD[[1]][wm],
distance = distMatrix[w])
} else {
l <- list(ID = .SD[[1]][as.numeric(names(wm))],
NN = .SD[[1]][wm])
}
l
},
by = splitBy, .SDcols = c(id, coords)]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.