R/neighbor_match_test.R

Defines functions compute_nmt cast_dist_list cast_dist_df nb_match_test.data.frame nb_match_test.default nb_match_test

#' @title Local Neighbor Match Test
#'
#' @description Implements the Local Neighbor Match Test as described in _Tobler's Law in a Multivariate World_ (Anselin and Li, 2020).
#'
#' @export
#'
#'@examples
#' acs %>%
#'   mutate(nb = st_knn(geometry, k = 10),
#'          nmt = nb_match_test(.cols = list(med_house_income, bach),
#'                              nb = nb,
#'                              k = 10))
#'
#' acs %>%
#'   mutate(nb = st_knn(geometry, k = 10)) %>%
#'   nb_match_test(.cols = c(med_house_income, bach),
#'                 nb = nb,
#'                 k = 10)
#'

nb_match_test <- function(x, ...) {
  UseMethod("nb_match_test")
}

# Personal thought, can we extend this concept a bit more to use any form of neighbors matrix? For example contiguities? Then the calculation of p values are a bit different. Your contiguities might have different numbers. Say your k is set to 10 but card(nb) = 5.
# what is the probability of sampling 10 neighbors and of those 10 5 of them are same as your existing neighbors.

#' @param .cols A list of numeric vectors. Intended to be used inside of a `dplyr::mutate()` call. In the case of a data.frame, the columns to be used in calculating the distance matrix using tidy selection.
#' @method nb_match_test default
#' @describeIn nb_match_test Conduct a neighbor match test within a mutate call.
#' @export
nb_match_test.default <- function(.cols, nb, k = 10, method = "euclidean", scale = TRUE, p = 2) {

  d <- cast_dist_list(.cols = .cols, method = method, scale = scale, p = p)

  compute_nmt(d, k = k, nb = nb)

}

#' @param .data A data frame.
#' @param nb The unquoted name of your knn neighbor list.
#' @param k The number of neighbors to identify. Should be the same `k` used in `st_knn()`.
#' @param method The distance measure to be used by `stats::dist()`.
#' @param scale Whether or not to standardize columns prior to calculation. It is strongly recommended to do so.
#' @param p The power of the Minkowski distance.
#'
#' @method nb_match_test data.frame
#' @describeIn nb_match_test Conduct a neighbor match test with a data frame.
#' @export
nb_match_test.data.frame <- function(.data, .cols, nb, k = 10,
                                           method = "euclidean", scale = TRUE, p = 2) {

  d <- cast_dist_df(.data, .cols = {{.cols}}, method, scale, p)
  nbs <- dplyr::pull(.data, {{ nb }})
  compute_nmt(d, k = k, nb = nbs)

}


# Casting to distance matrix ----------------------------------------------
# Help functions
# Two separate functions for casting to distance matrix
cast_dist_df <- function(.data, .cols, method, scale, p) {

  if (any(class(.data) == "sf")) .data <- tibble::as_tibble(.data)

  df <- dplyr::select(.data, {{ .cols }})

  if (scale) df <- dplyr::mutate(df, dplyr::across(.cols = everything(), .fns = base::scale))

  dist(df, method = method, p = p)

}

cast_dist_list <- function(.cols, method, scale, p) {
  m <- Reduce(cbind, .cols)
  if (scale) m <- scale(m)

  dist(m, method = method, p = p)

}

# Calculate Neighbor Match Test -------------------------------------------
# one function to do the computation that will be shared among s3 methods for nmt
compute_nmt <- function(d, k, nb) {

  kd <- dbscan::kNN(d, k = k)

  # find neighbor ids
  knn_attr <- dbscan::adjacencylist(kd)

  # find matches
  matches <- purrr::map2(knn_attr, nb, base::intersect)

  N <- attr(d, "Size") - 1
  v <- lengths(matches)
  k <- 10

  p_vals <- choose(k, v) * choose(N - k, k - v) / choose(N, k)


  tibble::tibble(
    attr_nb = knn_attr,
    nb_matches = matches,
    n_shared = lengths(matches),
    p_value = p_vals
  )
}


# neighbor match test of varied sample sizes

# 𝐶(𝑘,𝑣)𝐶(𝑛−𝑘,𝑤−𝑣)/ 𝐶(𝑛,𝑤)
# thanks henry!

# https://math.stackexchange.com/questions/4305145/probability-of-same-observations-in-two# -independent-samples-of-different-size
#
# geo_nb <- acs %>%
#   mutate(nb = st_contiguity(geometry)) %>%
#   pull(nb)
#
# geo_knn <- acs %>%
#   mutate(nb = st_knn(geometry, 10)) %>%
#   pull(nb)
#
# attr_knn <- acs %>%
#   as_tibble() %>%
#   select(bach, med_house_income, by_pub_trans) %>%
#   as.matrix() %>%
#   scale() %>%
#   dbscan::kNN(k = 10)
#
# knn_nb <- dbscan::adjacencylist(attr_knn)
#
# matches_kw <- purrr::map2(knn_nb, geo_nb, base::intersect)
# matches_k <- purrr::map2(knn_nb, geo_knn, base::intersect)
#
# v <- lengths(matches)
# w <- lengths(geo_nb)
# k <- 10
# N <- length(knn_nb) - 1
#
# w > k
#
# p_kw<- choose(k, v) * choose(N - k, w - v) / choose(N, w)
# p_k <- choose(k, v) * choose(N - k, k - v) / choose(N, k)
# p_vals <= 0.05
#
JosiahParry/sfweight documentation built on April 4, 2022, 1:52 a.m.