R/dissimilarity.R

#' Append list columns of Gower's distances and sampling weights to a data frame
#'
#' Runs [cluster::daisy()] on a data frame, breaks up the columns of the
#' resulting dissimilarity matrix into a list, and adds this list to the data
#' frame as a list column. In addition (or instead), it adds a transformed
#' version of the dissimilarity list, which can be used as sampling weights.
#'
#' All columns are fed to [cluster::daisy()] by default, but the user can select
#' which ones using the `cols` argument.
#'
#' Once the full dissimilarity matrix is obtained, the columns are separated
#' into a list via [asplit()] and appended to `data`. Each element of the list
#' is therefore a [double] vector with [`nrow`]`(data)` values. For any given
#' row, its dissimilarity vector represents that row's dissimilarity to every
#' row of `data`, including itself (which is always 0).
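#'
#' For illustration, this is roughly how that structure comes about (a sketch
#' of the splitting step using `mtcars`, not code that the function exports):
#'
#' ```r
#' m <- as.matrix(cluster::daisy(mtcars, metric = "gower"))
#' dissims <- lapply(asplit(unname(m), MARGIN = 2L), as.numeric)
#' length(dissims)       # one element per row of mtcars
#' length(dissims[[1]])  # each element has nrow(mtcars) values
#' ```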
#'
#' The optional/alternative "sampling weight" column is a transformed version
#' of the dissimilarity list, built in three steps (a small worked example
#' follows):
#' 1. All dissimilarity measures of 0 are replaced with the next smallest
#'    dissimilarity value in the vector. In effect, this means that a row's
#'    dissimilarity to itself (and to any rows identical to it) is replaced
#'    with the dissimilarity value of its next most similar row. (Exception:
#'    if all elements are 0, all of them are replaced with 1.)
#' 2. The reciprocal of each element is taken so that larger values represent
#'    greater similarity.
#' 3. Each element is divided by the sum of the vector, which standardizes the
#'    elements to sum to 1.
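#'
#' For example, applying those three steps by hand to one (purely illustrative)
#' dissimilarity vector:
#'
#' ```r
#' d <- c(0, 0.2, 0.5, 1)      # one row's dissimilarities to all rows
#' d[d == 0] <- min(d[d > 0])  # step 1: replace zeros with next smallest value
#' w <- 1 / d                  # step 2: reciprocals, so larger = more similar
#' w / sum(w)                  # step 3: standardize to sum to 1
#' # roughly 0.385, 0.385, 0.154, 0.077
#' ```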
#' 
#' Requires the package `cluster` to be installed.
#'
#' @param data A data frame that has at least one row and at least one column.
#' @param cols <[`tidy-select`][dplyr::dplyr_tidy_select]> Columns of `data` on
#'   which to calculate dissimilarities.
#' @param dissimilarity_measure_name,sampling_weight_name The names of the list
#'   columns that will be added to `data`. They cannot match the names of
#'   existing columns. Set one of them to `NULL` if you don't want that column
#'   added (see the final example), but they can't both be `NULL`.
#' @param metric,... Passed to [cluster::daisy()]. Use `...` at your own risk.
#'
#' @examples
#' # Running this on all mtcars columns
#' mtdissim <- append_dissimilarities(mtcars)
#' 
#' # These numbers represent the dissimilarity of each row to the
#' # fifth row:
#' mtdissim$dissimilarities[[5]]
#' 
#' # And these are the dissimilarities' corresponding sampling weights:
#' mtdissim$sampling_weights[[5]]
#' 
#' # Now we run it on mtcars without the wt and qsec columns so that we purposely
#' # end up with some duplicate rows (the first and second).
#' mtdissim_dup <- append_dissimilarities(mtcars, cols = !c(wt, qsec))
#'
#' # These represent each row's dissimilarity to the first row.
#' # Since we specifically told it not to take wt and qsec into account, the
#' # first two rows are identical. Therefore, both values are zero.
#' mtdissim_dup$dissimilarities[[1]]
#' 
#' # Here are the corresponding sampling weights. Notice that the first two
#' # rows' sampling weights are the same as the sampling weight of row 30, which
#' # is the next most similar row.
#' mtdissim_dup$sampling_weights[[1]]
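#' 
#' # If you only want one of the two list columns, set the other name to NULL:
#' mtdissim_wts_only <-
#'   append_dissimilarities(mtcars, dissimilarity_measure_name = NULL)
#' names(mtdissim_wts_only)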
#' @return A data frame, specifically the `data` argument with one or two more
#'   columns added to the end.
#' @export
append_dissimilarities <- function(data,
                                   cols = dplyr::everything(),
                                   dissimilarity_measure_name =
                                     "dissimilarities",
                                   sampling_weight_name =
                                     "sampling_weights",
                                   metric = "gower",
                                   ...) {
  check_for_packages("cluster")
  if (!is.data.frame(data) || !nrow(data)) {
    stop("data must be a data frame with at least one row", call. = FALSE)
  }
  
  data2 <- dplyr::select(data, {{cols}})
  
  if (!ncol(data2)) {
    stop("cols must specify at least one column in data", call. = FALSE)
  }
  
  if (anyNA(data2)) {
    warning("missing values detected among these cols:\n", 
            paste0(
              names(data2)[vapply(data2, anyNA, logical(1L))],
              collapse = "\n"
            ), call. = FALSE)
  }
  
  validate_dissim_colnames(
    dissimilarity_measure_name,
    sampling_weight_name,
    data
  )
  
  if (anyDuplicated(data2)) {
    warning(
      "\nDuplicate rows detected. The dissimilarity measures and sampling",
      "\nweights of identical rows will be identical.\nRows with duplicates:\n",
      paste0(
        which(duplicated(data2) | duplicated(data2, fromLast = TRUE)),
        collapse = ", "
      ),
      call. = FALSE
    )
  }
  
  dissim_measure <-
    cluster::daisy(x = data2, metric = metric, ...) %>% 
    as.matrix() %>% 
    unname() %>% 
    asplit(MARGIN = 2L) %>% 
    lapply(as.numeric)
  
  if (!is.null(dissimilarity_measure_name)) {
    data[[dissimilarity_measure_name]] <- dissim_measure
  }
  
  if (!is.null(sampling_weight_name)) {
    data[[sampling_weight_name]] <- lapply(dissim_measure, dissim_samp_wts)
  }
  
  data
}


# Converts one row's dissimilarity vector into sampling weights: zeros are
# replaced with the smallest nonzero dissimilarity (or with 1 if the vector is
# all zeros), reciprocals are taken so that larger values indicate greater
# similarity, and the result is standardized to sum to 1.
dissim_samp_wts <- function(x) {
  zeros <- x == 0
  if (all(zeros)) {
    # All rows are identical, so every row gets the same weight
    x[] <- 1
  } else {
    # Replace zeros with the next smallest dissimilarity, then take reciprocals
    x[zeros] <- min(x[!zeros])
    x <- 1 / x
  }
  
  # Standardize the weights to sum to 1
  x / sum(x)
}
