#' Calculate a word's Euclidean distance from other words
#'
#' Caclulates the Euclidean distance of a word from all other words in a df, on selected variables.
#'
#' @param df A data frame.
#' @param target The target string (word) that euclidean distances are required for.
#' @param vars The variables to be used as dimensions which Euclidean distance should be calculated over. Can be a vector of variable names (e.g. `c(Zipf.SUBTLEX_UK, Length)`), or, `"all"`, to use all numeric variables in the data frame. The default is `"all"`.
#' @param scale,center How should variables be scaled and/or centred before calculating Euclidean distance? For options, see the `scale` and `center` arguments of \code{\link[base]{scale}}. Default for both is `TRUE`. Scaling can be useful when variables are in differently scaled.
#' @param weights An (optional) list of weights, in the same order as `vars`. After any scaling is applied, the values will be multiplied by these weights. Default is `NA`, meaning no weights are applied.
#' @param standardise_weights Logical; should the weights be standardised to average to 1 (i.e., sum to the length of `vars`)? If TRUE, `weights=c(1, 3, 6)` will be treated as `weights=c(0.3, 0.6, 1.8)`. Setting `standardise_weights=TRUE` ensures that the space itself is unchanged when weights change. This means, for example, that the same tolerance can be used in `control_for_euc()`.
#' @param id_col The column containing the strings (default = `"string"`).
#' @param standard_eval Logical; bypasses non-standard evaluation, and allows more standard R objects in `vars`. If `TRUE`, `vars` should be a character vector referring to columns in `df` (e.g. `c("Length", "Zipf.SUBTLEX_UK")`). Default = `FALSE`.
#'
#' @return Returns a vector of Euclidean distances, in the order of rows in `df`.
#' @examples
#'
#' # Get the distance of every entry in the `lexops` dataset from the word "thicket".
#' # (Note: This will be calculated using the dimensions of frequency, arousal, and size)
#' lexops |>
#' euc_dists("thicket", c(Zipf.SUBTLEX_UK, AROU.Warriner, SIZE.Glasgow_Norms))
#'
#' # no scaling or centering
#' lexops |>
#' euc_dists(
#' "thicket",
#' c(Zipf.SUBTLEX_UK, AROU.Warriner, SIZE.Glasgow_Norms),
#' scale = FALSE,
#' center = FALSE
#' )
#'
#' # Add Euclidean distance as new column
#' # (Also sort ascendingly by distance; barbara will have a distance of 0 so will be first)
#' lexops %>%
#' dplyr::mutate(ed = euc_dists(., "barbara", c(Length, Zipf.SUBTLEX_UK, BG.SUBTLEX_UK))) |>
#' dplyr::arrange(ed)
#'
#' # bypass non-standard evaluation
#' lexops |>
#' euc_dists(
#' "thicket",
#' c("Zipf.SUBTLEX_UK", "AROU.Warriner", "SIZE.Glasgow_Norms"),
#' standard_eval = TRUE
#' )
#' @export
euc_dists <- function(df = LexOPS::lexops, target, vars = "all", scale = TRUE, center = TRUE, weights = NA, standardise_weights = TRUE, id_col = "string", standard_eval = FALSE) {
if (!standard_eval) {
vars <- parse_levels(substitute(vars)) %>%
unlist()
# revert to "all" if parse_levels(vars) returns "\"all\""
if (all(vars=="\"all\"") & length(vars)==1) vars <- "all"
}
# check the df is a dataframe
if (!is.data.frame(df)) stop(sprintf("Expected df to be of class data frame, not %s", class(df)))
# if there are no vars specified (e.g. empty vector) return NAs
if (length(vars) == 0) {
warning("No numeric columns specified in `vars`")
return(rep(NA, nrow(df)))
}
# check that vars is an expected input
if (!is.vector(vars) & all(vars != "all")) stop('Expected vars to be either a vector of column names, or "all"')
# if vars == "all", change vars to all numeric columns
if (all(vars == "all")) {
vars <- colnames(df)[sapply(df, is.numeric)]
if (length(vars) == 0) {
warning("No numeric columns detected in `df`")
return(rep(NA, nrow(df)))
}
}
# check that all vars are in df
if (!all(vars %in% colnames(df))) {
unknown_cols <- vars[!vars %in% colnames(df)]
stop(sprintf("%i unknown columns in `df`: %s", length(unknown_cols), paste(unknown_cols, collapse = ", ")))
}
# check that all vars are numeric
if (!all(sapply(df[, vars], is.numeric))) {
non_numeric_cols <- colnames(df[, vars])[!sapply(df[, vars], is.numeric)]
stop(sprintf("%i non-numeric columns specified in `vars`: %s", length(non_numeric_cols), paste(non_numeric_cols, collapse = ", ")))
}
# if specified, check weights is a numeric vector of correct length
if (any(!is.na(weights))) {
if (!is.numeric(weights) | length(weights)!=length(vars)) stop("`weights` should be a numeric vector of length equal to that of vars")
# standardise weights if necessary
if (standardise_weights) {
weights <- weights / mean(weights)
}
}
# check id_col is a string
if (!is.character(id_col)) stop(sprintf("Expected id_col to be of class string, not %s", class(id_col)))
# calculate Euclidean distance of word from all others on specified dimensions
# firstly, scale all specified dimensions as required
df[, vars] <- lapply(df[, vars], base::scale, center, scale)
# get all the other vectors (including the target)
dims <- df[, vars]
# apply weights
if (any(!is.na(weights))) {
dims <- lapply(1:ncol(dims), function(i) dims[, i] * weights[i]) %>%
as.data.frame() %>%
stats::setNames(names(dims))
}
# get the vector for the target word
target_dims <- dims[df[[id_col]] == target, vars]
# get the distance squared
dist_sq <- as.data.frame(
lapply(colnames(dims), function(d) {
(dims[d] - c(target_dims[[d]]))**2
})
)
# get the row-wise euclidean distance
sqrt(rowSums(dist_sq))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.