#' @title Calculate the similarity weight for a set of observations
#'
#' @description Calculate the similarity weight for a set of observations, based
#' on their distance from some arbitary points in data space. Observations which
#' are very similar to the point under consideration are given weight 1, while
#' observations which are dissimilar to the point are given weight zero.
#'
#' @param x A dataframe describing arbitrary points in the space of the data
#' (i.e., with same \code{colnames} as \code{data}).
#' @param data A dataframe representing observed data.
#' @param threshold Threshold distance outside which observations will
#' be assigned similarity weight zero. This is numeric and should be > 0.
#' Defaults to 1.
#' @param distance The type of distance measure to be used, currently just two
#' types of Minkowski distance: \code{"euclidean"} (default), and
#' \code{"maxnorm"}.
#' @param lambda A constant to multiply by the number of categorical
#' mismatches, before adding to the Minkowski distance, to give a general
#' dissimilarity measure. If left \code{NULL}, behaves as though \code{lambda}
#' is set larger than \code{threshold}, meaning that one factor mismatch
#' guarantees zero weight.
#'
#' @return A numeric vector or matrix, with values from 0 to 1. The similarity
#' weights for the observations in \code{data} arranged in rows for each row
#' in \code{x}.
#'
#' @details Similarity weight is assigned to observations based on their
#' distance from a given point. The distance is calculated as Minkowski
#' distance between the numeric elements for the observations whose
#' categorical elements match, with the option to use a more general
#' dissimilarity measure comprising Minkowski distance and a mismatch count.
#'
#' @examples
#' ## Say we want to find observations similar to the first observation.
#' ## The first observation is identical to itself, so it gets weight 1. The
#' ## second observation is similar, so it gets some weight. The rest are more
#' ## different, and so get zero weight.
#'
#' data(mtcars)
#' similarityweight(x = mtcars[1, ], data = mtcars)
#'
#' ## By increasing the threshold, we can find observations which are more
#' ## approximately similar to the first row. Note that the second observation
#' ## now has weight 1, so we lose some ability to discern how similar
#' ## observations are by increasing the threshold.
#'
#' similarityweight(x = mtcars[1, ], data = mtcars, threshold = 5)
#'
#' ## Can provide a number of points to 'x'. Here we see that the Mazda RX4 Wag
#' ## is more similar to the Merc 280 than the Mazda RX4 is.
#'
#' similarityweight(mtcars[1:2, ], mtcars, threshold = 3)
#'
#' @seealso \code{\link{dist1}}
#'
#' @references O'Connell M, Hurley CB and Domijan K (2017). ``Conditional
#' Visualization for Statistical Models: An Introduction to the
#' \strong{condvis} Package in R.''\emph{Journal of Statistical Software},
#' \strong{81}(5), pp. 1-20. <URL:http://dx.doi.org/10.18637/jss.v081.i05>.
similarityweight <-
function (x, data, threshold = NULL, distance = NULL, lambda = NULL)
{
if (!is.null(threshold) && threshold < 0)
stop("cannot have negative dissimilarity 'threshold'")
## Initialise the internal function
vwfun <- .similarityweight(xc = data)
## Make empty matrix for weights
k <- matrix(nrow = nrow(x), ncol = nrow(data), dimnames = list(rownames(
x), rownames(data)))
## Loop through rows of 'x'
for (i in 1:nrow(x)){
k[i, ] <- do.call(vwfun, list(xc.cond = x[i, , drop = FALSE], sigma =
threshold, distance = distance, lambda = lambda))$k
}
## Return the matrix of weights, dropping to vector if possible
k[, , drop = TRUE]
}
## Internal function which does some preprocessing (particularly scaling) and
## returns a function which calculates similarity weight for a single row of a
## dataframe.
.similarityweight <-
function (xc)
{
## Scale the dataframe and calculate a few things for later use.
nrow.xc <- nrow(xc)
if (nrow.xc < 2)
stop("cannot apply scale to data.frame with less than 2 rows")
colnames.xc <- colnames(xc)
arefactors <- vapply(xc, is.factor, logical(1L))
zerovar <- vapply(xc, function (x) all(duplicated(x)[-1L]) , logical(1L))
factorindex <- arefactors & !zerovar
numindex <- !arefactors & !zerovar
xc.factors <- data.matrix(xc[, factorindex, drop = FALSE])
xc.num <- data.matrix(xc[, numindex, drop = FALSE])
x.scaled <- scale(xc.num)
k <- rep(0, nrow.xc)
## Return a function which will calculate the weights for a single arbitrary
## point in the data space.
function (xc.cond, sigma = NULL, distance = c("euclidean", "maxnorm"),
lambda = NULL)
{
## Set up values
sigma <- if (is.null(sigma))
1
else sigma
distance <- match.arg(distance)
p <- if (identical(distance, "maxnorm")) 1 else 2
## If 'sigma' is Inf, return 1s for all observations
if (identical(sigma, Inf))
return(list(k = rep(1, nrow.xc), sigma = sigma, distance = distance))
## Get the arbitary point in order.
xc.cond <- xc.cond[, colnames.xc, drop = FALSE]
xc.cond.factors <- data.matrix(xc.cond[, factorindex, drop = FALSE])
xc.cond.num <- data.matrix(xc.cond[, numindex, drop = FALSE])
## 'factormatches' is the index of observations on which we will calculate
## the Minkowski distance. Basically pre-filtering for speed.
##
## If 'lambda' is NULL, require all factors to be equal to even bother
## calculating Minkowski distance.
##
## If 'lambda' is supplied, only want observations with less than
## (sigma / lambda) mismatches in the factors.
##
## If there are no factors, want all rows.
factormatches <- if (any(factorindex)){
if (is.null(lambda)){
which((nfactormatches <- rowSums(xc.factors == matrix(xc.cond.factors,
ncol = length(xc.cond.factors), nrow = nrow.xc, byrow = TRUE))) ==
length(xc.cond.factors))
} else {
which(length(xc.cond.factors) - (nfactormatches <- rowSums(xc.factors ==
matrix(xc.cond.factors, ncol = length(xc.cond.factors), nrow = nrow.xc
, byrow = TRUE))) <= (sigma / lambda))
}
} else {rep(TRUE, nrow.xc)}
## If any observations make it past the above filtering, calculate the
## dissimilarity 'd' as Minkowski distance plus 'lambda' times number of
## factor mismatches if 'lambda' is supplied.
##
## Convert the dissimilarity to similarity weights 'k', between 0 and 1.
if ((lfm <- length(factormatches)) > 0){
if (all(factorindex)){
if (is.null(lambda)){
d <- rep(0, lfm)
} else {
d <- lambda * (sum(factorindex) - nfactormatches[factormatches]) ^ p
}
} else {
xcond.scaled <- (xc.cond.num - attr(x.scaled, "scaled:center")) / attr(
x.scaled, "scaled:scale")
d <- dist1(xcond.scaled, x.scaled[factormatches, ], inf = identical(
distance, "maxnorm")) + if (any(factorindex) && !is.null(lambda))
(lambda * (sum(factorindex) - nfactormatches[factormatches])) ^ p
else 0
}
k[factormatches] <- pmax(0, 1 - (d ^ (1 / p)) / (sigma))
}
list(k = k, sigma = sigma, distance = distance)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.