#' Compute thresholded distances between rows or columns of a matrix
#'
#' Compute and return distances and indices of rows or columns within a specified distance threshold
#' with respect to a specified distance metric. The algorithm works best for Euclidean
#' distance (the default option).
#' Alternatively compute the \code{t} closest rows when \code{rank=TRUE}. Or use
#' \code{columns=TRUE} to compute distances between columns instead, which is somewhat
#' cheaper for this algorithm than computing row distances.
#' Increase p to cut down the total number of candidate pairs evaluated,
#' at the expense of costlier truncated SVDs.
#'
#' @param A an m by n real-valued dense or sparse matrix
#' @param t a threshold distance value either in absolute distance (the default) or rank order (see \code{rank} below);
#' if missing an estimate derived from a 1-d SVD projection will be used
#' @param p projected subspace dimension
#' @param filter "local" filters candidate set sequentially,
#' "distributed" computes thresholded correlations in a parallel code section which can be
#' faster but requires that the data matrix is available (see notes).
#' @param method the distance measure to be used, one of
#' "euclidean", or "manhattan".
#' Any unambiguous substring can be given.
#' @param rank when \code{TRUE}, the threshold \code{t} represents the top \code{t}
#' closest vectors, otherwise the threshold \code{t} specifies absolute distance; when
#' \code{rank=TRUE} then \code{t} must also be specified
#' @param dry_run set \code{TRUE} to return statistics and truncated SVD for tuning
#' \code{p} (see notes)
#' @param max_iter when \code{rank=TRUE}, a portion of the algorithm may iterate; this
#' number sets the maximum numer of such iterations
#' @param columns set to \code{TRUE} to compute distances between matrix columns instead
#' of rows, saving the expense of a matrix transpose (which can be significant if \code{A} is large)
#' @param restart either output from a previous run of \code{tdist} with \code{dry_run=TRUE},
#' or direct output from from \code{\link{irlba}} used to restart the \code{irlba}
#' algorithm when tuning \code{p} (see notes)
#' @param ... additional arguments passed to \code{\link{irlba}}
#'
#' @return A list with elements:
#' \enumerate{
#' \item \code{indices} A three-column matrix. The first two columns contain
#' indices of rows meeting the distance threshold \code{t},
#' the third column contains the corresponding distance value (not returned
#' when \code{dry_run=TRUE}).
#' \item \code{restart} A truncated SVD returned by the IRLBA used to restart the
#' algorithm (only returned when \code{dry_run=TRUE}).
#' \item \code{tot} The total number of _possible_ vectors that meet
#' the correlation threshold identified by the algorithm.
#' \item \code{longest_run} The largest number of successive entries in the
#' ordered first singular vector within a projected distance defined by the
#' correlation threshold; Equivalently, the number of \code{n * p} matrix
#' vector products employed in the algorithm, not counting the truncated SVD step.
#' \item \code{t} The threshold value.
#' \item \code{svd_time} Time to compute truncated SVD.
#' \item \code{total_time} Total run time.
#' }
#'
#' @note When \code{rank=TRUE} the method returns at least, and perhaps more than, the top \code{t} closest
#' indices and their distances, unless they could not be found within the iteration
#' limit \code{max_iter}.
#' @seealso \code{\link{dist}}, \code{\link{tcor}}
#' @references \url{http://arxiv.org/abs/1512.07246} (preprint)
#' @examples
#' x <- matrix(rnorm(100 * 20), nrow=100)
#' # Find the top 10 closest vectors with respect to Euclidean distance:
#' td <- tdist(x, 10, rank=TRUE)
#' print(td$indices[1:10, ])
#'
#' # Compare with distances from `dist`:
#' d <- dist(x)
#' print(sort(d)[1:10])
#'
#' @importFrom irlba irlba
#' @importFrom stats dist quantile
#' @export
tdist = function(A, t, p=10,
filter=c("distributed", "local"),
method=c("euclidean", "manhattan", "maximum"), rank=FALSE,
dry_run=FALSE, max_iter=4, columns=FALSE, restart, ...)
{
filter = match.arg(filter)
method = match.arg(method)
if(!columns) A = base::t(A) # XXX expensive, find a better approach...
if(ncol(A) < p) p = max(1, floor(ncol(A) / 2 - 1))
nlim = function(t)
switch(method,
euclidean = t ^ 2,
maximum = nrow(A) * t ^ 2, # XXX unlikely to be a good bound?
manhattan = t ^ 2) # just bound by 2-norm, not so great either
t0 = proc.time()
if(p == ncol(A) || p == nrow(A)) L = svd(A)
else
{
if(missing(restart)) L = irlba(A, p, ...)
else
{
# Handle either output from tcor(..., dry_run=TRUE), or direct output from irlba:
if("restart" %in% names(restart)) restart = restart$restart
L = irlba(A, p, v=restart, ...)
}
}
t1 = (proc.time() - t0)[[3]]
if(missing(t) && rank) stop("t must be specified when rank=TRUE")
N = 1
if(rank) N = t
if(missing(t) || rank)
{
# Estimate a threshold based on a 1-d projection, with crude tuning over a range of values.
v = L$v[order(L$v[,1]), 1]
ts = sort(c(quantile(L$d[1] * (v[-1] - v[1]), probs=c(0.001, 0.01, 0.1)),
quantile(L$d[1] * (v[length(v)] - v[-length(v)]), probs=c(0.001, 0.01, 0.1))))
# (that last expression considers two possible SVD bases of different sign)
as = lapply(ts, function(t) two_seven(A, L, t, filter, normlim=nlim(t), dry_run=TRUE)$tot)
i = which(as > 0)
if(length(i) > 0) t = ts[min(i)]
else t = ts[3]
attr(t, "names") = c()
if(rank && method == "maximum") t = t / 4 # just a fudge factor, these bounds are not great
if(rank && method == "manhattan") t = t * 4
}
full_dist_fun =
switch(method,
euclidean = function(idx) vapply(1:nrow(idx), function(k) sqrt(crossprod(A[, idx[k,1]] - A[, idx[k, 2]])), 1),
manhattan = function(idx) vapply(1:nrow(idx), function(k) sum(abs(A[, idx[k,1]] - A[, idx[k, 2]])), 1),
maximum = function(idx) vapply(1:nrow(idx), function(k) max(abs(A[, idx[k,1]] - A[, idx[k, 2]])), 1)
)
filter_fun = function(v, t) v <= t
iter = 1
while(iter <= max_iter)
{
ans = two_seven(A, L, t, filter, normlim=nlim(t), full_dist_fun=full_dist_fun, filter_fun=filter_fun, dry_run=dry_run)
if(dry_run) return(list(restart=L, longest_run=ans$longest_run, tot=ans$tot, t=t, svd_time=t1))
if(!rank || (nrow(ans$indices) >= N)) break
iter = iter + 1
# back off faster as we get closer to avoid too much filtering, at the expense of maybe more iterations
t = t * (2 - (nrow(ans$indices)/N)^(1/4))
}
ans$indices = ans$indices[order(ans$indices[,"val"]),]
c(ans, svd_time=t1, total_time=(proc.time() - t0)[[3]])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.