R/minhash.R

Defines functions random_ints minhash_generator

Documented in minhash_generator

#' Generate a minhash function
#'
#' Builds and returns a function that computes \code{n} minhash values for a
#' character vector. Each minhash is obtained by hashing the strings to
#' integers, combining every hash with one random integer via a bitwise
#' \code{XOR} (\code{\link{bitwXor}}), and keeping the minimum result; using a
#' different random integer for each minhash has the effect of applying a
#' different hash function. The random integers are drawn once, when
#' \code{minhash_generator} is called, and are reused on every call of the
#' returned function, because it is vital that the same integers be used for
#' each document. The returned function is intended to be passed to the
#' \code{hash_func} parameter of \code{\link{TextReuseTextDocument}}.
#'
#' @param n The number of minhashes that the returned function should generate.
#' @param seed An optional seed, passed to \code{\link{set.seed}} before the
#'   random integers are drawn, to ensure that the same minhash function is
#'   produced on repeated applications. When \code{NULL} (the default) the
#'   seed is left untouched.
#' @return A function which will take a character vector and return an integer
#'   vector of \code{n} minhashes.
#' @references Jure Leskovec, Anand Rajaraman, and Jeff Ullman,
#'   \href{http://www.mmds.org/#book}{\emph{Mining of Massive Datasets}}
#'   (Cambridge University Press, 2011), ch. 3. See also Matthew Casperson,
#'   "\href{http://matthewcasperson.blogspot.com/2013/11/minhash-for-dummies.html}{Minhash
#'    for Dummies}" (November 14, 2013).
#' @seealso \code{\link{lsh}}
#' @examples
#' set.seed(253)
#' minhash <- minhash_generator(10)
#'
#' # Example with a TextReuseTextDocument
#' file <- system.file("extdata/legal/ny1850-match.txt", package = "textreuse")
#' doc <- TextReuseTextDocument(file = file, hash_func = minhash,
#'                              keep_tokens = TRUE)
#' hashes(doc)
#'
#' # Example with a character vector
#' is.character(tokens(doc))
#' minhash(tokens(doc))
#' @export
minhash_generator <- function(n = 200, seed = NULL) {
  assert_that(is.count(n))
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Drawn once and captured by the closure below, so every call of the
  # returned function XORs against the same integers.
  masks <- random_ints(n)

  function(x) {
    assert_that(is.character(x))
    hashes <- hash_string(x)
    # One minhash per mask: the smallest hash after XOR-ing with that mask.
    min_after_xor <- function(mask) {
      min(bitwXor(hashes, mask))
    }
    vapply(masks, min_after_xor, FUN.VALUE = integer(1), USE.NAMES = FALSE)
  }
}

# Draw the random integers used for minhashing
#
# Samples values uniformly between -2^31 and 2^31 - 1 and truncates them to
# integers. It is crucial that the same random integers are used for every
# document in the corpus, so draw them once and reuse them;
# \code{\link{minhash_generator}} does this by capturing the result in the
# function it returns.
# @param n The number of random integers to generate.
# @return A vector of integers
# @seealso \code{\link{minhash_generator}}
# @examples
# random_ints(3)
random_ints <- function(n) {
  lower <- -2147483648
  upper <- 2147483647
  draws <- stats::runif(n, min = lower, max = upper)
  as.integer(draws)
}
ropensci/textreuse documentation built on May 19, 2020, 7:40 a.m.