#' Generate a minhash function
#'
#' A minhash value is calculated by hashing the strings in a character vector to
#' integers and then selecting the minimum value. Repeated minhash values are
#' generated by using different hash functions: these different hash functions
#' are created by performing a bitwise \code{XOR} operation
#' (\code{\link{bitwXor}}) with a vector of random integers. Since it is vital
#' that the same random integers be used for each document, this function
#' generates another function which will always use the same integers. The
#' returned function is intended to be passed to the \code{hash_func} parameter
#' of \code{\link{TextReuseTextDocument}}.
#'
#' @param n The number of minhashes that the returned function should generate.
#' @param seed An optional parameter to set the seed used in generating the random
#' numbers to ensure that the same minhash function is used on repeated
#' applications.
#' @return A function which will take a character vector and return \code{n}
#' minhashes.
#' @references Jure Leskovec, Anand Rajaraman, and Jeff Ullman,
#' \href{http://www.mmds.org/#book}{\emph{Mining of Massive Datasets}}
#' (Cambridge University Press, 2011), ch. 3. See also Matthew Casperson,
#' "\href{http://matthewcasperson.blogspot.com/2013/11/minhash-for-dummies.html}{Minhash
#' for Dummies}" (November 14, 2013).
#' @seealso \code{\link{lsh}}
#' @examples
#' set.seed(253)
#' minhash <- minhash_generator(10)
#'
#' # Example with a TextReuseTextDocument
#' file <- system.file("extdata/legal/ny1850-match.txt", package = "textreuse")
#' doc <- TextReuseTextDocument(file = file, hash_func = minhash,
#' keep_tokens = TRUE)
#' hashes(doc)
#'
#' # Example with a character vector
#' is.character(tokens(doc))
#' minhash(tokens(doc))
#' @export
minhash_generator <- function(n = 200, seed = NULL) {
  assert_that(is.count(n))

  # Seeding is optional; when a seed is supplied it is applied with set.seed()
  # before the random integers are drawn.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Drawn once here and captured by the closure below, so every call to the
  # returned function reuses the same n integers.
  salts <- random_ints(n)

  function(x) {
    assert_that(is.character(x))
    hashed <- hash_string(x)
    # One minhash per salt: XOR the hashes with the salt and keep the minimum.
    xor_min <- function(salt) min(bitwXor(hashed, salt))
    vapply(salts, xor_min, FUN.VALUE = integer(1), USE.NAMES = FALSE)
  }
}
# Generate random integers for minhashing
#
# It is crucial that you use the same random integers for every document in the
# corpus. The random integers generated by this function are used by
# \code{\link{minhash_generator}} to build the minhash function it returns.
# @param n The number of random integers to generate.
# @return A vector of integers
# @seealso \code{\link{minhash_generator}}
# @examples
# random_ints(3)
random_ints <- function(n) {
  # Bounds of the signed 32-bit range, kept as doubles so they can be handed
  # to runif() directly.
  lower <- -2^31
  upper <- 2^31 - 1
  draws <- stats::runif(n, min = lower, max = upper)
  # Convert the uniform doubles to integers.
  as.integer(draws)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.