# R/minhash.r

# Defines functions: jaccard_minhashes, minhash_generator, random_ints, minhash

# Documented in: jaccard_minhashes, minhash, minhash_generator, random_ints

#' Minhash Document Shingles
#'
#' Builds a minhash function with \code{minhash_generator(n, seed)}, applies it
#' to every element of \code{shingles} through
#' \code{pbmcapply::pbmclapply()}, and lays the resulting signatures out
#' column-wise in a matrix.
#'
#' @param shingles list of shingled documents (from \code{shingle_*()} functions)
#' @param n number of minhashes per document
#' @param seed optional seed used when generating minhash functions
#' @return integer matrix with \code{n} rows and one column per element of
#'   \code{shingles}; column \code{j} is the minhash signature of document
#'   \code{j}. An empty \code{shingles} list gives a matrix with zero columns.
minhash <- function(shingles, n = 100, seed = NULL) {
  # generate minhasher (done first so a supplied seed is always applied)
  f <- minhash_generator(n, seed)

  # nothing to hash: return an empty signature matrix instead of failing in
  # matrix(), which rejects the NULL that unlist() produces for an empty list
  if (length(shingles) == 0L) {
    return(matrix(integer(0), nrow = n, ncol = 0L))
  }

  # generate minhash for every shingle
  vals <- pbmcapply::pbmclapply(shingles, f)

  # every element must be an integer signature; anything else (for instance an
  # error object handed back by a parallel worker instead of being raised)
  # would otherwise be flattened by unlist() into a meaningless matrix
  is_signature <- vapply(vals, is.integer, logical(1L), USE.NAMES = FALSE)
  if (!all(is_signature)) {
    stop(
      "minhashing did not produce an integer signature for ",
      sum(!is_signature), " of ", length(vals), " result(s); ",
      "check for empty or non-integer shingled documents",
      call. = FALSE
    )
  }

  # build minhash matrix: one column per document
  matrix(unlist(vals, use.names = FALSE), nrow = n, ncol = length(vals))
}

#' Draw n random positive integers
#'
#' Supplies the random parameters used by the minhasher. When \code{seed} is
#' given it is passed to \code{set.seed()}, which resets the session's random
#' number generator before sampling.
#'
#' @param n number of integers
#' @param seed optional seed
#' @return vector of \code{n} integers sampled without replacement from
#'   \code{1} to \code{.Machine$integer.max}
random_ints <- function(n, seed = NULL) {
  seed_supplied <- !is.null(seed)
  if (seed_supplied) {
    set.seed(seed)
  }

  upper_bound <- .Machine$integer.max
  sample.int(upper_bound, size = n)
}

#' Generate a minhash function
#'
#' Draws \code{n} random integers with \code{random_ints(n, seed)} and returns
#' a closure over them. Each integer parametrizes one hash function: the
#' shingle values are XOR-ed with it and the smallest result is kept.
#'
#' @param n number of minhashes per document
#' @param seed optional seed used when generating minhash functions
#' @return function taking one shingled document \code{x} (a non-empty integer
#'   vector) and returning its minhash signature, one integer per hash
#'   function. The returned function stops with an error when \code{x} is
#'   empty.
minhash_generator <- function(n, seed = NULL) {
  rs <- random_ints(n, seed)

  # return generator function
  # x is a shingled document
  function(x) {
    # an empty document has no minimum: min() would return Inf with a warning
    # and vapply() would then fail with an unrelated-looking type error, so
    # report the real problem up front
    if (length(x) == 0L) {
      stop("cannot minhash an empty document: `x` has no shingles",
           call. = FALSE)
    }

    # get minhash for every hash function (parameterized by a random int from rs)
    vapply(rs, function(r) min(bitwXor(x, r)), integer(1L), USE.NAMES = FALSE)
  }
}

#' Jaccard similarity on minhashes
#'
#' Compares two minhash signatures element by element and reports the share of
#' positions at which they are equal.
#'
#' @param x first minhash signature
#' @param y second minhash signature; compared element-wise with \code{x}, so
#'   it is expected to have the same length (shorter input is recycled by
#'   \code{==})
#' @return proportion of positions where \code{x} and \code{y} agree
jaccard_minhashes <- function(x, y) {
  agreement <- x == y
  mean(agreement)
}
# zamorarr/lshr documentation built on April 24, 2021, 11:35 p.m.