R/shingle.r

Defines functions jaccard_shingles shingle

Documented in jaccard_shingles

#' Convert documents into sets of integer-coded shingles
#'
#' Each document is tokenized, reduced to its distinct tokens, and every
#' token is then replaced by its position in the vocabulary of distinct
#' tokens collected across all documents.
#'
#' @param x documents to shingle; handed unchanged to \code{tokenizer}
#' @param tokenizer function applied to \code{x} that yields one collection
#'   of tokens per document
#' @return a list with one element per document holding the vocabulary
#'   positions of that document's distinct tokens
#' @export
shingle <- function(x, tokenizer = tokenizers::tokenize_words) {
  # tokenize, then keep each token only once per document (set semantics)
  token_sets <- lapply(tokenizer(x), unique)

  # vocabulary: every distinct token seen in any document
  vocabulary <- unique(unlist(token_sets))

  # vocabulary positions have to fit into R integers
  n_vocab <- length(vocabulary)
  if (n_vocab >= 2^31) {
    stop(
      "there are more than 2^31 unique words, cannot use match() function!",
      call. = FALSE
    )
  }

  # swap every token for its position in the vocabulary
  lapply(token_sets, fastmatch::fmatch, table = vocabulary)
}

#' Jaccard similarity between two shingled documents
#'
#' @param x,y shingled documents, e.g. elements of the list returned by
#'   \code{shingle}
#' @return the number of shingles shared by \code{x} and \code{y} divided by
#'   the number of distinct shingles found in either of them; \code{NaN}
#'   when both are empty
#' @export
jaccard_shingles <- function(x, y) {
  # shingles present in both documents
  shared <- intersect(x, y)

  # shingles present in at least one document
  combined <- union(x, y)

  length(shared) / length(combined)
}
zamorarr/lshr documentation built on April 24, 2021, 11:35 p.m.