#' Convert documents into sets of integer shingle ids
#'
#' Tokenizes the input, removes duplicate tokens within each document, and
#' replaces every token by its position in the vocabulary of unique tokens
#' collected across all documents (in order of first appearance). The
#' vocabulary itself is not returned.
#'
#' @param x Documents to shingle; passed unchanged to \code{tokenizer}.
#' @param tokenizer Function that is called as \code{tokenizer(x)} and whose
#'   result is iterated over with \code{lapply}, one element per document.
#'   Defaults to \code{tokenizers::tokenize_words}.
#' @return A list with one element per tokenized document, each holding the
#'   vocabulary positions of that document's unique tokens as returned by
#'   \code{fastmatch::fmatch}.
#' @export
shingle <- function(x, tokenizer = tokenizers::tokenize_words) {
  # Tokenize and collapse every document to its set of distinct tokens.
  token_sets <- lapply(tokenizer(x), unique)

  # Global vocabulary: every distinct token seen in any document.
  vocabulary <- unique(unlist(token_sets))

  # Token ids are R integers, so the vocabulary must stay below 2^31 entries.
  if (length(vocabulary) >= 2^31) {
    stop(
      "there are more than 2^31 unique words, cannot use match() function!",
      call. = FALSE
    )
  }

  # Map one document's tokens to their positions in the vocabulary.
  encode_tokens <- function(tokens) {
    fastmatch::fmatch(tokens, vocabulary)
  }

  lapply(token_sets, encode_tokens)
}
#' Jaccard similarity for shingled documents
#'
#' Computes the size of the intersection of \code{x} and \code{y} divided by
#' the size of their union. Duplicated values within either input are counted
#' once, because \code{intersect} and \code{union} operate on sets.
#'
#' @param x,y Shingled documents, e.g. elements of the list returned by
#'   \code{\link{shingle}}.
#' @return A single number in \code{[0, 1]}. Two empty documents yield
#'   \code{1}: they are treated as identical sets instead of producing
#'   \code{NaN} from \code{0 / 0}.
#' @export
jaccard_shingles <- function(x, y) {
  n_union <- length(union(x, y))

  # Both documents are empty: 0 / 0 would be NaN, so follow the usual
  # convention that two empty sets are identical.
  if (n_union == 0L) {
    return(1)
  }

  # intersection / union
  length(intersect(x, y)) / n_union
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.