#'
#' @importFrom text2vec itoken
#' @importFrom text2vec itoken_parallel
#' @importFrom text2vec create_vocabulary
#' @importFrom text2vec vocab_vectorizer
#' @importFrom text2vec create_dtm
#' @importFrom tokenizers tokenize_character_shingles
#' @importFrom igraph components
#' @importFrom igraph graph_from_data_frame
#' @importFrom igraph make_clusters
#' @importFrom stats dist
#' @importFrom readr read_table
#' @importFrom utils download.file
#' @importFrom utils unzip
#'
#'
#' @title Block records based on character vectors
#'
#' @author Maciej Beręsewicz, Adam Struzik
#'
#' @description
#' The function creates shingles (2-character strings by default) or vector representations based on a given embedding model (e.g., GloVe),
#' applies approximate nearest neighbour (ANN) algorithms via the \link[rnndescent]{rnndescent}, \link[RcppHNSW]{RcppHNSW}, \link[RcppAnnoy]{RcppAnnoy} and \link[mlpack]{mlpack} packages,
#' and creates blocks using graphs via \link[igraph]{igraph}.
#'
#' @param x reference data (a character vector or a matrix),
#' @param y query data (a character vector or a matrix); if not provided (\code{NULL} by default), deduplication of \code{x} is performed,
#' @param representation method of representing input data (possible \code{c("shingles", "vectors")}; default \code{"shingles"}),
#' @param model a matrix containing word embeddings (e.g., GloVe), required only when \code{representation = "vectors"},
#' @param deduplication whether deduplication should be applied (default \code{TRUE}, as \code{y} is \code{NULL} by default),
#' @param on variables for ANN search (currently not supported),
#' @param on_blocking variables for blocking records before ANN search (currently not supported),
#' @param ann algorithm to be used for the ANN search (possible values \code{c("nnd", "hnsw", "annoy", "lsh", "kd")}; default \code{"nnd"}, which corresponds to the nearest neighbour descent method),
#' @param distance distance metric (default \code{"cosine"}; the available metrics depend on the chosen \code{ann} algorithm),
#' @param ann_write path for writing the index to disk. Two files will be created: 1) the index, 2) a text file with column names,
#' @param ann_colnames file with column names if \code{x} or \code{y} are indices saved on the disk (currently not supported),
#' @param true_blocks a \code{data.frame} with true blocks used to calculate evaluation metrics (standard metrics based on the confusion matrix are returned),
#' @param verbose whether a log should be printed (0 = none, 1 = main steps, 2 = ANN algorithm verbose output as well),
#' @param graph whether a graph should be returned (default FALSE),
#' @param seed seed for the algorithms (for reproducibility),
#' @param n_threads number of threads used for the ANN algorithms and for adding data to the index and query,
#' @param control_txt list of controls for text data (passed only to \link[text2vec]{itoken_parallel} or \link[text2vec]{itoken}), used only when \code{representation = "shingles"},
#' @param control_ann list of controls for the ANN algorithms.
#'
#' @returns Returns a list containing:\cr
#' \itemize{
#' \item{\code{result} -- \code{data.table} with indices (rows) of \code{x} and \code{y}, the block assignment and the distance between the points,}
#' \item{\code{method} -- name of the ANN algorithm used,}
#' \item{\code{deduplication} -- information whether deduplication was applied,}
#' \item{\code{representation} -- information whether shingles or vectors were used,}
#' \item{\code{metrics} -- metrics for quality assessment, if \code{true_blocks} is provided,}
#' \item{\code{confusion} -- confusion matrix, if \code{true_blocks} is provided,}
#' \item{\code{colnames} -- variable names (colnames) used for search,}
#' \item{\code{graph} -- \code{igraph} class object.}
#' }
#'
#' @examples
#' ## an example using RcppHNSW
#'
#' df_example <- data.frame(txt = c("jankowalski", "kowalskijan", "kowalskimjan",
#' "kowaljan", "montypython", "pythonmonty", "cyrkmontypython", "monty"))
#'
#' result <- blocking(x = df_example$txt,
#' ann = "hnsw",
#' control_ann = controls_ann(hnsw = control_hnsw(M = 5, ef_c = 10, ef_s = 10)))
#'
#' result
#'
#' ## an example using GloVe and RcppAnnoy
#' \dontrun{
#' old <- getOption("timeout")
#' options(timeout = 500)
#' utils::download.file("https://nlp.stanford.edu/data/glove.6B.zip", destfile = "glove.6B.zip")
#' utils::unzip("glove.6B.zip")
#'
#' glove_6B_50d <- readr::read_table("glove.6B.50d.txt",
#' col_names = FALSE,
#' show_col_types = FALSE)
#' data.table::setDT(glove_6B_50d)
#'
#' glove_vectors <- glove_6B_50d[,-1]
#' glove_vectors <- as.matrix(glove_vectors)
#' rownames(glove_vectors) <- glove_6B_50d$X1
#'
#' ## spaces between words are required
#' df_example_spaces <- data.frame(txt = c("jan kowalski", "kowalski jan", "kowalskim jan",
#' "kowal jan", "monty python", "python monty", "cyrk monty python", "monty"))
#'
#' result_annoy <- blocking(x = df_example_spaces$txt,
#' ann = "annoy",
#' representation = "vectors",
#' model = glove_vectors)
#'
#' result_annoy
#'
#' options(timeout = old)
#' }
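#'
#' ## an example with true blocks for evaluation
#' ## (a minimal sketch: the block assignments below are illustrative only)
#'
#' true_blocks_example <- data.frame(x = 1:8, block = c(1, 1, 1, 1, 2, 2, 2, 2))
#'
#' result_eval <- blocking(x = df_example$txt,
#' ann = "hnsw",
#' true_blocks = true_blocks_example,
#' control_ann = controls_ann(hnsw = control_hnsw(M = 5, ef_c = 10, ef_s = 10)))
#'
#' result_eval$metrics
#' result_eval$confusion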
#'
#' @export
blocking <- function(x,
y = NULL,
representation = c("shingles", "vectors"),
model,
deduplication = TRUE,
on = NULL,
on_blocking = NULL,
ann = c("nnd", "hnsw", "annoy", "lsh", "kd"),
distance = c("cosine", "euclidean", "l2", "ip", "manhatan", "hamming", "angular"),
ann_write = NULL,
ann_colnames = NULL,
true_blocks = NULL,
verbose = c(0, 1, 2),
graph = FALSE,
seed = 2023,
n_threads = 1,
control_txt = controls_txt(),
control_ann = controls_ann()) {
## defaults
if (missing(representation)) representation <- "shingles"
if (missing(verbose)) verbose <- 0
if (missing(ann)) ann <- "nnd"
if (missing(distance)) distance <- switch(ann,
"nnd" = "cosine",
"hnsw" = "cosine",
"annoy" = "angular",
"lsh" = NULL,
"kd" = NULL)
stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported" =
is.character(x) | is.matrix(x) | inherits(x, "Matrix"))
if (!is.null(ann_write)) {
stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) )
}
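## check that the selected distance metric is supported by the chosen ANN algorithm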
if (ann == "nnd") {
stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`" =
distance %in% c("euclidean", "cosine","manhatan", "hamming"))
}
if ((ann == "nnd") && (distance == "manhatan")) {
distance <- "manhattan"
}
if (ann == "hnsw") {
stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" =
distance %in% c("l2", "euclidean", "cosine", "ip"))
}
if (ann == "annoy") {
stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" =
distance %in% c("euclidean", "manhatan", "hamming", "angular"))
}
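## record linkage when query data y is provided (k = 1); otherwise deduplicate x against itself (k = 2, so that the nearest neighbour other than the record itself is available)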
if (!is.null(y)) {
deduplication <- FALSE
y_default <- FALSE
k <- 1L
} else {
y_default <- y
y <- x
k <- 2L
}
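## validate the structure of true_blocks (columns x, y, block for record linkage; x, block for deduplication)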
if (!is.null(true_blocks)) {
stopifnot("`true_blocks` should be a data.frame" = is.data.frame(true_blocks))
if (deduplication == FALSE) {
stopifnot("`true blocks` should be a data.frame with columns: x, y, block" =
length(colnames(true_blocks)) == 3,
all(colnames(true_blocks) == c("x", "y", "block")))
}
if (deduplication) {
stopifnot("`true blocks` should be a data.frame with columns: x, block" =
length(colnames(true_blocks)) == 2,
all(colnames(true_blocks) == c("x", "block")))
}
}
## add verification if x and y are sparse matrices
if (is.matrix(x) | inherits(x, "Matrix")) {
x_dtm <- x
y_dtm <- y
} else {
if ((verbose %in% 1:2) && (representation == "shingles")) cat("===== creating tokens =====\n")
## vectors
if (representation == "vectors"){
x_embeddings <- sentence_to_vector(x, model)
if (is.null(y_default)) {
y_embeddings <- x_embeddings
} else {
y_embeddings <- sentence_to_vector(y, model)
}
} else{
## shingles
## tokens for x
if (.Platform$OS.type == "unix") {
x_tokens <- text2vec::itoken_parallel(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
} else {
x_tokens <- text2vec::itoken(
iterable = x,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}
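## vocabulary, vectorizer and document-term matrix of shingles for x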
x_voc <- text2vec::create_vocabulary(x_tokens)
x_vec <- text2vec::vocab_vectorizer(x_voc)
x_dtm <- text2vec::create_dtm(x_tokens, x_vec)
if (is.null(y_default)) {
y_dtm <- x_dtm
} else {
if (.Platform$OS.type == "unix") {
y_tokens <- text2vec::itoken_parallel(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
} else {
y_tokens <- text2vec::itoken(
iterable = y,
tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
n = control_txt$n_shingles,
lowercase = control_txt$lowercase,
strip_non_alphanum = control_txt$strip_non_alphanum),
n_chunks = control_txt$n_chunks,
progressbar = verbose)
}
y_voc <- text2vec::create_vocabulary(y_tokens)
y_vec <- text2vec::vocab_vectorizer(y_voc)
y_dtm <- text2vec::create_dtm(y_tokens, y_vec)
}
}
}
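## restrict both document-term matrices to the shingles they have in common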
if (representation == "shingles"){
colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm))
}
if (verbose %in% 1:2) {
if (representation == "shingles") {
cat(sprintf("===== starting search (%s, x, y: %d, %d, t: %d) =====\n",
ann, nrow(x_dtm), nrow(y_dtm), length(colnames_xy)))
} else {
cat("===== starting search =====\n")
}
}
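## run the ANN search with the selected backend; each method returns candidate x-y pairs with their distances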
x_df <- switch(ann,
"nnd" = method_nnd(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings,
y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings,
k = k,
distance = distance,
deduplication = deduplication,
seed = seed,
verbose = if (verbose == 2) TRUE else FALSE,
n_threads = n_threads,
control = control_ann),
"hnsw" = method_hnsw(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings,
y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings,
k = k,
distance = distance,
seed = seed,
verbose = if (verbose == 2) TRUE else FALSE,
n_threads = n_threads,
path = ann_write,
control = control_ann),
"lsh" = method_mlpack(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings,
y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings,
algo = "lsh",
k = k,
verbose = if (verbose == 2) TRUE else FALSE,
seed = seed,
path = ann_write,
control = control_ann),
"kd" = method_mlpack(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings,
y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings,
algo = "kd",
k = k,
verbose = if (verbose == 2) TRUE else FALSE,
seed = seed,
path = ann_write,
control = control_ann),
"annoy" = method_annoy(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings,
y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings,
k = k,
distance = distance,
verbose = if (verbose == 2) TRUE else FALSE,
seed = seed,
path = ann_write,
control = control_ann))
if (verbose %in% 1:2) cat("===== creating graph =====\n")
## remove duplicated pairs
if (deduplication) {
setDT(x_df)
setorder(x_df, x)
x_df[, "pair" := sapply(seq_len(.N), function(i) paste(sort(c(x[i], y[i])), collapse = "_"))]
x_df <- x_df[, .SD[dist == min(dist)], by = "pair"]
x_df <- x_df[, .SD[1], by = "pair"]
x_df[, "pair" := NULL]
x_df <- x_df[x != y]
}
x_df[, x := as.integer(x)]
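## vertex names: queries prefixed with "q", index records with "i" ("q" for both under deduplication)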
if (deduplication) {
x_df[, `:=`("query_g", paste0("q", y))]
x_df[, `:=`("index_g", paste0("q", x))]
} else {
x_df[, `:=`("query_g", paste0("q", y))]
x_df[, `:=`("index_g", paste0("i", x))]
}
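## blocks correspond to connected components of the pairs graph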
x_gr <- igraph::graph_from_data_frame(x_df[, c("query_g", "index_g")], directed = FALSE)
x_block <- igraph::components(x_gr, "weak")$membership
x_df[, `:=`("block", x_block[names(x_block) %in% x_df$query_g])]
## if true blocks are given, compute evaluation metrics
if (!is.null(true_blocks)) {
setDT(true_blocks)
if (!deduplication) {
eval <- eval_reclin(x_df, true_blocks)
eval_metrics <- unlist(get_metrics(TP = eval$TP,
FP = eval$FP,
FN = eval$FN,
TN = eval$TN))
confusion <- get_confusion(TP = eval$TP,
FP = eval$FP,
FN = eval$FN,
TN = eval$TN)
} else {
eval <- eval_dedup(x_df, true_blocks)
eval_metrics <- unlist(get_metrics(TP = eval$TP,
FP = eval$FP,
FN = eval$FN,
TN = eval$TN))
confusion <- get_confusion(TP = eval$TP,
FP = eval$FP,
FN = eval$FN,
TN = eval$TN)
}
}
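## for deduplication, order each pair so that x <= y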
if (deduplication){
x_df[, `:=`(x = pmin(x, y), y = pmax(x, y))]
}
setorderv(x_df, c("x", "y", "block"))
structure(
list(
result = x_df[, c("x", "y", "block", "dist")],
method = ann,
deduplication = deduplication,
representation = representation,
metrics = if (is.null(true_blocks)) NULL else eval_metrics,
confusion = if (is.null(true_blocks)) NULL else confusion,
colnames = if (exists("colnames_xy", where = environment())) colnames_xy else NULL,
graph = if (graph) {
igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = FALSE)
} else NULL
),
class = "blocking"
)
}