# Nothing
# Abstract base class for text2vec distance models. Concrete subclasses
# (e.g. RWMD) are expected to override dist2() / pdist2() with real
# implementations; calling them on the base class is an error.
text2vec_dist = R6::R6Class(
  classname = "distance_model",
  public = list(
    # distance between all rows of x and all rows of y - must be overridden
    dist2 = function(...) {
      stop("Method is not implemented")
    },
    # pairwise distance between corresponding rows - must be overridden
    pdist2 = function(...) {
      stop("Method is not implemented")
    },
    # whether to display a progress bar during computation
    progressbar = TRUE
  ),
  private = list(
    # name of the sparse matrix class used internally by subclasses
    internal_matrix_format = NULL
  )
)
#' @name RelaxedWordMoversDistance
#' @title Creates Relaxed Word Movers Distance (RWMD) model
#' @description RWMD model can be used to query the "relaxed word movers distance" from a document to a
#' collection of documents. RWMD tries to measure distance between query document and collection of documents by
#' calculating how hard is to transform words from query document into words from each document in collection.
#' For more detail see following article: \url{http://mkusner.github.io/publications/WMD.pdf}.
#' However in contrast to the article above we calculate "easiness" of the conversion of one word into another
#' by using \bold{cosine} similarity (but not a euclidean distance).
#' Also here in text2vec we've implemented efficient RWMD using the tricks from the
#' Linear-Complexity Relaxed Word Mover's Distance with GPU Acceleration article https://arxiv.org/abs/1711.07227
#' @section Usage:
#' For usage details see \bold{Methods, Arguments and Examples} sections.
#' \preformatted{
#' rwmd = RelaxedWordMoversDistance$new(x, embeddings)
#' rwmd$sim2(x)
#' }
#' @format \code{\link{R6Class}} object.
#' @section Methods:
#' \describe{
#' \item{\code{$new(x, embeddings)}}{Constructor for RWMD model.
#' \code{x} - document-term matrix which represents collection of
#' documents against which you want to perform queries. \code{embeddings} -
#' matrix of word embeddings which will be used to calculate similarities
#' between words (each row represents a word vector).}
#' \item{\code{$sim2(x)}}{calculates similarity from a collection of documents
#' to collection query documents \code{x}.
#' \code{x} here is a document-term matrix which represents the set of query documents}
#' \item{\code{$dist2(x)}}{calculates distance from a collection of documents
#' to collection query documents \code{x}
#' \code{x} here is a document-term matrix which represents the set of query documents}
#' }
#' @export
#' @examples
#' \dontrun{
#' library(text2vec)
#' library(rsparse)
#' data("movie_review")
#' tokens = word_tokenizer(tolower(movie_review$review))
#' v = create_vocabulary(itoken(tokens))
#' v = prune_vocabulary(v, term_count_min = 5, doc_proportion_max = 0.5)
#' it = itoken(tokens)
#' vectorizer = vocab_vectorizer(v)
#' dtm = create_dtm(it, vectorizer)
#' tcm = create_tcm(it, vectorizer, skip_grams_window = 5)
#' glove_model = GloVe$new(rank = 50, x_max = 10)
#' wv = glove_model$fit_transform(tcm, n_iter = 5)
#' # get average of main and context vectors as proposed in GloVe paper
#' wv = wv + t(glove_model$components)
#' rwmd_model = RelaxedWordMoversDistance$new(dtm, wv)
#' rwms = rwmd_model$sim2(dtm[1:10, ])
#' head(sort(rwms[1, ], decreasing = TRUE))
#'}
RelaxedWordMoversDistance = R6::R6Class(
  classname = "RWMD",
  public = list(
    # document-term matrix of the reference collection (row-normalized, CSR)
    x = NULL,
    # words present in both the dtm and the embedding matrix
    embedding_ids = NULL,
    # document ids of the reference collection (rownames of x)
    item_ids = NULL,
    # word embeddings, L2-normalized and transposed (one word per column)
    embeddings = NULL,
    # Constructor.
    # x          - document-term matrix of the reference collection
    # embeddings - numeric matrix of word vectors, one word per row,
    #              rownames are the words
    initialize = function(x, embeddings) {
      stopifnot(is.matrix(embeddings))
      stopifnot(is.numeric(embeddings))
      # keep only words which have both a dtm column and an embedding
      self$embedding_ids = intersect(colnames(x), rownames(embeddings))
      self$item_ids = rownames(x)
      embeddings = embeddings[self$embedding_ids, , drop = FALSE]
      x = x[, self$embedding_ids, drop = FALSE]
      # L1-normalize rows: word counts become word weights (nBOW)
      x = text2vec:::transform_rows_unit_norm(x, 1)
      private$internal_matrix_format = 'RsparseMatrix'
      self$x = as(x, private$internal_matrix_format)
      # make sure that word vectors are L2 normalized
      # and transpose them for faster column subsetting
      # (R stores matrices in column-major format)
      self$embeddings = t(text2vec:::transform_rows_unit_norm(embeddings, 2))
    },
    # Similarity between each query document in x (rows) and every document
    # in the reference collection; returns a dense matrix with one row per
    # query document and one column per collection document.
    sim2 = function(x) {
      stopifnot(identical(colnames(x), self$embedding_ids))
      x = as(x, private$internal_matrix_format)
      n_queries = nrow(x)
      res = vector("list", n_queries)
      # seq_len() keeps the loop empty for a 0-row query matrix; the
      # previous `2L:length(x@p)` counted backwards in that edge case
      for (row_number in seq_len(n_queries)) {
        j = row_number + 1L
        if (x@p[[j]] == x@p[[row_number]]) {
          # query document contains no known words: without the guard the
          # CSR index range below would be reversed and index out of bounds
          res[[row_number]] = numeric(nrow(self$x))
          next
        }
        # positions of this row's non-zero entries in CSR storage
        i1 = (x@p[[row_number]] + 1L):x@p[[j]]
        # 1-based column (word) indices of the query document
        j1 = x@j[i1] + 1L
        m_j1 = self$embeddings[, j1, drop = FALSE]
        # cosine similarity of every vocabulary word to each query word
        # (embeddings are L2-normalized, so crossprod == cosine)
        d = crossprod(m_j1, self$embeddings)
        # for each vocabulary word keep the closest query word - how "easy"
        # it is to transform that word into some word of the query
        d = matrix(text2vec:::colMaxs(d), ncol = 1)
        # sparse-dense matrix mult from rsparse: cost of the best
        # transformation of each collection document into the query document
        # (each word mapped to its closest query word)
        d = self$x %*% d
        res[[row_number]] = d[, 1]
      }
      res = do.call(rbind, res)
      colnames(res) = self$item_ids
      rownames(res) = rownames(x)
      res
    },
    # Distance is defined as 1 - similarity
    dist2 = function(x) {
      1 - self$sim2(x)
    }
  ),
  private = list(
    # name of the sparse matrix class used internally ('RsparseMatrix', CSR)
    internal_matrix_format = NULL
  )
)
# Short alias kept for backward compatibility with earlier text2vec releases
#' @rdname RelaxedWordMoversDistance
#' @export
RWMD = RelaxedWordMoversDistance
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.