#' @title Global Vectors
#' @description Creates Global Vectors matrix factorization model
#' @references \url{http://nlp.stanford.edu/projects/glove/}
#' @rdname GloVe
#' @export
#' @examples
#' data('movielens100k')
#' co_occurence = crossprod(movielens100k)
#' glove_model = GloVe$new(rank = 4, x_max = 10, learning_rate = .25)
#' embeddings = glove_model$fit_transform(co_occurence, n_iter = 2, n_threads = 1)
#' embeddings = embeddings + t(glove_model$components) # embeddings + context embeddings
#' identical(dim(embeddings), c(ncol(movielens100k), 4L))
GloVe = R6::R6Class(
  classname = c("GloVe"),
  public = list(
    #' @field components represents context embeddings
    components = NULL,
    #' @field bias_i bias term i as per paper
    bias_i = NULL,
    #' @field bias_j bias term j as per paper
    bias_j = NULL,
    #' @field shuffle \code{logical = FALSE} by default. Whether to perform shuffling before
    #' each SGD iteration. Generally shuffling is a good practice for SGD.
    shuffle = FALSE,
    #' @description
    #' Creates GloVe model object
    #' @param rank desired dimension for the latent vectors
    #' @param x_max \code{integer} maximum number of co-occurrences to use in the weighting function
    #' @param learning_rate \code{numeric} learning rate for SGD. I do not recommend that you
    #' modify this parameter, since AdaGrad will quickly adjust it to optimal
    #' @param alpha \code{numeric = 0.75} the alpha in weighting function formula :
    #' \eqn{f(x) = 1 if x > x_max; else (x/x_max)^alpha}
    #' @param lambda \code{numeric = 0.0} regularization parameter
    #' @param shuffle see \code{shuffle} field
    #' @param init \code{list(w_i = NULL, b_i = NULL, w_j = NULL, b_j = NULL)}
    #' initialization for embeddings (w_i, w_j) and biases (b_i, b_j).
    #' \code{w_i, w_j} - numeric matrices, should have #rows = rank, #columns =
    #' expected number of rows (w_i) / columns(w_j) in the input matrix.
    #' \code{b_i, b_j} = numeric vectors, should have length of
    #' #expected number of rows(b_i) / columns(b_j) in input matrix
    initialize = function(rank,
                          x_max,
                          learning_rate = 0.15,
                          alpha = 0.75,
                          lambda = 0.0,
                          shuffle = FALSE,
                          init = list(w_i = NULL, b_i = NULL, w_j = NULL, b_j = NULL)
    ) {
      self$shuffle = shuffle

      private$rank = as.integer(rank)
      private$learning_rate = learning_rate
      # x_max is a required argument: it is documented above and stored here,
      # and has no sensible default for the weighting function.
      private$x_max = x_max
      private$alpha = alpha
      private$lambda = lambda
      private$fitted = FALSE

      # User-supplied starting values (may be NULL; random values are then
      # drawn lazily in fit_transform once the input size is known).
      private$w_i = init$w_i
      private$b_i = init$b_i
      private$w_j = init$w_j
      private$b_j = init$b_j
    },
    #' @description fits model and returns embeddings
    #' @param x An input term co-occurrence matrix. Preferably in \code{dgTMatrix} format
    #' @param n_iter \code{integer} number of SGD iterations
    #' @param convergence_tol \code{numeric = -1} defines early stopping strategy. Stop fitting
    #' when one of two following conditions will be satisfied: (a) passed
    #' all iterations (b) \code{cost_previous_iter / cost_current_iter - 1 <
    #' convergence_tol}.
    #' @param n_threads number of threads to use
    #' @param ... not used at the moment
    #' @return numeric matrix of main embeddings with one row per row/column of
    #' \code{x} and \code{rank} columns. Context embeddings are stored in the
    #' \code{components} field.
    fit_transform = function(x, n_iter = 10L, convergence_tol = -1,
                             n_threads = getOption("rsparse_omp_threads", 1L), ...) {
      x = as(x, "TsparseMatrix")
      stopifnot(ncol(x) == nrow(x))

      embedding_names = colnames(x)
      if (is.null(embedding_names)) embedding_names = rownames(x)

      stopifnot(all(x@x > 0))

      IS_TRIANGULAR = isTriangular(x)

      n = ncol(x)
      target_dim = c(private$rank, n)

      # Fill in any parameter that was not supplied through `init`.
      if (is.null(private$w_i)) private$w_i = matrix(runif(private$rank * n, -0.5, 0.5), private$rank, n)
      if (is.null(private$b_i)) private$b_i = runif(n, -0.5, 0.5)
      if (is.null(private$w_j)) private$w_j = matrix(runif(private$rank * n, -0.5, 0.5), private$rank, n)
      if (is.null(private$b_j)) private$b_j = runif(n, -0.5, 0.5)

      if (!identical(dim(private$w_i), target_dim) ||
          !identical(dim(private$w_j), target_dim) ||
          length(private$b_j) != n ||
          length(private$b_i) != n) {
        stop("init values provided in the constructor don't match expected dimensions from the input matrix")
      }

      # params in a specific format to pass to C++ backend
      initial = list(w_i = private$w_i, w_j = private$w_j,
                     b_i = private$b_i, b_j = private$b_j)

      glove_params =
        list(vocab_size = n,
             word_vec_size = private$rank,
             learning_rate = private$learning_rate,
             x_max = private$x_max,
             alpha = private$alpha,
             lambda = private$lambda,
             initial = initial)

      # init C++ class which actually performs fitting
      private$glove_fitter = cpp_glove_create(glove_params)
      private$cost_history = numeric(0)

      # number of non-zero elements in co-occurrence matrix
      n_nnz = length(x@i)

      # sometimes it is useful to perform shuffle between SGD iterations
      # by default we will not perform shuffling:
      # length(iter_order)==0 will be checked at C++ level
      iter_order = integer(0)

      # perform iterations over input co-occurrence matrix
      i = 1
      while (i <= n_iter) {
        # if shuffling is required, perform reordering at each iteration
        if (self$shuffle) {
          iter_order = sample.int(n_nnz, replace = FALSE)
        }

        cost = cpp_glove_partial_fit(private$glove_fitter, x@i, x@j, x@x, iter_order, n_threads)
        if (is.nan(cost)) stop("Cost becomes NaN, try to use smaller learning_rate.")

        # if matrix is triangular only one half of the pairs is stored, so fit
        # another pass on the transposed one (row/column indices swapped)
        if (IS_TRIANGULAR) {
          cost = cost + cpp_glove_partial_fit(private$glove_fitter, x@j, x@i, x@x, iter_order, n_threads)
        }
        if (is.nan(cost)) stop("Cost becomes NaN, try to use smaller learning_rate.")
        if (cost / n_nnz > 1) stop("Cost is too big, probably something goes wrong... try smaller learning rate")

        # save cost history
        private$cost_history = c(private$cost_history, cost / n_nnz)
        logger$info("epoch %d, loss %.4f", i, private$cost_history[[i]])

        # check convergence: stop as soon as relative improvement drops below
        # the tolerance (never triggers with the default convergence_tol = -1
        # unless cost more than doubles... i.e. effectively disabled)
        if (i > 1 && (private$cost_history[[i - 1]] / private$cost_history[[i]] - 1) < convergence_tol) {
          logger$info("Success: early stopping. Improvement at iterartion %d is less then convergence_tol", i)
          break
        }
        i = i + 1
      }

      private$fitted = TRUE
      colnames(private$w_i) = embedding_names
      colnames(private$w_j) = embedding_names

      # NOTE(review): this relies on the C++ fitter having updated the
      # matrices/vectors passed in `initial` - confirm against the backend.
      self$components = private$w_j
      self$bias_i = private$b_i
      self$bias_j = private$b_j

      # main embeddings, transposed so that each row corresponds to one item
      t(private$w_i)
    },
    #' @description returns value of the loss function for each epoch
    get_history = function() {
      list(cost_history = private$cost_history)
    }
  ),
  private = list(
    w_i = NULL,
    w_j = NULL,
    b_i = NULL,
    b_j = NULL,
    rank = NULL,
    initial = NULL,
    alpha = NULL,
    x_max = NULL,
    learning_rate = NULL,
    lambda = NULL,
    cost_history = numeric(0),
    glove_fitter = NULL,
    fitted = FALSE
  )
)
