# R/vectorizers.R

# Copyright (C) 2015 - 2016  Dmitriy Selivanov
# This file is part of text2vec
#
# text2vec is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# text2vec is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with text2vec.  If not, see <http://www.gnu.org/licenses/>.

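# Maps the human-readable context-window name to the integer code expected by
# the underlying C++ routines: a "symmetric" window looks at both sides of the
# focus word, "right" only at following tokens, "left" only at preceding ones.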
encode_context = function(context_string_name = c("symmetric", "right", "left")) {
  context_string_name = match.arg(context_string_name)
  switch(context_string_name,
         symmetric = 0L,
         right = 1L,
         left = -1L)
}

corpus_insert_generic = function(corpus_ptr, tokens, grow_dtm, skip_grams_window_context, window_size, weights,
                                 binary_cooccurence) {
  # dispatch on the concrete corpus type created by the vectorizer closure
  if(inherits(corpus_ptr, "HashCorpus")) {
    cpp_hash_corpus_insert_document_batch(corpus_ptr, tokens, grow_dtm, skip_grams_window_context, window_size, weights)
  } else if(inherits(corpus_ptr, "VocabCorpus")) {
    cpp_vocabulary_corpus_insert_document_batch(corpus_ptr, tokens, grow_dtm, skip_grams_window_context, window_size, weights, binary_cooccurence)
  } else {
    stop("can't recognize corpus - neither HashCorpus nor VocabCorpus")
  }

  # register a finalizer on this call's environment: once it is garbage
  # collected, malloc_trim_finalizer asks the allocator to return freed
  # pages to the OS
  e = environment()
  reg.finalizer(e, malloc_trim_finalizer)
  TRUE
}

corpus_insert = function(corpus_ptr, iterator, grow_dtm, skip_grams_window_context, window_size, weights, binary_cooccurence) {
  skip_grams_window_context_code = force(encode_context(skip_grams_window_context))
  # clone R6 iterators so the caller's iterator is not consumed as a side effect
  if (inherits(iterator, "R6"))
    it = iterator$clone(TRUE)
  else {
    warning("Can't clone input iterator. It will be modified by the current function call", immediate. = TRUE)
    it = iterator
  }

  # collect document ids chunk by chunk, keyed by chunk index
  ids = new.env(parent = emptyenv())
  k = 0L
  while(!it$is_complete) {
    k = k + 1L
    val = it$nextElem()
    res = corpus_insert_generic(corpus_ptr, val$tokens, grow_dtm, skip_grams_window_context_code, window_size, weights, binary_cooccurence)
    if(!res) stop("something went wrong during insertion into the corpus")
    ids[[as.character(k)]] = val$ids
  }
  # flatten the collected ids back into the original chunk order
  ids = as.list(ids)
  ids = unlist(ids[as.character(seq_len(k))], recursive = FALSE, use.names = FALSE)

  attr(corpus_ptr, "ids") = ids
  corpus_ptr
}
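# For readers implementing a custom input: a minimal sketch of the iterator
# contract corpus_insert() relies on (illustrative only; `TokenChunkIter` is a
# hypothetical name, not part of the package). The object needs an
# `is_complete` field or active binding, a `nextElem()` method returning
# list(tokens = <list of character vectors>, ids = <character vector>), and,
# being R6, it is cloneable, so the warning branch above is never hit:
#
#   TokenChunkIter = R6::R6Class("TokenChunkIter",
#     public = list(
#       chunks = NULL, i = 0L,
#       initialize = function(chunks) self$chunks = chunks,
#       nextElem = function() {
#         self$i = self$i + 1L
#         self$chunks[[self$i]]
#       }
#     ),
#     active = list(
#       is_complete = function() self$i >= length(self$chunks)
#     )
#   )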

#' @name vectorizers
#' @title Vocabulary and hash vectorizers
#' @description This function creates an object (closure) which defines how to
#' transform a list of tokens into a vector space - i.e. how to map words to indices.
#' It is supposed to be used only as an argument to \link{create_dtm}, \link{create_tcm},
#' \link{create_vocabulary}.
#' @return A vectorizer \code{object} (closure).
#' @seealso \link{create_dtm} \link{create_tcm} \link{create_vocabulary}
#' @examples
#' data("movie_review")
#' N = 100
#' vectorizer = hash_vectorizer(2 ^ 18, c(1L, 2L))
#' it = itoken(movie_review$review[1:N], preprocessor = tolower,
#'             tokenizer = word_tokenizer, n_chunks = 10)
#' hash_dtm = create_dtm(it, vectorizer)
#'
#' it = itoken(movie_review$review[1:N], preprocessor = tolower,
#'             tokenizer = word_tokenizer, n_chunks = 10)
#' v = create_vocabulary(it, c(1L, 1L))
#'
#' vectorizer = vocab_vectorizer(v)
#'
#' it = itoken(movie_review$review[1:N], preprocessor = tolower,
#'             tokenizer = word_tokenizer, n_chunks = 10)
#'
#' dtm = create_dtm(it, vectorizer)


#' @rdname vectorizers
#' @param vocabulary \code{text2vec_vocabulary} object, see \link{create_vocabulary}.
#' @export
vocab_vectorizer = function(vocabulary) {
  # force evaluation so the returned closure captures the vocabulary itself,
  # not an unevaluated promise
  force(vocabulary)
  vectorizer = function(iterator, grow_dtm, skip_grams_window_context, window_size, weights, binary_cooccurence = FALSE) {
    vocab_corpus_ptr = cpp_vocabulary_corpus_create(vocabulary$term,
                                                    attr(vocabulary, "ngram")[[1]],
                                                    attr(vocabulary, "ngram")[[2]],
                                                    attr(vocabulary, "stopwords"),
                                                    attr(vocabulary, "sep_ngram"))
    setattr(vocab_corpus_ptr, "ids", character(0))
    setattr(vocab_corpus_ptr, "class", "VocabCorpus")
    corpus_insert(vocab_corpus_ptr, iterator, grow_dtm, skip_grams_window_context, window_size, weights, binary_cooccurence)
  }
  vectorizer
}
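# How the closure above is typically driven (a sketch for orientation only:
# create_dtm()/create_tcm() supply these arguments internally, and the exact
# values shown here are assumptions, not a public API):
#
#   v = create_vocabulary(itoken(c("a b", "b c"), tokenizer = word_tokenizer))
#   vectorizer = vocab_vectorizer(v)
#   corpus = vectorizer(itoken(c("a b", "b c"), tokenizer = word_tokenizer),
#                       grow_dtm = TRUE,
#                       skip_grams_window_context = "symmetric",
#                       window_size = 0L,
#                       weights = numeric(0))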

#' @rdname vectorizers
#' @param hash_size \code{integer} The number of hash buckets for the feature
#'   hashing trick. The number must be greater than 0, preferably a power of 2.
#' @param ngram \code{integer} vector. The lower and upper boundary of the range
#'   of n-values for different n-grams to be extracted. All values of \code{n}
#'   such that ngram_min <= n <= ngram_max will be used.
#' @param signed_hash \code{logical}, indicating whether to use a signed
#'   hash-function to reduce collisions when hashing.
#' @export
hash_vectorizer = function(hash_size = 2 ^ 18,
                           ngram = c(1L, 1L),
                           signed_hash = FALSE) {
  stopifnot(is.numeric(ngram) && length(ngram) == 2 && ngram[[2]] >= ngram[[1]])

  vectorizer = function(iterator, grow_dtm, skip_grams_window_context, window_size, weights) {
    hash_corpus_ptr = cpp_hash_corpus_create(hash_size, ngram[[1]], ngram[[2]], signed_hash)
    attr(hash_corpus_ptr, "ids") = character(0)
    class(hash_corpus_ptr) = "HashCorpus"
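    # note: `binary_cooccurence` is deliberately not passed here; the HashCorpus
    # branch of corpus_insert_generic() never touches it, so R's lazy evaluation
    # means the missing argument is never forced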
    corpus_insert(hash_corpus_ptr, iterator, grow_dtm, skip_grams_window_context, window_size, weights)
  }
  vectorizer
}
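# Why a signed hash can reduce collision bias (a standard hashing-trick sketch,
# not code from this package): a second hash h2 with values in {-1, 1} sets the
# sign of each increment, so two terms colliding in the same bucket tend to
# cancel rather than always inflate the count:
#
#   bucket = h1(term) %% hash_size
#   count[bucket] = count[bucket] + h2(term)   # h2(term) is -1L or 1L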