# R/similar_texts.R


#' @title Returns 'similar' texts to user-given strings
#' @description Returns texts that are semantically similar to one or several user-defined
#' strings, based on averaged GloVe word vectors.
#' @param texts A character vector of texts to search for similar matches.
#' @param texts_id The ID of each text, used to label the output.
#' @param strings A character vector of user-defined strings to be fed into the model.
#' @param num_texts The number of texts to be returned for each user-given string.
#' @param Term_count_min GloVe parameter: minimum number of occurrences for a term to be
#' kept in the vocabulary.
#' @param Skip_gram_window GloVe parameter: window size used to build the term
#' co-occurrence matrix.
#' @param Word_vectors_size GloVe parameter: dimensionality of the word vectors.
#' @param X_max GloVe parameter: cap on co-occurrence counts in the weighting function.
#' @param N_iter GloVe parameter: number of training iterations.
#' @param xprt_txt_vctrs Defaults to TRUE. If TRUE, the text vectors computed from the
#' GloVe model are also returned so they can be reused for additional strings.
#' @return A list with a data frame of similar texts for each string. If
#' \code{xprt_txt_vctrs = TRUE}, an additional element containing the model's text vectors
#' is appended to the list, to be reused by the \code{similar_texts_lite()} function.
#' @export
#' @examples
#' \dontrun{
#' similar_texts(texts = df$text,
#'               texts_id = df$text_id,
#'               strings = mystrgs,
#'               num_texts = 5,
#'               Term_count_min = 5,
#'               Skip_gram_window = 10,
#'               Word_vectors_size = 100,
#'               X_max = 10,
#'               N_iter = 8,
#'               xprt_txt_vctrs = TRUE)
#' }
#'

similar_texts = function(texts, texts_id, strings, num_texts, Term_count_min, Skip_gram_window, Word_vectors_size,
                         X_max, N_iter, xprt_txt_vctrs = TRUE){
  # Notes -----------------------
  #   -
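
  # --- GloVe training sketch (not in the original source). The parallel block
  # below uses `texts_tokens`, `vocab_trimmed` and `word_vectors` without creating
  # them; this is one plausible way to build them with {text2vec}, assuming the
  # text2vec >= 0.6 API (GlobalVectors$new(rank = ...)).
  texts_tokens  = text2vec::word_tokenizer(tolower(texts))
  it            = text2vec::itoken(texts_tokens, progressbar = FALSE)
  vocab         = text2vec::create_vocabulary(it)
  vocab_trimmed = text2vec::prune_vocabulary(vocab, term_count_min = Term_count_min)
  vectorizer    = text2vec::vocab_vectorizer(vocab_trimmed)
  tcm           = text2vec::create_tcm(it, vectorizer, skip_grams_window = Skip_gram_window)

  glove        = text2vec::GlobalVectors$new(rank = Word_vectors_size, x_max = X_max)
  wv_main      = glove$fit_transform(tcm, n_iter = N_iter)
  word_vectors = wv_main + t(glove$components)  # main + context vectors, rows named by term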


  # Plug in this piece of code to parallelize on the local machine
  future::plan(future::multisession)
  # Drop tokens that were pruned out of the vocabulary
  texts_tokens_clean = future.apply::future_lapply(
    texts_tokens,
    function(x) x[x %in% vocab_trimmed$term])

  # Look up the word-vector rows for each text's remaining tokens
  texts_by_ndim_list = future.apply::future_lapply(
    texts_tokens_clean,
    function(x) word_vectors[x, , drop = FALSE])

  # Average each text across its terms to get one vector per text
  texts_by_ndim_averaged = future.apply::future_lapply(
    texts_by_ndim_list,
    function(x) colSums(x) / nrow(x))
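
  # --- Scoring/return sketch (not in the original source). Assumes cosine
  # similarity (text2vec::sim2) between averaged string vectors and averaged text
  # vectors, and that every string contains at least one in-vocabulary term.
  text_vectors = do.call(rbind, texts_by_ndim_averaged)
  rownames(text_vectors) = texts_id

  string_tokens  = text2vec::word_tokenizer(tolower(strings))
  string_vectors = t(vapply(
    string_tokens,
    function(x) {
      x = x[x %in% vocab_trimmed$term]
      colSums(word_vectors[x, , drop = FALSE]) / length(x)
    },
    numeric(ncol(word_vectors))))

  similarities = text2vec::sim2(string_vectors, text_vectors,
                                method = "cosine", norm = "l2")

  # One data frame of the top `num_texts` matches per string
  out = lapply(seq_along(strings), function(i) {
    top = order(similarities[i, ], decreasing = TRUE)[seq_len(num_texts)]
    data.frame(text_id    = texts_id[top],
               text       = texts[top],
               similarity = similarities[i, top],
               row.names  = NULL,
               stringsAsFactors = FALSE)
  })
  names(out) = strings

  # Optionally append the text vectors so they can be reused for more strings
  if (xprt_txt_vctrs) out$text_vectors = text_vectors
  out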
}


similar_texts_lite = function(texts, texts_id, string, num_texts, Term_count_min, Skip_gram_window,
                              Word_vectors_size, X_max, N_iter){
  # Notes -----------------------
  #   - Not parallelized because it is only meant to run a single combination of parameters;
  #     a serial sketch follows below.
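
  # --- Serial sketch (not in the original source). Mirrors similar_texts() for a
  # single string, using plain lapply() instead of {future.apply}; all text2vec
  # calls and the cosine-similarity scoring below are assumptions.
  texts_tokens  = text2vec::word_tokenizer(tolower(texts))
  it            = text2vec::itoken(texts_tokens, progressbar = FALSE)
  vocab_trimmed = text2vec::prune_vocabulary(text2vec::create_vocabulary(it),
                                             term_count_min = Term_count_min)
  vectorizer    = text2vec::vocab_vectorizer(vocab_trimmed)
  tcm           = text2vec::create_tcm(it, vectorizer, skip_grams_window = Skip_gram_window)
  glove         = text2vec::GlobalVectors$new(rank = Word_vectors_size, x_max = X_max)
  word_vectors  = glove$fit_transform(tcm, n_iter = N_iter) + t(glove$components)

  # Average each text's word vectors into one vector per text
  text_vectors = do.call(rbind, lapply(texts_tokens, function(x) {
    x = x[x %in% vocab_trimmed$term]
    colSums(word_vectors[x, , drop = FALSE]) / length(x)
  }))
  rownames(text_vectors) = texts_id

  # Vectorize the single user string the same way, then rank texts by cosine similarity
  s = text2vec::word_tokenizer(tolower(string))[[1]]
  s = s[s %in% vocab_trimmed$term]
  string_vector = matrix(colSums(word_vectors[s, , drop = FALSE]) / length(s), nrow = 1)

  similarities = text2vec::sim2(string_vector, text_vectors, method = "cosine", norm = "l2")
  top = order(similarities[1, ], decreasing = TRUE)[seq_len(num_texts)]
  data.frame(text_id    = texts_id[top],
             text       = texts[top],
             similarity = similarities[1, top],
             row.names  = NULL,
             stringsAsFactors = FALSE)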

}