R/sent_ngrams_lem.R

Defines functions sent_ngrams_lem

Documented in sent_ngrams_lem

#' Function for generating sentence level n-grams - lemmatized sentences
#'
#' Builds sliding windows of `n` consecutive lemmatized sentences within each
#' document and pastes the sentences of every window into a single string
#' (separated by a space). A document that has fewer than `n` sentences
#' yields exactly one n-gram containing all of its sentences.
#'
#' @param sentences Table with lemmatized sentences generated by get_sentences
#'   function. Must inherit from class `grouped_df`; the columns `doc_id` and
#'   `sentence_lem` are the ones read here. No default.
#' @param n Integer specifying the parsing rule for ngrams: 2 for bigrams,
#'   3 for trigrams, etc. Must be a single positive whole number. No default.
#' @return A data frame with one row per n-gram and three columns:
#'   `doc_id` (integer), `ngram_id` (integer, position of the window within
#'   the document, starting at 1) and `ngram` (character).
#' @keywords tokenization sentences lemmatization
#' @export
#' @examples
#' \dontrun{
#' # `corpus` stands for whatever input get_sentences() expects
#' sentences <- get_sentences(corpus)
#' sent_ngrams_lem(sentences, n = 2)
#' }

sent_ngrams_lem <- function(sentences, n) {
  if (!inherits(sentences, "grouped_df")) {
    stop("Data input 'sentences' must be a data frame of class 'grouped_df' returned by function 'get_sentences()'.")
  }

  # Scalar, short-circuiting checks: the type and length tests must run before
  # round()/comparison so that non-numeric, vector or NA input produces the
  # intended error instead of slipping through or failing somewhere else.
  n_is_valid <- is.numeric(n) && length(n) == 1L && !is.na(n) &&
    n == round(n) && n > 0
  if (!n_is_valid) {
    stop("Need to specify a proper number of higher n-grams: 'n' must be a positive integer.")
  }

  # Work on plain vectors; this avoids non-standard evaluation (a column named
  # `id` in the input could otherwise shadow the loop variable) and does not
  # depend on data-frame subsetting methods of the input class.
  doc_ids <- sentences$doc_id
  if (is.factor(doc_ids)) {
    # compare and convert by label, not by internal factor code
    doc_ids <- as.character(doc_ids)
  }
  lemmas <- sentences$sentence_lem
  ids <- unique(doc_ids)

  if (length(ids) == 0L) {
    # empty input: return an empty result with the full set of columns
    return(data.frame(
      doc_id = integer(0),
      ngram_id = integer(0),
      ngram = character(0),
      stringsAsFactors = FALSE
    ))
  }

  per_doc <- lapply(ids, function(id) {
    doc_lemmas <- lemmas[which(doc_ids == id)]
    n_sent <- length(doc_lemmas)

    if (n_sent < n) {
      # too few sentences for a full window: a single n-gram with all of them
      ngram <- paste0(doc_lemmas, collapse = " ")
    } else {
      starts <- seq_len(n_sent - n + 1)
      ngram <- vapply(
        starts,
        function(s) paste0(doc_lemmas[s:(s + n - 1)], collapse = " "),
        character(1)
      )
    }

    data.frame(
      doc_id = rep(as.integer(as.numeric(id)), length(ngram)),
      ngram_id = seq_along(ngram),
      ngram = ngram,
      stringsAsFactors = FALSE
    )
  })

  # bind once instead of growing the result inside the loop
  ngrams_lem <- do.call(rbind, per_doc)
  rownames(ngrams_lem) <- NULL
  ngrams_lem
}
mmochtak/sentenceR documentation built on Aug. 25, 2022, 9:31 a.m.