R/sent_ngrams.R

Defines functions sent_ngrams

Documented in sent_ngrams

#' Function for generating sentence level n-grams - raw sentences
#'
#' Slides a window of \code{n} consecutive sentences over every document and
#' pastes the sentences of each window together, separated by a single space.
#' A document with fewer than \code{n} sentences yields exactly one n-gram
#' that contains all of its sentences.
#'
#' @param sentences Table with raw sentences generated by get_sentences
#'   function. Must inherit from class \code{grouped_df} and contain the
#'   columns \code{doc_id} and \code{sentence}. No default.
#' @param n Single positive whole number specifying the parsing rule for
#'   ngrams: 2 for bigrams, 3 for trigrams, etc. No default.
#' @return A data frame with one row per n-gram and three columns:
#'   \code{doc_id} (integer; the document identifier coerced to integer),
#'   \code{ngram_id} (integer; position of the n-gram within its document,
#'   starting at 1) and \code{ngram} (character). Documents appear in the
#'   order in which they first occur in \code{sentences}. Input without rows
#'   gives a zero-row data frame with the same three columns.
#' @keywords tokenization sentences
#' @export
#' @examples
#' \dontrun{
#' # `sentences` is the grouped table returned by get_sentences()
#' bigrams <- sent_ngrams(sentences, n = 2)
#' }
sent_ngrams <- function(sentences, n) {
  if (!inherits(sentences, "grouped_df")) {
    stop("Data input 'sentences' must be a data frame of class 'grouped_df' returned by function 'get_sentences()'.")
  }
  # `&&` short-circuits, so a non-numeric, zero-length, vector-valued or NA
  # `n` is rejected here instead of slipping through (or crashing inside
  # round()) and failing later with an unrelated error.
  valid_n <- is.numeric(n) && length(n) == 1L && !is.na(n) &&
    n == round(n) && n > 0
  if (!valid_n) {
    stop("Need to specify a proper number of higher n-grams: 'n' must be a positive integer.")
  }
  missing_cols <- setdiff(c("doc_id", "sentence"), names(sentences))
  if (length(missing_cols) > 0) {
    stop(
      "Data input 'sentences' is missing required column(s): ",
      paste(missing_cols, collapse = ", "), "."
    )
  }

  # Work on plain vectors: `[[` matches names exactly and no non-standard
  # evaluation is involved, so other columns in `sentences` cannot shadow
  # local variables.
  doc_ids <- sentences[["doc_id"]]
  if (is.factor(doc_ids)) {
    # Use the labels, not the internal integer codes, as identifiers.
    doc_ids <- as.character(doc_ids)
  }
  texts <- as.character(sentences[["sentence"]])

  # Row positions per document, documents kept in order of first appearance.
  ids <- unique(doc_ids)
  rows_by_doc <- unname(split(seq_along(texts), match(doc_ids, ids)))

  ngrams_by_doc <- lapply(rows_by_doc, function(rows) {
    doc_text <- texts[rows]
    n_sent <- length(doc_text)
    if (n_sent < n) {
      # Too short for a full window: the whole document is a single n-gram.
      return(paste0(doc_text, collapse = " "))
    }
    vapply(
      seq_len(n_sent - n + 1),
      function(start) paste0(doc_text[start:(start + n - 1)], collapse = " "),
      character(1)
    )
  })

  per_doc <- lengths(ngrams_by_doc)
  data.frame(
    doc_id = as.integer(rep(as.numeric(ids), times = per_doc)),
    ngram_id = sequence(per_doc),
    # as.character() keeps the column present when there are no n-grams
    # (unlist() of an empty list is NULL).
    ngram = as.character(unlist(ngrams_by_doc, use.names = FALSE)),
    stringsAsFactors = FALSE
  )
}
mmochtak/sentenceR documentation built on Aug. 25, 2022, 9:31 a.m.