#' Generate sentence-level n-grams from lemmatized sentences
#'
#' Builds sliding windows of `n` consecutive lemmatized sentences within each
#' document and pastes every window into a single space-separated string.
#' A document with fewer than `n` sentences contributes exactly one n-gram
#' made of all of its sentences.
#'
#' @param sentences Table with lemmatized sentences generated by the
#'   get_sentences function. Must inherit from class `grouped_df` and contain
#'   the columns `doc_id` and `sentence_lem`. No default.
#' @param n Integer specifying the parsing rule for ngrams: 2 for bigrams,
#'   3 for trigrams, etc. Must be a single positive whole number. No default.
#' @return A data frame with one row per n-gram and the columns `doc_id`
#'   (integer), `ngram_id` (integer position of the n-gram within its
#'   document, starting at 1) and `ngram` (character).
#' @keywords tokenization sentences lemmatization
#' @export
#' @examples
#' # Minimal stand-in for the output of get_sentences()
#' sentences <- structure(
#'   data.frame(
#'     doc_id = c(1, 1, 1, 2),
#'     sentence_lem = c("a b", "c d", "e f", "g h")
#'   ),
#'   class = c("grouped_df", "tbl_df", "tbl", "data.frame")
#' )
#' sent_ngrams_lem(sentences, n = 2)
sent_ngrams_lem <- function(sentences, n) {
  if (!inherits(sentences, "grouped_df")) {
    stop(
      "Data input 'sentences' must be a data frame of class 'grouped_df' ",
      "returned by function 'get_sentences()'."
    )
  }
  # Scalar, short-circuiting checks: a vector, NA or non-numeric 'n' must be
  # rejected here instead of slipping through or failing inside round().
  n_is_valid <- is.numeric(n) && length(n) == 1L && !is.na(n) &&
    n == round(n) && n > 0
  if (!n_is_valid) {
    stop(
      "Need to specify a proper number of higher n-grams: ",
      "'n' must be a positive integer."
    )
  }
  missing_cols <- setdiff(c("doc_id", "sentence_lem"), names(sentences))
  if (length(missing_cols) > 0) {
    stop(
      "Data input 'sentences' is missing required column(s): ",
      paste(missing_cols, collapse = ", "), "."
    )
  }

  # Work on plain vectors so no non-standard evaluation is involved (a column
  # called 'id' in 'sentences' can no longer shadow the document id).
  doc_ids <- sentences[["doc_id"]]
  if (is.factor(doc_ids)) {
    # Use the labels, not the internal integer codes, as document ids.
    doc_ids <- as.character(doc_ids)
  }
  lemmas <- sentences[["sentence_lem"]]
  ids <- unique(doc_ids)

  # One character vector of n-grams per document, in order of first appearance.
  ngrams_by_doc <- lapply(seq_along(ids), function(i) {
    # which() drops NA comparisons, so an NA doc_id matches no rows.
    doc_lemmas <- lemmas[which(doc_ids == ids[[i]])]
    n_sent <- length(doc_lemmas)
    if (n_sent < n) {
      # Too few sentences for a full window: collapse the whole document.
      return(paste0(doc_lemmas, collapse = " "))
    }
    starts <- seq_len(n_sent - n + 1)
    vapply(
      starts,
      function(s) paste0(doc_lemmas[s:(s + n - 1)], collapse = " "),
      character(1)
    )
  })

  counts <- lengths(ngrams_by_doc)
  data.frame(
    doc_id = as.integer(rep(as.numeric(ids), times = counts)),
    ngram_id = sequence(counts),
    ngram = as.character(unlist(ngrams_by_doc, use.names = FALSE)),
    stringsAsFactors = FALSE
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.