#' Run the annotation pipeline on a set of documents
#'
#' Runs the clean_nlp annotators over a given corpus of text
#' using the desired backend. The details for which annotators to run and
#' how to run them are specified by using one of:
#' \code{\link{cnlp_init_stringi}}, \code{\link{cnlp_init_spacy}},
#' \code{\link{cnlp_init_udpipe}}, or \code{\link{cnlp_init_corenlp}}.
#'
#' @param input an object containing the data to parse. Either a
#' character vector with the texts (optional names can
#' be given to provide document ids) or a data frame. The
#' data frame should have a column named 'text' containing
#' the raw text to parse; if there is a column named
#' 'doc_id', it is treated as a a document identifier.
#' The name of the text and document id columns can be
#' changed by setting \code{text_name} and \code{doc_name}
#' This conforms with corpus objects respecting the Text
#' Interchange Format (TIF), while allowing for some
#' variation.
#' @param backend name of the backend to use. Will default to the last
#' model to be initalized.
#' @param verbose set to a positive integer n to display a progress
#' message to display every n'th record. The default is
#' 10. Set to a non-positive integer to turn off messages.
#' Logical input is converted to an integer, so it also
#' possible to set to TRUE (1) to display a message for
#' every document and FALSE (0) to turn off messages.
#' @param text_name column name containing the text input. The default
#' is 'text'. This parameter is ignored when \code{input}
#' is a character vector.
#' @param doc_name column name containing the document ids. The default
#' is 'doc_id'. This parameter is ignored when
#' \code{input} is a character vector.
#'
#' @return a named list with components "token", "document" and (when
#' running spacy with NER) "entity".
#'
#' @details
#' The returned object is a named list where each element containing a data
#' frame. The document table contains one row for each document, along with
#' with all of the metadata that was passed as an input. The tokens table
#' has one row for each token detected in the input. The first three columns
#' are always "doc_id" (to index the input document), "sid" (an integer index
#' for the sentence number), and "tid" (an integer index to the specific
#' token). Together, these are a primary key for each row.
#'
#' Other columns provide extracted data about each token, which differ slightly
#' based on which backend, language, and options are supplied.
#' \itemize{
#' \item \bold{token}: detected token, as given in the original input
#' \item \bold{token_with_ws}: detected token along with white space; in,
#' theory, collapsing this field through an entire document will yield
#' the original text
#' \item \bold{lemma}: lemmatised version of the token; the exact form
#' depends on the choosen language and backend
#' \item \bold{upos}: the universal part of speech code; see
#' \url{https://universaldependencies.org/u/pos/all.html}
#' for more information
#' \item \bold{xpos}: language dependent part of speech code; the specific
#' categories and their meaning depend on the choosen backend, model
#' and language
#' \item \bold{feats}: other extracted linguistic features, typically given
#' as Universal Dependencies
#' (\url{https://universaldependencies.org/u/feat/index.html}), but can
#' be model dependent; currently only provided by the udpipe backend
#' \item \bold{tid_source}: the token id (tid) of the head word for the
#' dependency relationship starting from this token; for the token
#' attached to the root, this will be given as zero
#' \item \bold{relation}: the dependency relation, usually provided using
#' Universal Dependencies (more information available here
#' \url{https://universaldependencies.org/}
#' ), but could be different for a specific model
#' }
#'
#' @author Taylor B. Arnold, \email{taylor.arnold@@acm.org}
#'
#'@examples
#'cnlp_init_stringi()
#'cnlp_annotate(un)
#'
#' @export
cnlp_annotate <- function(
input, backend = NULL, verbose = 10, text_name = 'text', doc_name = 'doc_id'
) {
# validate input variables
backend <- ifnull(backend, volatiles$model_init_last)
assert(
backend %in% c("stringi", "spacy", "corenlp", "udpipe"),
"No initialized backends found."
)
if (!inherits(input, "data.frame")) {
assert(is.vector(input), "'input' must be a data frame or vector object.")
input <- data.frame(
doc_id=ifnull(names(input), seq_len(length(input))),
text=as.character(input)
)
}
assert(text_name %in% names(input),
sprintf("'input' data frame must contain a column named '%s'", text_name)
)
names(input)[names(input) == text_name] <- "text"
# add an identifier to the dataset if not already included
if (!(doc_name %in% names(input)))
{
input$doc_id <- seq_len(nrow(input))
}
names(input)[names(input) == doc_name] <- "doc_id"
assert(all(!duplicated(input$doc_id)), "duplicate values found in 'id'")
# pass to the respective backend
if (backend == "stringi") anno <- annotate_with_stringi(input, verbose)
if (backend == "spacy") anno <- annotate_with_spacy(input, verbose)
if (backend == "corenlp") anno <- annotate_with_corenlp(input, verbose)
if (backend == "udpipe") anno <- annotate_with_udpipe(input, verbose)
class(anno) <- c("cnlp_annotation", "list")
return(anno)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.