R/stanza.R

Defines functions multi_word_token entities tokens sents.stanza.models.common.doc.Document words.stanza.models.common.doc.Document sents.stanza.models.common.doc.Document print.stanza.models.common.doc.Document stanza_version stanza_pipeline stanza_download_method_code check_init

Documented in entities multi_word_token stanza_download_method_code stanza_pipeline stanza_version tokens

#' @importFrom checkmate assert check_list check_character check_logical
#' @importFrom reticulate import conda_create conda_install conda_list miniconda_path py_eval virtualenv_list virtualenv_create virtualenv_install
#' @importFrom NLP words sents
#' @importFrom stats setNames
#' @importFrom utils head
NULL


# Stop with an informative error if stanza has not been initialized.
#
# The error message names the call of the function that invoked
# check_init(). Returns NULL invisibly when stanza is initialized.
check_init <- function() {
    if (is_initialized()) {
        return(invisible(NULL))
    }
    # Frame of the function that called check_init(); when check_init() is
    # called directly from top level there is no such frame, so fall back to
    # our own call instead of indexing sys.calls() with 0.
    caller_index <- sys.nframe() - 1L
    caller <- if (caller_index >= 1L) sys.calls()[[caller_index]] else sys.call()
    # deparse() may split a long call over several strings; join them so the
    # message is built exactly once.
    caller_name <- paste(trimws(deparse(caller)), collapse = " ")
    msg <- paste(sprintf("in '%s' stanza is not initialized,", caller_name),
                 "use 'stanza_initialize' to initialize stanza!")
    stop(msg, call. = FALSE)
}


#' Select Download Method
#'
#' Function to obtain the download method code or list all allowed download methods.
#'
#' @param method a character string giving the name of the download method.
#'  The case of the download method name is ignored.
#'  A single integer-like value is returned unchanged.
#'  If \code{NULL} all allowed download methods are shown.
#' 
#' @returns an integer giving the download method code or, if \code{method}
#'  is \code{NULL}, all allowed download methods.
#' 
#' @examples 
#' if (is_stanza_initialized()) {
#'   stanza_download_method_code()
#'   stanza_download_method_code("none")
#'   stanza_download_method_code("reuse_resources")
#'   stanza_download_method_code("download_resources")
#' }
#' 
#' @export
stanza_download_method_code <- function(method = NULL) {
    download_methods <- rpy("download_methods")
    if (is.null(method)) {
        return(download_methods)
    }
    if (checkmate::test_integerish(method, len = 1L)) {
        return(method)
    }
    # assert_string() signals an error on invalid input; check_string() only
    # returns a message, which was silently discarded before.
    checkmate::assert_string(method)
    method_name <- toupper(method)
    if (!method_name %in% names(download_methods)) {
        msg <- sprintf("'%s' is not among the allowed methods, allowed methods are %s.",
                       method, deparse(names(download_methods)))
        stop(msg)
    }
    as.integer(unname(download_methods[method_name]))
}


#' NLP Pipeline
#'
#' Create a \code{Stanza} pipeline and wrap it into an R function.
#'
#' @param language a character string giving the language (default is \code{"en"}). 
#' @param model_dir path to the directory for storing the \code{Stanza} models
#'  (taken from \code{stanza_options("model_dir")} by default).
#' @param package a character string passed as \code{package} to the
#'  \code{Stanza} pipeline (default is \code{"default"}).
#' @param processors either a single character string or a named list giving
#'  the processors, passed as \code{processors} to the \code{Stanza} pipeline
#'  (default is an empty list).
#' @param logging_level a character string giving the logging level (default is \code{"INFO"}),
#'  available levels are \code{c('DEBUG', 'INFO', 'WARNING', 'WARN', 'ERROR', 'CRITICAL', 'FATAL')}.
#' @param use_gpu a logical giving if \code{GPU} or \code{CPU} should be used (default is \code{FALSE}).
#' @param download_method an integer or character string giving the download method code.
#'  If a character string is provided, it is passed to \code{stanza_download_method_code}
#'  to obtain the integer code.
#'  Use \code{stanza_download_method_code} to obtain the code and list all
#'  available download methods.
#' @param ... additional named arguments passed to the stanza pipeline.
#' 
#' @returns a function that can be used to process text.
#' 
#' @examples
#' \dontrun{
#' p <- stanza_pipeline()
#' doc <- p('R is a programming language for statistical computing.')
#' }
#' 
#' @export
stanza_pipeline <- function(language = "en",
                            model_dir = stanza_options("model_dir"),
                            package = "default",
                            processors = list(),
                            logging_level = 'INFO',
                            use_gpu = FALSE,
                            download_method = "reuse_resources",
                            ...) {
    check_init()
    assert(check_character(language, len = 1L),
           check_character(model_dir, len = 1L),
           check_character(package, len = 1L),
           check_character(logging_level, len = 1L),
           check_logical(use_gpu), combine = "and")
    # processors is either a single string or a list
    assert(check_character(processors, len = 1L), check_list(processors))
    if (length(processors) == 0L) {
        # Replace an empty list by an empty *named* list; presumably so that it
        # is converted to a Python dict rather than a list -- TODO confirm.
        processors <- setNames(list(), character(0))
    }
    if (is.list(processors)) assert(check_character(names(processors)))
    kwargs <- list(...)
    if (length(kwargs) > 0L) {
        # Every additional argument needs a non-empty name; names() returns ""
        # for the unnamed elements of a partially named list.
        kwarg_names <- names(kwargs)
        if (is.null(kwarg_names) || !all(nzchar(kwarg_names))) {
            stop("the additional arguments '...' have to be named")
        }
    }
    download_method <- stanza_download_method_code(download_method)
    # Pass the requested logging level on instead of a hard-coded 'INFO'.
    processor <- stanza$Pipeline(lang = language, dir = model_dir, package = package,
                                 processors = processors, logging_level = logging_level,
                                 use_gpu = use_gpu, download_method = download_method,
                                 ...)
    function(doc, processors = NULL) {
        obj <- processor(doc, processors)
        # Prepend an R class derived from the last component of the first
        # class name, e.g. "stanza_document" for "<...>.Document".
        cls <- sprintf("stanza_%s", tolower(gsub(".*\\.", "", head(class(obj), 1))))
        class(obj) <- c(cls, class(obj))
        obj
    }
}


#' Stanza Version
#' 
#' Report which version of the \pkg{stanza} Python package is in use.
#' 
#' @returns a character string with the version of the \pkg{stanza} Python
#'  package, stripped of leading and trailing whitespace.
#' 
#' @examples 
#' stanza_version()
#' 
#' @export
stanza_version <- function() {
    raw_version <- stanza[["__version__"]]
    trimws(raw_version)
}

    
#
# Document
#

#' @noRd
#' @export
# Print a short summary of a document: its first class name followed by the
# number of sentences, tokens and words. Returns `x` invisibly, as print
# methods are expected to do.
print.stanza.models.common.doc.Document <- function(x, ...) {
    writeLines(sprintf("<%s>", head(class(x), 1)))
    # the sentence count is obtained by applying Python's len() to x$sentences
    writeLines(sprintf("  number of sentences: %i", py_eval("len")(x$sentences)))
    writeLines(sprintf("  number of tokens: %i", x$num_tokens))
    writeLines(sprintf("  number of words: %i", x$num_words))
    invisible(x)
}


#' @noRd
#' @export
# Sentences of a document; `type` selects "word" (default) or "token" and is
# passed on to the `sents` helper of `rstanza`.
sents.stanza.models.common.doc.Document <- function(x, type = c("word", "token"), ...) {
    type <- match.arg(type)
    rstanza$sents(x, type)
}


#' @noRd
#' @export
# Words of a document, obtained from the `words` helper of `rstanza`.
words.stanza.models.common.doc.Document <- function(x, ...) {
    rstanza$words(x)
}


#' Tokens
#'
#' Obtain the tokens of a processed text.
#'
#' @param x an object inheriting from \code{"stanza_document"} or \code{"stanza_sentence"}.
#' @param ... optional additional arguments, currently not used.
#' 
#' @returns a data.frame with the tokens.
#' 
#' @export
tokens <- function(x, ...) {
    token_table <- rstanza$tokens(x)
    token_table
}


#' Entities
#'
#' Obtain the entities of a processed text.
#'
#' @param x an object inheriting from \code{"stanza_document"}.
#' @param ... optional additional arguments, currently not used.
#' 
#' @returns a data.frame with the entities.
#' 
#' @export
entities <- function(x, ...) {
    extract_entities <- rstanza$entities
    extract_entities(x)
}


#' Multi-Word Token
#'
#' Obtain the multi-word tokens of a processed text.
#'
#' @param x an object handed to the \code{multi_word_token} helper; presumably
#'  a document returned by a pipeline created with \code{stanza_pipeline}
#'  (the expected class is not stated in the original documentation).
#' @param ... optional additional arguments, currently not used.
#' 
#' @returns a data.frame with the multi-word tokens.
#' 
#' @export
multi_word_token <- function(x, ...) {
    # The helper's argument is deliberately called `x`: data.frame() derives
    # the column name from the argument expression when its input is unnamed.
    to_data_frame <- function(x) data.frame(x)
    to_data_frame(rstanza$multi_word_token(x))
}

Try the stanza package in your browser

Any scripts or data that you put into this service are public.

stanza documentation built on June 8, 2025, 1:23 p.m.