R/init.R

Defines functions check_python cnlp_init_corenlp cnlp_init_stringi cnlp_init_udpipe cnlp_init_spacy

Documented in cnlp_init_corenlp cnlp_init_spacy cnlp_init_stringi cnlp_init_udpipe

#' Initialize the spacy backend
#'
#' Run this function before annotating text with the spacy
#' backend. It stores the backend settings and constructs the
#' Python-side spacy object through the R to Python interface
#' provided by reticulate.
#'
#' @param model_name    string giving the model name for the spacy backend.
#'                      Defaults to "en" (English) if set to NULL.
#' @param disable       an optional vector of pipes to disable.
#' @param max_length    amount of temporary memory provided to Spacy, in
#'                      characters. The default of 1000000 should work for most
#'                      applications, but can be increased when working with
#'                      long documents.
#'
#' @return Called for its side effects: the spacy backend is marked as
#'         initialized and becomes the most recently initialized backend.
#'
#' @author Taylor B. Arnold, \email{taylor.arnold@@acm.org}
#'
#' @examples
#'\dontrun{
#'cnlp_init_spacy(model_name = "en")
#'}
#'
#' @export
cnlp_init_spacy <- function(model_name=NULL, disable=NULL, max_length=NULL) {

  # stop early unless the Python module 'cleannlp' is usable
  check_python()

  # resolve NULL arguments to their defaults and remember the settings
  chosen_model <- ifnull(model_name, "en")
  volatiles$spacy$model_name <- chosen_model
  char_limit <- ifnull(max_length, 1000000)
  volatiles$spacy$max_length <- char_limit

  # 'disable' is only forwarded to the constructor when it was supplied
  build_backend <- volatiles$cleannlp$spacy$spacyCleanNLP
  if (!is.null(disable)) {
    volatiles$spacy$obj <- build_backend(chosen_model, char_limit, disable)
  } else {
    volatiles$spacy$obj <- build_backend(chosen_model, char_limit)
  }

  # a missing 'nlp' member is reported as the model not being installed
  not_found_msg <- sprintf(
    "model %s not found; use cnlp_download_spacy(\"%s\") to install",
    chosen_model,
    chosen_model
  )
  assert(!is.null(volatiles$spacy$obj$nlp), not_found_msg)

  # record that spacy is ready and is the most recently initialized backend
  volatiles$spacy$init <- TRUE
  volatiles$model_init_last <- "spacy"
}

#' Interface for initializing the udpipe backend
#'
#' This function must be run before annotating text with
#' the udpipe backend. It will parse in English by default,
#' but you can load other models as well. When no
#' \code{model_path} is given, the model is looked up in the
#' package's \code{extdata} directory and downloaded there if
#' it is not yet present.
#'
#' @param model_name   string giving the model name.
#'                     Defaults to "english" if NULL.
#'                     Ignored if \code{model_path} is not NULL.
#' @param model_path   provide a full path to a model file.
#' @param tokenizer    a character string of length 1, which is either
#'                     'tokenizer' (default udpipe tokenisation) or a
#'                     character string with more
#'                     complex tokenisation options as specified in <URL:
#'                     http://ufal.mff.cuni.cz/udpipe/users-manual> in which
#'                     case tokenizer should be a character string where the
#'                     options are put after each other using the semicolon as
#'                     separation.
#' @param tagger       a character string of length 1, which is either 'default'
#'                     (default udpipe POS tagging and lemmatisation) or 'none' (no
#'                     POS tagging and lemmatisation needed) or a character string
#'                     with more complex tagging options as specified in <URL:
#'                     http://ufal.mff.cuni.cz/udpipe/users-manual> in which case
#'                     tagger should be a character string where the options are
#'                     put after each other using the semicolon as separation.
#' @param parser       a character string of length 1, which is either 'default'
#'                     (default udpipe dependency parsing) or 'none' (no dependency
#'                     parsing needed) or a character string with more complex
#'                     parsing options as specified in <URL:
#'                     http://ufal.mff.cuni.cz/udpipe/users-manual> in which case
#'                     parser should be a character string where the options are
#'                     put after each other using the semicolon as separation.
#'
#' @return Called for its side effects: the udpipe backend is marked as
#'         initialized and becomes the most recently initialized backend.
#'         An error is raised if no model file can be located or downloaded.
#'
#' @author Taylor B. Arnold, \email{taylor.arnold@@acm.org}
#'
#' @examples
#'\dontrun{
#'cnlp_init_udpipe(model_name = "english")
#'}
#'
#' @export
cnlp_init_udpipe <- function(
  model_name = NULL,
  model_path = NULL,
  tokenizer = "tokenizer",
  tagger = "default",
  parser = "default"
)
{
  model_name <- ifnull(model_name, "english")

  if (is.null(model_path))
  {
    # look for a previously downloaded model inside the installed package
    model_loc <- system.file("extdata", package="cleanNLP")
    model_glob <- file.path(model_loc, sprintf("%s-*.udpipe", model_name))
    model_path <- Sys.glob(model_glob)[1]

    # If model does not exist, download it and look again
    if (is.na(model_path)) {
      udpipe::udpipe_download_model(
        language = model_name,
        model_dir = model_loc
      )
      model_path <- Sys.glob(model_glob)[1]
    }

    # fail with a clear message instead of handing NA to the model loader
    if (is.na(model_path)) {
      stop(
        sprintf(
          "udpipe model '%s' not found in '%s' and could not be downloaded",
          model_name,
          model_loc
        ),
        call. = FALSE
      )
    }
  }

  # load first, so a failed load does not leave half-updated settings behind
  model_obj <- udpipe::udpipe_load_model(model_path)

  volatiles$udpipe$model_name   <- model_name
  volatiles$udpipe$model_path   <- model_path
  volatiles$udpipe$model_obj    <- model_obj
  volatiles$udpipe$tokenizer    <- tokenizer
  volatiles$udpipe$tagger       <- tagger
  volatiles$udpipe$parser       <- parser

  # record that udpipe is ready and is the most recently initialized backend
  volatiles$udpipe$init <- TRUE
  volatiles$model_init_last <- "udpipe"
}

#' Initialize the stringi backend
#'
#' Run this function before annotating text with the
#' stringi backend.
#'
#' @param locale            string giving the locale name to
#'                          pass to the stringi functions. If
#'                          \code{NULL}, the default locale is
#'                          selected
#'
#' @param include_spaces    logical. Should spaces be included as tokens in
#'                          the output. Defaults to FALSE
#'
#' @return Called for its side effects: the stringi backend is marked as
#'         initialized and becomes the most recently initialized backend.
#'
#' @author Taylor B. Arnold, \email{taylor.arnold@@acm.org}
#'
#' @examples
#'\dontrun{
#'cnlp_init_stringi()
#'}
#'
#' @export
cnlp_init_stringi <- function(locale=NULL, include_spaces=FALSE) {

  # use the locale stringi currently reports when the caller gave none
  active_locale <- ifnull(locale, stringi::stri_locale_get())

  # remember the settings for this backend
  volatiles$stringi$locale         <- active_locale
  volatiles$stringi$include_spaces <- include_spaces
  volatiles$stringi$init           <- TRUE

  # stringi is now the most recently initialized backend
  volatiles$model_init_last        <- "stringi"

}

#' Interface for initializing the coreNLP backend
#'
#' This function must be run before annotating text with
#' the coreNLP backend. It sets the properties for the
#' coreNLP engine and constructs the backend object using the
#' R to Python interface provided by reticulate.
#'
#' @param lang        string giving the language name for the corenlp backend.
#'                    Defaults to "en" (English) if set to NULL.
#' @param models_dir  directory where model files are located. Set to NULL to
#'                    use the default.
#' @param config      An optional named list to be converted to a Python
#'                    dictionary.
#'
#' @return Called for its side effects: the coreNLP backend is marked as
#'         initialized and becomes the most recently initialized backend.
#'
#' @author Taylor B. Arnold, \email{taylor.arnold@@acm.org}
#'
#' @examples
#'\dontrun{
#'cnlp_init_corenlp()
#'}
#'
#' @export
cnlp_init_corenlp <- function(lang=NULL, models_dir=NULL, config=NULL) {

  check_python()

  # the message is a single string, like every other assert() in this file
  assert(
    volatiles$cleannlp$corenlp$STANFORD_AVAILABLE,
    paste0(
      "The Python module 'stanfordnlp' not found. Install with:\n",
      "  pip install stanfordnlp"
    )
  )

  # resolve NULL arguments to their defaults and remember the settings
  volatiles$corenlp$lang <- ifnull(lang, "en")
  volatiles$corenlp$models_dir <- ifnull(
    models_dir,
    volatiles$cleannlp$corenlp$default_model_dir()
  )
  volatiles$corenlp$config <- reticulate::dict(config)

  # a model counts as installed when an entry in the models directory
  # starts with the two-character language code
  assert(
    volatiles$corenlp$lang %in%
      stringi::stri_sub(dir(volatiles$corenlp$models_dir), 1, 2),
    sprintf(
      "model %s not found; use cnlp_download_corenlp(\"%s\") to install",
      volatiles$corenlp$lang,
      volatiles$corenlp$lang
    )
  )

  volatiles$corenlp$obj <- volatiles$cleannlp$corenlp$corenlpCleanNLP(
    volatiles$corenlp$lang,
    volatiles$corenlp$models_dir,
    volatiles$corenlp$config
  )

  # record that coreNLP is ready and is the most recently initialized backend
  volatiles$corenlp$init <- TRUE
  volatiles$model_init_last <- "corenlp"

}

# Verify that the Python module 'cleannlp' can be used from this R session.
#
# Checks, in order, that reticulate can discover a Python installation
# containing the module, that the module is available in the Python version
# reticulate has selected, and that the module's VERSION is at least the
# required one. The imported module is cached in volatiles$cleannlp so the
# import only happens once. Each failed check raises an error through
# assert() with installation or upgrade instructions.
check_python <- function() {

  disc <- reticulate::py_discover_config(required_module="cleannlp")
  assert(
    !is.null(disc$required_module_path),
    "Python module 'cleannlp' not found. Install with:\n  pip install cleannlp"
  )

  assert(
    reticulate::py_module_available("cleannlp"),
    paste(c(
      "The 'cleannlp' appears to be available on your system, however\n",
      "the reticulate package has selected an alternative version of Python\n",
      "to the one where you installed the module. Restart R and run:\n\n",
      "   library(cleanNLP)\n\n",
      "prior to running any other code. If that still produces this error,\n",
      "restart R and manually select the version of Python before running\n",
      "any other functions with:\n\n",
      sprintf("   use_python(\"%s\")\n\n", disc$python)
    ), collapse=" ")
  )

  # import once and reuse the module object on later calls
  if (is.null(volatiles$cleannlp))
  {
    volatiles$cleannlp <- reticulate::import("cleannlp")
  }

  # Compare versions component-wise: a plain string comparison would rank
  # "1.0.10" below "1.0.3" and wrongly report a newer module as out of date.
  version_num_required <- "1.0.3"
  version_found <- as.character(volatiles$cleannlp$VERSION)
  version_ok <- FALSE
  if (length(version_found) == 1L && !is.na(version_found)) {
    parsed_found <- numeric_version(version_found, strict = FALSE)
    if (is.na(parsed_found)) {
      # version string is not purely numeric; keep the plain string comparison
      version_ok <- version_found >= version_num_required
    } else {
      version_ok <- parsed_found >= numeric_version(version_num_required)
    }
  }

  assert(
    version_ok,
    paste(c(
      "Python module 'cleannlp' was found, but is out of date.",
      "Upgrade with:\n  pip install -U cleannlp"
    ), collapse=" ")
  )
}

Try the cleanNLP package in your browser

Any scripts or data that you put into this service are public.

cleanNLP documentation built on Nov. 17, 2023, 1:06 a.m.