R/spacy_initialize.R

Defines functions spacy_initialize spacy_finalize find_spacy find_spacy_env check_spacy_model set_spacy_python_option clear_spacy_options check_spacy_python_options save_spacy_options

Documented in find_spacy find_spacy_env spacy_finalize spacy_initialize

#' Initialize spaCy
#' 
#' Initialize spaCy to call from R. 
#' @return NULL
#' @param model Language package for loading spaCy. Example: \code{en_core_web_sm} (English) and
#' \code{de_core_web_sm} (German). Default is \code{en_core_web_sm}.
#' @param python_executable the full path to the Python executable, for which
#'   spaCy is installed
#' @param ask logical; if \code{FALSE}, use the first spaCy installation found;
#'   if \code{TRUE}, list available spaCy installations and prompt the user for
#'   which to use. If another (e.g. \code{python_executable}) is set, then this
#'   value will always be treated as \code{FALSE}.
#' @param virtualenv set a path to the Python virtual environment with spaCy
#'   installed Example: \code{virtualenv = "~/myenv"}
#' @param condaenv set a path to the anaconda virtual environment with spaCy
#'   installed Example: \code{condalenv = "myenv"}
#' @param entity logical; if \code{FALSE} is selected, named entity recognition
#'   is turned off in spaCy. This will speed up the parsing as it will exclude
#'   \code{ner} from the pipeline. For details of spaCy pipeline, see
#'   \url{https://spacy.io/usage/processing-pipelines}. The option \code{FALSE}
#'   is available only for spaCy version 2.0.0 or higher.
#' @param check_env logical; check whether conda/virtual environment generated
#'   by \code{spacyr_istall()} exists
#' @param refresh_settings logical; if \code{TRUE}, spacyr will ignore the saved
#'   settings in the profile and initiate a search of new settings.
#' @param save_profile logical; if \code{TRUE}, the current spaCy setting will
#'   be saved for the future use.
#' @export
#' @author Akitaka Matsuo
spacy_initialize <- function(model = "en_core_web_sm",
                             python_executable = NULL,
                             virtualenv = NULL,
                             condaenv = NULL,
                             ask = FALSE,
                             refresh_settings = FALSE,
                             save_profile = FALSE,
                             check_env = TRUE,
                             entity = TRUE) {

    # here are a number of checkings
    if (!is.null(options("spacy_initialized")$spacy_initialized)) {
        message("spaCy is already initialized")
        return(NULL)
    }

    # once python is initialized, you cannot change the python executables
    if (!is.null(options("python_initialized")$python_initialized)) {
        message("Python space is already attached.  If you want to switch to a different Python, please restart R.")
    }
    # NEW: if spacy_condaenv exists use it
    else {
        set_spacy_python_option(python_executable,
                                virtualenv,
                                condaenv,
                                check_env,
                                refresh_settings,
                                ask,
                                model)
    }

    ## check settings and start reticulate python
    settings <- check_spacy_python_options()
    if (!is.null(settings)) {
        ####
        if (settings$key == "spacy_python_executable") {
            if (check_spacy_model(settings$val, model) != "OK") {
                stop("spaCy or language model ", model, " is not installed in ", settings$val)
            }
            reticulate::use_python(settings$val, required = TRUE)
        }
        else if (settings$key == "spacy_virtualenv") reticulate::use_virtualenv(settings$val, required = TRUE)
        else if (settings$key == "spacy_condaenv") {
            reticulate::use_condaenv(settings$val, required = TRUE)
        }
    }
    options("python_initialized" = TRUE) # next line could cause non-recoverable error
    spacyr_pyexec(pyfile = system.file("python", "spacyr_class.py",
                                       package = "spacyr"))

    spacyr_pyassign("model", model)
    spacyr_pyassign("spacy_entity", entity)
    options("spacy_entity" = entity)
    spacyr_pyexec(pyfile = system.file("python", "initialize_spacyPython.py",
                                       package = "spacyr"))

    spacy_version <- spacyr_pyget("spacy_version")
    if (entity == FALSE && as.integer(substr(spacy_version, 1, 1)) < 2){
        message("entity == FALSE is only available for spaCy version 2.0.0 or higher")
        options("spacy_entity" = TRUE)
    }
    message("successfully initialized (spaCy Version: ", spacy_version, ", language model: ", model, ")")
    settings <- check_spacy_python_options()
    message('(python options: type = "', sub("spacy_", "", settings$key), '", value = "', settings$val, '")')
    options("spacy_initialized" = TRUE)

    if (save_profile == TRUE){
        save_spacy_options(settings$key, settings$val)
    }
}

#' Finalize spaCy
#' 
#' While running spaCy on Python through R, a Python process is always running
#' in the background and Rsession will take up a lot of memory (typically over
#' 1.5GB). \code{spacy_finalize()} terminates the Python process and frees up
#' the memory it was using.
#' @return NULL
#' @export
#' @author Akitaka Matsuo
spacy_finalize <- function() {
    if (is.null(getOption("spacy_initialized"))) {
        stop("Nothing to finalize. spaCy is not initialized")
    }
    spacyr_pyexec(pyfile = system.file("python", "finalize_spacyPython.py",
                                       package = "spacyr"))
    options("spacy_initialized" = NULL)
}

#' Find spaCy
#' 
#' Locate the user's version of Python for which spaCy installed.
#' @return spacy_python
#' @export
#' @param model name of the language model
#' @param ask logical; if \code{FALSE}, use the first spaCy installation found; 
#'   if \code{TRUE}, list available spaCy installations and prompt the user 
#'   for which to use. If another (e.g. \code{python_executable}) is set, then 
#'   this value will always be treated as \code{FALSE}.
#'  
#' @keywords internal
#' @importFrom data.table data.table
find_spacy <- function(model = "en_core_web_sm", ask){
    spacy_found <- `:=` <- NA
    spacy_python <- NULL
    options(warn = -1)
    py_execs <- if (is_windows()) {
        system2("where", "python", stdout = TRUE)
    } else if (is_osx() && file.exists("~/.bash_profile")) {
        c(system2("source", "~/.bash_profile; which -a python", stdout = TRUE),
          system2("source", "~/.bash_profile; which -a python3", stdout = TRUE))
    } else {
        c(system2("which", "-a python", stdout = TRUE),
          system2("which", "-a python3", stdout = TRUE))
    }
    py_execs <- unique(py_execs)
    options(warn = 0)

    if (length(py_execs) == 0 | grepl("not find", py_execs[1])[1]){
        return(NA)
    }
    df_python_check <- data.table::data.table(py_execs, spacy_found = 0)
    for (i in 1:nrow(df_python_check)) {
        py_exec <- df_python_check[i, py_execs]
        sys_message <- check_spacy_model(py_exec, model)
        if (sys_message == "OK") {
            df_python_check[i, spacy_found := 1]
        }
    }

    if (df_python_check[, sum(spacy_found)] == 0) {
        return(NULL)
    } else if (df_python_check[, sum(spacy_found)] == 1) {
        spacy_python <- df_python_check[spacy_found == 1, py_execs]
        message("spaCy (language model: ", model, ") is installed in ", spacy_python)
    } else if (ask == FALSE) {
        spacy_python <- df_python_check[spacy_found == 1, py_execs][1]
        message("spaCy (language model: ", model, ") is installed in more than one python")
        message("spacyr will use ", spacy_python, " (because ask = FALSE)")
    } else {
        spacy_pythons <- df_python_check[spacy_found == 1, py_execs]
        message("spaCy (language model: ", model, ") is installed in more than one python")
        number <- utils::menu(spacy_pythons, title = "Please select python:")
        if (number == 0) {
            stop("Initialization was canceled by user", call. = FALSE)
        }
        spacy_python <- spacy_pythons[number]
        message("spacyr will use: ", spacy_python)
    }
    return(spacy_python)
}


#' Find spaCy env
#' 
#' check whether conda/virtual environment for spaCy exists
#' @export
#'  
#' @keywords internal
find_spacy_env <- function(){
    if (is.null(tryCatch(reticulate::conda_binary("auto"), error = function(e) NULL))){
        return(FALSE)
    }
    found <- if ("spacy_condaenv" %in% reticulate::conda_list(conda = "auto")$name) {
        TRUE
    } else if (file.exists(file.path("~/.virtualenvs", "spacy_virtualenv", "bin", "activate"))) {
        TRUE
    } else {
        FALSE
    }
    return(found)
}
    

check_spacy_model <- function(py_exec, model) {
    options(warn = -1)
    py_exist <- if (is_windows()) {
        if (py_exec %in% system2("where", "python", stdout = TRUE)) {
            py_exec
        } else {
            NULL
        }
    } else {
        system2("which", py_exec, stdout = TRUE)
    }

    if (length(py_exist) == 0) {
        stop(py_exec, " is not a python executable")
    }
    tryCatch({
        sys_message <-
            system2(py_exec, c(sprintf("-c \"import spacy; spacy.load('%s'); print('OK')\"", model)),
                    stderr = TRUE, stdout = TRUE)
    })
    options(warn = 0)
    return(paste(sys_message, collapse = " "))
}


set_spacy_python_option <- function(python_executable = NULL,
                                    virtualenv = NULL,
                                    condaenv = NULL,
                                    check_env = TRUE,
                                    refresh_settings = FALSE,
                                    ask = NULL,
                                    model = NULL) {
    if (refresh_settings) clear_spacy_options()

    if (!is.null(check_spacy_python_options())) {
        settings <- check_spacy_python_options()
        message("spacy python option is already set, spacyr will use:\n\t",
                sub("spacy_", "", settings$key), ' = "', settings$val, '"')
    }
    # a user can specify only one
    else if (sum(!is.null(c(python_executable, virtualenv, condaenv))) > 1) {
        stop(paste("Too many python environments are specified, please select only one",
                   "from python_executable, virtualenv, and condaenv"))
    }
    # give warning when nothing is specified
    else if (sum(!is.null(c(python_executable, virtualenv, condaenv))) == 1){
        if (!is.null(python_executable)) {
            if (check_spacy_model(python_executable, model) != "OK"){
                stop("spaCy or language model ", model, " is not installed in ", python_executable)
            }
            clear_spacy_options()
            options(spacy_python_executable = python_executable)
        }
        else if (!is.null(virtualenv)) {
            clear_spacy_options()
            options(spacy_virtualenv = virtualenv)
        }
        else if (!is.null(condaenv)) {
            clear_spacy_options()
            options(spacy_condaenv = condaenv)
        }
    }
    else if (check_env &&
              !(is.null(tryCatch(reticulate::conda_binary("auto"), error = function(e) NULL))) &&
              "spacy_condaenv" %in% reticulate::conda_list(conda = "auto")$name) {
        message("Found 'spacy_condaenv'. spacyr will use this environment")
        clear_spacy_options()
        options(spacy_condaenv = "spacy_condaenv")
    }
    else if (check_env && file.exists(file.path("~/.virtualenvs", "spacy_virtualenv", "bin", "activate"))) {
        message("Found 'spacy_virtualenv'. spacyr will use this environment")
        clear_spacy_options()
        options(spacy_virtualenv = "~/.virtualenvs/spacy_virtualenv")
    }
    else {
        message("Finding a python executable with spaCy installed...")
        spacy_python <- find_spacy(model, ask = ask)
        if (is.null(spacy_python)) {
            stop("spaCy or language model ", model, " is not installed in any of python executables.")
        } else if (is.na(spacy_python)) {
            stop("No python was found on system PATH")
        } else {
            options(spacy_python_executable = spacy_python)
        }
    }
    return(NULL)
}

clear_spacy_options <- function(){
    options(spacy_python_executable = NULL)
    options(spacy_condaenv = NULL)
    options(spacy_virtualenv = NULL)
}

check_spacy_python_options <- function() {
    settings <- NULL
    for (k in c("spacy_python_executable",
               "spacy_condaenv",
               "spacy_virtualenv")) {
        if (!is.null(getOption(k))) {
            settings$key <- k
            settings$val <- getOption(k)
        }
    }
    return(settings)
}

save_spacy_options <- function(key, val, prompt = TRUE) {
    prof_file <- "~/.Rprofile"
    if (!is.null(getOption("spacy_prompt"))) prompt <- getOption("spacy_prompt")

    ans <- if (prompt) {
        utils::menu(c("No", "Yes"),
                    title = sprintf('Do you want to set the option, \'%s = "%s"\' , as a default (y|[n])? ', key, val))
    } else 2
    if (ans == 2) {
        rprofile <- if (file.exists(prof_file)) readLines(prof_file) else NULL
        rprofile <- grep("options\\(\\s*spacy_.+\\)", rprofile, value = TRUE, invert = TRUE)
        rprofile <- c(rprofile, sprintf('options(%s = "%s")', key, val))
        write(rprofile, file = prof_file)
        message("The option was saved. The option will be used in spacy_initialize() in future")
    } else {
        message("The option was not saved (user cancelled)")
    }
}

Try the spacyr package in your browser

Any scripts or data that you put into this service are public.

spacyr documentation built on March 26, 2020, 5:25 p.m.