R/init_and_install.R

Defines functions check_virtualenv_py py_install_method_detect .activate_env check_sentiment.ai init_sentiment.ai install_default_embeddings install_scoring_model install_sentiment.ai

Documented in check_sentiment.ai init_sentiment.ai install_default_embeddings install_scoring_model install_sentiment.ai

#' setup
#' @description Install and Setup sentiment.ai package
#'
#' @inheritParams reticulate::py_install
#' @param method Installation method. By default, "auto" automatically finds a
#'        method that will work in the local environment. Change the default to
#'        force a specific installation method. Note that the "virtualenv"
#'        method may not be available on Windows due to a tensorflow issue. Note
#'        also that since this command runs without privilege the "system"
#'        method is available only on Windows.
#' @param gpu Whether GPU should be enabled when installing TensorFlow
#' @param modules A list of modules needed for installing tensorflow. See
#'        details for more information. Only change this argument if you know
#'        what you are doing!
#' @param fresh_install Whether to create the Python environment prior to
#'        installing the modules or to install everything in an existing
#'        environment (if one exists). Only change this argument if you know what
#'        you are doing! If the environment does not already exist, will create
#'        the environment first.
#' @param restart_session Whether to restart the R session after finishing
#'        installation. Only works on Rstudio.
#' @param model path to tensorflow hub embedding model. default is both universal
#'        sentence encoder en (default) and multi.
#' @details
#' Sets up environment specific for sentiment.ai. The packages that it currently
#' needs are as follows:
#'
#' | Module          | Version |
#' | :-------------- | :-----: |
#' | python          | 3.8.10  |
#' | numpy           | 1.19.5  |
#' | tensorflow      | 2.4.1   |
#' | tensorflow_hub  | 0.12.0  |
#' | tensorflow-text | 2.4.3   |
#' | sentencepiece   | 0.1.95  |
#'
#' Please do not change these unless you know what you are doing.
#'
#' Note that it installs like \code{tensorflow::install_tensorflow} with
#' \code{pip = TRUE}.
#'
#' @note
#' Setting environments with \code{reticulate} is notoriously difficult. If the
#' RETICULATE_PYTHON environment is set, then reticulate will not let you change
#' the Python binary used (or the Python environment) using \code{use_condaenv}
#' **or** \code{use_virtualenv}. This environment can be accidentally set in
#' the following ways:
#'
#' 1. If RETICULATE_PYTHON is in your .Renviron file or bash/zsh rc files. This is
#'    the most obvious place that this environment will be set.
#' 2. Using Project Options or Global Options under "Python>Python Interpreter".
#'    If this is set, then reticulate will almost always use this version of Python
#'    and will not let you change.
#' 3. If you have already loaded reticulate and have run `py_config`. Once a Python
#'    version/environment is instantiated, you will not be able to change it and
#'    will have to restart R.
#' 4. If you are in **any** project, at all! Currently (as of `reticulate` version
#'    1.22), every project automatically sets the RETICULATE_PYTHON environment
#'    variable, either through the Global or Project Options or by using heuristics.
#'    If you are in an RStudio project, you **must** update Global/Project Options
#'    with the specific version/environment of Python that you want to use, or
#'    you will not be able to change it!
#'
#' Manually setting the environment variable to NULL (using
#' `Sys.unsetenv("RETICULATE_PYTHON")`), updating your Project/Global options
#' (go to Tools>Project Options or Tools>Global Options, select Python in the
#' left menu bar and click the "Select" button to manually set the Python
#' interpreter), and/or restarting your R session **might** fix the problem.
#'
#' We know this is a pain, and we would like to fix this for you, but we are
#' dependent on the RStudio/reticulate team to update how they determine the
#' allowable Python versions/environments.
#'
#' @examples
#' \dontrun{
#' install_sentiment.ai(envname = "r-sentiment-ai",
#'                      method  = "conda",
#'                      python_version = "3.8.10")
#' init_sentiment.ai(model   = "en.large",
#'                   envname = "r-sentiment-ai")
#' check_sentiment.ai()
#'
#' # if you run into an issue, follow the instructions/see the note and retry!
#' }


# 1. INSTALL ===================================================================


# install environment and dependencies

#' @rdname setup
#' @return NULL this function simply installs the required python dependencies and default scoring models and pre-calculated embedding vectors.
#' @importFrom roperators "%ni%"
#' @import tensorflow
#' @import tfhub
#' @export
install_sentiment.ai <- function(envname = "r-sentiment-ai",
                                 method  = c("auto", "virtualenv", "conda"),
                                 gpu     = FALSE,
                                 python_version = "3.8.10",
                                 modules = list(numpy             = "1.19.5",
                                                sentencepiece     = "0.1.95",
                                                tensorflow        = "2.4.1",
                                                tensorflow_hub    = "0.12.0",
                                                `tensorflow-text` = "2.4.3"),
                                 fresh_install   = TRUE,
                                 restart_session = TRUE,
                                 ...){

  # Overview: (optionally) create the Python environment, pip-install the
  # pinned modules into it, download the scoring models and pre-calculated
  # embeddings, then restart (or ask the user to restart) the R session.

  # TODO: still have install issues on some setups ... should do something
  # about this

  method <- match.arg(method)

  # if environment is missing, set it to r-sentiment-ai
  if(length(envname) == 0){
    envname <- "r-sentiment-ai"
  }

  # if method is default, figure out method using reticulate
  if(method == "auto"){
    method <- py_install_method_detect(
      envname = envname,
      ...
    )
  }

  # 1. parse tensorflow version name -----------------------------------------

  tf_module <- "tensorflow"

  # make sure tensorflow is in the modules list
  if(tf_module %ni% names(modules)){
    stop("tensorflow version must be specified in 'modules'",
         call. = FALSE)
  }

  # make sure gpu is TRUE or FALSE
  gpu <- isTRUE(gpu)

  if(gpu && roperators::is.os_mac()){
    warning("gpu not available for OSX; setting gpu flag to FALSE",
            call. = FALSE)
    gpu <- FALSE
  } else if(gpu){
    message("gpu flag is TRUE; installation needs CUDA configured")
    # the gpu build is requested under the name "tensorflow-gpu"
    names(modules)[names(modules) %in% tf_module] <- paste0(
      tf_module, "-", "gpu"
    )
  }

  # 2. parse other module names and versions -----------------------------------

  # every module needs exactly one version and a non-empty name, otherwise the
  # "name==version" spec handed to pip would be malformed (e.g. "==1.0")
  stopifnot(all(lengths(modules) == 1),
            !anyNA(names(modules)),
            all(nzchar(names(modules))))

  # paste name with the version to get pip requirement specifiers
  modules_vers <- paste0(names(modules), "==", modules)

  # 3. install everything ------------------------------------------------------

  # if fresh install, create environment first
  if(fresh_install){
    switch(
      EXPR       = method,
      virtualenv = {
        # tries to create the env; may offer to install the Python version
        check_virtualenv_py(envname = envname, version = python_version, ...)

        # if Python had to be installed first, the env still needs creating
        if(envname %ni% as.character(reticulate::virtualenv_list())){
          reticulate::virtualenv_create(envname = envname,
                                        version = python_version,
                                        ...)
        }
      },
      conda      = reticulate::conda_create(envname        = envname,
                                            python_version = python_version,
                                            ...)
    )
  } else{
    message("Because 'fresh_install = FALSE', not creating environment before installing.\n",
            "Only do this if you know what you are doing, as you might have conflicting\n",
            "installations and/or reticulate might not be able to find the correct environment.")
  }

  reticulate::py_install(
    packages       = modules_vers,
    envname        = envname,
    method         = method,
    python_version = python_version,
    pip            = TRUE,
    ...
  )

  message("Successfully created ", method, " environment: ", envname)

  # Since they already have internet here, install.
  message("Installing default scoring model from Github")
  install_scoring_model(scoring = "xgb")
  install_scoring_model(scoring = "glm")

  # also pull precalculated embeddings
  message("Installing pre-calculated embeddings from Github")
  install_default_embeddings()

  # restart session if requested and possible, otherwise ask the user to do it
  if(restart_session && rstudioapi::hasFun("restartSession")){
    rstudioapi::restartSession()
  } else{
    message("Please restart your R session")
  }

  invisible(NULL)
}


#' Install a Scoring Model
#'
#' @param model The embedding model, one of c("en.large", "en", "multi.large",
#'        "multi").
#' @param scoring The scoring model, currently one of:
#'   - "xgb" does default xgboost
#'   - "glm" does generalized linear model (if you can't run xgboost)
#' @param scoring_version Version of scoring model (will add more over time)
#' @param ... Additional options to the function, including:
#'   - repo_url: OPTIONAL custom github repo url for external scoring models.
#'     The default repo_url is "https://github.com/BenWiseman/sentiment.ai/raw/main/models"
#'
#' @return
#' 0 if model did not need to be downloaded.
#' 1 if model needed to be downloaded.
#'
#' @details
#' This downloads the scoring models from a set repository in order to keep the
#' main package within CRAN size limits.
#'
#' In the future, this will also make it possible for the community to add new
#' and improved models!
#'
#' @importFrom roperators "%ni%"
install_scoring_model <- function(model   =  c("en.large", "en", "multi.large", "multi"),
                                  scoring = c("xgb", "glm"),
                                  scoring_version = "1.0",
                                  ...){

  # for return status: 0 = nothing downloaded, 1 = model downloaded
  status <- 0

  # passthrough optional repo_url (exact name match only, no partial matching)
  opts     <- list(...)
  repo_url <- opts[["repo_url"]]

  if(is.null(repo_url)){
    repo_url <- "https://github.com/BenWiseman/sentiment.ai/raw/main/models"
  }

  # `model` is restricted to the known embedding models; `scoring` and
  # `scoring_version` are deliberately NOT run through match.arg so that custom
  # repositories can host other scoring models/versions
  model           <- match.arg(model)
  scoring         <- scoring[1]
  scoring_version <- scoring_version[1]

  # glm models will be plain text for max compatibility
  file_ext  <- if(scoring == "glm") "csv" else scoring
  file_name <- paste0(model, ".", file_ext)

  # <repo_url>/<scoring>/<version>/<file>?raw=true
  target_url <- paste(repo_url, scoring, scoring_version, file_name, sep = "/")
  target_url <- paste0(target_url, "?raw=true")

  # download location: <pkg_dir>/scoring/<scoring>/<version>/<file>
  pkg_name <- utils::packageName()
  pkg_path <- system.file(package = pkg_name)
  dl_path  <- file.path(pkg_path, "scoring", scoring, scoring_version)
  obj_path <- file.path(dl_path, file_name)

  # should dl_path go to a library or the package dir? if it's in the package dir
  # they will always have to reinstall all of the scoring models if upgraded

  # if model exists there is nothing to do
  if(file.exists(obj_path)) return(0)

  # model doesn't exist: make sure the target directory is there
  if(!dir.exists(dl_path)){
    dir.create(dl_path,
               showWarnings = FALSE,
               recursive    = TRUE)
  }

  message("Downloading ", model, ": ", scoring, " ", scoring_version,  " from github")

  tryCatch({
    # binary mode is safe for both the text (csv) and binary (xgb) models
    dl_code <- utils::download.file(url      = target_url,
                                    destfile = obj_path,
                                    mode     = "wb")

    # download.file() reports some failures through a non-zero return code
    # rather than an error
    if(!identical(as.integer(dl_code), 0L)){
      stop("download.file() returned status ", dl_code, call. = FALSE)
    }

    status <- 1

  }, error = function(e){

    # do not leave a partial/empty file behind: the next call would find it
    # and wrongly conclude that the model is already installed
    if(file.exists(obj_path)) unlink(obj_path)

    message("Attempt to pull scoring models failed.\n",
            "Unfortunately these are necessary and can't be bundled on CRAN!\n",
            "The error was:\n", conditionMessage(e))

  })

  return(status)
}

#' Install the default embeddings for `sentiment_match()`
#'
#' Downloads the pre-calculated default embeddings from GitHub. They are not
#' bundled with the package in order to keep the package size under 5Mb.
#'
#' @return
#' 0 if the embeddings were already present, or if the download failed (in
#' which case they can still be generated on the fly - will take a few seconds!).
#' 1 if the embeddings were downloaded.
install_default_embeddings <- function(){
  # for return status: 0 = nothing downloaded, 1 = embeddings downloaded
  status <- 0

  # base folder
  repo_url <- "https://github.com/BenWiseman/sentiment.ai/raw/main/default_embeddings"

  # to get right version
  # (update version manually when default embeddings change!)
  version   <- "0.1.0" # utils::packageDescription("sentiment.ai", fields = "Version")
  file_name <- paste0(version, ".json")

  # <repo_url>/<version>.json?raw=true
  target_url <- paste(repo_url, file_name, sep = "/")
  target_url <- paste0(target_url, "?raw=true")

  # download location: <pkg_dir>/default_embeddings/<version>.json
  pkg_path <- system.file(package = "sentiment.ai")
  dl_path  <- file.path(pkg_path, "default_embeddings")
  obj_path <- file.path(dl_path, file_name)

  # should dl_path go to a library or the package dir? if it's in the package dir
  # they will always have to reinstall the embeddings if upgraded

  # if the embeddings exist there is nothing to do
  if(file.exists(obj_path)) return(0)

  # embeddings don't exist: make sure the target directory is there
  if(!dir.exists(dl_path)){
    dir.create(dl_path,
               showWarnings = FALSE,
               recursive    = TRUE)
  }

  message("Downloading precalculated default embeddings v.", version, " from github")

  tryCatch({
    dl_code <- utils::download.file(url      = target_url,
                                    destfile = obj_path,
                                    mode     = "wb")

    # download.file() reports some failures through a non-zero return code
    # rather than an error
    if(!identical(as.integer(dl_code), 0L)){
      stop("download.file() returned status ", dl_code, call. = FALSE)
    }

    status <- 1

  }, error = function(e){

    # do not leave a partial/empty file behind: the next call would find it
    # and wrongly conclude that the embeddings are already installed
    if(file.exists(obj_path)) unlink(obj_path)

    message("Attempt to pull pre-calculated embeddings failed.\n",
            "This should only be a problem for speed as they can still be calculated on the fly!\n",
            "The error was:\n", conditionMessage(e))

  })

  return(status)

}
# 2. INITIALIZE ================================================================

#' @rdname setup
#' @param silent logical - do you want to suppress console logging? Can't affect tensorflow/GPU/python/c++ output, unfortunately.
#' @return python function to embed text can be returned, but is not necessary.
#'         `embed_text()` does this for you.
#' @export
init_sentiment.ai <- function(model   = c("en.large", "multi.large", "en", "multi"),
                              envname = "r-sentiment-ai",
                              method  =  c("auto", "virtualenv", "conda"),
                              silent  = FALSE){

  # Overview: activate the Python environment, source the Python helper that
  # defines `load_language_model`, load the requested embedding model and
  # store the resulting embedder in the package-level `sentiment.env`.

  method <- match.arg(method)

  # determining package name and base path
  pkg_name <- utils::packageName()
  pkg_path <- system.file(package = pkg_name)

  # can reduce GPU out of memory issues
  Sys.setenv("TF_FORCE_GPU_ALLOW_GROWTH" = "true")

  # 1. activating environment --------------------------------------------------

  # NOTE: must be called directly from this function body; `r_envir = -2` is a
  # frame offset that .activate_env hands to eval(), so wrapping this call in
  # another function would change which frame is used
  .activate_env(envname, silent = silent, r_envir = -2, method = method)

  # 2. preparing model ---------------------------------------------------------

  # make sure load_language_model is NULL to suppress NOTE
  load_language_model <- NULL

  # load things (including load_language_model)
  if(!silent) message("Preparing Model")
  reticulate::source_python(
    system.file("get_embedder.py", package = pkg_name)
  )

  # pull out model link
  model <- choose_model(model)

  # use a persistent cache folder inside the package directory
  # (needed or it will save to temp, which will throw a fit after restart due to
  # "OSError: SavedModel file does not exist at: path/to/temp/dir")
  cache_dir <- file.path(pkg_path, "tfhub_modules")

  # make sure the directory has been created
  dir.create(path         = cache_dir,
             showWarnings = FALSE,
             recursive    = TRUE)

  # 3. load and store the embedder ---------------------------------------------

  # sentiment.env is the package-level environment that holds the embedder
  env <- sentiment.ai::sentiment.env

  if(!silent){
    # do it with all the details
    env$embed <- load_language_model(model, cache_dir)
  } else{
    # just give reduced message that it'll take a while
    message("Loading language model...")
    env$embed <- suppressMessages(load_language_model(model, cache_dir))
  }

  env$embed
}

#' @rdname setup
#' @return NULL this function checks if `init_sentiment.ai()` has been called
#'         successfully, if not, it is called.
#' @export
check_sentiment.ai <- function(...){

  # an embedder stored in the package environment means init has already run
  embedder_ready <- !is.null(sentiment.ai::sentiment.env$embed)

  # Initialise on demand, forwarding `...` to init_sentiment.ai().
  # Deliberately silent when the embedder already exists: a message on every
  # call would get annoying, especially inside *apply loops.
  if(!embedder_ready){
    message("Preparing model (this may take a while).\n",
            "Consider running init_sentiment.ai().")
    init_sentiment.ai(...)
  }

  NULL
}

# 3. HELPER FUNCTIONS ==========================================================

# Activate sentiment.ai Environment
#
# Works out whether `envname` is a virtualenv or a conda environment (when
# `method = "auto"`), asks reticulate to use it, and then checks that the
# Python configuration reticulate discovers points at that environment.
#
# envname: name of the environment (a single string)
# silent:  suppress progress messages?
# r_envir: frame offset handed to eval() for the reticulate::use_*env() call;
#          `envname` must be visible in that frame
# method:  "auto", "virtualenv" or "conda"
#
# Returns TRUE on success and stops with an error otherwise.
.activate_env <- function(envname = "r-sentiment-ai",
                          silent  = FALSE,
                          r_envir = -1,
                          method  = c("auto", "virtualenv", "conda")){

  method <- match.arg(method)

  # validate the environment name before it is used in any lookup
  stopifnot(is.character(envname),
            length(envname) == 1)

  # 1. pull the environment lists ----------------------------------------------

  # If there is a problem listing environments: always tell the user when a
  # specific method was requested, otherwise only when silent is FALSE.
  # Falls back to an empty list so the lookups below still work.
  list_failed <- function(e){
    if(method != "auto" || !silent) message(conditionMessage(e))
    character(0)
  }

  venv_envs  <- tryCatch(reticulate::virtualenv_list(), error = list_failed)
  conda_envs <- tryCatch(reticulate::conda_list()$name, error = list_failed)

  # 2. find method -------------------------------------------------------------

  if(method == "auto"){

    # failover when detection errors: look the name up in the lists pulled above
    auto_failover <- function(e){
      message(conditionMessage(e))
      if(!silent) message("Attempting failover which will prioritise conda if available.")
      # priority to conda (not better, just most popular)
      if(envname %in% conda_envs) return("conda")
      if(envname %in% venv_envs)  return("virtualenv")
      "auto"
    }

    # envname is a required argument of py_install_method_detect
    method <- tryCatch(
      suppressWarnings(py_install_method_detect(envname = envname)),
      error = auto_failover
    )

    # sanity check
    if(identical(method, "auto")){
      stop("py_install_method_detect failed and environment could not be found!")
    }

    # let user know if they're venv or conda
    if(!silent) message("Attempting to activate ", method, " environment...")
  }

  # 3. make sure the environment name is in the list ---------------------------

  all_envs <- c(venv_envs, conda_envs)

  if(envname %ni% all_envs){
    stop(envname, " environment not available. ",
         "run install_sentiment.ai() on envname '", envname, "' first.",
         call. = FALSE)
  }

  # 4. activate ----------------------------------------------------------------

  # 0.1.1 patch to allow user specified environment
  env_expr <- switch(
    method,
    conda      = expression(reticulate::use_condaenv(envname, required = TRUE)),
    virtualenv = expression(reticulate::use_virtualenv(envname, required = TRUE))
  )

  # NOTE: keep this eval() directly in this function body; `r_envir` is a frame
  # offset, so moving the call into a helper would change the frame it refers to
  eval(expr  = env_expr,
       envir = r_envir)

  # 5. check that the environment really is active -----------------------------

  # pull out the python environment
  py_ver_def <- Sys.getenv("RETICULATE_PYTHON")

  # py config depends on OS - NULL entries are dropped by c()
  py_config  <- reticulate::py_discover_config()
  py_path    <- c(py_config$exec_prefix,
                  py_config$pythonhome)

  py_env_set <- vapply(py_path, normalizePath, character(1))

  # the environment is considered active if a discovered path ends in its name
  py_env_ok  <- any(endsWith(py_env_set, envname))

  if(!py_env_ok){
    if(nchar(py_ver_def) > 0){
      # doesn't work if in projects unless the RETICULATE_PYTHON is set correctly
      text <- c("The RETICULATE_PYTHON environment variable is set, which can be due to",
                "being in a project (regardless of whether the global/project Python options",
                "are set), having the global/project Python options set, or having",
                "RETICULATE_PYTHON in your .Renviron file or bash/zsh rc files.")
    } else{
      text <- c("RETICULATE_PYTHON environment variable is not set, so that shouldn't be the problem.",
                "Double check reticulate/python are installed correctly, that the correct dependencies are installed, and that the environment is loading.",
                "In testing, this also happened when using Microsoft R Open and reticulate 1.6.")
    }

    head_text <- "Internal error when checking that the environment is active."
    tail_text <- c("We appreciate that getting conda environments correct with reticulate is a pain!",
                   "If you have difficulties with environments in RStudio, go to tools>Global Options>Python.",
                   "From there you can force RStudio to use the proper environment if reticulate isn't working.")

    create_error_text(head_text, text, "", tail_text, "")
    stop("env ", envname, " is not active. try restarting R and/or changing default environment.",
         call. = FALSE)
  }

  TRUE
}

# Detect which installation method to use for `envname`.
#
# Thin wrapper around the internal install_method_detect(), which is a local
# copy of reticulate:::py_install_method_detect (see local_from_reticulate.R).
# An empty/NULL `conda` falls back to "auto"; `...` is accepted but not used.
py_install_method_detect <- function(envname,
                                     conda = "auto",
                                     ...){

  conda_bin <- if(length(conda) == 0) "auto" else conda

  install_method_detect(envname = envname, conda = conda_bin)
}



# internal func for virtualenv - may need user to install specific python version
#
# Tries to create the virtualenv `envname` with Python `version`. If that fails
# because the Python version is missing (reticulate suggests install_python),
# asks the user whether to install it. The environment is NOT re-created here
# after installing Python (doing both in the same call failed for some reason),
# so the caller has to create it afterwards. Any other error is re-thrown.
#
# envname: name of the virtualenv
# version: Python version to use / install
# ...:     passed on to reticulate::virtualenv_create()
check_virtualenv_py <- function(envname, version, ...){

  tryCatch(
    reticulate::virtualenv_create(
      envname = envname,
      version = version,
      ...
    ),
    error = function(e){

      # show the original error text (without re-signalling the condition)
      err_text <- conditionMessage(e)
      message(err_text)

      # anything other than the "missing python" error is not ours to handle
      python_missing <- grepl("Try installing it with install_python",
                              err_text,
                              fixed = TRUE)
      if(!python_missing){
        stop(e)
      }

      answer <- readline(paste0("Python ", version, " is missing. Install now? [Y/n]:\n"))

      if(grepl("^Y", answer, ignore.case = TRUE)){
        # install python; the caller creates the environment afterwards
        reticulate::install_python(version = version)
      }
    }
  )

}

Try the sentiment.ai package in your browser

Any scripts or data that you put into this service are public.

sentiment.ai documentation built on March 19, 2022, 2:15 a.m.