R/tesseract.R

Defines functions df_as_tibble bail tesseract_engine parse_params tesseract_info tesseract_params

Documented in tesseract_info tesseract_params

#' Tesseract Engine
#'
#' Create an OCR engine for a given language and control parameters. This can be used by
#' the [ocr] and [ocr_data] functions to recognize text.
#'
#' Tesseract control parameters can be set either via a named list in the
#' `options` parameter, or in a `config` file text file which contains the parameter name
#' followed by a space and then the value, one per line. Use [tesseract_params()] to list
#' or find parameters. Note that that some parameters are only supported in certain versions
#' of libtesseract, and that invalid parameters can sometimes cause libtesseract to crash.
#'
#' @export
#' @return no return value, called for side effects
#' @rdname tesseract
#' @family tesseract
#' @param language string with language for training data. Usually defaults to `eng`
#' @param datapath path with the training data for this language. Default uses
#' the system library.
#' @param configs character vector with files, each containing one or more parameter
#' values. These config files can exist in the current directory or one of the standard
#' tesseract config files that live in the tessdata directory. See details.
#' @param options a named list with tesseract parameters. See details.
#' @param cache speed things up by caching engines
tesseract <- local({
  store <- new.env()
  function(language = "eng", datapath = NULL, configs = NULL, options = NULL, cache = TRUE) {
    datapath <- normalizePath(as.character(datapath), mustWork = TRUE)
    language <- as.character(language)
    configs <- as.character(configs)
    options <- as.list(options)
    if (isTRUE(cache)) {
      key <- digest::digest(list(language, datapath, configs, options))
      if (is.null(store[[key]])) {
        ptr <- tesseract_engine(datapath, language, configs, options)
        assign(key, ptr, store)
      }
      store[[key]]
    } else {
      tesseract_engine(datapath, language, configs, options)
    }
  }
})

#' @export
#' @return no return value, called for side effects
#' @rdname tesseract
#' @param filter only list parameters containing a particular string
#' @examples
#' tesseract_params("smooth")
tesseract_params <- function(filter = "") {
  tmp <- print_params(tempfile())
  on.exit(unlink(tmp))
  df <- parse_params(tmp)
  subset <- grepl(filter, paste(df$param, df$desc), ignore.case = TRUE)
  df_as_tibble(df[subset, ])
}

#' @export
#' @return list with information about the tesseract engine
#' @rdname tesseract
tesseract_info <- function() {
  info <- engine_info_internal(tesseract())
  config <- tesseract_config()
  list(
    datapath = info$datapath,
    available = info$available,
    version = config$version,
    configs = list.files(file.path(info$datapath, "configs"))
  )
}

parse_params <- function(path) {
  utils::read.delim(path,
    header = FALSE, quote = "",
    col.names = c("param", "default", "desc"), stringsAsFactors = FALSE
  )
}

tesseract_engine <- function(datapath, language, configs, options) {
  # Tesseract::read_config_file first checks for local file, then in tessdata
  lapply(configs, function(confpath) {
    if (file.exists(confpath)) {
      params <- tryCatch(utils::read.table(confpath, quote = ""), error = function(e) {
        bail("Failed to parse config file '%s': %s", confpath, e$message)
      })
      ok <- validate_params(params$V1)
      if (any(!ok)) {
        bail("Unsupported Tesseract parameter(s): [%s] in %s", paste(params$V1[!ok], collapse = ", "), confpath)
      }
    }
  })

  opt_names <- as.character(names(options))
  opt_values <- as.character(options)
  ok <- validate_params(opt_names)
  if (any(!ok)) {
    bail("Unsupported Tesseract parameter(s): [%s]", paste(opt_names[!ok], collapse = ", "))
  }

  tesseract_engine_internal(datapath, language, configs, opt_names, opt_values)
}

bail <- function(...) {
  stop(sprintf(...), call. = FALSE)
}

df_as_tibble <- function(df) {
  stopifnot(is.data.frame(df))
  class(df) <- c("tbl_df", "tbl", "data.frame")
  df
}

Try the cpp11tesseract package in your browser

Any scripts or data that you put into this service are public.

cpp11tesseract documentation built on April 4, 2025, 5:24 a.m.