# inst/doc/intro.R

## ----echo = FALSE, message = FALSE--------------------------------------------
library(tibble)
# knitr::opts_chunk$set(comment = "")

# TRUE when "nld" is listed among the languages the tesseract installation
# reports as available.
has_nld <- is.element("nld", tesseract::tesseract_info()$available)

# For the user account 'jeroen' (presumably the package author's own
# machine -- confirm) the "nld" data is required to be present.
if (identical(Sys.info()[["user"]], "jeroen")) {
  stopifnot(has_nld)
}

# Pause for ten seconds when the working directory matches the
# 'tesseract.Rcheck' pattern.
if (grepl("tesseract.Rcheck", getwd())) {
  Sys.sleep(10)  # workaround for CPU time check
}

## -----------------------------------------------------------------------------
# Attach the package and create the engine object for "eng"; `eng` is reused
# by later parts of this script.
library(tesseract)
eng <- tesseract("eng")

# Run OCR on the sample image with that engine and print the text.
# The image is requested over HTTPS, consistent with the other remote
# resources used in this script.
text <- tesseract::ocr(
  "https://jeroen.github.io/images/testocr.png",
  engine = eng
)
cat(text)

## -----------------------------------------------------------------------------
# Process the same sample image with ocr_data() and the `eng` engine, then
# print the result. The image is requested over HTTPS, consistent with the
# other remote resources used in this script.
# NOTE(review): ocr_data() presumably returns a per-word table -- confirm
# against the tesseract package documentation.
results <- tesseract::ocr_data(
  "https://jeroen.github.io/images/testocr.png",
  engine = eng
)
results

## -----------------------------------------------------------------------------
# Print the information the tesseract installation reports about itself; this
# script also reads its `available` and `version` elements elsewhere.
tesseract_info()

## ----eval=FALSE---------------------------------------------------------------
# # The download only needs to be done once:
# tesseract_download("nld")

## ----eval = has_nld-----------------------------------------------------------
# The chunk header marks this code as conditional on `has_nld` (computed at
# the top of this script), so the same condition is enforced here: the Dutch
# example runs only when the "nld" language data is available.
if (has_nld) {
  # Now load the dictionary
  dutch <- tesseract("nld")
  print(dutch)

  # OCR the sample image with the Dutch engine and print the text.
  text <- ocr("https://jeroen.github.io/images/utrecht2.png", engine = dutch)
  cat(text)
} else {
  message("Skipping the Dutch example: 'nld' language data is not available.")
}

## -----------------------------------------------------------------------------
library(magick)
input <- image_read("https://jeroen.github.io/images/bowers.jpg")

# Preprocess the image (resize, convert to grayscale, trim, write as PNG) and
# hand the written result directly to ocr(), which is called without an
# explicit engine argument.
text <- tesseract::ocr(
  input %>%
    image_resize("2000x") %>%
    image_convert(type = "Grayscale") %>%
    image_trim(fuzz = 40) %>%
    image_write(format = "png", density = "300x300")
)

cat(text)

## ----eval=require(pdftools)---------------------------------------------------
# The chunk header only evaluates this code when pdftools can be loaded, so
# the same condition is enforced here. requireNamespace() checks for the
# package without attaching it; every call below is namespace-qualified.
if (requireNamespace("pdftools", quietly = TRUE)) {
  # Convert the remote PDF at 600 dpi, then run OCR on the conversion result.
  pngfile <- pdftools::pdf_convert(
    "https://jeroen.github.io/images/ocrscan.pdf",
    dpi = 600
  )
  text <- tesseract::ocr(pngfile)
  cat(text)
} else {
  message("Skipping the PDF example: package 'pdftools' is not installed.")
}

## -----------------------------------------------------------------------------
# Show every engine parameter whose name or description mentions "colour"
tesseract_params("colour")

## -----------------------------------------------------------------------------
# Keep only the `version` element of the installation info and print it.
tesseract::tesseract_info()["version"]

## -----------------------------------------------------------------------------
# Engine whose tessedit_char_whitelist option is set to the dollar sign, the
# dot and the ten digits.
numbers <- tesseract(
  options = list(tessedit_char_whitelist = "$.0123456789")
)

# OCR the receipt image with that engine and print the text.
cat(
  ocr("https://jeroen.github.io/images/receipt.png", engine = numbers)
)

## -----------------------------------------------------------------------------
# Same receipt, but the whitelist now omits the dollar sign: only the dot and
# the ten digits remain.
numbers2 <- tesseract(
  options = list(tessedit_char_whitelist = ".0123456789")
)

# OCR the receipt image with the stricter engine and print the text.
cat(
  ocr("https://jeroen.github.io/images/receipt.png", engine = numbers2)
)

# Try the tesseract package in your browser

# Any scripts or data that you put into this service are public.

# tesseract documentation built on April 4, 2025, 2:10 a.m.