#' OCR document synchronously
#'
#' @description Sends a single document to the Google Cloud Services (GCS)
#' Document AI v1 API for synchronous (immediate) processing. Returns an
#' HTTP response object containing the OCRed text and additional data.
#'
#' @param file path to a pdf or image file.
#' @param proj_id a GCS project id.
#' @param proc_id a Document AI processor id.
#' @param proc_v one of 1) a processor version name, 2) "stable" for the
#' latest processor from the stable channel, or 3) "rc" for the latest
#' processor from the release candidate channel.
#' @param skip_rev whether to skip human review; "true" or "false".
#' @param loc a two-letter region code; "eu" or "us".
#' @param token an authentication token generated by \code{dai_auth()} or
#' another auth function.
#'
#' @return an HTTP response object.
#'
#' @details Requires a GCS access token and some configuration of the
#' .Renviron file; see package vignettes for details. Input files can be in
#' .pdf, .bmp, .gif, .jpeg, .jpg, .png, .tif, .tiff, or .webp format. PDF files
#' can be up to five pages long. Extract the text from the response object with
#' \code{text_from_dai_response()}. Inspect the entire response object with
#' \code{httr::content()}.
#' @export
#'
#' @examples
#' \dontrun{
#' response <- dai_sync("doc_page.pdf")
#'
#' response <- dai_sync("doc_page.pdf",
#' proc_v = "pretrained-ocr-v1.1-2022-09-12"
#' )
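#'
#' # Extract the text from a successful response, as described in @details:
#' text <- text_from_dai_response(response)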
#' }
dai_sync <- function(file,
proj_id = get_project_id(),
proc_id = Sys.getenv("DAI_PROCESSOR_ID"),
proc_v = NA,
skip_rev = "true",
loc = "eu",
token = dai_token()) {
# Check inputs
if (!(is.character(file) && length(file) == 1)) {
stop("Invalid file input.")
}
extension <- tolower(stringr::str_extract(file, "(?<=\\.)\\w{3,4}$"))
supported <- c("bmp", "gif", "jpeg", "jpg", "pdf", "png", "tif", "tiff", "webp")
if (!(extension %in% supported)) {
stop("Unsupported file format. DAI accepts only bmp, gif, jpeg, jpg, pdf, png, tif, tiff, and webp.")
}
if (extension == "pdf" && !(is_pdf(file))) {
stop("Input file not a real pdf. Is the file in your working directory?")
}
if (!(is.character(proj_id) && length(proj_id) == 1)) {
stop("Invalid proj_id.")
}
if (!(is.character(proc_id) && length(proc_id) == 1) || proc_id == "") {
stop("Invalid proc_id.")
}
if (!(length(proc_v) == 1)) {
stop("Invalid proc_v.")
}
if (!(is.na(proc_v) || is.character(proc_v))) {
stop("Invalid proc_v.")
}
skip_rev <- tolower(skip_rev)
  if (!(length(skip_rev) == 1 && skip_rev %in% c("true", "false"))) {
stop("Invalid skip_rev parameter.")
}
loc <- tolower(loc)
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid location parameter.")
}
# Encode
if (extension == "pdf") {
encoded_file <- pdf_to_binbase(file)
} else {
encoded_file <- img_to_binbase(file)
}
## Create json request body
req <- list(
"skipHumanReview" = skip_rev,
"rawDocument" = list(
"content" = encoded_file,
"mimeType" = "image/png"
)
)
bod <- jsonlite::toJSON(req, auto_unbox = TRUE)
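  # Note: mimeType stays "image/png" on the assumption that both
  # pdf_to_binbase() and img_to_binbase() rasterize their input to PNG
  # before base64-encoding it.
  # For reference, `bod` now holds a JSON body of roughly this shape
  # (Document AI v1 "process" method; content truncated):
  # {
  #   "skipHumanReview": "true",
  #   "rawDocument": { "content": "<base64...>", "mimeType": "image/png" }
  # }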
## Build URL and submit API request
base_url <- glue::glue("https://{loc}-documentai.googleapis.com/")
path <- glue::glue("v1/projects/{proj_id}/locations/{loc}/processors/{proc_id}")
if (is.na(proc_v)) {
version <- ""
} else {
version <- glue::glue("/processorVersions/{proc_v}")
}
method <- ":process"
url <- glue::glue("{base_url}{path}{version}{method}")
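  # Example of a resulting URL (with hypothetical ids):
  # https://eu-documentai.googleapis.com/v1/projects/my-project/locations/eu/processors/abc123:process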
response <- httr::POST(url,
httr::config(token = token),
body = bod
)
if (response$status_code == 200) {
cli::cli_alert_success(glue::glue("File submitted at {response$date}. HTTP status: 200 - OK."))
} else {
parsed <- httr::content(response)
cli::cli_alert_danger(glue::glue('File submitted at {response$date}. HTTP status: {response$status_code} - unsuccessful.\nError: "{parsed$error$message}"'))
}
response
}
#' OCR documents asynchronously
#'
#' @description Sends files from a Google Cloud Services (GCS) Storage
#' bucket to the GCS Document AI v1 API for asynchronous (offline) processing.
#' The output is delivered to the same bucket as JSON files containing
#' the OCRed text and additional data.
#'
#' @param files a vector or list of pdf or image filepaths in a GCS Storage
#' bucket. Filepaths must include all parent bucket folder(s) except the
#' bucket name.
#' @param dest_folder the name of the GCS Storage bucket subfolder where
#' you want the json output
#' @param bucket the name of the GCS Storage bucket where the files
#' to be processed are located
#' @param proj_id a GCS project id
#' @param proc_id a Document AI processor id
#' @param proc_v one of 1) a processor version name, 2) "stable" for the
#' latest processor from the stable channel, or 3) "rc" for the latest
#' processor from the release candidate channel.
#' @param skip_rev whether to skip human review; "true" or "false"
#' @param loc a two-letter region code; "eu" or "us"
#' @param token an access token generated by `dai_auth()` or another
#' auth function
#' @return an HTTP response object.
#'
#' @details Requires a GCS access token and some configuration of the
#' .Renviron file; see package vignettes for details. Currently, a
#' \code{dai_async()} call can contain a maximum of 50 files (but a
#' multi-page pdf counts as one file). You cannot have more than
#' 5 batch requests and 10,000 pages undergoing processing at any one time.
#' Maximum pdf document length is 2,000 pages. With long pdf documents,
#' Document AI divides the JSON output into separate files ('shards') of
#' 20 pages each. If you want longer shards, use \code{dai_tab_async()},
#' which accesses another API endpoint that allows for shards of up to
#' 100 pages.
#' @export
#'
#' @examples
#' \dontrun{
#' # with daiR configured on your system, several parameters are automatically provided,
#' # and you can pass simple calls, such as:
#' dai_async("my_document.pdf")
#'
#' # NB: Include all parent bucket folders (but not the bucket name) in the filepath:
#' dai_async("for_processing/pdfs/my_document.pdf")
#'
#' # Bulk process by passing a vector of filepaths in the files argument:
#' dai_async(my_files)
#'
#' # Specify a bucket subfolder for the json output:
#' dai_async(my_files, dest_folder = "processed")
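#'
#' # Track the job's progress:
#' response <- dai_async(my_files)
#' dai_status(response)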
#' }
dai_async <- function(files,
dest_folder = NULL,
bucket = Sys.getenv("GCS_DEFAULT_BUCKET"),
proj_id = get_project_id(),
proc_id = Sys.getenv("DAI_PROCESSOR_ID"),
proc_v = NA,
skip_rev = "true",
loc = "eu",
token = dai_token()) {
# Check and modify inputs
if (!(is.character(files) && length(files) >= 1)) {
stop("Invalid files parameter.")
}
  extensions <- tolower(stringr::str_extract(files, "(?<=\\.)\\w{3,4}$"))
supported <- c("bmp", "gif", "jpeg", "jpg", "pdf", "png", "tif", "tiff", "webp")
if (!(all(unique(extensions) %in% supported))) {
stop("Unsupported file formats. DAI accepts only bmp, gif, jpeg, jpg, pdf, png, tif, tiff, and webp.")
}
if (length(dest_folder) > 1) {
stop("Invalid dest_folder parameter.")
}
if (length(dest_folder) == 1 && !(is.character(dest_folder))) {
stop("Invalid dest_folder parameter.")
}
if (length(dest_folder) == 1 && grepl("/$", dest_folder)) {
dest_folder <- stringr::str_replace(dest_folder, "/$", "")
}
if (!(is.character(bucket) && length(bucket) == 1) || bucket == "") {
stop("Invalid bucket parameter.")
}
if (grepl("^gs://", bucket)) {
bucket <- stringr::str_replace(bucket, "^gs://", "")
}
if ((grepl("/$", bucket))) {
bucket <- stringr::str_replace(bucket, "/$", "")
}
if (!(is.character(proj_id) && length(proj_id) == 1)) {
stop("Invalid proj_id parameter.")
}
if (!(is.character(proc_id) && length(proc_id) == 1) || proc_id == "") {
stop("Invalid proc_id parameter.")
}
if (!(length(proc_v) == 1)) {
stop("Invalid proc_v.")
}
if (!(is.na(proc_v) || is.character(proc_v))) {
stop("Invalid proc_v.")
}
  skip_rev <- tolower(skip_rev)
  if (!(length(skip_rev) == 1 && skip_rev %in% c("true", "false"))) {
stop("Invalid skip_rev parameter.")
}
loc <- tolower(loc)
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid loc parameter.")
}
# format list of documents
doc_list <- list()
for (file in files) {
    filetype <- tolower(stringr::str_extract(file, "(?<=\\.)\\w{3,4}$"))
if (filetype == "pdf") {
mime <- "application/pdf"
} else if (filetype == "gif") {
mime <- "image/gif"
} else if (filetype %in% c("tif", "tiff")) {
mime <- "image/tiff"
} else if (filetype %in% c("jpg", "jpeg")) {
mime <- "image/jpeg"
} else if (filetype == "png") {
mime <- "image/png"
} else if (filetype == "bmp") {
mime <- "image/bmp"
} else {
mime <- "image/webp"
}
uri <- glue::glue("gs://{bucket}/{file}")
entry <- list(list(
"gcsUri" = uri,
"mimeType" = mime
))
doc_list <- append(doc_list, entry)
}
# format dest folder uri
if (is.null(dest_folder)) {
dest_folder_uri <- glue::glue("gs://{bucket}/")
} else {
dest_folder_uri <- glue::glue("gs://{bucket}/{dest_folder}/")
}
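  # e.g. "gs://my-bucket/processed/" (hypothetical bucket and folder names)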
## create json request body
req <- list(
"inputDocuments" = list("gcsDocuments" = list("documents" = doc_list)),
"documentOutputConfig" = list("gcsOutputConfig" = list("gcsUri" = dest_folder_uri)),
"skipHumanReview" = skip_rev
)
bod <- jsonlite::toJSON(req, auto_unbox = TRUE)
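  # For reference, `bod` now holds a JSON body of roughly this shape
  # (Document AI v1 "batchProcess" method; values hypothetical):
  # {
  #   "inputDocuments": { "gcsDocuments": { "documents": [
  #     { "gcsUri": "gs://my-bucket/doc.pdf", "mimeType": "application/pdf" }
  #   ] } },
  #   "documentOutputConfig": { "gcsOutputConfig": { "gcsUri": "gs://my-bucket/processed/" } },
  #   "skipHumanReview": "true"
  # }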
## build URL and submit API request
base_url <- glue::glue("https://{loc}-documentai.googleapis.com/")
path <- glue::glue("v1/projects/{proj_id}/locations/{loc}/processors/{proc_id}")
if (is.na(proc_v)) {
version <- ""
} else {
version <- glue::glue("/processorVersions/{proc_v}")
}
method <- ":batchProcess"
url <- glue::glue("{base_url}{path}{version}{method}")
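  # Example of a resulting URL (with hypothetical ids):
  # https://eu-documentai.googleapis.com/v1/projects/my-project/locations/eu/processors/abc123:batchProcess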
response <- httr::POST(url,
httr::config(token = token),
body = bod
)
if (response$status_code == 200) {
cli::cli_alert_info(glue::glue("{length(files)} file(s) submitted at {response$date}. Check job status with daiR::dai_status()."))
} else {
parsed <- httr::content(response)
    cli::cli_alert_danger(glue::glue('{length(files)} file(s) submitted at {response$date}. HTTP status: {response$status_code} - unsuccessful.\nError: "{parsed$error$message}"'))
}
response
}
#' Check job status
#'
#' @description Queries the Google Cloud Services (GCS)
#' Document AI API about the status of a previously submitted
#' asynchronous job.
#'
#' @param response An HTTP response object generated by
#' \code{dai_async()}
#' @param loc A two-letter region code; "eu" or "us"
#' @param token An authentication token generated by
#' \code{dai_auth()} or another auth function
#' @param verbose boolean; whether to output the full response
#'
#' @return If verbose is \code{TRUE}, an HTTP response object; if
#' \code{FALSE}, no return value (a status message is printed to the console).
#' @export
#'
#' @examples
#' \dontrun{
#' # Short status message:
#' response <- dai_async(myfiles)
#' dai_status(response)
#'
#' # Full status details:
#' response <- dai_async(myfiles)
#' status <- dai_status(response, verbose = TRUE)
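#'
#' # Inspect the returned response object:
#' httr::content(status)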
#' }
dai_status <- function(response,
loc = "eu",
token = dai_token(),
verbose = FALSE) {
if (!(inherits(response, "response") || inherits(response[[1]], "response"))) {
stop("Input is not a valid HTTP response.")
}
if (inherits(response[[1]], "response")) {
    last_elem <- length(response)
parsed <- httr::content(response[[last_elem]])
} else {
parsed <- httr::content(response)
}
if (!("name" %in% names(parsed))) {
stop("Input does not contain a processing job id. Make sure it is from dai_async.")
}
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid location parameter.")
}
  if (!(length(verbose) == 1 && verbose %in% c(TRUE, FALSE))) {
stop("Parameter verbose can only be TRUE or FALSE.")
}
name <- parsed$name
base_url <- glue::glue("https://{loc}-documentai.googleapis.com/v1/")
url <- glue::glue(base_url, name)
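  # `name` is the long-running operation id returned by dai_async(), of the
  # form projects/<project-number>/locations/<loc>/operations/<operation-id>;
  # a GET on /v1/<name> returns its current state.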
resp <- httr::GET(url, httr::config(token = token))
resp_par <- httr::content(resp)
if (is.null(resp_par$metadata$commonMetadata$state)) {
status <- resp_par$metadata$state
} else {
status <- resp_par$metadata$commonMetadata$state
}
job_no <- stringr::str_extract(name, "(?<=/)\\d+$")
if (inherits(response[[1]], "response")) {
cli::cli_alert_info(glue::glue('Status for job {job_no} submitted {response[[last_elem]]$date}: "{status}."'))
} else {
cli::cli_alert_info(glue::glue('Status for job {job_no} submitted {response$date}: "{status}."'))
}
if (isTRUE(verbose)) {
resp
}
}
#' Notify on job completion
#'
#' @description Queries the Google Cloud Services (GCS) Document AI API
#' about the status of a previously submitted asynchronous job
#' and emits a sound notification when the job is complete.
#'
#' @param response an HTTP response object generated by
#' \code{dai_async()}
#' @param loc A two-letter region code; "eu" or "us"
#' @param token An authentication token generated by `dai_auth()` or
#' another auth function
#' @param sound A number from 1 to 10 for the beepr sound selection
#' (https://www.r-project.org/nosvn/pandoc/beepr.html).
#'
#' @return No return value; called for side effects.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' response <- dai_async(myfiles)
#' dai_notify(response)
#' }
dai_notify <- function(response,
loc = "eu",
token = dai_token(),
sound = 2) {
if (!(inherits(response, "response") || inherits(response[[1]], "response"))) {
stop("Input is not a valid HTTP response.")
}
if (inherits(response[[1]], "response")) {
    last_elem <- length(response)
parsed <- httr::content(response[[last_elem]])
} else {
parsed <- httr::content(response)
}
if (!("name" %in% names(parsed))) {
stop("Input does not contain a processing job id. Either it's not from a\n
dai processing function or it's from an unsuccessful processing request.")
}
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid location parameter.")
}
  if (!(length(sound) == 1 && sound %in% 1:10)) {
stop("Invalid sound parameter.")
}
finished <- FALSE
cli::cli_alert_info("Checking job. I'll beep when it's done.")
while (isFALSE(finished)) {
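    # Poll dai_status() once per second, capturing its console message and
    # scanning it for the "SUCCEEDED" state reported by the API.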
msg <- utils::capture.output(dai_status(response, loc, token), type = "message")
    finished <- any(grepl("SUCCEEDED", msg))
Sys.sleep(1)
}
cli::cli_alert_success("Job complete.")
beepr::beep(sound)
}