#' OCR document synchronously
#'
#' @description Sends a single document to the Google Cloud Services (GCS)
#' Document AI v1 API for synchronous (immediate) processing. Returns an
#' HTTP response object containing the OCRed text and additional data.
#'
#' @param file path to a pdf or image file.
#' @param proj_id a GCS project id.
#' @param proc_id a Document AI processor id.
#' @param proc_v one of 1) a processor version name, 2) "stable" for the
#' latest processor from the stable channel, or 3) "rc" for the latest
#' processor from the release candidate channel.
#' @param skip_rev whether to skip human review; "true" or "false".
#' @param loc a two-letter region code; "eu" or "us".
#' @param token an authentication token generated by \code{dai_auth()} or
#' another auth function.
#'
#' @return an HTTP response object.
#'
#' @details Requires a GCS access token and some configuration of the
#' .Renviron file; see package vignettes for details. Input files can be in
#' .pdf, .bmp, .gif, .jpeg, .jpg, .png, .tif, .tiff, or .webp format. PDF files
#' can be up to five pages long. Extract the text from the response object with
#' \code{text_from_dai_response()}. Inspect the entire response object with
#' \code{httr::content()}.
#' @export
#'
#' @examples
#' \dontrun{
#' response <- dai_sync("doc_page.pdf")
#'
#' response <- dai_sync("doc_page.pdf",
#' proc_v = "pretrained-ocr-v1.1-2022-09-12"
#' )
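#'
#' # Extract the text from a successful response, as described in @details:
#' text <- text_from_dai_response(response)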
#' }
dai_sync <- function(file,
proj_id = get_project_id(),
proc_id = Sys.getenv("DAI_PROCESSOR_ID"),
proc_v = NA,
skip_rev = "true",
loc = "eu",
token = dai_token()) {
# Check inputs
if (!(is.character(file) && length(file) == 1)) {
stop("Invalid file input.")
}
extension <- tolower(stringr::str_extract(file, "(?<=\\.)\\w{3,4}$"))
supported <- c("bmp", "gif", "jpeg", "jpg", "pdf", "png", "tif", "tiff", "webp")
if (!(extension %in% supported)) {
stop("Unsupported file format. DAI accepts only bmp, gif, jpeg, jpg, pdf, png, tif, tiff, and webp.")
}
if (extension == "pdf" && !(is_pdf(file))) {
stop("Input file not a real pdf. Is the file in your working directory?")
}
if (!(is.character(proj_id) && length(proj_id) == 1)) {
stop("Invalid proj_id.")
}
if (!(is.character(proc_id) && length(proc_id) == 1) || proc_id == "") {
stop("Invalid proc_id.")
}
if (!(length(proc_v) == 1)) {
stop("Invalid proc_v.")
}
if (!(is.na(proc_v) || is.character(proc_v))) {
stop("Invalid proc_v.")
}
skip_rev <- tolower(skip_rev)
  if (!(length(skip_rev) == 1 && skip_rev %in% c("true", "false"))) {
stop("Invalid skip_rev parameter.")
}
loc <- tolower(loc)
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid location parameter.")
}
# Encode
if (extension == "pdf") {
encoded_file <- pdf_to_binbase(file)
} else {
encoded_file <- img_to_binbase(file)
}
## Create json request body
req <- list(
"skipHumanReview" = skip_rev,
"rawDocument" = list(
"content" = encoded_file,
"mimeType" = "image/png"
)
)
bod <- jsonlite::toJSON(req, auto_unbox = TRUE)
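  # Note: mimeType stays "image/png" on the assumption that both
  # pdf_to_binbase() and img_to_binbase() rasterize their input to PNG
  # before base64-encoding it.
  # For reference, `bod` now holds a JSON body of roughly this shape
  # (Document AI v1 "process" method; content truncated):
  # {
  #   "skipHumanReview": "true",
  #   "rawDocument": { "content": "<base64...>", "mimeType": "image/png" }
  # }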
## Build URL and submit API request
base_url <- glue::glue("https://{loc}-documentai.googleapis.com/")
path <- glue::glue("v1/projects/{proj_id}/locations/{loc}/processors/{proc_id}")
if (is.na(proc_v)) {
version <- ""
} else {
version <- glue::glue("/processorVersions/{proc_v}")
}
method <- ":process"
url <- glue::glue("{base_url}{path}{version}{method}")
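  # Example of a resulting URL (with hypothetical ids):
  # https://eu-documentai.googleapis.com/v1/projects/my-project/locations/eu/processors/abc123:process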
response <- httr::POST(url,
httr::config(token = token),
body = bod
)
if (response$status_code == 200) {
cli::cli_alert_success(glue::glue("File submitted at {response$date}. HTTP status: 200 - OK."))
} else {
parsed <- httr::content(response)
cli::cli_alert_danger(glue::glue('File submitted at {response$date}. HTTP status: {response$status_code} - unsuccessful.\nError: "{parsed$error$message}"'))
}
response
}
#' OCR documents asynchronously
#'
#' @description Sends files from a Google Cloud Services (GCS) Storage
#' bucket to the GCS Document AI v1 API for asynchronous (offline) processing.
#' The output is delivered to the same bucket as JSON files containing
#' the OCRed text and additional data.
#'
#' @param files a vector or list of pdf or image filepaths in a GCS Storage
#' bucket. Filepaths must include all parent bucket folder(s) except the
#' bucket name.
#' @param dest_folder the name of the GCS Storage bucket subfolder where
#' you want the json output
#' @param bucket the name of the GCS Storage bucket where the files
#' to be processed are located
#' @param proj_id a GCS project id
#' @param proc_id a Document AI processor id
#' @param proc_v one of 1) a processor version name, 2) "stable" for the
#' latest processor from the stable channel, or 3) "rc" for the latest
#' processor from the release candidate channel.
#' @param skip_rev whether to skip human review; "true" or "false"
#' @param loc a two-letter region code; "eu" or "us"
#' @param token an access token generated by `dai_auth()` or another
#' auth function
#' @return an HTTP response object.
#'
#' @details Requires a GCS access token and some configuration of the
#' .Renviron file; see package vignettes for details. Currently, a
#' \code{dai_async()} call can contain a maximum of 50 files (but a
#' multi-page pdf counts as one file). You cannot have more than
#' 5 batch requests and 10,000 pages undergoing processing at any one time.
#' Maximum pdf document length is 2,000 pages. With long pdf documents,
#' Document AI divides the JSON output into separate files ('shards') of
#' 20 pages each. If you want longer shards, use \code{dai_tab_async()},
#' which accesses another API endpoint that allows for shards of up to
#' 100 pages.
#' @export
#'
#' @examples
#' \dontrun{
#' # with daiR configured on your system, several parameters are automatically provided,
#' # and you can pass simple calls, such as:
#' dai_async("my_document.pdf")
#'
#' # NB: Include all parent bucket folders (but not the bucket name) in the filepath:
#' dai_async("for_processing/pdfs/my_document.pdf")
#'
#' # Bulk process by passing a vector of filepaths in the files argument:
#' dai_async(my_files)
#'
#' # Specify a bucket subfolder for the json output:
#' dai_async(my_files, dest_folder = "processed")
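#'
#' # Track the job's progress:
#' response <- dai_async(my_files)
#' dai_status(response)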
#' }
dai_async <- function(files,
dest_folder = NULL,
bucket = Sys.getenv("GCS_DEFAULT_BUCKET"),
proj_id = get_project_id(),
proc_id = Sys.getenv("DAI_PROCESSOR_ID"),
proc_v = NA,
skip_rev = "true",
loc = "eu",
token = dai_token()) {
# Check and modify inputs
if (!(is.character(files) && length(files) >= 1)) {
stop("Invalid files parameter.")
}
  extensions <- tolower(stringr::str_extract(files, "(?<=\\.)\\w{3,4}$"))
supported <- c("bmp", "gif", "jpeg", "jpg", "pdf", "png", "tif", "tiff", "webp")
if (!(all(unique(extensions) %in% supported))) {
stop("Unsupported file formats. DAI accepts only bmp, gif, jpeg, jpg, pdf, png, tif, tiff, and webp.")
}
if (length(dest_folder) > 1) {
stop("Invalid dest_folder parameter.")
}
if (length(dest_folder) == 1 && !(is.character(dest_folder))) {
stop("Invalid dest_folder parameter.")
}
if (length(dest_folder) == 1 && grepl("/$", dest_folder)) {
dest_folder <- stringr::str_replace(dest_folder, "/$", "")
}
if (!(is.character(bucket) && length(bucket) == 1) || bucket == "") {
stop("Invalid bucket parameter.")
}
if (grepl("^gs://", bucket)) {
bucket <- stringr::str_replace(bucket, "^gs://", "")
}
if ((grepl("/$", bucket))) {
bucket <- stringr::str_replace(bucket, "/$", "")
}
if (!(is.character(proj_id) && length(proj_id) == 1)) {
stop("Invalid proj_id parameter.")
}
if (!(is.character(proc_id) && length(proc_id) == 1) || proc_id == "") {
stop("Invalid proc_id parameter.")
}
if (!(length(proc_v) == 1)) {
stop("Invalid proc_v.")
}
if (!(is.na(proc_v) || is.character(proc_v))) {
stop("Invalid proc_v.")
}
  skip_rev <- tolower(skip_rev)
  if (!(length(skip_rev) == 1 && skip_rev %in% c("true", "false"))) {
stop("Invalid skip_rev parameter.")
}
loc <- tolower(loc)
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid loc parameter.")
}
# format list of documents
doc_list <- list()
for (file in files) {
    filetype <- tolower(stringr::str_extract(file, "(?<=\\.)\\w{3,4}$"))
if (filetype == "pdf") {
mime <- "application/pdf"
} else if (filetype == "gif") {
mime <- "image/gif"
} else if (filetype %in% c("tif", "tiff")) {
mime <- "image/tiff"
} else if (filetype %in% c("jpg", "jpeg")) {
mime <- "image/jpeg"
} else if (filetype == "png") {
mime <- "image/png"
} else if (filetype == "bmp") {
mime <- "image/bmp"
} else {
mime <- "image/webp"
}
uri <- glue::glue("gs://{bucket}/{file}")
entry <- list(list(
"gcsUri" = uri,
"mimeType" = mime
))
doc_list <- append(doc_list, entry)
}
# format dest folder uri
if (is.null(dest_folder)) {
dest_folder_uri <- glue::glue("gs://{bucket}/")
} else {
dest_folder_uri <- glue::glue("gs://{bucket}/{dest_folder}/")
}
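  # e.g. "gs://my-bucket/processed/" (hypothetical bucket and folder names)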
## create json request body
req <- list(
"inputDocuments" = list("gcsDocuments" = list("documents" = doc_list)),
"documentOutputConfig" = list("gcsOutputConfig" = list("gcsUri" = dest_folder_uri)),
"skipHumanReview" = skip_rev
)
bod <- jsonlite::toJSON(req, auto_unbox = TRUE)
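  # For reference, `bod` now holds a JSON body of roughly this shape
  # (Document AI v1 "batchProcess" method; values hypothetical):
  # {
  #   "inputDocuments": { "gcsDocuments": { "documents": [
  #     { "gcsUri": "gs://my-bucket/doc.pdf", "mimeType": "application/pdf" }
  #   ] } },
  #   "documentOutputConfig": { "gcsOutputConfig": { "gcsUri": "gs://my-bucket/processed/" } },
  #   "skipHumanReview": "true"
  # }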
## build URL and submit API request
base_url <- glue::glue("https://{loc}-documentai.googleapis.com/")
path <- glue::glue("v1/projects/{proj_id}/locations/{loc}/processors/{proc_id}")
if (is.na(proc_v)) {
version <- ""
} else {
version <- glue::glue("/processorVersions/{proc_v}")
}
method <- ":batchProcess"
url <- glue::glue("{base_url}{path}{version}{method}")
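  # Example of a resulting URL (with hypothetical ids):
  # https://eu-documentai.googleapis.com/v1/projects/my-project/locations/eu/processors/abc123:batchProcess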
response <- httr::POST(url,
httr::config(token = token),
body = bod
)
if (response$status_code == 200) {
cli::cli_alert_info(glue::glue("{length(files)} file(s) submitted at {response$date}. Check job status with daiR::dai_status()."))
} else {
parsed <- httr::content(response)
    cli::cli_alert_danger(glue::glue('{length(files)} file(s) submitted at {response$date}. HTTP status: {response$status_code} - unsuccessful.\nError: "{parsed$error$message}"'))
}
response
}
#' Check job status
#'
#' @description Queries the Google Cloud Services (GCS)
#' Document AI API about the status of a previously submitted
#' asynchronous job.
#'
#' @param response An HTTP response object generated by
#' \code{dai_async()}
#' @param loc A two-letter region code; "eu" or "us"
#' @param token An authentication token generated by
#' \code{dai_auth()} or another auth function
#' @param verbose boolean; whether to output the full response
#'
#' @return If verbose is \code{TRUE}, an HTTP response object; if
#' \code{FALSE}, no return value (a status message is printed to the console).
#' @export
#'
#' @examples
#' \dontrun{
#' # Short status message:
#' response <- dai_async(myfiles)
#' dai_status(response)
#'
#' # Full status details:
#' response <- dai_async(myfiles)
#' status <- dai_status(response, verbose = TRUE)
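#'
#' # Inspect the returned response object:
#' httr::content(status)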
#' }
dai_status <- function(response,
loc = "eu",
token = dai_token(),
verbose = FALSE) {
if (!(inherits(response, "response") || inherits(response[[1]], "response"))) {
stop("Input is not a valid HTTP response.")
}
if (inherits(response[[1]], "response")) {
    last_elem <- length(response)
parsed <- httr::content(response[[last_elem]])
} else {
parsed <- httr::content(response)
}
if (!("name" %in% names(parsed))) {
stop("Input does not contain a processing job id. Make sure it is from dai_async.")
}
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid location parameter.")
}
  if (!(length(verbose) == 1 && verbose %in% c(TRUE, FALSE))) {
stop("Parameter verbose can only be TRUE or FALSE.")
}
name <- parsed$name
base_url <- glue::glue("https://{loc}-documentai.googleapis.com/v1/")
url <- glue::glue(base_url, name)
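  # `name` is the long-running operation id returned by dai_async(), of the
  # form projects/<project-number>/locations/<loc>/operations/<operation-id>;
  # a GET on /v1/<name> returns its current state.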
resp <- httr::GET(url, httr::config(token = token))
resp_par <- httr::content(resp)
if (is.null(resp_par$metadata$commonMetadata$state)) {
status <- resp_par$metadata$state
} else {
status <- resp_par$metadata$commonMetadata$state
}
job_no <- stringr::str_extract(name, "(?<=/)\\d+$")
if (inherits(response[[1]], "response")) {
cli::cli_alert_info(glue::glue('Status for job {job_no} submitted {response[[last_elem]]$date}: "{status}."'))
} else {
cli::cli_alert_info(glue::glue('Status for job {job_no} submitted {response$date}: "{status}."'))
}
if (isTRUE(verbose)) {
resp
}
}
#' Notify on job completion
#'
#' @description Queries the Google Cloud Services (GCS) Document AI API
#' about the status of a previously submitted asynchronous job
#' and emits a sound notification when the job is complete.
#'
#' @param response an HTTP response object generated by
#' \code{dai_async()}
#' @param loc A two-letter region code; "eu" or "us"
#' @param token An authentication token generated by `dai_auth()` or
#' another auth function
#' @param sound A number from 1 to 10 for the beepr sound selection
#' (https://www.r-project.org/nosvn/pandoc/beepr.html).
#'
#' @return No return value; called for side effects.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' response <- dai_async(myfiles)
#' dai_notify(response)
#' }
dai_notify <- function(response,
loc = "eu",
token = dai_token(),
sound = 2) {
if (!(inherits(response, "response") || inherits(response[[1]], "response"))) {
stop("Input is not a valid HTTP response.")
}
if (inherits(response[[1]], "response")) {
    last_elem <- length(response)
parsed <- httr::content(response[[last_elem]])
} else {
parsed <- httr::content(response)
}
if (!("name" %in% names(parsed))) {
stop("Input does not contain a processing job id. Either it's not from a\n
dai processing function or it's from an unsuccessful processing request.")
}
  if (!(length(loc) == 1 && loc %in% c("eu", "us"))) {
stop("Invalid location parameter.")
}
  if (!(length(sound) == 1 && sound %in% 1:10)) {
stop("Invalid sound parameter.")
}
finished <- FALSE
cli::cli_alert_info("Checking job. I'll beep when it's done.")
while (isFALSE(finished)) {
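    # Poll dai_status() once per second, capturing its console message and
    # scanning it for the "SUCCEEDED" state reported by the API.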
msg <- utils::capture.output(dai_status(response, loc, token), type = "message")
    finished <- any(grepl("SUCCEEDED", msg))
Sys.sleep(1)
}
cli::cli_alert_success("Job complete.")
beepr::beep(sound)
}