# File: R/hocr.R
#
# Defines functions: hocr_parse
#
# Documented in: hocr_parse

#' Parse an hOCR document into a tibble
#'
#' Reads the hOCR markup with [xml2::read_xml()] and passes the resulting
#' XML document to `parse_page()`, whose result is returned unchanged.
#'
#' @param x XHTML output from an OCR algorithm in hOCR format (see
#'   <https://en.wikipedia.org/wiki/HOCR> for details). It is forwarded as-is
#'   to [xml2::read_xml()].
#'
#' @return A tibble with one word per row and columns describing lines,
#'   paragraphs, content areas and pages.
#'
#' @examples
#' \dontrun{
#' library(tesseract)
#' hocr_text <- ocr("file.png", HOCR = TRUE)
#' hocr_parse(hocr_text)
#' }
#' @export
#' @importFrom xml2 read_xml
hocr_parse <- function(x) {
  # A plain nested call is used instead of `%>%` so this function does not
  # depend on the magrittr pipe being imported into the package namespace.
  hocr_doc <- xml2::read_xml(x)
  parse_page(hocr_doc)
}
# dmi3kno/hocr documentation built on April 27, 2020, 10:39 a.m.