R/process_output.R

Defines functions transpose_page transpose_block get_vertices redraw_blocks from_labelme reassign_tokens2 reassign_tokens split_block build_block_df build_token_df

Documented in build_block_df build_token_df from_labelme reassign_tokens reassign_tokens2 redraw_blocks split_block

#' Build token dataframe
#'
#' @description Builds a token dataframe from the text OCRed by
#' Document AI (DAI) in an asynchronous request. Rows are tokens, in the
#' order DAI proposes to read them. Columns are location variables
#' such as page coordinates and block bounding box numbers.
#'
#' @param type one of "sync" or "async" depending on
#' the function used to process the original document.
#' @param output either a HTTP response object (from `dai_sync()`) or
#' the path to a JSON file (from `dai_async`).
#' @return a token data frame
#'
#' @details The location variables are: token, start index, end index,
#' confidence, left boundary, right boundary, top boundary, bottom boundary,
#' page number, and block number. Start and end indices refer to
#' character position in the string containing the full text.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' token_df <- build_token_df(type = "async", output = "pdf_output.json")
#' resp <- dai_sync("file.pdf")
#' token_df <- build_token_df(type = "sync", output = resp)
#' }

build_token_df <- function(
                           type,
                           output
                           ) {

    # checks
    if (!(length(type) == 1) || !(type %in% c("sync", "async", "sync-tab", "async-tab"))) {
        stop("Invalid type parameter.")
    }

    if (!(inherits(output, "response") || is_json(output))) {
        stop("Invalid output parameter.")
    }

    if (type == "sync") {

        output_sync <- httr::content(output)
        text <- output_sync$document$text
        pages_sync <- output_sync$document$pages

        # get pagewise indices
        pages_tokens_sync <- purrr::map(pages_sync, ~.x$tokens)

        # extract all start indices as single vector
        start_ind <- integer()
        for (i in pages_tokens_sync) {
            for (j in i) {
                ind <- as.integer(j[["layout"]][["textAnchor"]][["textSegments"]][[1]][["startIndex"]]) + 1
                start_ind <- c(start_ind, ind)
            }
        }

        # extract all end indices as single vector
        end_ind <- integer()
        for (i in pages_tokens_sync) {
            for (j in i) {
                ind <- j[["layout"]][["textAnchor"]][["textSegments"]][[1]][["endIndex"]]
                end_ind <- c(end_ind, as.integer(ind))
            }
        }

        # get confidence scores as single vector
        conf <- numeric()
        for (i in pages_tokens_sync) {
            for (j in i) {
                con <- j$layout$confidence
                conf <- c(conf, con)
            }
        }

    } else if (type == "async") {

        output_async <- jsonlite::fromJSON(output)

        if (!("pages" %in% names(output_async))) {
            stop("JSON not in right format. Is it from DAI?")
        }

        if (!("text" %in% names(output_async))) {
            stop("DAI found no tokens. Was the document blank?")
        }

        text <- output_async$text

        # get pagewise indices
        pages_tokens_async <- output_async$pages$tokens
        pagewise_token_indices_async <- purrr::map(pages_tokens_async, ~.x$layout$textAnchor$textSegments)

        # extract all start indices as single vector, shifting them by one, because the token really starts at (index + 1)
        start_ind <- as.integer(unlist(purrr::map(pagewise_token_indices_async, ~ purrr::map(.x, ~.x$startIndex)))) + 1

        # extract all end indices as single vector
        end_ind <- unlist(purrr::map(pagewise_token_indices_async, ~ purrr::map(.x, ~.x$endIndex)))

        # get confidence scores as single vector
        conf <- unlist(purrr::map(pages_tokens_async, ~.x$layout$confidence))

    }
    # insert implicit start index
    start_ind <- c(1, start_ind)

    # get all tokens as single vector
    token <- character()
    for (i in seq_along(start_ind)) {
        tok <- substr(text,
                      start_ind[i],
                      end_ind[i]
        )
        token <- c(token, tok)
    }

    # get page numbers as single vector
    if (type == "sync") { pages_tokens <- pages_tokens_sync } else { pages_tokens <- pages_tokens_async }
    page <- integer()
    for (i in seq_along(pages_tokens)) {
        if (is.null(pages_tokens[[i]])) {
            instances <- 0
        } else {
            if (type == "sync") { instances <- length(pages_tokens[[i]]) }
            else { instances <- nrow(pages_tokens[[i]])
            }
        }
        pg <- rep(i, each = instances)
        page <- c(page, pg)
    }

    # get boundaries as single vectors
    if (type == "sync") {
        pagewise_token_coords <- purrr::map(pages_tokens_sync, get_vertices)
        pagewise_token_coords <- purrr::map(pagewise_token_coords, transpose_page)
    } else { pagewise_token_coords <- purrr::map(pages_tokens_async, ~.x$layout$boundingPoly$normalizedVertices)}

    left <-  unlist(purrr::map(pagewise_token_coords, ~ purrr::map(.x, ~ min(.x$x))))
    right <- unlist(purrr::map(pagewise_token_coords, ~ purrr::map(.x, ~ max(.x$x))))
    top <- unlist(purrr::map(pagewise_token_coords, ~ purrr::map(.x, ~ min(.x$y))))
    bottom <- unlist(purrr::map(pagewise_token_coords, ~ purrr::map(.x, ~ max(.x$y))))

    # combine all vectors to dataframe
    df <- data.frame(token,
                     start_ind,
                     end_ind,
                     conf,
                     left,
                     right,
                     top,
                     bottom,
                     page,
                     block = NA
                    )

    # get block numbers, assigning by token location
    if (type == "sync") {
        pages_blocks_sync <- purrr::map(pages_sync, ~.x$blocks)
        pages_blocks_coords <- purrr::map(pages_blocks_sync, get_vertices)
        pages_blocks <- purrr::map(pages_blocks_coords, transpose_page)
    } else {
        pages_blocks_async <- output_async$pages$blocks
        pages_blocks <- purrr::map(pages_blocks_async, ~.x$layout$boundingPoly$normalizedVertices)
    }

    for (i in seq_along(pages_blocks)) {
        df_sub <- df[df$page == i, ]
        count <- 1
        for (j in pages_blocks[[i]]) {

            # assign number to token rows that fall within the box
            df_sub$block[df_sub$right > mean(c(j$x[1], j$x[4])) &
                             df_sub$left < mean(c(j$x[2], j$x[3])) &
                             df_sub$bottom > mean(c(j$y[1], j$y[2])) &
                             df_sub$top < mean(c(j$y[3], j$y[4]))] <- count
            count <- count + 1
        }
        df$block[df$page == i] <- df_sub$block
    }

    return(df)

}

#' Build block dataframe
#'
#' @description Creates a dataframe with the block bounding boxes
#' identified by Document AI (DAI) in an asynchronous request.
#' Rows are blocks, in the order DAI proposes to read them. Columns
#' are location variables such as page coordinates and page numbers.
#'
#' @param type one of "sync" or "async" depending on
#' the function used to process the original document.
#' @param output either a HTTP response object (from `dai_sync()`) or
#' the path to a JSON file (from `dai_async`).
#' @return a block data frame
#'
#' @details The dataframe variables are: page number, block number,
#' confidence score, left boundary, right boundary, top boundary,
#' and bottom boundary.
#' @export
#'
#' @examples
#' \dontrun{
#' block_df <- build_block_df(type = "async", output = "pdf_output.json")
#' resp <- dai_sync("file.pdf")
#' block_df <- build_block_df(type = "sync", output = resp)
#' }

    build_block_df <- function(
        type,
        output
    ) {

        # checks
        if (!(length(type) == 1) || !(type %in% c("sync", "async", "sync-tab", "async-tab"))) {
            stop("Invalid type parameter.")
        }

        if (!(inherits(output, "response") || is_json(output))) {
            stop("Invalid output parameter.")
        }

        if (type == "sync") {

            output_sync <- httr::content(output)

            # extract a list with pagewise sets of block boundary boxes
            pages_sync <- output_sync$document$pages
            pages_blocks_sync <- purrr::map(pages_sync, ~.x$blocks)
            pagewise_block_sets_sync <- purrr::map(pages_blocks_sync, get_vertices)
            pagewise_block_sets_sync <- purrr::map(pagewise_block_sets_sync, transpose_page)

            # get confidence scores as single vector
            conf <- numeric()
            for (i in pages_blocks_sync) {
                for (j in i) {
                    con <- j$layout$confidence
                    conf <- c(conf, con)
                }
            }
        }

        else if (type == "async") {

            output_async <- jsonlite::fromJSON(output)

            if (!("pages" %in% names(output_async))) {
                stop("JSON not in right format. Is it from DAI?")
            }

            if (!("text" %in% names(output_async))) {
                stop("DAI found no blocks. Was the document blank?")
            }

            # extract a list with pagewise sets of block boundary boxes
            pages_blocks_async <- output_async$pages$blocks
            pagewise_block_sets_async <- purrr::map(pages_blocks_async, ~.x$layout$boundingPoly$normalizedVertices)

            # get confidence scores as single vector
            conf <- unlist(purrr::map(pages_blocks_async, ~.x$layout$confidence))

        }

        # get page numbers as single vector
        page <- integer()
        if (type == "sync") { pages_blocks <- pages_blocks_sync } else { pages_blocks <- pages_blocks_async }
        for (i in seq_along(pages_blocks)) {
            if (is.null(pages_blocks[[i]])) {
                instances <- 0
            } else {
                if (type == "sync") { instances <- length(pages_blocks[[i]]) }
                else { instances <- nrow(pages_blocks[[i]])
                }
            }
            pg <- rep(i, each = instances)
            page <- c(page, pg)
        }

        # get block numbers as single vector
        if (type == "sync") { pagewise_block_coords <- pagewise_block_sets_sync } else { pagewise_block_coords <- pagewise_block_sets_async}
        block <- integer()
        for (i in pagewise_block_coords) {
            b <- seq_along(i)
            block <- c(block, b)
        }

        # get coordinates as single vectors
        left <-  unlist(purrr::map(pagewise_block_coords, ~ purrr::map(.x, ~ min(.x$x))))
        right <- unlist(purrr::map(pagewise_block_coords, ~ purrr::map(.x, ~ max(.x$x))))
        top <- unlist(purrr::map(pagewise_block_coords, ~ purrr::map(.x, ~ min(.x$y))))
        bottom <- unlist(purrr::map(pagewise_block_coords, ~ purrr::map(.x, ~ max(.x$y))))

        # combine all vectors to dataframe
        df <- data.frame(page, block, conf, left, right, top, bottom)

        return(df)

    }

#' Split a block bounding box
#'
#' @description This function 'splits' (in the sense of changing the
#' coordinates) of an existing block bounding box vertically or
#' horizontally at a specified point. It takes a block data frame as
#' input and modifies it. The splitting produces a new block, which
#' is added to the data frame while the old block's coordinates are
#' updated. The function returns a revised block data frame.
#'
#' @param block_df A dataframe generated by \code{build_block_df()}.
#' @param page The number of the page where the split will be made.
#' Defaults to 1.
#' @param block The number of the block to be split.
#' @param cut_point A number between 0 and 100, where 0 is the
#' existing left/top limit and 100 is the existing right/bottom limit.
#' @param direction "V" for vertical split or "H" for horizontal split.
#' Defaults to "V".
#'
#' @return a block data frame
#' @export
#'
#' @examples
#' \dontrun{
#' new_block_df <- split_block(df = old_block_df, block = 7, cut_point = 33)
#' }

split_block <- function(block_df,
                        page = 1,
                        block,
                        cut_point,
                        direction = "v"
                        ) {

  # checks
  if (!(is.data.frame(block_df))) {
    stop("Input not a data frame.")
    }

  if (!(identical(colnames(block_df), c("page", "block", "conf", "left", "right", "top", "bottom")))) {
    stop("Dataframe not recognized. Was it made with build_block_df?")
    }

  if (!(length(page) == 1 && is.numeric(page) && round(page) == page && page > 0)) {
    stop("Invalid page parameter.")
    }

  if (page > max(block_df$page, na.rm = TRUE)) {
    stop("No such page number in this dataframe.")
    }

  if (!(length(block) == 1 && is.numeric(block) && round(block) == block && block > 0)) {
    stop("Invalid block parameter.")
    }

  if (block > max(block_df$block[block_df$page == page])) {
    stop("No such block number on this page.")
    }

  if (!(length(cut_point) == 1 && is.numeric(cut_point) && round(cut_point) == cut_point && cut_point > 0)) {
    stop("Invalid cut point parameter.")
    }

  if (!(cut_point %in% 1:99)) {
    stop("Cut point out of range.")
    }

  if (!(length(direction) == 1 && is.character(direction))) {
    stop("Invalid direction parameter.")
    }

  direction <- tolower(direction)

  if (!(direction %in% c("h", "v"))) {
    stop('Split direction must be either "h" or "v".')
    }

  # rename for readability
  old_df <- block_df

  # select page and block
  old_page_df <- old_df[old_df$page == page, ]

  old_block <- old_page_df[old_page_df$block == block, ]

  # vertical split
  if (direction == "v") {

    dist <- old_block$right - old_block$left
    cut <- dist / 100 * cut_point
    cut_loc <- old_block$left + cut

    new_block <- data.frame(
      page = as.integer(page),
      block = as.integer(max(old_page_df$block[old_page_df$page == page]) + 1),
      conf = NA,
      left = cut_loc,
      right = old_block$right,
      top = old_block$top,
      bottom = old_block$bottom
      )

    new_page_df <- rbind(old_page_df, new_block)

    new_page_df$right[new_page_df$block == block] <- cut_loc

    if (page == 1 && max(old_df$page) == 1) {
      new_block_df <- new_page_df
      } else if (page == 1 && max(old_df$page) > 1) {
        succeeding <- old_df[old_df$page > page, ]
        new_block_df <- rbind(new_page_df, succeeding)
        } else if (page > 1 && page == max(old_df$page)) {
          preceding <- old_df[old_df$page < page, ]
          new_block_df <- rbind(preceding, new_page_df)
          } else {
            preceding <- old_df[old_df$page < page, ]
            succeeding <- old_df[old_df$page > page, ]
            new_block_df <- rbind(preceding, new_page_df, succeeding)
          }

  # horizontal split
  } else {

    dist <- old_block$bottom - old_block$top
    cut <- dist / 100 * cut_point
    cut_loc <- old_block$top + cut

    new_block <- data.frame(
      page = as.integer(page),
      block = as.integer(max(old_page_df$block[old_page_df$page == page]) + 1),
      conf = NA,
      left = old_block$right,
      right = old_block$right,
      top = cut_loc,
      bottom = old_block$bottom
      )

    new_page_df <- rbind(old_page_df, new_block)

    new_page_df$bottom[new_page_df$block == block] <- cut_loc

    if (page == 1 && max(old_df$page) == 1) {
      new_block_df <- new_page_df
      } else if (page == 1 && max(old_df$page) > 1) {
        succeeding <- old_df[old_df$page > page, ]
        new_block_df <- rbind(new_page_df, succeeding)
        } else if (page > 1 && page == max(old_df$page)) {
          preceding <- old_df[old_df$page < page, ]
          new_block_df <- rbind(preceding, new_page_df)
          } else {
            preceding <- old_df[old_df$page < page, ]
            succeeding <- old_df[old_df$page > page, ]
            new_block_df <- rbind(preceding, new_page_df, succeeding)
          }
    }

  row.names(new_block_df) <- NULL

  return(new_block_df)

  }

#' Assign tokens to new blocks
#'
#' @description This is a specialized function for use in connection
#' with text reordering. It modifies a token dataframe by assigning
#' new block bounding box values to a subset of tokens based on
#' prior modifications made to a block dataframe.
#'
#' @param token_df a dataframe generated by \code{build_token_df()}
#' @param block_df a dataframe generated by \code{dair::split_block()}
#' or \code{dair::build_block_df()}
#' @return a token data frame
#'
#' @details The token and block data frames provided as input must be
#' from the same JSON output file.
#' @export
#'
#' @examples
#' \dontrun{
#' new_token_df <- reassign_tokens(token_df, new_block_df)
#' }

reassign_tokens <- function(token_df,
                            block_df
                            ) {

  # checks
  if (!(is.data.frame(token_df))) {
    stop("token_df not a data frame.")
    }

  if (!(identical(colnames(token_df),
                  c("token", "start_ind", "end_ind", "conf", "left", "right", "top", "bottom", "page", "block")))) {
    stop("Token dataframe not recognized. Was it made with build_token_df?")
    }

  if (!(is.data.frame(block_df))) {
    stop("block_df not a data frame.")
    }

  if (!(identical(colnames(block_df), c("page", "block", "conf", "left", "right", "top", "bottom")))) {
    stop("Block dataframe not recognized. Was it made with build_block_df?")
    }

  # get list of pagewise dfs
  token_df_pages <- split(token_df,
                          token_df$page
                          )

  block_df_pages <- split(block_df,
                          block_df$page
                          )

  # set up empty new df
  colnames <- colnames(token_df)

  new_token_df <- as.data.frame(matrix(ncol = length(colnames), nrow = 0, dimnames = list(NULL, colnames)))

  # loop over each page
  for (i in seq_along(token_df_pages)) {

    # short names for readability
    tokens <- token_df_pages[[i]]
    blocks <- block_df_pages[[i]]

    # set up empty new page dataframe
    new_token_df_page <- as.data.frame(matrix(ncol = length(colnames), nrow = 0, dimnames = list(NULL, colnames)))

    # loop over each block
    for (j in seq_len(nrow(blocks))) {

      tokens_in_block <- tokens[tokens$top >= blocks[j, ]$top &
                                  tokens$bottom <= blocks[j, ]$bottom &
                                  tokens$left >= blocks[j, ]$left &
                                  tokens$right <= blocks[j, ]$right, ]

      if (nrow(tokens_in_block) > 0) {
        tokens_in_block$block <- j
        tokens_in_block$page <- i
        new_token_df_page <- rbind(new_token_df_page, tokens_in_block)
      }
    }

    new_token_df <- rbind(new_token_df, new_token_df_page)

    }

  row.names(new_token_df) <- NULL

  return(new_token_df)

  }

#' Assign tokens to a single new block
#'
#' @description This is a specialized function for use in connection
#' with text reordering. It is designed to facilitate manual splitting
#' of block boundary boxes and typically takes a one-row block dataframe
#' generated by \code{from_labelme()}.
#'
#' @param token_df a data frame generated by \code{dair::build_token_df}
#' @param block a one-row data frame of the same format as \code{token_df}
#' @param page the number of the page on which the block belongs
#'
#' @return a token data frame
#' @export
#'
#' @examples
#' \dontrun{
#' new_token_df <- reassign_tokens2(token_df, new_block_df)
#' new_token_df <- reassign_tokens2(token_df, new_block_df, 5)
#' }

reassign_tokens2 <- function(token_df,
                             block,
                             page = 1
                             ) {

  # checks
  if (!(is.data.frame(token_df))) {
    stop("token_df not a data frame.")
    }

  if (!(identical(colnames(token_df),
                  c("token", "start_ind", "end_ind", "conf", "left", "right", "top", "bottom", "page", "block")))) {
    stop("Token dataframe not recognized. Was it made with build_token_df?")
    }

  if (!(is.data.frame(block))) {
    stop("block input not a data frame.")
    }

  if (!(identical(colnames(block), c("page", "block", "conf",  "left", "right", "top", "bottom")))) {
    stop("Block dataframe format not recognized.")
    }

  if (!(length(page) == 1 && is.numeric(page) && round(page) == page && page > 0)) {
    stop("Invalid page parameter.")
    }

  if (page > max(token_df$page, na.rm = TRUE)) {
    stop("No such page number in this dataframe.")
    }

  # get only selected page
  page_df <- token_df[token_df$page == page, ]

  page_df$block[page_df$top >= block$top &
                  page_df$bottom <= block$bottom &
                  page_df$left >= block$left &
                  page_df$right <= block$right] <- as.numeric(block$block)

  # if only page
  if (page == 1 && max(token_df$page) == 1) {
    new_token_df <- page_df

  # if first of several
    } else if (page == 1 && max(token_df$page) > 1) {
      succeeding <- token_df[token_df$page > page, ]
      new_token_df <- rbind(page_df, succeeding)

  # if last of several
      } else if (page > 1 && page == max(token_df$page)) {
        preceding <- token_df[token_df$page < page, ]
        new_token_df <- rbind(preceding, page_df)

  # if in middle
        } else {
          preceding <- token_df[token_df$page < page, ]
          succeeding <- token_df[token_df$page > page, ]
          new_token_df <- rbind(preceding, page_df, succeeding)
        }

  row.names(new_token_df) <- NULL

  return(new_token_df)

  }

#' Extract block coordinates from labelme files
#'
#' @description This is a specialized function for use in connection
#' with text reordering. It takes the output from the image
#' annotation tool 'Labelme' <https://github.com/wkentaro/labelme>
#' and turns it into a one-row data frame compatible with other
#' 'daiR' functions for text reordering such as
#' \code{reassign_tokens2()}. See package vignette on text reconstruction
#' for details.
#'
#' @param json a json file generated by 'Labelme'
#' @param page the number of the annotated page
#' @return a data frame with location coordinates for the rectangle
#' marked in 'Labelme'.
#' @export
#'
#' @examples
#' \dontrun{
#' new_block <- from_labelme("document1_blocks.json")
#' new_block <- from_labelme("document5_blocks.json", 5)
#' }

from_labelme <- function(json,
                         page = 1
                         ) {

  # checks
  if (!(is_json(json))) {
    stop("Input file not .json.")
    }

  if (!(length(page) == 1 && is.numeric(page) && round(page) == page && page > 0)) {
    stop("Invalid page parameter.")
    }

  # extract the coordinates
  info <- jsonlite::fromJSON(json)

  height <- info[[6]]

  width <- info[[7]]

  left <- info[[3]][[2]][[1]][1, 1] / width

  right <- info[[3]][[2]][[1]][2, 1] / width

  top <- info[[3]][[2]][[1]][1, 2] / height

  bottom <- info[[3]][[2]][[1]][2, 2] / height

  block <- info[[3]][[1]]

  blockdata <- data.frame(page = page,
                          block = as.numeric(block),
                          conf = NA,
                          left = left,
                          right = right,
                          top = top,
                          bottom = bottom
                          )

  return(blockdata)

  }


#' Inspect revised block bounding boxes
#'
#' @description Tool to visually check the order of block bounding boxes after
#' manual processing (e.g. block reordering or splitting). Takes as its main
#' input a token dataframe generated with \code{build_token_df()},
#' \code{reassign_tokens()}, or \code{reassign_tokens2()}.
#' The function plots the block bounding boxes onto images of the submitted
#' document. Generates an annotated .png file for each page in the
#' original document.
#'
#' @param json filepath of a JSON file obtained using \code{dai_async()}
#' @param token_df a token data frame generated with \code{build_token_df()},
#' \code{reassign_tokens()}, or \code{reassign_tokens2()}.
#' @param dir path to the desired output directory.
#' @return no return value, called for side effects
#'
#' @details Not vectorized, but documents can be multi-page.
#'
#' @export
#' @examples
#' \dontrun{
#' redraw_blocks("pdf_output.json", revised_token_df, dir = tempdir())
#' }

redraw_blocks <- function(json,
                          token_df,
                          dir = getwd()
                          ) {
  # checks
  if (length(json) > 1) {
    stop("Invalid json input. This function is not vectorised.")
  }

  if (!(is.character(json))) {
    stop("Invalid json input.")
  }

  if (!(is_json(json))) {
    stop("Input 'json' not .json.")
  }

  if (!(is.data.frame(token_df))) {
    stop("token_df not a data frame.")
  }

  if (!(identical(colnames(token_df),
                  c("token", "start_ind", "end_ind", "conf", "left", "right", "top", "bottom", "page", "block")))) {
    stop("Token dataframe format not recognized.")
  }

  # parse the json
  parsed <- jsonlite::fromJSON(json)

  # create a list with pagewise sets of block boundary box coordinates
  pages_blocks <- split(token_df, token_df$page)

  pagewise_block_sets <- list()
  for (i in seq_along(pages_blocks)) {
    blocks <- split(pages_blocks[[i]], pages_blocks[[i]]$block)
    block_coords <- list()
    for (j in seq_along(blocks)) {
      token_sub_df <- blocks[[j]]
      left <- min(token_sub_df$left)
      right <- max(token_sub_df$right)
      top <- min(token_sub_df$top)
      bottom <- max(token_sub_df$bottom)
      x <- c(left, right, right, left)
      y <- c(top, top, bottom, bottom)
      coords <- list(data.frame(x, y))
      block_coords <- append(block_coords, coords)
    }
    pagewise_block_sets <- append(pagewise_block_sets, list(block_coords))
  }

  # Get vector of base64-encoded images
  page_imgs <- parsed$pages$image$content

  # loop over the pagewise sets
  for (i in seq_along(pagewise_block_sets)) {

    # decode base64
    path <- file.path(tempdir(), glue::glue("page{i}.jpg"))
    outconn <- file(path, "wb")
    base64enc::base64decode(page_imgs[i], outconn)
    close(outconn)

    # read image into magick
    img_decoded <- magick::image_read(path)

    # get image dimensions
    info <- magick::image_info(img_decoded)

    # prepare for plotting on image
    canvas <- magick::image_draw(img_decoded)

    # set counter for box number
    counter <- 1

    #loop over boxes on the page
    for (box in pagewise_block_sets[[i]]) {

      # transform from relative to absolute coordinates
      box$x1 <- box$x * info$width

      box$y1 <- box$y * info$height

      # draw polygon
      graphics::polygon(x = box$x1,
                        y = box$y1,
                        border = "red",
                        lwd = 3
      )

      graphics::text(x = box$x1[1],
                     y = box$y1[1],
                     label = counter,
                     cex = 4,
                     col = "blue",
                     family = "Liberation Sans"
      )

      counter <- counter + 1

    }

    # write annotated image to file

    filename <- glue::glue("page{i}_blocks_revised.png")

    dest <- file.path(dir, filename)

    magick::image_write(canvas, format = "png", dest)

    grDevices::dev.off()

  }

  pages <- length(pages_blocks)

  message(glue::glue("Generated {pages} annotated image(s)."))

}

#' Get vertices
#'
#' @description Helper function for extracting coordinates from DAI reponse objects
#'
#' @param lst a list
#'
#' @noRd

get_vertices <- function(lst) {
    boxes <- purrr::map(lst, ~.x$layout$boundingPoly$normalizedVertices)
    return(boxes)
}

#' Transpose blocks
#'
#' @description Helper function for converting coordinates from DAI reponse objects
#'
#' @param lst a list
#'
#' @noRd
transpose_block <- function(lst) {
    xs <- c(lst[[1]][["x"]], lst[[2]][["x"]], lst[[3]][["x"]], lst[[4]][["x"]])
    ys <- c(lst[[1]][["y"]], lst[[2]][["y"]], lst[[3]][["y"]], lst[[4]][["y"]])
    df <- data.frame(xs, ys)
    return(df)
}

#' Transpose page
#'
#' @description Helper function for converting coordinates from DAI reponse objects
#'
#' @param x a list element in a response object converted with httr::content().
#' 
#' @noRd

transpose_page <- function(x) {
    blocks <- purrr::map(x, transpose_block)
    return(blocks)
}

Try the daiR package in your browser

Any scripts or data that you put into this service are public.

daiR documentation built on Sept. 8, 2023, 5:43 p.m.