## OBJECT TYPES ----------------------------------------------------------------
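# reusable objects representing invalid or irrelevant inputs for the error tests below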
null <- NULL
na <- NA
boolean <- TRUE
number_random <- sample(1:1000, 1)
string_random <- paste0(sample(letters, 5), collapse = "")
vector_strings <- c("foo", "bar")
list_strings <- list("foo", "bar")
df <- mtcars
matrix <- as.matrix(mtcars)
# valid JSON, but not DAI output (used to test the format checks)
fill <- list("a" = 1, "b" = 2)
json <- jsonlite::toJSON(fill)
madeup_json_file <- tempfile(fileext = ".json")
write(json, madeup_json_file)
## BUILD_TOKEN_DF --------------------------------------------------------------
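# build_token_df() converts DAI output JSON into a one-row-per-token dataframe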
test_that("build_token_df() warns of input errors", {
expect_error(build_token_df(type = null), "Invalid type parameter.")
expect_error(build_token_df(type = na), "Invalid type parameter.")
expect_error(build_token_df(type = boolean), "Invalid type parameter.")
expect_error(build_token_df(type = number_random), "Invalid type parameter.")
expect_error(build_token_df(type = string_random), "Invalid type parameter.")
expect_error(build_token_df(type = vector_strings), "Invalid type parameter.")
expect_error(build_token_df(type = list_strings), "Invalid type parameter.")
expect_error(build_token_df(type = df), "Invalid type parameter.")
expect_error(build_token_df(type = matrix), "Invalid type parameter.")
expect_error(build_token_df(type = "sync", object = null), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = na), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = boolean), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = number_random), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = string_random), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = vector_strings), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = list_strings), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = df), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = matrix), "Invalid object parameter.")
})
test_that("build_token_df() warns of files not containing tokens", {
expect_error(build_token_df(type = "async", object = madeup_json_file), "JSON not in right format. Is it from DAI?")
# from DAI but blank:
blank <- testthat::test_path("examples", "output_blank.json")
expect_error(build_token_df(type = "async", object = blank), "DAI found no tokens. Was the document blank?")
})
test_that("build_token_df() builds a token dataframe", {
json <- testthat::test_path("examples", "output.json")
# is df:
df <- build_token_df(type = "async", object = json)
expect_true(is.data.frame(df))
# has right properties:
expect_equal(ncol(df), 10)
expect_setequal(colnames(df), c("token", "start_ind", "end_ind", "conf", "left", "right", "top", "bottom", "page", "block"))
expect_true(is.character(df$token))
expect_true(is.numeric(df$left))
expect_lt(max(df$right, na.rm = TRUE), 1)
expect_gt(min(df$top, na.rm = TRUE), 0)
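# the page count in the dataframe should match the number of pages in the raw JSON: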
jsonlist <- jsonlite::fromJSON(json)
expect_equal(max(df$page), nrow(jsonlist[["pages"]]))
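# the number of token rows should be within 50% of the word count of the extracted text: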
words <- ngram::wordcount(get_text(json, type = "async"))
expect_lt(nrow(df), 1.5*words)
expect_gt(nrow(df), 0.5*words)
expect_false(is.unsorted(df$start_ind))
})
## BUILD_BLOCK_DF --------------------------------------------------------------
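# build_block_df() converts DAI output JSON into a one-row-per-block dataframe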
test_that("build_block_df() warns of input errors", {
expect_error(build_block_df(type = null), "Invalid type parameter.")
expect_error(build_block_df(type = na), "Invalid type parameter.")
expect_error(build_block_df(type = boolean), "Invalid type parameter.")
expect_error(build_block_df(type = number_random), "Invalid type parameter.")
expect_error(build_block_df(type = string_random), "Invalid type parameter.")
expect_error(build_block_df(type = vector_strings), "Invalid type parameter.")
expect_error(build_block_df(type = list_strings), "Invalid type parameter.")
expect_error(build_block_df(type = df), "Invalid type parameter.")
expect_error(build_block_df(type = matrix), "Invalid type parameter.")
expect_error(build_block_df(type = "sync", object = null), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = na), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = boolean), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = number_random), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = string_random), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = vector_strings), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = list_strings), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = df), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = matrix), "Invalid object parameter.")
})
test_that("build_block_df() warns of files not containing blocks", {
expect_error(build_block_df(type = "async", object = madeup_json_file), "JSON not in right format. Is it from DAI?")
# from DAI but blank:
blank <- testthat::test_path("examples", "output_blank.json")
expect_error(build_block_df(type = "async", object = blank), "DAI found no blocks. Was the document blank?")
})
test_that("build_block_df() builds a block dataframe", {
json <- testthat::test_path("examples", "output.json")
# is df:
df <- build_block_df(type = "async", object = json)
expect_true(is.data.frame(df))
# has right properties:
expect_equal(ncol(df), 7)
expect_setequal(colnames(df), c("page", "block", "conf", "left", "right", "top", "bottom"))
expect_true(is.numeric(df$left))
expect_lt(max(df$right, na.rm = TRUE), 1)
expect_gt(min(df$top, na.rm = TRUE), 0)
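# the page count should match the raw JSON, and block numbers should be in order: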
jsonlist <- jsonlite::fromJSON(json)
expect_equal(max(df$page), nrow(jsonlist[["pages"]]))
expect_false(is.unsorted(df$block))
})
## SPLIT_BLOCK -----------------------------------------------------------------
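# split_block() splits one block in a block dataframe into two at a given cut point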
test_that("split_block() warns of input errors", {
expect_error(split_block(block_df = null), "Input not a data frame.")
expect_error(split_block(block_df = na), "Input not a data frame.")
expect_error(split_block(block_df = boolean), "Input not a data frame.")
expect_error(split_block(block_df = number_random), "Input not a data frame.")
expect_error(split_block(block_df = string_random), "Input not a data frame.")
expect_error(split_block(block_df = vector_strings), "Input not a data frame.")
expect_error(split_block(block_df = list_strings), "Input not a data frame.")
#expect_error(split_block(block_df = df), "Dataframe not recognized. Was it made with build_block_df?")
expect_error(split_block(block_df = matrix), "Input not a data frame.")
json <- testthat::test_path("examples", "output.json")
df <- build_block_df(type = "async", object = json)
expect_error(split_block(block_df = df, page = 1-2), "Invalid page parameter.")
expect_error(split_block(block_df = df, page = "one"), "Invalid page parameter.")
expect_error(split_block(block_df = df, page = c(1,2)), "Invalid page parameter.")
expect_error(split_block(block_df = df, page = 10), "No such page number in this dataframe.")
expect_error(split_block(block_df = df, block = 1-2), "Invalid block parameter.")
expect_error(split_block(block_df = df, block = "one"), "Invalid block parameter.")
expect_error(split_block(block_df = df, block = c(1,2)), "Invalid block parameter.")
expect_error(split_block(block_df = df, block = 50), "No such block number on this page.")
expect_error(split_block(block_df = df, block = 1, cut_point = "middle"), "Invalid cut point parameter.")
expect_error(split_block(block_df = df, block = 1, cut_point = 150), "Cut point out of range.")
expect_error(split_block(block_df = df, block = 1, cut_point = 50, direction = 1), "Invalid direction parameter.")
expect_error(split_block(block_df = df, block = 1, cut_point = 50, direction = "horizontal"), 'Split direction must be either "h" or "v".')
})
test_that("split_block() returns a revised block dataframe", {
json <- testthat::test_path("examples", "output.json")
df <- build_block_df(type = "async", object = json)
# choose random block and cut point:
block <- sample(min(df$block):max(df$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point)
expect_equal(max(new_df$block), max(df$block) + 1)
# check that the selected block is now different while others are similar:
old_block_n <- df[df$block == block,]
new_block_n <- new_df[new_df$block == block,]
#expect_false(isTRUE(all.equal(old_block_n, new_block_n))) # strangely fails on CI
rest_old_df <- df[!df$block == block,]
new_df_ex_last <- new_df[!new_df$block == max(new_df$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block in middle of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 2,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 2)
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 2,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block at end of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 3,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 3)
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 3,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process, but for horizontal split:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
# choose random block and cut point:
block <- sample(min(df$block):max(df$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, direction = "h")
expect_equal(max(new_df$block), max(df$block) + 1)
# check that the selected block is now different while others are similar:
old_block_n <- df[df$block == block,]
new_block_n <- new_df[new_df$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df[!df$block == block,]
new_df_ex_last <- new_df[!new_df$block == max(new_df$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block in middle of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 2,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 2, direction = "h")
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 2,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block at end of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 3,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 3, direction = "h")
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 3,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
})
## REASSIGN_TOKENS -------------------------------------------------------------
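# reassign_tokens() reassigns tokens to blocks based on a revised block dataframe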
test_that("reassign_tokens() warns of input errors", {
expect_error(reassign_tokens(token_df = null), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = na), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = boolean), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = number_random), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = string_random), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = vector_strings), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = list_strings), "token_df not a data frame.")
#expect_error(reassign_tokens(token_df = df), "Token dataframe not recognized. Was it made with build_token_df?")
expect_error(reassign_tokens(token_df = matrix), "token_df not a data frame.")
json <- testthat::test_path("examples", "output.json")
df <- build_token_df(type = "async", object = json)
expect_error(reassign_tokens(df, block_df = null), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = na), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = boolean), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = number_random), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = string_random), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = vector_strings), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = list_strings), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = df), "Block dataframe not recognized. Was it made with build_block_df?")
expect_error(reassign_tokens(df, block_df = matrix), "block_df not a data frame.")
})
test_that("reassign_tokens() returns a revised token dataframe", {
# first get sample token df:
json <- testthat::test_path("examples", "output.json")
tdf_old <- build_token_df(type = "async", object = json)
# then use split blocks to make a revised block dataframe:
# start by making regular block df:
json <- testthat::test_path("examples", "output.json")
bdf_old <- build_block_df(type = "async", object = json)
# then split a random block (with a minimum of 10 words) to make a new block_df:
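# count the tokens in each block so the draw can be limited to blocks with at least 10 tokens: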
blocks <- list()
for (i in 1:max(tdf_old$block, na.rm=TRUE)) {
block <- list(tdf_old[tdf_old$block == i,])
blocks <- append(blocks, block)
}
sizes <- data.frame(block = numeric(), words = numeric())
for (i in 1:max(tdf_old$block, na.rm=TRUE)) {
ntokens <- nrow(blocks[[i]])
entry <- data.frame(block = i, words = ntokens)
sizes <- rbind(sizes, entry)
}
b10plus <- sizes$block[sizes$words >= 10]
to_split <- sample(b10plus, 1)
bdf_new <- split_block(bdf_old, block = to_split, cut_point = sample(1:99, 1))
# then reassign tokens:
tdf_new <- reassign_tokens(tdf_old, bdf_new)
# now check:
expect_true(is.data.frame(tdf_new))
#expect_equal(nrow(tdf_new), nrow(tdf_old)) # not necessarily equal, because a word
# split down the middle (as a result of the random split) can be counted twice
expect_true(identical(colnames(tdf_new), colnames(tdf_old)))
expect_false(isTRUE(all.equal(tdf_new, tdf_old)))
expect_true(is.unsorted(tdf_new$start_ind, na.rm = TRUE))
expect_false(is.unsorted(tdf_old$start_ind, na.rm = TRUE))
})
## REASSIGN_TOKENS2 ------------------------------------------------------------
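# reassign_tokens2() reassigns tokens on a given page based on a single block
# (e.g. one produced by from_labelme())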
test_that("reassign_tokens2() warns of input errors", {
expect_error(reassign_tokens2(token_df = null), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = na), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = boolean), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = number_random), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = string_random), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = vector_strings), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = list_strings), "token_df not a data frame.")
#expect_error(reassign_tokens2(token_df = df), "Token dataframe not recognized. Was it made with build_token_df?")
expect_error(reassign_tokens2(token_df = matrix), "token_df not a data frame.")
json <- testthat::test_path("examples", "output.json")
tdf <- build_token_df(type = "async", object = json)
expect_error(reassign_tokens2(tdf, block = null), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = na), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = boolean), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = number_random), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = string_random), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = vector_strings), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = list_strings), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = df), "Block dataframe format not recognized.")
expect_error(reassign_tokens2(tdf, block = matrix), "block input not a data frame.")
bdf <- build_block_df(type = "async", object = json)
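# a single block row to use as the block argument: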
bdf <- bdf[1,]
expect_error(reassign_tokens2(tdf, bdf, page = 1-2), "Invalid page parameter.")
expect_error(reassign_tokens2(tdf, bdf, page = "one"), "Invalid page parameter.")
expect_error(reassign_tokens2(tdf, bdf, page = c(1,2)), "Invalid page parameter.")
expect_error(reassign_tokens2(tdf, bdf, page = 10), "No such page number in this dataframe.")
})
test_that("reassign_tokens2() returns a revised token dataframe", {
# make token df:
json <- testthat::test_path("examples", "peshtigo.json")
tdf_old <- build_token_df(type = "async", object = json)
# make block df with from_labelme():
json <- testthat::test_path("examples", "peshtigo_labelme.json")
block <- from_labelme(json)
# then reassign:
tdf_new <- reassign_tokens2(tdf_old, block)
# now check:
expect_true(is.data.frame(tdf_new))
expect_equal(max(tdf_new$block, na.rm = TRUE), max(tdf_old$block, na.rm = TRUE) + 1)
# expect_equal(nrow(tdf_new), nrow(tdf_old)) # not necessarily equal, because a word
# split down the middle (as a result of the random split) can be counted twice
expect_true(identical(colnames(tdf_new), colnames(tdf_old)))
expect_false(isTRUE(all.equal(tdf_new, tdf_old)))
expect_false(is.unsorted(tdf_old$start_ind, na.rm = TRUE))
# expect_true(is.unsorted(tdf_new$start_ind, na.rm = TRUE))
# not necessarily true if the block split is the last one
})
## FROM_LABELME ------------------------------------------------------------
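# from_labelme() converts a labelme annotation JSON into a block dataframe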
test_that("from_labelme() warns of input errors", {
expect_error(from_labelme(null), "Input file not .json.")
expect_error(from_labelme(na), "Input file not .json.")
expect_error(from_labelme(boolean), "Input file not .json.")
expect_error(from_labelme(number_random), "Input file not .json.")
expect_error(from_labelme(string_random), "Input file not .json.")
expect_error(from_labelme(vector_strings), "Input file not .json.")
expect_error(from_labelme(list_strings), "Input file not .json.")
expect_error(from_labelme(df), "Input file not .json.")
expect_error(from_labelme(matrix), "Input file not .json.")
expect_error(from_labelme("wrongfile.csv"), "Input file not .json.")
expect_error(from_labelme("madeup.json"), "Input file not .json.")
json <- testthat::test_path("examples", "peshtigo_labelme.json")
expect_error(from_labelme(json, page = 1-2), "Invalid page parameter.")
expect_error(from_labelme(json, page = "one"), "Invalid page parameter.")
expect_error(from_labelme(json, page = c(1, 2)), "Invalid page parameter.")
})
test_that("from_labelme() produces a properly formatted df", {
json <- testthat::test_path("examples", "peshtigo_labelme.json")
df <- from_labelme(json)
expect_true(is.data.frame(df))
#expect_true(identical(colnames(df), c("page", "block", "left", "right", "top", "bottom")))
expect_true(is.numeric(df$block))
expect_true(is.numeric(df$left))
expect_lt(max(df$right, na.rm = TRUE), 1)
expect_gt(min(df$top, na.rm = TRUE), 0)
})
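# clean up the temporary JSON file created at the top of this file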
unlink(madeup_json_file, force = TRUE)