test_process_output.R
In daiR: Interface with Google Cloud Document AI API

## OBJECT TYPES ----------------------------------------------------------------

null <- NULL
na <- NA
boolean <- TRUE
number_random <- sample(1:1000, 1)
string_random <- paste0(sample(letters, 5), collapse = "")
vector_strings <- c("foo", "bar")
list_strings <- list("foo", "bar")
df <- mtcars
matrix <- as.matrix(mtcars)

# correct but irrelevant JSON file
fill <- list("a" = 1, "b" = 2) 
json <- jsonlite::toJSON(fill)
madeup_json_file <- tempfile(fileext = ".json")
write(json, madeup_json_file)

## BUILD_TOKEN_DF --------------------------------------------------------------

test_that("build_token_df() warns of input errors", {
  expect_error(build_token_df(type = null), "Invalid type parameter.")
  expect_error(build_token_df(type = na), "Invalid type parameter.")
  expect_error(build_token_df(type = boolean), "Invalid type parameter.")
  expect_error(build_token_df(type = number_random), "Invalid type parameter.")
  expect_error(build_token_df(type = string_random), "Invalid type parameter.")
  expect_error(build_token_df(type = vector_strings), "Invalid type parameter.")
  expect_error(build_token_df(type = list_strings), "Invalid type parameter.")
  expect_error(build_token_df(type = df), "Invalid type parameter.")
  expect_error(build_token_df(type = matrix), "Invalid type parameter.")
  
  expect_error(build_token_df(type = "sync", output = null), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = na), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = boolean), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = number_random), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = string_random), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = vector_strings), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = list_strings), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = df), "Invalid output parameter.")
  expect_error(build_token_df(type = "sync", output = matrix), "Invalid output parameter.")
})

test_that("build_token_df() warns of files not containing tokens", {
  expect_error(build_token_df(type = "async", output = madeup_json_file), "JSON not in right format. Is it from DAI?")
  # from DAI but blank:
  blank <- testthat::test_path("examples", "output_blank.json")
  expect_error(build_token_df(type = "async", output = blank), "DAI found no tokens. Was the document blank?")
})

test_that("build_token_df() builds a token dataframe", {
  json <- testthat::test_path("examples", "output.json")
  # is df:
  df <- build_token_df(type = "async", output = json)
  expect_true(is.data.frame(df))
  # has right properties:
  expect_equal(ncol(df), 10)
  expect_setequal(colnames(df), c("token", "start_ind", "end_ind", "conf", "left", "right", "top", "bottom", "page", "block"))
  expect_true(is.character(df$token))
  expect_true(is.numeric(df$left))
  expect_lt(max(df$right, na.rm = TRUE), 1)
  expect_gt(min(df$top, na.rm = TRUE), 0)
  jsonlist <- jsonlite::fromJSON(json)
  expect_equal(max(df$page), nrow(jsonlist[["pages"]]))
  words <- ngram::wordcount(text_from_dai_file(json))
  expect_lt(nrow(df), 1.5*words)
  expect_gt(nrow(df), 0.5*words)
  expect_false(is.unsorted(df$start_ind))
})

## BUILD_BLOCK_DF --------------------------------------------------------------

test_that("build_block_df() warns of input errors", {
  expect_error(build_block_df(type = null), "Invalid type parameter.")
  expect_error(build_block_df(type = na), "Invalid type parameter.")
  expect_error(build_block_df(type = boolean), "Invalid type parameter.")
  expect_error(build_block_df(type = number_random), "Invalid type parameter.")
  expect_error(build_block_df(type = string_random), "Invalid type parameter.")
  expect_error(build_block_df(type = vector_strings), "Invalid type parameter.")
  expect_error(build_block_df(type = list_strings), "Invalid type parameter.")
  expect_error(build_block_df(type = df), "Invalid type parameter.")
  expect_error(build_block_df(type = matrix), "Invalid type parameter.")
  
  expect_error(build_block_df(type = "sync", output = null), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = na), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = boolean), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = number_random), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = string_random), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = vector_strings), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = list_strings), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = df), "Invalid output parameter.")
  expect_error(build_block_df(type = "sync", output = matrix), "Invalid output parameter.")
})

test_that("build_block_df() warns of files not containing blocks", {
  expect_error(build_block_df(type = "async", output = madeup_json_file), "JSON not in right format. Is it from DAI?")
  # from DAI but blank:
  blank <- testthat::test_path("examples", "output_blank.json")
  expect_error(build_block_df(type = "async", output = blank), "DAI found no blocks. Was the document blank?")
})

test_that("build_block_df() builds a block dataframe", {
  json <- testthat::test_path("examples", "output.json")
  # is df:
  df <- build_block_df(type = "async", output = json)
  expect_true(is.data.frame(df))
  # has right properties:
  expect_equal(ncol(df), 7)
  expect_setequal(colnames(df), c("page", "block", "conf", "left", "right", "top", "bottom"))
  expect_true(is.numeric(df$left))
  expect_lt(max(df$right, na.rm = TRUE), 1)
  expect_gt(min(df$top, na.rm = TRUE), 0)
  jsonlist <- jsonlite::fromJSON(json)
  expect_equal(max(df$page), nrow(jsonlist[["pages"]]))
  expect_false(is.unsorted(df$block))
})

## SPLIT_BLOCK -----------------------------------------------------------------

test_that("split_block() warns of input errors", {
  expect_error(split_block(block_df = null), "Input not a data frame.")
  expect_error(split_block(block_df = na), "Input not a data frame.")
  expect_error(split_block(block_df = boolean), "Input not a data frame.")
  expect_error(split_block(block_df = number_random), "Input not a data frame.")
  expect_error(split_block(block_df = string_random), "Input not a data frame.")
  expect_error(split_block(block_df = vector_strings), "Input not a data frame.")
  expect_error(split_block(block_df = list_strings), "Input not a data frame.")
  #expect_error(split_block(block_df = df), "Dataframe not recognized. Was it made with build_block_df?")
  expect_error(split_block(block_df = matrix), "Input not a data frame.")
  json <- testthat::test_path("examples", "output.json")
  df <- build_block_df(type = "async", output = json)
  expect_error(split_block(block_df = df, page = 1-2), "Invalid page parameter.")
  expect_error(split_block(block_df = df, page = "one"), "Invalid page parameter.")
  expect_error(split_block(block_df = df, page = c(1,2)), "Invalid page parameter.")
  expect_error(split_block(block_df = df, page = 10), "No such page number in this dataframe.")
  expect_error(split_block(block_df = df, block = 1-2), "Invalid block parameter.")
  expect_error(split_block(block_df = df, block = "one"), "Invalid block parameter.")
  expect_error(split_block(block_df = df, block = c(1,2)), "Invalid block parameter.")
  expect_error(split_block(block_df = df, block = 50), "No such block number on this page.")
  expect_error(split_block(block_df = df, block = 1, cut_point = "middle"), "Invalid cut point parameter.")
  expect_error(split_block(block_df = df, block = 1, cut_point = 150), "Cut point out of range.")
  expect_error(split_block(block_df = df, block = 1, cut_point = 50, direction = 1), "Invalid direction parameter.")
  expect_error(split_block(block_df = df, block = 1, cut_point = 50, direction = "horizontal"), 'Split direction must be either "h" or "v".')
})

test_that("split_block() returns a revised block dataframe", {
  json <- testthat::test_path("examples", "output.json")
  df <- build_block_df(type = "async", output = json)
  # choose random block and cut point:
  block <- sample(min(df$block):max(df$block), 1)
  cut_point <- sample(1:99, 1)
  # check that output df has one block more:
  new_df <- split_block(df, block = block, cut_point = cut_point)
  expect_equal(max(new_df$block), max(df$block) + 1)
  # check that the selected block is now different while others are similar:
  old_block_n <- df[df$block == block,]
  new_block_n <- new_df[new_df$block == block,]
  #expect_false(isTRUE(all.equal(old_block_n, new_block_n))) # strangely fails on ci
  rest_old_df <- df[!df$block == block,]
  new_df_ex_last <- new_df[!new_df$block == max(new_df$block),]
  rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
  rownames(rest_new_df) <- NULL
  rownames(rest_old_df) <- NULL
  expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))

  ### same process but for block in middle of multipage doc:
  json <- testthat::test_path("examples", "sample3pg.json")
  df <- build_block_df(type = "async", output = json)
  df_sub <- df[df$page == 2,]
  # choose random block and cut point:
  block <- sample(min(df_sub$block):max(df_sub$block), 1)
  cut_point <- sample(1:99, 1)
  # check that output df has one block more:
  new_df <- split_block(df, block = block, cut_point = cut_point, page = 2)
  expect_equal(nrow(new_df), nrow(df) + 1)
  # check that the selected block is now different while others are similar:
  new_df_sub <- new_df[new_df$page == 2,]
  old_block_n <- df_sub[df_sub$block == block,]
  new_block_n <- new_df_sub[new_df_sub$block == block,]
  expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
  rest_old_df <- df_sub[!df_sub$block == block,]
  new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
  rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
  rownames(rest_new_df) <- NULL
  rownames(rest_old_df) <- NULL
  expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))

  ### same process but for block at end of multipage doc:
  json <- testthat::test_path("examples", "sample3pg.json")
  df <- build_block_df(type = "async", output = json)
  df_sub <- df[df$page == 3,]
  # choose random block and cut point:
  block <- sample(min(df_sub$block):max(df_sub$block), 1)
  cut_point <- sample(1:99, 1)
  # check that output df has one block more:
  new_df <- split_block(df, block = block, cut_point = cut_point, page = 3)
  expect_equal(nrow(new_df), nrow(df) + 1)
  # check that the selected block is now different while others are similar:
  new_df_sub <- new_df[new_df$page == 3,]
  old_block_n <- df_sub[df_sub$block == block,]
  new_block_n <- new_df_sub[new_df_sub$block == block,]
  expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
  rest_old_df <- df_sub[!df_sub$block == block,]
  new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
  rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
  rownames(rest_new_df) <- NULL
  rownames(rest_old_df) <- NULL
  expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))

  ### same process, but for horizontal split:
  json <- testthat::test_path("examples", "sample3pg.json")
  df <- build_block_df(type = "async", output = json)
  # choose random block and cut point:
  block <- sample(min(df$block):max(df$block), 1)
  cut_point <- sample(1:99, 1)
  # check that output df has one block more:
  new_df <- split_block(df, block = block, cut_point = cut_point, direction = "h")
  expect_equal(max(new_df$block), max(df$block) + 1)
  # check that the selected block is now different while others are similar:
  old_block_n <- df[df$block == block,]
  new_block_n <- new_df[new_df$block == block,]
  expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
  rest_old_df <- df[!df$block == block,]
  new_df_ex_last <- new_df[!new_df$block == max(new_df$block),]
  rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
  rownames(rest_new_df) <- NULL
  rownames(rest_old_df) <- NULL
  expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))

  ### same process but for block in middle of multipage doc:
  json <- testthat::test_path("examples", "sample3pg.json")
  df <- build_block_df(type = "async", output = json)
  df_sub <- df[df$page == 2,]
  # choose random block and cut point:
  block <- sample(min(df_sub$block):max(df_sub$block), 1)
  cut_point <- sample(1:99, 1)
  # check that output df has one block more:
  new_df <- split_block(df, block = block, cut_point = cut_point, page = 2, direction = "h")
  expect_equal(nrow(new_df), nrow(df) + 1)
  # check that the selected block is now different while others are similar:
  new_df_sub <- new_df[new_df$page == 2,]
  old_block_n <- df_sub[df_sub$block == block,]
  new_block_n <- new_df_sub[new_df_sub$block == block,]
  expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
  rest_old_df <- df_sub[!df_sub$block == block,]
  new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
  rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
  rownames(rest_new_df) <- NULL
  rownames(rest_old_df) <- NULL
  expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))

  ### same process but for block at end of multipage doc:
  json <- testthat::test_path("examples", "sample3pg.json")
  df <- build_block_df(type = "async", output = json)
  df_sub <- df[df$page == 3,]
  # choose random block and cut point:
  block <- sample(min(df_sub$block):max(df_sub$block), 1)
  cut_point <- sample(1:99, 1)
  # check that output df has one block more:
  new_df <- split_block(df, block = block, cut_point = cut_point, page = 3, direction = "h")
  expect_equal(nrow(new_df), nrow(df) + 1)
  # check that the selected block is now different while others are similar:
  new_df_sub <- new_df[new_df$page == 3,]
  old_block_n <- df_sub[df_sub$block == block,]
  new_block_n <- new_df_sub[new_df_sub$block == block,]
  expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
  rest_old_df <- df_sub[!df_sub$block == block,]
  new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
  rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
  rownames(rest_new_df) <- NULL
  rownames(rest_old_df) <- NULL
  expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
})

## REASSIGN_TOKENS -------------------------------------------------------------

test_that("reassign_tokens() warns of input errors", {
  expect_error(reassign_tokens(token_df = null), "token_df not a data frame.")
  expect_error(reassign_tokens(token_df = na), "token_df not a data frame.")
  expect_error(reassign_tokens(token_df = boolean), "token_df not a data frame.")
  expect_error(reassign_tokens(token_df = number_random), "token_df not a data frame.")
  expect_error(reassign_tokens(token_df = string_random), "token_df not a data frame.")
  expect_error(reassign_tokens(token_df = vector_strings), "token_df not a data frame.")
  expect_error(reassign_tokens(token_df = list_strings), "token_df not a data frame.")
  #expect_error(reassign_tokens(token_df = df), "Token dataframe not recognized. Was it made with build_token_df?")
  expect_error(reassign_tokens(token_df = matrix), "token_df not a data frame.")
  json <- testthat::test_path("examples", "output.json")
  df <- build_token_df(type = "async", output = json)
  expect_error(reassign_tokens(df, block_df = null), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = na), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = boolean), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = number_random), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = string_random), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = vector_strings), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = list_strings), "block_df not a data frame.")
  expect_error(reassign_tokens(df, block_df = df), "Block dataframe not recognized. Was it made with build_block_df?")
  expect_error(reassign_tokens(df, block_df = matrix), "block_df not a data frame.")
})

test_that("reassign_tokens() returns a revised token dataframe", {
  # first get sample token df:
  json <- testthat::test_path("examples", "output.json")
  tdf_old <- build_token_df(type = "async", output = json)
  # then use split blocks to make a revised block dataframe:
    # start by making regular block df:
    json <- testthat::test_path("examples", "output.json")
    bdf_old <- build_block_df(type = "async", output = json)
    # then split a random block (with a minimum of 10 words) to make a new block_df:
    blocks <- list()
    for (i in 1:max(tdf_old$block, na.rm=TRUE)) {
      block <- list(tdf_old[tdf_old$block == i,])
      blocks <- append(blocks, block)
    }
    sizes <- data.frame(block = numeric(), words = numeric())
    for (i in 1:max(tdf_old$block, na.rm=TRUE)) {
      ntokens <- nrow(blocks[[i]])
      entry <- data.frame(block = i, words = ntokens)
      sizes <- rbind(sizes, entry)
    }
    b10plus <- sizes$block[sizes$words >= 10]
    to_split <- sample(b10plus, 1)
    bdf_new <- split_block(bdf_old, block = to_split, cut_point = sample(1:99, 1))
  # then reassign tokens:
  tdf_new <- reassign_tokens(tdf_old, bdf_new)
  # now check:
  expect_true(is.data.frame(tdf_new))
  #expect_equal(nrow(tdf_new), nrow(tdf_old)) # Not necessarily equal bc a word
  # split in middle (as result of random split) can be counted twice
  expect_true(identical(colnames(tdf_new), colnames(tdf_old)))
  expect_false(isTRUE(all.equal(tdf_new, tdf_old)))
  expect_true(is.unsorted(tdf_new$start_ind, na.rm = TRUE))
  expect_false(is.unsorted(tdf_old$start_ind, na.rm = TRUE))
})

## REASSIGN_TOKENS2 ------------------------------------------------------------

test_that("reassign_tokens2() warns of input errors", {
  expect_error(reassign_tokens2(token_df = null), "token_df not a data frame.")
  expect_error(reassign_tokens2(token_df = na), "token_df not a data frame.")
  expect_error(reassign_tokens2(token_df = boolean), "token_df not a data frame.")
  expect_error(reassign_tokens2(token_df = number_random), "token_df not a data frame.")
  expect_error(reassign_tokens2(token_df = string_random), "token_df not a data frame.")
  expect_error(reassign_tokens2(token_df = vector_strings), "token_df not a data frame.")
  expect_error(reassign_tokens2(token_df = list_strings), "token_df not a data frame.")
  #expect_error(reassign_tokens2(token_df = df), "Token dataframe not recognized. Was it made with build_token_df?")
  expect_error(reassign_tokens2(token_df = matrix), "token_df not a data frame.")
  json <- testthat::test_path("examples", "output.json")
  tdf <- build_token_df(type = "async", output = json)
  expect_error(reassign_tokens2(tdf, block = null), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = na), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = boolean), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = number_random), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = string_random), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = vector_strings), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = list_strings), "block input not a data frame.")
  expect_error(reassign_tokens2(tdf, block = df), "Block dataframe format not recognized.")
  expect_error(reassign_tokens2(tdf, block = matrix), "block input not a data frame.")
  bdf <- build_block_df(type = "async", output = json)
  bdf <- bdf[1,]
  expect_error(reassign_tokens2(tdf, bdf, page = 1-2), "Invalid page parameter.")
  expect_error(reassign_tokens2(tdf, bdf, page = "one"), "Invalid page parameter.")
  expect_error(reassign_tokens2(tdf, bdf, page = c(1,2)), "Invalid page parameter.")
  expect_error(reassign_tokens2(tdf, bdf, page = 10), "No such page number in this dataframe.")
})

test_that("reassign_tokens2() returns a revised token dataframe", {
  # make token df:
  json <- testthat::test_path("examples", "peshtigo.json")
  tdf_old <- build_token_df(type = "async", output = json)
  # make block df with from_labelme():
  json <- testthat::test_path("examples", "peshtigo_labelme.json")
  block <- from_labelme(json)
  # then reassign:
  tdf_new <- reassign_tokens2(tdf_old, block)
  # now check:
  expect_true(is.data.frame(tdf_new))
  expect_equal(max(tdf_new$block, na.rm = TRUE), max(tdf_old$block, na.rm = TRUE) + 1)
  # expect_equal(nrow(tdf_new), nrow(tdf_old)) Not necessarily equal bc a word
  # split in middle (as result of random split) can be counted twice
  expect_true(identical(colnames(tdf_new), colnames(tdf_old)))
  expect_false(isTRUE(all.equal(tdf_new, tdf_old)))
  expect_false(is.unsorted(tdf_old$start_ind, na.rm = TRUE))
  # expect_true(is.unsorted(tdf_new$start_ind, na.rm = TRUE)) 
  # not necessarily true if the block split is the last one
})

## FROM_LABELME ------------------------------------------------------------

test_that("from_labelme() warns of input errors", {
  expect_error(from_labelme(null), "Input file not .json.")
  expect_error(from_labelme(na), "Input file not .json.")
  expect_error(from_labelme(boolean), "Input file not .json.")
  expect_error(from_labelme(number_random), "Input file not .json.")
  expect_error(from_labelme(string_random), "Input file not .json.")
  expect_error(from_labelme(vector_strings), "Input file not .json.")
  expect_error(from_labelme(list_strings), "Input file not .json.")
  expect_error(from_labelme(df), "Input file not .json.")
  expect_error(from_labelme(matrix), "Input file not .json.")
  expect_error(from_labelme("wrongfile.csv"), "Input file not .json.")
  expect_error(from_labelme("madeup.json"), "Input file not .json.")
  json <- testthat::test_path("examples", "peshtigo_labelme.json")
  expect_error(from_labelme(json, page = 1-2), "Invalid page parameter.")
  expect_error(from_labelme(json, page = "one"), "Invalid page parameter.")
  expect_error(from_labelme(json, page = c(1, 2)), "Invalid page parameter.")
})

test_that("from_labelme() produces a properly formatted df", {
  json <- testthat::test_path("examples", "peshtigo_labelme.json")
  df <- from_labelme(json)
  expect_true(is.data.frame(df))
  #expect_true(identical(colnames(df), c("page", "block", "left", "right", "top", "bottom")))
  expect_true(is.numeric(df$block))
  expect_true(is.numeric(df$left))
  expect_lt(max(df$right, na.rm = TRUE), 1)
  expect_gt(min(df$top, na.rm = TRUE), 0)
})

unlink(madeup_json_file, force = TRUE)
Any scripts or data that you put into this service are public.
daiR documentation built on Sept. 8, 2023, 5:43 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
daiR
Interface with Google Cloud Document AI API

tests/testthat/test_process_output.R
In daiR: Interface with Google Cloud Document AI API

Try the daiR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

daiR Interface with Google Cloud Document AI API

tests/testthat/test_process_output.R In daiR: Interface with Google Cloud Document AI API

Try the daiR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

daiR
Interface with Google Cloud Document AI API

tests/testthat/test_process_output.R
In daiR: Interface with Google Cloud Document AI API