## OBJECT TYPES ----------------------------------------------------------------
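# reusable objects representing invalid or irrelevant inputs for the error tests below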
null <- NULL
na <- NA
boolean <- TRUE
number_random <- sample(1:1000, 1)
string_random <- paste0(sample(letters, 5), collapse = "")
vector_strings <- c("foo", "bar")
list_strings <- list("foo", "bar")
df <- mtcars
matrix <- as.matrix(mtcars)
# valid JSON, but not DAI output (used to test the format checks)
fill <- list("a" = 1, "b" = 2)
json <- jsonlite::toJSON(fill)
madeup_json_file <- tempfile(fileext = ".json")
write(json, madeup_json_file)
## BUILD_TOKEN_DF --------------------------------------------------------------
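# build_token_df() converts DAI output JSON into a one-row-per-token dataframe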
test_that("build_token_df() warns of input errors", {
expect_error(build_token_df(type = null), "Invalid type parameter.")
expect_error(build_token_df(type = na), "Invalid type parameter.")
expect_error(build_token_df(type = boolean), "Invalid type parameter.")
expect_error(build_token_df(type = number_random), "Invalid type parameter.")
expect_error(build_token_df(type = string_random), "Invalid type parameter.")
expect_error(build_token_df(type = vector_strings), "Invalid type parameter.")
expect_error(build_token_df(type = list_strings), "Invalid type parameter.")
expect_error(build_token_df(type = df), "Invalid type parameter.")
expect_error(build_token_df(type = matrix), "Invalid type parameter.")
expect_error(build_token_df(type = "sync", object = null), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = na), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = boolean), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = number_random), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = string_random), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = vector_strings), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = list_strings), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = df), "Invalid object parameter.")
expect_error(build_token_df(type = "sync", object = matrix), "Invalid object parameter.")
})
test_that("build_token_df() warns of files not containing tokens", {
expect_error(build_token_df(type = "async", object = madeup_json_file), "JSON not in right format. Is it from DAI?")
# from DAI but blank:
blank <- testthat::test_path("examples", "output_blank.json")
expect_error(build_token_df(type = "async", object = blank), "DAI found no tokens. Was the document blank?")
})
test_that("build_token_df() builds a token dataframe", {
json <- testthat::test_path("examples", "output.json")
# is df:
df <- build_token_df(type = "async", object = json)
expect_true(is.data.frame(df))
# has right properties:
expect_equal(ncol(df), 10)
expect_setequal(colnames(df), c("token", "start_ind", "end_ind", "conf", "left", "right", "top", "bottom", "page", "block"))
expect_true(is.character(df$token))
expect_true(is.numeric(df$left))
expect_lt(max(df$right, na.rm = TRUE), 1)
expect_gt(min(df$top, na.rm = TRUE), 0)
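# the page count in the dataframe should match the number of pages in the raw JSON: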
jsonlist <- jsonlite::fromJSON(json)
expect_equal(max(df$page), nrow(jsonlist[["pages"]]))
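# the number of token rows should be within 50% of the word count of the extracted text: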
words <- ngram::wordcount(get_text(json, type = "async"))
expect_lt(nrow(df), 1.5*words)
expect_gt(nrow(df), 0.5*words)
expect_false(is.unsorted(df$start_ind))
})
## BUILD_BLOCK_DF --------------------------------------------------------------
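# build_block_df() converts DAI output JSON into a one-row-per-block dataframe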
test_that("build_block_df() warns of input errors", {
expect_error(build_block_df(type = null), "Invalid type parameter.")
expect_error(build_block_df(type = na), "Invalid type parameter.")
expect_error(build_block_df(type = boolean), "Invalid type parameter.")
expect_error(build_block_df(type = number_random), "Invalid type parameter.")
expect_error(build_block_df(type = string_random), "Invalid type parameter.")
expect_error(build_block_df(type = vector_strings), "Invalid type parameter.")
expect_error(build_block_df(type = list_strings), "Invalid type parameter.")
expect_error(build_block_df(type = df), "Invalid type parameter.")
expect_error(build_block_df(type = matrix), "Invalid type parameter.")
expect_error(build_block_df(type = "sync", object = null), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = na), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = boolean), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = number_random), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = string_random), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = vector_strings), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = list_strings), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = df), "Invalid object parameter.")
expect_error(build_block_df(type = "sync", object = matrix), "Invalid object parameter.")
})
test_that("build_block_df() warns of files not containing blocks", {
expect_error(build_block_df(type = "async", object = madeup_json_file), "JSON not in right format. Is it from DAI?")
# from DAI but blank:
blank <- testthat::test_path("examples", "output_blank.json")
expect_error(build_block_df(type = "async", object = blank), "DAI found no blocks. Was the document blank?")
})
test_that("build_block_df() builds a block dataframe", {
json <- testthat::test_path("examples", "output.json")
# is df:
df <- build_block_df(type = "async", object = json)
expect_true(is.data.frame(df))
# has right properties:
expect_equal(ncol(df), 7)
expect_setequal(colnames(df), c("page", "block", "conf", "left", "right", "top", "bottom"))
expect_true(is.numeric(df$left))
expect_lt(max(df$right, na.rm = TRUE), 1)
expect_gt(min(df$top, na.rm = TRUE), 0)
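# the page count should match the raw JSON, and block numbers should be in order: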
jsonlist <- jsonlite::fromJSON(json)
expect_equal(max(df$page), nrow(jsonlist[["pages"]]))
expect_false(is.unsorted(df$block))
})
## SPLIT_BLOCK -----------------------------------------------------------------
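# split_block() splits one block in a block dataframe into two at a given cut point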
test_that("split_block() warns of input errors", {
expect_error(split_block(block_df = null), "Input not a data frame.")
expect_error(split_block(block_df = na), "Input not a data frame.")
expect_error(split_block(block_df = boolean), "Input not a data frame.")
expect_error(split_block(block_df = number_random), "Input not a data frame.")
expect_error(split_block(block_df = string_random), "Input not a data frame.")
expect_error(split_block(block_df = vector_strings), "Input not a data frame.")
expect_error(split_block(block_df = list_strings), "Input not a data frame.")
#expect_error(split_block(block_df = df), "Dataframe not recognized. Was it made with build_block_df?")
expect_error(split_block(block_df = matrix), "Input not a data frame.")
json <- testthat::test_path("examples", "output.json")
df <- build_block_df(type = "async", object = json)
expect_error(split_block(block_df = df, page = 1-2), "Invalid page parameter.")
expect_error(split_block(block_df = df, page = "one"), "Invalid page parameter.")
expect_error(split_block(block_df = df, page = c(1,2)), "Invalid page parameter.")
expect_error(split_block(block_df = df, page = 10), "No such page number in this dataframe.")
expect_error(split_block(block_df = df, block = 1-2), "Invalid block parameter.")
expect_error(split_block(block_df = df, block = "one"), "Invalid block parameter.")
expect_error(split_block(block_df = df, block = c(1,2)), "Invalid block parameter.")
expect_error(split_block(block_df = df, block = 50), "No such block number on this page.")
expect_error(split_block(block_df = df, block = 1, cut_point = "middle"), "Invalid cut point parameter.")
expect_error(split_block(block_df = df, block = 1, cut_point = 150), "Cut point out of range.")
expect_error(split_block(block_df = df, block = 1, cut_point = 50, direction = 1), "Invalid direction parameter.")
expect_error(split_block(block_df = df, block = 1, cut_point = 50, direction = "horizontal"), 'Split direction must be either "h" or "v".')
})
test_that("split_block() returns a revised block dataframe", {
json <- testthat::test_path("examples", "output.json")
df <- build_block_df(type = "async", object = json)
# choose random block and cut point:
block <- sample(min(df$block):max(df$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point)
expect_equal(max(new_df$block), max(df$block) + 1)
# check that the selected block is now different while others are similar:
old_block_n <- df[df$block == block,]
new_block_n <- new_df[new_df$block == block,]
#expect_false(isTRUE(all.equal(old_block_n, new_block_n))) # strangely fails on CI
rest_old_df <- df[!df$block == block,]
new_df_ex_last <- new_df[!new_df$block == max(new_df$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block in middle of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 2,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 2)
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 2,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block at end of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 3,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 3)
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 3,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process, but for horizontal split:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
# choose random block and cut point:
block <- sample(min(df$block):max(df$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, direction = "h")
expect_equal(max(new_df$block), max(df$block) + 1)
# check that the selected block is now different while others are similar:
old_block_n <- df[df$block == block,]
new_block_n <- new_df[new_df$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df[!df$block == block,]
new_df_ex_last <- new_df[!new_df$block == max(new_df$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block in middle of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 2,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 2, direction = "h")
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 2,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
### same process but for block at end of multipage doc:
json <- testthat::test_path("examples", "sample3pg.json")
df <- build_block_df(type = "async", object = json)
df_sub <- df[df$page == 3,]
# choose random block and cut point:
block <- sample(min(df_sub$block):max(df_sub$block), 1)
cut_point <- sample(1:99, 1)
# check that output df has one block more:
new_df <- split_block(df, block = block, cut_point = cut_point, page = 3, direction = "h")
expect_equal(nrow(new_df), nrow(df) + 1)
# check that the selected block is now different while others are similar:
new_df_sub <- new_df[new_df$page == 3,]
old_block_n <- df_sub[df_sub$block == block,]
new_block_n <- new_df_sub[new_df_sub$block == block,]
expect_false(isTRUE(all.equal(old_block_n, new_block_n)))
rest_old_df <- df_sub[!df_sub$block == block,]
new_df_ex_last <- new_df_sub[!new_df_sub$block == max(new_df_sub$block),]
rest_new_df <- new_df_ex_last[!new_df_ex_last$block == block,]
rownames(rest_new_df) <- NULL
rownames(rest_old_df) <- NULL
expect_true(isTRUE(all.equal(rest_old_df, rest_new_df)))
})
## REASSIGN_TOKENS -------------------------------------------------------------
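# reassign_tokens() reassigns tokens to blocks based on a revised block dataframe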
test_that("reassign_tokens() warns of input errors", {
expect_error(reassign_tokens(token_df = null), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = na), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = boolean), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = number_random), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = string_random), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = vector_strings), "token_df not a data frame.")
expect_error(reassign_tokens(token_df = list_strings), "token_df not a data frame.")
#expect_error(reassign_tokens(token_df = df), "Token dataframe not recognized. Was it made with build_token_df?")
expect_error(reassign_tokens(token_df = matrix), "token_df not a data frame.")
json <- testthat::test_path("examples", "output.json")
df <- build_token_df(type = "async", object = json)
expect_error(reassign_tokens(df, block_df = null), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = na), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = boolean), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = number_random), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = string_random), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = vector_strings), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = list_strings), "block_df not a data frame.")
expect_error(reassign_tokens(df, block_df = df), "Block dataframe not recognized. Was it made with build_block_df?")
expect_error(reassign_tokens(df, block_df = matrix), "block_df not a data frame.")
})
test_that("reassign_tokens() returns a revised token dataframe", {
# first get sample token df:
json <- testthat::test_path("examples", "output.json")
tdf_old <- build_token_df(type = "async", object = json)
# then use split blocks to make a revised block dataframe:
# start by making regular block df:
json <- testthat::test_path("examples", "output.json")
bdf_old <- build_block_df(type = "async", object = json)
# then split a random block (with a minimum of 10 words) to make a new block_df:
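# count the tokens in each block so the draw can be limited to blocks with at least 10 tokens: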
blocks <- list()
for (i in 1:max(tdf_old$block, na.rm=TRUE)) {
block <- list(tdf_old[tdf_old$block == i,])
blocks <- append(blocks, block)
}
sizes <- data.frame(block = numeric(), words = numeric())
for (i in 1:max(tdf_old$block, na.rm=TRUE)) {
ntokens <- nrow(blocks[[i]])
entry <- data.frame(block = i, words = ntokens)
sizes <- rbind(sizes, entry)
}
b10plus <- sizes$block[sizes$words >= 10]
to_split <- sample(b10plus, 1)
bdf_new <- split_block(bdf_old, block = to_split, cut_point = sample(1:99, 1))
# then reassign tokens:
tdf_new <- reassign_tokens(tdf_old, bdf_new)
# now check:
expect_true(is.data.frame(tdf_new))
#expect_equal(nrow(tdf_new), nrow(tdf_old)) # not necessarily equal, because a word
# split down the middle (as a result of the random split) can be counted twice
expect_true(identical(colnames(tdf_new), colnames(tdf_old)))
expect_false(isTRUE(all.equal(tdf_new, tdf_old)))
expect_true(is.unsorted(tdf_new$start_ind, na.rm = TRUE))
expect_false(is.unsorted(tdf_old$start_ind, na.rm = TRUE))
})
## REASSIGN_TOKENS2 ------------------------------------------------------------
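# reassign_tokens2() reassigns tokens on a given page based on a single block
# (e.g. one produced by from_labelme())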
test_that("reassign_tokens2() warns of input errors", {
expect_error(reassign_tokens2(token_df = null), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = na), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = boolean), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = number_random), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = string_random), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = vector_strings), "token_df not a data frame.")
expect_error(reassign_tokens2(token_df = list_strings), "token_df not a data frame.")
#expect_error(reassign_tokens2(token_df = df), "Token dataframe not recognized. Was it made with build_token_df?")
expect_error(reassign_tokens2(token_df = matrix), "token_df not a data frame.")
json <- testthat::test_path("examples", "output.json")
tdf <- build_token_df(type = "async", object = json)
expect_error(reassign_tokens2(tdf, block = null), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = na), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = boolean), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = number_random), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = string_random), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = vector_strings), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = list_strings), "block input not a data frame.")
expect_error(reassign_tokens2(tdf, block = df), "Block dataframe format not recognized.")
expect_error(reassign_tokens2(tdf, block = matrix), "block input not a data frame.")
bdf <- build_block_df(type = "async", object = json)
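# a single block row to use as the block argument: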
bdf <- bdf[1,]
expect_error(reassign_tokens2(tdf, bdf, page = 1-2), "Invalid page parameter.")
expect_error(reassign_tokens2(tdf, bdf, page = "one"), "Invalid page parameter.")
expect_error(reassign_tokens2(tdf, bdf, page = c(1,2)), "Invalid page parameter.")
expect_error(reassign_tokens2(tdf, bdf, page = 10), "No such page number in this dataframe.")
})
test_that("reassign_tokens2() returns a revised token dataframe", {
# make token df:
json <- testthat::test_path("examples", "peshtigo.json")
tdf_old <- build_token_df(type = "async", object = json)
# make block df with from_labelme():
json <- testthat::test_path("examples", "peshtigo_labelme.json")
block <- from_labelme(json)
# then reassign:
tdf_new <- reassign_tokens2(tdf_old, block)
# now check:
expect_true(is.data.frame(tdf_new))
expect_equal(max(tdf_new$block, na.rm = TRUE), max(tdf_old$block, na.rm = TRUE) + 1)
# expect_equal(nrow(tdf_new), nrow(tdf_old)) # not necessarily equal, because a word
# split down the middle (as a result of the random split) can be counted twice
expect_true(identical(colnames(tdf_new), colnames(tdf_old)))
expect_false(isTRUE(all.equal(tdf_new, tdf_old)))
expect_false(is.unsorted(tdf_old$start_ind, na.rm = TRUE))
# expect_true(is.unsorted(tdf_new$start_ind, na.rm = TRUE))
# not necessarily true if the block split is the last one
})
## FROM_LABELME ------------------------------------------------------------
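# from_labelme() converts a labelme annotation JSON into a block dataframe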
test_that("from_labelme() warns of input errors", {
expect_error(from_labelme(null), "Input file not .json.")
expect_error(from_labelme(na), "Input file not .json.")
expect_error(from_labelme(boolean), "Input file not .json.")
expect_error(from_labelme(number_random), "Input file not .json.")
expect_error(from_labelme(string_random), "Input file not .json.")
expect_error(from_labelme(vector_strings), "Input file not .json.")
expect_error(from_labelme(list_strings), "Input file not .json.")
expect_error(from_labelme(df), "Input file not .json.")
expect_error(from_labelme(matrix), "Input file not .json.")
expect_error(from_labelme("wrongfile.csv"), "Input file not .json.")
expect_error(from_labelme("madeup.json"), "Input file not .json.")
json <- testthat::test_path("examples", "peshtigo_labelme.json")
expect_error(from_labelme(json, page = 1-2), "Invalid page parameter.")
expect_error(from_labelme(json, page = "one"), "Invalid page parameter.")
expect_error(from_labelme(json, page = c(1, 2)), "Invalid page parameter.")
})
test_that("from_labelme() produces a properly formatted df", {
json <- testthat::test_path("examples", "peshtigo_labelme.json")
df <- from_labelme(json)
expect_true(is.data.frame(df))
#expect_true(identical(colnames(df), c("page", "block", "left", "right", "top", "bottom")))
expect_true(is.numeric(df$block))
expect_true(is.numeric(df$left))
expect_lt(max(df$right, na.rm = TRUE), 1)
expect_gt(min(df$top, na.rm = TRUE), 0)
})
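# clean up the temporary JSON file created at the top of this file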
unlink(madeup_json_file, force = TRUE)