# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it, and p_load() is called through its namespace:
# install.packages() does not attach the package it installs, so a bare
# p_load() would be undefined right after a first-time install.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(dplyr)
#' Inspect Text String
#'
#' Return a window of consecutive elements of `x`.
#'
#' @param x A vector to inspect.
#' @param m Position of the first element to return.
#' @param span Number of elements to return.
#' @return The elements `x[m]` through `x[m + span - 1]`; an empty vector when
#'   `span` is 0.
inspect <- function(x, m = 100, span = 20) {
  # m + seq_len(span) - 1 equals m:(m + span - 1) for span >= 1, but yields an
  # empty index for span = 0 instead of the descending pair c(m, m - 1).
  x[m + seq_len(span) - 1]
}
# Regular expression for a hypothesis marker: the literal text "<split>hypo ",
# a lazy capture group `(.*?)`, then a colon.
# NOTE(review): not referenced elsewhere in this section; confirm where it is
# consumed.
regex_hypo_marker <- "<split>hypo (.*?):"
# Directory holding the sample PDF documents, relative to the working
# directory.
folder_path <- "./../../../inst/extdata/sample_documents/"

# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name; a bare ".pdf" would also match names
# such as "xpdf_notes.txt".
pdf_paths <- list.files(
  path = folder_path,
  pattern = "\\.pdf$",
  recursive = FALSE,
  full.names = TRUE
)
print(pdf_paths)
# Position of the first sample PDF to read, and how many additional files
# after it to include (0 selects just that one file).
m <- 15
n <- 0

# Fail early with a clear message rather than passing NA paths downstream when
# fewer files were found than the requested window needs.
if (m + n > length(pdf_paths)) {
  stop(
    "Requested PDF index ", m + n, " but only ", length(pdf_paths),
    " file(s) were found.",
    call. = FALSE
  )
}

input_path <- pdf_paths[m:(m + n)]
input_path

# Convert the selected PDF(s) to text, keep the first element of the result,
# and pass it through process_text().
text_raw <- pdf_to_text(input_path)
text_raw_sample <- text_raw[1]
text_processed <- process_text(text_raw_sample)
text_processed

# inspect(text_processed, m = 180)
# Run hypothesis_extraction() on the processed text, passing
# apply_model = FALSE, and display the resulting table.
hypothesis.df <- hypothesis_extraction(
  text_processed,
  apply_model = FALSE
)
hypothesis.df
The overall process works on a vector of hypothesis statements, but executes on an individual level, which is then vectorized. To see the individual steps, we will select a single hypothesis and analyze it.
# Row of hypothesis.df to walk through step by step.
row_num <- 1

# Take that row and extract its `hypothesis` column as a plain vector.
hypothesis <- hypothesis.df %>%
  slice(row_num) %>%
  pull(hypothesis)

hypothesis
# Run gen_entity_class() on the selected hypothesis and show its output.
pred_classes <- gen_entity_class(hypothesis)
pred_classes

# Derive the entity index structure from that output with
# gen_entity_class_index().
index_entities <- gen_entity_class_index(pred_classes)
index_entities
# TRUE only when the first two elements of `index_entities` are both non-empty.
# `&&` is the scalar, short-circuiting operator meant for a single TRUE/FALSE
# decision; `&` is the elementwise form used for vector masks.
both_entity_present <-
  !purrr::is_empty(index_entities[[1]]) &&
  !purrr::is_empty(index_entities[[2]])
both_entity_present
# Apply trim_overlapping_entities() only when both_entity_present is TRUE;
# otherwise index_entities is shown unchanged.
if (both_entity_present) {
  index_entities <- trim_overlapping_entities(index_entities)
}
index_entities
# Pass the indexes through trim_outlier_indexes() and show the result.
index_entities <- trim_outlier_indexes(
  index_entities
)
index_entities
# Convert the final indexes back into text taken from the hypothesis via
# index_to_entity(), then show the result.
entity_text_output <- index_to_entity(
  hypothesis,
  index_entities
)
entity_text_output
# Run entity_extraction() on the whole hypothesis table in a single call; the
# result is auto-printed.
entity_extraction(hypothesis.df)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.