# Default chunk option: echo the source code of each chunk in the output.
knitr::opts_chunk$set(echo = TRUE)

Import

Libraries

  # Install pacman on first use, then let it load the packages used below.
  # `requireNamespace()` only checks availability without attaching anything;
  # calling `p_load()` through the `pacman::` prefix also works right after a
  # fresh install, when pacman has not been attached to the search path.
  if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
  }
  pacman::p_load(
    dplyr
  )

Assisting Functions

#' Inspect a Window of a Vector
#'
#' Returns `span` consecutive elements of `x`, starting at position `m`.
#'
#' @param x A vector to look into.
#' @param m Position of the first element to return.
#' @param span Number of elements to return; `0` yields an empty result.
#' @return The elements `x[m]` through `x[m + span - 1]`, obtained with
#'   ordinary `[` indexing (so positions past the end of an atomic vector
#'   come back as `NA`).
inspect <- function(x, m = 100, span = 20) {
  # `m:(m + span - 1)` counts backwards when `span` is 0 (e.g. 100:99) and
  # would return two elements; building the index from a length avoids that.
  window <- seq.int(from = m, length.out = span)
  x[window]
}

Regex

# Pattern for a hypothesis marker: the literal text "<split>hypo ", then a
# lazily captured group that runs up to the first following ":".
# NOTE(review): not referenced by any other code shown in this document.
regex_hypo_marker <- "<split>hypo (.*?):"

Data

# Folder searched for the sample PDFs, relative to the working directory.
folder_path <- "./../../../inst/extdata/sample_documents/"
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name; a bare ".pdf" would also accept any
# name that merely contains "pdf" after some character (e.g. "notes_pdf.txt").
pdf_paths <- list.files(
  path = folder_path,
  pattern = "\\.pdf$",
  recursive = FALSE,
  full.names = TRUE
)
print(pdf_paths)

Preceding Steps

Process Text

# Choose which document(s) to run: start at position `m` of `pdf_paths` and
# take `n` further files after it.
m <- 15
n <- 0
input_path <- pdf_paths[seq(m, m + n)]
input_path

# Convert the selected PDF(s) to text, then process only the first result.
text_raw <- pdf_to_text(input_path)
text_raw_sample <- text_raw[1]
text_processed <- process_text(text_raw_sample)
text_processed
# NOTE(review): `text_processed.v` below is not defined in this document;
# presumably `text_processed` is meant - confirm before re-enabling.
# inspect(text_processed.v, m = 180)

Extract Hypothesis

# Run hypothesis extraction on the processed text with `apply_model = FALSE`,
# then print the result.
# NOTE(review): presumably `apply_model = FALSE` skips a model-based step -
# confirm against `hypothesis_extraction()`.
hypothesis.df <- hypothesis_extraction(text_processed, apply_model = FALSE)

hypothesis.df

Extract Entities

Select Hypothesis Input

The overall process works on a vector of hypothesis statements, but it executes on each statement individually and is then vectorized. To see the individual steps, we will select a single hypothesis and analyze it.

# Take the `hypothesis` column value from the chosen row of `hypothesis.df`.
row_num <- 1
hypothesis <- pull(slice(hypothesis.df, row_num), hypothesis)
hypothesis

Generate entity class predictions

# Generate entity class predictions for the selected hypothesis statement.
pred_classes <- gen_entity_class(hypothesis)
pred_classes

# Derive entity indexes from the class predictions.
# NOTE(review): elements [[1]] and [[2]] of this result are read further
# down, presumably one index set per entity - confirm.
index_entities <- gen_entity_class_index(pred_classes)
index_entities

Verify both entities detected

# TRUE only when both entity index sets are non-empty. `&&` is the scalar,
# short-circuiting operator meant for a single yes/no condition; the
# element-wise `&` is for building vector masks.
both_entity_present <-
  !purrr::is_empty(index_entities[[1]]) &&
  !purrr::is_empty(index_entities[[2]])
both_entity_present

Trim Overlapping Entities

# Overlap trimming is applied only when both entities were detected;
# otherwise the indexes are left as they are.
if (both_entity_present) {
  index_entities <- trim_overlapping_entities(index_entities)
}

index_entities

Remove Outliers

# Pass the indexes through outlier trimming and print the result.
index_entities <- trim_outlier_indexes(index_entities)
index_entities

Convert Indexes to Text

# Convert the entity indexes to text, using the selected hypothesis statement
# as the second input alongside the indexes.
entity_text_output <- index_to_entity(hypothesis, index_entities)
entity_text_output

Drop Trailing Stopwords


Final Function

# Call `entity_extraction()` on the whole `hypothesis.df` table rather than on
# a single selected statement.
entity_extraction(hypothesis.df)


canfielder/CausalityExtraction documentation built on Jan. 5, 2022, 10:55 a.m.