# Set the knitr chunk default so each chunk's source code is echoed in the
# rendered document.
knitr::opts_chunk$set(echo = TRUE)

Import

Libraries

  # Install pacman only when it is missing, then attach it explicitly.
  # `require()` returns FALSE instead of attaching, so after a fresh install a
  # bare `p_load()` would not be found; `library()` attaches or fails loudly.
  if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
  }
  library(pacman)

  # p_load() installs (if needed) and attaches every package listed.
  p_load(
    dplyr
  )

Data

# Directory holding the sample documents, relative to this vignette.
folder_path <- "./../../../inst/extdata/sample_documents/"

# `pattern` is a regular expression, not a glob: escape the dot and anchor at
# the end of the name so only files whose name ends in ".pdf" are returned
# (an unescaped ".pdf" also matches e.g. "notapdf.txt" or "a.pdf.bak").
pdf_pattern <- "\\.pdf$"

pdf_paths <- list.files(
  path       = folder_path,
  pattern    = pdf_pattern,
  full.names = TRUE,
  recursive  = FALSE
)
print(pdf_paths)

Preceding Steps

Process Text

# Index of the sample document used throughout this walkthrough.
n <- 19

# Indexing past the end of `pdf_paths` silently yields NA; stop here with a
# clear message rather than passing an NA path to the later steps.
if (n > length(pdf_paths)) {
  stop(
    "Sample document ", n, " requested, but only ", length(pdf_paths),
    " PDF file(s) were found in ", folder_path,
    call. = FALSE
  )
}

input_path <- pdf_paths[n]
input_path

# Run process_text() (defined outside this document) on the selected PDF path
# and keep its result for the hypothesis-extraction step.
text_processed <- process_text(input_path)

# NOTE(review): the disabled call below refers to `text_processed.v`, which is
# not defined anywhere in this walkthrough -- presumably `text_processed` was
# meant; confirm before re-enabling.
# inspect(text_processed.v, m = 180)

Extract Hypothesis

# Extract hypotheses from the processed text with hypothesis_extraction()
# (defined outside this document). Later steps read the `hypothesis` and `h_id`
# columns from the result.
# NOTE(review): `apply_model = FALSE` presumably skips a classification model --
# confirm against the function's documentation.
hypothesis.df <- hypothesis_extraction(text_processed, apply_model = FALSE)

hypothesis.df

Hypothesis Select

# Reduce the extraction output to its `hypothesis` column only.
hypothesis <- dplyr::select(hypothesis.df, hypothesis)

hypothesis

Drop Hypothesis Tag

# Delete every "hypo <label>:" tag, together with any whitespace that follows
# it, from the hypothesis text.
hypothesis <- dplyr::mutate(
  hypothesis,
  hypothesis = gsub("hypo (.*?):\\s*", "", hypothesis)
)

hypothesis

Generate Entities

# Run entity_extraction() (defined outside this document) on the extracted
# hypotheses. The later steps read `cause` and `effect` columns after this
# result is bound to the hypothesis text.
entities <- entity_extraction(hypothesis.df)
entities

Generate Model Input

Walkthrough

Compile Table

  # Put the hypothesis text and the extracted entities side by side, number the
  # rows, and move that row number to the first column.
  model_input.df <- dplyr::bind_cols(hypothesis, entities) %>%
    dplyr::mutate(row_id = dplyr::row_number()) %>%
    dplyr::select(row_id, dplyr::everything())

model_input.df

Strip Punctuation

# POSIX character class covering every punctuation character.
pattern_punct <- "[[:punct:]]"

# Remove all punctuation from the three text columns with one shared function
# instead of repeating the same call per column.
model_input.df <- model_input.df %>%
  dplyr::mutate(
    dplyr::across(
      .cols = c(hypothesis, cause, effect),
      .fns  = function(text) {
        stringr::str_remove_all(string = text, pattern = pattern_punct)
      }
    )
  )

model_input.df

Replace Missing with Tag

# Placeholder written wherever no entity text is left.
missing_tag <- "<missing>"

# Swap empty cause/effect strings for the placeholder; every other value is
# kept unchanged.
model_input.df <- model_input.df %>%
  dplyr::mutate(
    dplyr::across(
      .cols = c(cause, effect),
      .fns  = function(entity) {
        dplyr::if_else(
          condition = entity == "",
          true      = missing_tag,
          false     = entity
        )
      }
    )
  )

model_input.df

Replace Entity with Node Tags

# Walkthrough switch: TRUE swaps the extracted entities for node tags, FALSE
# copies the hypothesis text through unchanged. The flag deliberately does not
# reuse the name `entity_extraction`, so the entity_extraction() function
# called earlier is not masked by a logical in the global environment.
use_entity_extraction <- FALSE

if (use_entity_extraction) {
  print("if")
  model_input.df_001 <- model_input.df %>%
    dplyr::mutate(
      # Replace the first match of the cause text with "node1". Rows whose
      # cause is the missing tag keep the hypothesis text as it is.
      causal_statement = dplyr::if_else(
        condition = cause != missing_tag,
        true      = stringr::str_replace(
          string      = hypothesis,
          pattern     = cause,
          replacement = "node1"
        ),
        false     = hypothesis
      ),
      # Same for the effect text and "node2", applied to the statement produced
      # just above (mutate evaluates its expressions in order).
      causal_statement = dplyr::if_else(
        condition = effect != missing_tag,
        true      = stringr::str_replace(
          string      = causal_statement,
          pattern     = effect,
          replacement = "node2"
        ),
        false     = causal_statement
      )
    )
} else {
  print("else")
  # No entity substitution: the causal statement is the hypothesis text.
  model_input.df_001 <- model_input.df %>%
    dplyr::mutate(
      causal_statement = hypothesis
    )
}

model_input.df_001

Remove Stopwords

  # Split each causal statement into one token per row (column `word`), discard
  # every token present in tidytext's default stopword list, and keep only the
  # row identifier alongside the surviving tokens.
  model_input.df_002 <- tidytext::unnest_tokens(
    tbl    = model_input.df_001,
    output = word,
    input  = causal_statement
  ) %>%
    dplyr::anti_join(y = tidytext::get_stopwords(), by = "word") %>%
    dplyr::select(row_id, word)

Function

Causality

# Call the packaged model-input generator with entity extraction switched on
# and the "stem" token method.
# NOTE(review): presumably this is the configuration used for the causality
# model (per the section heading) -- confirm against the function's docs.
gen_causality_direction_model_input(
  hypothesis.df, 
  entity_extraction = TRUE,
  token_method = "stem"
  )

Direction

# Call the packaged model-input generator with entity extraction switched off
# and the "stem" token method.
# NOTE(review): presumably this is the configuration used for the direction
# model (per the section heading) -- confirm against the function's docs.
gen_causality_direction_model_input(
  hypothesis.df, 
  entity_extraction = FALSE,
  token_method = "stem"
  )

Warning

# Call the generator with token_method = "error", leaving entity_extraction at
# its default.
# NOTE(review): "error" is presumably not a supported token method and is
# passed to demonstrate the resulting warning (per the section heading) --
# confirm against the function's docs.
gen_causality_direction_model_input(
  hypothesis.df, 
  token_method = "error"
  )

Causality / Direction

Generate Predictions

# Causality classification
# The prediction is wrapped in data.frame() as a separate, named step; for a
# plain vector the resulting column name is taken from the variable name, so
# the two statements should not be merged. A later step renames a
# `causality_pred` column, so the classifier output presumably carries that
# name already -- TODO confirm.
causality_class <- causality_classification(hypothesis.df)
causality_class <- data.frame(causality_class)
causality_class
# Direction class
# Same two-step pattern for the direction prediction (a later step renames a
# `direction_pred` column).
direction_class <- direction_classification(hypothesis.df)
direction_class <- data.frame(direction_class)
direction_class

Compile Table

# File name (without directories) of the document being processed.
file_name <- basename(input_path)

# Compile table: hypotheses plus entities with incomplete rows dropped, then
# the two prediction tables appended column-wise, then the source file name.
# NOTE(review): if drop_na() removes rows, the second cbind() no longer has
# matching row counts -- confirm the predictions line up with the kept rows.
iter.df <- tidyr::drop_na(cbind(hypothesis.df, entities))
iter.df <- cbind(iter.df, causality_class, direction_class)
iter.df$file_name <- file_name

iter.df

Modify Headers and Format

# Give the output columns their final names, keep them in report order, and
# turn any factor column into plain character.
iter.df <- iter.df %>%
  dplyr::rename(
    direction           = direction_pred,
    causal_relationship = causality_pred,
    hypothesis_num      = h_id
  ) %>%
  dplyr::select(
    file_name,
    hypothesis_num,
    hypothesis,
    cause,
    effect,
    direction,
    causal_relationship
  ) %>%
  purrr::modify_if(is.factor, as.character)

iter.df

Remove trailing comma

# Strip a trailing comma from each effect string. The comma has to come before
# the end-of-string anchor (",$"); the reverse order ("$,") asks for a comma
# after the end of the string and therefore never matches. The result is
# stored back so the cleaned values are what gets printed and used below.
iter.df <- iter.df %>%
  dplyr::mutate(
    effect = stringr::str_remove_all(string = effect, pattern = ",$")
  )

iter.df

Remove causal relationship unless both nodes are present

  # Show the table with the causal relationship kept only where both the cause
  # and the effect are non-empty; otherwise it is blanked. The result is
  # printed only -- `iter.df` itself is not modified here.
  dplyr::mutate(
    iter.df,
    causal_relationship = dplyr::if_else(
      condition = (cause != "") & (effect != ""),
      true      = as.character(causal_relationship),
      false     = ""
    )
  )

Drop Hypothesis Tag

# Delete every "hypo <label>:" tag, together with any whitespace that follows
# it, from the hypothesis column of the output table.
iter.df <- dplyr::mutate(
  iter.df,
  hypothesis = gsub("hypo (.*?):\\s*", "", hypothesis)
)

iter.df

Final Function

# Run the packaged CausalityExtraction() function on the same input file used
# in the walkthrough above.
# NOTE(review): presumably it wraps the steps shown step by step in this
# document -- confirm against the function's documentation.
CausalityExtraction(file_path = input_path)


canfielder/CausalityExtraction documentation built on Jan. 5, 2022, 10:55 a.m.