In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

# Default chunk option: show the source code of each chunk in the output.
knitr::opts_chunk$set(echo = TRUE)

Import

Libraries

  # Install pacman on first use, then attach it explicitly. install.packages()
  # does not attach the package, so calling p_load() right after a fresh
  # install would otherwise fail with "could not find function".
  if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
  }
  library(pacman)

  # Load (installing if needed) the packages used unqualified in this
  # walkthrough: dplyr supplies mutate() and the %>% pipe.
  p_load(
    dplyr
  )

Data

# Folder holding the sample documents shipped with the package.
folder_path <- "./../../../inst/extdata/sample_documents/"

# list.files() treats `pattern` as a regular expression, so the dot is escaped
# and the match is anchored to the end of the name; a bare ".pdf" would also
# match names such as "notes_pdf_old.txt".
pdf_paths <- list.files(
  path = folder_path,
  pattern = "\\.pdf$",
  recursive = FALSE,
  full.names = TRUE
)
print(pdf_paths)

Preceding Steps

Process Text

# Position (within pdf_paths) of the sample document used in this walkthrough.
n <- 19

# Indexing past the end of a character vector returns NA rather than failing,
# so check the index here instead of handing NA to process_text().
stopifnot(n >= 1, n <= length(pdf_paths))

input_path <- pdf_paths[n]
input_path

# Convert the selected PDF into processed text for the extraction steps.
text_processed <- process_text(input_path)

# inspect(text_processed.v, m = 180)

Extract Hypothesis

# Extract hypotheses from the processed text and display the result.
# NOTE(review): apply_model = FALSE presumably skips the classification model
# during extraction — confirm against hypothesis_extraction().
hypothesis.df <- hypothesis_extraction(text_processed, apply_model = FALSE)

hypothesis.df

Hypothesis Select

# Keep only the hypothesis text column of the extraction output.
hypothesis <- dplyr::select(hypothesis.df, hypothesis)

hypothesis

Drop Hypothesis Tag

# Delete every "hypo <id>:" label, together with the whitespace that follows
# it, from the hypothesis text.
hypothesis <- dplyr::mutate(
  hypothesis,
  hypothesis = gsub("hypo (.*?):\\s*", "", hypothesis)
)

hypothesis

Generate Entities

# Run entity extraction on the extracted hypotheses and display the result.
# NOTE(review): later steps read `cause` and `effect` columns, presumably
# supplied by this output — confirm against entity_extraction().
entities <- entity_extraction(hypothesis.df)
entities

Generate Model Input

Walkthrough

Compile Table

  # Place the hypothesis text next to its extracted entities, number the rows,
  # and move the row id to the first column.
  model_input.df <- dplyr::select(
    dplyr::mutate(
      dplyr::bind_cols(hypothesis, entities),
      row_id = dplyr::row_number()
    ),
    row_id,
    dplyr::everything()
  )

model_input.df

Strip Punctuation

# Character class removed from all three text columns.
pattern_punct <- "[[:punct:]]"

# Apply the same removal to hypothesis, cause and effect in one pass.
model_input.df <- dplyr::mutate(
  model_input.df,
  dplyr::across(
    dplyr::all_of(c("hypothesis", "cause", "effect")),
    ~ stringr::str_remove_all(string = .x, pattern = pattern_punct)
  )
)

model_input.df

Replace Missing with Tag

# Placeholder written wherever an entity string is empty.
missing_tag <- "<missing>"

# Swap empty cause / effect strings for the placeholder; other values are
# left untouched.
model_input.df <- dplyr::mutate(
  model_input.df,
  dplyr::across(
    dplyr::all_of(c("cause", "effect")),
    ~ dplyr::if_else(.x == "", missing_tag, .x)
  )
)

model_input.df

Replace Entity with Node Tags

# Walkthrough switch: TRUE replaces the extracted cause / effect text inside
# each hypothesis with the placeholders "node1" / "node2"; FALSE copies the
# hypothesis text through unchanged.
# NOTE(review): this flag has the same name as the entity_extraction()
# function used earlier in the walkthrough. R skips non-function objects when
# resolving a call, so those calls still work, but the shadowing is easy to
# misread.
entity_extraction <- FALSE

if (entity_extraction) {
  print("if")
  model_input.df_001 <- model_input.df %>%
    dplyr::mutate(
      # First occurrence of the cause text -> "node1". The entity is wrapped
      # in fixed() so it is matched literally: stringr's "[[:punct:]]" does not
      # strip symbols such as $ + ^ |, which would otherwise be interpreted as
      # regex syntax here.
      causal_statement = dplyr::if_else(
        condition = cause != missing_tag,
        true = stringr::str_replace(
          string      = hypothesis,
          pattern     = stringr::fixed(cause),
          replacement = "node1"
        ),
        false = hypothesis
      )
    ) %>%
    dplyr::mutate(
      # First occurrence of the effect text -> "node2", applied to the
      # statement produced by the previous step.
      causal_statement = dplyr::if_else(
        condition = effect != missing_tag,
        true = stringr::str_replace(
          string      = causal_statement,
          pattern     = stringr::fixed(effect),
          replacement = "node2"
        ),
        false = causal_statement
      )
    )
} else {
  print("else")
  model_input.df_001 <- model_input.df %>%
    dplyr::mutate(
      causal_statement = hypothesis
    )
}

model_input.df_001

Remove Stopwords

  # Split each causal statement into one word per row, discard every word that
  # appears in tidytext's default stopword list, and keep row_id and word only.
  model_input.df_002 <- dplyr::select(
    dplyr::anti_join(
      tidytext::unnest_tokens(model_input.df_001, word, causal_statement),
      tidytext::get_stopwords(),
      by = "word"
    ),
    row_id,
    word
  )

Function

Causality

# Packaged version of the walkthrough above, called with entity replacement
# switched on and token_method = "stem".
# NOTE(review): the section heading implies this is the input for the
# causality model — confirm against the function's documentation.
gen_causality_direction_model_input(
  hypothesis.df, 
  entity_extraction = TRUE,
  token_method = "stem"
  )

Direction

# Same function with entity replacement switched off and token_method = "stem".
# NOTE(review): the section heading implies this is the input for the
# direction model — confirm against the function's documentation.
gen_causality_direction_model_input(
  hypothesis.df, 
  entity_extraction = FALSE,
  token_method = "stem"
  )

Warning

# Call with token_method = "error", a value not used elsewhere in this file.
# NOTE(review): presumably an unsupported value meant to demonstrate the
# function's warning path, per the section heading — confirm.
gen_causality_direction_model_input(
  hypothesis.df, 
  token_method = "error"
  )

Causality / Direction

Generate Predictions

# Causality classification
causality_class <- causality_classification(hypothesis.df)
causality_class <- data.frame(causality_class)
causality_class

# Direction class
direction_class <- direction_classification(hypothesis.df)
direction_class <- data.frame(direction_class)
direction_class

Compile Table

# File name (without directories) of the document being processed.
file_name <- basename(input_path)

# Join hypotheses and entities column-wise and discard rows containing NA.
iter.df <- tidyr::drop_na(cbind(hypothesis.df, entities))

# Append both sets of predictions, then tag every row with the source file.
# NOTE(review): if drop_na() removed rows, iter.df may no longer have the same
# number of rows as the prediction tables bound here — verify.
iter.df <- cbind(iter.df, causality_class, direction_class)
iter.df$file_name <- file_name

iter.df

Modify Headers and Format

# Rename the id and prediction columns, keep the output columns in their final
# order, and convert any factor column to character.
iter.df <- purrr::modify_if(
  dplyr::select(
    dplyr::rename(
      iter.df,
      hypothesis_num = h_id,
      causal_relationship = causality_pred,
      direction = direction_pred
    ),
    file_name, hypothesis_num, hypothesis, cause,
    effect, direction, causal_relationship
  ),
  is.factor,
  as.character
)

iter.df

Remove trailing comma

# Strip a comma at the very end of each effect string. The comma must precede
# the end-of-string anchor (",$"); with the order reversed ("$,") the pattern
# can never match. The result is assigned back so the cleaned values are kept
# for the following steps.
iter.df <- iter.df %>%
  dplyr::mutate(
    effect = stringr::str_remove_all(
      string  = effect,
      pattern = stringr::regex(",$")
    )
  )

iter.df

Clear causal relationship unless both nodes are present

  # Blank out the predicted causal relationship whenever the cause or the
  # effect is empty. The result is stored back into iter.df; without the
  # assignment the step is only printed and later steps never see it.
  iter.df <- iter.df %>%
    dplyr::mutate(
      causal_relationship = dplyr::if_else(
        condition = ((cause == "") | (effect == "")),
        true      = "",
        false     = as.character(causal_relationship)
      )
    )

  iter.df

Drop Hypothesis Tag

# Delete every "hypo <id>:" label, together with the whitespace that follows
# it, from the hypothesis column of the output table.
iter.df <- dplyr::mutate(
  iter.df,
  hypothesis = gsub("hypo (.*?):\\s*", "", hypothesis)
)

iter.df

Final Function

# Run the package's top-level function on the same sample document.
# NOTE(review): presumably performs all of the steps walked through above in
# one call — confirm against the function's documentation.
CausalityExtraction(file_path = input_path)

canfielder/CausalityExtraction documentation built on Jan. 5, 2022, 10:55 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

canfielder/CausalityExtraction
Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

Import

Libraries

Data

Preceding Steps

Process Text

Extract Hypothesis

Hypothesis Select

Drop Hypothesis Tag

Generate Entities

Generate Model Input

Walkthrough

Compile Table

Strip Punctuation

Replace Missing with Tag

Replace Entity with Node Tags

Remove Stopwords

Function

Causality

Direction

Warning

Causality / Direction

Generate Predictions

Compile Table

Modify Headers and Format

Remove trailing comma

Remove if both nodes not present

Drop Hypothesis Tag

Final Function

R Package Documentation

Browse R Packages

We want your feedback!

canfielder/CausalityExtraction Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

Import

Libraries

Data

Preceding Steps

Process Text

Extract Hypothesis

Hypothesis Select

Drop Hypothesis Tag

Generate Entities

Generate Model Input

Walkthrough

Compile Table

Strip Punctuation

Replace Missing with Tag

Replace Entity with Node Tags

Remove Stopwords

Function

Causality

Direction

Warning

Causality / Direction

Generate Predictions

Compile Table

Modify Headers and Format

Remove trailing comma

Remove if both nodes not present

Drop Hypothesis Tag

Final Function

R Package Documentation

Browse R Packages

We want your feedback!

canfielder/CausalityExtraction
Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences