In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

# Set the knitr chunk default `echo = TRUE` for the chunks of this notebook.
knitr::opts_chunk$set(echo = TRUE)

Purpose

The purpose of this notebook is to test miscellaneous parts of the package.

Import

Libraries

  # Make sure pacman is installed, attach it, then let it install/attach the
  # packages this notebook needs.
  # requireNamespace() only checks availability; the explicit library() call
  # afterwards guarantees p_load() is on the search path even on the first run,
  # when pacman has just been installed and was therefore never attached.
  if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
  }
  library(pacman)
  p_load(
    dplyr
  )

Install Tika Jar

# Ask rtika for the Tika jar and install Tika when the lookup comes back NA.
jar <- rtika::tika_jar()
jar_missing <- is.na(jar)

# Show both the lookup result and the missing-flag in the notebook output.
jar
jar_missing

if (jar_missing) rtika::install_tika()

Break Out Nested Hypothesis

# Sample hypothesis containing lettered sub-clauses "(a)", "(b)", "(c)" plus a
# stray "(g)" marker.
h_sample <- "the strength of the relationship between (a) poor prior performance, (b) large firm size, and (c) young firm age on one hand and the appointment of outside directors to the board on the other hand will decrease over time added test (g) will this be detected"

# stringr::str_locate(string = h_sample, pattern = "poor")

# idx_start / idx_end hold the character positions delimiting each segment.
# The first segment starts at character 1; each marker "(x)" then closes the
# current segment at its opening position and opens the next one at its
# closing position.
idx_start <- 1
idx_end <- c()

# Walk the alphabet in order and stop at the first letter whose marker is
# absent, so only an unbroken run starting at "(a)" is picked up.
for (marker in letters) {
  marker_regex <- paste0("\\(", marker, "\\)")

  if (!stringr::str_detect(string = h_sample, pattern = marker_regex)) {
    break
  }

  print(marker_regex)
  marker_span <- stringr::str_locate(string = h_sample, pattern = marker_regex)
  print(marker_span)

  # marker_span[1] is the match start, marker_span[2] the match end.
  idx_start <- c(idx_start, marker_span[2])
  idx_end <- c(idx_end, marker_span[1])
}

# The last segment runs to the end of the string.
idx_end <- c(idx_end, nchar(h_sample))

Segment

# Cut h_sample into one piece per (idx_start, idx_end) pair. substring() is
# vectorised over its first/last arguments, so all pieces are taken at once.
h_segments <- substring(h_sample, first = idx_start, last = idx_end)

# Each cut includes the marker's parentheses at its edges: drop a trailing "("
# and a leading ")", then trim the surrounding whitespace.
h_segments <- gsub("\\($", "", h_segments)
h_segments <- gsub("^\\)", "", h_segments)
h_segments <- stringr::str_trim(string = h_segments)
h_segments

Remove Stopwords

# Tokenise every hypothesis segment, drop stopwords, and rebuild one
# stopword-free sentence per segment.
h_segments.df <- as.data.frame(h_segments)

# One row per token; row_id records which segment the token came from.
# strip_punct = FALSE is passed through to the tokenizer so punctuation
# (e.g. commas) is kept as tokens.
h_segments.df_001 <- h_segments.df %>%
  dplyr::mutate(row_id = dplyr::row_number()) %>%
  tidytext::unnest_tokens(word, h_segments, strip_punct = FALSE)

# Drop every token that appears in the tidytext stopword list.
h_segments.df_002 <- h_segments.df_001 %>%
  dplyr::anti_join(tidytext::get_stopwords(), by = "word")

# Token counts of the interior segments (everything except the first and the
# last segment). The rows are selected by row_id rather than by position so
# the selection stays correct when there are fewer than three segments (where
# `2:(n - 1)` would count backwards) or when a segment lost all of its tokens
# to stopword removal and therefore has no row here.
interior_counts <- h_segments.df_002 %>%
  dplyr::group_by(row_id) %>%
  dplyr::summarise(n_tokens = dplyr::n()) %>%
  dplyr::filter(row_id > 1, row_id < length(h_segments))

# Largest interior token count; empty when there is no interior segment.
nest_length <- if (nrow(interior_counts) > 0) {
  max(interior_counts$n_tokens)
} else {
  integer(0)
}

# Collapse the remaining tokens back into one sentence per row_id.
# summarise() yields exactly one row per surviving segment, so two segments
# with identical text are not merged into a single row.
h_segments.df_003 <- h_segments.df_002 %>%
  dplyr::group_by(row_id) %>%
  dplyr::summarise(sentence = stringr::str_c(word, collapse = " ")) %>%
  dplyr::select(sentence)

h_sw_remove <- h_segments.df_003 %>% dplyr::pull(sentence)
h_sw_remove

Remove trailing commas

# Remove a comma sitting at the very end of each stopword-free segment.
# gsub() is vectorised, so the whole vector is cleaned in one call. This also
# ties the operation to the length of h_sw_remove itself instead of the length
# of idx_start, which would pad h_sw_remove with NA if the two ever differed.
h_sw_remove <- gsub("\\,$", "", h_sw_remove)
h_sw_remove

Replace interior segments with stopword-removed segments

# Build h_segments_002: the untouched first segment followed by the
# stopword-free versions of the interior segments (the last segment is left
# out here).
n_segments <- length(h_segments)
n_segments

h_segments_002 <- character(n_segments - 1)
h_segments_002[1] <- h_segments[1]

# Show the vector with only the first slot filled.
h_segments_002

# Slots 2 .. (n_segments - 1) take the matching stopword-free segment.
i <- 1
repeat {
  if (i > n_segments - 2) {
    break
  }

  print(i)
  h_segments_002[i + 1] <- h_sw_remove[i + 1]
  i <- i + 1
}
h_segments_002

Subset final nested hypothesis

# Split the last stopword-free segment into two strings: its first
# `nest_length` tokens, and everything after them.

# Tokens of the row at position length(h_segments), punctuation kept.
final_tokens <- h_segments.df_003 %>%
  dplyr::slice(length(h_segments)) %>%
  tidytext::unnest_tokens(word, sentence, strip_punct = FALSE)

# part_id is 1 for the first nest_length tokens and 2 for the rest.
final_tokens <- dplyr::mutate(
  final_tokens,
  part_id = dplyr::if_else(dplyr::row_number() <= nest_length, 1, 2)
)

# Re-join the tokens of each part into one sentence.
final_parts <- final_tokens %>%
  dplyr::group_by(part_id) %>%
  dplyr::mutate(sentence = stringr::str_c(word, collapse = " ")) %>%
  dplyr::ungroup()

# One string per part, in order of first appearance.
h_final_split <- unique(final_parts$sentence)

h_final_split

Replace last segment with split segment

# Append the pieces of the split final segment to the first/interior segments.
# The split is stored in h_final_split; the previously referenced
# h_segment_split is not defined anywhere in this notebook.
h_segments_003 <- c(h_segments_002, h_final_split)
h_segments_003

# Show the original hypothesis for comparison.
h_sample

# Tokens (stopwords still included) of the segment with the highest row_id.
h_segments.df_001a <- dplyr::filter(h_segments.df_001, row_id == max(row_id))

n_tokens_final <- nrow(h_segments.df_001a)

# Join the tokens from position nest_length through the last one into a
# single string and print it. The constant row_id puts every remaining token
# into one group so they collapse into one sentence.
h_segments.df_001a %>%
  dplyr::slice(nest_length:n_tokens_final) %>%
  dplyr::mutate(row_id = 1) %>%
  dplyr::group_by(row_id) %>%
  dplyr::mutate(sentence = stringr::str_c(word, collapse = " ")) %>%
  dplyr::ungroup() %>%
  dplyr::distinct(sentence) %>%
  dplyr::pull(sentence)

Reconstruct Segments

Fix using the stop-word-removed final string

# Rebuild one hypothesis per middle element of h_segments_003 by joining the
# first element, that middle element, and the last element.
h_lead <- h_segments_003[1]
h_tail <- h_segments_003[length(h_segments_003)]
n_middle <- length(h_segments_003) - 2

i <- 1
while (i <= n_middle) {
  print(i)

  # paste() joins with single spaces; str_squish() then collapses any repeated
  # or edge whitespace left over from the earlier steps.
  h_temp <- stringr::str_squish(paste(h_lead, h_segments_003[i + 1], h_tail))
  print(h_temp)

  i <- i + 1
}

canfielder/CausalityExtraction documentation built on Jan. 5, 2022, 10:55 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

canfielder/CausalityExtraction
Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

Purpose

Import

Libraries

Install Tika Jar

Break Out Nested Hypothesis

Segment

Remove Stopwords

Remove trailing commas

Replace interior segments with stopword-removed segments

Subset final nested hypothesis

Replace last segment with split segment

Reconstruct Segments

R Package Documentation

Browse R Packages

We want your feedback!

canfielder/CausalityExtraction Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

Purpose

Import

Libraries

Install Tika Jar

Break Out Nested Hypothesis

Segment

Remove Stopwords

Remove trailing commas

Replace interior segments with stopword-removed segments

Subset final nested hypothesis

Replace last segment with split segment

Reconstruct Segments

R Package Documentation

Browse R Packages

We want your feedback!

canfielder/CausalityExtraction
Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences