# Set the knitr chunk default so that chunk source code is echoed in the
# rendered output.
knitr::opts_chunk$set(echo = TRUE)
The purpose of this notebook is to test how to implement the Entity Extraction model.
# Bootstrap pacman, then use it to load dplyr.
# The previous `require()` guard installed pacman when it was missing but never
# attached it afterwards, so the bare `p_load()` call failed on a fresh
# machine. Check for the namespace, install if needed, then attach explicitly.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

p_load(dplyr)
# Directory that holds the PDFs to process (relative path).
folder_path <- "./../../../../PDFS/internal/"

# Collect the PDF files directly inside `folder_path`.
# `pattern` is a regular expression: the dot is escaped and the match is
# anchored to the end of the file name so that only names ending in ".pdf"
# are returned (the unescaped ".pdf" matched any character followed by "pdf"
# anywhere in the name).
pdf_paths <- list.files(
  path = folder_path,
  pattern = "\\.pdf$",
  recursive = FALSE,
  full.names = TRUE
)

# Format pdf path visualization
# The folder prefix is removed as a literal string (not as a regex, where its
# dots would act as wildcards) and each path is shown next to its index.
data.frame(pdf_paths) %>%
  dplyr::mutate(
    pdf_paths = stringr::str_remove_all(
      string = pdf_paths,
      pattern = stringr::fixed(folder_path)
    ),
    index = dplyr::row_number()
  ) %>%
  dplyr::select(index, pdf_paths)
# Position in `pdf_paths` of the PDF to process.
index <- 16

# Fail early with a clear message when the index does not point at a listed
# file; otherwise `pdf_paths[index]` is NA and the failure surfaces later
# inside the text extraction.
if (index < 1 || index > length(pdf_paths)) {
  stop(
    "`index` (", index, ") is outside the range of available PDFs (",
    length(pdf_paths), ").",
    call. = FALSE
  )
}

input_path <- pdf_paths[index]
print(input_path)

# Tika
# `pdf_to_text()` is a project helper defined outside this notebook.
text_raw <- pdf_to_text(input_path)

# Output Check
print(paste("Text Vector Characters: ", nchar(text_raw)))
## Split text into character vector
# The raw text is split on newline characters and the resulting list is
# flattened into a single character vector.
text_processed <- unlist(
  stringr::str_split(string = text_raw, pattern = "\n")
)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 1,
  span = 20
)

# Print the full vector for inspection.
text_processed
## Fix conversion error in italics text
# `fix_italics_conversion_error()` is a project helper defined outside this
# notebook.
text_processed <- text_processed %>%
  fix_italics_conversion_error()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 955,
  span = 10
)
## Fix encoding errors
# `fix_encoding_errors()` is a project helper defined outside this notebook.
text_processed <- text_processed %>%
  fix_encoding_errors()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 955,
  span = 20
)
## Tricky terms - first pass, called with `normalize = TRUE`
# `tricky_terms()` is a project helper defined outside this notebook; it is
# called a second time later in the pipeline with `normalize = FALSE`.
text_processed <- tricky_terms(input_text = text_processed, normalize = TRUE)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 520,
  span = 20
)
## Normalize whitespace
# `str_squish()` removes leading and trailing whitespace and collapses
# internal runs of whitespace to a single space, so the separate `str_trim()`
# pass that used to precede it was redundant and has been dropped.
text_processed <- stringr::str_squish(string = text_processed)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 520,
  span = 20
)
## Tag empty elements
# Every element that is exactly the empty string is overwritten with the
# line-break tag so that it can be recognised further down the pipeline.
empty_tag <- "<line_break>"
logical_empty <- text_processed == ""
text_processed[logical_empty] <- empty_tag

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 520,
  span = 20
)
## Drop reference section
# `drop_reference_section()` is a project helper defined outside this notebook.
text_processed <- text_processed %>%
  drop_reference_section()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Drop lines with only numbers or symbols
# `remove_if_detect()` and `regex_letters` are defined outside this notebook;
# the call uses the "inverse" logical method together with the letters regex.
text_processed <- remove_if_detect(
  input_vector   = text_processed,
  regex          = regex_letters,
  logical_method = "inverse"
)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Drop elements with length of 1 character
# Build a keep-mask that is FALSE only for elements exactly one character
# long, then subset with it.
logical_length <- !(nchar(text_processed) == 1)
text_processed <- text_processed[logical_length]

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
# Drop any NA elements
# Keep only the positions whose value is not missing.
text_processed <- text_processed[which(!is.na(text_processed))]

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Remove month vectors
# `remove_month_vectors()` is a project helper defined outside this notebook.
text_processed <- text_processed %>%
  remove_month_vectors()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Remove elements which contain text related to downloading documents.
# Marker strings handed to `remove_if_detect()` (project helper defined
# outside this notebook) with `location = "any"`.
download_vec <- c(
  "This content downloaded",
  "http",
  "jsto",
  "JSTOR",
  "DOI",
  "doi",
  "://"
)

text_processed <- remove_if_detect(
  input_vector  = text_processed,
  remove_string = download_vec,
  location      = "any"
)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Remove elements which contain IP addresses
## Identify IP Address
# `regex_ip` and `remove_if_detect()` are defined outside this notebook.
text_processed <- remove_if_detect(
  input_vector = text_processed,
  regex        = regex_ip,
  location     = "any"
)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Identify line and page breaks
# `page_line_break_identification()` is a project helper defined outside this
# notebook.
text_processed <- text_processed %>%
  page_line_break_identification()

# Drop page breaks
text_processed <- text_processed[text_processed != "<page_break>"]

## Drop line breaks
### Trade-off with tokenizing on line break. Dropping all line break vectors
### will mean they cannot be used to tokenize
# The flag is read again later when deciding whether to tokenize on line
# breaks. `line_break_tag` is not defined in the visible part of this
# notebook - presumably provided by the project; verify.
drop_line_breaks <- TRUE

if (drop_line_breaks) {
  text_processed <- text_processed[text_processed != line_break_tag]
}

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Concatenate Adjacent Elements If Initial Element Ends With Hyphen
# `concat_hypen_vector()` is a project helper defined outside this notebook.
text_processed <- text_processed %>%
  concat_hypen_vector()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Remove text within parentheses
# NOTE: this step is disabled; the call and its output check are kept here,
# commented out, for reference.
# text_processed <- remove_text_parens(text_processed)
#
## Output Check
# inspect_text_vector(input_text = text_processed, start_index = 200, span = 20)
## Remove Periods From Common Abbreviations
## Adjust common error traps
# Both steps are project helpers defined outside this notebook; they run in
# this order: abbreviations first, then error traps.
text_processed <- text_processed %>%
  remove_period_abbr() %>%
  fix_common_error_traps()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 1,
  span = 25
)
## Skip the leading part of the document
# When the vector has more than `initial_cutoff` elements, keep the elements
# from position `initial_cutoff` through the end (the element at the cutoff
# position itself is retained). Shorter vectors are left untouched.
initial_cutoff <- 50

if (length(text_processed) > initial_cutoff) {
  text_length <- length(text_processed)
  text_processed <- text_processed[seq(from = initial_cutoff, to = text_length)]
}

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 1,
  span = 25
)
## Prototype: standardize hypothesis statements that carry no number
# Walk through the text vector; the first match of `regex_hypothesis_no_num`
# (defined outside this notebook) in an element is replaced by a sequentially
# numbered tag of the form "<split>hypo <j>: ". Elements without a match are
# copied through unchanged. `text_processed` itself is not modified here.
input_vector <- text_processed

# Initialize
output_vector <- character(length(input_vector))
j <- 1

# Search for hypothesis in correct format
for (i in seq_along(input_vector)) {
  input.str <- input_vector[i]

  # Diagnostic output: index and content of the element being examined.
  print(i)
  print(input.str)

  # Test if hypothesis format is in string
  detect_hypothesis <- stringr::str_detect(
    string = input.str,
    pattern = regex_hypothesis_no_num
  )

  # If hypothesis is detected, replace with standardized format.
  # `str_detect()` returns NA for a missing element and `if (NA)` is an error,
  # so `isTRUE()` is used: NA elements fall through to the copy branch.
  if (isTRUE(detect_hypothesis)) {
    standardized_string <- paste0("<split>hypo ", j, ": ")

    output_string <- stringr::str_replace(
      string = input.str,
      pattern = regex_hypothesis_no_num,
      replacement = standardized_string
    )

    output_vector[i] <- output_string
    j <- j + 1
  } else {
    output_vector[i] <- input.str
  }
}

# Print the result for inspection.
output_vector
## Hypothesis
# `standardize_hypothesis_proposition()` is a project helper defined outside
# this notebook.
text_processed <- standardize_hypothesis_proposition(input.str = text_processed)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 1,
  span = 25
)
## Test if any hypothesis standardized
# Count every occurrence of the standardized tag across the vector.
# `na.rm = TRUE` keeps the total well-defined when an element is NA; without
# it the sum would be NA and the `if` below would raise an error.
n_hypothesis_test <- sum(
  stringr::str_count(
    string = text_processed,
    pattern = "<split>hypo"
  ),
  na.rm = TRUE
)

## If no hypothesis detected, attempt to standardize hypothesis/proposition
## formats without number/labels
# `standardize_hypothesis_proposition_no_num()` is a project helper defined
# outside this notebook.
if (n_hypothesis_test == 0) {
  text_processed <- standardize_hypothesis_proposition_no_num(
    input_vector = text_processed
  )
}

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Remove Duplicate Tags
# `remove_duplicate_tag()` is a project helper defined outside this notebook.
text_processed <- remove_duplicate_tag(input.str = text_processed)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
# Remove trailing period for standardized hypothesis tags
# `remove_period()` is a project helper defined outside this notebook.
text_processed <- remove_period(input.str = text_processed)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Tricky terms - second pass, called with `normalize = FALSE`
# Counterpart of the earlier `tricky_terms()` call that used
# `normalize = TRUE`; the helper is defined outside this notebook.
text_processed <- tricky_terms(input_text = text_processed, normalize = FALSE)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Pass 1 - Tokenizers - Default
# Join all elements into one string separated by single spaces, split that
# string into sentences (punctuation is kept), and flatten the result back
# into a character vector.
text_processed <- text_processed %>%
  stringr::str_c(collapse = " ") %>%
  tokenizers::tokenize_sentences(strip_punct = FALSE) %>%
  unlist()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 100,
  span = 25
)
## Pass 2 - Tokenize on line break tags
# Only runs when the line-break elements were kept earlier
# (`drop_line_breaks` is FALSE); otherwise there is nothing to split on.
if (!drop_line_breaks) {
  text_processed <- unlist(
    tokenizers::tokenize_regex(text_processed, pattern = line_break_tag)
  )
}

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Pass 3 - Split on periods
# Every element is split at each literal period and the pieces are flattened
# into one character vector.
text_processed <- unlist(
  stringr::str_split(string = text_processed, pattern = "\\.")
)

## Drop empty vectors
# Discard elements that are the empty string or a single space.
text_processed <- text_processed[!(text_processed %in% c("", " "))]

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Set everything to lowercase
text_processed <- text_processed %>%
  tolower()

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Break out hypothesis tags
# `break_out_hypothesis_tags()` is a project helper defined outside this
# notebook.
text_processed <- break_out_hypothesis_tags(input.v = text_processed)

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
## Replace double colons
## Replace colon/period instances (: .)
# Two literal clean-ups applied in sequence: ": :" becomes ":" and then
# ": ." becomes ":".
text_processed <- text_processed %>%
  stringr::str_replace_all(pattern = ": :", replacement = ":") %>%
  stringr::str_replace_all(pattern = ": \\.", replacement = ":")

# Output Check
inspect_text_vector(
  input_text = text_processed,
  start_index = 580,
  span = 25
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.