In canfielder/CausalityExtraction: Hypothesis Extraction and Analysis for Scholarly Papers in Social Sciences

# Show each chunk's source code alongside its output in the knitted document.
knitr::opts_chunk$set(echo = TRUE)

Purpose

The purpose of this notebook is to test different methods for converting raw PDFs to text.

Import

Libraries

  # Make sure pacman is installed, then attach it explicitly.
  # install.packages() only installs; it does not attach the package, so
  # relying on require() alone leaves p_load() undefined on a first run.
  if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
  }
  library(pacman)

  # p_load() installs any missing packages and attaches all of them.
  p_load(
    data.table,
    dplyr,
    readxl,
    reticulate,
    stringr,
    tidyr
  )

Source Files

The following imports functions defined in the sourced R scripts.

# Import All Scripts
# Source every R script found (recursively) under `script_path`.
script_path <- "../R/"

# The pattern is a regular expression, so the dot is escaped and the match is
# anchored to the end of the name. An unanchored ".R" would also pick up
# files such as .Rmd, .RData, .Rds or .Rproj, which source() cannot run.
file_paths <- list.files(
  path = script_path,
  pattern = "\\.R$",
  recursive = TRUE,
  full.names = TRUE
)

# Loop variable is named `script_file` to avoid shadowing base::file().
for (script_file in file_paths) {
  source(script_file)
}

Python Modules

# Bind the Python module `tika.parser` to an R object through reticulate.
parser <- reticulate::import(module = "tika.parser")

Data

# PDF Input
## Relative path to the sample PDF passed to each extraction method below.
pdf_path <- "./../data/acadmic_papers_pdf_sample/jv04amj.pdf"

R - RTika

The following processes the sample PDF using Tika via the R package.

# NOTE(review): this R-side Tika extraction is disabled. tika_text() is not
# defined in this section, and no Tika R package is among those loaded via
# p_load(); presumably that is why it is commented out. Confirm before
# re-enabling.
# pdf_txt_raw_r <- tika_text(pdf_path)
# 
# pdf_txt_raw_r %>% substr(1,100)

Python - Tika

# Run the Python Tika parser on the sample PDF and keep the `content` field
# of the parse result as the raw text.
pdf_tika_py <- parser$from_file(pdf_path)
pdf_txt_raw_py_tika <- pdf_tika_py$content

# Preview the first 100 characters of the extracted text.
substr(pdf_txt_raw_py_tika, start = 1, stop = 100)

Python - PDF Miner

# Source the PDFMiner Python script so its definitions are callable from R.
reticulate::source_python(file = "./../source/pdf_to_text.py")

# Convert the sample PDF to text with pdf_to_text()
# (expected to be defined by the script sourced above).
pdf_txt_raw_py_pdfminer <- pdf_to_text(pdf_path)

# Preview the first 100 characters of the extracted text.
substr(pdf_txt_raw_py_pdfminer, start = 1, stop = 100)