# Set the default knitr chunk option `echo = TRUE` for every chunk that
# follows in this notebook.
knitr::opts_chunk$set(echo = TRUE)
The purpose of this notebook is to compare different methods for converting raw PDFs to text.
# Package setup ----
# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it, so the install test has no side effects.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Attach pacman explicitly so p_load() is available even on a machine where
# pacman had to be installed by the step above.
library(pacman)

# p_load() attaches each package listed below.
p_load(
  data.table,
  dplyr,
  readxl,
  reticulate,
  stringr,
  tidyr
)
The following chunk imports the functions defined in the project's R scripts by sourcing each script.
# Import All Scripts
script_path <- "../R/"

# `pattern` is a regular expression: escape the dot and anchor at the end of
# the name so only files ending in ".R" are picked up (not ".Rmd", ".RData",
# or any name that merely contains a character followed by "R").
file_paths <- list.files(
  path = script_path,
  pattern = "\\.R$",
  recursive = TRUE,
  full.names = TRUE
)

# Source each script in turn. The loop variable avoids the name `file`, which
# would shadow base::file().
for (script_file in file_paths) {
  source(script_file)
}
# Import the Python module `tika.parser` and bind it to `parser`.
# NOTE(review): import() is presumably reticulate::import(), since reticulate
# is loaded earlier in this notebook - confirm.
parser <- import("tika.parser")
# PDF Input ----

## Define Path
# Location of the sample PDF used by the extraction chunks in this notebook.
# NOTE(review): "acadmic" looks like a misspelling of "academic"; the string
# is left unchanged here - confirm the directory name on disk before editing.
pdf_path <- "./../data/acadmic_papers_pdf_sample/jv04amj.pdf"
The following processes the sample PDF using Tika via the R package.
# Disabled: Tika extraction via the R package (currently commented out).
# pdf_txt_raw_r <- tika_text(pdf_path)
#
# pdf_txt_raw_r %>% substr(1,100)
# Parse the sample PDF with the Python `tika.parser` module bound to `parser`.
pdf_tika_py <- parser$from_file(pdf_path)

# Keep only the `content` element of the parser result.
pdf_txt_raw_py_tika <- pdf_tika_py$content

# Preview the first 100 characters of the extracted text.
pdf_txt_raw_py_tika %>%
  substr(1, 100)
# Source PDFMiner Script
source_python("./../source/pdf_to_text.py")

# pdf_to_text() is presumably defined by the Python script sourced above -
# confirm against ./../source/pdf_to_text.py.
pdf_txt_raw_py_pdfminer <- pdf_to_text(pdf_path)

# Preview the first 100 characters of the extracted text.
pdf_txt_raw_py_pdfminer %>%
  substr(1, 100)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.