# R/utilExtractPdfText.R

#' Extraction of plain text from a pdf file.
#'
#' This function uses the \code{readPDF} function from the \code{tm}
#' (Text Mining) package to extract the pdf file's text. The reader is created
#' with the \code{"-layout"} text control option. The extracted lines are
#' joined into a single string separated by spaces, marked with the given
#' encoding and finally transliterated to ASCII with \code{iconv}.
#'
#' @param uri a string corresponding to the pdf file's name or full path.
#' @param enconding a string with the encoding mark to declare on the extracted
#'   text before it is converted to ASCII (passed to \code{Encoding<-}).
#'   Defaults to \code{"UTF-8"}. The argument name keeps its historical
#'   spelling so that existing callers continue to work.
#' @return an object of type \code{character} (length one) containing the pdf
#'   file's extracted text, transliterated to ASCII.
#' @author Bruno M. S. S. Melo
#' @examples
#' \dontrun{
#' fullText <- utilExtractPdfText(uri = "DAIR.pdf", enconding = "UTF-8")
#' }
#' @seealso \code{tm::readPDF}
utilExtractPdfText <- function(uri, enconding = "UTF-8") {
  # Fail early with a clear message instead of deep inside the pdf reader.
  stopifnot(
    is.character(uri), length(uri) == 1L, !is.na(uri),
    is.character(enconding), length(enconding) == 1L, !is.na(enconding)
  )

  # tm::readPDF() is a reader factory: it returns the function that actually
  # reads the document.
  pdfReader <- tm::readPDF(control = list(text = "-layout"))
  pdfDoc <- pdfReader(elem = list(uri = uri), language = "en", id = "id1")

  # The first element of the returned document holds the extracted lines;
  # collapse them into one space-separated string.
  pdfText <- paste(pdfDoc[[1]], collapse = " ")

  # Declare the encoding of the raw text, then transliterate to plain ASCII.
  Encoding(pdfText) <- enconding

  # Last expression is the value itself (not an assignment), so the result is
  # returned visibly.
  iconv(enc2native(pdfText), to = "ASCII//TRANSLIT")
}
# brunomssmelo/TseWrangler documentation built on May 13, 2019, 8:07 a.m.