# R/pdf_text.R

# Defines functions: is_pdf_with_text, pdf_into_text_table, pdf_into_text_vector, pdf_text_into_csv, pdf_into_txt

#' Extract the text of a PDF and write it to a text file
#'
#' @param path Path to the input file; checked with `validate_ext()` against
#'   the "pdf" extension.
#' @param to Output destination, resolved through `to_parse()` with the "txt"
#'   extension.
#' @return The output path as resolved by `to_parse()`.
#' @export
pdf_into_txt <- function(path, to) {
  validate_ext(path, ext = "pdf")
  out_path <- to_parse(to, ext = "txt")
  # The text vector is written as-is; elements are not collapsed into a
  # single string before writing.
  pages <- pdf_into_text_vector(path)
  readr::write_lines(pages, out_path)
  out_path
}



#' Extract the text of a PDF and write it to a CSV file
#'
#' The CSV contains the table produced by `pdf_into_text_table()`.
#'
#' @param path Path to the input file; checked with `validate_ext()` against
#'   the "pdf" extension.
#' @param to Output destination. Unless `is_folder(to)` is true, it is checked
#'   with `validate_ext()` against the "csv" extension; it is then resolved
#'   through `to_parse()`.
#' @return The value returned by `write_csv()`.
#' @export
pdf_text_into_csv <- function(path, to = "") {
  # type... document or table
  validate_ext(path, ext = "pdf")
  if (!is_folder(to)) {
    validate_ext(to, ext = "csv")
  }
  to <- to_parse(to)
  # Extract from the input PDF (`path`); the previous code passed the output
  # destination (`to`) here, so the text was never read from the PDF.
  text_table <- pdf_into_text_table(path)
  write_csv(text_table, to)
}

#' Extract the text of a PDF as a character vector
#'
#' Uses `pdftools::pdf_text()` when `is_pdf_with_text(path)` is true and falls
#' back to OCR via `pdftools::pdf_ocr_text()` otherwise.
#'
#' @param path Path to the PDF file.
#' @param language Language code forwarded to `pdftools::pdf_ocr_text()` when
#'   the OCR fallback is used. Defaults to `"spa"`, the value that was
#'   previously hard-coded.
#' @return The result of `pdftools::pdf_text()` or `pdftools::pdf_ocr_text()`.
#' @export
pdf_into_text_vector <- function(path, language = "spa") {
  if (is_pdf_with_text(path)) {
    # Messages emitted while reading the text layer are silenced.
    suppressMessages(pdftools::pdf_text(path))
  } else {
    pdftools::pdf_ocr_text(path, language = language)
  }
}


#' Extract the text of a PDF as a data frame
#'
#' @param path Path to the PDF file, forwarded to `pdf_into_text_vector()`.
#' @return A data frame with one row per element of the extracted text
#'   vector: `page` (the element's position) and `text` (its content, kept as
#'   character).
#' @export
pdf_into_text_table <- function(path) {
  # `path` must be forwarded: calling pdf_into_text_vector() with no argument
  # fails because `path` has no default.
  txt <- pdf_into_text_vector(path)
  # Returned as the last expression (not an assignment) so the result is
  # visible; stringsAsFactors = FALSE keeps `text` character on R < 4.0.
  data.frame(page = seq_along(txt), text = txt, stringsAsFactors = FALSE)
}

#' Check whether a PDF reports any fonts
#'
#' @param path Path to the PDF file.
#' @return `TRUE` when `pdftools::pdf_fonts(path)` has at least one row,
#'   `FALSE` otherwise.
#' @export
is_pdf_with_text <- function(path) {
  # Original note: "PDF error: Invalid Font Weight" -- presumably the noise
  # that suppressMessages() is meant to hide; TODO confirm.
  suppressMessages({
    fonts <- pdftools::pdf_fonts(path)
    nrow(fonts) > 0
  })
}
# datasketch/turn documentation built on May 12, 2024, 7:48 a.m.