## PDE: Extract Sentences and Tables from PDF Files.
## Copyright (C) 2020-2023 Erik Stricker
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <https://www.gnu.org/licenses/>.
#' PDE: Extract Tables and Sentences from PDF Files.
#'
#' The package includes two main components: 1) The PDE analyzer performs the
#' sentence and table extraction while 2) the PDE reader allows the
#' user-friendly visualization and quick processing of the obtained results.
#'
#' @section PDE functions: \code{\link{PDE_analyzer}}, \code{\link{PDE_analyzer_i}},
#' \code{\link{PDE_extr_data_from_pdfs}}, \code{\link{PDE_pdfs2table}},
#' \code{\link{PDE_pdfs2table_searchandfilter}}, \code{\link{PDE_pdfs2txt_searchandfilter}},
#' \code{\link{PDE_reader_i}}, \code{\link{PDE_install_Xpdftools4.02}},
#' \code{\link{PDE_check_Xpdf_install}}
#'
#' @docType package
#' @name PDE
NULL
#> NULL
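## typical entry points (a minimal usage sketch; see the help pages above):
##   library(PDE)
##   PDE_analyzer_i()  ## interactive PDE analyzer (sentence and table extraction)
##   PDE_reader_i()    ## interactive PDE reader (visualize and process the results)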
## 1.4.6
## TODO save progress with tsv file
## TODO export as excel file
## TODO export column width
## declare global variables
PDE.globals <- new.env()
PDE.globals$jumpto.list <- list()
PDE.globals$le.progress.textbox <- list()
PDE.globals$mark.list <- list()
PDE.globals$tables.masterlist <- list()
PDE.globals$ttanalyzer <- list()
#' Deprecated functions in package \sQuote{PDE}
#'
#' @description These functions are provided for compatibility with older versions
#' of \sQuote{PDE} only, and will be defunct at the next release.
#'
#' @details The following functions are deprecated and will be made defunct; use
#' the replacement indicated below:
#' \itemize{
#'
#' \item{PDE_path: \code{system.file(package = "PDE")}}
#'
#' }
#'
#' @name PDE-deprecated
NULL
#> NULL
#'Export the installation path of the PDE (PDF Data Extractor) package
#'
#'\code{PDE_path} is deprecated. Please run \code{system.file(package = "PDE")} instead.
#'
#'@return The function returns a potential path for the PDE package. If the PDE
#'tool was not correctly installed it returns "".
#'
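#'@examples
#' ## PDE_path() is deprecated; the replacement call returns the same location:
#' system.file(package = "PDE")
#'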
#'@export
PDE_path <- function(){
.Deprecated("system.file(package = \"PDE\")", package= "PDE",old = "PDE_path")
## set PDE library location
out_path <- ""
for (dir in .libPaths()){
if (dir.exists(paste0(dir,"/PDE/R"))){
out_path <- paste0(dir,"/PDE/")
break
}
}
return(out_path)
}
#'Check if the Xpdftools are installed and in the system path
#'
#'\code{PDE_check_Xpdf_install} runs a version test for pdftotext, pdftohtml and pdftopng.
#'
#'@param sysname String. In case the function returns "Unknown OS" the sysname can be set manually.
#' Allowed options are "Windows", "Linux", "SunOS" for Solaris, and "Darwin" for Mac. Default: \code{NULL}.
#'@param verbose Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.
#'
#'
#'@return The function returns a Boolean for the installation status and a message in case
#' the commands are not detected.
#'
#'@examples
#'
#' PDE_check_Xpdf_install()
#'
#'@export
PDE_check_Xpdf_install <- function(sysname=NULL, verbose=TRUE){
## receive pdftotext, pdftohtml and pdftopng information from config file if it exists
xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
dir.create(dirname(xpdf_config_location), recursive = TRUE, showWarnings = FALSE)
pdftotext_location <- NULL
pdftohtml_location <- NULL
pdftopng_location <- NULL
if (file.exists(xpdf_config_location)){
pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)
if (length(pdftotext_location) > 0){
if (file.exists(pdftotext_location) == FALSE) pdftotext_location <- NULL
} else {
pdftotext_location <- NULL
}
pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)
if (length(pdftohtml_location) > 0){
if (file.exists(pdftohtml_location) == FALSE) pdftohtml_location <- NULL
} else {
pdftohtml_location <- NULL
}
pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
if (length(pdftopng_location) > 0){
if (file.exists(pdftopng_location) == FALSE) pdftopng_location <- NULL
} else {
pdftopng_location <- NULL
}
}
# if either the config file does not exist or the xpdf tool files do not exist
if (!file.exists(xpdf_config_location) ||
is.null(pdftotext_location) ||
is.null(pdftohtml_location) ||
is.null(pdftopng_location)) {
if (is.null(sysname)) {
sysname <- Sys.info()["sysname"]
}
show_file_path_linux <- function(filename){
whereis_output <- system(paste0("whereis -b ",filename), intern = TRUE)
only_dirs <- sub("^ ","",sub(paste0("^",filename,":"),"",whereis_output))
if (only_dirs == ""){
return(NULL)
} else {
return(strsplit(gsub(" /",";/",only_dirs),split = ";")[[1]])
}
}
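## illustration (assuming pdftotext is installed under /usr/bin): for the
## whereis output "pdftotext: /usr/bin/pdftotext /usr/local/bin/pdftotext",
## show_file_path_linux("pdftotext") returns
## c("/usr/bin/pdftotext", "/usr/local/bin/pdftotext")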
show_file_path_solaris <- function(filename){
whereis_test <- suppressWarnings(tryCatch(system(paste0("/usr/ucb/whereis ",filename),
intern = TRUE)[1],
error=function(err) NULL))
if (length(whereis_test) != 0) {
whereis_output <- system(paste0("/usr/ucb/whereis ",filename), intern = TRUE)
only_dirs <- sub("^ ","",sub(paste0("^",filename,":"),"",whereis_output))
if (only_dirs == ""){
return(NULL)
} else {
return(strsplit(gsub(" /",";/",only_dirs),split = ";")[[1]])
}
} else {
return(NULL)
}
}
if (sysname == "Windows") {
pdftotext_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftotext", intern = TRUE))
if (length(attributes(pdftotext_location)$status) > 0) pdftotext_location <- NULL
pdftohtml_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftohtml", intern = TRUE))
if (length(attributes(pdftohtml_location)$status) > 0) pdftohtml_location <- NULL
pdftopng_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftopng", intern = TRUE))
if (length(attributes(pdftopng_location)$status) > 0) pdftopng_location <- NULL
} else if (sysname == "Linux") {
pdftotext_location <- show_file_path_linux("pdftotext")
pdftohtml_location <- show_file_path_linux("pdftohtml")
pdftopng_location <- show_file_path_linux("pdftopng")
} else if (sysname == "SunOS") {
pdftotext_location <- show_file_path_solaris("pdftotext")
pdftohtml_location <- show_file_path_solaris("pdftohtml")
pdftopng_location <- show_file_path_solaris("pdftopng")
} else if (sysname == "Darwin") {
pdftotext_location <- suppressWarnings(system("which -a pdftotext", intern = TRUE))
pdftohtml_location <- suppressWarnings(system("which -a pdftohtml", intern = TRUE))
pdftopng_location <- suppressWarnings(system("which -a pdftopng", intern = TRUE))
} else{
stop("Unknown OS. Please set sysname option.")
}
}
out <- TRUE
files <- NULL
if (length(pdftotext_location) == 0) {
files <- c(files,"pdftotext")
out=FALSE
}
if (length(pdftohtml_location) == 0) {
files <- c(files,"pdftohtml")
out=FALSE
}
if (length(pdftopng_location) == 0) {
files <- c(files,"pdftopng")
out=FALSE
}
## if the command line tools were all detected
if (out == TRUE) {
## test pdftotext version
pdftotext_path <- NULL
pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
keeplayouttxtpath <- paste0(dirname(pdfpath),"/test_txt/test_keeplayout.txt")
for (i in 1:length(pdftotext_location)){
dir.create(dirname(keeplayouttxtpath))
status <- suppressWarnings(system(paste0("\"", pdftotext_location[i], "\" \"", "-layout",
"\" \"", pdfpath, "\" \"", keeplayouttxtpath,
"\""),
wait = TRUE, ignore.stderr = TRUE, intern = TRUE))
if (file.exists(keeplayouttxtpath)) {
unlink(dirname(keeplayouttxtpath), recursive = TRUE)
pdftotext_path <- pdftotext_location[i]
break
}
unlink(dirname(keeplayouttxtpath), recursive = TRUE)
}
## test pdftohtml version
pdftohtml_path <- NULL
pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
htmlpath <- paste0(dirname(pdfpath),"/test_html/test.html")
for (i in 1:length(pdftohtml_location)){
dir.create(dirname(htmlpath))
status <- system(paste0("\"",pdftohtml_location[i],"\" \"", pdfpath,
"\" \"", htmlpath, "\""), wait = TRUE,
ignore.stderr = TRUE, intern = TRUE)
if (dir.exists(htmlpath) && file.exists(paste0(htmlpath, "/index.html"))) {
unlink(dirname(htmlpath), recursive = TRUE)
pdftohtml_path <- pdftohtml_location[i]
break
}
unlink(dirname(htmlpath), recursive = TRUE)
}
## test pdftopng
pdftopng_path <- NULL
pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
pngpath <- paste0(dirname(pdfpath),"/test_png/test.png")
for (i in 1:length(pdftopng_location)){
dir.create(dirname(pngpath))
status <- suppressWarnings(system(paste0("\"",pdftopng_location[i],"\" \"",
"-f", "\" \"", 1, "\" \"", "-l",
"\" \"", 1, "\" \"", pdfpath, "\" \"",
pngpath,"\""),
wait = TRUE, ignore.stderr = TRUE, intern = TRUE))
if (file.exists(sub("test.png$","test.png-000001.png",pngpath))) {
unlink(dirname(pngpath), recursive = TRUE)
pdftopng_path <- pdftopng_location[i]
break
}
unlink(dirname(pngpath), recursive = TRUE)
}
if (length(pdftotext_path) > 0 &&
length(pdftohtml_path) > 0 &&
length(pdftopng_path) > 0) {
write(paste(pdftotext_path,pdftohtml_path,pdftopng_path, sep = "\n"),
file = xpdf_config_location)
attributes(out) <- list(msg = "Correct version of Xpdf command line tools is installed.")
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
} else {
if (length(pdftotext_path) == 0) {
files <- c(files,"pdftotext")
out=FALSE
}
if (length(pdftohtml_path) == 0) {
files <- c(files,"pdftohtml")
out=FALSE
}
if (length(pdftopng_path) == 0) {
files <- c(files,"pdftopng")
out=FALSE
}
msg1 <- paste(" installed. Please install the Xpdf command line tools",
"using PDE_install_Xpdftools4.02()")
if (length(files) == 1) {
out.file <- files
attributes(out) <- list(msg = paste0("The wrong version of the ",
out.file, " file is",msg1))
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
} else if (length(files) == 2) {
out.file <- paste0(files[1], " and ", files[2])
attributes(out) <- list(msg = paste0("The wrong version of the ",
out.file, " files are",msg1))
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
} else if (length(files) == 3) {
out.file <- paste0(files[1], ", ", files[2], " and " , files[3])
attributes(out) <- list(msg = paste0("The wrong version of the ",
out.file, " files are",msg1))
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
}
}
## if one or more command line tools were not detected
} else {
msg1 <- paste(" not detected. Please install the Xpdf command line tools again",
"using PDE_install_Xpdftools4.02()")
if (length(files) == 1) {
out.file <- files
attributes(out) <- list(msg = paste0(out.file, " file",msg1))
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
} else if (length(files) == 2) {
out.file <- paste0(files[1], " and ", files[2])
attributes(out) <- list(msg = paste0(out.file, " files",msg1))
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
} else if (length(files) == 3) {
out.file <- paste0(files[1], ", ", files[2], " and " , files[3])
attributes(out) <- list(msg = paste0(out.file, " files",msg1))
if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
}
}
return(out)
}
#'Install the Xpdf command line tools 4.02
#'
#'\code{PDE_install_Xpdftools4.02} downloads and installs the XPDF command line tools 4.02.
#'
#'@param sysname String. In case the function returns "Unknown OS" the sysname can be set manually.
#' Allowed options are "Windows", "Linux", "SunOS" for Solaris, and "Darwin" for Mac. Default: \code{NULL}.
#'@param bin String. In case the function returns "Unknown OS" the bit version of the
#' operating system can be set manually. Allowed options are "64" and "32". Default: \code{NULL}.
#'@param verbose Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.
#'@param permission Numerical. If set to 0, the user is asked for permission to
#' download the Xpdf command line tools. If set to 1, no user input is required. Default: \code{0}.
#'
#'
#'@return The function returns a Boolean for the installation status and a message in case
#' the commands are not installed.
#'
#'@examples
#' \dontrun{
#'
#' PDE_install_Xpdftools4.02()
#'
#' }
#'
#'
#'@export
PDE_install_Xpdftools4.02 <- function(sysname=NULL, bin=NULL, verbose=TRUE, permission = 0){
## check if Xpdftools are installed
install.test <- PDE_check_Xpdf_install(verbose=FALSE)
downloadq <- FALSE
installq <- FALSE
out_msg <- NULL
out <- NULL
xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
dir.create(dirname(xpdf_config_location), recursive = TRUE, showWarnings = FALSE)
## set xpdf library location
xpdf_bin_path <- paste0(system.file(package = "PDE"),"/bin")
if (is.null(sysname)) sysname <- Sys.info()["sysname"]
download.test <- FALSE
if (sysname == "Windows") {
if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02"))) download.test <- TRUE
} else if (sysname == "Linux" || sysname == "SunOS") {
if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02"))) download.test <- TRUE
} else if (sysname == "Darwin") {
if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02"))) download.test <- TRUE
} else {
stop("Unknown OS. Please set sysname option.")
}
if (is.null(bin)){
if (grepl("32",Sys.info()[["machine"]])) {
bin <- "32"
} else if (grepl("64",Sys.info()[["machine"]])) {
bin <- "64"
} else {
stop("Unknown OS. Please set sysname option.")
}
}
if (bin != "64" && bin != "32"){
stop("Unknown OS. Please set bin option.")
}
## determine operating system and download correct xpdf
if (permission == 0){
if (download.test == FALSE){
## determine operating system and download correct Xpdf command line tools
downloadq <- utils::menu(c("Y", "N"), title="Do you want to download and install xpdf version 4.02? (y/n)") == 1
installq <- downloadq
} else {
downloadq <- utils::menu(c("Y", "N"), title=paste("Xpdf command line tools 4.02 are already downloaded.",
"Do you want to download the Xpdf command line tools version 4.02 again? (y/n)")) == 1
if (install.test == TRUE){
installq <- utils::menu(c("Y", "N"), title=paste("Working versions of Xpdf command line tools are already installed.",
"Do you want to still (re)install",
"the Xpdf command line tools version 4.02? (y/n)")) == 1
} else {
installq <- utils::menu(c("Y", "N"), title=paste("Do you want to also install",
"the Xpdf command line tools version 4.02? (y/n)")) == 1
}
}
} else {
downloadq <- TRUE
installq <- TRUE
}
if (downloadq == TRUE){
if (sysname == "Windows") {
utils::download.file("https://raw.githubusercontent.com/erikstricker/PDE/master/inst/examples/bin/xpdf-tools-win-4.02.zip",
destfile = paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip"),
mode = "wb")
utils::unzip(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip"),exdir = xpdf_bin_path)
remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip")))
download.test <- TRUE
} else if (sysname == "Linux" || sysname == "SunOS") {
utils::download.file("https://raw.githubusercontent.com/erikstricker/PDE/master/inst/examples/bin/xpdf-tools-linux-4.02.tar.gz",
destfile = paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz"),
mode = "wb")
utils::untar(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz"),exdir = xpdf_bin_path)
remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz")))
download.test <- TRUE
} else if (sysname == "Darwin") {
utils::download.file("https://raw.githubusercontent.com/erikstricker/PDE/master/inst/examples/bin/xpdf-tools-mac-4.02.tar.gz",
destfile = paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz"))
utils::untar(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz"),exdir = xpdf_bin_path)
remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz")))
download.test <- TRUE
} else {
stop("Unknown OS. Please set sysname option.")
}
}
if (download.test == TRUE){
if (sysname == "Windows") {
filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02/bin",bin))
ext <- ".exe"
} else if (sysname == "Linux" || sysname == "SunOS") {
filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02/bin",bin))
ext <- ""
} else if (sysname == "Darwin") {
filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02/bin",bin))
ext <- ""
} else {
stop("Unknown OS. Please set sysname option.")
}
out_msg <- c(out_msg,paste0("Location of Xpdf command line tools 4.02: ",filepath))
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
attributes(out) <- list(msg = out_msg,path=filepath)
## "Installation"
if (installq == TRUE){
pdftotext_path <- normalizePath(paste0(filepath,"/pdftotext",ext))
pdftohtml_path <- normalizePath(paste0(filepath,"/pdftohtml",ext))
pdftopng_path <- normalizePath(paste0(filepath,"/pdftopng",ext))
write(paste(pdftotext_path,pdftohtml_path,pdftopng_path, sep = "\n"),
file = xpdf_config_location)
out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were successfully installed.")
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
attributes(out) <- list(msg = out_msg,path=filepath)
out <- TRUE
} else {
out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were not installed.")
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
attributes(out) <- list(msg = out_msg,path=filepath)
out <- FALSE
}
} else {
out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were not downloaded.")
attributes(out) <- list(msg = out_msg,path="")
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
out <- FALSE
}
return(out)
}
#'Extracting data from a PDF (Portable Document Format) file
#'
#'\code{.PDE_extr_data_from_pdf} extracts sentences or tables from a single PDF
#'file and writes output in the corresponding folder.
#'
#'@param pdf String. Path to the PDF file to be analyzed.
#'@param whattoextr String. Either \emph{txt}, \emph{tab}, or \emph{tabandtxt}
#' for PDFS2TXT (extract sentences from a PDF file) or PDFS2TABLE (export
#' tables of a PDF file to a Microsoft Excel file) extraction. \emph{tab} allows the extraction
#' of tables with and without search words while \emph{txt} and \emph{tabandtxt}
#' require search words.
#'@param out String. Directory chosen to save analysis results in. Default:
#' \code{"."}.
#'@param filter.words List of strings. The list of filter words. If not
#' \code{NA} or \code{""} a hit will be counted every time a word from the list
#' is detected in the article.
#' Default: \code{""}.
#'@param regex.fw Logical. If TRUE filter words will follow the regex rules
#' (see \url{https://github.com/erikstricker/PDE/blob/master/inst/examples/cheatsheets/regex.pdf}).
#' Default = \code{TRUE}.
#'@param ignore.case.fw Logical. Are the filter words case-sensitive (does
#' capitalization matter)? Default: \code{FALSE}.
#'@param filter.word.times Numeric or string. The minimum number of
#' \code{filter.words} hits required for a paper to be further analyzed. Can
#' either be expressed as an absolute number or as a percentage of the total
#' number of words (by adding the "%" sign). Default: \code{"0.2\%"}.
#'@param table.heading.words List of strings. Different than standard (TABLE,
#' TAB or table plus number) headings to be detected. Regex rules apply (see
#' also
#' \url{https://github.com/erikstricker/PDE/blob/master/inst/examples/cheatsheets/regex.pdf}).
#' Default = \code{""}.
#'@param ignore.case.th Logical. Are the additional table headings (see
#' \code{table.heading.words}) case-sensitive (does capitalization matter)?
#' Default = \code{FALSE}.
#'@param search.words List of strings. List of search words. To extract all
#' tables from the PDF file leave \code{search.words = ""}.
#'@param search.word.categories List of strings. List of categories with the
#' same length as the list of search words. Accordingly, each search word can be
#' assigned to a category, of which the word counts will be summarized in the
#' \code{PDE_analyzer_word_stats.csv} file. If search.word.categories has a
#' different length than search.words, the parameter will be ignored.
#' Default: \code{NULL}.
#'@param save.tab.by.category Logical. Can only be used with search.word.categories.
#' If set to TRUE, tables that carry search words will be saved in sub-folders
#' according to the search word category of the detected search word.
#' Default: \code{FALSE}.
#'@param regex.sw Logical. If TRUE search words will follow the regex rules
#' (see \url{https://github.com/erikstricker/PDE/blob/master/inst/examples/cheatsheets/regex.pdf}).
#' Default = \code{TRUE}.
#'@param ignore.case.sw Logical. Are the search words case-sensitive (does
#' capitalization matter)? Default: \code{FALSE}.
#'@param eval.abbrevs Logical. Should abbreviations for the search words be
#' automatically detected and then replaced with the search word + "$*"?
#' Default: \code{TRUE}.
#'@param out.table.format String. Output file format. Either comma separated
#' file \code{.csv} or tab separated file \code{.tsv}. The encoding indicated
#' in parentheses should be selected according to the operating system
#' exported tables are opened in, i.e., Windows: \code{"(WINDOWS-1252)"}; Mac:
#' \code{(macintosh)}; Linux: \code{(UTF-8)}. Default: \code{".csv"} and
#' encoding depending on the operating system.
#'@param dev_x Numeric. For a table the size of indentation which would be
#' considered the same column. Default: \code{20}.
#'@param dev_y Numeric. For a table the vertical distance which would be
#' considered the same row. Can be either a number or set to dynamic detection
#' [9999], in which case the font size is used to detect which words are in the
#' same row.
#' Default: \code{9999}.
#'@param context Numeric. Number of sentences extracted before and after the
#' sentence with the detected search word. If \code{0} only the sentence with
#' the search word is extracted. Default: \code{0}.
#'@param write.table.locations Logical. If \code{TRUE}, a separate file with the
#' headings of all tables, their relative location in the generated html and
#' txt files, as well as information if search words were found will be
#' generated. Default: \code{FALSE}.
#'@param exp.nondetc.tabs Logical. If \code{TRUE}, if a table was detected in a
#' PDF file but is an image or cannot be read, the page with the table will be
#' exported as a png. Default: \code{TRUE}.
#'@param write.tab.doc.file Logical. If \code{TRUE}, if search words are used
#' for table detection and no search words were found in the tables of a PDF
#' file, a file will be created with the PDF filename followed by
#' \strong{no.table.w.search.words}. Default: \code{TRUE}.
#'@param write.txt.doc.file Logical. If \code{TRUE}, if no search words were
#' found in the sentences of a PDF file, a file will be created with the PDF
#' filename followed by \strong{no.txt.w.search.words}. If the PDF file is
#' empty, a file will be created with the PDF filename followed by
#' \strong{no.content.detected}. If the filter word threshold is not met,
#' a file will be created with the PDF filename followed by
#' \strong{no.txt.w.filter.words}. Default: \code{TRUE}.
#'@param delete Logical. If \code{TRUE}, the intermediate \strong{txt},
#' \strong{keeplayouttxt} and \strong{html} copies of the PDF file will be
#' deleted. Default: \code{TRUE}.
#'@param cpy_mv String. Either "nocpymv", "cpy", or "mv". If filter words are used in the
#' analyses, the processed PDF files will either be copied ("cpy") or moved ("mv") into the
#' /pdf/ subfolder of the output folder. Default: \code{"nocpymv"}.
#'@param verbose Logical. Indicates whether messages will be printed in the
#' console. Default: \code{TRUE}.
#'
#'@return If tables were extracted from the PDF file, the function returns a list of
#' the following tables/items: 1) \strong{htmltablelines}, 2)
#' \strong{txttablelines}, 3) \strong{keeplayouttxttablelines}, 4) \strong{id},
#' 5) \strong{out_msg}.
#' The \strong{tablelines} are tables that provide the heading and position of
#' the detected tables. The \strong{id} provides the name of the PDF file. The
#' \strong{out_msg} includes all messages printed to the console or the suppressed
#' messages if \code{verbose=FALSE}.
#'
#'@examples
#'
#'## Running a simple analysis with filter and search words to extract sentences and tables
#'if(PDE_check_Xpdf_install() == TRUE){
#' outputtables <- .PDE_extr_data_from_pdf(pdf = paste0(system.file(package = "PDE"),
#' "/examples/Methotrexate/29973177_!.pdf"),
#' whattoextr = "tabandtxt",
#' out = paste0(system.file(package = "PDE"),"/examples/MTX_output+-0_test/"),
#' filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
#' ignore.case.fw = TRUE,
#' regex.fw = FALSE,
#' search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
#' ignore.case.sw = FALSE,
#' regex.sw = TRUE)
#'}
#'
#'## Running an advanced analysis with filter and search words to
#'## extract sentences and tables and obtain documentation files
#'if(PDE_check_Xpdf_install() == TRUE){
#' outputtables <- .PDE_extr_data_from_pdf(pdf = paste0(system.file(package = "PDE"),
#' "/examples/Methotrexate/29973177_!.pdf"),
#' whattoextr = "tabandtxt",
#' out = paste0(system.file(package = "PDE"),"/examples/MTX_output+-1_test/"),
#' context = 1,
#' dev_x = 20,
#' dev_y = 9999,
#' filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
#' ignore.case.fw = TRUE,
#' regex.fw = FALSE,
#' filter.word.times = "0.2%",
#' table.heading.words = "",
#' ignore.case.th = FALSE,
#' search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
#' ignore.case.sw = FALSE,
#' regex.sw = TRUE,
#' eval.abbrevs = TRUE,
#' out.table.format = ".csv (WINDOWS-1252)",
#' write.table.locations = TRUE,
#' write.tab.doc.file = TRUE,
#' write.txt.doc.file = TRUE,
#' exp.nondetc.tabs = TRUE,
#' cpy_mv = "nocpymv",
#' delete = TRUE)
#'}
#'
#'@seealso
#'\code{\link{PDE_pdfs2table}},\code{\link{PDE_pdfs2table_searchandfilter}},
#'\code{\link{PDE_pdfs2txt_searchandfilter}}
#'
#'@export
.PDE_extr_data_from_pdf <- function(pdf, whattoextr,
out = ".", filter.words = "", regex.fw = TRUE, ignore.case.fw = FALSE, filter.word.times = "0.2%",
table.heading.words = "", ignore.case.th = FALSE, search.words, search.word.categories = NULL,
save.tab.by.category = FALSE, regex.sw = TRUE,
ignore.case.sw = FALSE, eval.abbrevs = TRUE, out.table.format = ".csv (WINDOWS-1252)",
dev_x = 20, dev_y = 9999, context = 0, write.table.locations = FALSE, exp.nondetc.tabs = TRUE,
write.tab.doc.file = TRUE, write.txt.doc.file = TRUE, delete = TRUE, cpy_mv = "nocpymv",
verbose = TRUE){
## General functions -------------------------------------------
## Added re.escape function in v1.4.0 to convert regex to non-regex
re.escape <- function(string){
vals <- c("\\\\", "\\[", "\\]", "\\(", "\\)",
"\\{", "\\}", "\\^", "\\$","\\*",
"\\+", "\\?", "\\.", "\\|")
replace.vals <- paste0("\\\\", vals)
for(i in seq_along(vals)){
string <- gsub(vals[i], replace.vals[i], string)
}
return(string)
}
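## illustration: re.escape("IL-2 (high dose)") returns "IL-2 \\(high dose\\)",
## so the string matches only literally when regex rules are disabled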
create_id_from_stringlist <- function(list_of_strings, char_len = 8){
list_of_ints <- NULL
for (string in list_of_strings){
final <- 20
numbers <- utf8ToInt(string) - utf8ToInt("a") + 1L
if (length(numbers) > 1){
for (i in 2:length(numbers)){
final <- abs(round(mean(c(final, (numbers[[i-1]] %% numbers[[i]] + (numbers[[i]] * 10000))))))
}
}
list_of_ints <- c(list_of_ints, final)
}
set.seed(round(mean(list_of_ints)))
n <- 1
pool <- c(letters, LETTERS, 0:9)
res <- character(n) # pre-allocating vector is much faster than growing it
for(i in seq(n)){
this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
while(this_res %in% res){ # if there was a duplicate, redo
this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
}
res[i] <- this_res
}
return(res)
}
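## illustration: the id is deterministic, i.e., repeated calls such as
## create_id_from_stringlist(list("29973177_!", "Table 1")) always return the
## same 8-character alphanumeric id for the same input strings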
readin_txt <- function(txtpath) {
## read in the txt file
txtcontent_from_txtpath_lat1 <- readLines(txtpath, warn = FALSE, encoding = "latin1")
txtcontent_from_txtpath_utf8 <- readLines(txtpath, warn = FALSE, encoding = "UTF-8")
res_lat1 <- try(sum(nchar(txtcontent_from_txtpath_lat1)),silent = TRUE)
res_utf8 <- try(sum(nchar(txtcontent_from_txtpath_utf8)),silent = TRUE)
if (inherits(res_utf8,"try-error")){
## if utf8 throws an error
txtcontent_from_txtpath <- txtcontent_from_txtpath_lat1
} else if (inherits(res_lat1,"try-error")){
## if latin1 throws an error
txtcontent_from_txtpath <- txtcontent_from_txtpath_utf8
} else {
## latin1 preferred for 4.2.0 xpdf version
txtcontent_from_txtpath <- txtcontent_from_txtpath_lat1
}
for (r in 1:length(txtcontent_from_txtpath)){
## replace all fi
res <- try(gsub(intToUtf8(0xFB01),"fi",txtcontent_from_txtpath[r], fixed = TRUE),silent = TRUE)
if (inherits(res,"try-error")){
txtcontent_from_txtpath[r] <- iconv(txtcontent_from_txtpath[r], 'UTF-8', 'latin1', 'bit')
}
txtcontent_from_txtpath[r] <- gsub(intToUtf8(0xFB01),"fi",txtcontent_from_txtpath[r], fixed = TRUE)
}
return(txtcontent_from_txtpath)
}
page_splits <- function(txtcontent_to_split) {
## split the content according to pages
pagesplits <- grep("^\\f", txtcontent_to_split)
## if there is only one page or no txtcontent_to_split
if (length(pagesplits) > 1) {
## page split the line before
g <- rep.int(1, pagesplits[1] - 1)
for (p in 2:length(pagesplits)) {
g <- c(g, rep.int(p, pagesplits[p] - pagesplits[p - 1]))
}
g <- c(g, length(pagesplits))
splittxtcontent <- split(txtcontent_to_split, g)
} else {
splittxtcontent <- txtcontent_to_split
}
return(splittxtcontent)
}
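## illustration: if lines 5 and 9 of the txt copy start with a form feed
## ("\f"), page_splits() groups lines 1-4 as page 1 and lines 5-9 as page 2;
## a trailing "\f" line is attached to the final page rather than starting a
## new one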
find_similar_row <- function(originrow, targettablelines,
relative.match.col, output.for.originrow.only,
output.for.match, output.column.name) {
determine_similarity_1_vs_2 <- function(strings){
## determine similarity
matches_fsr <- NULL
for (pos in 1:length(strings[[1]])) {
counter_fsr <- 0
out_fsr <- FALSE
while (out_fsr == FALSE && counter_fsr < 4) {
## test if characters are the same
if (is.na(strings[[2]][pos + counter_fsr])) break
if (strings[[1]][pos] == strings[[2]][pos + counter_fsr]) {
out_fsr <- TRUE
} else {
counter_fsr <- counter_fsr + 1
}
} ## end while
matches_fsr <- c(matches_fsr, out_fsr)
}
## determine how much it matches
percent.match <- sum(matches_fsr, na.rm = TRUE)/length(strings[[1]])
return(percent.match)
}
## set variables
matchingrow <- NA
matchpercent <- NULL
targettablerow <- NULL
## check every row in targettablelines
for (targetrow in 1:nrow(targettablelines)) {
## skip lines that do not have matching pages
if (as.numeric(targettablelines[targetrow, "page"]) != as.numeric(originrow[["page"]])) next
x <- as.character(originrow[[relative.match.col]])
y <- as.character(targettablelines[targetrow, relative.match.col])
if (nchar(x) > nchar(y)) x <- substr(x, 1, nchar(y))
## Run one way around
strings1 <- sapply(list(x, y), strsplit, "")
strings2 <- sapply(list(y, x), strsplit, "")
percent.match1 <- determine_similarity_1_vs_2(strings1)
percent.match2 <- determine_similarity_1_vs_2(strings2)
percent.match <- max(percent.match1,percent.match2)
if (percent.match > 0.8) {
matchpercent <- rbind(matchpercent, c(originrow = 1,
targetrow = targetrow,
percent.match = percent.match))
}
}
## if no matching table is found
if (is.null(matchpercent)) {
originrow[output.column.name] <- output.for.originrow.only
targettablerow <- NA
## if only one row matches
} else if (nrow(matchpercent) == 1) {
originrow[output.column.name] <- output.for.match
targettablelines[matchpercent[1, "targetrow"],
output.column.name] <- output.for.match
targettablerow <- targettablelines[matchpercent[1, "targetrow"],]
## if multiple rows
} else {
maxrow <- grep(TRUE, matchpercent[, "percent.match"] %in% max(matchpercent[, "percent.match"]))[1]
originrow[output.column.name] <- output.for.match
targettablelines[matchpercent[maxrow, "targetrow"], output.column.name] <- output.for.match
targettablerow <- targettablelines[matchpercent[maxrow, "targetrow"],]
}
return(list(originrow = originrow, targettablelines = targettablelines,
targettablerow = targettablerow))
}
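## illustration: the headings "Table 2. Drug doses" (txt copy) and
## "Table 2: Drug doses" (html copy) differ in one character, so
## determine_similarity_1_vs_2 scores about 0.95 and the rows are matched as
## the same table (threshold: > 0.8)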
exp_nondetc_tabs <- function(input_table, pdfpath,
outputpath, detectinfile,
based.on.column, matches_end) {
## find all rows that match
matched.rows <- grep(matches_end, input_table[, based.on.column])
## find page from input_table
pages <- input_table[matched.rows, "page"]
exp.pages <- unique(pages)
## only start function if there are pages to export
if (length(exp.pages) > 0) {
dir.create(paste0(outputpath,"/tables"), showWarnings = FALSE)
for (page in exp.pages) {
system(paste0("\"",pdftopng_location,"\" \"",
"-f", "\" \"", page, "\" \"", "-l",
"\" \"", page, "\" \"", pdfpath, "\" \"",
outputpath, "/tables/", substr(basename(pdfpath),
1, regexpr(".pdf", basename(pdfpath)) -
1), "_page", page, "_w.table",
"\""), wait = TRUE, ignore.stderr = TRUE)
}
}
}
test_if_abbrev_in_parantheses <- function(searchword, paragraph,ignore.case.sw) {
output <- list(res = TRUE,"","","","")
## get the position of the search word in the paragraph
pos_searchword_start <- regexpr(searchword,paragraph,ignore.case = ignore.case.sw)[[1]]
pos_searchword_end <- pos_searchword_start + attr(regexpr(searchword,paragraph,
ignore.case = ignore.case.sw),"match.length") - 1
simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
## test if parentheses follow the search word
pos_open <- regexpr("\\(",substr(paragraph, pos_searchword_end + 1, nchar(paragraph)))[[1]]
pos_abbrev_start <- pos_open + 1 + pos_searchword_end
pos_close <- regexpr("\\)",substr(paragraph, pos_abbrev_start + 1, nchar(paragraph)))[[1]]
pos_abbrev_end <- pos_close + pos_abbrev_start - 1
sentence.end <- regexpr("\\. [0-9A-Z]",substr(paragraph, pos_searchword_end + 1, nchar(paragraph)))[[1]]
## regexpr returns -1 if no sentence end follows the search word
if (sentence.end == -1) {
sentence.end <- 999999
} else {
sentence.end <- sentence.end + pos_searchword_end - 1
}
## if both parentheses were found and there is no end of a sentence in between
if (pos_open > 0 &&
pos_close > 0 &&
sentence.end > pos_abbrev_end &&
pos_searchword_start > 0 &&
pos_searchword_end > 0) {
pos_ext_searchword_end <- pos_abbrev_start - 3
ext_searchword <- substr(paragraph, pos_searchword_start, pos_ext_searchword_end)
## get all letters within parentheses
current_char_pos <- pos_abbrev_start
current_char <- substr(paragraph, current_char_pos, current_char_pos)
char_list <- NULL
while (!grepl(" |\\)", current_char)){
char_list <- c(char_list, current_char)
current_char_pos <- current_char_pos + 1
current_char <- substr(paragraph, current_char_pos, current_char_pos)
}
char_list <- char_list[!(char_list %in% c("(", ")","[", "]", "/",
"{","}","\\"))]
ext_searchword <- gsub("[^[:alnum:] ]","",ext_searchword)
## test if letter description was found
if (length(char_list) > 0){
trunc_searchword <- ext_searchword
## match the letters to letters in the searchword
for (n in 1:length(char_list)){
pos_char <- regexpr(char_list[n], trunc_searchword, ignore.case = TRUE)[[1]]
removed.chars <- substr(trunc_searchword, 1, pos_char)
## if character was found in search word and abbreviation and
## there were no words without character in searchword removed
if (pos_char > 0 && !length(gregexpr(" ",removed.chars)[[1]]) > 1) {
trunc_searchword <- substr(trunc_searchword, pos_char + 1, nchar(trunc_searchword))
} else {
output <- list(res = FALSE,"","","","")
break
}
}
## test if each word in extended searchword has letter in abbreviation
list.of.words <- gsub("[^A-z]","",
strsplit(ext_searchword," ")[[1]])[!(gsub("[^A-z]","",
strsplit(ext_searchword," ")[[1]]) %in% "")]
## allow one slack
if (length(list.of.words) - 1 <= length(char_list)){
result <- TRUE
trunc_abbrev <- char_list
## match the letters to letters in the searchword
for (n in 1:length(list.of.words)){
## if all abbreviation letters are used up, allow one slack, then fail
if (length(trunc_abbrev) == 0){
if (result == TRUE){
## one slack
result <- FALSE
next
} else {
output <- list(res = FALSE,"","","","")
break
}
}
pos_char <- regexpr(substr(list.of.words[n],1,1), trunc_abbrev, ignore.case = TRUE)[[1]]
removed.chars <- trunc_abbrev[pos_char]
## if character was found in search word and abbreviation and
## there were no words without character in searchword removed
if (pos_char > 0 && !nchar(removed.chars) > 1) {
trunc_abbrev <- trunc_abbrev[-pos_char]
} else if (result == TRUE){
## one slack
result <- FALSE
} else {
output <- list(res = FALSE,"","","","")
break
}
}
} else {
output <- list(res = FALSE,"","","","")
}
## test if letters did match search word
if (output[[1]]) {
## for plural search words
if (char_list[length(char_list)] == "s" &&
substr(ext_searchword, nchar(ext_searchword),nchar(ext_searchword)) == "s"){
abbrev_plural <- paste(char_list, collapse = "")
replacement_plural <- paste0(abbrev_plural," (",ext_searchword,")$*")
abbrev_singular <- paste(char_list[-length(char_list)], collapse = "")
simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
replacement_singular <- paste0(abbrev_singular," (",simple_searchword,")$*")
} else {
abbrev_singular <- paste(char_list, collapse = "")
replacement_singular <- paste0(abbrev_singular," (",ext_searchword,")$*")
abbrev_plural <- paste(c(char_list,"s"), collapse = "")
replacement_plural <- paste0(abbrev_plural," (",ext_searchword,"s)$*")
}
output <- list(res = TRUE,abbrev_singular = abbrev_singular,
replacement_singular = replacement_singular,
abbrev_plural = abbrev_plural,replacement_plural = replacement_plural)
}
} else {
output <- list(res = FALSE,"","","","")
}
} else {
## abbrev found, abbrev_singular, replacement_singular, abbrev_plural, replacement_plural
output <- list(res = FALSE,"","","","")
}
return(output)
}
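## illustration: for the paragraph "... methotrexate (MTX) was given ..." and
## the search word "methotrexate", the function returns res = TRUE,
## abbrev_singular = "MTX" and replacement_singular = "MTX (methotrexate)$*"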
test_if_abbrev_double_dots_or_equal <- function(searchword, paragraph, ignore.case.sw) {
output <- list(res = TRUE,"","","","")
## get the position of the search word in the paragraph
pos_searchword_start <- regexpr(searchword,paragraph, ignore.case = ignore.case.sw)[[1]]
pos_searchword_end <- pos_searchword_start + attr(regexpr(searchword,paragraph,
ignore.case = ignore.case.sw),
"match.length") - 1
## test if : or = is before it
minus_three_chars <- substr(paragraph, (pos_searchword_start - 3),(pos_searchword_start - 1))
if (grepl("(:|=)", minus_three_chars)) {
pos_ext_searchword_end <- (regexpr("[^[:alnum:] ]",
substr(paragraph, pos_searchword_end + 1,
pos_searchword_end + 3)))[[1]] +
pos_searchword_end - 1
ext_searchword <- substr(paragraph, pos_searchword_start, pos_ext_searchword_end)
## get all letters before the : or =
current_char_pos <- pos_searchword_start - 2 - lengths(regmatches(minus_three_chars,
gregexpr(" ", minus_three_chars)))
current_char <- substr(paragraph, current_char_pos, current_char_pos)
char_list <- NULL
while (grepl("[A-z|0-9]", current_char)){
char_list <- c(current_char, char_list)
current_char_pos <- current_char_pos - 1
current_char <- substr(paragraph, current_char_pos, current_char_pos)
}
char_list <- char_list[!(char_list %in% c("(", ")","[", "]", "/",
"{","}","\\"))]
## test if letter description was found
if (length(char_list) > 0){
trunc_searchword <- gsub("[^[:alnum:] ]","",ext_searchword)
## match the letters to letters in the searchword
for (n in 1:length(char_list)){
pos_char <- regexpr(char_list[n], trunc_searchword, ignore.case = TRUE)[[1]]
removed.chars <- substr(trunc_searchword, 1, pos_char)
## if character was found in search word and abbreviation and
## there were no words without character in searchword removed
if (pos_char > 0 && !length(gregexpr(" ",removed.chars)[[1]]) > 1) {
trunc_searchword <- substr(trunc_searchword, pos_char + 1, nchar(trunc_searchword))
} else {
output <- list(res = FALSE,"","","","")
break
}
}
## test if each word in extended searchword has letter in abbreviation
list.of.words <- gsub("[^A-z]","",
strsplit(ext_searchword," ")[[1]])[!(gsub("[^A-z]","",
strsplit(ext_searchword,
" ")[[1]]) %in% "")]
## allow one slack
if (length(list.of.words) - 1 <= length(char_list)){
result <- TRUE
trunc_abbrev <- char_list
## match the letters to letters in the searchword
for (n in 1:length(list.of.words)){
## if all abbreviation letters are used up, allow one slack, then fail
if (length(trunc_abbrev) == 0){
if (result == TRUE){
result <- FALSE
next
} else {
output <- list(res = FALSE,"","","","")
break
}
}
pos_char <- regexpr(substr(list.of.words[n],1,1), trunc_abbrev, ignore.case = TRUE)[[1]]
removed.chars <- trunc_abbrev[pos_char]
## if character was found in search word and abbreviation and
## there were no words without character in searchword removed
if (pos_char > 0 && !nchar(removed.chars) > 1) {
trunc_abbrev <- trunc_abbrev[-pos_char]
} else if (result == TRUE){
## one slack
result <- FALSE
} else {
output <- list(res = FALSE,"","","","")
break
}
}
} else {
output <- list(res = FALSE,"","","","")
}
## test if letters did match search word
if (output[[1]]) {
## for plural search words
if (char_list[length(char_list)] == "s" &&
substr(ext_searchword, nchar(ext_searchword),nchar(ext_searchword)) == "s"){
abbrev_plural <- paste(char_list, collapse = "")
replacement_plural <- paste0(abbrev_plural," (",ext_searchword,")$*")
abbrev_singular <- paste(char_list[-length(char_list)], collapse = "")
simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
replacement_singular <- paste0(abbrev_singular," (",simple_searchword,")$*")
} else {
abbrev_singular <- paste(char_list, collapse = "")
replacement_singular <- paste0(abbrev_singular," (",ext_searchword,")$*")
abbrev_plural <- paste(c(char_list,"s"), collapse = "")
replacement_plural <- paste0(abbrev_plural," (",ext_searchword,"s)$*")
}
output <- list(res = TRUE,abbrev_singular = abbrev_singular,
replacement_singular = replacement_singular,
abbrev_plural = abbrev_plural,replacement_plural = replacement_plural)
}
} else {
output <- list(res = FALSE,"","","","")
}
} else {
## abbrev found, abbrev_singular, replacement_singular, abbrev_plural, replacement_plural
output <- list(res = FALSE,"","","","")
}
return(output)
}
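## illustration: for the paragraph "MTX = methotrexate, given weekly" and the
## search word "methotrexate", the function returns res = TRUE,
## abbrev_singular = "MTX" and replacement_singular = "MTX (methotrexate)$*"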
deletefile <- function(verbose=TRUE) {
out_msg <- NULL
if (delete == TRUE) {
## clean up
unlink(txtpath, recursive = TRUE)
if (file.exists(txtpath)) {
out_msg <- c(out_msg, paste0("Could not delete: ", txtpath))
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
}
unlink(keeplayouttxtpath, recursive = TRUE)
if (file.exists(keeplayouttxtpath)) {
out_msg <- c(out_msg, paste0("Could not delete: ", keeplayouttxtpath))
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
}
unlink(htmlpath, recursive = TRUE)
if (file.exists(htmlpath)) {
out_msg <- c(out_msg, paste0("Could not delete: ", htmlpath))
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
}
}
return(out_msg)
}
## lookup table of HTML entities and the characters they encode (HTML 4
## definitions), shared by replace.html.entity() and insert.html.entity()
html.entity.map <- c(
"&Agrave;" = intToUtf8(0x00C0), "&Aacute;" = intToUtf8(0x00C1),
"&Acirc;" = intToUtf8(0x00C2), "&Atilde;" = intToUtf8(0x00C3),
"&Auml;" = intToUtf8(0x00C4), "&Aring;" = intToUtf8(0x00C5),
"&AElig;" = intToUtf8(0x00C6), "&Ccedil;" = intToUtf8(0x00C7),
"&Egrave;" = intToUtf8(0x00C8), "&Eacute;" = intToUtf8(0x00C9),
"&Ecirc;" = intToUtf8(0x00CA), "&Euml;" = intToUtf8(0x00CB),
"&Igrave;" = intToUtf8(0x00CC), "&Iacute;" = intToUtf8(0x00CD),
"&Icirc;" = intToUtf8(0x00CE), "&Iuml;" = intToUtf8(0x00CF),
"&ETH;" = intToUtf8(0x00D0), "&Ntilde;" = intToUtf8(0x00D1),
"&Ograve;" = intToUtf8(0x00D2), "&Oacute;" = intToUtf8(0x00D3),
"&Ocirc;" = intToUtf8(0x00D4), "&Otilde;" = intToUtf8(0x00D5),
"&Ouml;" = intToUtf8(0x00D6), "&Oslash;" = intToUtf8(0x00D8),
"&Ugrave;" = intToUtf8(0x00D9), "&Uacute;" = intToUtf8(0x00DA),
"&Ucirc;" = intToUtf8(0x00DB), "&Uuml;" = intToUtf8(0x00DC),
"&Yacute;" = intToUtf8(0x00DD), "&THORN;" = intToUtf8(0x00DE),
"&szlig;" = intToUtf8(0x00DF), "&agrave;" = intToUtf8(0x00E0),
"&aacute;" = intToUtf8(0x00E1), "&acirc;" = intToUtf8(0x00E2),
"&atilde;" = intToUtf8(0x00E3), "&auml;" = intToUtf8(0x00E4),
"&aring;" = intToUtf8(0x00E5), "&aelig;" = intToUtf8(0x00E6),
"&ccedil;" = intToUtf8(0x00E7), "&egrave;" = intToUtf8(0x00E8),
"&eacute;" = intToUtf8(0x00E9), "&ecirc;" = intToUtf8(0x00EA),
"&euml;" = intToUtf8(0x00EB), "&igrave;" = intToUtf8(0x00EC),
"&iacute;" = intToUtf8(0x00ED), "&icirc;" = intToUtf8(0x00EE),
"&iuml;" = intToUtf8(0x00EF), "&eth;" = intToUtf8(0x00F0),
"&ntilde;" = intToUtf8(0x00F1), "&ograve;" = intToUtf8(0x00F2),
"&oacute;" = intToUtf8(0x00F3), "&ocirc;" = intToUtf8(0x00F4),
"&otilde;" = intToUtf8(0x00F5), "&ouml;" = intToUtf8(0x00F6),
"&oslash;" = intToUtf8(0x00F8), "&ugrave;" = intToUtf8(0x00F9),
"&uacute;" = intToUtf8(0x00FA), "&ucirc;" = intToUtf8(0x00FB),
"&uuml;" = intToUtf8(0x00FC), "&yacute;" = intToUtf8(0x00FD),
"&thorn;" = intToUtf8(0x00FE), "&yuml;" = intToUtf8(0x00FF),
"&iexcl;" = intToUtf8(0x00A1), "&cent;" = intToUtf8(0x00A2),
"&pound;" = intToUtf8(0x00A3), "&curren;" = intToUtf8(0x00A4),
"&yen;" = intToUtf8(0x00A5), "&brvbar;" = intToUtf8(0x00A6),
"&sect;" = intToUtf8(0x00A7), "&uml;" = intToUtf8(0x00A8),
"&copy;" = intToUtf8(0x00A9), "&ordf;" = intToUtf8(0x00AA),
"&laquo;" = intToUtf8(0x00AB), "&not;" = intToUtf8(0x00AC),
"&reg;" = intToUtf8(0x00AE), "&macr;" = intToUtf8(0x00AF),
"&deg;" = intToUtf8(0x00B0), "&plusmn;" = intToUtf8(0x00B1),
"&sup2;" = intToUtf8(0x00B2), "&sup3;" = intToUtf8(0x00B3),
"&acute;" = intToUtf8(0x00B4), "&micro;" = intToUtf8(0x00B5),
"&para;" = intToUtf8(0x00B6), "&cedil;" = intToUtf8(0x00B8),
"&sup1;" = intToUtf8(0x00B9), "&ordm;" = intToUtf8(0x00BA),
"&raquo;" = intToUtf8(0x00BB), "&frac14;" = intToUtf8(0x00BC),
"&frac12;" = intToUtf8(0x00BD), "&frac34;" = intToUtf8(0x00BE),
"&iquest;" = intToUtf8(0x00BF), "&times;" = intToUtf8(0x00D7),
"&divide;" = intToUtf8(0x00F7), "&circ;" = intToUtf8(0x02C6),
"&tilde;" = intToUtf8(0x007E)) ## &tilde; is normalized to the ASCII tilde
replace.html.entity <- function(input.with.html) {
output.without.html <- input.with.html
## markup entities are decoded here but never re-encoded by insert.html.entity
output.without.html <- gsub("&amp;","&",output.without.html, fixed = TRUE)
output.without.html <- gsub("&lt;","<",output.without.html, fixed = TRUE)
output.without.html <- gsub("&gt;",">",output.without.html, fixed = TRUE)
output.without.html <- gsub("&nbsp;"," ",output.without.html, fixed = TRUE)
for (entity in names(html.entity.map)){
output.without.html <- gsub(entity, html.entity.map[[entity]], output.without.html, fixed = TRUE)
}
## replace the fi ligature
output.without.html <- gsub(intToUtf8(0xFB01),"fi",output.without.html, fixed = TRUE)
return(output.without.html)
}
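## illustration: replace.html.entity("caf&eacute; &amp; co") returns "café & co"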
insert.html.entity <- function(input.without.html) {
output.with.html <- input.without.html
## re-encode special characters with their HTML entities (inverse of
## replace.html.entity, except for the markup entities and the fi ligature)
for (entity in names(html.entity.map)){
output.with.html <- gsub(html.entity.map[[entity]], entity, output.with.html, fixed = TRUE)
}
return(output.with.html)
}
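## illustration: insert.html.entity("café & co") returns "caf&eacute; & co";
## plain ASCII characters such as "&" are left untouched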
update_progress_info <- function(print_message){
if (length(PDE.globals$le.progress.textbox) > 0){
## add completion info
progress_info_length <- length(tcltk2::tk2list.get(PDE.globals$le.progress.textbox))
if (progress_info_length > 3) {
new_list <- tcltk2::tk2list.get(PDE.globals$le.progress.textbox)[!grepl("^$",
tcltk2::tk2list.get(PDE.globals$le.progress.textbox))]
} else {
new_list <- tcltk2::tk2list.get(PDE.globals$le.progress.textbox)
}
tcltk::tkconfigure(PDE.globals$le.progress.textbox,values = c(new_list,print_message))
tcltk::tkconfigure(PDE.globals$le.progress.textbox,textvariable = tcltk::tclVar(print_message))
tcltk::tcl("update")
}
}
remove_backref <- function(x) {
## remove every backslash so that "\1"-style backreferences cannot be
## interpreted when the heading is later used as a regex pattern
for (s in 1:length(x)){
x[s] <- gsub("\\\\", "", x[s])
}
return(x)
}
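## A minimal example (hypothetical input): remove_backref("Table \\1 data")
## returns "Table 1 data", so a heading that happens to contain "\1" cannot
## act as a backreference in later grep/sub calls.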
## set all indicator variables ---------------------------
integrity.indicator <- TRUE ## indicates if txt, keeplayouttxt and html copy of the PDF file are created correctly
filterwords.go <- FALSE ## indicator if filter words were found or not set
searchwords.go <- FALSE ## indicator if search words were found or not set
nexti <- FALSE ## indicator for html table to be processed
output_files <- NULL ## this is the output to return at the end
out_msg <- NULL
sw_in_tab_counted <- FALSE
## set statistics output -----------------------------------------
stat_output <- NULL
pdf_word_count <- 0
pdf_page_count <- 0
pdf_filterwords <- NULL
pdf_filterword_times <- NULL
pdf_filterword_names <- NULL
pdf_filterword_total <- NULL
pdf_searchwords <- NULL
pdf_searchword_times <- NULL
pdf_searchword_names <- NULL
pdf_searchword_total <- NULL
pdf_sentences_w_searchwords <- NA
search.word.category_total <- NULL
## set the paths of the files ---------------------------------
output <- NULL
pdfpath <- pdf
txtpath <- gsub(".pdf[^.pdf]*$", ".txt", pdfpath)
keeplayouttxtpath <- gsub(".pdf[^.pdf]*$", "_keeplayout.txt",
pdfpath)
##make sure filter and search words do not have duplicates
stop_ind <- FALSE
if (any(duplicated(search.words))){
print_message <- paste0("Following search words are duplicated in list: ",
paste(search.words[duplicated(search.words)], collapse = ";"),
". Please remove the duplicates and restart the analysis.")
out_msg <- c(out_msg, print_message)
update_progress_info(print_message)
stop_ind <- TRUE
}
if (any(duplicated(filter.words))){
print_message <- paste0("Following search words are duplicated in list: ",
paste(filter.words[duplicated(filter.words)], collapse = ";"),
". Please remove the duplicates and restart the analysis.")
out_msg <- c(out_msg, print_message)
update_progress_info(print_message)
stop_ind <- TRUE
}
if (stop_ind == TRUE){
stop("Words were duplicated in the keyword list. Please remove the duplicates and restart the analysis.")
}
## create the id and output dir ----------------------------------
dir.create(out, showWarnings = FALSE)
id <- sub("^(.*)\\..*$", "\\1", basename(txtpath))
print_message <- paste0("Following file is processing: \'",id,".pdf\'")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
## 1) Create txt and html copies of PDF file ---------------------------------------
## test if Xpdftools are installed
xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
if (file.exists(xpdf_config_location)){
pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)
pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)
pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
## fall back to the full check if a config entry is missing or stale
if (length(pdftotext_location) == 0 || !file.exists(pdftotext_location[1]) ||
length(pdftohtml_location) == 0 || !file.exists(pdftohtml_location[1]) ||
length(pdftopng_location) == 0 || !file.exists(pdftopng_location[1])){
install.test <- PDE_check_Xpdf_install(verbose=verbose)
} else {
install.test <- TRUE
}
} else {
install.test <- PDE_check_Xpdf_install(verbose=verbose)
}
if (install.test == FALSE) {
if (length(PDE.globals$le.progress.textbox) > 0){
result_install <- tk_messageBox(type = "yesno",
paste0(attributes(install.test)$msg,
" Do you want to download and install xpdf now?"), caption = "xpdf not installed")
if (result_install == "yes"){
PDE_install_Xpdftools4.02(permission = 1)
update_progress_info("Please stop and restart the analysis.")
stop("Please stop and restart the analysis.")
} else {
update_progress_info(attributes(install.test)$msg)
stop(attributes(install.test)$msg)
}
} else {
stop(attributes(install.test)$msg)
}
} else {
pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)
pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)
pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
}
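## Note: XPDF_DIR.config is read line by line and each tool is picked out by
## grep, so every line is expected to hold one full binary path; a
## hypothetical entry could read:
## C:/Users/me/Documents/xpdf-tools-win-4.02/bin64/pdftotext.exe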
system(paste0("\"",pdftotext_location,"\" -layout",
" \"", pdfpath, "\" \"", keeplayouttxtpath,
"\""), wait = TRUE, ignore.stderr = TRUE)
system(paste0("\"",pdftotext_location,"\" \"", pdfpath,
"\" \"", txtpath, "\""), wait = TRUE,
ignore.stderr = TRUE)
htmlpath <- gsub(".pdf[^.pdf]*$", ".html", pdfpath)
## convert PDF to HTML
system(paste0("\"",pdftohtml_location,"\" \"", pdfpath,
"\" \"", htmlpath, "\""), wait = TRUE,
ignore.stderr = TRUE)
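## The three system() calls above amount to running, e.g. (assuming a file
## "article.pdf" and the Xpdf binaries resolved above):
## pdftotext -layout "article.pdf" "article_keeplayout.txt"
## pdftotext "article.pdf" "article.txt"
## pdftohtml "article.pdf" "article.html"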
## test if the keeplayout txt file was saved under a mis-encoded name
fixed_basekeeplayouttxtpath1 <- iconv(basename(keeplayouttxtpath), from = "Windows-1252", to = "UTF-8")
fixed_keeplayouttxtpath1 <- paste0(dirname(keeplayouttxtpath),"/",fixed_basekeeplayouttxtpath1)
fixed_basekeeplayouttxtpath2 <- fixed_basekeeplayouttxtpath1
Encoding(fixed_basekeeplayouttxtpath2) <- "Windows-1252"
fixed_keeplayouttxtpath2 <- paste0(dirname(keeplayouttxtpath),"/",fixed_basekeeplayouttxtpath2)
if (file.exists(fixed_keeplayouttxtpath1)){
response <- file.rename(from=fixed_keeplayouttxtpath1, to=keeplayouttxtpath)
} else if (file.exists(fixed_keeplayouttxtpath2)){
response <- file.rename(from=fixed_keeplayouttxtpath2, to=keeplayouttxtpath)
}
## test if the txt file was saved under a mis-encoded name
fixed_basetxtpath1 <- iconv(basename(txtpath), from = "Windows-1252", to = "UTF-8")
fixed_txtpath1 <- paste0(dirname(txtpath),"/",fixed_basetxtpath1)
fixed_basetxtpath2 <- fixed_basetxtpath1
Encoding(fixed_basetxtpath2) <- "Windows-1252"
fixed_txtpath2 <- paste0(dirname(txtpath),"/",fixed_basetxtpath2)
if (file.exists(fixed_txtpath1)){
response <- file.rename(from=fixed_txtpath1, to=txtpath)
} else if (file.exists(fixed_txtpath2)){
response <- file.rename(from=fixed_txtpath2, to=txtpath)
}
## test if the html directory was saved under a mis-encoded name
fixed_basehtmlpath1 <- iconv(basename(htmlpath), from = "Windows-1252", to = "UTF-8")
fixed_htmlpath1 <- paste0(dirname(htmlpath),"/",fixed_basehtmlpath1)
fixed_basehtmlpath2 <- fixed_basehtmlpath1
Encoding(fixed_basehtmlpath2) <- "Windows-1252"
fixed_htmlpath2 <- paste0(dirname(htmlpath),"/",fixed_basehtmlpath2)
if (dir.exists(fixed_htmlpath1)){
htmlpath <- fixed_htmlpath1
} else if (dir.exists(fixed_htmlpath2)){
htmlpath <- fixed_htmlpath2
}
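## Example (hypothetical): for "Müller.pdf" the converters may write
## "MÃ¼ller.txt", i.e. the UTF-8 bytes of "ü" read as Windows-1252; the
## renaming above maps such output back to the expected file names.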
## add completion info
update_progress_info(print_message)
## 2.1) Check txt and html file integrity ----------------------------------
integrity.indicator <- TRUE
## check if html was created (if pdf is secured)
if (!dir.exists(htmlpath) || !file.exists(paste0(htmlpath, "/index.html"))) {
## export error and do not remove file change
print_message <- paste0("\'", id, ".pdf\' is most likely secured and cannot be processed!")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
dir.create(paste0(out,"/secured"), showWarnings = FALSE)
write(paste0(pdfpath, " is most likely secured and cannot be processed!"),
file = paste0(out,"/secured/",id, "_is_secured.txt"))
integrity.indicator <- FALSE
## if cpymv is either cpy or mv and filterwords were used
if (cpy_mv == "cpy" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/secured"))
print_message <- paste0(basename(pdf),
" was copied to \'", out,"/pdfs/secured\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else if (cpy_mv == "mv" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/secured"))
unlink(pdf)
print_message <- paste0(basename(pdf),
" was moved to \'", out,"/pdfs/secured\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
## if html was created
} else {
## read in the txt files
txtcontent <- readin_txt(txtpath)
##split rows with double page splits
doublepagesplits <- grep("^\\f.*\\f", txtcontent)
counter <- 0
for (dbps in doublepagesplits){
new_rows <- strsplit(txtcontent[dbps + counter],"\\f")[[1]][-1]
for (r in length(new_rows):2){
txtcontent[dbps + counter] <- paste0("\f",new_rows[1])
txtcontent <- unlist(append(txtcontent, list(paste0("\f",new_rows[r])), dbps + counter))
counter <- counter + 1
}
}
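## Worked example (hypothetical): a single txt row "\fPage 2 text\fPage 3 text"
## (two page breaks collapsed into one row) is split into "\fPage 2 text" and
## "\fPage 3 text", so page_splits() below sees one page break per row.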
splittxtcontent <- page_splits(txtcontent)
keeplayouttxtcontent <- readin_txt(keeplayouttxtpath)
indexcontent <- readLines(paste0(htmlpath, "/index.html"))
## if the txt or html files have no content
if (identical(indexcontent, "")) {
## export error and do not remove file change
print_message <- paste0("\'", id, ".pdf\' is most likely secured and cannot be processed!")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
dir.create(paste0(out,"/secured"), showWarnings = FALSE)
write(paste0(pdfpath, " is most likely secured and cannot be processed!"),
file = paste0(out,"/secured/",id, "_is_secured.txt"))
integrity.indicator <- FALSE
## if cpymv is either cpy or mv and filterwords were used
if (cpy_mv == "cpy" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/secured"))
print_message <- paste0(basename(pdf),
" was copied to \'", out,"/pdfs/secured\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else if (cpy_mv == "mv" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/secured"))
unlink(pdf)
print_message <- paste0(basename(pdf),
" was moved to \'", out,"/pdfs/secured\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
} else if (identical(txtcontent, "") || identical(keeplayouttxtcontent, "") ||
identical(gsub("\f","",txtcontent), "") || identical(gsub("\f","",keeplayouttxtcontent), "") ) {
print_message <- paste0("\'", id, ".pdf\' most likely contains no text content or is a scanned document!")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
write(paste0(pdfpath, " most likely contains no text content or is a scanned document!"),
file = paste0(out, "/nr/", id, "_non-readable.txt"))
integrity.indicator <- FALSE
## if cpymv is either cpy or mv and filterwords were used
if (cpy_mv == "cpy" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/nr"))
print_message <- paste0(basename(pdf),
" was copied to \'", out,"/pdfs/nr\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else if (cpy_mv == "mv" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/nr"))
unlink(pdf)
print_message <- paste0(basename(pdf),
" was moved to \'", out,"/pdfs/nr\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
## when txtcontent is there and html index was created
} else {
## extract all the page names ##
pages <- NULL
for (line in indexcontent) {
if (grepl("a href=\"", line)) pages <- c(pages,
substr(line, regexpr("a href=\"", line) + 8,
regexpr("html", line) + 3))
}
## read in the html file ##
htmlcontent <- vector("list", length(pages))
for (i in 1:length(pages)) {
htmlpagecontent <- readLines(paste0(htmlpath,"/", pages[i]), encoding = "UTF-8", warn = FALSE)
if (identical(htmlpagecontent, "")) {
print_message <- paste0("\'", id, ".pdf\' most likely contains no text content or is a scanned in document!")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
write(paste0(pdfpath, " most likely contains no text content or is a scanned in document!"),
file = paste0(out, "/nr/", id, "_non-readable.txt"))
integrity.indicator <- FALSE
## if cpymv is either cpy or mv and filterwords were used
if (cpy_mv == "cpy" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/nr"))
print_message <- paste0(basename(pdf),
" was copied to \'", out,"/pdfs/nr\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else if (cpy_mv == "mv" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/nr"))
unlink(pdf)
print_message <- paste0(basename(pdf),
" was moved to \'", out,"/pdfs/nr\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
break
}
## replace all fi
lines_with_fi <- grep(intToUtf8(0xFB01),htmlpagecontent)
for (line in lines_with_fi){
res <- try(gsub(intToUtf8(0xFB01),"fi",htmlpagecontent[line], fixed = TRUE),silent = TRUE)
if (inherits(res,"try-error")){
htmlpagecontent[line] <- iconv(htmlpagecontent[line], 'UTF-8', 'latin1', 'byte')
}
htmlpagecontent[line] <- gsub(intToUtf8(0xFB01),"fi",htmlpagecontent[line], fixed = TRUE)
}
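## Note: 0xFB01 is the single-glyph ligature "fi" (U+FB01) that pdftohtml can
## emit instead of the letters "fi"; expanding it keeps search words such as
## "filter" matchable in the html content.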
## replace p styles
## get the list of p styles
list_of_fts <- NULL
lines_with_ft <- grep("\\.ft",htmlpagecontent)
if (length(lines_with_ft) > 0){
for (ln in lines_with_ft){
list_of_fts <- rbind(list_of_fts,
cbind(ft=sub("\\{.*$","",substr(htmlpagecontent[ln],
regexpr("\\.ft",htmlpagecontent[ln])[1]+1,
nchar(htmlpagecontent[ln]))),
style=sub("\\}.*$","",sub("^.*\\{","",substr(htmlpagecontent[ln],
regexpr("\\.ft",htmlpagecontent[ln])[1]+1,
nchar(htmlpagecontent[ln]))))))
}
## replace each ft style in that page
for (ft.numb in 1:nrow(list_of_fts)){
htmlpagecontent <- gsub(paste0("\" class=\"",list_of_fts[ft.numb,1],"\""),
paste0(";",list_of_fts[ft.numb,2],"\""),htmlpagecontent)
}
}
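## Sketch of the transformation above (hypothetical html): a stylesheet rule
## ".ft3{font-size:11px;color:#000;}" plus a line
## <span id="..." style="position:absolute;..." class="ft3">cell</span>
## becomes
## <span id="..." style="position:absolute;...;font-size:11px;color:#000;">cell</span>
## so the font size can be parsed per line in section 5.1 below.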
htmlcontent[[i]] <- htmlpagecontent
}
} ## end if the txt or html files have no content
} ## end if (!dir.exists(htmlpath))
if (integrity.indicator == TRUE) {
## if the file is only images or empty then don't process
realcontent <- gsub("^\\f", "", paste(txtcontent,
collapse = ""))
realcontent <- gsub("\t", "", realcontent)
realcontent <- gsub(" ", "", realcontent)
if (realcontent == "") {
integrity.indicator <- FALSE
print_message <- paste0("\'", id, ".pdf\' has no readable content")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
## write an empty file
if (write.txt.doc.file == TRUE) {
dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
write(paste0(pdfpath, " has no readable content in PDF file"),
file = paste0(out, "/nr/", id, "_non-readable.txt"))
integrity.indicator <- FALSE
## if cpymv is either cpy or mv and filterwords were used
if (cpy_mv == "cpy" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/nr"))
print_message <- paste0(basename(pdf),
" was copied to \'", out,"/pdfs/nr\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else if (cpy_mv == "mv" && filter.words[1] != ""){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/nr"))
unlink(pdf)
print_message <- paste0(basename(pdf),
" was moved to \'", out,"/pdfs/nr\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
}
}
}
## if there was an issue with creating files
if (integrity.indicator == FALSE) {
pdf_word_count <- 0
pdf_page_count <- 0
pdf_filterwords <- NULL
pdf_searchwords <- NULL
search.word.category_total <- NULL
if (!filter.words[1] == "") {
for (i in 1:length(filter.words)) {
word <- filter.words[i]
## add filterwords to output
if (ignore.case.fw == TRUE){
ic <- "ic"
} else {
ic <- "nic"
}
if (regex.fw == TRUE){
reg <- "regex"
} else {
reg <- "nregex"
}
pdf_filterwords <- c(pdf_filterwords, paste0("FW_",ic,"_",reg,":",word))
pdf_filterword_total <- NA
pdf_filterword_times <- rep(NA,length(pdf_filterwords))
}
}
if (!search.words[1] == "") {
for (i in 1:length(search.words)) {
word <- search.words[i]
## add searchwords to output
if (ignore.case.sw == TRUE){
ic <- "ic"
} else {
ic <- "nic"
}
if (regex.sw == TRUE){
reg <- "regex"
} else {
reg <- "nregex"
}
pdf_searchwords <- c(pdf_searchwords, paste0("SW_",ic,"_",reg,":",word))
pdf_searchword_total <- NA
pdf_searchword_times <- rep(NA,length(pdf_searchwords))
## set search word category
search.word.category_total <- NULL
if (!is.null(search.word.categories) && length(search.words) == length(search.word.categories)){
for (swc in 1:length(unique(search.word.categories))){
search.word.category_total[swc] <- sum(pdf_searchword_times[search.word.categories %in% unique(search.word.categories)[swc]], na.rm = TRUE)
}
}
}
}
stat_output <- cbind(pdf_word_count = pdf_word_count, pdf_page_count = pdf_page_count,
pdf_filterword_total = pdf_filterword_total,
pdf_filterword_percentage = paste0(as.character(as.numeric(pdf_filterword_total)/as.numeric(pdf_word_count)*100),"%"),
pdf_searchword_total = pdf_searchword_total,rbind(search.word.category_total),
rbind(pdf_filterword_times), rbind(pdf_searchword_times))
extracol_num <- (ncol(stat_output) -
length(pdf_filterword_times) -
length(pdf_searchword_times) -
length(search.word.category_total) + 1)
colnames(stat_output)[extracol_num:ncol(stat_output)] <- c(unique(search.word.categories),pdf_filterwords,pdf_searchwords)
rownames(stat_output) <- id
output_files$stat_output <- data.frame(cbind(stat_output,
pdf_sentences_w_searchwords = NA),
check.names = FALSE)
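## Example of the resulting one-row stat_output (hypothetical keywords):
## pdf_word_count and pdf_page_count are 0; pdf_filterword_total,
## pdf_searchword_total, the per-keyword columns (named e.g.
## "FW_ic_nregex:mouse" or "SW_ic_nregex:MTX") and
## pdf_sentences_w_searchwords are all NA for a non-readable file.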
##if everything is ok
} else {
## determine word count of pdf file
pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
pdf_page_count <- length(htmlcontent)
## 2.2) Check all the options chosen for PDE analyzer ------------------------------
## if general extraction (search words is undefined), then context <- 0
if (search.words[1] == "" || search.words[1] == "*" ||
search.words[1] == ".") {
context <- 0
search.words <- ""
}
if (ignore.case.sw == FALSE) ignore.case.sw <- FALSE
else if (!ignore.case.sw == TRUE) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = "ignore.case.sw: ignore.case.sw has to be either TRUE or FALSE")
stop("ignore.case.sw: ignore.case.sw has to be either TRUE or FALSE")
}
if (is.na(filter.words[1])) filter.words <- ""
## adjust filter.words with regex
if (regex.fw == FALSE){
for (fw_pos in 1:length(filter.words)){
filter.words[fw_pos] <- re.escape(filter.words[fw_pos])
}
}
if (is.null(ncol(filter.words))) filter.word.table <- data.frame(words = filter.words,
ignore.case.fw = ignore.case.fw)
# if (!is.numeric(filter.word.times)) {
# tcltk::tkmessageBox(title = "Warning",
# type = "ok", icon = "warning",
# message = "filter.word.times: has to be a number")
# stop("filter.word.times: has to be a number")
# }
if (is.null(ncol(table.heading.words))) table.heading.words <- data.frame(words = table.heading.words,
ignore.case.th = ignore.case.th)
## adjust search.words with regex
if (regex.sw == FALSE){
for (sw_pos in 1:length(search.words)){
search.words[sw_pos] <- re.escape(search.words[sw_pos])
}
}
if (is.null(ncol(search.words))) search.word.table <- data.frame(words = search.words,
ignore.case.sw = ignore.case.sw)
if (write.table.locations == FALSE) write.table.locations <- FALSE
else if (!write.table.locations == TRUE) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = "write.table.locations: has to be either TRUE or FALSE")
stop("write.table.locations: has to be either TRUE or FALSE")
}
if (is.na(out.table.format)) {
out.table.format <- ".csv (WINDOWS-1252)"
} else if (!(out.table.format %in% c(".csv (WINDOWS-1252)", ".csv (macintosh)", ".csv (UTF-8)",
".tsv (WINDOWS-1252)",".tsv (macintosh)",".tsv (UTF-8)"))) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = paste("out.table.format: has to be either .csv (WINDOWS-1252), .csv (macintosh), .csv (UTF-8)",
"or .tsv (WINDOWS-1252), .tsv (macintosh), .tsv (UTF-8)"))
stop(paste("out.table.format: has to be either .csv (WINDOWS-1252), .csv (macintosh), .csv (UTF-8)",
"or .tsv (WINDOWS-1252), .tsv (macintosh), .tsv (UTF-8)"))
}
if (grepl("csv", out.table.format)) {
out.table.separator <- ","
out.table.ext <- ".csv"
}
if (grepl("tsv", out.table.format)) {
out.table.separator <- "\t"
out.table.ext <- ".tsv"
}
if (grepl("WINDOWS-1252", out.table.format)) {
out.encoding <- "WINDOWS-1252"
} else if (grepl("macintosh", out.table.format)) {
out.encoding <- "macintosh"
} else if (grepl("UTF-8", out.table.format)) {
out.encoding <- "UTF-8"
} else {
out.encoding <- "WINDOWS-1252"
}
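## e.g. out.table.format = ".tsv (UTF-8)" yields out.table.separator = "\t",
## out.table.ext = ".tsv" and out.encoding = "UTF-8"; the default
## ".csv (WINDOWS-1252)" yields ",", ".csv" and "WINDOWS-1252".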
if (!is.numeric(dev_x)) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = "dev_x: has to be a number")
stop("dev_x: has to be a number") ## deviation between cell positions, might
##have to be increased if words that should be in the same column
}
if (!is.numeric(context)) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = "context: has to be a number")
stop("context: has to be a number") ## +/- context number of sentences
## before and after the search word was found will be put out
}
if (write.tab.doc.file == FALSE) write.tab.doc.file <- FALSE
else if (!write.tab.doc.file == TRUE) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = "write.tab.doc.file: has to be either TRUE or FALSE")
stop("write.tab.doc.file: has to be either TRUE or FALSE")
}
if (write.txt.doc.file == FALSE) write.txt.doc.file <- FALSE
else if (!write.txt.doc.file == TRUE) {
tcltk::tkmessageBox(title = "Warning",
type = "ok", icon = "warning",
message = "write.txt.doc.file: has to be either TRUE or FALSE")
stop("write.txt.doc.file: has to be either TRUE or FALSE")
}
}
## reset variables
if (integrity.indicator == TRUE) {
output <- NULL
htmltablelines <- NULL
keeplayouttxttablelines <- NULL
txttablelines <- NULL
}
## 2.3) Make content ----------------------------------------
if (integrity.indicator == TRUE) {
content <- list(txtcontent, keeplayouttxtcontent)
## make a variable only with the txt content
txthtmlcontent <- htmlcontent
## go through pages
for (j in 1:length(txthtmlcontent)) {
## go through each html line
for (z in 1:length(txthtmlcontent[[j]])) {
## process the current html line
line <- txthtmlcontent[[j]][z]
line.txthtmlcontent <- ""
res <- try(utf8ToInt(line),silent = TRUE)
if (inherits(res,"try-error")) line <- iconv(line, 'UTF-8', 'latin1', 'byte')
##replace superscript
while (grepl("vertical-align:super",line)){
super_pos <- regexpr("vertical-align:super",line)[1]
## replace the end of the superscript with "]"
endspan_positions <- gregexpr("</span>",line)[[1]]
s <- 1
endspan_pos <- endspan_positions[s] - super_pos
while (endspan_pos<0){
s <- s + 1
endspan_pos <- endspan_positions[s] - super_pos
}
endspan_pos <- endspan_pos + super_pos
line_list <- unlist(strsplit(line, split = ""))
line_list[endspan_pos] <- "]<"
line <- paste(line_list,collapse = "")
## replace the beginning of the superscript with "^["
endofstartspan_positions <- gregexpr(";\">",line)[[1]]
s <- 1
endofstartspan_pos <- endofstartspan_positions[s] - super_pos
while (endofstartspan_pos<0){
s <- s + 1
endofstartspan_pos <- endofstartspan_positions[s] - super_pos
}
endofstartspan_pos <- endofstartspan_pos + super_pos
line_before_super <- sub("vertical-align:super.*", "", line)
startspan_pos <- max(gregexpr("<span id=",line_before_super)[[1]])
line_list <- unlist(strsplit(line, split = ""))
line_list[endofstartspan_pos+2] <- ">^["
line <- paste(line_list,collapse = "")
line <- sub("vertical-align:super", "vertical-align:baseline", line)
# line <- sub(paste0("^(.{",as.numeric(startspan_pos)-1,"})(.{",as.numeric(endofstartspan_pos-startspan_pos+2),"})."),"\\1^[",
# line)
}
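## Worked example (hypothetical html): in
## x<span id="b" style="vertical-align:super;">2</span>
## the loop marks the span content so the extracted text reads "x^[2]", and
## the style is set to vertical-align:baseline so the while loop terminates.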
##replace subscript
while (grepl("vertical-align:sub",line)){
sub_pos <- regexpr("vertical-align:sub",line)[1]
## replace the end of the subscript with "]"
endspan_positions <- gregexpr("</span>",line)[[1]]
s <- 1
endspan_pos <- endspan_positions[s] - sub_pos
while (endspan_pos<0){
s <- s + 1
endspan_pos <- endspan_positions[s] - sub_pos
}
endspan_pos <- endspan_pos + sub_pos
line_list <- unlist(strsplit(line, split = ""))
line_list[endspan_pos] <- "]<"
line <- paste(line_list,collapse = "")
## replace the beginning of the subscript with "_["
endofstartspan_positions <- gregexpr(";\">",line)[[1]]
s <- 1
endofstartspan_pos <- endofstartspan_positions[s] - sub_pos
while (endofstartspan_pos<0){
s <- s + 1
endofstartspan_pos <- endofstartspan_positions[s] - sub_pos
}
endofstartspan_pos <- endofstartspan_pos + sub_pos
line_before_sub <- sub("vertical-align:sub.*", "", line)
startspan_pos <- max(gregexpr("<span id=",line_before_sub)[[1]])
line_list <- unlist(strsplit(line, split = ""))
line_list[endofstartspan_pos+2] <- ">_["
line <- paste(line_list,collapse = "")
}
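## The subscript block mirrors the superscript logic, e.g. (hypothetical)
## H<span id="c" style="vertical-align:sub;">2</span>O yields "H_[2]O".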
##replace different html formating
line <- gsub("</p>","</span>",line)
## replace line break with space (not optimal but better for searching)
line <- gsub("<br/>"," ",line)
## remove residue from hyperlinks
line <- gsub("<a href=\".*?>","",line)
line <- gsub("</a>","",line)
rev.line <- intToUtf8(rev(utf8ToInt(line)))
for (spanpos in rev(gregexpr(">naps/<",rev.line)[[1]])){
add.txthtmlcontent <- substr(rev.line,spanpos+7, regexpr(">\"",
substr(rev.line,spanpos+7,
nchar(rev.line)))+spanpos+7-2)
line.txthtmlcontent <- paste0(line.txthtmlcontent,intToUtf8(rev(utf8ToInt(add.txthtmlcontent))))
}
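## Note on the reversal trick above: reversing the line turns "</span>" into
## ">naps/<", so each span text sits between that anchor and the next
## reversed closing quote of the opening tag; reversing every hit again
## restores the original order, e.g. (hypothetical)
## <span id="x" style="...">AB</span> contributes "AB" to line.txthtmlcontent.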
if (grepl("&",line.txthtmlcontent)){
txthtmlcontent[[j]][z] <- replace.html.entity(line.txthtmlcontent)
} else {
txthtmlcontent[[j]][z] <- line.txthtmlcontent
}
} ## end go through each line z
content[[(j+2)]] <- txthtmlcontent[[j]]
}
}
## 3) Evaluate for filter words ---------------------------------------
list_of_abbrevs <- NULL
## 3.1) Filter Search ---------------------------------------
if (integrity.indicator == TRUE && !filter.word.table[1, "words"] == "") {
word.txtline.fw <- NULL
for (i in 1:nrow(filter.word.table)) {
## search for lines with filter word in [txtcontent]
word <- filter.word.table[i, "words"]
ignore.case.fw <- filter.word.table[i, "ignore.case.fw"]
detected_line <- grep(word, txtcontent, ignore.case = ignore.case.fw)
word.txtline.fw <- c(word.txtline.fw,
detected_line)
## 3.2) Replace abbreviations -----------------------------------------------------
if (eval.abbrevs == TRUE && length(word.txtline.fw) > 0){
## check if any occurrences (heading + text) of the
## filter word define an abbreviation
## go through each txtcontent line that the searchword was found
for (nth in 1:length(word.txtline.fw)) {
paragraph <- txtcontent[word.txtline.fw[[nth]]]
## identify definitions of abbrev
occur_double_dots_or_equal <- test_if_abbrev_double_dots_or_equal(searchword = as.character(word),
paragraph = paragraph,
ignore.case = ignore.case.fw)
occur_in_parantheses <- test_if_abbrev_in_parantheses(searchword = as.character(word),
paragraph = paragraph,
ignore.case = ignore.case.fw)
## if abbrev was found in nth occurrence
for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)){
if (occur$res == TRUE) {
## replace if abbrev is not yet defined
for (abbrev in c("abbrev_singular","abbrev_plural")) {
if (!(occur[[abbrev]] %in% list_of_abbrevs)){
list_of_abbrevs <- c(list_of_abbrevs,occur[[abbrev]])
for (c in 1:length(content)){
to_find <- paste0("([^A-z|0-9]+|^)",occur[[abbrev]],"([^A-z|0-9|:]$|[^A-z|0-9|:][^=|:]|$)")
found_lines <- grep(to_find, content[[c]],
ignore.case = FALSE)
for (line in found_lines){
found_pos <- gregexpr(to_find, content[[c]][line])[[1]]
found_pos_list <- gregexpr(to_find, content[[c]][line])[[1]]
add_pos <- 0
for (p in 1:length(found_pos)){
sub <- substr(content[[c]][line],found_pos[p] + add_pos,nchar(content[[c]][line]))
## prevent double substitution if sub already contains substitution
if (grepl(occur[[sub("abbrev","replacement",abbrev)]], sub)) next
found_pos_list[p] <- regexpr(occur[[abbrev]],sub) + found_pos[p] + add_pos - 1
content[[c]][line] <- paste0(substr(content[[c]][line],1,found_pos_list[p]-1),
sub(occur[[abbrev]],
occur[[sub("abbrev","replacement",abbrev)]],
substr(content[[c]][line],
found_pos_list[p],nchar(content[[c]][line])),
ignore.case = FALSE))
add_pos <- add_pos + nchar(occur[[sub("abbrev","replacement",
abbrev)]]) - nchar(occur[[abbrev]])
## correct for sub of abbrev definition e.g. methotrexate (MTX (methotrexate))
len_diff <- nchar(content[[c]][line]) -
nchar(gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
paste0("(",occur[[abbrev]],")"),
content[[c]][line] , fixed = TRUE))
add_pos <- add_pos - len_diff
content[[c]][line] <- gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
paste0("(",occur[[abbrev]],")"),
content[[c]][line] , fixed = TRUE)
} ## end for (p in length(found_pos)){
} ## end for (line in found_lines){
} ## for (c in 1:length(content)){
} ##if (!(occur[[abbrev]] %in% list_of_abbrevs)){
} ## for (abbrev in c("abbrev_singular","abbrev_plural")) {
} ## if (occur$res) {
} ## end for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)) {
} ## end for (nth in 1:length(word.txtline.fw)) {
} ## end replace abbreviations in content
} ## end for each filter word
} ## if (integrity.indicator == TRUE)
## if filter word abbreviations were found replace the abbreviations in content
if (eval.abbrevs == TRUE && integrity.indicator == TRUE &&
!filter.word.table[i, "words"] == "" && !is.null(list_of_abbrevs)) {
txtcontent <- content[[1]]
keeplayouttxtcontent <- content[[2]]
## html content per page
for (pa in 3:length(content)) txthtmlcontent[[(pa-2)]] <- content[[pa]]
} ## end replace abbreviations
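## Worked example (hypothetical): if the filter word "methotrexate" is first
## defined in the text as "methotrexate (MTX)", every later stand-alone "MTX"
## in the txt, keeplayout and html content is expanded to "MTX (methotrexate)",
## so the later search word matching also hits abbreviated mentions.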
## 3.3) Real filter Search ---------------------------------------
if (integrity.indicator == TRUE) {
word.txtline.fw <- NULL
word.txtpos.fw <- NULL
pdf_filterwords <- NULL
## if there are filter words
if (!filter.word.table[1, "words"] == "") {
for (i in 1:nrow(filter.word.table)) {
## search for lines with filter word in [txtcontent]
word <- filter.word.table[i, "words"]
ignore.case.fw <- filter.word.table[i, "ignore.case.fw"]
## add filterwords to output
if (ignore.case.fw == TRUE){
ic <- "ic"
} else {
ic <- "nic"
}
if (regex.fw == TRUE){
reg <- "regex"
} else {
reg <- "nregex"
}
pdf_filterwords <- c(pdf_filterwords, paste0("FW_",ic,"_",reg,":",word))
detected_line <- grep(word, txtcontent, ignore.case = ignore.case.fw)
word.txtline.fw <- c(word.txtline.fw,
detected_line)
current_word.txtpos.fw <- NULL
for (li in detected_line){
current_word.txtpos.fw <- c(current_word.txtpos.fw,
gregexpr(word, txtcontent[li], ignore.case = ignore.case.fw)[[1]])
}
pdf_filterword_times <- c(pdf_filterword_times, length(current_word.txtpos.fw))
pdf_filterword_names <- c(pdf_filterword_names, word)
pdf_filterword_total <- sum(pdf_filterword_times)
word.txtpos.fw <- c(word.txtpos.fw,current_word.txtpos.fw)
}
## test if percent or number
pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
if (grepl("%",filter.word.times)){
if (length(word.txtpos.fw)/pdf_word_count >= (as.numeric(sub("%","",filter.word.times))/100)) {
filterwords.go <- TRUE
print_message <- paste0(round((length(word.txtpos.fw)/pdf_word_count*100),4),
"% of all words were filter word(s) in ", id, ".pdf.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else {
filterwords.go <- FALSE
}
} else {
if (length(word.txtpos.fw) >= as.numeric(filter.word.times)) {
filterwords.go <- TRUE
print_message <- paste0(length(word.txtpos.fw),
" filter word(s) were detected in ", id, ".pdf.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else {
filterwords.go <- FALSE
}
}
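## e.g. filter.word.times = "0.2%" requires the filter word hits to make up
## at least 0.2% of all words in the document, whereas filter.word.times = 2
## requires at least two absolute hits before the analysis continues.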
if (filterwords.go == FALSE){
pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
if (grepl("%",filter.word.times)){
print_message <- paste0("\'",id,".pdf\' was filtered out due to a lack of the filter words. ",
round((length(word.txtpos.fw)/pdf_word_count*100),4),
"% of all words were filter word(s) in ", id, ".pdf.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else {
print_message <- paste0("\'",id,".pdf\' was filtered out due to a lack of the filter words. ",
length(word.txtpos.fw),
" filter word(s) were detected in ", id, ".pdf.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
## write 0 search words
if (!search.word.table[1, "words"] == "") {
for (i in 1:nrow(search.word.table)) {
word <- search.word.table[i, "words"]
## add searchwords to output
if (ignore.case.sw == TRUE){
ic <- "ic"
} else {
ic <- "nic"
}
if (regex.sw == TRUE){
reg <- "regex"
} else {
reg <- "nregex"
}
pdf_searchwords <- c(pdf_searchwords, paste0("SW_",ic,"_",reg,":",word))
pdf_searchword_total <- 0
pdf_searchword_times <- rep(0,length(pdf_searchwords))
search.word.category_total <- NULL
if (!is.null(search.word.categories) && length(search.words) == length(search.word.categories)){
for (swc in 1:length(unique(search.word.categories))){
search.word.category_total[swc] <- sum(pdf_searchword_times[search.word.categories %in% unique(search.word.categories)[swc]], na.rm = TRUE)
}
}
}
}
if (write.txt.doc.file == TRUE) {
dir.create(paste0(out,"/excl_by_fw"), showWarnings = FALSE)
pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
if (grepl("%",filter.word.times)){
utils::write.table(paste0("Not enough txt lines with filter word found. ",
round((length(word.txtpos.fw)/pdf_word_count*100),4),
"% of words were filter word(s) in ", id, ".pdf."),
paste0(out,"/excl_by_fw/",id,"_too_few_fwds",
out.table.ext),
sep = out.table.separator, row.names = FALSE,
col.names = FALSE, na = "")
} else {
utils::write.table(paste0("Not enough txt lines with filter word found. ",
length(word.txtpos.fw),
" filter word(s) were detected in ", id, ".pdf."),
paste0(out,"/excl_by_fw/",id,"_too_few_fwds",
out.table.ext),
sep = out.table.separator, row.names = FALSE,
col.names = FALSE, na = "")
}
}
## if cpymv is either cpy or mv
if (cpy_mv == "cpy"){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/excl_by_fw"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/excl_by_fw"))
print_message <- paste0(basename(pdf),
" was copied to \'", out,"/pdfs/excl_by_fw\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
} else if (cpy_mv == "mv"){
dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
dir.create(paste0(out,"/pdfs/excl_by_fw"), showWarnings = FALSE)
file.copy(pdf, paste0(out,"/pdfs/excl_by_fw"))
unlink(pdf)
print_message <- paste0(basename(pdf),
" was moved to \'", out,"/pdfs/excl_by_fw\'.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
}
} ## end if (filterwords.go == FALSE)
} else {
out_msg <- c(out_msg, "No filter words chosen for analysis.")
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info("No filter words chosen for analysis.")
filterwords.go <- TRUE
pdf_filterword_total <- NA
pdf_filterword_times <- NA
pdf_filterwords <- c("filter_word_list")
} ## end if filter words were set
} ## end 3.3) Filter Search
## 4) Search of search words -----------------------------------------------
## only if filter words were found or no filter was set continue and
## search words were set
if (filterwords.go == TRUE && integrity.indicator == TRUE) {
searchwords.go <- FALSE
## search for lines with search word
word.txtline <- NULL
for (i in 1:nrow(search.word.table)) {
## 4.1) Search for lines with search word -------------------------
word <- search.word.table[i, "words"]
ignore.case.sw <- search.word.table[i, "ignore.case.sw"]
word.txtline <- NULL
word.keeplayoutline <- NULL
## if search words were not chosen write all lines in txtline
if (search.word.table[i, "words"] == ""){
word.txtline <- 1:length(txtcontent)
word.keeplayoutline <- 1:length(keeplayouttxtcontent)
} else {
## search for lines with search word in [txtcontent]
word.txtline <- grep(word, txtcontent, ignore.case = ignore.case.sw)
## search for lines with search word in [keeplayouttxtcontent]
word.keeplayoutline <- grep(word, keeplayouttxtcontent,
ignore.case = ignore.case.sw)
}
## 4.2) Continue analysis when search words were found ----------------------------
if (length(word.txtline) > 0) searchwords.go <- TRUE
## 4.3) Replace abbreviations -----------------------------------------------------
if (eval.abbrevs == TRUE && length(word.txtline) > 0 &&
!search.word.table[i, "words"] == ""){
## check if any occurrences (heading + text) of the
## search word define an abbreviation
## go through each txtcontent line that the searchword was found
for (nth in 1:length(word.txtline)) {
paragraph <- txtcontent[word.txtline[[nth]]]
## identify definitions of abbrev
occur_double_dots_or_equal <- test_if_abbrev_double_dots_or_equal(searchword = as.character(word),
paragraph = paragraph,
ignore.case = ignore.case.sw)
occur_in_parantheses <- test_if_abbrev_in_parantheses(searchword = as.character(word),
paragraph = paragraph,
ignore.case = ignore.case.sw)
## if abbrev was found in nth occurrence
for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)){
if (occur$res == TRUE) {
## replace if abbrev is not yet defined
for (abbrev in c("abbrev_singular","abbrev_plural")) {
if (!(occur[[abbrev]] %in% list_of_abbrevs)){
list_of_abbrevs <- c(list_of_abbrevs,occur[[abbrev]])
for (c in 1:length(content)){
to_find <- paste0("([^A-z|0-9]+|^)",occur[[abbrev]],"([^A-z|0-9|:]$|[^A-z|0-9|:][^=|:]|$)")
found_lines <- grep(to_find, content[[c]],
ignore.case = FALSE)
for (line in found_lines){
found_pos <- gregexpr(to_find, content[[c]][line])[[1]]
found_pos_list <- gregexpr(to_find, content[[c]][line])[[1]]
add_pos <- 0
for (p in 1:length(found_pos)){
sub <- substr(content[[c]][line],found_pos[p] + add_pos,nchar(content[[c]][line]))
## prevent double substitution if sub already contains substitution
if (grepl(occur[[sub("abbrev","replacement",abbrev)]], sub)) next
found_pos_list[p] <- regexpr(occur[[abbrev]],sub) + found_pos[p] + add_pos - 1
content[[c]][line] <- paste0(substr(content[[c]][line],1,found_pos_list[p]-1),
sub(occur[[abbrev]],
occur[[sub("abbrev","replacement",abbrev)]],
substr(content[[c]][line],found_pos_list[p],
nchar(content[[c]][line])),
ignore.case = FALSE))
add_pos <- add_pos + nchar(occur[[sub("abbrev","replacement",abbrev)]]) - nchar(occur[[abbrev]])
## correct for sub of abbrev definition e.g. methotrexate (MTX (methotrexate))
len_diff <- nchar(content[[c]][line]) -
nchar(gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
paste0("(",occur[[abbrev]],")"),
content[[c]][line] , fixed = TRUE))
add_pos <- add_pos - len_diff
content[[c]][line] <- gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
paste0("(",occur[[abbrev]],")"),
content[[c]][line] , fixed = TRUE)
} ## end for (p in length(found_pos)){
} ## end for (line in found_lines){
} ## for (c in 1:length(content)){
} ##if (!(occur[[abbrev]] %in% list_of_abbrevs)){
} ## for (abbrev in c("abbrev_singular","abbrev_plural")) {
} ## if (occur$res) {
} ## end for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)) {
} ## end for (nth in 1:length(word.txtline)) {
} ## end replace abbreviations in content
} ## end for each search word
} ## if (filterwords.go == TRUE && integrity.indicator == TRUE)
## if search words were found replace the abbreviations in content
if (searchwords.go == TRUE && filterwords.go == TRUE &&
eval.abbrevs == TRUE && integrity.indicator == TRUE &&
!search.word.table[i, "words"] == "") {
txtcontent <- content[[1]]
keeplayouttxtcontent <- content[[2]]
## html content per page
for (pa in 3:length(content)) txthtmlcontent[[(pa-2)]] <- content[[pa]]
} ## end replace abbreviations
## if no search words are detected in document
if (filterwords.go == TRUE &&
searchwords.go == FALSE &&
integrity.indicator == TRUE){
print_message <- paste0("No text with search words for \'",id,".pdf\' found.")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
update_progress_info(print_message)
## write an empty file
if (write.txt.doc.file == TRUE) {
dir.create(paste0(out,"/excl_by_sw"), showWarnings = FALSE)
utils::write.table("No text line with search word found.",
paste0(out,"/excl_by_sw/",id, "_no_txt_w_swds",
out.table.ext),
sep = out.table.separator, row.names = FALSE,
col.names = FALSE, na = "")
}
} ## end if search words were present
## 5) Sort the html content ---------------------------------------
if (filterwords.go == TRUE && integrity.indicator == TRUE) {
## add the top and left ##
for (p in 1:length(txthtmlcontent)) {
## if page has one dimension
if (!"left" %in% colnames(htmlcontent[[p]]))
htmlcontent[[p]] <- cbind(htmlcontent[[p]],
left = NA)
if (!"top" %in% colnames(htmlcontent[[p]]))
htmlcontent[[p]] <- cbind(htmlcontent[[p]],
top = NA)
## add font size
if (!"font_size" %in% colnames(htmlcontent[[p]]))
htmlcontent[[p]] <- cbind(htmlcontent[[p]],
font_size = NA)
}
for (p in 1:length(txthtmlcontent)){
## 5.1) Assign top and left values ------------------------------
start <- grep("^<body>",htmlcontent[[p]][,1])[1] + 1
if (is.na(start)) start <- 1
end <- grep("^</body>",htmlcontent[[p]][,1])[1] - 1
if (is.na(end)) {
end <- nrow(htmlcontent[[p]])
print_message <- paste0("Page ", p, " of \'", id, ".html\' was incompletely read.",
" This might leader to incomplete table extraction but does not",
" affect search word detection")
out_msg <- c(out_msg, print_message)
if (verbose) cat(utils::tail(out_msg,1), sep="\n")
print_message_short <- paste0("Page ", p, " of \'", id, ".html\' was incompletely read",
" (does not affect txt detection though).")
update_progress_info(print_message_short)
}
## make the copy of htmlcontent for the sorting
## single row
if (start == end){
out.htmlcontent <- rbind(htmlcontent[[p]][start, ])
out.txthtmlcontent <- txthtmlcontent[[p]][start]
} else {
out.htmlcontent <- htmlcontent[[p]][start:end, ]
out.txthtmlcontent <- txthtmlcontent[[p]][start:end]
}
## for each line
for (line.number in 1:(end - start + 1)) {
## if content line
if (grepl("[\"|;]font-size:",
out.htmlcontent[line.number, 1])) {
## get the left position
pos.left.start <- regexpr("left:",
out.htmlcontent[line.number, 1])[[1]] + 5
out.htmlcontent[line.number, 1] <- iconv(out.htmlcontent[line.number, 1],"latin1","ASCII",sub="")
pos.left.end <- regexpr("px",
substr(out.htmlcontent[line.number, 1],
pos.left.start,
nchar(out.htmlcontent[line.number, 1])))[[1]] - 2 + pos.left.start
left.value <- suppressWarnings(as.integer(substr(out.htmlcontent[line.number, 1], pos.left.start, pos.left.end)))
if (is.na(left.value) && grepl(";vertical-align:baseline;",
out.htmlcontent[line.number, 1])) {
left.value <- out.htmlcontent[line.number-1, "left"]
}
out.htmlcontent[line.number, "left"] <- left.value
## get the top information
pos.top.start <- regexpr("top:",
out.htmlcontent[line.number, 1])[[1]] + 4
out.htmlcontent[line.number, 1] <- iconv(out.htmlcontent[line.number, 1],"latin1","ASCII",sub="")
pos.top.end <- regexpr("px",
substr(out.htmlcontent[line.number, 1],
pos.top.start,
nchar(out.htmlcontent[line.number, 1])))[[1]] - 2 + pos.top.start
top.value <- suppressWarnings(as.integer(substr(out.htmlcontent[line.number, 1], pos.top.start, pos.top.end)))
if (is.na(top.value) && grepl(";vertical-align:baseline;",
out.htmlcontent[line.number, 1])) {
top.value <- out.htmlcontent[line.number-1, "top"]
}
out.htmlcontent[line.number, "top"] <- top.value
## get font_size information
pos.fontsize.start <- regexpr("font-size:",
out.htmlcontent[line.number, 1])[[1]] + 10
out.htmlcontent[line.number, 1] <- iconv(out.htmlcontent[line.number, 1],"latin1","ASCII",sub="")
pos.fontsize.end <- regexpr("px;",substr(out.htmlcontent[line.number, 1],
pos.fontsize.start,
nchar(out.htmlcontent[line.number, 1])))[[1]] + pos.fontsize.start - 2
fontsize.value <- substr(out.htmlcontent[line.number, 1],
pos.fontsize.start, pos.fontsize.end)
fontsize.value <- as.numeric(fontsize.value)
out.htmlcontent[line.number, "font_size"] <- fontsize.value
} else {
## if the line does not have position info
out.htmlcontent[line.number, "left"] <- 0
out.htmlcontent[line.number, "top"] <- 9999
out.htmlcontent[line.number, "font_size"] <- 1
}
}
## 5.2) Sort lines according to top -------------------
## only the lines with top value
lines.with.top.value <- out.htmlcontent[!is.na(out.htmlcontent[, "top"]), ]
txtlines.with.top.value <- out.txthtmlcontent[!is.na(out.htmlcontent[, "top"])]
## if it is only one line no sorting necessary
if (length(which(!is.na(out.htmlcontent[, "top"]))) > 1) {
htmlorder <- order(as.numeric(lines.with.top.value[,"top"]), as.numeric(lines.with.top.value[,"left"]))
lines.with.top.value.sorted <- lines.with.top.value[htmlorder, ]
txtlines.with.top.value.sorted <- txtlines.with.top.value[htmlorder]
} else {
lines.with.top.value.sorted <- lines.with.top.value
txtlines.with.top.value.sorted <- txtlines.with.top.value
}
out.htmlcontent[!is.na(out.htmlcontent[, "top"]), ] <- lines.with.top.value.sorted
out.txthtmlcontent[!is.na(out.htmlcontent[, "top"])] <- txtlines.with.top.value.sorted
## make all 0 and 9999 to NA
no.pos.info.lines <- (out.htmlcontent[, "top"] == 9999)
out.htmlcontent[no.pos.info.lines, "left"] <- NA
out.htmlcontent[no.pos.info.lines, "top"] <- NA
out.htmlcontent[no.pos.info.lines, "font_size"] <- NA
htmlcontent[[p]][start:end, ] <- out.htmlcontent
txthtmlcontent[[p]][start:end] <- out.txthtmlcontent
}
}
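## e.g. (hypothetical) two spans with top=100/left=250 and top=100/left=40
## are swapped so the row reads left to right; lines without position
## information are parked at top=9999, sort to the end of the page, and are
## then reset to NA.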
## 6) Extract Tables --------------------------------------------
## Explanation: table detection matters because it distinguishes tables from
## running text; even if tables are not exported, their lines are excluded
## from the sentence detection for search words.
## Use html to find the table end line by adding 5 lines and then searching for a span id= change
## test if file has tables --> only process tables when table is present
## 6.1) Test if document has tables -------
if (searchwords.go == TRUE && filterwords.go == TRUE &&
integrity.indicator == TRUE) {
## if there is no additional heading
if (table.heading.words[1, "words"] == "") {
tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)",
txtcontent, ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
txtcontent, ignore.case = TRUE),
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
txtcontent, ignore.case = TRUE),
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
txthtmlcontent[[j]], ignore.case = TRUE))
} else {
word.txtline.th <- NULL
for (i in 1:nrow(table.heading.words)) {
## search for lines with searchword info
## [txtcontent] ##
word <- table.heading.words[i, "words"]
ignore.case.th <- table.heading.words[i,
"ignore.case.th"]
word.txtline.th <- c(word.txtline.th,
grep(word, txtcontent, ignore.case = ignore.case.th))
}
tablestart.pos <- c(word.txtline.th,
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)",
txtcontent, ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
txtcontent, ignore.case = TRUE),
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
txtcontent, ignore.case = TRUE),
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
txthtmlcontent[[j]], ignore.case = TRUE))
}
}
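## The regular expressions above match typical captions at the start of a
## line, e.g. (hypothetical) "Table 1. Baseline data", "Tab. IV", "TABLE 2
## Results", or a bare "Table 3" with the title on the following line.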
## 6.2) Detect the table start positions by detecting headings ----------------
if (searchwords.go == TRUE && filterwords.go == TRUE &&
integrity.indicator == TRUE && !length(tablestart.pos) == 0) {
## Initialize master table for positions
htmltablelines <- data.frame(page = NULL,
tableheading = NULL, tablestart.pos = NULL,
tablelastline = NULL, tableend.pos = NULL,
legendlastline = NULL,
legendend.pos = NULL, txtfirstline = NULL)
txttablelines <- data.frame(page = NULL,
tableheading = NULL, tablestart.pos = NULL,
tablelastline = NULL, tableend.pos = NULL,
legendlastline = NULL,
legendend.pos = NULL, txtfirstline = NULL)
## go through pages
for (j in 1:length(txthtmlcontent)) {
## if there is no additional heading
if (table.heading.words[1, "words"] == "") {
html.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
txthtmlcontent[[j]], ignore.case = TRUE))
## search for tables with line break in title
lb.html.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
txthtmlcontent[[j]], ignore.case = TRUE))
## only search the txt content if it also contains this page
if (length(splittxtcontent) >= j) {
txt.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
splittxtcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
splittxtcontent[[j]], ignore.case = TRUE))
## search for tables with line break in title
lb.txt.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
splittxtcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
splittxtcontent[[j]], ignore.case = TRUE))
} else if (length(html.tablestart.pos) > 0) {
if (nrow(htmltablelines) == 0) {
htmltablelines <- data.frame(page = j,
tableheading = NA, tablestart.pos = NA,
tablelastline = NA, legendlastline = NA,
legendend.pos = NA, txtfirstline = NA,
detected.in = "htmlonly")
} else {
newrow <- htmltablelines[1, ]
newrow[1, ] <- NA
newrow[1, "page"] <- j
newrow[1, "detected.in"] <- "htmlonly"
htmltablelines <- rbind(htmltablelines,
newrow)
}
}
} else {
## if there is an additional heading
txthtml.word.txtline.th <- NULL
for (i in 1:nrow(table.heading.words)) {
## search for lines with searchword info
## [txtcontent] ##
word <- table.heading.words[i, "words"]
ignore.case.th <- table.heading.words[i, "ignore.case.th"]
txthtml.word.txtline.th <- c(txthtml.word.txtline.th,
grep(word, txthtmlcontent[[j]],
ignore.case = ignore.case.th))
}
html.tablestart.pos <- c(txthtml.word.txtline.th,
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
txthtmlcontent[[j]], ignore.case = TRUE))
## search for tables with line break in title
lb.html.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
txthtmlcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
txthtmlcontent[[j]], ignore.case = TRUE))
## only search the txt content if it also contains this page
if (length(splittxtcontent) >= j) {
splittxt.word.txtline.th <- NULL
for (i in 1:nrow(table.heading.words)) {
## search for lines with searchword info
## [txtcontent] ##
word <- table.heading.words[i, "words"]
ignore.case.th <- table.heading.words[i, "ignore.case.th"]
splittxt.word.txtline.th <- c(splittxt.word.txtline.th,
grep(word, splittxtcontent[[j]],
ignore.case = ignore.case.th))
}
txt.tablestart.pos <- c(splittxt.word.txtline.th,
grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
splittxtcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
splittxtcontent[[j]], ignore.case = TRUE))
## search for tables with line break in title
lb.txt.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
splittxtcontent[[j]], ignore.case = TRUE),
grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
splittxtcontent[[j]], ignore.case = TRUE))
} else if (length(html.tablestart.pos) > 0) {
if (nrow(htmltablelines) == 0) {
htmltablelines <- data.frame(page = j,
tableheading = NA, tablestart.pos = NA,
tablelastline = NA, legendlastline = NA,
legendend.pos = NA, txtfirstline = NA,
detected.in = "htmlonly")
} else {
newrow <- htmltablelines[1, ]
newrow[1, ] <- NA
newrow[1, "page"] <- j
newrow[1, "detected.in"] <- "htmlonly"
htmltablelines <- rbind(htmltablelines,
newrow)
}
}
} ## end if there is (no) additional heading
## Look at tables in htmlcontent
if (length(html.tablestart.pos) > 0) {
for (i in 1:length(html.tablestart.pos)) {
## extract the table heading from the line
line <- txthtmlcontent[[j]][html.tablestart.pos[i]]
tableheading <- substr(line,
1, 50)
## escape special regex characters so that they don't
## cause problems for grep
for (symbol in c("\\?","\\+","\\(", "\\)",
"\\[", "\\]", "\\/", "\\{",
"\\}")) {
tableheading <- gsub(symbol,
paste0("\\", symbol),
tableheading)
}
## Remove backreferences (\1) from tableheading
tableheading <- remove_backref(tableheading)
currentstartlines <- html.tablestart.pos[i]
## if the heading is on the page then add it to the
## table
if (length(currentstartlines) != 0) {
htmltablelines <- rbind(htmltablelines,
data.frame(page = j,
tableheading = tableheading,
tablestart.pos = currentstartlines,
tablelastline = NA,
txtfirstline = NA,
legendlastline = NA,
legendend.pos = NA))
}
}
} ## end if length(html.tablestart.pos) > 0
if (length(lb.html.tablestart.pos) > 0) {
for (i in 1:length(lb.html.tablestart.pos)) {
## when +1 is empty
s <- 1
while (txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s] == "") {
s <- s + 1
if (s > 5 || (lb.html.tablestart.pos[i] + s) == length(txthtmlcontent[[j]])) break
}
if ((lb.html.tablestart.pos[i] + s) == length(txthtmlcontent[[j]])) next
## extract the table heading from the line
line <- paste0(txthtmlcontent[[j]][lb.html.tablestart.pos[i]]," ",
txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s])
tableheading <- substr(line,
1, 50)
## escape special regex characters so that they don't
## cause problems for grep
for (symbol in c("\\?","\\+","\\(", "\\)",
"\\[", "\\]", "\\/", "\\{",
"\\}")) {
tableheading <- gsub(symbol,
paste0("\\", symbol),
tableheading)
}
## Remove backreferences (\1) from tableheading
tableheading <- remove_backref(tableheading)
currentstartlines <- lb.html.tablestart.pos[i]
## if the heading is on the page then add it to the
## table
if (length(currentstartlines) != 0) {
htmltablelines <- rbind(htmltablelines,
data.frame(page = j,
tableheading = tableheading,
tablestart.pos = currentstartlines,
tablelastline = NA,
txtfirstline = NA,
legendlastline = NA,
legendend.pos = NA))
}
}
} ## end if there is a table on this page
## Look at tables in txtcontent
if (length(txt.tablestart.pos) > 0) {
for (i in 1:length(txt.tablestart.pos)) {
## extract the table heading from the line
line <- splittxtcontent[[j]][txt.tablestart.pos[i]]
## extract the first 50 characters
tableheading <- substr(line,
1, 50)
## escape special regex characters so that they don't
## cause problems for grep
tableheading <- gsub("\\f", "", tableheading)
for (symbol in c("\\?","\\+","\\(", "\\)",
"\\[", "\\]", "\\/", "\\{",
"\\}","\\*")) {
tableheading <- gsub(symbol,
paste0("\\", symbol),
tableheading)
}
## Remove backreferences (\1) from tableheading
tableheading <- remove_backref(tableheading)
## from is where the heading was detected
from <- txt.tablestart.pos[i]
if (j > 1){
for (p in 1:(j-1)){
from <- from + length(splittxtcontent[[p]])
}
}
## to has to be the end of the current page in txtcontent
to <- 0
for (p in 1:j){
to <- to + length(splittxtcontent[[p]])
}
##remove backreference from txtcontent
txtcontent[from:to] <- remove_backref(txtcontent[from:to])
## heading
currentstartlines <- grep(tableheading,
txtcontent[from:to], fixed = TRUE)[1] + from - 1
## if the heading is on the page then add it to the
## table
if (length(currentstartlines) != 0) {
txttablelines <- rbind(txttablelines,
data.frame(page = j,
tableheading = tableheading,
tablestart.pos = currentstartlines,
tablelastline = NA, tableend.pos = NA,
txtfirstline = NA,
legendlastline = NA,
legendend.pos = NA))
}
}
} ## end if there is a table on this page
if (length(lb.txt.tablestart.pos) > 0) {
for (i in 1:length(lb.txt.tablestart.pos)) {
## skip empty lines directly below the heading (at most 5)
s <- 1
while (!is.na(splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s]) &&
splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s] == "") {
s <- s + 1
if (s > 5) break
}
## join the heading line with the next non-empty line
line <- paste0(splittxtcontent[[j]][lb.txt.tablestart.pos[i]]," ",
splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s])
tableheading <- substr(line, 1, 50)
## escape regex metacharacters so that they don't
## cause problems for grep
for (symbol in c("\\?","\\+","\\(", "\\)",
"\\[", "\\]", "\\/", "\\{",
"\\}","\\*")) {
tableheading <- gsub(symbol,
paste0("\\", symbol),
tableheading)
}
## Remove backreferences (\1) from tableheading
tableheading <- remove_backref(tableheading)
## get the position of the line that has exactly the
## heading
## from is where the heading was detected
from <- lb.txt.tablestart.pos[i]
if (j > 1){
for (p in 1:(j-1)){
from <- from + length(splittxtcontent[[p]])
}
}
## to has to be the end of page j (the sum of the lengths of
## pages 1 to j, also when j == 1)
to <- 0
for (p in 1:j){
to <- to + length(splittxtcontent[[p]])
}
currentstartlines <- grep(splittxtcontent[[j]][lb.txt.tablestart.pos[i]],
txtcontent[from:to], fixed = TRUE)[1] + from - 1
## if the heading is found on the page then add it to the
## table
if (!is.na(currentstartlines)) {
txttablelines <- rbind(txttablelines,
data.frame(page = j,
tableheading = tableheading,
tablestart.pos = currentstartlines,
tablelastline = NA, tableend.pos = NA,
txtfirstline = NA,
legendlastline = NA,
legendend.pos = NA))
}
}
} ## end if there is a table on this page
} ## end go through each page j
## end if the file has tables (if the function reached this
## point, the PDF file must contain search words)
} else if (searchwords.go == TRUE && filterwords.go == TRUE &&
integrity.indicator == TRUE && length(tablestart.pos) == 0) {
outtable <- cbind(txtcontent, layout = "txt",
rownumber = 1:length(txtcontent))
txtlines <- outtable[(outtable[, "layout"] == "txt"), ]
txttablelines <- NULL
htmltablelines <- NULL
} else {
outtable <- NULL
txtlines <- NULL
txttablelines <- NULL
htmltablelines <- NULL
}
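## at this point outtable/txtlines hold the full text if the search
## and filter words matched but no table starts were found; all
## outputs are NULL if any of the preceding checks failed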
## 6.3) Determine if tables were found in html, txt, or both files ------------
if (searchwords.go == TRUE && filterwords.go == TRUE &&
integrity.indicator == TRUE && length(tablestart.pos) != 0 &&
nrow(as.data.frame(htmltablelines)) > 0 && ncol(as.data.frame(htmltablelines)) > 0) {
## sort the htmltablelines table according to page
## and then tablestart.pos
htmltablelines <- htmltablelines[with(htmltablelines,
order(page, tablestart.pos)), ]
## add information about where table was found
if (!"detected.in" %in% colnames(htmltablelines))
htmltablelines <- cbind(htmltablelines,
detected.in = NA)
if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
## sort the txttablelines table according to page
## and then tablestart.pos
txttablelines <- txttablelines[with(txttablelines,
order(page, tablestart.pos)), ]
if (!"detected.in" %in% colnames(txttablelines)) {
txttablelines <- cbind(txttablelines,
detected.in = NA)
}
## go through each line of the tables and compare
## which tables are in the txt only
for (txtrow in 1:nrow(txttablelines)) {
## a single call returns both the updated origin row and the
## updated target table, so each row is only matched once
fsr.result <- find_similar_row(originrow = txttablelines[txtrow, ],
targettablelines = htmltablelines,
relative.match.col = "tableheading",
output.for.originrow.only = "txtonly",
output.for.match = "txtandhtml",
output.column.name = "detected.in")
txttablelines[txtrow, ] <- fsr.result$originrow
htmltablelines <- fsr.result$targettablelines
}
}
## name all the rows that are only found in html
if (nrow(htmltablelines[is.na(htmltablelines[, "detected.in"]), ]) > 0)
htmltablelines[is.na(htmltablelines[, "detected.in"]), "detected.in"] <- "htmlonly"
}
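## detected.in now distinguishes tables found in both layouts
## ("txtandhtml") from those found in only one ("txtonly",
## "htmlonly"); the following steps appear to use it to decide
## which layout to trust for the table boundaries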
## 6.4) Add positional value to tables -----
if (searchwords.go == TRUE && filterwords.go == TRUE &&
integrity.indicator == TRUE && length(tablestart.pos) != 0 &&
nrow(as.data.frame(htmltablelines)) > 0 && ncol(as.data.frame(htmltablelines)) > 0) {
## add tableend.pos if column does not exist
if (!"tableend.pos" %in% colnames(htmltablelines)){
htmltablelines <- cbind(htmltablelines, tableend.pos = NA)
}
htmltablelines <- htmltablelines[!(is.na(htmltablelines[,"page"])),]
for (i in 1:nrow(htmltablelines)) {
position <- as.numeric(htmltablelines[i, "tablestart.pos"])
p <- as.numeric(htmltablelines[i, "page"])
## probe a few lines below the table start
npos <- position + 7
## test if the table is too short for detection
outofbound <- nrow(htmlcontent[[p]]) < npos
notop <- FALSE
if (outofbound == FALSE) {
## no usable "top" value at the probe line
notop <- is.null(htmlcontent[[p]][npos, "top"]) ||
is.na(htmlcontent[[p]][npos, "top"])
}
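## heuristic: pdftohtml output carries a pixel "top" value per line;
## if no such value exists a few lines below the heading, the page
## presumably contains no rendered table at that position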
if ((outofbound == TRUE) || (notop == TRUE)) {
## end is at the end of the page
htmltablelines[i, "tableend.pos"] <- nrow(htmlcontent[[p]])
htmltablelines[i, "detected.in"] <- "txtonly.notabledetected"
if (nrow(txttablelines) > 0 &&
ncol(txttablelines) > 0) {
## go through each line of the tables and compare
## which tables are in the txt only
txttablelines <- txttablelines[!(is.na(txttablelines[,"page"])),]
txttablelines <- find_similar_row(originrow = htmltablelines[i,],
targettablelines = txttablelines,
relative.match.col = "tableheading",
output.for.originrow.only = "error",
output.for.match = "txtonly.notabledetected",
output.column.name = "detected.in")$targettablelines
}
## otherwise use a fixed provisional end
} else {
## the end of the table is provisionally the start line + 5
htmltablelines[i, "tableend.pos"] <- htmltablelines[i, "tablestart.pos"] + 5
}
} ## end go through each row
}
if (searchwords.go == TRUE && filterwords.go == TRUE &&
integrity.indicator == TRUE && length(tablestart.pos) != 0 &&
nrow(as.data.frame(htmltablelines)) > 0 && ncol(as.data.frame(htmltablelines)) > 0) {
## determine where the top value becomes constant (while it is
## not, the tentative table end moves down line by line)
## do for each table row
nexti <- FALSE
htmltablelines <- htmltablelines[!(is.na(htmltablelines[,"page"])),]
for (i in 1:nrow(htmltablelines)) {
## ignore the lines with txtonly
if ((htmltablelines[i, "detected.in"] == "txtonly.notabledetected") ||
(htmltablelines[i, "detected.in"] == "txtonly")) {
if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
## go through each line of the tables and compare
## which tables are in the txt only
txttablelines <- txttablelines[!(is.na(txttablelines[,"page"])),]
txttablelines <- find_similar_row(originrow = htmltablelines[i, ],
targettablelines = txttablelines,
relative.match.col = "tableheading",
output.for.originrow.only = "error",
output.for.match = "txtonly.notabledetected",
output.column.name = "detected.in")$targettablelines
}
next
}
## 6.5) Detect how many columns exist by evaluating how many different left values occur ---------------
## set the current line pos and page for the while loop
currentline.pos <- htmltablelines[i, "tableend.pos"]
currentline.page <- htmltablelines[i, "page"]
## initialize out.left.list
out.left.list <- NA
all.left.found <- FALSE
while (all.left.found == FALSE) {
constant.value <- NULL
while (is.null(constant.value)) {
## currentline.pos is either set before the loop or at the end
currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
## get the top information
oritop.value <- paste0("top:", currentline["top"], "px")
## go to the next line
currentline.pos <- currentline.pos + 1
## set the start position of the next table
if (!is.na(htmltablelines[i + 1, "tablestart.pos"]) &&
htmltablelines[i + 1, "page"] == htmltablelines[i, "page"]) {
nextstartpos <- htmltablelines[i + 1, "tablestart.pos"]
} else {
## no further table on this page: use a large sentinel value
nextstartpos <- 999999
}
## test if end of page is reached
currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
## get the top information
top.value <- paste0("top:",
currentline["top"], "px")
## if beyond the end of the html page
if ((currentline.pos > nrow(htmlcontent[[currentline.page]])) ||
(top.value == "top:NApx")) {
if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
htmltablelines[i, "tableend.pos"] <- currentline.pos - 1
htmltablelines[i, "legendend.pos"] <- currentline.pos - 1
## fill the txtcontent columns
htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
"tableend.pos"])]
htmltablelines[i, "legendlastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
"legendend.pos"])]
htmltablelines[i, "txtfirstline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
"legendend.pos"]) + 1]
}
currentline.pos <- currentline.pos - 1
all.left.found <- TRUE
constant.value <- top.value
break
## if tables overlap
} else if (currentline.pos >= nextstartpos) {
htmltablelines[i, "tableend.pos"] <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
htmltablelines[i, "legendend.pos"] <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
## fill the txtcontent columns
htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
"tableend.pos"])]
htmltablelines[i, "legendlastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
"legendend.pos"])]
htmltablelines[i, "txtfirstline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
"legendend.pos"]) + 1]
currentline.pos <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
txttablelines[i, "tableend.pos"] <- as.numeric(txttablelines[i + 1, "tablestart.pos"]) - 1
txttablelines[i, "legendend.pos"] <- as.numeric(txttablelines[i + 1, "tablestart.pos"]) - 1
txttablelines[i, "tablelastline"] <- txtcontent[as.numeric(txttablelines[i, "tableend.pos"])]
txttablelines[i, "legendlastline"] <- txtcontent[as.numeric(txttablelines[i, "legendend.pos"])]
txttablelines[i, "txtfirstline"] <- txtcontent[as.numeric(txttablelines[i, "legendend.pos"])+1]
}
nexti <- TRUE
break
} ## end if
if (oritop.value == top.value) {
## the constant value is the top
constant.value <- top.value
} else {
## use this currentline.pos as new start
htmltablelines[i, "tableend.pos"] <- currentline.pos
}
} ## end while is.null(constant.value)
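## at this point constant.value holds a top value (e.g. "top:312px")
## shared by consecutive entries; in the positioned html output this
## marks cells that sit on the same rendered line, i.e. a table row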
if (nexti == TRUE) break
## determine the left range ##
## move up to the first line that still has the constant top value
ind <- TRUE
while (ind == TRUE) {
currentline.pos <- currentline.pos - 1
currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
ind <- grepl(constant.value, currentline[1])
}
currentline.pos <- currentline.pos + 1
currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
left.list <- NULL
ind <- TRUE
while (ind == TRUE) {
## get the left position
left.value <- paste0("left:", currentline["left"], "px")
left.list <- c(left.list, left.value)
currentline.pos <- currentline.pos + 1
currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
ind <- grepl(constant.value,
currentline[1])
}
## test if left.list (column list) is complete ##
if (all(left.list %in% out.left.list)) {
## if the left.list is complete, end the loop
all.left.found <- TRUE
break
} else {
## if left.list is incomplete, add the current
## left.list
out.left.list <- c(out.left.list[!is.na(out.left.list)],
left.list)
out.left.list <- unique(out.left.list)
}
} ## end of searching for all left values
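## out.left.list now holds one left value per detected column, e.g.
## c("left:50px", "left:200px", "left:350px") for a three-column
## table; the loop repeated until a row contributed no new values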
## skip to the next table if this one was txt-only or ran into the next table
if (nexti == TRUE) {
nexti <- FALSE
next
}
## 6.6) Determine the end of the table through the highest top value ---------------
## search for the max with all the left values ##
## if two tables are on the same page restrict lines
if (!is.na(htmltablelines[i + 1, "tablestart.pos"]) &&
htmltablelines[i + 1, "page"] == htmltablelines[i, "page"]){
nexttablestartpos <- htmltablelines[i + 1, "tablestart.pos"]
} else {
nexttablestartpos <- nrow(htmlcontent[[currentline.page]])
}
## make a toplist
top.list <- NULL
out.left.list <- out.left.list[out.left.list != "left:NApx"]
## if out.left.list is empty go to the next table
if (length(out.left.list) == 0) {
nexti <- FALSE
next
}
for (left.item in out.left.list) {
max.line <- max(grep(left.item,
htmlcontent[[currentline.page]][1:nexttablestartpos, 1]))
currentline <- htmlcontent[[currentline.page]][max.line, ]
top.value <- currentline["top"]
top.list <- c(top.list, top.value)
}
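## top.list holds, for each column, the top value of the last line
## found at that left position, i.e. the candidates for the bottom
## row of the table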
## choose the maximum top value that occurs in at least two columns
top.value.found <- FALSE
unq.top.list <- unique(top.list)
match.list <- NULL
for (ti in 1:length(unq.top.list)) {
curr.top.value <- unq.top.list[ti]
match.list[ti] <- 0
for (li in 1:length(out.left.list)) {
match.one <- intersect(grep(out.left.list[li],
htmlcontent[[currentline.page]][1:nexttablestartpos, 1]),
grep(paste0("top:",curr.top.value, "px;"),
htmlcontent[[currentline.page]][1:nexttablestartpos, 1]))
## count a match for this left position at the current top value
if (length(match.one) > 0){
match.list[ti] <- match.list[ti] + 1
}
}
}
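## match.list[ti] counts how many columns have an entry at top value
## unq.top.list[ti]; a genuine bottom table row should span at least
## two columns, which filters out stray single-column lines below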
max.top.value <- suppressWarnings(max(strtoi(unq.top.list[match.list > 1])))
## if no top value is duplicated then go with the overall max
if (!any(match.list > 1)) max.top.value <- max(strtoi(top.list))
htmltablelines[i, "tableend.pos"] <- max(grep(paste0("top:",
max.top.value, "px;"),
htmlcontent[[currentline.page]][,1]))
currentline.pos <- htmltablelines[i, "tableend.pos"]
currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
## add the last line of the table to the htmltablelines
htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][currentline.pos]
## save the end of the table
htmltablelines[i, "tableend.pos"] <- currentline.pos
## add everything below the table ##
## 6.7) Detect the legend by extracting all lines with a smaller font size than the table -----
## get the current font size information ##
pos.fontsize.start <- regexpr("font-size:",
currentl