R/PDE.R

Defines functions PDE_reader_i PDE_analyzer PDE_analyzer_i PDE_pdfs2txt_searchandfilter PDE_pdfs2table_searchandfilter PDE_pdfs2table PDE_extr_data_from_pdfs .PDE_extr_data_from_pdf PDE_install_Xpdftools4.02 PDE_check_Xpdf_install PDE_path

Documented in PDE_analyzer PDE_analyzer_i PDE_check_Xpdf_install .PDE_extr_data_from_pdf PDE_extr_data_from_pdfs PDE_install_Xpdftools4.02 PDE_path PDE_pdfs2table PDE_pdfs2table_searchandfilter PDE_pdfs2txt_searchandfilter PDE_reader_i

## PDE: Extract Sentences and Tables from PDF Files.
## Copyright (C) 2020-2021  Erik Stricker
## 
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
## 
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
#' PDE: Extract Tables and Sentences from PDF Files.
#' 
#' The package includes two main components: 1) The PDE analyzer performs the
#' sentence and table extraction while 2) the PDE reader allows the
#' user-friendly visualization and quick-processing of the obtained results.
#'
#' @section PDE functions: \code{\link{PDE_analyzer}}, \code{\link{PDE_analyzer_i}},
#'   \code{\link{PDE_extr_data_from_pdfs}}, \code{\link{PDE_pdfs2table}}, 
#'   \code{\link{PDE_pdfs2table_searchandfilter}},\code{\link{PDE_pdfs2txt_searchandfilter}}, 
#'   \code{\link{PDE_reader_i}}, \code{\link{PDE_install_Xpdftools4.02}},
#'   \code{\link{PDE_check_Xpdf_install}}
#'
#' @docType package
#' @name PDE
NULL
#> NULL

## 1.2.1

## declare global variables
## Shared package-level state used by the interactive analyzer/reader GUIs;
## every field starts out as an empty list.
PDE.globals <- list2env(
  list(jumpto.list = list(),
       le.progress.textbox = list(),
       mark.list = list(),
       tables.masterlist = list(),
       ttanalyzer = list()),
  envir = new.env()
)

#' Deprecated functions in package \sQuote{PDE}
#' 
#' @description  These functions are provided for compatibility with older versions
#' of \sQuote{PDE} only, and will be defunct at the next release.
#'
#' @details  The following functions are deprecated and will be made defunct; use
#' the replacement indicated below:
#'   \itemize{
#'     
#'     \item{PDE_path: \code{system.file(package = "PDE")}}
#'     
#'   }
#'
#' @name PDE-deprecated
NULL
#> NULL

#'Export the installation path of the PDE (PDF Data Extractor) package
#'
#'\code{PDE_path} is deprecated. Please run system.file(package = "PDE") instead.
#'
#'@return The function returns a potential path for the PDE package. If the PDE
#'tool was not correctly installed it returns "".
#'
#'@export
PDE_path <- function(){
  ## Deprecated: use system.file(package = "PDE") instead.
  ## Returns the installation directory of the PDE package (with a trailing
  ## slash) based on the first library path that contains PDE/R, or ""
  ## when no such path exists.
  .Deprecated("system.file(package = \"PDE\")", package= "PDE",old = "PDE_path")
  ## set PDE library location
  candidates <- paste0(.libPaths(), "/PDE/")
  hits <- candidates[dir.exists(paste0(candidates, "R"))]
  if (length(hits) > 0) hits[1] else ""
}

#'Check if the Xpdftools are installed and in the system path
#'
#'\code{PDE_check_Xpdf_install} runs a version test for pdftotext, pdftohtml and pdftopng.
#'
#'@param sysname String. In case the function returns "Unknown OS" the sysname can be set manually. 
#' Allowed options are "Windows", "Linux",  "SunOS" for Solaris, and "Darwin" for Mac. Default: \code{NULL}.
#'@param verbose Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.
#'
#'
#'@return The function returns a Boolean for the installation status and a message in case 
#' the commands are not detected.
#'
#'@examples
#'
#' PDE_check_Xpdf_install()
#'
#'@export
PDE_check_Xpdf_install <- function(sysname=NULL, verbose=TRUE){
  ## Check that working copies of the Xpdf command line tools (pdftotext,
  ## pdftohtml, pdftopng) are available and behave like the Xpdf 4.02
  ## versions.  Candidate locations come from a previously written config
  ## file or, failing that, from an OS-specific search of the system path.
  ## Each candidate is exercised on a bundled example PDF; the first of each
  ## tool that produces the expected output is recorded in the config file.
  ##
  ## Args:
  ##   sysname: manual override for Sys.info()[["sysname"]]; one of
  ##            "Windows", "Linux", "SunOS" or "Darwin".
  ##   verbose: if TRUE, print the status message to the console.
  ##
  ## Returns: TRUE if all three tools work, otherwise FALSE; in either
  ##   case a "msg" attribute carries a human-readable explanation.

  ## receive pdftotext, pdftohtml and pdftopng information from config file if it exists
  xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
  dir.create(dirname(xpdf_config_location), recursive = TRUE, showWarnings = FALSE)

  pdftotext_location <- NULL
  pdftohtml_location <- NULL
  pdftopng_location <- NULL

  if (file.exists(xpdf_config_location)){
    ## Read the config once and keep only recorded paths that still exist
    ## on disk, so stale entries trigger a fresh system-wide search below.
    ## (Previously only the presence of a matching config line was tested,
    ## never the existence of the file it pointed to.)
    config_lines <- readLines(xpdf_config_location)

    pdftotext_location <- grep("pdftotext", config_lines, value = TRUE)
    pdftotext_location <- pdftotext_location[file.exists(pdftotext_location)]
    if (length(pdftotext_location) == 0) pdftotext_location <- NULL

    pdftohtml_location <- grep("pdftohtml", config_lines, value = TRUE)
    pdftohtml_location <- pdftohtml_location[file.exists(pdftohtml_location)]
    if (length(pdftohtml_location) == 0) pdftohtml_location <- NULL

    pdftopng_location <- grep("pdftopng", config_lines, value = TRUE)
    pdftopng_location <- pdftopng_location[file.exists(pdftopng_location)]
    if (length(pdftopng_location) == 0) pdftopng_location <- NULL
  }

  ## if either the config file does not exist or the xpdf tool files do not exist
  if (!file.exists(xpdf_config_location) ||
      is.null(pdftotext_location) ||
      is.null(pdftohtml_location) ||
      is.null(pdftopng_location)) {

    if (is.null(sysname)) {
      sysname <- Sys.info()["sysname"]
    }

    ## list all locations of a command via "whereis -b" (Linux)
    show_file_path_linux <- function(filename){
      whereis_output <- system(paste0("whereis -b ",filename), intern = TRUE)
      only_dirs <- sub("^ ","",sub(paste0("^",filename,":"),"",whereis_output))
      if (only_dirs == ""){
        return(NULL)
      } else {
        ## hits are space-separated absolute paths; split on " /" boundaries
        return(strsplit(gsub(" /",";/",only_dirs),split = ";")[[1]])
      }
    }

    ## list all locations of a command via /usr/ucb/whereis (Solaris);
    ## returns NULL when whereis itself is unavailable
    show_file_path_solaris <- function(filename){
      ## NOTE: the original call contained a stray empty argument
      ## (system(..., , intern = TRUE)); removed here.
      whereis_test <- suppressWarnings(tryCatch(system(paste0("/usr/ucb/whereis ",filename),
                                                       intern = TRUE)[1],
                                                error=function(err) NULL))
      if (length(whereis_test) != 0) {
        whereis_output <- system(paste0("/usr/ucb/whereis ",filename), intern = TRUE)
        only_dirs <- sub("^ ","",sub(paste0("^",filename,":"),"",whereis_output))
        if (only_dirs == ""){
          return(NULL)
        } else {
          return(strsplit(gsub(" /",";/",only_dirs),split = ";")[[1]])
        }
      } else {
        return(NULL)
      }
    }

    ## OS-specific search of the system path for the three tools
    if (sysname == "Windows") {
      pdftotext_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftotext", intern = TRUE))
      if ("INFO: Could not find files for the given pattern(s)." %in% pdftotext_location) pdftotext_location <- NULL

      pdftohtml_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftohtml", intern = TRUE))
      if ("INFO: Could not find files for the given pattern(s)." %in% pdftohtml_location) pdftohtml_location <- NULL

      pdftopng_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftopng", intern = TRUE))
      if ("INFO: Could not find files for the given pattern(s)." %in% pdftopng_location) pdftopng_location <- NULL
    } else if (sysname == "Linux") {
      pdftotext_location <- show_file_path_linux("pdftotext")

      pdftohtml_location <- show_file_path_linux("pdftohtml")

      pdftopng_location <- show_file_path_linux("pdftopng")

    } else if (sysname == "SunOS") {
      pdftotext_location <- show_file_path_solaris("pdftotext")

      pdftohtml_location <- show_file_path_solaris("pdftohtml")

      pdftopng_location <- show_file_path_solaris("pdftopng")

    } else if (sysname == "Darwin") {

      pdftotext_location <- suppressWarnings(system("which -a pdftotext", intern = TRUE))

      pdftohtml_location <- suppressWarnings(system("which -a pdftohtml", intern = TRUE))

      pdftopng_location <- suppressWarnings(system("which -a pdftopng", intern = TRUE))
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
  }

  out <- TRUE
  files <- NULL

  ## record which tools were not found at all
  if (length(pdftotext_location) == 0) {
    files <- c(files,"pdftotext")
    out <- FALSE
  }
  if (length(pdftohtml_location) == 0) {
    files <- c(files,"pdftohtml")
    out <- FALSE
  }
  if (length(pdftopng_location) == 0) {
    files <- c(files,"pdftopng")
    out <- FALSE
  }

  ## if the command line tools were all detected
  if (out == TRUE) {
    ## test pdftotext version: an Xpdf-compatible build accepts the call
    ## below and writes the txt file; incompatible builds do not
    pdftotext_path <- NULL
    pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
    keeplayouttxtpath <- paste0(dirname(pdfpath),"/test_txt/test_keeplayout.txt")
    for (i in seq_along(pdftotext_location)){
      dir.create(dirname(keeplayouttxtpath), showWarnings = FALSE)
      status <- suppressWarnings(system(paste0("\"", pdftotext_location[i], "\" \"", "-layout",
                                                  "\" \"", pdfpath, "\" \"", keeplayouttxtpath,
                                                  "\""),
                                        wait = TRUE, ignore.stderr = TRUE, intern = TRUE))
      if (file.exists(keeplayouttxtpath)) {
        unlink(dirname(keeplayouttxtpath), recursive = TRUE)
        pdftotext_path <- pdftotext_location[i]
        break
      }
      unlink(dirname(keeplayouttxtpath), recursive = TRUE)
    }

    ## test pdftohtml version: the Xpdf build creates a directory that
    ## contains an index.html
    pdftohtml_path <- NULL
    pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
    htmlpath <- paste0(dirname(pdfpath),"/test_html/test.html")
    for (i in seq_along(pdftohtml_location)){
      dir.create(dirname(htmlpath), showWarnings = FALSE)
      status <- system(paste0("\"",pdftohtml_location[i],"\" \"", pdfpath,
                  "\" \"", htmlpath, "\""), wait = TRUE,
           ignore.stderr = TRUE, intern = TRUE)
      if (dir.exists(htmlpath) && file.exists(paste0(htmlpath, "/index.html"))) {
        unlink(dirname(htmlpath), recursive = TRUE)
        pdftohtml_path <- pdftohtml_location[i]
        break
      }
      unlink(dirname(htmlpath), recursive = TRUE)
    }

    ## test pdftopng: the Xpdf build writes <name>-000001.png for page 1
    pdftopng_path <- NULL
    pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
    pngpath <- paste0(dirname(pdfpath),"/test_png/test.png")
    for (i in seq_along(pdftopng_location)){
      dir.create(dirname(pngpath), showWarnings = FALSE)
      status <- suppressWarnings(system(paste0("\"",pdftopng_location[i],"\" \"",
                    "-f", "\" \"", 1, "\" \"", "-l",
                    "\" \"", 1, "\" \"", pdfpath, "\" \"",
                    pngpath,"\""),
                    wait = TRUE, ignore.stderr = TRUE, intern = TRUE))
      if (file.exists(sub("test.png$","test.png-000001.png",pngpath))) {
        unlink(dirname(pngpath), recursive = TRUE)
        pdftopng_path <- pdftopng_location[i]
        break
      }
      unlink(dirname(pngpath), recursive = TRUE)
    }

    if (length(pdftotext_path) > 0 &&
        length(pdftohtml_path) > 0 &&
        length(pdftopng_path) > 0) {
      ## all three tools passed: remember their locations for future calls
      write(paste(pdftotext_path,pdftohtml_path,pdftopng_path, sep = "\n"),
            file = xpdf_config_location)
      attributes(out) <- list(msg = "Correct version of Xpdf command line tools is installed.")
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    } else {
      ## at least one tool was found but failed the version test
      if (length(pdftotext_path) == 0) {
        files <- c(files,"pdftotext")
        out <- FALSE
      }
      if (length(pdftohtml_path) == 0) {
        files <- c(files,"pdftohtml")
        out <- FALSE
      }
      if (length(pdftopng_path) == 0) {
        files <- c(files,"pdftopng")
        out <- FALSE
      }

      msg1 <- paste(" installed. Please install the Xpdf command line tools",
                    "using PDE_install_Xpdftools4.02()")

      if (length(files) == 1) {
        out.file <- files
        attributes(out) <- list(msg = paste0("The wrong version of the ",
                                             out.file, " file is",msg1))
        if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
      } else if (length(files) == 2) {
        out.file <- paste0(files[1], " and ", files[2])
        attributes(out) <- list(msg = paste0("The wrong version of the ",
                                             out.file, " files are",msg1))
        if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
      } else if (length(files) == 3) {
        out.file <- paste0(files[1], ", ", files[2], " and " , files[3])
        attributes(out) <- list(msg = paste0("The wrong version of the ",
                                             out.file, " files are",msg1))
        if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
      }

    }

  ## if one or more command line tools were not detected
  } else {
    msg1 <- paste(" not detected. Please install the Xpdf command line tools again",
                 "using PDE_install_Xpdftools4.02()")

    if (length(files) == 1) {
      out.file <- files
      attributes(out) <- list(msg = paste0(out.file, " file",msg1))
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    } else if (length(files) == 2) {
      out.file <- paste0(files[1], " and ", files[2])
      attributes(out) <- list(msg = paste0(out.file, " files",msg1))
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    } else if (length(files) == 3) {
      out.file <- paste0(files[1], ", ", files[2], " and " , files[3])
      attributes(out) <- list(msg = paste0(out.file, " files",msg1))
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    }
  }

  return(out)
}

#'Install the Xpdf command line tools 4.02
#'
#'\code{PDE_install_Xpdftools4.02} downloads and installs the XPDF command line tools 4.02.
#'
#'@param sysname String. In case the function returns "Unknown OS" the sysname can be set manually. 
#' Allowed options are "Windows", "Linux", "SunOS" for Solaris, and "Darwin" for Mac. Default: \code{NULL}.
#'@param bin String. In case the function returns "Unknown OS" the bin of the operational system
#' can be set manually. Allowed options are "64", and "32". Default: \code{NULL}.
#'@param verbose Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.
#'
#'
#'@return The function returns a Boolean for the installation status and a message in case 
#' the commands are not installed.
#'
#'@examples
#' \dontrun{
#' 
#' PDE_install_Xpdftools4.02()
#' 
#' }
#'
#'
#'@export
PDE_install_Xpdftools4.02 <- function(sysname=NULL, bin=NULL, verbose=TRUE){
  ## Download the Xpdf command line tools 4.02 into the package's bin/
  ## directory and record the tool paths in XPDF_DIR.config (the file read
  ## by PDE_check_Xpdf_install()).  Both the download and the install step
  ## are confirmed interactively via utils::menu().
  ##
  ## Args:
  ##   sysname: manual override for Sys.info()[["sysname"]]; one of
  ##            "Windows", "Linux", "SunOS" or "Darwin".
  ##   bin:     "64" or "32"; architecture subfolder of the binaries.
  ##   verbose: if TRUE, print progress messages to the console.
  ##
  ## Returns: TRUE if the tools were installed, FALSE otherwise, with
  ##   "msg" (accumulated messages) and "path" (tool directory or "")
  ##   attributes.

  ## check if Xpdftools are installed
  install.test <- PDE_check_Xpdf_install(verbose=FALSE)
  downloadq <- FALSE
  installq <- FALSE
  out_msg <- NULL
  out <- NULL
  
  ## config file that records the chosen tool locations
  xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
  dir.create(dirname(xpdf_config_location), recursive = TRUE, showWarnings = FALSE)
  
  ## set xpdf library location
  xpdf_bin_path <- paste0(system.file(package = "PDE"),"/bin")
  if (is.null(sysname)) sysname <- Sys.info()["sysname"]
  
  ## check whether the 4.02 archive was already downloaded for this OS
  download.test <- FALSE
  if (sysname == "Windows") {
    if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02"))) download.test <- TRUE
  } else if (sysname == "Linux" || sysname == "SunOS") {
    if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02"))) download.test <- TRUE
  } else if (sysname == "Darwin") {
    if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02"))) download.test <- TRUE
  } else {
    stop("Unknown OS. Please set sysname option.")
  }
  
  ## infer the architecture from the machine string when not supplied
  if (is.null(bin)){
    if (grepl("32",Sys.info()[["machine"]])) {
      bin <- "32"
    } else if (grepl("64",Sys.info()[["machine"]])) {
      bin <- "64"
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
  }
  
  if (bin != "64" && bin != "32"){
    stop("Unknown OS. Please set bin option.")
  }
  
  ## determine operating system and download correct xpdf
  if (download.test == FALSE){
    ## determine operating system and download correct Xpdf command line tools
    downloadq <- utils::menu(c("Y", "N"), title="Do you want to download and install xpdf version 4.02? (y/n)") == 1
    installq <- downloadq
  } else {
    ## archive already present: ask separately about re-download and install
    downloadq <- utils::menu(c("Y", "N"), title=paste("Xpdf command line tools 4.02 are already downloaded.",
                                                      "Do you want to download the Xpdf command line tools version 4.02 again? (y/n)")) == 1
    if (install.test == TRUE){
      installq <- utils::menu(c("Y", "N"), title=paste("Working versions of Xpdf command line tools are already installed.",
                                                       "Do you want to still (re)install",
                                                       "the Xpdf command line tools version 4.02? (y/n)")) == 1
    } else {
      installq <- utils::menu(c("Y", "N"), title=paste("Do you want to also install",
                                                       "the Xpdf command line tools version 4.02? (y/n)")) == 1
    }
  }
  
  ## download and unpack the OS-specific archive, then remove the archive
  if (downloadq == TRUE){
    
    if (sysname == "Windows") {
      utils::download.file("https://dl.xpdfreader.com/xpdf-tools-win-4.02.zip", 
                           destfile = paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip"),
                           mode = "wb")
      utils::unzip(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip"),exdir = xpdf_bin_path)
      remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip")))
      download.test <- TRUE
    } else if (sysname == "Linux" || sysname == "SunOS") {
      utils::download.file("https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz", 
                           destfile = paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz"),
                           mode = "wb")
      utils::untar(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz"),exdir = xpdf_bin_path)
      remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz")))
      download.test <- TRUE
    } else if (sysname == "Darwin") {
      ## NOTE(review): unlike the other branches this omits mode = "wb";
      ## mode only matters on Windows, but confirm the asymmetry is intended
      utils::download.file("https://dl.xpdfreader.com/xpdf-tools-mac-4.02.tar.gz", 
                           destfile = paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz"))
      utils::untar(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz"),exdir = xpdf_bin_path)
      remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz")))
      download.test <- TRUE
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
  }
  
  if (download.test == TRUE){
    ## resolve the architecture-specific bin folder of the unpacked tools
    if (sysname == "Windows") {
      filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02/bin",bin))
      ext <- ".exe"
    } else if (sysname == "Linux" || sysname == "SunOS") {
      filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02/bin",bin))
      ext <- ""
    } else if (sysname == "Darwin") {
      filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02/bin",bin))
      ext <- ""
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
    out_msg <- c(out_msg,paste0("Location of Xpdf command line tools 4.02: ",filepath))
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    attributes(out) <- list(msg = out_msg,path=filepath)
    
    ## "Installation": write the three tool paths into the config file so
    ## PDE_check_Xpdf_install() picks them up
    if (installq == TRUE){
      
      pdftotext_path <- normalizePath(paste0(filepath,"/pdftotext",ext))
      
      pdftohtml_path <- normalizePath(paste0(filepath,"/pdftohtml",ext))
      
      pdftopng_path <- normalizePath(paste0(filepath,"/pdftopng",ext))
      
      write(paste(pdftotext_path,pdftohtml_path,pdftopng_path, sep = "\n"),
            file = xpdf_config_location)
      
      out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were successfully installed.")
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      attributes(out) <- list(msg = out_msg,path=filepath)
      out <- TRUE
    } else {
      out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were not installed.")
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      attributes(out) <- list(msg = out_msg,path=filepath)
      out <- FALSE
    }
  } else {
    out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were not downloaded.")
    attributes(out) <- list(msg = out_msg,path="")
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    out <- FALSE
  }
  
  return(out)
}

#'Extracting data from a PDF (Portable Document Format) file
#'
#'\code{.PDE_extr_data_from_pdf} extracts sentences or tables from a single PDF
#'file and writes output in the corresponding folder.
#'
#'@param pdf String. Path to the PDF file to be analyzed.
#'@param whattoextr String. Either \emph{txt}, \emph{tab}, or \emph{tabandtxt}
#'  for PDFS2TXT (extract sentences from a PDF file) or PDFS2TABLE (table of a PDF
#'  file to a Microsoft Excel file) extraction. \emph{tab} allows the extraction
#'  of tables with and without search words while \emph{txt} and \emph{tabandtxt}
#'  require search words.
#'@param out String. Directory chosen to save analysis results in. Default:
#'  \code{"."}.
#'@param filter.words List of strings. The list of filter words. If not
#'  \code{NA} or \code{""} a hit will be counted every time a word from the list
#'  is detected in the article. Regex rules apply (see also
#'  \url{https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf}).
#'   Default: \code{""}.
#'@param ignore.case.fw Logical. Are the filter words case-sensitive (does
#'  capitalization matter)? Default: \code{FALSE}.
#'@param filter.word.times Numeric. The minimum number of hits described for
#'  \code{filter.words} for a paper to be further analyzed. Default: \code{20}.
#'@param table.heading.words List of strings. Different than standard (TABLE,
#'  TAB or table plus number) headings to be detected. Regex rules apply (see
#'  also
#'  \url{https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf}).
#'   Default = \code{""}.
#'@param ignore.case.th Logical. Are the additional table headings (see
#'  \code{table.heading.words}) case-sensitive (does capitalization matter)?
#'  Default = \code{FALSE}.
#'@param search.words List of strings. List of search words. To extract all
#'  tables from the PDF file leave \code{search.words = ""}. Regex rules apply (see
#'  also
#'  \url{https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf}).
#'@param ignore.case.sw Logical. Are the search words case-sensitive (does
#'  capitalization matter)? Default: \code{FALSE}.
#'@param eval.abbrevs Logical. Should abbreviations for the search words be
#'  automatically detected and then replaced with the search word + "$*"?
#'  Default: \code{TRUE}.
#'@param out.table.format String. Output file format. Either comma separated
#'  file \code{.csv} or tab separated file \code{.tsv}. The encoding indicated
#'  in parentheses should be selected according to the operating system 
#'  exported tables are opened in, i.e., Windows: \code{"(WINDOWS-1252)"}; Mac: 
#'  \code{(macintosh)}; Linux: \code{(UTF-8)}. Default: \code{".csv"} and 
#'  encoding depending on the operational system.
#'@param dev Numeric. For a table the size of indention which would be
#'  considered the same column. Default: \code{20}.
#'@param context Numeric. Number of sentences extracted before and after the
#'  sentence with the detected search word. If \code{0} only the sentence with
#'  the search word is extracted. Default: \code{0}.
#'@param write.table.locations Logical. If \code{TRUE}, a separate file with the
#'  headings of all tables, their relative location in the generated html and
#'  txt files, as well as information if search words were found will be
#'  generated. Default: \code{FALSE}.
#'@param exp.nondetc.tabs Logical. If \code{TRUE}, if a table was detected in a
#'  PDF file but is an image or cannot be read, the page with the table will be
#'  exported as a png. Default: \code{TRUE}.
#'@param write.tab.doc.file Logical. If \code{TRUE}, if search words are used
#'  for table detection and no search words were found in the tables of a PDF
#'  file, a file will be created with the PDF filename followed by
#'  \strong{no.table.w.search.words}. Default: \code{TRUE}.
#'@param write.txt.doc.file Logical. If \code{TRUE}, if no search words were
#'  found in the sentences of a PDF file, a file will be created with the PDF
#'  filename followed by \strong{no.txt.w.search.words}. If the PDF file is
#'  empty, a file will be created with the PDF filename followed by
#'  \strong{no.content.detected}. If the filter word threshold is not met, 
#'  a file will be created with the PDF filename followed by 
#'  \strong{no.txt.w.filter.words}. Default: \code{TRUE}.
#'@param delete Logical. If \code{TRUE}, the intermediate \strong{txt},
#'  \strong{keeplayouttxt} and \strong{html} copies of the PDF file will be 
#'  deleted. Default: \code{TRUE}.
#'@param verbose Logical. Indicates whether messages will be printed in the 
#'  console. Default: \code{TRUE}.
#'
#'@return If tables were extracted from the PDF file the function returns a list of
#'  following tables/items: 1) \strong{htmltablelines}, 2)
#'  \strong{txttablelines}, 3) \strong{keeplayouttxttablelines}, 4) \strong{id},
#'  5) \strong{out_msg}.
#'  The \strong{tablelines} are tables that provide the heading and position of
#'  the detected tables. The \strong{id} provide the name of the PDF file. The
#'  \strong{out_msg} includes all messages printed to the console or the suppressed
#'  messages if \code{verbose=FALSE}.
#'
#'@examples
#'
#'## Running a simple analysis with filter and search words to extract sentences and tables
#'if(PDE_check_Xpdf_install() == TRUE){
#'  outputtables <- .PDE_extr_data_from_pdf(pdf = "examples/Methotrexate/29973177_!.pdf",
#'  whattoextr = "tabandtxt",
#'  out = paste0(system.file(package = "PDE"),"/examples/MTX_all_files+-0_test/"),
#'  filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
#'  ignore.case.fw = TRUE,
#'  search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
#'  ignore.case.sw = FALSE)
#'}
#'
#'## Running an advanced analysis with filter and search words to
#'## extract sentences and tables and obtain documentation files
#'if(PDE_check_Xpdf_install() == TRUE){
#'  outputtables <- .PDE_extr_data_from_pdf(pdf = paste0(system.file(package = "PDE"),
#'                        "/examples/Methotrexate/29973177_!.pdf"),
#'  whattoextr = "tabandtxt",
#'  out = paste0(system.file(package = "PDE"),"/examples/MTX_all_files+-1_test/"),
#'  context = 1,
#'  dev = 20,
#'  filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
#'  ignore.case.fw = TRUE,
#'  filter.word.times = 20,
#'  table.heading.words = "",
#'  ignore.case.th = FALSE,
#'  search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
#'  ignore.case.sw = FALSE,
#'  eval.abbrevs = TRUE,
#'  out.table.format = ".csv (WINDOWS-1252)",
#'  write.table.locations = TRUE,
#'  write.tab.doc.file = TRUE,
#'  write.txt.doc.file = TRUE,
#'  exp.nondetc.tabs = TRUE,
#'  delete = TRUE)
#'}
#'
#'@seealso
#'\code{\link{PDE_pdfs2table}},\code{\link{PDE_pdfs2table_searchandfilter}},
#'\code{\link{PDE_pdfs2txt_searchandfilter}}
#'
#'@export
.PDE_extr_data_from_pdf <- function(pdf, whattoextr,
                  out = ".", filter.words = "", ignore.case.fw = FALSE, filter.word.times = 20,
                  table.heading.words = "", ignore.case.th = FALSE, search.words,
                  ignore.case.sw = FALSE, eval.abbrevs = TRUE, out.table.format = ".csv (WINDOWS-1252)", 
                  dev = 20, context = 0,write.table.locations = FALSE, exp.nondetc.tabs = TRUE, 
                  write.tab.doc.file = TRUE,write.txt.doc.file = TRUE, delete = TRUE, verbose = TRUE){

  ## General functions -------------------------------------------
  readin_txt <- function(txtpath) {
    ## Read a pdftotext-generated txt file, choosing between latin1 and
    ## UTF-8 interpretation (latin1 is preferred because the xpdf 4.x tools
    ## emit it), and replace the "fi" ligature (U+FB01) with plain "fi".
    ##
    ## Args:
    ##   txtpath: path of the txt file to read.
    ## Returns: character vector of lines; character(0) for an empty file.

    ## read the file under both encoding assumptions
    txtcontent_lat1 <- readLines(txtpath, warn = FALSE, encoding = "latin1")
    txtcontent_utf8 <- readLines(txtpath, warn = FALSE, encoding = "UTF-8")
    ## nchar() errors on byte sequences invalid in the declared encoding;
    ## use that as the detection mechanism
    res_lat1 <- try(sum(nchar(txtcontent_lat1)), silent = TRUE)
    res_utf8 <- try(sum(nchar(txtcontent_utf8)), silent = TRUE)
    if (inherits(res_utf8, "try-error")) {
      ## if utf8 throws an error
      txtcontent <- txtcontent_lat1
    } else if (inherits(res_lat1, "try-error")) {
      ## if latin1 throws an error
      txtcontent <- txtcontent_utf8
    } else {
      ## latin1 preferred for 4.2.0 xpdf version
      txtcontent <- txtcontent_lat1
    }

    ## seq_along (instead of 1:length) keeps an empty file empty; the old
    ## loop turned character(0) into a single NA line
    for (r in seq_along(txtcontent)) {
      ## replace all fi ligatures; fall back to latin1 re-encoding when
      ## gsub cannot handle the line's bytes
      res <- try(gsub(intToUtf8(0xFB01), "fi", txtcontent[r], fixed = TRUE), silent = TRUE)
      if (inherits(res, "try-error")) {
        txtcontent[r] <- iconv(txtcontent[r], 'UTF-8', 'latin1', 'bit')
      }
      txtcontent[r] <- gsub(intToUtf8(0xFB01), "fi", txtcontent[r], fixed = TRUE)
    }
    return(txtcontent)
  }
  

  page_splits <- function(txtcontent_to_split) {
    ## Split the text content of a converted PDF into one character vector
    ## per group, using lines that start with a form feed ("\f") as page
    ## boundaries.  With at most one form feed the input is returned
    ## unchanged (callers rely on this pass-through).

    ## lines that start a new page
    pagesplits <- grep("^\\f", txtcontent_to_split)

    ## if there is only one page or no txtcontent_to_split
    if (length(pagesplits) > 1) {
      ## build one group id per line; a form-feed line belongs to the
      ## following group ("page split the line before")
      g <- rep.int(1, pagesplits[1] - 1)
      for (p in 2:length(pagesplits)) {
        g <- c(g, rep.int(p, pagesplits[p] - pagesplits[p - 1]))
      }
      ## assign ALL remaining lines (from the last form feed to the end of
      ## the content) to the final group; the previous code appended a
      ## single element, which left trailing lines uncovered and made
      ## split() recycle g, misassigning them to earlier pages
      g <- c(g, rep.int(length(pagesplits),
                        length(txtcontent_to_split) - pagesplits[length(pagesplits)] + 1))
      splittxtcontent <- split(txtcontent_to_split, g)
    } else {
      splittxtcontent <- txtcontent_to_split
    }

    return(splittxtcontent)
  }

  find_similar_row <- function(originrow, targettablelines,
                               relative.match.col, output.for.originrow.only,
                               output.for.match, output.column.name) {
    ## Find the row in "targettablelines" that most closely matches
    ## "originrow": same "page" value and more than 80% character-level
    ## similarity in the column "relative.match.col".  On a match, both the
    ## origin row and the matched target row get "output.for.match" written
    ## into "output.column.name"; with no match the origin row gets
    ## "output.for.originrow.only" and the returned targettablerow is NA.
    ##
    ## Returns: list(originrow, targettablelines, targettablerow).

    ## Fuzzy comparison of the characters of strings[[1]] against
    ## strings[[2]]: each character may match up to 3 positions further to
    ## the right (tolerating small insertions); returns the fraction of
    ## matched characters.
    determine_similarity_1_vs_2 <- function(strings){
      ## determine similarity
      matches_fsr <- NULL
      for (pos in 1:length(strings[[1]])) {
        counter_fsr <- 0
        out_fsr <- FALSE
        while (out_fsr == FALSE && counter_fsr < 4) {
          ## test is characters are the same; stop at the end of string 2
          if (is.na(strings[[2]][pos + counter_fsr])) break
          if (strings[[1]][pos] == strings[[2]][pos + counter_fsr]) {
            out_fsr <- TRUE
          } else {
            counter_fsr <- counter_fsr + 1
          }
        }  ## end while
        matches_fsr <- c(matches_fsr, out_fsr)
      }
      ## determine how much it matches_fsr
      percent.match <- sum(matches_fsr, na.rm = TRUE)/length(strings[[1]])
      return(percent.match)
    }

    ## set variables
    matchingrow <- NA  # NOTE(review): appears unused in this function
    matchpercent <- NULL
    targettablerow <- NULL

    ## check every row in targettablelines
    for (targetrow in 1:nrow(targettablelines)) {
      ## skip lines that do not have matching pages
      if (as.numeric(targettablelines[targetrow, "page"]) != as.numeric(originrow[["page"]])) next
      x <- as.character(originrow[[relative.match.col]])
      y <- as.character(targettablelines[targetrow, relative.match.col])
      ## truncate the origin string so it is never longer than the target
      if (nchar(x) > nchar(y)) x <- substr(x, 1, nchar(y))
      ## Run one way around, then the reverse, and keep the better score
      strings1 <- sapply(list(x, y), strsplit, "")
      strings2 <- sapply(list(y, x), strsplit, "")
      percent.match1 <- determine_similarity_1_vs_2(strings1)
      percent.match2 <- determine_similarity_1_vs_2(strings2)
      percent.match <- max(percent.match1,percent.match2)

      ## record candidates above the 80% similarity threshold
      if (percent.match > 0.8) {
        matchpercent <- rbind(matchpercent, c(originrow = 1,
                                              targetrow = targetrow,
                                              percent.match = percent.match))
      }
    }

    ## if multiple tables were on the same page if no
    ## matching table is found
    if (is.null(matchpercent)) {
      originrow[output.column.name] <- output.for.originrow.only
      targettablerow <- NA
      ## if only one row matches_fsr
    } else if (nrow(matchpercent) == 1) {
      originrow[output.column.name] <- output.for.match
      targettablelines[matchpercent[1, "targetrow"],
                       output.column.name] <- output.for.match
      targettablerow <- targettablelines[matchpercent[1, "targetrow"],]
      ## if multiple rows matched: take the first with the highest score
    } else {
      maxrow <- grep(TRUE, matchpercent[, "percent.match"] %in% max(matchpercent[, "percent.match"]))[1]
      originrow[output.column.name] <- output.for.match
      targettablelines[matchpercent[maxrow, "targetrow"], output.column.name] <- output.for.match
      targettablerow <- targettablelines[matchpercent[maxrow, "targetrow"],]
    }

    return(list(originrow = originrow, targettablelines = targettablelines,
                targettablerow = targettablerow))
  }

  exp_nondetc_tabs <- function(input_table, pdfpath,
                               outputpath, detectinfile,
                               based.on.column, matches_end) {
    ## Export PNG snapshots of PDF pages that contain non-detectable tables.
    ##
    ## Args:
    ##   input_table: data frame with at least a "page" column and
    ##     `based.on.column`.
    ##   pdfpath: path to the source PDF file.
    ##   outputpath: directory under which a "tables" subfolder is created.
    ##   detectinfile: currently unused (kept for interface compatibility).
    ##   based.on.column: column of `input_table` searched for `matches_end`.
    ##   matches_end: regular expression marking rows whose pages to export.
    ## Side effects: creates <outputpath>/tables and writes one
    ##   "<pdfname>_page<N>_w.table*.png" per exported page via pdftopng.
    ## Uses `pdftopng_location` from the enclosing scope.

    ## find all rows that match
    matched.rows <- grep(matches_end, input_table[, based.on.column])

    ## pages referenced by the matching rows (may contain duplicates)
    pages <- input_table[matched.rows, "page"]

    exp.pages <- unique(pages)

    ## only start function if there are pages to export
    if (length(exp.pages) > 0) {
      dir.create(paste0(outputpath,"/tables"), showWarnings = FALSE)
      ## iterate over the unique pages (bug fix: the original iterated over
      ## `pages`, re-rendering the same page once per matching row)
      for (page in exp.pages) {
          system(paste0("\"",pdftopng_location,"\" \"",
                      "-f", "\" \"", page, "\" \"", "-l",
                      "\" \"", page, "\" \"", pdfpath, "\" \"",
                      outputpath, "/tables/", substr(basename(pdfpath),
                                                     1, regexpr(".pdf", basename(pdfpath)) -
                                                       1), "_page", page, "_w.table",
                      "\""), wait = TRUE, ignore.stderr = TRUE)
      }
    }
  }

  test_if_abbrev_in_parantheses <- function(searchword, paragraph,ignore.case.sw) {
    ## Test whether `searchword` is directly followed by its abbreviation in
    ## parentheses, e.g. "polymerase chain reaction (PCR)", within the same
    ## sentence, and whether the letters in the parentheses match the initial
    ## letters of the words of the search word (one mismatch of slack allowed).
    ##
    ## Args:
    ##   searchword: search word (used as a regular expression).
    ##   paragraph: text in which the search word was found.
    ##   ignore.case.sw: logical; case-insensitive search word matching.
    ## Returns a list:
    ##   res: TRUE if a matching abbreviation was found, otherwise FALSE.
    ##   abbrev_singular, replacement_singular, abbrev_plural,
    ##   replacement_plural: abbreviation/replacement strings ("" if res FALSE).
    output <- list(res = TRUE,"","","","")
    ## get the position of the search word in the paragraph
    pos_searchword_start <- regexpr(searchword,paragraph,ignore.case = ignore.case.sw)[[1]]
    pos_searchword_end <- pos_searchword_start + attr(regexpr(searchword,paragraph,
                                                              ignore.case = ignore.case.sw),"match.length") - 1
    simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
    ## test if parentheses follow the search word
    pos_open <- regexpr("\\(",substr(paragraph, pos_searchword_end + 1, nchar(paragraph)))[[1]]
    pos_abbrev_start <- pos_open + 1 + pos_searchword_end
    pos_close <- regexpr("\\)",substr(paragraph, pos_abbrev_start + 1, nchar(paragraph)))[[1]]
    pos_abbrev_end <- pos_close + pos_abbrev_start - 1
    ## position of the next sentence end after the search word; when none is
    ## found, treat it as infinitely far away.
    ## (bug fix: the -1 "not found" test must be applied to the raw regexpr
    ## result BEFORE the offset is added; the original compared after adding
    ## the offset, so the fallback almost never triggered)
    sentence.end.raw <- regexpr("\\. [0-9A-Z]",substr(paragraph, pos_searchword_end + 1,
                                                      nchar(paragraph)))[[1]]
    if (sentence.end.raw == -1) {
      sentence.end <- 999999
    } else {
      sentence.end <- sentence.end.raw + pos_searchword_end - 1
    }
    ## if both parentheses were found before the end of the sentence
    if (pos_open > 0 &&
        pos_close > 0 &&
        sentence.end > pos_abbrev_end &&
        pos_searchword_start > 0 &&
        pos_searchword_end > 0) {
      pos_ext_searchword_end <- pos_abbrev_start - 3
      ext_searchword <- substr(paragraph, pos_searchword_start, pos_ext_searchword_end)
      ## collect all characters inside the parentheses up to a space or ")"
      current_char_pos <- pos_abbrev_start
      current_char <- substr(paragraph, current_char_pos, current_char_pos)
      char_list <- NULL
      ## the "" guard prevents an endless loop if the end of the paragraph is
      ## reached without a space or closing parenthesis
      while (current_char != "" && !grepl(" |\\)", current_char)){
        char_list <- c(char_list, current_char)
        current_char_pos <- current_char_pos + 1
        current_char <- substr(paragraph, current_char_pos, current_char_pos)
      }

      char_list <- char_list[!(char_list %in% c("(", ")","[", "]", "/",
                                                "{","}","\\"))]

      ext_searchword <- gsub("[^[:alnum:] ]","",ext_searchword)

      ## test if letters were found inside the parentheses
      if (length(char_list) > 0){
        trunc_searchword <- ext_searchword
        ## match each abbreviation letter, in order, to the search word
        for (n in 1:length(char_list)){
          pos_char <- regexpr(char_list[n], trunc_searchword, ignore.case = TRUE)[[1]]
          removed.chars <- substr(trunc_searchword, 1, pos_char)
          ## accept only if the letter was found and no whole word of the
          ## search word was skipped over
          if (pos_char > 0 && !length(gregexpr(" ",removed.chars)[[1]]) > 1) {
            trunc_searchword <- substr(trunc_searchword, pos_char + 1, nchar(trunc_searchword))
          } else {
            output <- list(res = FALSE,"","","","")
            break
          }
        }

        ## test if each word in extended searchword has a letter in the abbreviation
        list.of.words <- gsub("[^A-z]","",
                              strsplit(ext_searchword," ")[[1]])[!(gsub("[^A-z]","",
                                                                        strsplit(ext_searchword," ")[[1]]) %in% "")]
        ## allow one slack
        if (length(list.of.words) - 1 <= length(char_list)){
          result <- TRUE
          trunc_abbrev <- char_list
          ## match the first letter of each word to the abbreviation letters
          for (n in 1:length(list.of.words)){
            ## (bug fix: the slack guard used to take its failure branch
            ## whenever abbreviation letters were still left, aborting the
            ## loop on the first iteration and making the code below
            ## unreachable; the guard now only applies once the abbreviation
            ## letters are used up)
            if (length(trunc_abbrev) == 0){
              if (result == TRUE){
                ## one slack
                result <- FALSE
                next
              } else {
                output <- list(res = FALSE,"","","","")
                break
              }
            }
            pos_char <- regexpr(substr(list.of.words[n],1,1), trunc_abbrev, ignore.case = TRUE)[[1]]
            removed.chars <- trunc_abbrev[pos_char]
            ## if the word's first letter matches the next abbreviation letter
            if (pos_char > 0 && !nchar(removed.chars) > 1) {
              trunc_abbrev <- trunc_abbrev[-pos_char]
            } else if (result == TRUE){
              ## one slack
              result <- FALSE
            } else {
              output <- list(res = FALSE,"","","","")
              break
            }
          }
        } else {
          output <- list(res = FALSE,"","","","")
        }

        ## test if letters did match search word
        if (output[[1]]) {
          ## for plural search words
          if (char_list[length(char_list)] == "s" &&
              substr(ext_searchword, nchar(ext_searchword),nchar(ext_searchword)) == "s"){
            abbrev_plural <- paste(char_list, collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,")$*")
            abbrev_singular <- paste(char_list[-length(char_list)], collapse = "")
            simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
            replacement_singular <- paste0(abbrev_singular," (",simple_searchword,")$*")
          } else {
            abbrev_singular <- paste(char_list, collapse = "")
            replacement_singular <- paste0(abbrev_singular," (",ext_searchword,")$*")
            abbrev_plural <- paste(c(char_list,"s"), collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,"s)$*")
          }
          output <- list(res = TRUE,abbrev_singular = abbrev_singular,
                         replacement_singular = replacement_singular,
                         abbrev_plural = abbrev_plural,replacement_plural = replacement_plural)
        }
      } else {
        output <- list(res = FALSE,"","","","")
      }
    } else {
      ## abbrev found, abbrev_singular, replacement_singular, abbrev_plural, replacement_plural
      output <- list(res = FALSE,"","","","")
    }
    return(output)
  }

  test_if_abbrev_double_dots_or_equal <- function(searchword, paragraph, ignore.case.sw) {
    ## Test whether `searchword` is directly preceded by its abbreviation and
    ## ":" or "=", e.g. "PCR: polymerase chain reaction", and whether the
    ## abbreviation letters match the initial letters of the words of the
    ## search word (one mismatch of slack allowed).
    ##
    ## Args:
    ##   searchword: search word (used as a regular expression).
    ##   paragraph: text in which the search word was found.
    ##   ignore.case.sw: logical; case-insensitive search word matching.
    ## Returns a list:
    ##   res: TRUE if a matching abbreviation was found, otherwise FALSE.
    ##   abbrev_singular, replacement_singular, abbrev_plural,
    ##   replacement_plural: abbreviation/replacement strings ("" if res FALSE).
    output <- list(res = TRUE,"","","","")
    ## get the position of the search word in the paragraph
    pos_searchword_start <- regexpr(searchword,paragraph, ignore.case = ignore.case.sw)[[1]]
    pos_searchword_end <- pos_searchword_start + attr(regexpr(searchword,paragraph,
                                                              ignore.case = ignore.case.sw),
                                                      "match.length") - 1
    ## test if : or = is within the three characters before the search word
    minus_three_chars <- substr(paragraph, (pos_searchword_start - 3),(pos_searchword_start - 1))
    if (grepl("(:|=)", minus_three_chars)) {
      ## extend the search word up to the next non-alphanumeric character
      ## NOTE(review): when no such character occurs within the next three
      ## characters, regexpr returns -1 and the extended search word is cut
      ## two characters short — confirm whether this is intended.
      pos_ext_searchword_end <- (regexpr("[^[:alnum:] ]",
                                         substr(paragraph, pos_searchword_end + 1,
                                                pos_searchword_end + 3)))[[1]] +
        pos_searchword_end - 1
      ext_searchword <- substr(paragraph, pos_searchword_start, pos_ext_searchword_end)
      ## collect all letters/digits directly before the : or =
      current_char_pos <- pos_searchword_start - 2 - lengths(regmatches(minus_three_chars,
                                                                        gregexpr(" ", minus_three_chars)))
      current_char <- substr(paragraph, current_char_pos, current_char_pos)
      char_list <- NULL
      while (grepl("[A-z|0-9]", current_char)){
        char_list <- c(current_char, char_list)
        current_char_pos <- current_char_pos - 1
        current_char <- substr(paragraph, current_char_pos, current_char_pos)
      }

      char_list <- char_list[!(char_list %in% c("(", ")","[", "]", "/",
                                                "{","}","\\"))]

      ## test if letters for an abbreviation were found
      if (length(char_list) > 0){
        trunc_searchword <- gsub("[^[:alnum:] ]","",ext_searchword)
        ## match each abbreviation letter, in order, to the search word
        for (n in 1:length(char_list)){
          pos_char <- regexpr(char_list[n], trunc_searchword, ignore.case = TRUE)[[1]]
          removed.chars <- substr(trunc_searchword, 1, pos_char)
          ## accept only if the letter was found and no whole word of the
          ## search word was skipped over
          if (pos_char > 0 && !length(gregexpr(" ",removed.chars)[[1]]) > 1) {
            trunc_searchword <- substr(trunc_searchword, pos_char + 1, nchar(trunc_searchword))
          } else {
            output <- list(res = FALSE,"","","","")
            break
          }
        }

        ## test if each word in extended searchword has a letter in the abbreviation
        list.of.words <- gsub("[^A-z]","",
                              strsplit(ext_searchword," ")[[1]])[!(gsub("[^A-z]","",
                                                                        strsplit(ext_searchword,
                                                                                 " ")[[1]]) %in% "")]

        ## allow one slack
        if (length(list.of.words) - 1 <= length(char_list)){
          result <- TRUE
          trunc_abbrev <- char_list
          ## match the first letter of each word to the abbreviation letters
          for (n in 1:length(list.of.words)){
            ## (bug fix: without this guard, exhausting the abbreviation
            ## letters before the words made regexpr(...)[[1]] fail with
            ## "subscript out of bounds" on the empty vector; mirror the
            ## slack guard of test_if_abbrev_in_parantheses instead)
            if (length(trunc_abbrev) == 0){
              if (result == TRUE){
                ## one slack
                result <- FALSE
                next
              } else {
                output <- list(res = FALSE,"","","","")
                break
              }
            }
            pos_char <- regexpr(substr(list.of.words[n],1,1), trunc_abbrev, ignore.case = TRUE)[[1]]
            removed.chars <- trunc_abbrev[pos_char]
            ## if the word's first letter matches the next abbreviation letter
            if (pos_char > 0 && !nchar(removed.chars) > 1) {
              trunc_abbrev <- trunc_abbrev[-pos_char]
            } else if (result == TRUE){
              ## one slack
              result <- FALSE
            } else {
              output <- list(res = FALSE,"","","","")
              break
            }
          }
        } else {
          output <- list(res = FALSE,"","","","")
        }

        ## test if letters did match search word
        if (output[[1]]) {
          ## for plural search words
          if (char_list[length(char_list)] == "s" &&
              substr(ext_searchword, nchar(ext_searchword),nchar(ext_searchword)) == "s"){
            abbrev_plural <- paste(char_list, collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,")$*")
            abbrev_singular <- paste(char_list[-length(char_list)], collapse = "")
            simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
            replacement_singular <- paste0(abbrev_singular," (",simple_searchword,")$*")
          } else {
            abbrev_singular <- paste(char_list, collapse = "")
            replacement_singular <- paste0(abbrev_singular," (",ext_searchword,")$*")
            abbrev_plural <- paste(c(char_list,"s"), collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,"s)$*")
          }
          output <- list(res = TRUE,abbrev_singular = abbrev_singular,
                         replacement_singular = replacement_singular,
                         abbrev_plural = abbrev_plural,replacement_plural = replacement_plural)
        }
      } else {
        output <- list(res = FALSE,"","","","")
      }
    } else {
      ## abbrev found, abbrev_singular, replacement_singular, abbrev_plural, replacement_plural
      output <- list(res = FALSE,"","","","")
    }
    return(output)
  }

  deletefile <- function(verbose=TRUE) {
    ## Delete the temporary txt/keeplayout-txt/html copies of the PDF when
    ## `delete` (enclosing scope) is TRUE.
    ##
    ## Args:
    ##   verbose: print a message for every file that could not be deleted.
    ## Returns: character vector of "Could not delete:" messages, or NULL.
    ## Uses `delete`, `txtpath`, `keeplayouttxtpath`, `htmlpath` from the
    ## enclosing scope.
    out_msg <- NULL

    if (delete == TRUE) {
      ## clean up
      ## (bug fix: the original tested exists(), which checks for an R OBJECT
      ## named like the path, not for a file on disk; file.exists() is the
      ## correct check after unlink())
      unlink(txtpath, recursive = TRUE)
      if (file.exists(txtpath)) {
        out_msg <- c(out_msg, paste0("Could not delete:", txtpath))
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      }
      unlink(keeplayouttxtpath, recursive = TRUE)
      if (file.exists(keeplayouttxtpath)) {
        out_msg <- c(out_msg, paste0("Could not delete:", keeplayouttxtpath))
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      }
      unlink(htmlpath, recursive = TRUE)
      ## (bug fix: the original re-tested keeplayouttxtpath here, a
      ## copy-paste error, so a failure to delete the html copy was never
      ## reported)
      if (file.exists(htmlpath)) {
        out_msg <- c(out_msg, paste0("Could not delete:", htmlpath))
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      }
    }
    return(out_msg)
  }

  replace.html.entity <- function(input.with.html) {
    ## Replace HTML character entities by their unicode characters.
    ##
    ## Args:
    ##   input.with.html: character vector possibly containing HTML entities.
    ## Returns: the input with entities decoded (and the "fi" ligature
    ##   U+FB01 expanded to "fi").
    ## (bug fix: &deg; through &para; were shifted by one codepoint each,
    ## e.g. &deg; decoded to U+00B1 instead of U+00B0; &circ; decoded to
    ## U+00FE (thorn) instead of U+02C6; &middot; (U+00B7) was missing.)
    output.without.html <- input.with.html
    output.without.html <- gsub("&amp;","&",output.without.html)
    output.without.html <- gsub("&lt;","<",output.without.html)
    output.without.html <- gsub("&gt;",">",output.without.html)
    output.without.html <- gsub("&#160;"," ",output.without.html)
    output.without.html <- gsub("&Agrave;",intToUtf8(0x00C0),output.without.html)
    output.without.html <- gsub("&Aacute;",intToUtf8(0x00C1),output.without.html)
    output.without.html <- gsub("&Acirc;",intToUtf8(0x00C2),output.without.html)
    output.without.html <- gsub("&Atilde;",intToUtf8(0x00C3),output.without.html)
    output.without.html <- gsub("&Auml;",intToUtf8(0x00C4),output.without.html)
    output.without.html <- gsub("&Aring;",intToUtf8(0x00C5),output.without.html)
    output.without.html <- gsub("&AElig;",intToUtf8(0x00C6),output.without.html)
    output.without.html <- gsub("&Ccedil;",intToUtf8(0x00C7),output.without.html)
    output.without.html <- gsub("&Egrave;",intToUtf8(0x00C8),output.without.html)
    output.without.html <- gsub("&Eacute;",intToUtf8(0x00C9),output.without.html)
    output.without.html <- gsub("&Ecirc;",intToUtf8(0x00CA),output.without.html)
    output.without.html <- gsub("&Euml;",intToUtf8(0x00CB),output.without.html)
    output.without.html <- gsub("&Igrave;",intToUtf8(0x00CC),output.without.html)
    output.without.html <- gsub("&Iacute;",intToUtf8(0x00CD),output.without.html)
    output.without.html <- gsub("&Icirc;",intToUtf8(0x00CE),output.without.html)
    output.without.html <- gsub("&Iuml;",intToUtf8(0x00CF),output.without.html)
    output.without.html <- gsub("&ETH;",intToUtf8(0x00D0),output.without.html)
    output.without.html <- gsub("&Ntilde;",intToUtf8(0x00D1),output.without.html)
    output.without.html <- gsub("&Ograve;",intToUtf8(0x00D2),output.without.html)
    output.without.html <- gsub("&Oacute;",intToUtf8(0x00D3),output.without.html)
    output.without.html <- gsub("&Ocirc;",intToUtf8(0x00D4),output.without.html)
    output.without.html <- gsub("&Otilde;",intToUtf8(0x00D5),output.without.html)
    output.without.html <- gsub("&Ouml;",intToUtf8(0x00D6),output.without.html)
    output.without.html <- gsub("&Oslash;",intToUtf8(0x00D8),output.without.html)
    output.without.html <- gsub("&Ugrave;",intToUtf8(0x00D9),output.without.html)
    output.without.html <- gsub("&Uacute;",intToUtf8(0x00DA),output.without.html)
    output.without.html <- gsub("&Ucirc;",intToUtf8(0x00DB),output.without.html)
    output.without.html <- gsub("&Uuml;",intToUtf8(0x00DC),output.without.html)
    output.without.html <- gsub("&Yacute;",intToUtf8(0x00DD),output.without.html)
    output.without.html <- gsub("&THORN;",intToUtf8(0x00DE),output.without.html)
    output.without.html <- gsub("&szlig;",intToUtf8(0x00DF),output.without.html)
    output.without.html <- gsub("&agrave;",intToUtf8(0x00E0),output.without.html)
    output.without.html <- gsub("&aacute;",intToUtf8(0x00E1),output.without.html)
    output.without.html <- gsub("&acirc;",intToUtf8(0x00E2),output.without.html)
    output.without.html <- gsub("&atilde;",intToUtf8(0x00E3),output.without.html)
    output.without.html <- gsub("&auml;",intToUtf8(0x00E4),output.without.html)
    output.without.html <- gsub("&aring;",intToUtf8(0x00E5),output.without.html)
    output.without.html <- gsub("&aelig;",intToUtf8(0x00E6),output.without.html)
    output.without.html <- gsub("&ccedil;",intToUtf8(0x00E7),output.without.html)
    output.without.html <- gsub("&egrave;",intToUtf8(0x00E8),output.without.html)
    output.without.html <- gsub("&eacute;",intToUtf8(0x00E9),output.without.html)
    output.without.html <- gsub("&ecirc;",intToUtf8(0x00EA),output.without.html)
    output.without.html <- gsub("&euml;",intToUtf8(0x00EB),output.without.html)
    output.without.html <- gsub("&igrave;",intToUtf8(0x00EC),output.without.html)
    output.without.html <- gsub("&iacute;",intToUtf8(0x00ED),output.without.html)
    output.without.html <- gsub("&icirc;",intToUtf8(0x00EE),output.without.html)
    output.without.html <- gsub("&iuml;",intToUtf8(0x00EF),output.without.html)
    output.without.html <- gsub("&eth;",intToUtf8(0x00F0),output.without.html)
    output.without.html <- gsub("&ntilde;",intToUtf8(0x00F1),output.without.html)
    output.without.html <- gsub("&ograve;",intToUtf8(0x00F2),output.without.html)
    output.without.html <- gsub("&oacute;",intToUtf8(0x00F3),output.without.html)
    output.without.html <- gsub("&ocirc;",intToUtf8(0x00F4),output.without.html)
    output.without.html <- gsub("&otilde;",intToUtf8(0x00F5),output.without.html)
    output.without.html <- gsub("&ouml;",intToUtf8(0x00F6),output.without.html)
    output.without.html <- gsub("&oslash;",intToUtf8(0x00F8),output.without.html)
    output.without.html <- gsub("&ugrave;",intToUtf8(0x00F9),output.without.html)
    output.without.html <- gsub("&uacute;",intToUtf8(0x00FA),output.without.html)
    output.without.html <- gsub("&ucirc;",intToUtf8(0x00FB),output.without.html)
    output.without.html <- gsub("&uuml;",intToUtf8(0x00FC),output.without.html)
    output.without.html <- gsub("&yacute;",intToUtf8(0x00FD),output.without.html)
    output.without.html <- gsub("&thorn;",intToUtf8(0x00FE),output.without.html)
    output.without.html <- gsub("&yuml;",intToUtf8(0x00FF),output.without.html)
    output.without.html <- gsub("&iexcl;",intToUtf8(0x00A1),output.without.html)
    output.without.html <- gsub("&cent;",intToUtf8(0x00A2),output.without.html)
    output.without.html <- gsub("&pound;",intToUtf8(0x00A3),output.without.html)
    output.without.html <- gsub("&curren;",intToUtf8(0x00A4),output.without.html)
    output.without.html <- gsub("&yen;",intToUtf8(0x00A5),output.without.html)
    output.without.html <- gsub("&brvbar;",intToUtf8(0x00A6),output.without.html)
    output.without.html <- gsub("&sect;",intToUtf8(0x00A7),output.without.html)
    output.without.html <- gsub("&uml;",intToUtf8(0x00A8),output.without.html)
    output.without.html <- gsub("&copy;",intToUtf8(0x00A9),output.without.html)
    output.without.html <- gsub("&ordf;",intToUtf8(0x00AA),output.without.html)
    output.without.html <- gsub("&laquo;",intToUtf8(0x00AB),output.without.html)
    output.without.html <- gsub("&not;",intToUtf8(0x00AC),output.without.html)
    output.without.html <- gsub("&reg;",intToUtf8(0x00AE),output.without.html)
    output.without.html <- gsub("&macr;",intToUtf8(0x00AF),output.without.html)
    output.without.html <- gsub("&deg;",intToUtf8(0x00B0),output.without.html)
    output.without.html <- gsub("&plusmn;",intToUtf8(0x00B1),output.without.html)
    output.without.html <- gsub("&sup2;",intToUtf8(0x00B2),output.without.html)
    output.without.html <- gsub("&sup3;",intToUtf8(0x00B3),output.without.html)
    output.without.html <- gsub("&acute;",intToUtf8(0x00B4),output.without.html)
    output.without.html <- gsub("&micro;",intToUtf8(0x00B5),output.without.html)
    output.without.html <- gsub("&para;",intToUtf8(0x00B6),output.without.html)
    output.without.html <- gsub("&middot;",intToUtf8(0x00B7),output.without.html)
    output.without.html <- gsub("&cedil;",intToUtf8(0x00B8),output.without.html)
    output.without.html <- gsub("&sup1;",intToUtf8(0x00B9),output.without.html)
    output.without.html <- gsub("&ordm;",intToUtf8(0x00BA),output.without.html)
    output.without.html <- gsub("&raquo;",intToUtf8(0x00BB),output.without.html)
    output.without.html <- gsub("&frac14;",intToUtf8(0x00BC),output.without.html)
    output.without.html <- gsub("&frac12;",intToUtf8(0x00BD),output.without.html)
    output.without.html <- gsub("&frac34;",intToUtf8(0x00BE),output.without.html)
    output.without.html <- gsub("&iquest;",intToUtf8(0x00BF),output.without.html)
    output.without.html <- gsub("&times;",intToUtf8(0x00D7),output.without.html)
    output.without.html <- gsub("&divide;",intToUtf8(0x00F7),output.without.html)
    output.without.html <- gsub("&circ;",intToUtf8(0x02C6),output.without.html)
    output.without.html <- gsub("&tilde;",intToUtf8(0x007E),output.without.html)
    ## expand the "fi" ligature produced by some PDF fonts
    output.without.html <- gsub(intToUtf8(0xFB01),"fi",output.without.html, fixed = TRUE)
    return(output.without.html)
  }

  insert.html.entity <- function(input.without.html) {
    ## Replace unicode characters by their HTML character entities (inverse
    ## of replace.html.entity).
    ##
    ## Args:
    ##   input.without.html: character vector possibly containing non-ASCII
    ##     characters.
    ## Returns: the input with the known characters encoded as HTML entities.
    ## (bug fix: U+00B1 through U+00B7 were mapped to the entity of the
    ## NEXT-lower codepoint, e.g. U+00B1 encoded as &deg; instead of
    ## &plusmn;; U+00FE was mapped twice, so the &circ; line was
    ## unreachable — &circ; is now correctly U+02C6; &middot; (U+00B7) added.)
    output.with.html <- input.without.html
    output.with.html <- gsub(intToUtf8(0x00C0),"&Agrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C1),"&Aacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C2),"&Acirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C3),"&Atilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C4),"&Auml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C5),"&Aring;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C6),"&AElig;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C7),"&Ccedil;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C8),"&Egrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C9),"&Eacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CA),"&Ecirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CB),"&Euml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CC),"&Igrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CD),"&Iacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CE),"&Icirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CF),"&Iuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D0),"&ETH;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D1),"&Ntilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D2),"&Ograve;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D3),"&Oacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D4),"&Ocirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D5),"&Otilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D6),"&Ouml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D8),"&Oslash;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D9),"&Ugrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DA),"&Uacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DB),"&Ucirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DC),"&Uuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DD),"&Yacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DE),"&THORN;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DF),"&szlig;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E0),"&agrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E1),"&aacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E2),"&acirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E3),"&atilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E4),"&auml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E5),"&aring;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E6),"&aelig;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E7),"&ccedil;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E8),"&egrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E9),"&eacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EA),"&ecirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EB),"&euml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EC),"&igrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00ED),"&iacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EE),"&icirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EF),"&iuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F0),"&eth;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F1),"&ntilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F2),"&ograve;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F3),"&oacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F4),"&ocirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F5),"&otilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F6),"&ouml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F8),"&oslash;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F9),"&ugrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FA),"&uacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FB),"&ucirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FC),"&uuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FD),"&yacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FE),"&thorn;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FF),"&yuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A1),"&iexcl;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A2),"&cent;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A3),"&pound;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A4),"&curren;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A5),"&yen;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A6),"&brvbar;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A7),"&sect;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A8),"&uml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A9),"&copy;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AA),"&ordf;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AB),"&laquo;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AC),"&not;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AE),"&reg;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AF),"&macr;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B0),"&deg;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B1),"&plusmn;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B2),"&sup2;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B3),"&sup3;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B4),"&acute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B5),"&micro;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B6),"&para;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B7),"&middot;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B8),"&cedil;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B9),"&sup1;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BA),"&ordm;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BB),"&raquo;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BC),"&frac14;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BD),"&frac12;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BE),"&frac34;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BF),"&iquest;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D7),"&times;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F7),"&divide;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x02C6),"&circ;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x007E),"&tilde;",output.with.html)
    return(output.with.html)
  }

  update_progress_info <- function(print_message){
    ## Append `print_message` to the tcltk progress combobox of the PDE GUI
    ## and make it the displayed entry. No-op when no GUI progress textbox
    ## is registered in PDE.globals (i.e. when running non-interactively).
    ##
    ## Args:
    ##   print_message: single character string to display.
    if (length(PDE.globals$le.progress.textbox) > 0){
      ## fetch the current list once (the original queried the Tcl widget
      ## three times for the same value)
      current_list <- tcltk2::tk2list.get(PDE.globals$le.progress.textbox)
      if (length(current_list) > 3) {
        ## drop empty entries once the list has grown beyond the header rows
        new_list <- current_list[!grepl("^$", current_list)]
      } else {
        new_list <- current_list
      }
      tcltk::tkconfigure(PDE.globals$le.progress.textbox,values = c(new_list,print_message))
      tcltk::tkconfigure(PDE.globals$le.progress.textbox,textvariable = tcltk::tclVar(print_message))
      ## force the GUI to redraw immediately
      tcltk::tcl("update")
    }
  }
  
  remove_backref <- function(x) {
    ## Strip all backslashes from each element of `x`, so the strings can be
    ## used e.g. as gsub replacements without accidental backreferences.
    ##
    ## Args:
    ##   x: character vector.
    ## Returns: `x` with every backslash removed.
    ## NOTE(review): the original per-character loop compared single
    ## characters against the two-character string "\\" and therefore never
    ## matched (dead code); its observable behavior was the unconditional
    ## gsub at the end, i.e. "remove every backslash", which is preserved
    ## here.
    gsub("\\\\", "", x)
  }

  ## set all indicator variables ---------------------------
  integrity.indicator <- TRUE ## indicates if txt, keeplayouttxt and html copy of the PDF file are created correctly
  filterwords.go <- FALSE ## indicator if filter words were found or not set
  searchwords.go <- FALSE ## indicator if search words were found or not set
  nexti <- FALSE ## indicator for html table to be processed
  output_files <- NULL ## this is the output to return at the end
  out_msg <- NULL ## accumulates every log message produced while processing this PDF

  ## set the paths of the files ---------------------------------
  ## Derive the plain-text and layout-preserving text file paths from the PDF
  ## path. NOTE(review): in ".pdf[^.pdf]*$" the leading "." is an unescaped
  ## regex wildcard (matches any character before "pdf"); "\\.pdf" would be
  ## the precise pattern, though this works for normal file names.
  output <- NULL
  pdfpath <- pdf
  txtpath <- gsub(".pdf[^.pdf]*$", ".txt", pdfpath)
  keeplayouttxtpath <- gsub(".pdf[^.pdf]*$", "_keeplayout.txt",
                            pdfpath)

  ## create the id and output dir ----------------------------------
  dir.create(out, showWarnings = FALSE)
  ## id = file name without extension; used as the prefix of all output files
  id <- sub("^(.*)\\..*$", "\\1", basename(txtpath))
  print_message <- paste0("Following file is processing: \'",id,".pdf\'")
  out_msg <- c(out_msg, print_message)
  if (verbose) cat(utils::tail(out_msg,1), sep="\n")
  ## 1) Create txt and html copies of PDF file ---------------------------------------
  ## test of Xpdftools are installed
  ## XPDF_DIR.config (written when the Xpdf tools are installed) lists the
  ## locations of the pdftotext, pdftohtml and pdftopng binaries.
  xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
  if (file.exists(xpdf_config_location)){
    pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)
    
    pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)
    
    pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
    
    ## NOTE(review): length(file.exists(x)) == 0 is TRUE only when x itself is
    ## empty (i.e. the tool is not listed in the config at all); it does NOT
    ## verify that the listed path exists on disk -- !all(file.exists(x))
    ## would do that. Confirm before changing, since PDE_check_Xpdf_install()
    ## may rely on this exact trigger.
    if (length(file.exists(pdftotext_location)) == 0 ||
        length(file.exists(pdftohtml_location)) == 0 ||
        length(file.exists(pdftopng_location)) == 0){
      install.test <- PDE_check_Xpdf_install(verbose=verbose)
    } else {
      install.test <- TRUE
    }
      
  } else {
    install.test <- PDE_check_Xpdf_install(verbose=verbose)
  }
  if (install.test == FALSE) {
    ## PDE_check_Xpdf_install() attaches the failure reason as attribute "msg"
    stop(attributes(install.test)$msg)
  } else {
    ## re-read the config; presumably PDE_check_Xpdf_install() (re)created it
    ## when it was missing above -- TODO confirm
    pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)
    
    pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)
    
    pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
  }
  
  ## pdftotext -layout: text copy that preserves the visual page layout
  ## (used later for table extraction)
  system(paste0("\"",pdftotext_location,"\" -layout",
                " \"", pdfpath, "\" \"", keeplayouttxtpath,
                "\""), wait = TRUE, ignore.stderr = TRUE)
  ## pdftotext without -layout: plain text in reading order
  system(paste0("\"",pdftotext_location,"\" \"", pdfpath,
                "\" \"", txtpath, "\""), wait = TRUE,
         ignore.stderr = TRUE)
  htmlpath <- gsub(".pdf[^.pdf]*$", ".html", pdfpath)
  ## convert PDF to HTML
  system(paste0("\"",pdftohtml_location,"\" \"", pdfpath,
                "\" \"", htmlpath, "\""), wait = TRUE,
         ignore.stderr = TRUE)
  
  ## add completion  info
  update_progress_info(print_message)
  ## 2.1) Check txt and html file integrity ----------------------------------
  ## Verify that the pdftotext/pdftohtml conversions produced usable output.
  ## Any failure sets integrity.indicator to FALSE, writes a marker file into
  ## a subfolder of 'out' (secured/ or nr/), and all later sections skip.
  integrity.indicator <- TRUE

  ## check if html was created (if pdf is secured)
  if (!dir.exists(htmlpath) || !file.exists(paste0(htmlpath, "/index.html"))) {
    ## export error and do not remove file change
    print_message <- paste0(id, " is most likely secured and cannot be processed!")
    out_msg <- c(out_msg, print_message)
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    update_progress_info(print_message)
    dir.create(paste0(out,"/secured"), showWarnings = FALSE)
    write(paste0(pdfpath, " is most likely secured and cannot be processed!"),
          file = paste0(out,"/secured/",id, "_is_secured.txt"))
    integrity.indicator <- FALSE
    ## if html was created
  } else {
    ## read in the txt files
    txtcontent <- readin_txt(txtpath)
    splittxtcontent <- page_splits(txtcontent)
    keeplayouttxtcontent <- readin_txt(keeplayouttxtpath)
    indexcontent <- readLines(paste0(htmlpath, "/index.html"))
    ## if the txt or html files have no content
    if (identical(indexcontent, "")) {
      ## export error and do not remove file change
      print_message <- paste0(id, " is most likely secured and cannot be processed!")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
      dir.create(paste0(out,"/secured"), showWarnings = FALSE)
      write(paste0(pdfpath, " is most likely secured and cannot be processed!"),
            file = paste0(out,"/secured/",id, "_is_secured.txt"))
      integrity.indicator <- FALSE
    ## empty text output (even after stripping form feeds) means a scanned /
    ## image-only document
    } else if (identical(txtcontent, "") || identical(keeplayouttxtcontent, "") ||
               identical(gsub("\f","",txtcontent), "") || identical(gsub("\f","",keeplayouttxtcontent), "")  ) {
      print_message <- paste0(id, " most likely contains no text content or is a scanned document!")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
      dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
      write(paste0(pdfpath, " most likely contains no text content or is a scanned document!"),
            file = paste0(out, "/nr/", id, "_non-readable.txt"))
      integrity.indicator <- FALSE
      ## when txtcontent is there and html index was created
    } else {
      ## extract all the page names ##
      ## index.html links each page file as <a href="pageNNN.html">; pull the
      ## substring between 'a href="' and the end of "html"
      pages <- NULL
      for (line in indexcontent) {
        if (grepl("a href=\"", line)) pages <- c(pages,
                                                 substr(line, regexpr("a href=\"", line) + 8,
                                                         regexpr("html", line) + 3))
      }

      ## read in the html file ##
      htmlcontent <- vector("list", length(pages))
      for (i in 1:length(pages)) {
        htmlpagecontent <- readLines(paste0(htmlpath,"/", pages[i]), encoding = "UTF-8", warn = FALSE)
        if (identical(htmlpagecontent, "")) {
          print_message <- paste0(id, " most likely contains no text content or is a scanned in document!")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
          dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
          write(paste0(pdfpath, " most likely contains no text content or is a scanned in document!"),
                file = paste0(out, "/nr/", id, "_non-readable.txt"))
          integrity.indicator <- FALSE
          break
        }
        ## replace p styles
        ## get the list of p styles
        ## pdftohtml emits CSS class definitions like ".ft0{font-size:10px;...}";
        ## collect each (class name, style body) pair ...
        list_of_fts <- NULL
        lines_with_ft <- grep("\\.ft",htmlpagecontent)
        if (length(lines_with_ft) > 0){
          for (ln in lines_with_ft){
            list_of_fts <- rbind(list_of_fts,
                                 cbind(ft=sub("\\{.*$","",substr(htmlpagecontent[ln],
                                           regexpr("\\.ft",htmlpagecontent[ln])[1]+1,
                                            nchar(htmlpagecontent[ln]))),
                                 style=sub("\\}.*$","",sub("^.*\\{","",substr(htmlpagecontent[ln],
                                                               regexpr("\\.ft",htmlpagecontent[ln])[1]+1,
                                                               nchar(htmlpagecontent[ln]))))))
                                                                      
          }
          ## replace each ft style in that page
          ## ... then inline them: '" class="ftN"' becomes ';<style body>"' so
          ## every element carries its full style attribute for later parsing
          for (ft.numb in 1:nrow(list_of_fts)){
            htmlpagecontent <- gsub(paste0("\" class=\"",list_of_fts[ft.numb,1],"\""),
                 paste0(";",list_of_fts[ft.numb,2],"\""),htmlpagecontent)
          }
        }
        htmlcontent[[i]] <- htmlpagecontent
      }
    } ## end if the txt or html files have no content
  } ## end if (!dir.exists(htmlpath))
  
  if (integrity.indicator == TRUE) {
    ## if the file is only images or empty then don't process
    ## Strip leading form feed, tabs and spaces; if nothing remains the
    ## document has no readable text.
    realcontent <- gsub("^\\f", "", paste(txtcontent,
                                         collapse = ""))
    realcontent <- gsub("\t", "", realcontent)
    realcontent <- gsub(" ", "", realcontent)
    if (realcontent == "") {
      integrity.indicator <- FALSE
      print_message <- paste0(id, ".pdf has no readable content")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
      ## write an empty file
      if (write.txt.doc.file == TRUE) {
        dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
        write(paste0(pdfpath, " has no readable content in PDF file"),
              file = paste0(out, "/nr/", id, "_non-readable.txt"))
        ## NOTE(review): redundant -- integrity.indicator was already set to
        ## FALSE just above, unconditionally
        integrity.indicator <- FALSE
      }
    }
  }

  ## if there was an issue with creating files
  if (integrity.indicator == FALSE) {
    output_files <- NULL
    ##if everything is ok
  } else {
    ## 2.2) Check all the options chosen for PDE analyzer ------------------------------
    ## Validates user-supplied options; invalid values raise a tcltk message
    ## box (for the GUI) and then stop(). Word arguments given as plain
    ## vectors are normalized into data.frames with an ignore.case column.
    ## if general extraction (search words is undefined), then context <- 0
    if (search.words[1] == "" || search.words[1] == "*" ||
        search.words[1] == ".") {
      context <- 0
      search.words <- ""
    }
    ## the self-assignment is a deliberate no-op: FALSE is valid; anything
    ## that is neither FALSE nor TRUE falls into the error branch
    if (ignore.case.sw == FALSE) ignore.case.sw <- FALSE
    else if (!ignore.case.sw == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "ignore.case.sw: ignore.case.sw has to be either TRUE or FALSE")
      stop("ignore.case.sw: ignore.case.sw has to be either TRUE or FALSE")
    }
    if (is.na(filter.words[1])) filter.words <- ""
    ## plain character vector -> data.frame(words, ignore.case.fw)
    if (is.null(ncol(filter.words))) filter.words <- data.frame(words = filter.words,
                                                                ignore.case.fw = ignore.case.fw)
    if (!is.numeric(filter.word.times)) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "filter.word.times: has to be a number")
      stop("filter.word.times: has to be a number")
    }
    if (is.null(ncol(table.heading.words))) table.heading.words <- data.frame(words = table.heading.words,
                                                                              ignore.case.th = ignore.case.th)
    if (is.null(ncol(search.words))) search.words <- data.frame(words = search.words,
                                                                ignore.case.sw = ignore.case.sw)
    if (write.table.locations == FALSE) write.table.locations <- FALSE
    else if (!write.table.locations == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "write.table.locations: has to be either TRUE or FALSE")
      stop("write.table.locations: has to be either TRUE or FALSE")
    }
    if (is.na(out.table.format)) {
       out.table.format <- ".csv (WINDOWS-1252)"
    } else if (!(out.table.format %in% c(".csv (WINDOWS-1252)", ".csv (macintosh)", ".csv (UTF-8)", 
                       ".tsv (WINDOWS-1252)",".tsv (macintosh)",".tsv (UTF-8)"))) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = paste("out.table.format: has to be either .csv (WINDOWS-1252), .csv (macintosh), .csv (UTF-8)",
                                          "or .tsv (WINDOWS-1252), .tsv (macintosh), .tsv (UTF-8)"))
      stop(paste("out.table.format: has to be either .csv (WINDOWS-1252), .csv (macintosh), .csv (UTF-8)",
                 "or .tsv (WINDOWS-1252), .tsv (macintosh), .tsv (UTF-8)"))
    }
    ## derive separator, file extension and encoding used by all table/txt
    ## writers further down
    if (grepl("csv", out.table.format)) {
      out.table.separator <- ","
      out.table.ext <- ".csv"
    }
    if (grepl("tsv", out.table.format)) {
      out.table.separator <- "\t"
      out.table.ext <- ".tsv"
    }
    if (grepl("WINDOWS-1252", out.table.format)) {
      out.encoding <- "WINDOWS-1252"
    } else if (grepl("macintosh", out.table.format)) {
      out.encoding <- "macintosh"
    } else if (grepl("UTF-8", out.table.format)) {
      out.encoding <- "UTF-8"
    } else {
      out.encoding <- "WINDOWS-1252"
    }
    
    
    if (!is.numeric(dev)) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "dev: has to be a number")
      stop("dev: has to be a number")  ## deviation between cell positions, might 
                                       ##have to be increased if words that should be in the same column
    }
    if (!is.numeric(context)) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "context: has to be a number")
      stop("context: has to be a number")  ## +/- context number of sentences 
                                           ## before and after the search word was found will be put out
    }
    if (write.tab.doc.file == FALSE) write.tab.doc.file <- FALSE
    else if (!write.tab.doc.file == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "write.tab.doc.file: has to be either TRUE or FALSE")
      stop("write.tab.doc.file: has to be either TRUE or FALSE")
    }
    if (write.txt.doc.file == FALSE) write.txt.doc.file <- FALSE
    else if (!write.txt.doc.file == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "write.txt.doc.file: has to be either TRUE or FALSE")
      stop("write.txt.doc.file: has to be either TRUE or FALSE")
    }
  }

  ## reset variables
  ## Clear the per-document result holders before extraction begins.
  if (integrity.indicator == TRUE) {
    output <- NULL
    htmltablelines <- NULL
    keeplayouttxttablelines <- NULL
    txttablelines <- NULL
  }

  ## 2.3) Make content ----------------------------------------
  ## Build the master 'content' list used by all later search/replace passes:
  ##   content[[1]] = plain txt, content[[2]] = layout-preserving txt,
  ##   content[[3..]] = one tag-stripped text rendering per html page.
  ## Fixes vs. previous revision: inherits(res, "try-error") instead of
  ## class(res) == "try-error" (class() may return a vector; == in if is
  ## fragile and errors on length > 1 conditions in modern R), and
  ## seq_along() instead of 1:length() (safe on empty pages).
  if (integrity.indicator == TRUE) {
    content <- list(txtcontent, keeplayouttxtcontent)

    ## make a variable only with the txt content
    txthtmlcontent <- htmlcontent

    ## go through pages
    for (j in seq_along(txthtmlcontent)) {
      ## go through each html line
      for (z in seq_along(txthtmlcontent[[j]])) {
        ## Removing the Table wording
        line <- txthtmlcontent[[j]][z]
        line.txthtmlcontent <- ""
        ## lines that are not valid UTF-8 are converted byte-wise to latin1
        res <- try(utf8ToInt(line),silent = TRUE)
        if (inherits(res, "try-error")) line <- iconv(line, 'UTF-8', 'latin1', 'byte')
        ##replace different html formating
        line <- gsub("</p>","</span>",line)
        ## replace line break with space (not optimal but better for searching)
        line <- gsub("<br/>"," ",line)
        ## remove residue from hyperlinks
        line <- gsub("<a href=\".*?>","",line)
        line <- gsub("</a>","",line)
        ## Scan the reversed line for reversed "</span>" tags and collect the
        ## text preceding each closing tag (i.e. the visible span contents),
        ## then un-reverse each extracted chunk.
        rev.line <- intToUtf8(rev(utf8ToInt(line)))
        for (spanpos in rev(gregexpr(">naps/<",rev.line)[[1]])){
          add.txthtmlcontent <- substr(rev.line,spanpos+7, regexpr(">\"",
                                                                   substr(rev.line,spanpos+7, 
                                                                          nchar(rev.line)))+spanpos+7-2)
          line.txthtmlcontent <- paste0(line.txthtmlcontent,intToUtf8(rev(utf8ToInt(add.txthtmlcontent))))
        }
        ## decode html entities (&amp; etc.) only when present
        if (grepl("&",line.txthtmlcontent)){
          txthtmlcontent[[j]][z] <- replace.html.entity(line.txthtmlcontent)
        } else {
          txthtmlcontent[[j]][z] <- line.txthtmlcontent
        }
      }  ## end go through each line z
      content[[(j+2)]] <- txthtmlcontent[[j]]
    }
  }


  ## 3) Evaluate for filter words ---------------------------------------
  ## list_of_abbrevs tracks abbreviations already expanded so each one is
  ## substituted into 'content' at most once (shared with section 4.3).
  list_of_abbrevs <- NULL

  ## 3.1) Filter Search ---------------------------------------
  ## For each filter word: find the txt lines containing it, then (3.2) look
  ## for abbreviation definitions of that word (e.g. "methotrexate (MTX)")
  ## and rewrite every occurrence of the abbreviation throughout 'content'.
  if (integrity.indicator == TRUE && !filter.words[1, "words"] == "") {
    word.txtline.fw <- NULL
    for (i in 1:nrow(filter.words)) {
      ## search for lines with filter word in [txtcontent]
      word <- filter.words[i, "words"]
      ignore.case.fw <- filter.words[i, "ignore.case.fw"]
      detected_line <- grep(word, txtcontent, ignore.case = ignore.case.fw)
      word.txtline.fw <- c(word.txtline.fw,
                           detected_line)

      ## 3.2) Replace abbreviations -----------------------------------------------------

      if (eval.abbrevs == TRUE && length(word.txtline.fw) > 0){
        ## Check if any occurences (heading + text) of the
        ## filter word are defining and abbreviation
        ## go through each txtcontent line that the searchword was found
        for (nth in 1:length(word.txtline.fw)) {
          paragraph <- txtcontent[word.txtline.fw[[nth]]]
          ## identify definitions of abbrev
          ## helpers return a list with $res plus abbrev/replacement fields
          ## (singular and plural forms) -- defined elsewhere in this file
          occur_double_dots_or_equal <- test_if_abbrev_double_dots_or_equal(searchword = as.character(word), 
                                                                            paragraph = paragraph,
                                                                            ignore.case = ignore.case.fw)
          occur_in_parantheses <- test_if_abbrev_in_parantheses(searchword = as.character(word), 
                                                                paragraph = paragraph,
                                                                ignore.case = ignore.case.fw)
          ## if abbrev was found in nth occurence
          for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)){
            if (occur$res == TRUE) {
              ## replace if abbrev is not yet defined
              for (abbrev in c("abbrev_singular","abbrev_plural")) {
                if (!(occur[[abbrev]] %in% list_of_abbrevs)){
                  list_of_abbrevs <- c(list_of_abbrevs,occur[[abbrev]])
                  for (c in 1:length(content)){
                    ## match the abbreviation as a standalone token.
                    ## NOTE(review): "[^A-z|0-9]" is a fragile class -- A-z
                    ## spans punctuation between 'Z' and 'a', and '|' is a
                    ## literal inside []; "[^A-Za-z0-9]" is likely what was
                    ## meant -- confirm before changing.
                    to_find <- paste0("([^A-z|0-9]+|^)",occur[[abbrev]],"([^A-z|0-9|:]$|[^A-z|0-9|:][^=|:]|$)")
                    found_lines <- grep(to_find, content[[c]],
                                        ignore.case = FALSE)
                    for (line in found_lines){
                      found_pos <- gregexpr(to_find, content[[c]][line])[[1]]
                      found_pos_list <- gregexpr(to_find, content[[c]][line])[[1]]
                      ## add_pos tracks how much the line has grown after each
                      ## substitution so later match offsets stay valid
                      add_pos <- 0
                      for (p in 1:length(found_pos)){
                        sub <- substr(content[[c]][line],found_pos[p] + add_pos,nchar(content[[c]][line]))
                        ## prevent double substitution if sub already contains substitution
                        if (grepl(occur[[sub("abbrev","replacement",abbrev)]], sub)) next
                        found_pos_list[p] <- regexpr(occur[[abbrev]],sub) + found_pos[p] + add_pos - 1
                        content[[c]][line] <- paste0(substr(content[[c]][line],1,found_pos_list[p]-1),
                                                     sub(occur[[abbrev]],
                                                         occur[[sub("abbrev","replacement",abbrev)]],
                                                         substr(content[[c]][line],
                                                                found_pos_list[p],nchar(content[[c]][line])),
                                                         ignore.case = FALSE))
                        add_pos <- add_pos + nchar(occur[[sub("abbrev","replacement",
                                                              abbrev)]]) - nchar(occur[[abbrev]])
                        ## correct for sub of abbrev definition e.g. methotrexate (MTX (methotrexate))
                        len_diff <- nchar(content[[c]][line]) - 
                                    nchar(gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                         paste0("(",occur[[abbrev]],")"),
                                         content[[c]][line] , fixed = TRUE))
                        add_pos <- add_pos - len_diff
                        content[[c]][line]  <- gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                                    paste0("(",occur[[abbrev]],")"),
                                                    content[[c]][line] , fixed = TRUE)
                      } ## end for (p in length(found_pos)){
                    } ## end for (line in found_lines){
                  } ## for (c in 1:length(content)){
                } ##if (!(occur[[abbrev]] %in% list_of_abbrevs)){
              } ## for (abbrev in c("abbrev_singular","abbrev_plural")) {
            } ## if (occur$res) {
          } ## end for (abbrev in c("abbrev_singular","abbrev_plural")) {
        } ## end for (nth in 1:length(word.txtline.fw)) {
      } ## end replace abbreviations in content
    } ## end for each filter word
  } ## if (integrity.indicator == TRUE)

  ## if filter word abbreviations were found replace the abbreviations in content
  ## NOTE(review): filter.words[i, ...] reuses the index left over from the
  ## loop above (the LAST filter word); if that loop never ran, 'i' may be
  ## undefined and this condition errors before the is.null() guard --
  ## filter.words[1, "words"] was probably intended. Confirm before changing.
  if (eval.abbrevs == TRUE && integrity.indicator == TRUE &&
      !filter.words[i, "words"] == "" && !is.null(list_of_abbrevs)) {
    txtcontent <- content[[1]]
    keeplayouttxtcontent <- content[[2]]

    ## html content per page
    for (pa in 3:length(content)) txthtmlcontent[[(pa-2)]] <- content[[pa]]

  } ## end replace abbreviations

  ## 3.3) Real filter Search ---------------------------------------
  ## Count every occurrence (not just every line) of each filter word; the
  ## document passes the filter only if the total count reaches
  ## filter.word.times, otherwise it is logged to out/excl_by_fw.
  if (integrity.indicator == TRUE) {
    word.txtline.fw <- NULL
    word.txtpos.fw <- NULL
    ## if there are filter words
    if (!filter.words[1, "words"] == "") {
      for (i in 1:nrow(filter.words)) {
        ## search for lines with filter word in [txtcontent]
        word <- filter.words[i, "words"]
        ignore.case.fw <- filter.words[i, "ignore.case.fw"]
        detected_line <- grep(word, txtcontent, ignore.case = ignore.case.fw)
        word.txtline.fw <- c(word.txtline.fw,
                             detected_line)
        ## gregexpr gives every match position in the line, so repeated
        ## occurrences within a line are all counted
        for (li in detected_line){
          word.txtpos.fw <- c(word.txtpos.fw,
                              gregexpr(word, txtcontent[li], ignore.case = ignore.case.fw)[[1]])
        }
      }
      ## "> filter.word.times - 1" is ">= filter.word.times"
      if (length(word.txtpos.fw) > filter.word.times - 1) {
        filterwords.go <- TRUE
        print_message <- paste0(length(word.txtpos.fw),
                                " filter word(s) were detected in ", id, ".pdf.")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        update_progress_info(print_message)
      } else {
        filterwords.go <- FALSE
        print_message <- paste0("\'",id,".pdf\' was filtered out due to a lack of the filter words. ",
                                length(word.txtpos.fw),
                                " filter word(s) were detected in ", id, ".pdf.")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        update_progress_info(print_message)
        if (write.txt.doc.file == TRUE) {
          dir.create(paste0(out,"/excl_by_fw"), showWarnings = FALSE)
          utils::write.table(paste0("Not enough txt lines with filter word found. ",
                                    length(word.txtpos.fw),
                                    " filter word(s) were detected in ", id, ".pdf."),
                             paste0(out,"/excl_by_fw/",id,"_not.enough.txt.w.filter.words",
                                    out.table.ext),
                             sep = out.table.separator, row.names = FALSE,
                             col.names = FALSE, na = "")
        }
      }  ## end if filter words were present
    } else {
      ## no filter configured: every document passes
      out_msg <- c(out_msg, "No filter words chosen for analysis.")
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info("No filter words chosen for analysis.")
      filterwords.go <- TRUE
    }  ## end if filter words were set
  } ## end 3.3) Filter Search
  
  ## 4) Search of search words -----------------------------------------------
  ## only if filter words were found or no filter was set continue and
  ## search words were set
  ## Mirrors section 3.1/3.2 but for search words: locate matching lines and
  ## expand any abbreviation definitions of each search word in 'content'.
  if (filterwords.go == TRUE && integrity.indicator == TRUE) {
    searchwords.go <- FALSE
    ## search for lines with search word
    word.txtline <- NULL

    for (i in 1:nrow(search.words)) {
      ## 4.1) Search for lines with search word -------------------------
      word <- search.words[i, "words"]
      ignore.case.sw <- search.words[i, "ignore.case.sw"]
      word.txtline <- NULL
      word.keeplayoutline <- NULL
      ## if search words were not chosen write all lines in txtline
      if (search.words[i, "words"] == ""){
        word.txtline <- 1:length(txtcontent)
        ## NOTE(review): word.keeplayoutline is NULL here, so this yields
        ## c(1, 0) -- 1:length(keeplayouttxtcontent) was almost certainly
        ## intended. Verify downstream use before fixing.
        word.keeplayoutline <- 1:length(word.keeplayoutline)
      } else {
        ## search for lines with search word in [txtcontent]
        word.txtline <- grep(word, txtcontent, ignore.case = ignore.case.sw)
        ## search for lines with search word in [keeplayouttxtcontent]
        word.keeplayoutline <- grep(word, keeplayouttxtcontent,
                                    ignore.case = ignore.case.sw)
      }

      ## 4.2) Continue analysis when search words were found ----------------------------
      if (length(word.txtline) > 0) searchwords.go <- TRUE

      ## 4.3) Replace abbreviations -----------------------------------------------------
      if (eval.abbrevs == TRUE && length(word.txtline) > 0 &&
          !search.words[i, "words"] == ""){
        ## Check if any occurences (heading + text) of the
        ## search word are defining and abbreviation
        ## go through each txtcontent line that the searchword was found
        for (nth in 1:length(word.txtline)) {
          paragraph <- txtcontent[word.txtline[[nth]]]
          ## identify definitions of abbrev
          ## NOTE(review): the equivalent calls in section 3.2 pass
          ## 'ignore.case =' but these pass 'ignore.case.sw =' -- one of the
          ## two argument names cannot match the helper's signature; check
          ## the helper definitions and unify.
          occur_double_dots_or_equal <- test_if_abbrev_double_dots_or_equal(searchword = as.character(word), 
                                                                            paragraph = paragraph,
                                                                            ignore.case.sw = ignore.case.sw)
          occur_in_parantheses <- test_if_abbrev_in_parantheses(searchword = as.character(word), 
                                                                paragraph = paragraph,
                                                                ignore.case.sw = ignore.case.sw)
          ## if abbrev was found in nth occurence
          for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)){
            if (occur$res == TRUE) {
              ## replace if abbrev is not yet defined
              for (abbrev in c("abbrev_singular","abbrev_plural")) {
                if (!(occur[[abbrev]] %in% list_of_abbrevs)){
                  list_of_abbrevs <- c(list_of_abbrevs,occur[[abbrev]])
                  for (c in 1:length(content)){
                    to_find <- paste0("([^A-z|0-9]+|^)",occur[[abbrev]],"([^A-z|0-9|:]$|[^A-z|0-9|:][^=|:]|$)")
                    found_lines <- grep(to_find, content[[c]],
                                        ignore.case = FALSE)
                    for (line in found_lines){
                      found_pos <- gregexpr(to_find, content[[c]][line])[[1]]
                      found_pos_list <- gregexpr(to_find, content[[c]][line])[[1]]
                      ## add_pos tracks line-length growth from substitutions
                      ## so later match offsets stay valid
                      add_pos <- 0
                      for (p in 1:length(found_pos)){
                        sub <- substr(content[[c]][line],found_pos[p] + add_pos,nchar(content[[c]][line]))
                        ## prevent double substitution if sub already contains substitution
                        if (grepl(occur[[sub("abbrev","replacement",abbrev)]], sub)) next
                        found_pos_list[p] <- regexpr(occur[[abbrev]],sub) + found_pos[p] + add_pos - 1
                        content[[c]][line] <- paste0(substr(content[[c]][line],1,found_pos_list[p]-1),
                                                     sub(occur[[abbrev]],
                                                         occur[[sub("abbrev","replacement",abbrev)]],
                                                         substr(content[[c]][line],found_pos_list[p],
                                                                nchar(content[[c]][line])),
                                                         ignore.case = FALSE))
                        add_pos <- add_pos + nchar(occur[[sub("abbrev","replacement",abbrev)]]) - nchar(occur[[abbrev]])
                        ## correct for sub of abbrev definition e.g. methotrexate (MTX (methotrexate))
                        len_diff <- nchar(content[[c]][line]) - 
                          nchar(gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                     paste0("(",occur[[abbrev]],")"),
                                     content[[c]][line] , fixed = TRUE))
                        add_pos <- add_pos - len_diff
                        content[[c]][line]  <- gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                                    paste0("(",occur[[abbrev]],")"),
                                                    content[[c]][line] , fixed = TRUE)
                      } ## end for (p in length(found_pos)){
                    } ## end for (line in found_lines){
                  } ## for (c in 1:length(content)){
                } ##if (!(occur[[abbrev]] %in% list_of_abbrevs)){
              } ## for (abbrev in c("abbrev_singular","abbrev_plural")) {
            } ## if (occur$res) {
          } ## end for (abbrev in c("abbrev_singular","abbrev_plural")) {
        } ## end for (nth in 1:length(word.txtline)) {
      } ## end replace abbreviations in content
    } ## end for each search word
  } ## if (filterwords.go == TRUE && integrity.indicator == TRUE)

  ## if search words were found replace the abbreviations in content
  ## Copy the abbreviation-expanded content back into the working variables.
  ## NOTE(review): search.words[i, ...] reuses the index left over from the
  ## preceding loop (the LAST search word); if that loop was skipped 'i' is
  ## stale or undefined -- search.words[1, "words"] was likely intended.
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      eval.abbrevs == TRUE && integrity.indicator == TRUE &&
      !search.words[i, "words"] == "") {
    txtcontent <- content[[1]]
    keeplayouttxtcontent <- content[[2]]

    ## html content per page
    for (pa in 3:length(content)) txthtmlcontent[[(pa-2)]] <- content[[pa]]

  } ## end replace abbreviations

  ## if no search words are detected in document
  if (filterwords.go == TRUE &&
      searchwords.go == FALSE &&
      integrity.indicator == TRUE){
    print_message <- paste0("No text with search words for \'",id,".pdf\' found.")
    out_msg <- c(out_msg, print_message)
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    update_progress_info(print_message)
    ## write an empty file
    if (write.txt.doc.file == TRUE) {
      dir.create(paste0(out,"/excl_by_sw"), showWarnings = FALSE)
      utils::write.table("No text line with search word found.",
                         paste0(out,"/excl_by_sw/",id, "_no.txt.w.search.words",
                                out.table.ext),
                         sep = out.table.separator, row.names = FALSE,
                         col.names = FALSE, na = "")
    }
  }  ## end if search words were present

  ## 5) Sort the html content ---------------------------------------
  if (filterwords.go == TRUE && integrity.indicator == TRUE) {
    ## add the top and left ##
    for (p in 1:length(txthtmlcontent)) {
      ## if page has one dimension
      if (!"left" %in% colnames(htmlcontent[[p]]))
        htmlcontent[[p]] <- cbind(htmlcontent[[p]],
                                  left = NA)
      if (!"top" %in% colnames(htmlcontent[[p]]))
        htmlcontent[[p]] <- cbind(htmlcontent[[p]],
                                  top = NA)
    }
    for (p in 1:length(txthtmlcontent)){
      ## 5.1) Assign top and left values ------------------------------

      start <- grep("^<body>",htmlcontent[[p]][,1])[1] + 1
      if (is.na(start)) start <- 1
      end <- grep("^</body>",htmlcontent[[p]][,1])[1] - 1
      if (is.na(end)) {
        end <- nrow(htmlcontent[[p]])
        print_message <- paste0("Page ", p, " of \'", id, ".html\' was incompletely read.",
                                " This might leader to incomplete table extraction but does not",
                                " affect search word detection")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        print_message_short <- paste0("Page ", p, " of \'", id, ".html\' was incompletely read",
                                      " (does not affect txt detection though).")
        update_progress_info(print_message_short)
      }
      
      ## make the copy of htmlcontent for the sorting
      ## single row
      ## rbind() keeps the single selected row as a one-row matrix instead
      ## of letting "[" collapse it to a vector.
      if (start == end){
        out.htmlcontent <- rbind(htmlcontent[[p]][start, ])
        out.txthtmlcontent <- txthtmlcontent[[p]][start]
      } else {
        out.htmlcontent <- htmlcontent[[p]][start:end, ]
        out.txthtmlcontent <- txthtmlcontent[[p]][start:end]
      }
      
      ## for each line
      for (line.number in 1:(end - start + 1)) {
        ## if content line
        ## Lines carrying a font-size style are rendered text; parse their
        ## "left:<n>px" / "top:<n>px" CSS attributes out of the raw html.
        if (grepl("[\"|;]font-size:",
                  out.htmlcontent[line.number, 1])) {
          ## get the left position
          pos.left.start <- regexpr("left:",
                                    out.htmlcontent[line.number, 1])[[1]] + 5
          pos.left.end <- regexpr("px",
                                  substr(out.htmlcontent[line.number, 1],
                                         pos.left.start,
                                         nchar(out.htmlcontent[line.number, 1])))[[1]] - 2 + pos.left.start
          left.value <- suppressWarnings(as.integer(substr(out.htmlcontent[line.number, 1], pos.left.start, pos.left.end)))
          ## Inline spans (vertical-align:baseline) carry no position of
          ## their own; inherit it from the preceding line.
          if (is.na(left.value) && grepl(";vertical-align:baseline;",
                                         out.htmlcontent[line.number, 1])) {
            left.value <- out.htmlcontent[line.number-1, "left"]
          }
          
          out.htmlcontent[line.number, "left"] <- left.value
          ## get the top information
          pos.top.start <- regexpr("top:",
                                    out.htmlcontent[line.number, 1])[[1]] + 4
          pos.top.end <- regexpr("px",
                                  substr(out.htmlcontent[line.number, 1],
                                         pos.top.start,
                                         nchar(out.htmlcontent[line.number, 1])))[[1]] - 2 + pos.top.start
          top.value <- suppressWarnings(as.integer(substr(out.htmlcontent[line.number, 1], pos.top.start, pos.top.end)))
          if (is.na(top.value) && grepl(";vertical-align:baseline;",
                                        out.htmlcontent[line.number, 1])) {
            top.value <- out.htmlcontent[line.number-1, "top"]
          }
          
          out.htmlcontent[line.number, "top"] <- top.value
        } else {
          ## if the line does not have position info
          ## Sentinels: left = 0 / top = 9999 sort such lines to the end of
          ## the page; they are reset to NA after sorting (below).
          out.htmlcontent[line.number, "left"] <- 0
          out.htmlcontent[line.number, "top"] <- 9999
        }
      }
      
      ## 5.2) Sort lines according to top -------------------
      ## only the lines with top value
      lines.with.top.value <- out.htmlcontent[!is.na(out.htmlcontent[, "top"]), ]
      txtlines.with.top.value <- out.txthtmlcontent[!is.na(out.htmlcontent[, "top"])]
      ## if it is only one line no sorting necessary
      if (length(which(!is.na(out.htmlcontent[, "top"]))) > 1) {
        htmlorder <- order(as.numeric(lines.with.top.value[,"top"]), as.numeric(lines.with.top.value[,"left"]))
        lines.with.top.value.sorted <- lines.with.top.value[htmlorder, ]
        txtlines.with.top.value.sorted <- txtlines.with.top.value[htmlorder]
      } else {
        lines.with.top.value.sorted <- lines.with.top.value
        txtlines.with.top.value.sorted <- txtlines.with.top.value
      }
      ## Write the sorted rows back; the non-NA mask is unchanged by the
      ## sort, so the same positions are overwritten in both structures.
      out.htmlcontent[!is.na(out.htmlcontent[, "top"]), ] <- lines.with.top.value.sorted
      out.txthtmlcontent[!is.na(out.htmlcontent[, "top"])] <- txtlines.with.top.value.sorted
      
      ## make all 0 and 9999 to NA
      no.pos.info.lines <- (out.htmlcontent[, "top"] == 9999)
      out.htmlcontent[no.pos.info.lines, "left"] <- NA
      out.htmlcontent[no.pos.info.lines, "top"] <- NA
      
      htmlcontent[[p]][start:end, ] <- out.htmlcontent
      txthtmlcontent[[p]][start:end] <- out.txthtmlcontent
    }  
  }

  ## 6) Extract Tables --------------------------------------------
  ## Explanation: The table detection is important to destinguish tables from text
  ## even if tables will not be exported they will not be a part of the sentence
  ## detection if search words.

  ## Use html to find table end line by adding 5 lines and then search for span id= change
  ## test if file has tables --> only process tables when table is present

  ## 6.1) Test if document has tables -------
  ## NOTE: "[0-99|MDCLXVI]+" is a character class (digits, a literal "|",
  ## and roman-numeral letters), not a numeric range; it matches arabic and
  ## roman table numbers alike.
  ## NOTE(review): txthtmlcontent[[j]] below uses "j" as left over from an
  ## earlier loop at this point in the function -- confirm the intended
  ## page is being scanned here.
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE) {
    ## if there is no additional heading
    if (table.heading.words[1, "words"] == "") {
      tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txthtmlcontent[[j]], ignore.case = TRUE))
    } else {
      ## Combine the user-supplied table-heading words with the default
      ## "Table"/"Tab." patterns.
      word.txtline.th <- NULL
      for (i in 1:nrow(table.heading.words)) {
        ## search for lines with searchword info
        ## [txtcontent] ##
        word <- table.heading.words[i, "words"]
        ignore.case.th <- table.heading.words[i,
                                              "ignore.case.th"]
        word.txtline.th <- c(word.txtline.th,
                             grep(word, txtcontent, ignore.case = ignore.case.th))
      }
      tablestart.pos <- c(word.txtline.th,
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txthtmlcontent[[j]], ignore.case = TRUE))
    }
  }

  ## 6.2) Detect the table start positions by detecting headings ----------------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0) {

    ## Initialize master table for positions
    ## (data.frame(... = NULL) creates empty zero-column frames; rows are
    ## rbind()ed in as headings are found)
    htmltablelines <- data.frame(page = NULL,
                                 tableheading = NULL, tablestart.pos = NULL,
                                 tablelastline = NULL, tableend.pos = NULL,
                                 legendlastline = NULL,
                                 legendend.pos = NULL, txtfirstline = NULL)
    txttablelines <- data.frame(page = NULL,
                                tableheading = NULL, tablestart.pos = NULL,
                                tablelastline = NULL, tableend.pos = NULL,
                                legendlastline = NULL,
                                legendend.pos = NULL, txtfirstline = NULL)

    ## go through pages
    for (j in 1:length(txthtmlcontent)) {

      ## if there is no additional heading
      if (table.heading.words[1, "words"] == "") {
        html.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE),
                                 grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE))
        ## search for tables with line break in title
        lb.html.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                                         txthtmlcontent[[j]], ignore.case = TRUE),
                                    grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                         txthtmlcontent[[j]], ignore.case = TRUE))
        ## only look if the table is having also page
        ## NOTE(review): when this condition is FALSE, txt.tablestart.pos /
        ## lb.txt.tablestart.pos keep their values from a previous page
        ## iteration and are re-used further down -- confirm intended.
        if (length(splittxtcontent) >= j) {
          txt.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE),
                                  grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE))
          ## search for tables with line break in title
          lb.txt.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
                                          splittxtcontent[[j]], ignore.case = TRUE),
                                     grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                          splittxtcontent[[j]], ignore.case = TRUE))
        } else if (length(html.tablestart.pos) > 0) {
          ## Page j exists in html but not in txt: record a placeholder
          ## "htmlonly" row so the table is not lost downstream.
          if (nrow(htmltablelines) == 0) {
            htmltablelines <- data.frame(page = j,
                                         tableheading = NA, tablestart.pos = NA,
                                         tablelastline = NA, legendlastline = NA,
                                         legendend.pos = NA, txtfirstline = NA,
                                         detected.in = "htmlonly")
          } else {
            ## BUGFIX: the previous code did `newrow <- NA`, replacing the
            ## one-row data.frame with a scalar and making the subsequent
            ## rbind() malformed. Blank the row in place instead, and make
            ## sure "detected.in" exists on both sides of the rbind() so
            ## the column sets match.
            if (!"detected.in" %in% colnames(htmltablelines))
              htmltablelines <- cbind(htmltablelines,
                                      detected.in = NA)
            newrow <- htmltablelines[1, ]
            newrow[1, ] <- NA
            newrow[1, "page"] <- j
            newrow[1, "detected.in"] <- "htmlonly"
            htmltablelines <- rbind(htmltablelines,
                                    newrow)
          }
        }
      } else
      {
        ## if there is an additional heading
        ## Combine user-supplied table-heading patterns with the default
        ## "Table N"/"Tab. N" patterns on this html page.
        txthtml.word.txtline.th <- NULL
        for (i in 1:nrow(table.heading.words)) {
          ## search for lines with searchword info
          ## [txtcontent] ##
          word <- table.heading.words[i, "words"]
          ignore.case.th <- table.heading.words[i, "ignore.case.th"]
          txthtml.word.txtline.th <- c(txthtml.word.txtline.th,
                                       grep(word, txthtmlcontent[[j]],
                                            ignore.case = ignore.case.th))
        }
        html.tablestart.pos <- c(txthtml.word.txtline.th,
                                 grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE),
                                 grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE))
        ## search for tables with line break in title
        lb.html.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
                                         txthtmlcontent[[j]], ignore.case = TRUE),
                                    grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                         txthtmlcontent[[j]], ignore.case = TRUE))
        ## only look if the table is having also page
        ## NOTE(review): when this condition is FALSE, txt.tablestart.pos /
        ## lb.txt.tablestart.pos keep their values from a previous page
        ## iteration and are re-used further down -- confirm intended.
        if (length(splittxtcontent) >= j) {
          splittxt.word.txtline.th <- NULL
          for (i in 1:nrow(table.heading.words)) {
            ## search for lines with searchword info
            ## [txtcontent] ##
            word <- table.heading.words[i, "words"]
            ignore.case.th <- table.heading.words[i, "ignore.case.th"]
            splittxt.word.txtline.th <- c(splittxt.word.txtline.th,
                                          grep(word, splittxtcontent[[j]],
                                               ignore.case = ignore.case.th))
          }
          txt.tablestart.pos <- c(splittxt.word.txtline.th,
                                  grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE),
                                  grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE))
          ## search for tables with line break in title
          lb.txt.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
                                          splittxtcontent[[j]], ignore.case = TRUE),
                                     grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                          splittxtcontent[[j]], ignore.case = TRUE))
        } else if (length(html.tablestart.pos) > 0) {
          ## Page j exists in html but not in txt: record a placeholder
          ## "htmlonly" row so the table is not lost downstream.
          if (nrow(htmltablelines) == 0) {
            htmltablelines <- data.frame(page = j,
                                         tableheading = NA, tablestart.pos = NA,
                                         tablelastline = NA, legendlastline = NA,
                                         legendend.pos = NA, txtfirstline = NA,
                                         detected.in = "htmlonly")
          } else {
            ## BUGFIX: the previous code did `newrow <- NA`, replacing the
            ## one-row data.frame with a scalar and making the subsequent
            ## rbind() malformed. Blank the row in place instead, and make
            ## sure "detected.in" exists on both sides of the rbind() so
            ## the column sets match.
            if (!"detected.in" %in% colnames(htmltablelines))
              htmltablelines <- cbind(htmltablelines,
                                      detected.in = NA)
            newrow <- htmltablelines[1, ]
            newrow[1, ] <- NA
            newrow[1, "page"] <- j
            newrow[1, "detected.in"] <- "htmlonly"
            htmltablelines <- rbind(htmltablelines,
                                    newrow)
          }
        }
      }  ## end if there is (no) additional heading

      ## Look at tables in htmlcontent
      if (length(html.tablestart.pos) > 0) {
        for (i in 1:length(html.tablestart.pos)) {
          ## Removing the Table wording
          ## The first 50 characters of the heading line become the table's
          ## identifying string.
          line <- txthtmlcontent[[j]][html.tablestart.pos[i]]
          tableheading <- substr(line,
                                 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          ## (backslash-escape regex metacharacters so the heading can be
          ## used later as a grep pattern)
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)
          
          currentstartlines <- html.tablestart.pos[i]
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0) {
            htmltablelines <- rbind(htmltablelines,
                                    data.frame(page = j,
                                               tableheading = tableheading,
                                               tablestart.pos = currentstartlines,
                                               tablelastline = NA,
                                               txtfirstline = NA,
                                               legendlastline = NA,
                                               legendend.pos = NA))
          }
        }
      }  ## end if length(html.tablestart.pos) > 0
      
      if (length(lb.html.tablestart.pos) > 0) {
        for (i in 1:length(lb.html.tablestart.pos)) {
          ## when +1 is empty
          ## Skip blank lines (max 5) between the "Table N" line and the
          ## continuation of its heading. BUGFIX: guard against reading
          ## past the end of the page -- indexing beyond the vector yields
          ## NA and `NA == ""` would crash the while() condition.
          s <- 1
          while (!is.na(txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s]) &&
                 txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s] == "") {
            s <- s + 1
            if (s > 5 || (lb.html.tablestart.pos[i] + s) >= length(txthtmlcontent[[j]])) break
          }
          ## Heading runs to (or past) the end of the page: no table body
          ## can follow, so skip it.
          if ((lb.html.tablestart.pos[i] + s) >= length(txthtmlcontent[[j]])) next
          ## Removing the Table wording
          ## Re-join the split heading and keep its first 50 characters.
          line <- paste0(txthtmlcontent[[j]][lb.html.tablestart.pos[i]]," ",
                         txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s])
          tableheading <- substr(line,
                                 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)
          
          currentstartlines <- lb.html.tablestart.pos[i]
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0) {
            htmltablelines <- rbind(htmltablelines,
                                    data.frame(page = j,
                                               tableheading = tableheading,
                                               tablestart.pos = currentstartlines,
                                               tablelastline = NA,
                                               txtfirstline = NA,
                                               legendlastline = NA,
                                               legendend.pos = NA))
          }
        }
      }  ## end if there is a table on this page
      ## Look at tables in txtcontent
      if (length(txt.tablestart.pos) > 0) {
        for (i in 1:length(txt.tablestart.pos)) {
          ## Removing the Table wording
          line <- splittxtcontent[[j]][txt.tablestart.pos[i]]
          ## extract the first 50 characters
          tableheading <- substr(line,
                                 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          tableheading <- gsub("\\f", "", tableheading)
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}","\\*")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)
          
          ## from is where the heading was detected
          ## (offset of the heading line within the whole document)
          from <- txt.tablestart.pos[i]
          if (j > 1){
            for (p in 1:(j-1)){
              from <- from + length(splittxtcontent[[p]])
            }
          }
          
          ## to has to be the end of the page
          ## BUGFIX: the page-end offset must also be accumulated for the
          ## first page (j == 1); previously "to" stayed 0 there, which
          ## made txtcontent[from:to] a reversed/invalid range.
          to <- 0
          for (p in 1:j){
            to <- to + length(splittxtcontent[[p]])
          }
          ##remove backreference from txtcontent
          txtcontent[from:to] <- remove_backref(txtcontent[from:to])
          
          ## heading
          currentstartlines <- grep(tableheading,
                                    txtcontent[from:to], fixed = TRUE)[1] + from - 1
          ## if the heading is on the site then add it to the
          ## table
          ## (grep()[1] is NA when the heading is not found on the page)
          if (length(currentstartlines) != 0 && !is.na(currentstartlines)) {
            txttablelines <- rbind(txttablelines,
                                   data.frame(page = j,
                                              tableheading = tableheading,
                                              tablestart.pos = currentstartlines,
                                              tablelastline = NA, tableend.pos = NA,
                                              txtfirstline = NA,
                                              legendlastline = NA,
                                              legendend.pos = NA))
          }
        }
      }  ## end if there is a table on this page
      
      if (length(lb.txt.tablestart.pos) > 0) {
        for (i in 1:length(lb.txt.tablestart.pos)) {
          ## when +1 is empty
          ## Skip blank lines (max 5) after the "Table N" line. BUGFIX:
          ## guard against indexing past the page end, where `NA == ""`
          ## would crash the while() condition.
          s <- 1
          while (!is.na(splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s]) &&
                 splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s] == "") {
            s <- s + 1
            if (s > 5) break
          }
          ## Removing the Table wording
          line <- paste0(splittxtcontent[[j]][lb.txt.tablestart.pos[i]]," ",
                         splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s])
          tableheading <- substr(line, 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}","\\*")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)
          
          ## get the position of the line that has exactly the
          ## heading
          ## from is where the heading was detected
          from <- lb.txt.tablestart.pos[i]
          if (j > 1){
            for (p in 1:(j-1)){
              from <- from + length(splittxtcontent[[p]])
            }
          }
          
          ## to has to be the end of the page
          ## BUGFIX: accumulate the page-end offset for the first page too;
          ## previously "to" stayed 0 for j == 1.
          to <- 0
          for (p in 1:j){
            to <- to + length(splittxtcontent[[p]])
          }
          ## BUGFIX: "[1]" was previously applied to txtcontent[from:to]
          ## (so only the first line of the page was searched); it now
          ## selects the first grep() match on the whole page, mirroring
          ## the non-linebreak branch above.
          currentstartlines <- grep(splittxtcontent[[j]][lb.txt.tablestart.pos[i]],
                                    txtcontent[from:to])[1] + from - 1
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0 && !is.na(currentstartlines)) {
            txttablelines <- rbind(txttablelines,
                                   data.frame(page = j,
                                              tableheading = tableheading,
                                              tablestart.pos = currentstartlines,
                                              tablelastline = NA, tableend.pos = NA,
                                              txtfirstline = NA,
                                              legendlastline = NA,
                                              legendend.pos = NA))
          }
        }
      }  ## end if there is a table on this page
    }  ## end go through each page j

    ## end if file has no tables (if function started
    ## then PDF file has to have search words)
  } else if (searchwords.go == TRUE && filterwords.go == TRUE &&
             integrity.indicator == TRUE && length(tablestart.pos) == 0) {
    ## No tables detected: treat every txt line as plain text and clear
    ## both table-position frames.
    outtable <- cbind(txtcontent, layout = "txt",
                      rownumber = 1:length(txtcontent))
    txtlines <- outtable[(outtable[, "layout"] == "txt"), ]
    txttablelines <- NULL
    htmltablelines <- NULL
  } else {
    ## Search/filter words absent or file integrity failed: nothing to do.
    outtable <- NULL
    txtlines <- NULL
    txttablelines <- NULL
    htmltablelines <- NULL
  }

  ## 6.3) Determine if tables were found in html, txt to both files ------------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {
    ## sort the htmltablelines table according to page
    ## and then tablestart.pos
    htmltablelines <- htmltablelines[with(htmltablelines,
                                          order(page, tablestart.pos)), ]
    ## add information about where table was found
    if (!"detected.in" %in% colnames(htmltablelines))
      htmltablelines <- cbind(htmltablelines,
                              detected.in = NA)
    if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
      ## sort the txttablelines table according to page
      ## and then tablestart.pos
      txttablelines <- txttablelines[with(txttablelines,
                                          order(page, tablestart.pos)), ]
      if (!"detected.in" %in% colnames(txttablelines)) {
        txttablelines <- cbind(txttablelines,
                               detected.in = NA)
      }

      ## go through each line of the tables and compare
      ## which tables are in the txt only
      ## find_similar_row() tags rows with "txtonly"/"txtandhtml" depending
      ## on whether a matching heading exists in the html frame.
      for (txtrow in 1:nrow(txttablelines)) {
        txttablelines[txtrow, ] <- find_similar_row(originrow = txttablelines[txtrow, ],
                                                    targettablelines = htmltablelines,
                                                    relative.match.col = "tableheading",
                                                    output.for.originrow.only = "txtonly",
                                                    output.for.match = "txtandhtml",
                                                    output.column.name = "detected.in")$originrow
        htmltablelines <- find_similar_row(originrow = txttablelines[txtrow, ], 
                                           targettablelines = htmltablelines,
                                           relative.match.col = "tableheading",
                                           output.for.originrow.only = "txtonly",
                                           output.for.match = "txtandhtml",
                                           output.column.name = "detected.in")$targettablelines
      }
    }
    ## name all the rows that are only found in html
    if (nrow(htmltablelines[is.na(htmltablelines[, "detected.in"]), ]) > 0)
      htmltablelines[is.na(htmltablelines[, "detected.in"]), "detected.in"] = "htmlonly"
  }


  ## 6.4) Add positional value to tables -----
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {

    ## add tableend.pos if column does not exist
    if (!"tableend.pos" %in% colnames(htmltablelines)){
      htmltablelines <- cbind(htmltablelines, tableend.pos = NA)
    }
    htmltablelines <-  htmltablelines[!(is.na(htmltablelines[,"page"])),]
    for (i in 1:nrow(htmltablelines)) {
      position <- as.numeric(htmltablelines[i, "tablestart.pos"])
      p <- as.numeric(htmltablelines[i, "page"])
      ## if end of page or the txt ended
      ## Probe 7 lines below the heading for a positioned html line (a
      ## potential table body).
      npos <- position + 7
      ## test if the table is too short for detection
      if (nrow(htmlcontent[[p]]) < npos){
        outofbound <- TRUE 
      } else {
        outofbound <- FALSE
      }
      if (outofbound == FALSE) {
        if (is.null(htmlcontent[[p]][npos, "top"]))
          notop <- TRUE else notop <- FALSE
      }

      ## NOTE: when outofbound is TRUE, "notop" is not set this iteration;
      ## the short-circuiting "||" below never evaluates it in that case.
      if ((outofbound == TRUE) || (notop == TRUE)) {
        ## end is at the end of the page
        htmltablelines[i, "tableend.pos"] <- nrow(htmlcontent[[p]])
        htmltablelines[i, "detected.in"] <- "txtonly.notabledetected"
        if (nrow(txttablelines) > 0 &&
            ncol(txttablelines) > 0) {
          ## go through each line of the tables and compare
          ## which tables are in the txt only
          txttablelines <-  txttablelines[!(is.na(txttablelines[,"page"])),]
          txttablelines <- find_similar_row(originrow = htmltablelines[i,],
                                            targettablelines = txttablelines,
                                            relative.match.col = "tableheading",
                                            output.for.originrow.only = "error",
                                            output.for.match = "txtonly.notabledetected",
                                            output.column.name = "detected.in")$targettablelines
        }
        ## else start with +5
      } else {
        ## end of the this is the start line +5
        htmltablelines[i, "tableend.pos"] <- htmltablelines[i, "tablestart.pos"] + 5
      }
    }  ## end go through each row
  }

  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {
    ## determine if top is constant (when not start point +1)
    ## do for each table row
    nexti <- FALSE
    htmltablelines <-  htmltablelines[!(is.na(htmltablelines[,"page"])),]

    for (i in 1:nrow(htmltablelines)) {
      ## ignore the lines with txtonly
      if ((htmltablelines[i, "detected.in"] == "txtonly.notabledetected") ||
          (htmltablelines[i, "detected.in"] == "txtonly")) {
        if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
          ## go through each line of the tables and compare
          ## which tables are in the txt only
          txttablelines <-  txttablelines[!(is.na(txttablelines[,"page"])),]
          txttablelines <- find_similar_row(originrow = htmltablelines[i, ],
                                            targettablelines = txttablelines,
                                            relative.match.col = "tableheading",
                                            output.for.originrow.only = "error",
                                            output.for.match = "txtonly.notabledetected",
                                            output.column.name = "detected.in")$targettablelines
        }
        next
      }

      ## 6.5) Detect how many columns by evaluating how many different left values ---------------

      ## set the current line pos and page for the while loop
      currentline.pos <- htmltablelines[i, "tableend.pos"]
      currentline.page <- htmltablelines[i, "page"]
      ## initialize out.left.list
      out.left.list <- NA
      all.left.found <- FALSE
      while (all.left.found == FALSE) {
        constant.value <- NULL

        ## Scan downward until two consecutive lines share the same "top"
        ## value; that shared top identifies a table row.
        while (is.null(constant.value)) {
          ## currentline.pos is either set before the loop or at the end
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

          ## get the top information
          oritop.value <- paste0("top:", currentline["top"], "px")

          ## go to the next line
          currentline.pos <- currentline.pos + 1
          ## set the start for of the next table
          ## (indexing one row past the end of a data.frame yields NAs, so
          ## the is.na() test also covers the last table row)
          if (!is.na(htmltablelines[i + 1, "tablestart.pos"]) &&
              htmltablelines[i + 1, "page"] == htmltablelines[i, "page"])
            nextstartpos <- htmltablelines[i + 1, "tablestart.pos"] else nextstartpos <- 999999
          ## test if end of page is reached
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

          ## get the top information
          top.value <- paste0("top:",
                              currentline["top"], "px")

          ## if beyond the end of the html page
          if ((currentline.pos > nrow(htmlcontent[[currentline.page]])) ||
              (top.value == "top:NApx")) {
            htmltablelines[i, "detected.in"] <- "txtonly.notabledetected"
            if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
              ## go through each line of the tables and compare
              ## which tables are in the txt only
              txttablelines <- txttablelines[!(is.na(txttablelines[,"page"])),]
              txttablelines <- find_similar_row(originrow = htmltablelines[i, ],
                                                targettablelines = txttablelines,
                                                relative.match.col = "tableheading",
                                                output.for.originrow.only = "error",
                                                output.for.match = "txtonly.notabledetected",
                                                output.column.name = "detected.in")$targettablelines
            }
            currentline.pos <- currentline.pos - 1
            nexti <- TRUE
            break
            ## if tables overlap
          } else if (currentline.pos >= nextstartpos) {
            ## The next table's heading was reached: close this table just
            ## before it and fill the last-line/first-line text columns.
            htmltablelines[i, "tableend.pos"] <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
            htmltablelines[i, "legendend.pos"] <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
            ## fill the txtcontent columns
            htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i, 
                                                                                              "tableend.pos"])]
            htmltablelines[i, "legendlastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i, 
                                                                                                "legendend.pos"])]
            htmltablelines[i, "txtfirstline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i, 
                                                                                            "legendend.pos"]) + 1]
            currentline.pos <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1

            if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
              ## NOTE(review): this indexes txttablelines with the html row
              ## counter "i" -- confirm both frames are row-aligned here.
              txttablelines[i, "tableend.pos"] <- as.numeric(txttablelines[i + 1, "tablestart.pos"]) - 1
              txttablelines[i, "legendend.pos"] <- as.numeric(txttablelines[i + 1, "tablestart.pos"]) - 1
              txttablelines[i, "tablelastline"] <-  txtcontent[as.numeric(txttablelines[i, "tableend.pos"])]
              txttablelines[i, "legendlastline"] <- txtcontent[as.numeric(txttablelines[i, "legendend.pos"])]
              txttablelines[i, "txtfirstline"] <-  txtcontent[as.numeric(txttablelines[i, "legendend.pos"])+1]
            }
            nexti <- TRUE
            break
          }  ## end if

          if (oritop.value == top.value) {
            ## the constant value is the top
            constant.value <- top.value
          } else {
            ## use this currentline.pos as new start
            htmltablelines[i, "tableend.pos"] <- currentline.pos
          }

        }  ## end while is.null(constant.value)

        if (nexti == TRUE) break
        ## determine the left range ##

        ## go to min
        ## Walk back up to the first line that shares the constant top
        ## value (the start of the table row).
        ind <- TRUE
        while (ind == TRUE) {
          currentline.pos <- currentline.pos - 1
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
          ind <- grepl(constant.value, currentline[1])
        }
        currentline.pos <- currentline.pos + 1
        currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

        ## Collect the "left" value of every cell on that table row; each
        ## distinct left value corresponds to a column.
        left.list <- NULL
        ind <- TRUE
        while (ind == TRUE) {
          # get the left position
          left.value <- paste0("left:", currentline["left"], "px")
          left.list <- c(left.list,  left.value)
          currentline.pos <- currentline.pos + 1
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
          ind <- grepl(constant.value,
                       currentline[1])
        }

        ## test if left.list (column list) is complete ##
        if (all(left.list %in% out.left.list)) {
          ## if the left.list if complete, end the loop
          all.left.found <- TRUE
          break
        } else {
          ## if left.list is incomplete, add the current
          ## left.list
          out.left.list <- c(out.left.list[!is.na(out.left.list)],
                             left.list)
          out.left.list <- unique(out.left.list)
        }
      }  ## end of searching for all left values
      
      ## skip to the next table if only a txt-only table was found or the
      ## table overlapped with the next table (nexti was set above)
      if (nexti == TRUE) {
        nexti <- FALSE
        next
      }

      ## 6.6) Determine the end of the table through the highest top value ---------------
      ## search for the max with all the left values ##


      ## if two tables are on the same page restrict lines
      if (!is.na(htmltablelines[i + 1, "tablestart.pos"]) &&
          htmltablelines[i + 1, "page"] == htmltablelines[i, "page"]){
        nexttablestartpos <- htmltablelines[i + 1, "tablestart.pos"]
      } else {
        nexttablestartpos <- nrow(htmlcontent[[currentline.page]])
      }

      ## make a toplist
      top.list <- NULL
      for (left.item in out.left.list) {
        max.line <- max(grep(left.item,
                             htmlcontent[[currentline.page]][1:nexttablestartpos, 1]))
        currentline <- htmlcontent[[currentline.page]][max.line, ]
        top.value <- currentline["top"]
        top.list <- c(top.list, top.value)
      }
      ## choose the max top count that is at least double
      top.value.found <- FALSE
      unq.top.list <- unique(top.list)
      match.list <- NULL
      for (ti in 1:length(unq.top.list)) {
        curr.top.value <- unq.top.list[ti]
        match.list[ti] <- 0
        for (li in 1:length(out.left.list)) {
          match.one <- intersect(grep(out.left.list[li],
                                      htmlcontent[[currentline.page]][1:nexttablestartpos, 1]),
                                 grep(paste0("top:",curr.top.value, "px;"),
                                      htmlcontent[[currentline.page]][1:nexttablestartpos, 1]))
          ## when there was a match and the left.item is not the last in the list
          if (length(match.one) > 0){
            match.list[ti] <- match.list[ti] + 1
          }
        }
      }
      max.top.value <- suppressWarnings(max(strtoi(unq.top.list[match.list > 1])))

      ## if not one value is duplicated then go with max
      if (!any(match.list > 1))
        max.top.value <- max(strtoi(top.list))

      ## record the table end position as the last line matching the max top value
      htmltablelines[i, "tableend.pos"] <- max(grep(paste0("top:",
                                                           max.top.value, "px;"),
                                                    htmlcontent[[currentline.page]][,1]))
      currentline.pos <- htmltablelines[i, "tableend.pos"]
      currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

      ## add the last line to the htmltablelines
      htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][currentline.pos]
      ## save the end of the table
      htmltablelines[i, "tableend.pos"] <- currentline.pos

      ## add everything below the table ##

      ## 6.7) Detect the legend by extracting all lines with a lower font than the table -----
      ## get the current font size information ##
      pos.fontsize.start <- regexpr("font-size:",
                                    currentline[1])[[1]] + 10
      pos.fontsize.end <- regexpr("px;",substr(currentline[1],
                                               pos.fontsize.start, 
                                               nchar(currentline[1])))[[1]] + pos.fontsize.start - 2
      current.fontsize <- substr(currentline[1],
                                 pos.fontsize.start, pos.fontsize.end)
      current.fontsize <- as.numeric(current.fontsize)

      ## add everything with smaller font size
      fontsize <- 0
      while (current.fontsize >= fontsize) {
        ##determine max font size in table
        ## if next line does not exist
        currentline.pos <- currentline.pos + 1
        if (currentline.pos > nrow(htmlcontent[[currentline.page]])) break
        currentline <- htmlcontent[[currentline.page]][currentline.pos, 1]
        ## get the font size information
        pos.fontsize.start <- regexpr("font-size:",
                                      currentline)[[1]] + 10
        if (pos.fontsize.start > 9){
          pos.fontsize.end <- regexpr("px;",substr(currentline,
                                           pos.fontsize.start, 
                                           nchar(currentline)))[[1]] + pos.fontsize.start - 2
        } else {
          break
        }
        str.fontsize <- substr(currentline,
                               pos.fontsize.start, pos.fontsize.end)
        ## when the next line has no font size
        if (str.fontsize == "") break
        fontsize <- as.numeric(str.fontsize)

      }
      
      currentline.pos <- currentline.pos - 1
      ## add the last line to the htmltablelines
      linecontent <- txthtmlcontent[[currentline.page]][currentline.pos]
      htmltablelines[i, "legendlastline"] <- linecontent
      htmltablelines[i, "legendend.pos"] <- currentline.pos

      ## add the last line to the htmltablelines
      if (currentline.pos + 1 > nrow(htmlcontent[[currentline.page]])) {
        htmltablelines[i, "txtfirstline"] <- ""
      } else {
        linecontent <- txthtmlcontent[[currentline.page]][currentline.pos + 1]
        htmltablelines[i, "txtfirstline"] <- linecontent
      }
    }  ## end for (i in 1:nrow(htmltablelines)) {
  }

  ## 6.8) Transfer information gained from html file to txt and keeplayouttxt tables-----------

  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {

    htmltablelines <- htmltablelines[,
                                     c("page", "tableheading", "tablestart.pos",
                                       "tablelastline", "tableend.pos",
                                       "legendlastline", "legendend.pos",
                                       "txtfirstline", "detected.in")]
    if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
      ## adjust txttablelines ##
      txttablelines <- txttablelines[,c("page", "tableheading", "tablestart.pos",
                                        "tablelastline", "tableend.pos",
                                        "legendlastline", "legendend.pos",
                                        "txtfirstline", "detected.in")]

      ## convert into dataframe without levels
      txttablelines <- data.frame(lapply(txttablelines,
                                         as.character), stringsAsFactors = FALSE)
      txttablelines <-  txttablelines[!(is.na(txttablelines[,"page"])),]
      for (i in 1:nrow(txttablelines)) {
        ## adjust all tablestart.pos
        start <- as.numeric(txttablelines[i, "tablestart.pos"])
        ## get legendlastline from html
        matchingrow_from_html <- find_similar_row(originrow = txttablelines[i, ],
                                                  targettablelines = htmltablelines,
                                                  relative.match.col = "tableheading",
                                                  output.for.originrow.only = NA,
                                                  output.for.match = NA,
                                                  output.column.name = "detected.in")$targettablerow

        ## if table was not detected table is only one line
        if (txttablelines[i, "detected.in"] == "txtonly.notabledetected" ||
            txttablelines[i, "detected.in"] == "txtonly") {
          txttablelines[i, "tableend.pos"] <- txttablelines[i, "tablestart.pos"]
          txttablelines[i, "legendend.pos"] <- txttablelines[i, "tablestart.pos"]
          txttablelines[i, "tablelastline"] <-  txtcontent[as.numeric(txttablelines[i, "tableend.pos"])]
          txttablelines[i, "legendlastline"] <- txtcontent[as.numeric(txttablelines[i, "legendend.pos"])]
          txttablelines[i, "txtfirstline"] <-  txtcontent[as.numeric(txttablelines[i, "legendend.pos"]) + 1]

          next
        } else {
          ## get legendlastline from html
          txttablelines[i,"legendlastline"] <- as.character(matchingrow_from_html[["legendlastline"]])
          ## get txtfirstline from html
          txttablelines[i,"txtfirstline"] <- as.character(matchingrow_from_html[["txtfirstline"]])
          ## get tablelastline from html
          txttablelines[i,"tablelastline"] <- as.character(matchingrow_from_html[["tablelastline"]])
        }
        ## if txtfirstline is not empty (i.e. at the end of
        ## the page)
        ## if tables overlapped (i.e. legend.end was determined by table afterwards
        if (!is.na(txttablelines[i, "legendend.pos"])){
          ## adjust txtfirstline
          leg.pos <- as.numeric(txttablelines[i, "legendend.pos"])
          txt.pos <- leg.pos+1
          txttablelines[i, "legendlastline"] <- txtcontent[leg.pos]
          txttablelines[i, "tablelastline"] <- txtcontent[leg.pos]
          txttablelines[i, "txtfirstline"] <- txtcontent[txt.pos]

        } else if (!txttablelines[i, "txtfirstline"] == "") {
          ## adjust txtfirstline
          leg.pos <- suppressWarnings(min(grep(txttablelines[i, "legendlastline"],
                                               txtcontent[start:length(txtcontent)],
                                               fixed = TRUE))) + start - 1
          txt.pos <- suppressWarnings(min(grep(txttablelines[i, "txtfirstline"], 
                                               txtcontent[start:length(txtcontent)],
                                               fixed = TRUE))) + start - 1
          ## shorten the txtfirstline till hit is found
          while (leg.pos == Inf && txt.pos == Inf &&
                 txttablelines[i, "txtfirstline"] != "") {
            txttablelines[i, "txtfirstline"] <- strtrim(txttablelines[i, "txtfirstline"],
                                                        nchar(txttablelines[i, "txtfirstline"]) - 1)
            txt.pos <- suppressWarnings(min(grep(txttablelines[i, "txtfirstline"],
                                                 txtcontent[start:length(txtcontent)],
                                                 fixed = TRUE))) + start - 1
          }
          if (txttablelines[i, "txtfirstline"] == "" || txt.pos == Inf)
            txt.pos <- start + 2
          ## if min of integer(0) then Inf
          if (leg.pos == Inf)
            leg.pos <- start + 1
          txttablelines[i, "legendend.pos"] <- max(leg.pos,
                                                   txt.pos - 1)
          ## reverse order detection
          leg.pos.final <- as.numeric(txttablelines[i, "legendend.pos"])
          rev.pos.search.item <- txttablelines[i, "tablelastline"]
          for (symbol in c("\\+", "\\*","\\?",
                           "\\/", "\\(", "\\)", "\\[",
                           "\\]", "\\{", "\\}")) {
            rev.pos.search.item <- gsub(symbol,
                                        paste0("\\", symbol),
                                        rev.pos.search.item)
          }
          rev.pos <- suppressWarnings(min(grep(rev.pos.search.item,
                                               rev(txtcontent[1:leg.pos.final]))))
          if (rev.pos == Inf)
            rev.pos <- 1
          txttablelines[i, "tableend.pos"] <- leg.pos.final - rev.pos + 1
        } else {
          ## reverse order detection
          leg.pos.final <- suppressWarnings(min(grep(txttablelines[i, "legendlastline"],
                                                     txtcontent[start:length(txtcontent)],
                                                     fixed = TRUE))) + start - 1
          if (leg.pos.final == Inf)
            leg.pos.final <- start + 1
          txttablelines[i, "legendend.pos"] <- leg.pos.final
          rev.pos.search.item <- txttablelines[i, "tablelastline"]
          for (symbol in c("\\+", "\\*","\\?",
                           "\\/", "\\(", "\\)", "\\[",
                           "\\]", "\\{", "\\}")) {
            rev.pos.search.item <- gsub(symbol,
                                        paste0("\\", symbol),
                                        rev.pos.search.item)
          }
          rev.pos <- suppressWarnings(min(grep(rev.pos.search.item,
                                               rev(txtcontent[1:leg.pos.final]))))
          if (rev.pos == Inf)
            rev.pos <- 2
          txttablelines[i, "tableend.pos"] <- leg.pos.final - rev.pos + 1
        }
      }  ## end

      ## adjust keeplayouttxttablelines ##

      keeplayouttxttablelines <- htmltablelines[,
                                                c("page", "tableheading", "tablestart.pos",
                                                  "tablelastline", "tableend.pos",
                                                  "legendlastline", "legendend.pos",
                                                  "txtfirstline", "detected.in")]
      ## convert into dataframe without levels
      keeplayouttxttablelines <- data.frame(lapply(keeplayouttxttablelines,
                                                   as.character), stringsAsFactors = FALSE)
      for (i in 1:nrow(keeplayouttxttablelines)) {

        ## adjust tablestart.pos
        tabhead <- keeplayouttxttablelines[i, "tableheading"]
        for (symbol in c("\\+", "\\*","\\?",
                         "\\/", "\\(", "\\)", "\\[",
                         "\\]", "\\{", "\\}")) {
          tabhead <- gsub(symbol,"\\", tabhead)
        }
        ## Remove backreferences (\1) from tableheading
        tabhead <- remove_backref(tabhead)
        
        start <- grep(as.character(tabhead), keeplayouttxtcontent)[1]
        
        keeplayouttxttablelines[i, "tableheading"] <- tabhead
        trimed <- tabhead
        while (is.na(start)) {
          headinglength <- nchar(as.character(trimed))
          trimed <- strtrim(as.character(trimed),
                            headinglength - 1)
          headinglength <- nchar(as.character(trimed))
          lastchar <- substr(as.character(trimed),
                             headinglength, headinglength)
          while (lastchar == "\\"){
            trimed <- strtrim(as.character(trimed),
                              headinglength - 1)
            headinglength <- nchar(as.character(trimed))
            lastchar <- substr(as.character(trimed),
                               headinglength, headinglength)
          }
          start <- grep(trimed, keeplayouttxtcontent)[1]
        }
        keeplayouttxttablelines[i, "tablestart.pos"] <- start
        ## if table keeplayouttxttablelines not detected
        ## table is only one line
        if (keeplayouttxttablelines[i, "detected.in"] == "txtonly.notabledetected") {
          keeplayouttxttablelines[i, "tableend.pos"] <- keeplayouttxttablelines[i, "tablestart.pos"]
          keeplayouttxttablelines[i, "legendend.pos"] <- keeplayouttxttablelines[i, "tablestart.pos"]
          next
        }
        ## if keeplayouttxtfirstline is not empty (i.e. at
        ## the end of the page)
        if (!is.na(keeplayouttxttablelines[i, "legendend.pos"])){
          ## adjust txtfirstline
          leg.pos <- as.numeric(keeplayouttxttablelines[i, "legendend.pos"])
          txt.pos <- leg.pos+1
          keeplayouttxttablelines[i, "legendlastline"] <- keeplayouttxtcontent[leg.pos]
          keeplayouttxttablelines[i, "tablelastline"] <- keeplayouttxtcontent[leg.pos]
          keeplayouttxttablelines[i, "txtfirstline"] <- keeplayouttxtcontent[leg.pos+1]

        } else if (!keeplayouttxttablelines[i, "txtfirstline"] == "") {
          ## adjust keeplayouttxtfirstline
          leg.pos <- suppressWarnings(min(grep(keeplayouttxttablelines[i, "legendlastline"], 
                                               keeplayouttxtcontent[start:length(keeplayouttxtcontent)],
                                               fixed = TRUE))) + start - 1

          keeplayouttxt.pos <- suppressWarnings(min(grep(keeplayouttxttablelines[i, "txtfirstline"],
                                                         keeplayouttxtcontent[start:length(keeplayouttxtcontent)],
                                                         fixed = TRUE))) + start - 1
          ## shorten the keeplayouttxtfirstline till hit is
          ## found
          while (leg.pos == Inf && keeplayouttxt.pos ==
                 Inf && keeplayouttxttablelines[i,"txtfirstline"] != "") {
            keeplayouttxttablelines[i, "txtfirstline"] <- strtrim(keeplayouttxttablelines[i, "txtfirstline"],
                                                                  nchar(keeplayouttxttablelines[i, "txtfirstline"]) - 1)
            keeplayouttxt.pos <- suppressWarnings(min(grep(keeplayouttxttablelines[i, "txtfirstline"],
                                                           keeplayouttxtcontent[start:length(keeplayouttxtcontent)],
                                                           fixed = TRUE))) + start - 1
          }
          if (keeplayouttxttablelines[i, "txtfirstline"] == "" ||
              keeplayouttxt.pos == Inf)
            keeplayouttxt.pos <- start + 2
          ## if min of integer(0) then Inf
          if (leg.pos == Inf)
            leg.pos <- start + 1
          keeplayouttxttablelines[i, "legendend.pos"] <- max(leg.pos, keeplayouttxt.pos - 1)
          ## reverse order detection
          leg.pos.final <- as.numeric(keeplayouttxttablelines[i, "legendend.pos"])
          rev.pos.search.item <- gsub("/", "_",
                                      keeplayouttxttablelines[i, "tablelastline"])
          for (symbol in c("\\+", "\\*","\\?",
                           "\\/", "\\(", "\\)", "\\[",
                           "\\]", "\\{", "\\}")) {
            rev.pos.search.item <- gsub(symbol,
                                        paste0("\\", symbol),
                                        rev.pos.search.item)
          }
          rev.pos.search.item <- gsub("\\(",
                                      "", rev.pos.search.item)
          rev.pos.search.item <- gsub("\\)",
                                      "", rev.pos.search.item)
          rev.pos <- suppressWarnings(min(grep(rev.pos.search.item,
                                               rev(keeplayouttxtcontent[1:leg.pos.final]))))
          if (rev.pos == Inf)
            rev.pos <- 1
          keeplayouttxttablelines[i, "tableend.pos"] <- leg.pos.final - rev.pos + 1
        } else {
          ## reverse order detection
          leg.pos.final <- suppressWarnings(min(grep(keeplayouttxttablelines[i, "legendlastline"],
                                                     keeplayouttxtcontent[start:length(keeplayouttxtcontent)],
                                                     fixed = TRUE))) + start - 1
          if (leg.pos.final == Inf)
            leg.pos.final <- start + 1
          keeplayouttxttablelines[i, "legendend.pos"] <- leg.pos.final
          rev.pos.search.item <- gsub("/",
                                      "_", keeplayouttxttablelines[i, "tablelastline"])
          for (symbol in c("\\+", "\\*","\\?",
                           "\\/", "\\(", "\\)", "\\[",
                           "\\]", "\\{", "\\}")) {
            rev.pos.search.item <- gsub(symbol,
                                        paste0("\\", symbol),
                                        rev.pos.search.item)
          }
          rev.pos.search.item <- gsub("\\(",
                                      "", rev.pos.search.item)
          rev.pos.search.item <- gsub("\\)",
                                      "", rev.pos.search.item)
          rev.pos <- suppressWarnings(min(grep(rev.pos.search.item,
                                               rev(keeplayouttxtcontent[1:leg.pos.final]))))
          if (rev.pos == Inf)
            rev.pos <- 2
          keeplayouttxttablelines[i, "tableend.pos"] <- leg.pos.final - rev.pos + 1
        }
      }
    } #end if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
  }

  ## 6.9) Assign table, legend or txt to the each html and txtline ----------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {

    ## Assign table, legend or txt to the each htmlline
    ## initialize the output table
    htmltables <- list()
    ## export htmltables
    if (nrow(htmltablelines) > 0) {
      for (i in 1:nrow(htmltablelines)) {
        if ((htmltablelines[i, "detected.in"] == "txtonly.notabledetected") ||
            (htmltablelines[i, "detected.in"] == "txtonly")) {
          htmltables[[i]] <- as.character(htmltablelines[i, "tableheading"])
          names(htmltables)[i] <- as.character(htmltablelines[i, "tableheading"])
          print_message <- paste0("The following table was detected but not processable for extraction: ",
                                  htmltablelines[i, "tableheading"])
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          ## create empty table
          if (write.tab.doc.file == TRUE) {
            ## write the mock table
            tableheader <- as.character(htmltablelines[i, "tableheading"])
            for (symbol in c("\\\\", "\\/",
                             "\\:", "\\*", "\\?",
                             "\"", ">", "<", "\\|", "\\a")) {
              tableheader <- gsub(symbol,"_", tableheader)
            }
            ## write table file
            dir.create(paste0(out,"/tables"), showWarnings = FALSE)
            outputtable.name.part <- paste0(out,"/tables/",id,
                                            "_")
            outputtable.name <- gsub(" ","_",paste0(out,"/tables/",id,
                                                    "_", i, "_",
                                                    tableheader))
            if (nchar(outputtable.name.part) > 100) {
              print_message <- paste0("The file path of ",paste0(outputtable.name,
                                                                 out.table.ext),
                                      " might be too long to be read by some programs. Consider using a shorter output path.")
              out_msg <- c(out_msg, print_message)
              if (verbose) cat(utils::tail(out_msg,1), sep="\n")
              update_progress_info(paste0(outputtable.name,
                                          out.table.format,
                                          " file path maybe too long."))
              utils::write.table(print_message, file = paste0(outputtable.name,
                                                                             out.table.ext),
                                 sep = out.table.separator,
                                 row.names = FALSE, col.names = FALSE,
                                 na = "")
            } else {
              utils::write.table(print_message, file = paste0(substr(outputtable.name, 1, 100),
                                                                             out.table.ext),
                                 sep = out.table.separator,
                                 row.names = FALSE, col.names = FALSE,
                                 na = "")
            }
          } ##end if if (write.tab.doc.file == TRUE) {
          next
        }
        ## save the current page
        p <- htmltablelines[i, "page"]
        ## copy the page as currenttable
        currenttable <- cbind(txthtmlcontent[[p]],
                              left = htmlcontent[[p]][, "left"],
                              top = htmlcontent[[p]][, "top"],
                              layout = NA)
        ## add legend info to layout
        currenttable[htmltablelines[i, "tableend.pos"]:htmltablelines[i, "legendend.pos"], "layout"] <- "legend"
        ## add table info to layout
        currenttable[htmltablelines[i, "tablestart.pos"]:htmltablelines[i, "tableend.pos"], "layout"] <- "table"
        outtable <- currenttable[htmltablelines[i, "tablestart.pos"]:htmltablelines[i, "legendend.pos"], ]
        htmltables[[i]] <- outtable
        names(htmltables)[i] <- as.character(htmltablelines[i, "tableheading"])
      }
    } else {
      htmltables <- list()
    }

    ## assign table, legend or txt to the each txtline
    ##

    ## initialize outtable
    outtable <- cbind(txtcontent, layout = NA,
                      rownumber = 1:length(txtcontent))
    ## export htmltables
    if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
      for (i in 1:nrow(txttablelines)) {
        ## add legend info to layout
        outtable[txttablelines[i, "tableend.pos"]:txttablelines[i, "legendend.pos"], "layout"] <- "legend"
        ## add table info to layout
        outtable[txttablelines[i, "tablestart.pos"]:txttablelines[i, "tableend.pos"], "layout"] <- "table"
      }
    }
    outtable[is.na(outtable[, "layout"]),
             "layout"] <- "txt"

    txtlines <- outtable[(outtable[, "layout"] == "txt"), ]
  } else if (filterwords.go == TRUE &&
             integrity.indicator == TRUE) {
    print_message <- paste0("\'",id,".pdf\' has no tables that could be detected")
    out_msg <- c(out_msg, print_message)
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    update_progress_info(print_message)
    
    if (write.tab.doc.file == TRUE) {
      dir.create(paste0(out,"/no_tab"), showWarnings = FALSE)
      utils::write.table("no table found",
                         paste0(out,"/no_tab/",id, "_no.table",
                                out.table.ext),
                         sep = out.table.separator,
                         row.names = FALSE, col.names = FALSE,
                         na = "")
    }
    
    ## all lines are txt lines (no table or legend)
    if (!is.null(txttablelines)){
      if (nrow(txttablelines) > 0){
        txttablelines[, "detected.in"] <- "txtonly"
      }
    }
    htmltablelines <- NULL
    outtable <- cbind(txtcontent, layout = "txt",
                      rownumber = 1:length(txtcontent))
    txtlines <- outtable[(outtable[, "layout"] == "txt"), ]
  }

  ## 7.1) Second search of search words in htmllines -----------------------------------------------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0&&
      (whattoextr == "tabandtxt" ||
       whattoextr == "tab" ||
       whattoextr == "table" ||
       whattoextr == "txtandtab")) {
    ## search for tables with search word in html to
    ## differentiate table content from sentences ##
    ## if column does not exist
    if (!"searchwords.found" %in% colnames(htmltablelines)) htmltablelines <- cbind(htmltablelines,
                                                                                    searchwords.found = NA)
    word.table <- NULL
    ## start search only when search words are in the
    ## list
    if (!is.na(search.words[1, "words"])) {
      ## go through each word
      for (i in 1:nrow(search.words)) {
        ## search for tables with searchword ##
        word <- search.words[i, "words"]
        ignore.case.sw <- search.words[i,
                                       "ignore.case.sw"]
        word.table <- c(word.table,
                        grep(word, htmltables, ignore.case = ignore.case.sw))
        word.table <- unique(word.table)
      }

      ## choose tables with search word
      processed.tables <- htmltables[word.table]
      names(processed.tables) <- names(htmltables)[word.table]
      if (length(word.table) > 0) {
        htmltablelines[word.table, "searchwords.found"] <- "YES"
        htmltablelines[-word.table, "searchwords.found"] <- "NO"
      } else {
        htmltablelines["searchwords.found"] <- "NO"
      }

      ## add all tables that have the word 'continue' in
      ## it and follow a table with the search word
      n <- 1
      while (n < nrow(htmltablelines)) {
        ## if current row is having search word and next
        ## table not
        if ((htmltablelines[n, "searchwords.found"] == "YES") && (htmltablelines[n + 1, 
                                                                                 "searchwords.found"] == "NO")) {
          ## if next table includes 'continue'
          if (grepl("continue", htmltables[n + 1], ignore.case = "TRUE")) {
            htmltablelines[n + 1, "searchwords.found"] <- "YES"
          }  ## else it remains NO
        }
        n <- n + 1
      }

    } else {
      processed.tables <- htmltables
      names(processed.tables) <- names(htmltables)
    }
  }

  ## 7.2) Export TABLE only if tab function was called --------------
  ## Gate: search words and filter words were processed, the PDF-to-html
  ## conversion passed the integrity check, at least one table start was
  ## detected, the table-line metadata is non-empty, and the caller asked
  ## for table extraction (any accepted "tab" spelling of whattoextr).
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0 &&
      (whattoextr == "tabandtxt" ||
       whattoextr == "tab" ||
       whattoextr == "table" ||
       whattoextr == "txtandtab")) {

    ## export tables ##

    ## sort processed tables

    ## look if any table with search word was found
    if (!length(processed.tables) == 0) {
      processed.tables <- processed.tables[order(names(processed.tables))]
      ## NOTE(review): 1:length(...) would misbehave for an empty list, but
      ## the enclosing length check guarantees at least one element here;
      ## seq_along() would still be the safer idiom.
      for (t in 1:length(processed.tables)) {
        ## when table was not detected then skip export
        if (is.null(nrow(processed.tables[[t]]))) {
          next
        } else if (nrow(processed.tables[[t]]) < 2) {
          ## table has fewer than 2 rows: export only its caption/header text
          ## write the table
          tableheader <- names(processed.tables)[[t]]
          ## strip characters that are illegal or unsafe in file names
          ## (NOTE(review): "\\a" matches the BEL control character in TRE
          ## regex syntax, not the letter "a" — presumably intended; verify)
          for (symbol in c("\\\\", "\\/",
                           "\\:", "\\*", "\\?",
                           "\"", ">", "<", "\\|", "\\a")) {
            tableheader <- gsub(symbol,"_", tableheader)
          }
          dir.create(paste0(out,"/tables"), showWarnings = FALSE)
          ## path prefix without the table header (used for the length check)
          outputtable.name.part <- paste0(out,"/tables/",id,
                                          "_")
          outputtable.name <- gsub(" ","_",paste0(out,"/tables/",id,
                                                  "_", t, "_",
                                                  tableheader))
          ## If even the prefix exceeds 100 characters, truncating would
          ## destroy the output path/id, so write the full name and warn;
          ## otherwise cap the full file name at 100 characters below.
          if (nchar(outputtable.name.part) > 100) {
            print_message <- paste0("The file path of ",paste0(outputtable.name,
                                                    out.table.ext),
                         " might be too long to be read by some programs. Consider using a shorter output path.")
            out_msg <- c(out_msg, print_message)
            if (verbose) cat(utils::tail(out_msg,1), sep="\n")
            update_progress_info(paste0(outputtable.name,
                                        out.table.format,
                                        " file path maybe too long."))
            utils::write.table(names(processed.tables)[[t]], file = paste0(outputtable.name,
                                                                           out.table.ext),
                               sep = out.table.separator,
                               row.names = FALSE, col.names = FALSE,
                               na = "")
          } else {
            ## truncate the name (not the extension) to 100 characters
            utils::write.table(names(processed.tables)[[t]], file = paste0(substr(outputtable.name, 1, 100),
                                                                           out.table.ext),
                               sep = out.table.separator,
                               row.names = FALSE, col.names = FALSE,
                               na = "")
          }
          next
        }

        ## Convert absolute pixel positions into grid indices: "left" values
        ## become column numbers, "top" values become row numbers. Nearby
        ## positions are merged first (within `dev` pixels for "left",
        ## within 1 pixel for "top") so jittered cells land in one cell.
        for (o in c("left", "top")) {
          ## save the range of orient values
          orient.range <- sort(strtoi(unique(processed.tables[[t]][, o])), decreasing = FALSE)
          ## combine if deviation is first value stays the
          ## same
          ## change.table maps each original position (from) to its merged
          ## representative (to); the smallest value always maps to itself
          change.table <- data.frame(from = orient.range[1],
                                     to = orient.range[1])

          ## only combine for left values and when it is a table with more than 1 column (orient.range > 1)
          if (o == "left" && length(orient.range) > 1) {
            for (i in 2:length(orient.range)) {
              ## NOTE(review): "== || <" is simply <= ; a position within
              ## `dev` pixels of its predecessor is collapsed onto it
              if (orient.range[i] - dev == orient.range[i - 1] ||
                  orient.range[i] - dev < orient.range[i - 1]) {
                change.table <- rbind(change.table,
                                      data.frame(from = orient.range[i],
                                                 to = orient.range[i - 1]))
                orient.range[i] <- orient.range[i - 1]
              } else {
                change.table <- rbind(change.table,
                                      data.frame(from = orient.range[i],
                                                 to = orient.range[i]))
              }
            }  ## end for
          } else if (o == "top" && length(orient.range) > 1) {
            ## same merge for "top", but with a fixed tolerance of 1 pixel
            for (i in 2:length(orient.range)) {
              if (orient.range[i] - 1 == orient.range[i - 1] ||
                  orient.range[i] - 1 < orient.range[i - 1]) {
                change.table <- rbind(change.table,
                                      data.frame(from = orient.range[i],
                                                 to = orient.range[i - 1]))
                orient.range[i] <- orient.range[i - 1]
              } else {
                change.table <- rbind(change.table,
                                      data.frame(from = orient.range[i],
                                                 to = orient.range[i]))
              }
            }  ## end for
          }
          ## delete duplicate lines
          change.table <- unique.data.frame(change.table)
          ## delete duplicates
          orient.range <- sort.int(unique(orient.range))
          ## go through each line and replace the orient value
          ## (the \b word-boundary grep finds the exact numeric match after
          ## implicit coercion of the numeric vectors to character)
          for (l in 1:length(processed.tables[[t]][, o])) {
            orient <- strtoi(processed.tables[[t]][l, o])
            ## first change according to deviation adjustment
            orient <- change.table[grep(paste0("\\b",
                                               orient, "\\b"),
                                        change.table[, 1]), 2]
            ## save the processed orient value: the rank of the merged
            ## position within orient.range is the final row/column index
            processed.tables[[t]][l, o] <- grep(paste0("\\b",
                                                       orient, "\\b"),
                                                orient.range)
          }
        }  ## end orient

        ## make table fuse lines that have the same
        ## coordinates
        ## (after merging, several text fragments may share one (left, top)
        ## grid cell; their texts are concatenated into a single row)
        new.processed.tables <- processed.tables
        li <- 1
        while (li <= nrow(new.processed.tables[[t]])) {
          ## find rows with same left value
          same.left <- which(new.processed.tables[[t]][, "left"] %in% new.processed.tables[[t]][li, "left"])
          ## find rows with same top value
          same.top <- which(new.processed.tables[[t]][, "top"] %in% new.processed.tables[[t]][li, "top"])
          ## this gives at least the current line and the
          ## duplicated
          duplicated.lines <- intersect(same.left,
                                        same.top)
          ## change the currentline when lines with same
          ## coordinates are present
          if (length(duplicated.lines) >
              1) {
            ## make a new current line
            ## (column 1 holds the cell text; join all duplicates with a space)
            new.processed.tables[[t]][li, 1] <- paste(unlist(new.processed.tables[[t]][c(duplicated.lines), 1]), 
                                                      collapse = " ")
            ## delete the other lines
            ## (duplicated.lines[-1] keeps the first occurrence, which just
            ## received the fused text)
            new.processed.tables[[t]] <- new.processed.tables[[t]][-duplicated.lines[-1], ]
          }
          li <- li + 1
        }
        processed.tables <- new.processed.tables
        ## allocate the final grid: rows/cols sized by the largest top/left
        ## index assigned during coordinate normalization
        output.table <- data.frame(matrix(NA,
                                          nrow = max(strtoi(processed.tables[[t]][, "top"])),
                                          ncol = max(strtoi(processed.tables[[t]][, "left"]))))
        ## place every fused cell text at its (top, left) grid position
        for (l in 1:nrow(processed.tables[[t]])) {
          output.table[strtoi(processed.tables[[t]][l, "top"]),
                       strtoi(processed.tables[[t]][l, "left"])] <- processed.tables[[t]][l, 1]
        }
        ## write the table
        tableheader <- names(processed.tables)[[t]]
        for (symbol in