R/PDE.R

Defines functions PDE_reader_i PDE_analyzer PDE_analyzer_i PDE_pdfs2txt_searchandfilter PDE_pdfs2table_searchandfilter PDE_pdfs2table PDE_extr_data_from_pdfs .PDE_extr_data_from_pdf PDE_install_Xpdftools4.02 PDE_check_Xpdf_install PDE_path

Documented in PDE_analyzer PDE_analyzer_i PDE_check_Xpdf_install .PDE_extr_data_from_pdf PDE_extr_data_from_pdfs PDE_install_Xpdftools4.02 PDE_path PDE_pdfs2table PDE_pdfs2table_searchandfilter PDE_pdfs2txt_searchandfilter PDE_reader_i

## PDE: Extract Sentences and Tables from PDF Files.
## Copyright (C) 2020-2023  Erik Stricker
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <https://www.gnu.org/licenses/>.

#' PDE: Extract Tables and Sentences from PDF Files.
#'
#' The package includes two main components: 1) The PDE analyzer performs the
#' sentence and table extraction while 2) the PDE reader allows the
#' user-friendly visualization and quick-processing of the obtained results.
#'
#' @section PDE functions: \code{\link{PDE_analyzer}}, \code{\link{PDE_analyzer_i}},
#'   \code{\link{PDE_extr_data_from_pdfs}}, \code{\link{PDE_pdfs2table}},
#'   \code{\link{PDE_pdfs2table_searchandfilter}},\code{\link{PDE_pdfs2txt_searchandfilter}},
#'   \code{\link{PDE_reader_i}}, \code{\link{PDE_install_Xpdftools4.02}},
#'   \code{\link{PDE_check_Xpdf_install}}
#'
#' @docType package
#' @name PDE
NULL
#> NULL

## 1.4.6

## TODO save progress with tsv file
## TODO export as excel file
## TODO export column width

## declare global variables
PDE.globals <- new.env()
PDE.globals$jumpto.list <- list()
PDE.globals$le.progress.textbox <- list()
PDE.globals$mark.list <- list()
PDE.globals$tables.masterlist <- list()
PDE.globals$ttanalyzer <- list()

#' Deprecated functions in package \sQuote{PDE}
#'
#' @description  These functions are provided for compatibility with older versions
#' of \sQuote{PDE} only, and will be defunct at the next release.
#'
#' @details  The following functions are deprecated and will be made defunct; use
#' the replacement indicated below:
#'   \itemize{
#'
#'     \item{PDE_path: \code{system.file(package = "PDE")}}
#'
#'   }
#'
#' @name PDE-deprecated
NULL
#> NULL

#'Export the installation path the PDE (PDF Data Extractor) package
#'
#'\code{PDE_path} is deprecated. Please run system.file(package = "PDE") instead.
#'
#'@return The function returns a potential path for the PDE package. If the PDE
#'tool was not correctly installed it returns "".
#'
#'@export
PDE_path <- function(){
  .Deprecated("system.file(package = \"PDE\")", package= "PDE",old = "PDE_path")
  ## set PDE library location
  out_path <- ""
  for (dir in .libPaths()){
    if (dir.exists(paste0(dir,"/PDE/R"))){
      out_path <- paste0(dir,"/PDE/")
      break
    }
  }
  return(out_path)
}

#'Check if the Xpdftools are installed an in the system path
#'
#'\code{PDE_check_Xpdf_install} runs a version test for pdftotext, pdftohtml and pdftopng.
#'
#'@param sysname String. In case the function returns "Unknown OS" the sysname can be set manually.
#' Allowed options are "Windows", "Linux",  "SunOS" for Solaris, and "Darwin" for Mac. Default: \code{NULL}.
#'@param verbose Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.
#'
#'
#'@return The function returns a Boolean for the installation status and a message in case
#' the commands are not detected.
#'
#'@examples
#'
#' PDE_check_Xpdf_install()
#'
#'@export
PDE_check_Xpdf_install <- function(sysname=NULL, verbose=TRUE){

  ## receive pdftotext, pdftohtml and pdftopng information from config file if it exists

  xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
  dir.create(dirname(xpdf_config_location), recursive = TRUE, showWarnings = FALSE)

  pdftotext_location <- NULL
  pdftohtml_location <- NULL
  pdftopng_location <- NULL

  if (file.exists(xpdf_config_location)){
    pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)
    if (!length(pdftotext_location) > 0){
      if (file.exists(pdftotext_location) == FALSE) pdftotext_location <- NULL
    } else {
      pdftotext_location <- NULL
    }

    pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)
    if (!length(pdftohtml_location) > 0){
      if (file.exists(pdftohtml_location) == FALSE) pdftohtml_location <- NULL
    } else {
      pdftohtml_location <- NULL
    }

    pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
    if (!length(pdftotext_location) > 0){
      if (file.exists(pdftopng_location) == FALSE) pdftopng_location <- NULL
    } else {
      pdftotext_location <- NULL
    }
  }

  # if either the config file does not exist or the xpdf tool files do not exist
  if (!file.exists(xpdf_config_location) ||
      is.null(pdftotext_location) ||
      is.null(pdftohtml_location) ||
      is.null(pdftopng_location)) {

    if (is.null(sysname)) {
      sysname <- Sys.info()["sysname"]
    }

    show_file_path_linux <- function(filename){
      whereis_output <- system(paste0("whereis -b ",filename), intern = TRUE)
      only_dirs <- sub("^ ","",sub(paste0("^",filename,":"),"",whereis_output))
      if (only_dirs == ""){
        return(NULL)
      } else {
        return(strsplit(gsub(" /",";/",only_dirs),split = ";")[[1]])
      }
    }

    show_file_path_solaris <- function(filename){
      whereis_test <- suppressWarnings(tryCatch(system(paste0("/usr/ucb/whereis ",filename),
                                                       , intern = TRUE)[1],
                                                error=function(err) NULL))
      if (length(whereis_test) != 0) {
        whereis_output <- system(paste0("/usr/ucb/whereis ",filename), intern = TRUE)
        only_dirs <- sub("^ ","",sub(paste0("^",filename,":"),"",whereis_output))
        if (only_dirs == ""){
          return(NULL)
        } else {
          return(strsplit(gsub(" /",";/",only_dirs),split = ";")[[1]])
        }
      } else {
        return(NULL)
      }
    }

    if (sysname == "Windows") {
      pdftotext_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftotext", intern = TRUE))
      if (length(attributes(pdftotext_location)$status) > 0) pdftotext_location <- NULL
      
      pdftohtml_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftohtml", intern = TRUE))
      if (length(attributes(pdftohtml_location)$status) > 0) pdftohtml_location <- NULL
      
      pdftopng_location <- suppressWarnings(system("C:\\WINDOWS\\system32\\cmd.exe /c where pdftopng", intern = TRUE))
      if (length(attributes(pdftopng_location)$status) > 0) pdftopng_location <- NULL
    } else if (sysname == "Linux") {
      pdftotext_location <- show_file_path_linux("pdftotext")

      pdftohtml_location <- show_file_path_linux("pdftohtml")

      pdftopng_location <- show_file_path_linux("pdftopng")

    } else if (sysname == "SunOS") {
      pdftotext_location <- show_file_path_solaris("pdftotext")

      pdftohtml_location <- show_file_path_solaris("pdftohtml")

      pdftopng_location <- show_file_path_solaris("pdftopng")

    } else if (sysname == "Darwin") {

      pdftotext_location <- suppressWarnings(system("which -a pdftotext", intern = TRUE))

      pdftohtml_location <- suppressWarnings(system("which -a pdftohtml", intern = TRUE))

      pdftopng_location <- suppressWarnings(system("which -a pdftopng", intern = TRUE))
    } else{
      stop("Unknown OS. Please set sysname option.")
    }
  }

  out <- TRUE
  files <- NULL

  if (length(pdftotext_location) == 0) {
    files <- c(files,"pdftotext")
    out=FALSE
  }
  if (length(pdftohtml_location) == 0) {
    files <- c(files,"pdftohtml")
    out=FALSE
  }
  if (length(pdftopng_location) == 0) {
    files <- c(files,"pdftopng")
    out=FALSE
  }

  ## if the command line tools where all detected
  if (out == TRUE) {
    ## test pdftotext version
    pdftotext_path <- NULL
    pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
    keeplayouttxtpath <- paste0(dirname(pdfpath),"/test_txt/test_keeplayout.txt")
    for (i in 1:length(pdftotext_location)){
      dir.create(dirname(keeplayouttxtpath))
      status <- suppressWarnings(system(paste0("\"", pdftotext_location[i], "\" \"", "-layout",
                                               "\" \"", pdfpath, "\" \"", keeplayouttxtpath,
                                               "\""),
                                        wait = TRUE, ignore.stderr = TRUE, intern = TRUE))
      if (file.exists(keeplayouttxtpath)) {
        unlink(dirname(keeplayouttxtpath), recursive = TRUE)
        pdftotext_path <- pdftotext_location[i]
        break
      }

      unlink(dirname(keeplayouttxtpath), recursive = TRUE)
    }

    ## test pdftohtml version
    pdftohtml_path <- NULL
    pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
    htmlpath <- paste0(dirname(pdfpath),"/test_html/test.html")
    for (i in 1:length(pdftohtml_location)){
      dir.create(dirname(htmlpath))
      status <- system(paste0("\"",pdftohtml_location[i],"\" \"", pdfpath,
                              "\" \"", htmlpath, "\""), wait = TRUE,
                       ignore.stderr = TRUE, intern = TRUE)
      if (dir.exists(htmlpath) && file.exists(paste0(htmlpath, "/index.html"))) {
        unlink(dirname(htmlpath), recursive = TRUE)
        pdftohtml_path <- pdftohtml_location[i]
        break
      }
      unlink(dirname(htmlpath), recursive = TRUE)
    }

    ## test pdftopng
    pdftopng_path <- NULL
    pdfpath <- paste0(system.file(package = "PDE"),"/examples/Methotrexate/29973177_!.pdf")
    pngpath <- paste0(dirname(pdfpath),"/test_png/test.png")
    for (i in 1:length(pdftopng_location)){
      dir.create(dirname(pngpath))
      status <- suppressWarnings(system(paste0("\"",pdftopng_location[i],"\" \"",
                                               "-f", "\" \"", 1, "\" \"", "-l",
                                               "\" \"", 1, "\" \"", pdfpath, "\" \"",
                                               pngpath,"\""),
                                        wait = TRUE, ignore.stderr = TRUE, intern = TRUE))
      if (file.exists(sub("test.png$","test.png-000001.png",pngpath))) {
        unlink(dirname(pngpath), recursive = TRUE)
        pdftopng_path <- pdftopng_location[i]
        break
      }
      unlink(dirname(pngpath), recursive = TRUE)
    }

    if (length(pdftotext_path) > 0 &&
        length(pdftohtml_path) > 0 &&
        length(pdftopng_path) > 0) {
      write(paste(pdftotext_path,pdftohtml_path,pdftopng_path, sep = "\n"),
            file = xpdf_config_location)
      attributes(out) <- list(msg = "Correct version of Xpdf command line tools is installed.")
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    } else {
      if (length(pdftotext_path) == 0) {
        files <- c(files,"pdftotext")
        out=FALSE
      }
      if (length(pdftohtml_path) == 0) {
        files <- c(files,"pdftohtml")
        out=FALSE
      }
      if (length(pdftopng_path) == 0) {
        files <- c(files,"pdftopng")
        out=FALSE
      }

      msg1 <- paste(" installed. Please install the Xpdf command line tools",
                    "using PDE_install_Xpdftools4.02()")

      if (length(files) == 1) {
        out.file <- files
        attributes(out) <- list(msg = paste0("The wrong version of the ",
                                             out.file, " file is",msg1))
        if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
      } else if (length(files) == 2) {
        out.file <- paste0(files[1], " and ", files[2])
        attributes(out) <- list(msg = paste0("The wrong version of the ",
                                             out.file, " files are",msg1))
        if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
      } else if (length(files) == 3) {
        out.file <- paste0(files[1], ", ", files[2], " and " , files[3])
        attributes(out) <- list(msg = paste0("The wrong version of the ",
                                             out.file, " files are",msg1))
        if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
      }

    }

    ## if one or more command line tools where not detected
  } else {
    msg1 <- paste(" not detected. Please install the Xpdf command line tools again",
                  "using PDE_install_Xpdftools4.02()")

    if (length(files) == 1) {
      out.file <- files
      attributes(out) <- list(msg = paste0(out.file, " file",msg1))
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    } else if (length(files) == 2) {
      out.file <- paste0(files[1], " and ", files[2])
      attributes(out) <- list(msg = paste0(out.file, " files",msg1))
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    } else if (length(files) == 3) {
      out.file <- paste0(files[1], ", ", files[2], " and " , files[3])
      attributes(out) <- list(msg = paste0(out.file, " files",msg1))
      if (verbose == TRUE) cat(attributes(out)$msg, sep="\n")
    }
  }

  return(out)
}

#'Install the Xpdf command line tools 4.02
#'
#'\code{PDE_install_Xpdftools4.02} downloads and installs the XPDF command line tools 4.02.
#'
#'@param sysname String. In case the function returns "Unknown OS" the sysname can be set manually.
#' Allowed options are "Windows", "Linux", "SunOS" for Solaris, and "Darwin" for Mac. Default: \code{NULL}.
#'@param bin String. In case the function returns "Unknown OS" the bin of the operational system
#' can be set manually. Allowed options are "64", and "32". Default: \code{NULL}.
#'@param verbose Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.
#'@param permission Numerical. If set to 0 the user is ask for a permission to
#' download Xpdftools. If set to 1, no user input is required. Default: \code{0}.
#'
#'
#'@return The function returns a Boolean for the installation status and a message in case
#' the commands are not installed.
#'
#'@examples
#' \dontrun{
#'
#' PDE_install_Xpdftools4.02()
#'
#' }
#'
#'
#'@export
PDE_install_Xpdftools4.02 <- function(sysname=NULL, bin=NULL, verbose=TRUE, permission = 0){
  ## check if Xpdftools are installed
  install.test <- PDE_check_Xpdf_install(verbose=FALSE)
  downloadq <- FALSE
  installq <- FALSE
  out_msg <- NULL
  out <- NULL

  xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
  dir.create(dirname(xpdf_config_location), recursive = TRUE, showWarnings = FALSE)

  ## set xpdf library location
  xpdf_bin_path <- paste0(system.file(package = "PDE"),"/bin")
  if (is.null(sysname)) sysname <- Sys.info()["sysname"]

  download.test <- FALSE
  if (sysname == "Windows") {
    if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02"))) download.test <- TRUE
  } else if (sysname == "Linux" || sysname == "SunOS") {
    if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02"))) download.test <- TRUE
  } else if (sysname == "Darwin") {
    if (dir.exists(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02"))) download.test <- TRUE
  } else {
    stop("Unknown OS. Please set sysname option.")
  }

  if (is.null(bin)){
    if (grepl("32",Sys.info()[["machine"]])) {
      bin <- "32"
    } else if (grepl("64",Sys.info()[["machine"]])) {
      bin <- "64"
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
  }

  if (bin != "64" && bin != "32"){
    stop("Unknown OS. Please set bin option.")
  }

  ## determine operating system and download correct xpdf
  if (permission == 0){
    if (download.test == FALSE){
      ## determine operating system and download correct Xpdf command line tools
      downloadq <- utils::menu(c("Y", "N"), title="Do you want to download and install xpdf version 4.02? (y/n)") == 1
      installq <- downloadq
    } else {
      downloadq <- utils::menu(c("Y", "N"), title=paste("Xpdf command line tools 4.02 are already downloaded.",
                                                        "Do you want to download the Xpdf command line tools version 4.02 again? (y/n)")) == 1
      if (install.test == TRUE){
        installq <- utils::menu(c("Y", "N"), title=paste("Working versions of Xpdf command line tools are already installed.",
                                                         "Do you want to still (re)install",
                                                         "the Xpdf command line tools version 4.02? (y/n)")) == 1
      } else {
        installq <- utils::menu(c("Y", "N"), title=paste("Do you want to also install",
                                                         "the Xpdf command line tools version 4.02? (y/n)")) == 1
      }
    }
  } else {
    downloadq <- TRUE
    installq <- TRUE
  }

  if (downloadq == TRUE){

    if (sysname == "Windows") {
      utils::download.file("https://raw.githubusercontent.com/erikstricker/PDE/master/inst/examples/bin/xpdf-tools-win-4.02.zip",
                           destfile = paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip"),
                           mode = "wb")
      utils::unzip(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip"),exdir = xpdf_bin_path)
      remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02.zip")))
      download.test <- TRUE
    } else if (sysname == "Linux" || sysname == "SunOS") {
      utils::download.file("https://raw.githubusercontent.com/erikstricker/PDE/master/inst/examples/bin/xpdf-tools-linux-4.02.tar.gz",
                           destfile = paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz"),
                           mode = "wb")
      utils::untar(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz"),exdir = xpdf_bin_path)
      remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02.tar.gz")))
      download.test <- TRUE
    } else if (sysname == "Darwin") {
      utils::download.file("https://raw.githubusercontent.com/erikstricker/PDE/master/inst/examples/bin/xpdf-tools-mac-4.02.tar.gz",
                           destfile = paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz"))
      utils::untar(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz"),exdir = xpdf_bin_path)
      remove.status <- suppressWarnings(file.remove(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02.tar.gz")))
      download.test <- TRUE
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
  }

  if (download.test == TRUE){
    if (sysname == "Windows") {
      filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-win-4.02/bin",bin))
      ext <- ".exe"
    } else if (sysname == "Linux" || sysname == "SunOS") {
      filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-linux-4.02/bin",bin))
      ext <- ""
    } else if (sysname == "Darwin") {
      filepath <- normalizePath(paste0(xpdf_bin_path,"/xpdf-tools-mac-4.02/bin",bin))
      ext <- ""
    } else {
      stop("Unknown OS. Please set sysname option.")
    }
    out_msg <- c(out_msg,paste0("Location of Xpdf command line tools 4.02: ",filepath))
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    attributes(out) <- list(msg = out_msg,path=filepath)

    ## "Installation"
    if (installq == TRUE){

      pdftotext_path <- normalizePath(paste0(filepath,"/pdftotext",ext))

      pdftohtml_path <- normalizePath(paste0(filepath,"/pdftohtml",ext))

      pdftopng_path <- normalizePath(paste0(filepath,"/pdftopng",ext))

      write(paste(pdftotext_path,pdftohtml_path,pdftopng_path, sep = "\n"),
            file = xpdf_config_location)

      out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were successfully installed.")
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      attributes(out) <- list(msg = out_msg,path=filepath)
      out <- TRUE
    } else {
      out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were not installed.")
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      attributes(out) <- list(msg = out_msg,path=filepath)
      out <- FALSE
    }
  } else {
    out_msg <- c(out_msg,"The Xpdf command line tools 4.02 were not downloaded.")
    attributes(out) <- list(msg = out_msg,path="")
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    out <- FALSE
  }

  return(out)
}

#'Extracting data from a PDF (Protable Document Format) file
#'
#'\code{PDE_extr_data_from_pdf} extracts sentences or tables from a single PDF
#'file and writes output in the corresponding folder.
#'
#'@param pdf String. Path to the PDF file to be analyzed.
#'@param whattoextr String. Either \emph{txt}, \emph{tab}, or \emph{tabandtxt}
#'  for PDFS2TXT (extract sentences from a PDF file) or PDFS2TABLE (table of a PDF
#'  file to a Microsoft Excel file) extraction. \emph{tab} allows the extraction
#'  of tables with and without search words while \emph{txt} and \emph{tabandtxt}
#'  require search words.
#'@param out String. Directory chosen to save analysis results in. Default:
#'  \code{"."}.
#'@param filter.words List of strings. The list of filter words. If not
#'  \code{NA} or \code{""} a hit will be counted every time a word from the list
#'  is detected in the article.
#'   Default: \code{""}.
#'@param regex.fw Logical. If TRUE filter words will follow the regex rules
#' (see \url{https://github.com/erikstricker/PDE/blob/master/inst/examples/cheatsheets/regex.pdf}).
#'  Default = \code{TRUE}.
#'@param ignore.case.fw Logical. Are the filter words case-sensitive (does
#'  capitalization matter)? Default: \code{FALSE}.
#'@param filter.word.times Numeric or string. Can either be expressed as absolute number or percentage
#'  of the total number of words (by adding the "%" sign). The minimum number of hits described for
#'  \code{filter.words} for a paper to be further analyzed. Default: \code{0.2\%}.
#'@param table.heading.words List of strings. Different than standard (TABLE,
#'  TAB or table plus number) headings to be detected. Regex rules apply (see
#'  also
#'  \url{https://github.com/erikstricker/PDE/blob/master/inst/examples/cheatsheets/regex.pdf}).
#'   Default = \code{""}.
#'@param ignore.case.th Logical. Are the additional table headings (see
#'  \code{table.heading.words}) case-sensitive (does capitalization matter)?
#'  Default = \code{FALSE}.
#'@param search.words List of strings. List of search words. To extract all
#'  tables from the PDF file leave \code{search.words = ""}.
#'@param search.word.categories List of strings. List of categories with the
#'  same length as the list of search words. Accordingly, each search word can be
#'  assigned to a category, of which the word counts will be summarized in the
#'  \code{PDE_analyzer_word_stats.csv} file. If search.word.categories is a
#'  different length than search.words the parameter will be ignored.
#'  Default: \code{NULL}.
#'@param save.tab.by.category Logical. Can only be used with search.word.categories.
#'  If set to TRUE, tables that carry search words will be saved in sub-folders 
#'  according to the search word category of the detected search word.
#'  Default: \code{FALSE}.
#'@param regex.sw Logical. If TRUE search words will follow the regex rules
#' (see \url{https://github.com/erikstricker/PDE/blob/master/inst/examples/cheatsheets/regex.pdf}).
#'  Default = \code{TRUE}.
#'@param ignore.case.sw Logical. Are the search words case-sensitive (does
#'  capitalization matter)? Default: \code{FALSE}.
#'@param eval.abbrevs Logical. Should abbreviations for the search words be
#'  automatically detected and then replaced with the search word + "$*"?
#'  Default: \code{TRUE}.
#'@param out.table.format String. Output file format. Either comma separated
#'  file \code{.csv} or tab separated file \code{.tsv}. The encoding indicated
#'  in parantheses should be selected according to the operational system
#'  exported tables are opened in, i.e., Windows: \code{"(WINDOWS-1252)"}; Mac:
#'  \code{(macintosh)}; Linux: \code{(UTF-8)}. Default: \code{".csv"} and
#'  encoding depending on the operational system.
#'@param dev_x Numeric. For a table the size of indention which would be
#'  considered the same column. Default: \code{20}.
#'@param dev_y Numeric. For a table the vertical distance which would be
#'  considered the same row. Can be either a number or set to dynamic detection
#'  [9999], in which case the font size is used to detect which words are in the
#'  same row.
#'  Default: \code{9999}.
#'@param context Numeric. Number of sentences extracted before and after the
#'  sentence with the detected search word. If \code{0} only the sentence with
#'  the search word is extracted. Default: \code{0}.
#'@param write.table.locations Logical. If \code{TRUE}, a separate file with the
#'  headings of all tables, their relative location in the generated html and
#'  txt files, as well as information if search words were found will be
#'  generated. Default: \code{FALSE}.
#'@param exp.nondetc.tabs Logical. If \code{TRUE}, if a table was detected in a
#'  PDF file but is an image or cannot be read, the page with the table with be
#'  exported as a png. Default: \code{TRUE}.
#'@param write.tab.doc.file Logical. If \code{TRUE}, if search words are used
#'  for table detection and no search words were found in the tables of a PDF
#'  file, a \strong{no.table.w.search.words}. Default: \code{TRUE}.
#'@param write.txt.doc.file Logical. If \code{TRUE}, if no search words were
#'  found in the sentences of a PDF file, a file will be created with the PDF
#'  filename followed by \strong{no.txt.w.search.words}. If the PDF file is
#'  empty, a file will be created with the PDF filename followed by
#'  \strong{no.content.detected}. If the filter word threshold is not met,
#'  a file will be created with the PDF filename followed by
#'  \strong{no.txt.w.filter.words}. Default: \code{TRUE}.
#'@param delete Logical. If \code{TRUE}, the intermediate \strong{txt},
#'  \strong{keeplayouttxt} and \strong{html} copies of the PDF file will be
#'  deleted. Default: \code{TRUE}.
#'@param cpy_mv String. Either "nocpymv", "cpy", or "mv". If filter words are used in the
#'  analyses, the processed PDF files will either be copied ("cpy") or moved ("mv") into the
#'  /pdf/ subfolder of the output folder. Default: \code{"nocpymv"}.
#'@param verbose Logical. Indicates whether messages will be printed in the
#'  console. Default: \code{TRUE}.
#'
#'@return If tables were extracted from the PDF file the function returns a list of
#'  following tables/items: 1) \strong{htmltablelines}, 2)
#'  \strong{txttablelines}, 3) \strong{keeplayouttxttablelines}, 4) \strong{id},
#'  5) \strong{out_msg}.
#'  The \strong{tablelines} are tables that provide the heading and position of
#'  the detected tables. The \strong{id} provide the name of the PDF file. The
#'  \strong{out_msg} includes all messages printed to the console or the suppressed
#'  messages if \code{verbose=FALSE}.
#'
#'@examples
#'
#'## Running a simple analysis with filter and search words to extract sentences and tables
#'if(PDE_check_Xpdf_install() == TRUE){
#'  outputtables <- .PDE_extr_data_from_pdf(pdf = "/examples/Methotrexate/29973177_!.pdf",
#'  whattoextr = "tabandtxt",
#'  out = paste0(system.file(package = "PDE"),"/examples/MTX_output+-0_test/"),
#'  filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
#'  ignore.case.fw = TRUE,
#'  regex.fw = FALSE,
#'  search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
#'  ignore.case.sw = FALSE,
#'  regex.sw = TRUE)
#'}
#'
#'## Running an advanced analysis with filter and search words to
#'## extract sentences and tables and obtain documentation files
#'if(PDE_check_Xpdf_install() == TRUE){
#'  outputtables <- .PDE_extr_data_from_pdf(pdf = paste0(system.file(package = "PDE"),
#'                        "/examples/Methotrexate/29973177_!.pdf"),
#'  whattoextr = "tabandtxt",
#'  out = paste0(system.file(package = "PDE"),"/examples/MTX_output+-1_test/"),
#'  context = 1,
#'  dev_x = 20,
#'  dev_y = 9999,
#'  filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
#'  ignore.case.fw = TRUE,
#'  regex.fw = FALSE,
#'  filter.word.times = "0.2%",
#'  table.heading.words = "",
#'  ignore.case.th = FALSE,
#'  search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
#'  ignore.case.sw = FALSE,
#'  regex.sw = TRUE,
#'  eval.abbrevs = TRUE,
#'  out.table.format = ".csv (WINDOWS-1252)",
#'  write.table.locations = TRUE,
#'  write.tab.doc.file = TRUE,
#'  write.txt.doc.file = TRUE,
#'  exp.nondetc.tabs = TRUE,
#'  cpy_mv = "nocpymv",
#'  delete = TRUE)
#'}
#'
#'@seealso
#'\code{\link{PDE_pdfs2table}},\code{\link{PDE_pdfs2table_searchandfilter}},
#'\code{\link{PDE_pdfs2txt_searchandfilter}}
#'
#'@export
.PDE_extr_data_from_pdf <- function(pdf, whattoextr,
                                    out = ".", filter.words = "", regex.fw = TRUE, ignore.case.fw = FALSE, filter.word.times = "0.2%",
                                    table.heading.words = "", ignore.case.th = FALSE, search.words, search.word.categories = NULL, 
                                    save.tab.by.category = FALSE, regex.sw = TRUE,
                                    ignore.case.sw = FALSE, eval.abbrevs = TRUE, out.table.format = ".csv (WINDOWS-1252)",
                                    dev_x = 20, dev_y = 9999, context = 0,write.table.locations = FALSE, exp.nondetc.tabs = TRUE,
                                    write.tab.doc.file = TRUE,write.txt.doc.file = TRUE, delete = TRUE, cpy_mv = "nocpymv",
                                    verbose = TRUE){

  ## General functions -------------------------------------------
  ## Added re.escape function in v1.4.0 to convert regex to non-regex
  re.escape <- function(string){
    vals <- c("\\\\", "\\[", "\\]", "\\(", "\\)",
              "\\{", "\\}", "\\^", "\\$","\\*",
              "\\+", "\\?", "\\.", "\\|")
    replace.vals <- paste0("\\\\", vals)
    for(i in seq_along(vals)){
      string <- gsub(vals[i], replace.vals[i], string)
    }
    return(string)
  }

  create_id_from_stringlist <- function(list_of_strings, char_len = 8){
    list_of_ints <- NULL
    for (string in list_of_strings){
      final <- 20
      numbers <- utf8ToInt(string) - utf8ToInt("a") + 1L
      for (i in 2:length(numbers)){
        final <- abs(round(mean(c(final, (numbers[[i-1]] %% numbers[[i]] + (numbers[[i]] * 10000))))))
      }
      list_of_ints <- c(list_of_ints, final)
    }
    set.seed(round(mean(list_of_ints)))
    n <- 1
    pool <- c(letters, LETTERS, 0:9)

    res <- character(n) # pre-allocating vector is much faster than growing it
    for(i in seq(n)){
      this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
      while(this_res %in% res){ # if there was a duplicate, redo
        this_res <- paste0(sample(pool, char_len, replace = TRUE), collapse = "")
      }
      res[i] <- this_res
    }
    return(res)

  }

  readin_txt <- function(txtpath) {

    ## read in the txt file
    txtcontent_from_txtpath_lat1 <- readLines(txtpath, warn = FALSE, encoding = "latin1")
    txtcontent_from_txtpath_utf8 <- readLines(txtpath, warn = FALSE, encoding = "UTF-8")
    res_lat1 <- try(sum(nchar(txtcontent_from_txtpath_lat1)),silent = TRUE)
    res_utf8 <- try(sum(nchar(txtcontent_from_txtpath_utf8)),silent = TRUE)
    if (inherits(res_utf8,"try-error")){
      ## if utf8 throws an error
      txtcontent_from_txtpath <- txtcontent_from_txtpath_lat1
    } else if (inherits(res_lat1,"try-error")){
      ## if latin1 throws an error
      txtcontent_from_txtpath <- txtcontent_from_txtpath_utf8
    } else {
      ## latin1 preferred for 4.2.0 xpdf version
      txtcontent_from_txtpath <- txtcontent_from_txtpath_lat1
    }


    for (r in 1:length(txtcontent_from_txtpath)){
      ## replace all fi
      res <- try(gsub(intToUtf8(0xFB01),"fi",txtcontent_from_txtpath[r], fixed = TRUE),silent = TRUE)
      if (inherits(res,"try-error")){
        txtcontent_from_txtpath[r] <- iconv(txtcontent_from_txtpath[r], 'UTF-8', 'latin1', 'bit')
      }
      txtcontent_from_txtpath[r] <- gsub(intToUtf8(0xFB01),"fi",txtcontent_from_txtpath[r], fixed = TRUE)
    }
    return(txtcontent_from_txtpath)

  }


  page_splits <- function(txtcontent_to_split) {

    ## split the content according to pages
    pagesplits <- grep("^\\f", txtcontent_to_split)

    ## if there is only one page or no txtcontent_to_split
    if (length(pagesplits) > 1) {
      ## page split the line before
      g <- rep.int(1, pagesplits[1] - 1)
      for (p in 2:length(pagesplits)) {
        g <- c(g, rep.int(p, pagesplits[p] - pagesplits[p - 1]))
      }
      g <- c(g, length(pagesplits))
      splittxtcontent <- split(txtcontent_to_split, g)
    } else {
      splittxtcontent <- txtcontent_to_split
    }

    return(splittxtcontent)
  }

  find_similar_row <- function(originrow, targettablelines,
                               relative.match.col, output.for.originrow.only,
                               output.for.match, output.column.name) {

    determine_similarity_1_vs_2 <- function(strings){
      ## determine similarity
      matches_fsr <- NULL
      for (pos in 1:length(strings[[1]])) {
        counter_fsr <- 0
        out_fsr <- FALSE
        while (out_fsr == FALSE && counter_fsr < 4) {
          ## test is characters are the same
          if (is.na(strings[[2]][pos + counter_fsr])) break
          if (strings[[1]][pos] == strings[[2]][pos + counter_fsr]) {
            out_fsr <- TRUE
          } else {
            counter_fsr <- counter_fsr + 1
          }
        }  ## end while
        matches_fsr <- c(matches_fsr, out_fsr)
      }
      ## determine how much it matches_fsr
      percent.match <- sum(matches_fsr, na.rm = TRUE)/length(strings[[1]])
      return(percent.match)
    }

    ## set variables
    matchingrow <- NA
    matchpercent <- NULL
    targettablerow <- NULL

    ## check every row in targettablelines
    for (targetrow in 1:nrow(targettablelines)) {
      ## skip lines that do not have matching pages
      if (as.numeric(targettablelines[targetrow, "page"]) != as.numeric(originrow[["page"]])) next
      x <- as.character(originrow[[relative.match.col]])
      y <- as.character(targettablelines[targetrow, relative.match.col])
      if (nchar(x) > nchar(y)) x <- substr(x, 1, nchar(y))
      ## Run one way around
      strings1 <- sapply(list(x, y), strsplit, "")
      strings2 <- sapply(list(y, x), strsplit, "")
      percent.match1 <- determine_similarity_1_vs_2(strings1)
      percent.match2 <- determine_similarity_1_vs_2(strings2)
      percent.match <- max(percent.match1,percent.match2)

      if (percent.match > 0.8) {
        matchpercent <- rbind(matchpercent, c(originrow = 1,
                                              targetrow = targetrow,
                                              percent.match = percent.match))
      }
    }

    ## if multiple tables were on the same page if no
    ## matching table is found
    if (is.null(matchpercent)) {
      originrow[output.column.name] <- output.for.originrow.only
      targettablerow <- NA
      ## if only one row matches_fsr
    } else if (nrow(matchpercent) == 1) {
      originrow[output.column.name] <- output.for.match
      targettablelines[matchpercent[1, "targetrow"],
                       output.column.name] <- output.for.match
      targettablerow <- targettablelines[matchpercent[1, "targetrow"],]
      ## if multiple rows
    } else {
      maxrow <- grep(TRUE, matchpercent[, "percent.match"] %in% max(matchpercent[, "percent.match"]))[1]
      originrow[output.column.name] <- output.for.match
      targettablelines[matchpercent[maxrow, "targetrow"], output.column.name] <- output.for.match
      targettablerow <- targettablelines[matchpercent[maxrow, "targetrow"],]
    }

    return(list(originrow = originrow, targettablelines = targettablelines,
                targettablerow = targettablerow))
  }

  exp_nondetc_tabs <- function(input_table, pdfpath,
                               outputpath, detectinfile,
                               based.on.column, matches_end) {

    ## find all rows that match
    matched.rows <- grep(matches_end, input_table[, based.on.column])

    ## find page from input_table
    pages <- input_table[matched.rows, "page"]

    exp.pages <- unique(pages)

    ## only start function if there are pages to export
    if (length(exp.pages) > 0) {
      dir.create(paste0(outputpath,"/tables"), showWarnings = FALSE)
      for (page in pages) {
        system(paste0("\"",pdftopng_location,"\" \"",
                      "-f", "\" \"", page, "\" \"", "-l",
                      "\" \"", page, "\" \"", pdfpath, "\" \"",
                      outputpath, "/tables/", substr(basename(pdfpath),
                                                     1, regexpr(".pdf", basename(pdfpath)) -
                                                       1), "_page", page, "_w.table",
                      "\""), wait = TRUE, ignore.stderr = TRUE)
      }
    }
  }

  test_if_abbrev_in_parantheses <- function(searchword, paragraph,ignore.case.sw) {
    output <- list(res = TRUE,"","","","")
    ## get the position of the search word in the paragraph
    pos_searchword_start <- regexpr(searchword,paragraph,ignore.case = ignore.case.sw)[[1]]
    pos_searchword_end <- pos_searchword_start + attr(regexpr(searchword,paragraph,
                                                              ignore.case = ignore.case.sw),"match.length") - 1
    simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
    ## test if parantheses are after it +2 for \\
    pos_open <- regexpr("\\(",substr(paragraph, pos_searchword_end + 1, nchar(paragraph)))[[1]]
    pos_abbrev_start <- pos_open + 1 + pos_searchword_end
    pos_close <- regexpr("\\)",substr(paragraph, pos_abbrev_start + 1, nchar(paragraph)))[[1]]
    pos_abbrev_end <- pos_close + pos_abbrev_start - 1
    sentence.end <-  regexpr("\\. [0-9A-Z]",substr(paragraph, pos_searchword_end + 1, nchar(paragraph)))[[1]] +
      pos_searchword_end - 1
    if (sentence.end == -1) sentence.end <- 999999
    ## if both paranthesis were found and there is no end of a sentence
    if (pos_open > 0 &&
        pos_close > 0 &&
        sentence.end > pos_abbrev_end &&
        pos_searchword_start > 0 &&
        pos_searchword_end > 0) {
      pos_ext_searchword_end <- pos_abbrev_start - 3
      ext_searchword <- substr(paragraph, pos_searchword_start, pos_ext_searchword_end)
      ## get all letters within parantheses
      current_char_pos <- pos_abbrev_start
      current_char <- substr(paragraph, current_char_pos, current_char_pos)
      char_list <- NULL
      while (!grepl(" |\\)", current_char)){
        char_list <- c(char_list, current_char)
        current_char_pos <- current_char_pos + 1
        current_char <- substr(paragraph, current_char_pos, current_char_pos)
      }

      char_list <- char_list[!(char_list %in% c("(", ")","[", "]", "/",
                                                "{","}","\\"))]

      ext_searchword <- gsub("[^[:alnum:] ]","",ext_searchword)

      ## test if letter description was found
      if (length(char_list) > 0){
        trunc_searchword <- ext_searchword
        ## match the letters to letters in the searchword
        for (n in 1:length(char_list)){
          pos_char <- regexpr(char_list[n], trunc_searchword, ignore.case = TRUE)[[1]]
          removed.chars <- substr(trunc_searchword, 1, pos_char)
          ## if character was found in search word and abbreviation and
          ## there were no words without character in searchword removed
          if (pos_char > 0 && !length(gregexpr(" ",removed.chars)[[1]]) > 1) {
            trunc_searchword <- substr(trunc_searchword, pos_char + 1, nchar(trunc_searchword))
          } else {
            output <- list(res = FALSE,"","","","")
            break
          }
        }

        ## test if each word in extended searchword has letter in abbreviation
        list.of.words <- gsub("[^A-z]","",
                              strsplit(ext_searchword," ")[[1]])[!(gsub("[^A-z]","",
                                                                        strsplit(ext_searchword," ")[[1]]) %in% "")]
        ## allow one slack
        if (length(list.of.words) - 1 <= length(char_list)){
          result <- TRUE
          trunc_abbrev <- char_list
          ## match the letters to letters in the searchword
          for (n in 1:length(list.of.words)){
            if (length(trunc_abbrev) == 0 && result == TRUE){
              ## one slack
              result <- FALSE
              next
            } else {
              output <- list(res = FALSE,"","","","")
              break
            }
            pos_char <- regexpr(substr(list.of.words[n],1,1), trunc_abbrev, ignore.case = TRUE)[[1]]
            removed.chars <- trunc_abbrev[pos_char]
            ## if character was found in search word and abbreviation and
            ## there were no words without character in searchword removed
            if (pos_char > 0 && !nchar(removed.chars) > 1) {
              trunc_abbrev <- trunc_abbrev[-pos_char]
            } else if (result == TRUE){
              ## one slack
              result <- FALSE
            } else {
              output <- list(res = FALSE,"","","","")
              break
            }
          }
        } else {
          output <- list(res = FALSE,"","","","")
        }

        ## test if letters did match search word
        if (output[[1]]) {
          ## for plural search words
          if (char_list[length(char_list)] == "s" &&
              substr(ext_searchword, nchar(ext_searchword),nchar(ext_searchword)) == "s"){
            abbrev_plural <- paste(char_list, collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,")$*")
            abbrev_singular <- paste(char_list[-length(char_list)], collapse = "")
            simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
            replacement_singular <- paste0(abbrev_singular," (",simple_searchword,")$*")
          } else {
            abbrev_singular <- paste(char_list, collapse = "")
            replacement_singular <- paste0(abbrev_singular," (",ext_searchword,")$*")
            abbrev_plural <- paste(c(char_list,"s"), collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,"s)$*")
          }
          output <- list(res = TRUE,abbrev_singular = abbrev_singular,
                         replacement_singular = replacement_singular,
                         abbrev_plural = abbrev_plural,replacement_plural = replacement_plural)
        }
      } else {
        output <- list(res = FALSE,"","","","")
      }
    } else {
      ## abbrev found, abbrev_singular, replacement_singular, abbrev_plural, replacement_plural
      output <- list(res = FALSE,"","","","")
    }
    return(output)
  }

  test_if_abbrev_double_dots_or_equal <- function(searchword, paragraph, ignore.case.sw) {
    output <- list(res = TRUE,"","","","")
    ## get the position of the search word in the paragraph
    pos_searchword_start <- regexpr(searchword,paragraph, ignore.case = ignore.case.sw)[[1]]
    pos_searchword_end <- pos_searchword_start + attr(regexpr(searchword,paragraph,
                                                              ignore.case = ignore.case.sw),
                                                      "match.length") - 1
    ## test if : or = is before it
    minus_three_chars <- substr(paragraph, (pos_searchword_start - 3),(pos_searchword_start - 1))
    if (grepl("(:|=)", minus_three_chars)) {
      pos_ext_searchword_end <- (regexpr("[^[:alnum:] ]",
                                         substr(paragraph, pos_searchword_end + 1,
                                                pos_searchword_end + 3)))[[1]] +
        pos_searchword_end - 1
      ext_searchword <- substr(paragraph, pos_searchword_start, pos_ext_searchword_end)
      ## get all letters before the : or =
      current_char_pos <- pos_searchword_start - 2 - lengths(regmatches(minus_three_chars,
                                                                        gregexpr(" ", minus_three_chars)))
      current_char <- substr(paragraph, current_char_pos, current_char_pos)
      char_list <- NULL
      while (grepl("[A-z|0-9]", current_char)){
        char_list <- c(current_char, char_list)
        current_char_pos <- current_char_pos - 1
        current_char <- substr(paragraph, current_char_pos, current_char_pos)
      }

      char_list <- char_list[!(char_list %in% c("(", ")","[", "]", "/",
                                                "{","}","\\"))]

      ## test if letter description was found
      if (length(char_list) > 0){
        trunc_searchword <- gsub("[^[:alnum:] ]","",ext_searchword)
        ## match the letters to letters in the searchword
        for (n in 1:length(char_list)){
          pos_char <- regexpr(char_list[n], trunc_searchword, ignore.case = TRUE)[[1]]
          removed.chars <- substr(trunc_searchword, 1, pos_char)
          ## if character was found in search word and abbreviation and
          ## there were no words without character in searchword removed
          if (pos_char > 0 && !length(gregexpr(" ",removed.chars)[[1]]) > 1) {
            trunc_searchword <- substr(trunc_searchword, pos_char + 1, nchar(trunc_searchword))
          } else {
            output <- list(res = FALSE,"","","","")
            break
          }
        }

        ## test if each word in extended searchword has letter in abbreviation
        list.of.words <- gsub("[^A-z]","",
                              strsplit(ext_searchword," ")[[1]])[!(gsub("[^A-z]","",
                                                                        strsplit(ext_searchword,
                                                                                 " ")[[1]]) %in% "")]

        ## allow one slack
        if (length(list.of.words) - 1 <= length(char_list)){
          result <- TRUE
          trunc_abbrev <- char_list
          ## match the letters to letters in the searchword
          for (n in 1:length(list.of.words)){
            pos_char <- regexpr(substr(list.of.words[n],1,1), trunc_abbrev, ignore.case = TRUE)[[1]]
            removed.chars <- trunc_abbrev[pos_char]
            ## if character was found in search word and abbreviation and
            ## there were no words without character in searchword removed
            if (pos_char > 0 && !nchar(removed.chars) > 1) {
              trunc_abbrev <- trunc_abbrev[-pos_char]
            } else if (result == TRUE){
              ## one slack
              result <- FALSE
            } else {
              output <- list(res = FALSE,"","","","")
              break
            }
          }
        } else {
          output <- list(res = FALSE,"","","","")
        }

        ## test if letters did match search word
        if (output[[1]]) {
          ## for plural search words
          if (char_list[length(char_list)] == "s" &&
              substr(ext_searchword, nchar(ext_searchword),nchar(ext_searchword)) == "s"){
            abbrev_plural <- paste(char_list, collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,")$*")
            abbrev_singular <- paste(char_list[-length(char_list)], collapse = "")
            simple_searchword <- substr(paragraph, pos_searchword_start, pos_searchword_end)
            replacement_singular <- paste0(abbrev_singular," (",simple_searchword,")$*")
          } else {
            abbrev_singular <- paste(char_list, collapse = "")
            replacement_singular <- paste0(abbrev_singular," (",ext_searchword,")$*")
            abbrev_plural <- paste(c(char_list,"s"), collapse = "")
            replacement_plural <- paste0(abbrev_plural," (",ext_searchword,"s)$*")
          }
          output <- list(res = TRUE,abbrev_singular = abbrev_singular,
                         replacement_singular = replacement_singular,
                         abbrev_plural = abbrev_plural,replacement_plural = replacement_plural)
        }
      } else {
        output <- list(res = FALSE,"","","","")
      }
    } else {
      ## abbrev found, abbrev_singular, replacement_singular, abbrev_plural, replacement_plural
      output <- list(res = FALSE,"","","","")
    }
    return(output)
  }

  deletefile <- function(verbose=TRUE) {
    out_msg <- NULL

    if (delete == TRUE) {
      ## clean up
      unlink(txtpath, recursive = TRUE)
      if (exists(txtpath)) {
        out_msg <- c(out_msg, paste0("Could not delete:", txtpath))
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      }
      unlink(keeplayouttxtpath, recursive = TRUE)
      if (exists(keeplayouttxtpath)) {
        out_msg <- c(out_msg, paste0("Could not delete:", keeplayouttxtpath))
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      }
      unlink(htmlpath, recursive = TRUE)
      if (exists(htmlpath)){
        if (exists(keeplayouttxtpath)) {
          out_msg <- c(out_msg, paste0("Could not delete:", htmlpath))
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        }
      }
    }
    return(out_msg)
  }

  replace.html.entity <- function(input.with.html) {
    output.without.html <- input.with.html
    output.without.html <- gsub("&amp;","&",output.without.html)
    output.without.html <- gsub("&lt;","<",output.without.html)
    output.without.html <- gsub("&gt;",">",output.without.html)
    output.without.html <- gsub("&#160;"," ",output.without.html)
    output.without.html <- gsub("&Agrave;",intToUtf8(0x00C0),output.without.html)
    output.without.html <- gsub("&Aacute;",intToUtf8(0x00C1),output.without.html)
    output.without.html <- gsub("&Acirc;",intToUtf8(0x00C2),output.without.html)
    output.without.html <- gsub("&Atilde;",intToUtf8(0x00C3),output.without.html)
    output.without.html <- gsub("&Auml;",intToUtf8(0x00C4),output.without.html)
    output.without.html <- gsub("&Aring;",intToUtf8(0x00C5),output.without.html)
    output.without.html <- gsub("&AElig;",intToUtf8(0x00C6),output.without.html)
    output.without.html <- gsub("&Ccedil;",intToUtf8(0x00C7),output.without.html)
    output.without.html <- gsub("&Egrave;",intToUtf8(0x00C8),output.without.html)
    output.without.html <- gsub("&Eacute;",intToUtf8(0x00C9),output.without.html)
    output.without.html <- gsub("&Ecirc;",intToUtf8(0x00CA),output.without.html)
    output.without.html <- gsub("&Euml;",intToUtf8(0x00CB),output.without.html)
    output.without.html <- gsub("&Igrave;",intToUtf8(0x00CC),output.without.html)
    output.without.html <- gsub("&Iacute;",intToUtf8(0x00CD),output.without.html)
    output.without.html <- gsub("&Icirc;",intToUtf8(0x00CE),output.without.html)
    output.without.html <- gsub("&Iuml;",intToUtf8(0x00CF),output.without.html)
    output.without.html <- gsub("&ETH;",intToUtf8(0x00D0),output.without.html)
    output.without.html <- gsub("&Ntilde;",intToUtf8(0x00D1),output.without.html)
    output.without.html <- gsub("&Ograve;",intToUtf8(0x00D2),output.without.html)
    output.without.html <- gsub("&Oacute;",intToUtf8(0x00D3),output.without.html)
    output.without.html <- gsub("&Ocirc;",intToUtf8(0x00D4),output.without.html)
    output.without.html <- gsub("&Otilde;",intToUtf8(0x00D5),output.without.html)
    output.without.html <- gsub("&Ouml;",intToUtf8(0x00D6),output.without.html)
    output.without.html <- gsub("&Oslash;",intToUtf8(0x00D8),output.without.html)
    output.without.html <- gsub("&Ugrave;",intToUtf8(0x00D9),output.without.html)
    output.without.html <- gsub("&Uacute;",intToUtf8(0x00DA),output.without.html)
    output.without.html <- gsub("&Ucirc;",intToUtf8(0x00DB),output.without.html)
    output.without.html <- gsub("&Uuml;",intToUtf8(0x00DC),output.without.html)
    output.without.html <- gsub("&Yacute;",intToUtf8(0x00DD),output.without.html)
    output.without.html <- gsub("&THORN;",intToUtf8(0x00DE),output.without.html)
    output.without.html <- gsub("&szlig;",intToUtf8(0x00DF),output.without.html)
    output.without.html <- gsub("&agrave;",intToUtf8(0x00E0),output.without.html)
    output.without.html <- gsub("&aacute;",intToUtf8(0x00E1),output.without.html)
    output.without.html <- gsub("&acirc;",intToUtf8(0x00E2),output.without.html)
    output.without.html <- gsub("&atilde;",intToUtf8(0x00E3),output.without.html)
    output.without.html <- gsub("&auml;",intToUtf8(0x00E4),output.without.html)
    output.without.html <- gsub("&aring;",intToUtf8(0x00E5),output.without.html)
    output.without.html <- gsub("&aelig;",intToUtf8(0x00E6),output.without.html)
    output.without.html <- gsub("&ccedil;",intToUtf8(0x00E7),output.without.html)
    output.without.html <- gsub("&egrave;",intToUtf8(0x00E8),output.without.html)
    output.without.html <- gsub("&eacute;",intToUtf8(0x00E9),output.without.html)
    output.without.html <- gsub("&ecirc;",intToUtf8(0x00EA),output.without.html)
    output.without.html <- gsub("&euml;",intToUtf8(0x00EB),output.without.html)
    output.without.html <- gsub("&igrave;",intToUtf8(0x00EC),output.without.html)
    output.without.html <- gsub("&iacute;",intToUtf8(0x00ED),output.without.html)
    output.without.html <- gsub("&icirc;",intToUtf8(0x00EE),output.without.html)
    output.without.html <- gsub("&iuml;",intToUtf8(0x00EF),output.without.html)
    output.without.html <- gsub("&eth;",intToUtf8(0x00F0),output.without.html)
    output.without.html <- gsub("&ntilde;",intToUtf8(0x00F1),output.without.html)
    output.without.html <- gsub("&ograve;",intToUtf8(0x00F2),output.without.html)
    output.without.html <- gsub("&oacute;",intToUtf8(0x00F3),output.without.html)
    output.without.html <- gsub("&ocirc;",intToUtf8(0x00F4),output.without.html)
    output.without.html <- gsub("&otilde;",intToUtf8(0x00F5),output.without.html)
    output.without.html <- gsub("&ouml;",intToUtf8(0x00F6),output.without.html)
    output.without.html <- gsub("&oslash;",intToUtf8(0x00F8),output.without.html)
    output.without.html <- gsub("&ugrave;",intToUtf8(0x00F9),output.without.html)
    output.without.html <- gsub("&uacute;",intToUtf8(0x00FA),output.without.html)
    output.without.html <- gsub("&ucirc;",intToUtf8(0x00FB),output.without.html)
    output.without.html <- gsub("&uuml;",intToUtf8(0x00FC),output.without.html)
    output.without.html <- gsub("&yacute;",intToUtf8(0x00FD),output.without.html)
    output.without.html <- gsub("&thorn;",intToUtf8(0x00FE),output.without.html)
    output.without.html <- gsub("&yuml;",intToUtf8(0x00FF),output.without.html)
    output.without.html <- gsub("&iexcl;",intToUtf8(0x00A1),output.without.html)
    output.without.html <- gsub("&cent;",intToUtf8(0x00A2),output.without.html)
    output.without.html <- gsub("&pound;",intToUtf8(0x00A3),output.without.html)
    output.without.html <- gsub("&curren;",intToUtf8(0x00A4),output.without.html)
    output.without.html <- gsub("&yen;",intToUtf8(0x00A5),output.without.html)
    output.without.html <- gsub("&brvbar;",intToUtf8(0x00A6),output.without.html)
    output.without.html <- gsub("&sect;",intToUtf8(0x00A7),output.without.html)
    output.without.html <- gsub("&uml;",intToUtf8(0x00A8),output.without.html)
    output.without.html <- gsub("&copy;",intToUtf8(0x00A9),output.without.html)
    output.without.html <- gsub("&ordf;",intToUtf8(0x00AA),output.without.html)
    output.without.html <- gsub("&laquo;",intToUtf8(0x00AB),output.without.html)
    output.without.html <- gsub("&not;",intToUtf8(0x00AC),output.without.html)
    output.without.html <- gsub("&reg;",intToUtf8(0x00AE),output.without.html)
    output.without.html <- gsub("&macr;",intToUtf8(0x00AF),output.without.html)
    output.without.html <- gsub("&deg;",intToUtf8(0x00B1),output.without.html)
    output.without.html <- gsub("&plusmn;",intToUtf8(0x00B2),output.without.html)
    output.without.html <- gsub("&sup2;",intToUtf8(0x00B3),output.without.html)
    output.without.html <- gsub("&sup3;",intToUtf8(0x00B4),output.without.html)
    output.without.html <- gsub("&acute;",intToUtf8(0x00B5),output.without.html)
    output.without.html <- gsub("&micro;",intToUtf8(0x00B6),output.without.html)
    output.without.html <- gsub("&para;",intToUtf8(0x00B7),output.without.html)
    output.without.html <- gsub("&cedil;",intToUtf8(0x00B8),output.without.html)
    output.without.html <- gsub("&sup1;",intToUtf8(0x00B9),output.without.html)
    output.without.html <- gsub("&ordm;",intToUtf8(0x00BA),output.without.html)
    output.without.html <- gsub("&raquo;",intToUtf8(0x00BB),output.without.html)
    output.without.html <- gsub("&frac14;",intToUtf8(0x00BC),output.without.html)
    output.without.html <- gsub("&frac12;",intToUtf8(0x00BD),output.without.html)
    output.without.html <- gsub("&frac34;",intToUtf8(0x00BE),output.without.html)
    output.without.html <- gsub("&iquest;",intToUtf8(0x00BF),output.without.html)
    output.without.html <- gsub("&times;",intToUtf8(0x00D7),output.without.html)
    output.without.html <- gsub("&divide;",intToUtf8(0x00F7),output.without.html)
    output.without.html <- gsub("&circ;",intToUtf8(0x00FE),output.without.html)
    output.without.html <- gsub("&tilde;",intToUtf8(0x007E),output.without.html)
    output.without.html <- gsub(intToUtf8(0xFB01),"fi",output.without.html, fixed = TRUE)
    return(output.without.html)
  }

  insert.html.entity <- function(input.without.html) {
    output.with.html <- input.without.html
    output.with.html <- gsub(intToUtf8(0x00C0),"&Agrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C1),"&Aacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C2),"&Acirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C3),"&Atilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C4),"&Auml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C5),"&Aring;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C6),"&AElig;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C7),"&Ccedil;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C8),"&Egrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00C9),"&Eacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CA),"&Ecirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CB),"&Euml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CC),"&Igrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CD),"&Iacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CE),"&Icirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00CF),"&Iuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D0),"&ETH;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D1),"&Ntilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D2),"&Ograve;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D3),"&Oacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D4),"&Ocirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D5),"&Otilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D6),"&Ouml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D8),"&Oslash;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D9),"&Ugrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DA),"&Uacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DB),"&Ucirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DC),"&Uuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DD),"&Yacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DE),"&THORN;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00DF),"&szlig;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E0),"&agrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E1),"&aacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E2),"&acirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E3),"&atilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E4),"&auml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E5),"&aring;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E6),"&aelig;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E7),"&ccedil;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E8),"&egrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00E9),"&eacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EA),"&ecirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EB),"&euml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EC),"&igrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00ED),"&iacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EE),"&icirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00EF),"&iuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F0),"&eth;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F1),"&ntilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F2),"&ograve;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F3),"&oacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F4),"&ocirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F5),"&otilde;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F6),"&ouml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F8),"&oslash;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F9),"&ugrave;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FA),"&uacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FB),"&ucirc;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FC),"&uuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FD),"&yacute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FE),"&thorn;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FF),"&yuml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A1),"&iexcl;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A2),"&cent;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A3),"&pound;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A4),"&curren;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A5),"&yen;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A6),"&brvbar;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A7),"&sect;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A8),"&uml;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00A9),"&copy;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AA),"&ordf;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AB),"&laquo;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AC),"&not;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AE),"&reg;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00AF),"&macr;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B1),"&deg;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B2),"&plusmn;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B3),"&sup2;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B4),"&sup3;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B5),"&acute;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B6),"&micro;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B7),"&para;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B8),"&cedil;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00B9),"&sup1;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BA),"&ordm;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BB),"&raquo;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BC),"&frac14;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BD),"&frac12;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BE),"&frac34;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00BF),"&iquest;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00D7),"&times;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00F7),"&divide;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x00FE),"&circ;",output.with.html)
    output.with.html <- gsub(intToUtf8(0x007E),"&tilde;",output.with.html)
    return(output.with.html)
  }

  update_progress_info <- function(print_message){
    if (length(PDE.globals$le.progress.textbox) > 0){
      ## add completion  info
      progress_info_length <- length(tcltk2::tk2list.get(PDE.globals$le.progress.textbox))
      if (progress_info_length > 3) {
        new_list <- tcltk2::tk2list.get(PDE.globals$le.progress.textbox)[!grepl("^$",
                                                                                tcltk2::tk2list.get(PDE.globals$le.progress.textbox))]
      } else {
        new_list <- tcltk2::tk2list.get(PDE.globals$le.progress.textbox)
      }
      tcltk::tkconfigure(PDE.globals$le.progress.textbox,values = c(new_list,print_message))
      tcltk::tkconfigure(PDE.globals$le.progress.textbox,textvariable = tcltk::tclVar(print_message))
      tcltk::tcl("update")
    }
  }

  remove_backref <- function(x) {
    for (s in 1:length(x)){
      string_strsplit <- strsplit(x[s],"")[[1]]
      for (l in 1:length(string_strsplit)){
        if (l != length(string_strsplit)){
          if ((string_strsplit[l] == "\\\\") && (string_strsplit[l+1] %in% as.character(0:9))){
            string_strsplit <- gsub("\\\\","",string_strsplit)
          }
        }
      }
      string_strsplit <- gsub("\\\\","",string_strsplit)
      x[s] <- paste0(string_strsplit,collapse ="")
    }
    return(x)
  }

  ## set all indicator variables ---------------------------
  integrity.indicator <- TRUE ## indicates if txt, keeplayouttxt and html copy of the PDF file are created correctly
  filterwords.go <- FALSE ## indicator if filter words were found or not set
  searchwords.go <- FALSE ## indicator if search words were found or not set
  nexti <- FALSE ## indicator for html table to be processed
  output_files <- NULL ## this is the output to return at the end
  out_msg <- NULL
  sw_in_tab_counted <- FALSE

  ## set statics output -----------------------------------------
  stat_output <- NULL
  pdf_word_count <- 0
  pdf_page_count <- 0
  pdf_filterwords <- NULL
  pdf_filterword_times <- NULL
  pdf_filterword_names <- NULL
  pdf_filterword_total <- NULL
  pdf_searchwords <- NULL
  pdf_searchword_times <- NULL
  pdf_searchword_names <- NULL
  pdf_searchword_total <- NULL
  pdf_sentences_w_searchwords <- NA
  search.word.category_total <- NULL

  ## set the paths of the files ---------------------------------
  output <- NULL
  pdfpath <- pdf
  txtpath <- gsub(".pdf[^.pdf]*$", ".txt", pdfpath)
  keeplayouttxtpath <- gsub(".pdf[^.pdf]*$", "_keeplayout.txt",
                            pdfpath)

  ##make sure filter and search words do not have duplicates
  stop_ind <- FALSE
  if (any(duplicated(search.words))){
    print_message <- paste0("Following search words are duplicated in list: ",
                            paste(search.words[duplicated(search.words)], collapse = ";"),
                            ". Please remove the duplicates and restart the analysis.")
    out_msg <- c(out_msg, print_message)
    update_progress_info(print_message)
    stop_ind <- TRUE
  }
  if (any(duplicated(filter.words))){
    print_message <- paste0("Following search words are duplicated in list: ",
                            paste(filter.words[duplicated(filter.words)], collapse = ";"),
                            ". Please remove the duplicates and restart the analysis.")
    out_msg <- c(out_msg, print_message)
    update_progress_info(print_message)
    stop_ind <- TRUE
  }
  if (stop_ind == TRUE){
    stop("Words were duplicated in the keyword list. Please remove the duplicates and restart the analysis.")
  }


  ## create the id and output dir ----------------------------------
  dir.create(out, showWarnings = FALSE)
  id <- sub("^(.*)\\..*$", "\\1", basename(txtpath))
  print_message <- paste0("Following file is processing: \'",id,".pdf\'")
  out_msg <- c(out_msg, print_message)
  if (verbose) cat(utils::tail(out_msg,1), sep="\n")

  ## 1) Create txt and html copies of PDF file ---------------------------------------
  ## test of Xpdftools are installed
  xpdf_config_location <- paste0(system.file(package = "PDE"),"/bin/XPDF_DIR.config")
  if (file.exists(xpdf_config_location)){
    pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)

    pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)

    pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)

    if (length(file.exists(pdftotext_location)) == 0 ||
        length(file.exists(pdftohtml_location)) == 0 ||
        length(file.exists(pdftopng_location)) == 0){
      install.test <- PDE_check_Xpdf_install(verbose=verbose)
    } else {
      install.test <- TRUE
    }

  } else {
    install.test <- PDE_check_Xpdf_install(verbose=verbose)
  }
  if (install.test == FALSE) {
    if (length(PDE.globals$le.progress.textbox) > 0){
      result_install <- tk_messageBox(type = "yesno",
                    paste0(attributes(install.test)$msg,
                           " Do you want to download and install xpdf now?"), caption = "xpdf not installed")
      if (result_install == "yes"){
        PDE_install_Xpdftools4.02(permission = 1)
        update_progress_info("Please stop and restart the analysis.")
        stop("Please stop and restart the analysis.")
      } else {
        update_progress_info(attributes(install.test)$msg)
        stop(attributes(install.test)$msg)
      }
    } else {
      stop(attributes(install.test)$msg)
    }
  } else {
    pdftotext_location <- grep("pdftotext",readLines(xpdf_config_location), value = TRUE)

    pdftohtml_location <- grep("pdftohtml",readLines(xpdf_config_location), value = TRUE)

    pdftopng_location <- grep("pdftopng",readLines(xpdf_config_location), value = TRUE)
  }

  system(paste0("\"",pdftotext_location,"\" -layout",
                " \"", pdfpath, "\" \"", keeplayouttxtpath,
                "\""), wait = TRUE, ignore.stderr = TRUE)
  system(paste0("\"",pdftotext_location,"\" \"", pdfpath,
                "\" \"", txtpath, "\""), wait = TRUE,
         ignore.stderr = TRUE)
  htmlpath <- gsub(".pdf[^.pdf]*$", ".html", pdfpath)
  ## convert PDF to HTML
  system(paste0("\"",pdftohtml_location,"\" \"", pdfpath,
                "\" \"", htmlpath, "\""), wait = TRUE,
         ignore.stderr = TRUE)

  ## test if keeplayouttxt files were saved with funny name
  fixed_basekeeplayouttxtpath1 <- iconv(basename(keeplayouttxtpath), from = "Windows-1252", to = "UTF-8")
  fixed_keeplayouttxtpath1 <- paste0(dirname(keeplayouttxtpath),"/",fixed_basekeeplayouttxtpath1)
  fixed_basekeeplayouttxtpath2 <- fixed_basekeeplayouttxtpath1
  Encoding(fixed_basekeeplayouttxtpath2) <- "Windows-1252"
  fixed_keeplayouttxtpath2 <- paste0(dirname(keeplayouttxtpath),"/",fixed_basekeeplayouttxtpath2)
  if (file.exists(fixed_keeplayouttxtpath1)){
    response <- file.rename(from=fixed_keeplayouttxtpath1, to=keeplayouttxtpath)
  } else if (file.exists(fixed_keeplayouttxtpath2)){
    response <- file.rename(from=fixed_keeplayouttxtpath2, to=keeplayouttxtpath)
  }

  ## test if txt files were saved with funny name
  fixed_basetxtpath1 <- iconv(basename(txtpath), from = "Windows-1252", to = "UTF-8")
  fixed_txtpath1 <- paste0(dirname(txtpath),"/",fixed_basetxtpath1)
  fixed_basetxtpath2 <- fixed_basetxtpath1
  Encoding(fixed_basetxtpath2) <- "Windows-1252"
  fixed_txtpath2 <- paste0(dirname(txtpath),"/",fixed_basetxtpath2)
  if (file.exists(fixed_txtpath1)){
    response <- file.rename(from=fixed_txtpath1, to=txtpath)
  } else if (file.exists(fixed_txtpath2)){
    response <- file.rename(from=fixed_txtpath2, to=txtpath)
  }

  ## test if files were saved with funny name
  fixed_basehtmlpath1 <- iconv(basename(htmlpath), from = "Windows-1252", to = "UTF-8")
  fixed_htmlpath1 <- paste0(dirname(htmlpath),"/",fixed_basehtmlpath1)
  fixed_basehtmlpath2 <- fixed_basehtmlpath1
  Encoding(fixed_basehtmlpath2) <- "Windows-1252"
  fixed_htmlpath2 <- paste0(dirname(htmlpath),"/",fixed_basehtmlpath2)
  if (dir.exists(fixed_htmlpath1)){
    htmlpath <- fixed_htmlpath1
  } else if (file.exists(fixed_htmlpath2)){
    htmlpath <- fixed_htmlpath2
  }

  ## add completion  info
  update_progress_info(print_message)
  ## 2.1) Check txt and html file integrity ----------------------------------
  integrity.indicator <- TRUE
  ## check if html was created (if pdf is secured)
  if (!dir.exists(htmlpath) || !file.exists(paste0(htmlpath, "/index.html"))) {
    ## export error and do not remove file change
    print_message <- paste0("\'", id, ".pdf\' is most likely secured and cannot be processed!")
    out_msg <- c(out_msg, print_message)
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    update_progress_info(print_message)
    dir.create(paste0(out,"/secured"), showWarnings = FALSE)
    write(paste0(pdfpath, " is most likely secured and cannot be processed!"),
          file = paste0(out,"/secured/",id, "_is_secured.txt"))
    integrity.indicator <- FALSE
    ## if cpymv is either cpy or mv and filterwords were used
    if (cpy_mv == "cpy" && filter.words[1] != ""){
      dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
      dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
      file.copy(pdf, paste0(out,"/pdfs/secured"))
      print_message <- paste0(basename(pdf),
                              " was copied to \'", out,"/pdfs/secured\'.")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
    } else if (cpy_mv == "mv" && filter.words[1] != ""){
      dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
      dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
      file.copy(pdf, paste0(out,"/pdfs/secured"))
      unlink(pdf)
      print_message <- paste0(basename(pdf),
                              " was moved to \'", out,"/pdfs/secured\'.")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
    }
    ## if html was created
  } else {
    ## read in the txt files
    txtcontent <- readin_txt(txtpath)
    ##split rows with double page splits
    doublepagesplits <- grep("^\\f.*\\f", txtcontent)
    counter <- 0
    for (dbps in doublepagesplits){
      new_rows <- strsplit(txtcontent[dbps + counter],"\\f")[[1]][-1]
      for (r in length(new_rows):2){
        txtcontent[dbps + counter] <- paste0("\f",new_rows[1])
        txtcontent <- unlist(append(txtcontent, list(paste0("\f",new_rows[r])), dbps + counter))
        counter <- counter + 1
      }
    }

    splittxtcontent <- page_splits(txtcontent)
    keeplayouttxtcontent <- readin_txt(keeplayouttxtpath)
    indexcontent <- readLines(paste0(htmlpath, "/index.html"))
    ## if the txt or html files have no content
    if (identical(indexcontent, "")) {
      ## export error and do not remove file change
      print_message <- paste0("\'", id, ".pdf\' is most likely secured and cannot be processed!")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
      dir.create(paste0(out,"/secured"), showWarnings = FALSE)
      write(paste0(pdfpath, " is most likely secured and cannot be processed!"),
            file = paste0(out,"/secured/",id, "_is_secured.txt"))
      integrity.indicator <- FALSE
      ## if cpymv is either cpy or mv and filterwords were used
      if (cpy_mv == "cpy" && filter.words[1] != ""){
        dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
        dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
        file.copy(pdf, paste0(out,"/pdfs/secured"))
        print_message <- paste0(basename(pdf),
                                " was copied to \'", out,"/pdfs/secured\'.")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        update_progress_info(print_message)
      } else if (cpy_mv == "mv" && filter.words[1] != ""){
        dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
        dir.create(paste0(out,"/pdfs/secured"), showWarnings = FALSE)
        file.copy(pdf, paste0(out,"/pdfs/secured"))
        unlink(pdf)
        print_message <- paste0(basename(pdf),
                                " was moved to \'", out,"/pdfs/secured\'.")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        update_progress_info(print_message)
      }
    } else if (identical(txtcontent, "") || identical(keeplayouttxtcontent, "") ||
               identical(gsub("\f","",txtcontent), "") || identical(gsub("\f","",keeplayouttxtcontent), "")  ) {
      print_message <- paste0("\'", id, ".pdf\' most likely contains no text content or is a scanned document!")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
      dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
      write(paste0(pdfpath, " most likely contains no text content or is a scanned document!"),
            file = paste0(out, "/nr/", id, "_non-readable.txt"))
      integrity.indicator <- FALSE
      ## if cpymv is either cpy or mv and filterwords were used
      if (cpy_mv == "cpy" && filter.words[1] != ""){
        dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
        dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
        file.copy(pdf, paste0(out,"/pdfs/nr"))
        print_message <- paste0(basename(pdf),
                                " was copied to \'", out,"/pdfs/nr\'.")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        update_progress_info(print_message)
      } else if (cpy_mv == "mv" && filter.words[1] != ""){
        dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
        dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
        file.copy(pdf, paste0(out,"/pdfs/nr"))
        unlink(pdf)
        print_message <- paste0(basename(pdf),
                                " was moved to \'", out,"/pdfs/nr\'.")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        update_progress_info(print_message)
      }
      ## when txtcontent is there and html index was created
    } else {
      ## extract all the page names ##
      pages <- NULL
      for (line in indexcontent) {
        if (grepl("a href=\"", line)) pages <- c(pages,
                                                 substr(line, regexpr("a href=\"", line) + 8,
                                                        regexpr("html", line) + 3))
      }

      ## read in the html file ##
      htmlcontent <- vector("list", length(pages))
      for (i in 1:length(pages)) {
        htmlpagecontent <- readLines(paste0(htmlpath,"/", pages[i]), encoding = "UTF-8", warn = FALSE)
        if (identical(htmlpagecontent, "")) {
          print_message <- paste0("\'", id, ".pdf\' most likely contains no text content or is a scanned in document!")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
          dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
          write(paste0(pdfpath, " most likely contains no text content or is a scanned in document!"),
                file = paste0(out, "/nr/", id, "_non-readable.txt"))
          integrity.indicator <- FALSE
          ## if cpymv is either cpy or mv and filterwords were used
          if (cpy_mv == "cpy" && filter.words[1] != ""){
            dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
            dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
            file.copy(pdf, paste0(out,"/pdfs/nr"))
            print_message <- paste0(basename(pdf),
                                    " was copied to \'", out,"/pdfs/nr\'.")
            out_msg <- c(out_msg, print_message)
            if (verbose) cat(utils::tail(out_msg,1), sep="\n")
            update_progress_info(print_message)
          } else if (cpy_mv == "mv" && filter.words[1] != ""){
            dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
            dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
            file.copy(pdf, paste0(out,"/pdfs/nr"))
            unlink(pdf)
            print_message <- paste0(basename(pdf),
                                    " was moved to \'", out,"/pdfs/nr\'.")
            out_msg <- c(out_msg, print_message)
            if (verbose) cat(utils::tail(out_msg,1), sep="\n")
            update_progress_info(print_message)
          }
          break
        }
        ## replace all fi
        lines_with_fi <- grep(intToUtf8(0xFB01),htmlpagecontent)
        for (line in lines_with_fi){
          res <- try(gsub(intToUtf8(0xFB01),"fi",htmlpagecontent[line], fixed = TRUE),silent = TRUE)
          if (inherits(res,"try-error")){
            htmlpagecontent[line] <- iconv(htmlpagecontent[line], 'UTF-8', 'latin1', 'bit')
          }
          htmlpagecontent[line] <- gsub(intToUtf8(0xFB01),"fi",htmlpagecontent[line], fixed = TRUE)
        }
        ## replace p styles
        ## get the list of p styles
        list_of_fts <- NULL
        lines_with_ft <- grep("\\.ft",htmlpagecontent)
        if (length(lines_with_ft) > 0){
          for (ln in lines_with_ft){
            list_of_fts <- rbind(list_of_fts,
                                 cbind(ft=sub("\\{.*$","",substr(htmlpagecontent[ln],
                                                                 regexpr("\\.ft",htmlpagecontent[ln])[1]+1,
                                                                 nchar(htmlpagecontent[ln]))),
                                       style=sub("\\}.*$","",sub("^.*\\{","",substr(htmlpagecontent[ln],
                                                                                    regexpr("\\.ft",htmlpagecontent[ln])[1]+1,
                                                                                    nchar(htmlpagecontent[ln]))))))
          }
          ## replace each ft style in that page
          for (ft.numb in 1:nrow(list_of_fts)){
            htmlpagecontent <- gsub(paste0("\" class=\"",list_of_fts[ft.numb,1],"\""),
                                    paste0(";",list_of_fts[ft.numb,2],"\""),htmlpagecontent)
          }
        }

        htmlcontent[[i]] <- htmlpagecontent
      }
    } ## end if the txt or html files have no content
  } ## end if (!dir.exists(htmlpath))

  if (integrity.indicator == TRUE) {
    ## if the file is only images or empty then don't process
    realcontent <- gsub("^\\f", "", paste(txtcontent,
                                          collapse = ""))
    realcontent <- gsub("\t", "", realcontent)
    realcontent <- gsub(" ", "", realcontent)
    if (realcontent == "") {
      integrity.indicator <- FALSE
      print_message <- paste0("\'", id, ".pdf\' has no readable content")
      out_msg <- c(out_msg, print_message)
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info(print_message)
      ## write an empty file
      if (write.txt.doc.file == TRUE) {
        dir.create(paste0(out,"/nr/"), showWarnings = FALSE)
        write(paste0(pdfpath, " has no readable content in PDF file"),
              file = paste0(out, "/nr/", id, "_non-readable.txt"))
        integrity.indicator <- FALSE
        ## if cpymv is either cpy or mv and filterwords were used
        if (cpy_mv == "cpy" && filter.words[1] != ""){
          dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
          dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
          file.copy(pdf, paste0(out,"/pdfs/nr"))
          print_message <- paste0(basename(pdf),
                                  " was copied to \'", out,"/pdfs/nr\'.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        } else if (cpy_mv == "mv" && filter.words[1] != ""){
          dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
          dir.create(paste0(out,"/pdfs/nr"), showWarnings = FALSE)
          file.copy(pdf, paste0(out,"/pdfs/nr"))
          unlink(pdf)
          print_message <- paste0(basename(pdf),
                                  " was moved to \'", out,"/pdfs/nr\'.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        }
      }
    }
  }

  ## if there was an issue with creating files
  if (integrity.indicator == FALSE) {
    pdf_word_count <- 0
    pdf_page_count <- 0
    pdf_filterwords <- NULL
    pdf_searchwords <- NULL
    search.word.category_total <- NULL
    if (!filter.words[1] == "") {
      for (i in 1:length(filter.words)) {
        word <- filter.words[i]
        ## add filterwords to output
        if (ignore.case.fw == TRUE){
          ic <- "ic"
        } else {
          ic <- "nic"
        }
        if (regex.fw == TRUE){
          reg <- "regex"
        } else {
          reg <- "nregex"
        }
        pdf_filterwords <- c(pdf_filterwords, paste0("FW_",ic,"_",reg,":",word))
        pdf_filterword_total <- NA
        pdf_filterword_times <- rep(NA,length(pdf_filterwords))
      }
    }
    if (!search.words[1] == "") {
      for (i in 1:length(search.words)) {
        word <- search.words[i]
        ## add filterwords to output
        if (ignore.case.sw == TRUE){
          ic <- "ic"
        } else {
          ic <- "nic"
        }
        if (regex.sw == TRUE){
          reg <- "regex"
        } else {
          reg <- "nregex"
        }
        pdf_searchwords <- c(pdf_searchwords, paste0("SW_",ic,"_",reg,":",word))
        pdf_searchword_total <- NA
        pdf_searchword_times <- rep(NA,length(pdf_searchwords))
        ## set search word category
        search.word.category_total <- NULL
        if (!is.null(search.word.categories) && length(search.words) == length(search.word.categories)){
          for (swc in 1:length(unique(search.word.categories))){
            search.word.category_total[swc] <- sum(pdf_searchword_times[search.word.categories %in% unique(search.word.categories)[swc]], na.rm = TRUE)
          }
        }
      }
    }
    stat_output <- cbind(pdf_word_count = pdf_word_count, pdf_page_count = pdf_page_count,
                         pdf_filterword_total = pdf_filterword_total,
                         pdf_filterword_percentage = paste0(as.character(as.numeric(pdf_filterword_total)/as.numeric(pdf_word_count)*100),"%"),
                         pdf_searchword_total = pdf_searchword_total,rbind(search.word.category_total),
                         rbind(pdf_filterword_times), rbind(pdf_searchword_times))
    extracol_num <- (ncol(stat_output) -
                       length(pdf_filterword_times) -
                       length(pdf_searchword_times) -
                       length(search.word.category_total) + 1)
    colnames(stat_output)[extracol_num:ncol(stat_output)] <- c(unique(search.word.categories),pdf_filterwords,pdf_searchwords)
    rownames(stat_output) <- id
    output_files$stat_output <- data.frame(cbind(stat_output,
                                                 pdf_sentences_w_searchwords = NA),
                                           check.names = FALSE)
    ##if everything is ok
  } else {
    ## determine word count of pdf file
    pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
    pdf_page_count <- length(htmlcontent)

    ## 2.2) Check all the options chosen for PDE analyzer ------------------------------
    ## if general extraction (search words is undefined), then context <- 0
    if (search.words[1] == "" || search.words[1] == "*" ||
        search.words[1] == ".") {
      context <- 0
      search.words <- ""
    }
    if (ignore.case.sw == FALSE) ignore.case.sw <- FALSE
    else if (!ignore.case.sw == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "ignore.case.sw: ignore.case.sw has to be either TRUE or FALSE")
      stop("ignore.case.sw: ignore.case.sw has to be either TRUE or FALSE")
    }
    if (is.na(filter.words[1])) filter.words <- ""
    ## adjust filter.words with regex
    if (regex.fw == FALSE){
      for (fw_pos in 1:length(filter.words)){
        filter.words[fw_pos] <- re.escape(filter.words[fw_pos])
      }
    }
    if (is.null(ncol(filter.words))) filter.word.table <- data.frame(words = filter.words,
                                                                ignore.case.fw = ignore.case.fw)
    # if (!is.numeric(filter.word.times)) {
    #   tcltk::tkmessageBox(title = "Warning",
    #                       type = "ok", icon = "warning",
    #                       message = "filter.word.times: has to be a number")
    #   stop("filter.word.times: has to be a number")
    # }
    if (is.null(ncol(table.heading.words))) table.heading.words <- data.frame(words = table.heading.words,
                                                                              ignore.case.th = ignore.case.th)
    ## adjust search.words with regex
    if (regex.sw == FALSE){
      for (sw_pos in 1:length(search.words)){
        search.words[sw_pos] <- re.escape(search.words[sw_pos])
      }
    }
    if (is.null(ncol(search.words))) search.word.table <- data.frame(words = search.words,
                                                                ignore.case.sw = ignore.case.sw)
    if (write.table.locations == FALSE) write.table.locations <- FALSE
    else if (!write.table.locations == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "write.table.locations: has to be either TRUE or FALSE")
      stop("write.table.locations: has to be either TRUE or FALSE")
    }
    if (is.na(out.table.format)) {
      out.table.format <- ".csv (WINDOWS-1252)"
    } else if (!(out.table.format %in% c(".csv (WINDOWS-1252)", ".csv (macintosh)", ".csv (UTF-8)",
                                         ".tsv (WINDOWS-1252)",".tsv (macintosh)",".tsv (UTF-8)"))) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = paste("out.table.format: has to be either .csv (WINDOWS-1252), .csv (macintosh), .csv (UTF-8)",
                                          "or .tsv (WINDOWS-1252), .tsv (macintosh), .tsv (UTF-8)"))
      stop(paste("out.table.format: has to be either .csv (WINDOWS-1252), .csv (macintosh), .csv (UTF-8)",
                 "or .tsv (WINDOWS-1252), .tsv (macintosh), .tsv (UTF-8)"))
    }
    if (grepl("csv", out.table.format)) {
      out.table.separator <- ","
      out.table.ext <- ".csv"
    }
    if (grepl("tsv", out.table.format)) {
      out.table.separator <- "\t"
      out.table.ext <- ".tsv"
    }
    if (grepl("WINDOWS-1252", out.table.format)) {
      out.encoding <- "WINDOWS-1252"
    } else if (grepl("macintosh", out.table.format)) {
      out.encoding <- "macintosh"
    } else if (grepl("UTF-8", out.table.format)) {
      out.encoding <- "UTF-8"
    } else {
      out.encoding <- "WINDOWS-1252"
    }


    if (!is.numeric(dev_x)) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "dev_x: has to be a number")
      stop("dev_x: has to be a number")  ## deviation between cell positions, might
      ##have to be increased if words that should be in the same column
    }
    if (!is.numeric(context)) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "context: has to be a number")
      stop("context: has to be a number")  ## +/- context number of sentences
      ## before and after the search word was found will be put out
    }
    if (write.tab.doc.file == FALSE) write.tab.doc.file <- FALSE
    else if (!write.tab.doc.file == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "write.tab.doc.file: has to be either TRUE or FALSE")
      stop("write.tab.doc.file: has to be either TRUE or FALSE")
    }
    if (write.txt.doc.file == FALSE) write.txt.doc.file <- FALSE
    else if (!write.txt.doc.file == TRUE) {
      tcltk::tkmessageBox(title = "Warning",
                          type = "ok", icon = "warning",
                          message = "write.txt.doc.file: has to be either TRUE or FALSE")
      stop("write.txt.doc.file: has to be either TRUE or FALSE")
    }
  }

  ## reset variables
  if (integrity.indicator == TRUE) {
    output <- NULL
    htmltablelines <- NULL
    keeplayouttxttablelines <- NULL
    txttablelines <- NULL
  }

  ## 2.3) Make content ----------------------------------------
  if (integrity.indicator == TRUE) {
    content <- list(txtcontent, keeplayouttxtcontent)

    ## make a variable only with the txt content
    txthtmlcontent <- htmlcontent
    ## go through pages
    for (j in 1:length(txthtmlcontent)) {
      ## go through each html line
      for (z in 1:length(txthtmlcontent[[j]])) {
        ## Removing the Table wording
        line <- txthtmlcontent[[j]][z]
        line.txthtmlcontent <- ""
        res <- try(utf8ToInt(line),silent = TRUE)
        if (inherits(res,"try-error")) line <- iconv(line, 'UTF-8', 'latin1', 'byte')
        ##replace superscript
        while (grepl("vertical-align:super",line)){
          super_pos <- regexpr("vertical-align:super",line)[1]
          ## replace the end of the superscript with "]"
          endspan_positions <- gregexpr("</span>",line)[[1]]
          s <- 1
          endspan_pos <- endspan_positions[s] - super_pos
          while (endspan_pos<0){
            s <- s + 1
            endspan_pos <- endspan_positions[s] - super_pos
          }
          endspan_pos <- endspan_pos + super_pos
          line_list <- unlist(strsplit(line, split = ""))
          line_list[endspan_pos] <- "]<"
          line <- paste(line_list,collapse = "")

          ## replace the beginning of the superscript with "^["
          endofstartspan_positions <- gregexpr(";\">",line)[[1]]
          s <- 1
          endofstartspan_pos <- endofstartspan_positions[s] - super_pos
          while (endofstartspan_pos<0){
            s <- s + 1
            endofstartspan_pos <- endofstartspan_positions[s] - super_pos
          }
          endofstartspan_pos <- endofstartspan_pos + super_pos

          line_before_super <- sub("vertical-align:super.*", "", line)
          startspan_pos <- max(gregexpr("<span id=",line_before_super)[[1]])
          line_list <- unlist(strsplit(line, split = ""))
          line_list[endofstartspan_pos+2] <- ">^["
          line <- paste(line_list,collapse = "")
          line <- sub("vertical-align:super", "vertical-align:baseline", line)
          # line <- sub(paste0("^(.{",as.numeric(startspan_pos)-1,"})(.{",as.numeric(endofstartspan_pos-startspan_pos+2),"})."),"\\1^[",
          #             line)
        }
        ##replace subscript
        while (grepl("vertical-align:sub",line)){
          sub_pos <- regexpr("vertical-align:sub",line)[1]
          ## replace the end of the subscript with "]"
          endspan_positions <- gregexpr("</span>",line)[[1]]
          s <- 1
          endspan_pos <- endspan_positions[s] - sub_pos
          while (endspan_pos<0){
            s <- s + 1
            endspan_pos <- endspan_positions[s] - sub_pos
          }
          endspan_pos <- endspan_pos + sub_pos
          line_list <- unlist(strsplit(line, split = ""))
          line_list[endspan_pos] <- "]<"
          line <- paste(line_list,collapse = "")

          ## replace the beginning of the subscript with "_["
          endofstartspan_positions <- gregexpr(";\">",line)[[1]]
          s <- 1
          endofstartspan_pos <- endofstartspan_positions[s] - sub_pos
          while (endofstartspan_pos<0){
            s <- s + 1
            endofstartspan_pos <- endofstartspan_positions[s] - sub_pos
          }
          endofstartspan_pos <- endofstartspan_pos + sub_pos

          line_before_sub <- sub("vertical-align:sub.*", "", line)
          startspan_pos <- max(gregexpr("<span id=",line_before_sub)[[1]])
          line_list <- unlist(strsplit(line, split = ""))
          line_list[endofstartspan_pos+2] <- ">_["
          line <- paste(line_list,collapse = "")

        }
        ##replace different html formating
        line <- gsub("</p>","</span>",line)
        ## replace line break with space (not optimal but better for searching)
        line <- gsub("<br/>"," ",line)
        ## remove residue from hyperlinks
        line <- gsub("<a href=\".*?>","",line)
        line <- gsub("</a>","",line)
        rev.line <- intToUtf8(rev(utf8ToInt(line)))
        for (spanpos in rev(gregexpr(">naps/<",rev.line)[[1]])){
          add.txthtmlcontent <- substr(rev.line,spanpos+7, regexpr(">\"",
                                                                   substr(rev.line,spanpos+7,
                                                                          nchar(rev.line)))+spanpos+7-2)
          line.txthtmlcontent <- paste0(line.txthtmlcontent,intToUtf8(rev(utf8ToInt(add.txthtmlcontent))))
        }
        if (grepl("&",line.txthtmlcontent)){
          txthtmlcontent[[j]][z] <- replace.html.entity(line.txthtmlcontent)
        } else {
          txthtmlcontent[[j]][z] <- line.txthtmlcontent
        }
      }  ## end go through each line z
      content[[(j+2)]] <- txthtmlcontent[[j]]
    }
  }


  ## 3) Evaluate for filter words ---------------------------------------
  list_of_abbrevs <- NULL
  ## 3.1) Filter Search ---------------------------------------
  if (integrity.indicator == TRUE && !filter.word.table[1, "words"] == "") {
    word.txtline.fw <- NULL
    for (i in 1:nrow(filter.word.table)) {
      ## search for lines with filter word in [txtcontent]
      word <- filter.word.table[i, "words"]
      ignore.case.fw <- filter.word.table[i, "ignore.case.fw"]
      detected_line <- grep(word, txtcontent, ignore.case = ignore.case.fw)
      word.txtline.fw <- c(word.txtline.fw,
                           detected_line)

      ## 3.2) Replace abbreviations -----------------------------------------------------
      if (eval.abbrevs == TRUE && length(word.txtline.fw) > 0){
        ## Check if any occurences (heading + text) of the
        ## filter word are defining and abbreviation
        ## go through each txtcontent line that the searchword was found
        for (nth in 1:length(word.txtline.fw)) {
          paragraph <- txtcontent[word.txtline.fw[[nth]]]
          ## identify definitions of abbrev
          occur_double_dots_or_equal <- test_if_abbrev_double_dots_or_equal(searchword = as.character(word),
                                                                            paragraph = paragraph,
                                                                            ignore.case = ignore.case.fw)
          occur_in_parantheses <- test_if_abbrev_in_parantheses(searchword = as.character(word),
                                                                paragraph = paragraph,
                                                                ignore.case = ignore.case.fw)
          ## if abbrev was found in nth occurence
          for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)){
            if (occur$res == TRUE) {
              ## replace if abbrev is not yet defined
              for (abbrev in c("abbrev_singular","abbrev_plural")) {
                if (!(occur[[abbrev]] %in% list_of_abbrevs)){
                  list_of_abbrevs <- c(list_of_abbrevs,occur[[abbrev]])
                  for (c in 1:length(content)){
                    to_find <- paste0("([^A-z|0-9]+|^)",occur[[abbrev]],"([^A-z|0-9|:]$|[^A-z|0-9|:][^=|:]|$)")
                    found_lines <- grep(to_find, content[[c]],
                                        ignore.case = FALSE)
                    for (line in found_lines){
                      found_pos <- gregexpr(to_find, content[[c]][line])[[1]]
                      found_pos_list <- gregexpr(to_find, content[[c]][line])[[1]]
                      add_pos <- 0
                      for (p in 1:length(found_pos)){
                        sub <- substr(content[[c]][line],found_pos[p] + add_pos,nchar(content[[c]][line]))
                        ## prevent double substitution if sub already contains substitution
                        if (grepl(occur[[sub("abbrev","replacement",abbrev)]], sub)) next
                        found_pos_list[p] <- regexpr(occur[[abbrev]],sub) + found_pos[p] + add_pos - 1
                        content[[c]][line] <- paste0(substr(content[[c]][line],1,found_pos_list[p]-1),
                                                     sub(occur[[abbrev]],
                                                         occur[[sub("abbrev","replacement",abbrev)]],
                                                         substr(content[[c]][line],
                                                                found_pos_list[p],nchar(content[[c]][line])),
                                                         ignore.case = FALSE))
                        add_pos <- add_pos + nchar(occur[[sub("abbrev","replacement",
                                                              abbrev)]]) - nchar(occur[[abbrev]])
                        ## correct for sub of abbrev definition e.g. methotrexate (MTX (methotrexate))
                        len_diff <- nchar(content[[c]][line]) -
                          nchar(gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                     paste0("(",occur[[abbrev]],")"),
                                     content[[c]][line] , fixed = TRUE))
                        add_pos <- add_pos - len_diff
                        content[[c]][line]  <- gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                                    paste0("(",occur[[abbrev]],")"),
                                                    content[[c]][line] , fixed = TRUE)
                      } ## end for (p in length(found_pos)){
                    } ## end for (line in found_lines){
                  } ## for (c in 1:length(content)){
                } ##if (!(occur[[abbrev]] %in% list_of_abbrevs)){
              } ## for (abbrev in c("abbrev_singular","abbrev_plural")) {
            } ## if (occur$res) {
          } ## end for (abbrev in c("abbrev_singular","abbrev_plural")) {
        } ## end for (nth in 1:length(word.txtline.fw)) {
      } ## end replace abbreviations in content
    } ## end for each filter word
  } ## if (integrity.indicator == TRUE)

  ## if filter word abbreviations were found replace the abbreviations in content
  if (eval.abbrevs == TRUE && integrity.indicator == TRUE &&
      !filter.word.table[i, "words"] == "" && !is.null(list_of_abbrevs)) {
    txtcontent <- content[[1]]
    keeplayouttxtcontent <- content[[2]]

    ## html content per page
    for (pa in 3:length(content)) txthtmlcontent[[(pa-2)]] <- content[[pa]]

  } ## end replace abbreviations

  ## 3.3) Real filter Search ---------------------------------------
  if (integrity.indicator == TRUE) {
    word.txtline.fw <- NULL
    word.txtpos.fw <- NULL
    pdf_filterwords <- NULL
    ## if there are filter words
    if (!filter.word.table[1, "words"] == "") {
      for (i in 1:nrow(filter.word.table)) {
        ## search for lines with filter word in [txtcontent]
        word <- filter.word.table[i, "words"]
        ignore.case.fw <- filter.word.table[i, "ignore.case.fw"]
        ## add filterwords to output
        if (ignore.case.fw == TRUE){
          ic <- "ic"
        } else {
          ic <- "nic"
        }
        if (regex.fw == TRUE){
          reg <- "regex"
        } else {
          reg <- "nregex"
        }
        pdf_filterwords <- c(pdf_filterwords, paste0("FW_",ic,"_",reg,":",word))
        detected_line <- grep(word, txtcontent, ignore.case = ignore.case.fw)
        word.txtline.fw <- c(word.txtline.fw,
                             detected_line)
        current_word.txtpos.fw <- NULL
        for (li in detected_line){
          current_word.txtpos.fw <- c(current_word.txtpos.fw,
                                      gregexpr(word, txtcontent[li], ignore.case = ignore.case.fw)[[1]])
        }
        pdf_filterword_times <- c(pdf_filterword_times, length(current_word.txtpos.fw))
        pdf_filterword_names <- c(pdf_filterword_names, word)
        pdf_filterword_total <- sum(pdf_filterword_times)
        word.txtpos.fw <- c(word.txtpos.fw,current_word.txtpos.fw)
      }
      ## test if percent or number
      pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
      if (grepl("%",filter.word.times)){
        if (length(word.txtpos.fw)/pdf_word_count >= (as.numeric(sub("%","",filter.word.times))/100)) {
          filterwords.go <- TRUE
          print_message <- paste0(round((length(word.txtpos.fw)/pdf_word_count*100),4),
                                  "% of all words were filter word(s) in ", id, ".pdf.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        } else {
          filterwords.go <- FALSE
        }
      } else {
        if (length(word.txtpos.fw) >= as.numeric(filter.word.times)) {
          filterwords.go <- TRUE
          print_message <- paste0(length(word.txtpos.fw),
                                  " filter word(s) were detected in ", id, ".pdf.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)


        } else {
          filterwords.go <- FALSE
        }
      }

      if (filterwords.go == FALSE){
        pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
        if (grepl("%",filter.word.times)){
          print_message <- paste0("\'",id,".pdf\' was filtered out due to a lack of the filter words. ",
                                  round((length(word.txtpos.fw)/pdf_word_count*100),4),
                                  "% of all words were filter word(s) in ", id, ".pdf.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        } else {
          print_message <- paste0("\'",id,".pdf\' was filtered out due to a lack of the filter words. ",
                                  length(word.txtpos.fw),
                                  " filter word(s) were detected in ", id, ".pdf.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        }
        ## write 0 search words
        if (!search.word.table[1, "words"] == "") {
          for (i in 1:nrow(search.word.table)) {
            word <- search.word.table[i, "words"]
            ## add filterwords to output
            if (ignore.case.sw == TRUE){
              ic <- "ic"
            } else {
              ic <- "nic"
            }
            if (regex.sw == TRUE){
              reg <- "regex"
            } else {
              reg <- "nregex"
            }
            pdf_searchwords <- c(pdf_searchwords, paste0("SW_",ic,"_",reg,":",word))
            pdf_searchword_total <- 0
            pdf_searchword_times <- rep(0,length(pdf_searchwords))
            search.word.category_total <- NULL
            if (!is.null(search.word.categories) && length(search.words) == length(search.word.categories)){
              for (swc in 1:length(unique(search.word.categories))){
                search.word.category_total[swc] <- sum(pdf_searchword_times[search.word.categories %in% unique(search.word.categories)[swc]], na.rm = TRUE)
              }
            }
          }
        }
        if (write.txt.doc.file == TRUE) {
          dir.create(paste0(out,"/excl_by_fw"), showWarnings = FALSE)
          pdf_word_count <- sum(sapply(gregexpr("[[:alpha:]]+", txtcontent), function(x) sum(x > 0)))
          if (grepl("%",filter.word.times)){
            utils::write.table(paste0("Not enough txt lines with filter word found. ",
                                      round((length(word.txtpos.fw)/pdf_word_count*100),4),
                                      "% of words were filter word(s) in ", id, ".pdf."),
                               paste0(out,"/excl_by_fw/",id,"_too_few_fwds",
                                      out.table.ext),
                               sep = out.table.separator, row.names = FALSE,
                               col.names = FALSE, na = "")
          } else {
            utils::write.table(paste0("Not enough txt lines with filter word found. ",
                                      length(word.txtpos.fw),
                                      " filter word(s) were detected in ", id, ".pdf."),
                               paste0(out,"/excl_by_fw/",id,"_too_few_fwds",
                                      out.table.ext),
                               sep = out.table.separator, row.names = FALSE,
                               col.names = FALSE, na = "")
          }
        }
        ## if cpymv is either cpy or mv
        if (cpy_mv == "cpy"){
          dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
          dir.create(paste0(out,"/pdfs/excl_by_fw"), showWarnings = FALSE)
          file.copy(pdf, paste0(out,"/pdfs/excl_by_fw"))
          print_message <- paste0(basename(pdf),
                                  " was copied to \'", out,"/pdfs/excl_by_fw\'.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        } else if (cpy_mv == "mv"){
          dir.create(paste0(out,"/pdfs"), showWarnings = FALSE)
          dir.create(paste0(out,"/pdfs/excl_by_fw"), showWarnings = FALSE)
          file.copy(pdf, paste0(out,"/pdfs/excl_by_fw"))
          unlink(pdf)
          print_message <- paste0(basename(pdf),
                                  " was moved to \'", out,"/pdfs/excl_by_fw\'.")
          out_msg <- c(out_msg, print_message)
          if (verbose) cat(utils::tail(out_msg,1), sep="\n")
          update_progress_info(print_message)
        }
      }  ## end if filter words were present
    } else {
      out_msg <- c(out_msg, "No filter words chosen for analysis.")
      if (verbose) cat(utils::tail(out_msg,1), sep="\n")
      update_progress_info("No filter words chosen for analysis.")
      filterwords.go <- TRUE
      pdf_filterword_total <- NA
      pdf_filterword_times <- NA
      pdf_filterwords <- c("filter_word_list")
    }  ## end if filter words were set
  } ## end 3.3) Filter Search

  ## 4) Search of search words -----------------------------------------------
  ## only if filter words were found or no filter was set continue and
  ## search words were set
  if (filterwords.go == TRUE && integrity.indicator == TRUE) {
    searchwords.go <- FALSE
    ## search for lines with search word
    word.txtline <- NULL
    for (i in 1:nrow(search.word.table)) {
      ## 4.1) Search for lines with search word -------------------------
      word <- search.word.table[i, "words"]
      ignore.case.sw <- search.word.table[i, "ignore.case.sw"]
      word.txtline <- NULL
      word.keeplayoutline <- NULL
      ## if search words were not chosen write all lines in txtline
      if (search.word.table[i, "words"] == ""){
        word.txtline <- 1:length(txtcontent)
        word.keeplayoutline <- 1:length(word.keeplayoutline)
      } else {
        ## search for lines with search word in [txtcontent]
        word.txtline <- grep(word, txtcontent, ignore.case = ignore.case.sw)
        ## search for lines with search word in [keeplayouttxtcontent]
        word.keeplayoutline <- grep(word, keeplayouttxtcontent,
                                    ignore.case = ignore.case.sw)
      }

      ## 4.2) Continue analysis when search words were found ----------------------------
      if (length(word.txtline) > 0) searchwords.go <- TRUE
      ## 4.3) Replace abbreviations -----------------------------------------------------
      if (eval.abbrevs == TRUE && length(word.txtline) > 0 &&
          !search.word.table[i, "words"] == ""){
        ## Check if any occurences (heading + text) of the
        ## search word are defining and abbreviation
        ## go through each txtcontent line that the searchword was found
        for (nth in 1:length(word.txtline)) {
          paragraph <- txtcontent[word.txtline[[nth]]]
          ## identify definitions of abbrev
          occur_double_dots_or_equal <- test_if_abbrev_double_dots_or_equal(searchword = as.character(word),
                                                                            paragraph = paragraph,
                                                                            ignore.case.sw = ignore.case.sw)
          occur_in_parantheses <- test_if_abbrev_in_parantheses(searchword = as.character(word),
                                                                paragraph = paragraph,
                                                                ignore.case.sw = ignore.case.sw)
          ## if abbrev was found in nth occurence
          for (occur in list(occur_double_dots_or_equal, occur_in_parantheses)){
            if (occur$res == TRUE) {
              ## replace if abbrev is not yet defined
              for (abbrev in c("abbrev_singular","abbrev_plural")) {
                if (!(occur[[abbrev]] %in% list_of_abbrevs)){
                  list_of_abbrevs <- c(list_of_abbrevs,occur[[abbrev]])
                  for (c in 1:length(content)){
                    to_find <- paste0("([^A-z|0-9]+|^)",occur[[abbrev]],"([^A-z|0-9|:]$|[^A-z|0-9|:][^=|:]|$)")
                    found_lines <- grep(to_find, content[[c]],
                                        ignore.case = FALSE)
                    for (line in found_lines){
                      found_pos <- gregexpr(to_find, content[[c]][line])[[1]]
                      found_pos_list <- gregexpr(to_find, content[[c]][line])[[1]]
                      add_pos <- 0
                      for (p in 1:length(found_pos)){
                        sub <- substr(content[[c]][line],found_pos[p] + add_pos,nchar(content[[c]][line]))
                        ## prevent double substitution if sub already contains substitution
                        if (grepl(occur[[sub("abbrev","replacement",abbrev)]], sub)) next
                        found_pos_list[p] <- regexpr(occur[[abbrev]],sub) + found_pos[p] + add_pos - 1
                        content[[c]][line] <- paste0(substr(content[[c]][line],1,found_pos_list[p]-1),
                                                     sub(occur[[abbrev]],
                                                         occur[[sub("abbrev","replacement",abbrev)]],
                                                         substr(content[[c]][line],found_pos_list[p],
                                                                nchar(content[[c]][line])),
                                                         ignore.case = FALSE))
                        add_pos <- add_pos + nchar(occur[[sub("abbrev","replacement",abbrev)]]) - nchar(occur[[abbrev]])
                        ## correct for sub of abbrev definition e.g. methotrexate (MTX (methotrexate))
                        len_diff <- nchar(content[[c]][line]) -
                          nchar(gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                     paste0("(",occur[[abbrev]],")"),
                                     content[[c]][line] , fixed = TRUE))
                        add_pos <- add_pos - len_diff
                        content[[c]][line]  <- gsub(paste0("(",occur[[sub("abbrev","replacement",abbrev)]],")"),
                                                    paste0("(",occur[[abbrev]],")"),
                                                    content[[c]][line] , fixed = TRUE)
                      } ## end for (p in length(found_pos)){
                    } ## end for (line in found_lines){
                  } ## for (c in 1:length(content)){
                } ##if (!(occur[[abbrev]] %in% list_of_abbrevs)){
              } ## for (abbrev in c("abbrev_singular","abbrev_plural")) {
            } ## if (occur$res) {
          } ## end for (abbrev in c("abbrev_singular","abbrev_plural")) {
        } ## end for (nth in 1:length(word.txtline)) {
      } ## end replace abbreviations in content
    } ## end for each search word
  } ## if (filterwords.go == TRUE && integrity.indicator == TRUE)

  ## if search words were found replace the abbreviations in content
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      eval.abbrevs == TRUE && integrity.indicator == TRUE &&
      !search.word.table[i, "words"] == "") {
    txtcontent <- content[[1]]
    keeplayouttxtcontent <- content[[2]]

    ## html content per page
    for (pa in 3:length(content)) txthtmlcontent[[(pa-2)]] <- content[[pa]]

  } ## end replace abbreviations

  ## if no search words are detected in document
  if (filterwords.go == TRUE &&
      searchwords.go == FALSE &&
      integrity.indicator == TRUE){
    print_message <- paste0("No text with search words for \'",id,".pdf\' found.")
    out_msg <- c(out_msg, print_message)
    if (verbose) cat(utils::tail(out_msg,1), sep="\n")
    update_progress_info(print_message)
    ## write an empty file
    if (write.txt.doc.file == TRUE) {
      dir.create(paste0(out,"/excl_by_sw"), showWarnings = FALSE)
      utils::write.table("No text line with search word found.",
                         paste0(out,"/excl_by_sw/",id, "_no_txt_w_swds",
                                out.table.ext),
                         sep = out.table.separator, row.names = FALSE,
                         col.names = FALSE, na = "")
    }
  }  ## end if search words were present

  ## 5) Sort the html content ---------------------------------------
  if (filterwords.go == TRUE && integrity.indicator == TRUE) {
    ## add the top and left ##
    for (p in 1:length(txthtmlcontent)) {
      ## if page has one dimension
      if (!"left" %in% colnames(htmlcontent[[p]]))
        htmlcontent[[p]] <- cbind(htmlcontent[[p]],
                                  left = NA)
      if (!"top" %in% colnames(htmlcontent[[p]]))
        htmlcontent[[p]] <- cbind(htmlcontent[[p]],
                                  top = NA)
      ## add font size
      if (!"font_size" %in% colnames(htmlcontent[[p]]))
        htmlcontent[[p]] <- cbind(htmlcontent[[p]],
                                  font_size = NA)
    }
    for (p in 1:length(txthtmlcontent)){
      ## 5.1) Assign top and left values ------------------------------

      start <- grep("^<body>",htmlcontent[[p]][,1])[1] + 1
      if (is.na(start)) start <- 1
      end <- grep("^</body>",htmlcontent[[p]][,1])[1] - 1
      if (is.na(end)) {
        end <- nrow(htmlcontent[[p]])
        print_message <- paste0("Page ", p, " of \'", id, ".html\' was incompletely read.",
                                " This might leader to incomplete table extraction but does not",
                                " affect search word detection")
        out_msg <- c(out_msg, print_message)
        if (verbose) cat(utils::tail(out_msg,1), sep="\n")
        print_message_short <- paste0("Page ", p, " of \'", id, ".html\' was incompletely read",
                                      " (does not affect txt detection though).")
        update_progress_info(print_message_short)
      }

      ## make the copy of htmlcontent for the sorting
      ## single row
      if (start == end){
        out.htmlcontent <- rbind(htmlcontent[[p]][start, ])
        out.txthtmlcontent <- txthtmlcontent[[p]][start]
      } else {
        out.htmlcontent <- htmlcontent[[p]][start:end, ]
        out.txthtmlcontent <- txthtmlcontent[[p]][start:end]
      }

      ## for each line
      for (line.number in 1:(end - start + 1)) {
        ## if content line
        if (grepl("[\"|;]font-size:",
                  out.htmlcontent[line.number, 1])) {
          ## get the left position
          pos.left.start <- regexpr("left:",
                                    out.htmlcontent[line.number, 1])[[1]] + 5
          out.htmlcontent[line.number, 1] <- iconv(out.htmlcontent[line.number, 1],"latin1","ASCII",sub="")
          pos.left.end <- regexpr("px",
                                  substr(out.htmlcontent[line.number, 1],
                                         pos.left.start,
                                         nchar(out.htmlcontent[line.number, 1])))[[1]] - 2 + pos.left.start
          left.value <- suppressWarnings(as.integer(substr(out.htmlcontent[line.number, 1], pos.left.start, pos.left.end)))
          if (is.na(left.value) && grepl(";vertical-align:baseline;",
                                         out.htmlcontent[line.number, 1])) {
            left.value <- out.htmlcontent[line.number-1, "left"]
          }

          out.htmlcontent[line.number, "left"] <- left.value

          ## get the top information
          pos.top.start <- regexpr("top:",
                                   out.htmlcontent[line.number, 1])[[1]] + 4
          out.htmlcontent[line.number, 1] <- iconv(out.htmlcontent[line.number, 1],"latin1","ASCII",sub="")
          pos.top.end <- regexpr("px",
                                 substr(out.htmlcontent[line.number, 1],
                                        pos.top.start,
                                        nchar(out.htmlcontent[line.number, 1])))[[1]] - 2 + pos.top.start
          top.value <- suppressWarnings(as.integer(substr(out.htmlcontent[line.number, 1], pos.top.start, pos.top.end)))
          if (is.na(top.value) && grepl(";vertical-align:baseline;",
                                        out.htmlcontent[line.number, 1])) {
            top.value <- out.htmlcontent[line.number-1, "top"]
          }

          out.htmlcontent[line.number, "top"] <- top.value

          ## get font_size information
          pos.fontsize.start <- regexpr("font-size:",
                                        out.htmlcontent[line.number, 1])[[1]] + 10
          out.htmlcontent[line.number, 1] <- iconv(out.htmlcontent[line.number, 1],"latin1","ASCII",sub="")
          pos.fontsize.end <- regexpr("px;",substr(out.htmlcontent[line.number, 1],
                                                   pos.fontsize.start,
                                                   nchar(out.htmlcontent[line.number, 1])))[[1]] + pos.fontsize.start - 2
          fontsize.value <- substr(out.htmlcontent[line.number, 1],
                                   pos.fontsize.start, pos.fontsize.end)
          fontsize.value <- as.numeric(fontsize.value)
          out.htmlcontent[line.number, "font_size"] <- fontsize.value
        } else {
          ## if the line does not have position info
          out.htmlcontent[line.number, "left"] <- 0
          out.htmlcontent[line.number, "top"] <- 9999
          out.htmlcontent[line.number, "font_size"] <- 1
        }
      }


      ## 5.2) Sort lines according to top -------------------
      ## only the lines with top value
      lines.with.top.value <- out.htmlcontent[!is.na(out.htmlcontent[, "top"]), ]
      txtlines.with.top.value <- out.txthtmlcontent[!is.na(out.htmlcontent[, "top"])]
      ## if it is only one line no sorting necessary
      if (length(which(!is.na(out.htmlcontent[, "top"]))) > 1) {
        htmlorder <- order(as.numeric(lines.with.top.value[,"top"]), as.numeric(lines.with.top.value[,"left"]))
        lines.with.top.value.sorted <- lines.with.top.value[htmlorder, ]
        txtlines.with.top.value.sorted <- txtlines.with.top.value[htmlorder]
      } else {
        lines.with.top.value.sorted <- lines.with.top.value
        txtlines.with.top.value.sorted <- txtlines.with.top.value
      }
      out.htmlcontent[!is.na(out.htmlcontent[, "top"]), ] <- lines.with.top.value.sorted
      out.txthtmlcontent[!is.na(out.htmlcontent[, "top"])] <- txtlines.with.top.value.sorted

      ## make all 0 and 9999 to NA
      no.pos.info.lines <- (out.htmlcontent[, "top"] == 9999)
      out.htmlcontent[no.pos.info.lines, "left"] <- NA
      out.htmlcontent[no.pos.info.lines, "top"] <- NA
      out.htmlcontent[no.pos.info.lines, "font_size"] <- NA

      htmlcontent[[p]][start:end, ] <- out.htmlcontent
      txthtmlcontent[[p]][start:end] <- out.txthtmlcontent
    }
  }

  ## 6) Extract Tables --------------------------------------------
  ## Explanation: The table detection is important to distinguish tables from text
  ## even if tables will not be exported they will not be a part of the sentence
  ## detection if search words.

  ## Use html to find table end line by adding 5 lines and then search for span id= change
  ## test if file has tables --> only process tables when table is present

  ## 6.1) Test if document has tables -------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE) {
    ## if there is no additional heading
    if (table.heading.words[1, "words"] == "") {
      tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txthtmlcontent[[j]], ignore.case = TRUE))
    } else {
      word.txtline.th <- NULL
      for (i in 1:nrow(table.heading.words)) {
        ## search for lines with searchword info
        ## [txtcontent] ##
        word <- table.heading.words[i, "words"]
        ignore.case.th <- table.heading.words[i,
                                              "ignore.case.th"]
        word.txtline.th <- c(word.txtline.th,
                             grep(word, txtcontent, ignore.case = ignore.case.th))
      }
      tablestart.pos <- c(word.txtline.th,
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txtcontent, ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                               txthtmlcontent[[j]], ignore.case = TRUE),
                          grep("^(\f|)(Table )[0-99|MDCLXVI]+( )",
                               txthtmlcontent[[j]], ignore.case = TRUE))
    }
  }

  ## 6.2) Detect the table start positions by detecting headings ----------------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0) {

    ## Initialize master table for positions
    htmltablelines <- data.frame(page = NULL,
                                 tableheading = NULL, tablestart.pos = NULL,
                                 tablelastline = NULL, tableend.pos = NULL,
                                 legendlastline = NULL,
                                 legendend.pos = NULL, txtfirstline = NULL)
    txttablelines <- data.frame(page = NULL,
                                tableheading = NULL, tablestart.pos = NULL,
                                tablelastline = NULL, tableend.pos = NULL,
                                legendlastline = NULL,
                                legendend.pos = NULL, txtfirstline = NULL)

    ## go through pages
    for (j in 1:length(txthtmlcontent)) {

      ## if there is no additional heading
      if (table.heading.words[1, "words"] == "") {
        html.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE),
                                 grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE))
        ## search for tables with line break in title
        lb.html.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+$",
                                         txthtmlcontent[[j]], ignore.case = TRUE),
                                    grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                         txthtmlcontent[[j]], ignore.case = TRUE))
        ## only look if the table is having also page
        if (length(splittxtcontent) >= j) {
          txt.tablestart.pos <- c(grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE),
                                  grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE))
          ## search for tables with line break in title
          lb.txt.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
                                          splittxtcontent[[j]], ignore.case = TRUE),
                                     grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                          splittxtcontent[[j]], ignore.case = TRUE))
        } else if (length(html.tablestart.pos) > 0) {
          if (nrow(htmltablelines) == 0) {
            htmltablelines <- data.frame(page = j,
                                         tableheading = NA, tablestart.pos = NA,
                                         tablelastline = NA, legendlastline = NA,
                                         legendend.pos = NA, txtfirstline = NA,
                                         detected.in = "htmlonly")
          } else {
            newrow <- htmltablelines[1, ]
            newrow <- NA
            newrow["page"] <- j
            newrow["detected.in"] <- "htmlonly"
            htmltablelines <- rbind(htmltablelines,
                                    newrow)
          }
        }
      } else  {
        ## if there is an additional heading
        txthtml.word.txtline.th <- NULL
        for (i in 1:nrow(table.heading.words)) {
          ## search for lines with searchword info
          ## [txtcontent] ##
          word <- table.heading.words[i, "words"]
          ignore.case.th <- table.heading.words[i, "ignore.case.th"]
          txthtml.word.txtline.th <- c(txthtml.word.txtline.th,
                                       grep(word, txthtmlcontent[[j]],
                                            ignore.case = ignore.case.th))
        }
        html.tablestart.pos <- c(txthtml.word.txtline.th,
                                 grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE),
                                 grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                      txthtmlcontent[[j]], ignore.case = TRUE))
        ## search for tables with line break in title
        lb.html.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
                                         txthtmlcontent[[j]], ignore.case = TRUE),
                                    grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                         txthtmlcontent[[j]], ignore.case = TRUE))
        ## only look if the table is having also page
        if (length(splittxtcontent) >= j) {
          splittxt.word.txtline.th <- NULL
          for (i in 1:nrow(table.heading.words)) {
            ## search for lines with searchword info
            ## [txtcontent] ##
            word <- table.heading.words[i, "words"]
            ignore.case.th <- table.heading.words[i, "ignore.case.th"]
            splittxt.word.txtline.th <- c(splittxt.word.txtline.th,
                                          grep(word, splittxtcontent[[j]],
                                               ignore.case = ignore.case.th))
          }
          txt.tablestart.pos <- c(splittxt.word.txtline.th,
                                  grep("^(\f|)(Table |Tab. )[0-99|MDCLXVI]+(\\.)(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE),
                                  grep("^(\f|)(Table )[0-99|MDCLXVI]+( )(*)",
                                       splittxtcontent[[j]], ignore.case = TRUE))
          ## search for tables with line break in title
          lb.txt.tablestart.pos <- c(grep("^(\f|)(Table )[0-99|MDCLXVI]+$",
                                          splittxtcontent[[j]], ignore.case = TRUE),
                                     grep("^(\f|)(Table )[0-99|MDCLXVI]+( )$",
                                          splittxtcontent[[j]], ignore.case = TRUE))
        } else if (length(html.tablestart.pos) > 0) {
          if (nrow(htmltablelines) == 0) {
            htmltablelines <- data.frame(page = j,
                                         tableheading = NA, tablestart.pos = NA,
                                         tablelastline = NA, legendlastline = NA,
                                         legendend.pos = NA, txtfirstline = NA,
                                         detected.in = "htmlonly")
          } else {
            newrow <- htmltablelines[1, ]
            newrow <- NA
            newrow["page"] <- j
            newrow["detected.in"] <- "htmlonly"
            htmltablelines <- rbind(htmltablelines,
                                    newrow)
          }
        }
      }  ## end if there is (no) additional heading

      ## Look at tables in htmlcontent
      if (length(html.tablestart.pos) > 0) {
        for (i in 1:length(html.tablestart.pos)) {
          ## Removing the Table wording
          line <- txthtmlcontent[[j]][html.tablestart.pos[i]]
          tableheading <- substr(line,
                                 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)

          currentstartlines <- html.tablestart.pos[i]
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0) {
            htmltablelines <- rbind(htmltablelines,
                                    data.frame(page = j,
                                               tableheading = tableheading,
                                               tablestart.pos = currentstartlines,
                                               tablelastline = NA,
                                               txtfirstline = NA,
                                               legendlastline = NA,
                                               legendend.pos = NA))
          }
        }
      }  ## end if length(html.tablestart.pos) > 0

      if (length(lb.html.tablestart.pos) > 0) {
        for (i in 1:length(lb.html.tablestart.pos)) {
          ## when +1 is empty
          s <- 1
          while (txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s] == "") {
            s <- s + 1
            if (s > 5 || (lb.html.tablestart.pos[i] + s) == length(txthtmlcontent[[j]])) break
          }
          if ((lb.html.tablestart.pos[i] + s) == length(txthtmlcontent[[j]])) next
          ## Removing the Table wording
          line <- paste0(txthtmlcontent[[j]][lb.html.tablestart.pos[i]]," ",
                         txthtmlcontent[[j]][lb.html.tablestart.pos[i] + s])
          tableheading <- substr(line,
                                 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)

          currentstartlines <- lb.html.tablestart.pos[i]
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0) {
            htmltablelines <- rbind(htmltablelines,
                                    data.frame(page = j,
                                               tableheading = tableheading,
                                               tablestart.pos = currentstartlines,
                                               tablelastline = NA,
                                               txtfirstline = NA,
                                               legendlastline = NA,
                                               legendend.pos = NA))
          }
        }
      }  ## end if there is a table on this page
      ## Look at tables in txtcontent

      if (length(txt.tablestart.pos) > 0) {
        for (i in 1:length(txt.tablestart.pos)) {
          ## Removing the Table wording
          line <- splittxtcontent[[j]][txt.tablestart.pos[i]]
          ## extract the first 50 characters
          tableheading <- substr(line,
                                 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          tableheading <- gsub("\\f", "", tableheading)
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}","\\*")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)

          ## from is where the heading was detected
          from <- txt.tablestart.pos[i]
          if (j > 1){
            for (p in 1:(j-1)){
              from <- from + length(splittxtcontent[[p]])
            }
          }

          ## to has to be the end of the page
          to <- 0
          if (j > 1){
            for (p in 1:j){
              to <- to + length(splittxtcontent[[p]])
            }
          }
          ##remove backreference from txtcontent
          txtcontent[from:to] <- remove_backref(txtcontent[from:to])

          ## heading
          currentstartlines <- grep(tableheading,
                                    txtcontent[from:to], fixed = TRUE)[1] + from - 1
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0) {
            txttablelines <- rbind(txttablelines,
                                   data.frame(page = j,
                                              tableheading = tableheading,
                                              tablestart.pos = currentstartlines,
                                              tablelastline = NA, tableend.pos = NA,
                                              txtfirstline = NA,
                                              legendlastline = NA,
                                              legendend.pos = NA))
          }
        }
      }  ## end if there is a table on this page

      if (length(lb.txt.tablestart.pos) > 0) {
        for (i in 1:length(lb.txt.tablestart.pos)) {
          ## when +1 is empty
          s <- 1
          while (splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s] == "") {
            s <- s + 1
            if (s > 5) break
          }
          ## Removing the Table wording
          line <- paste0(splittxtcontent[[j]][lb.txt.tablestart.pos[i]]," ",
                         splittxtcontent[[j]][lb.txt.tablestart.pos[i] + s])
          tableheading <- substr(line, 1, 50)
          ## replace the paranthesis so that they don't give
          ## problems for grep
          for (symbol in c("\\?","\\+","\\(", "\\)",
                           "\\[", "\\]", "\\/", "\\{",
                           "\\}","\\*")) {
            tableheading <- gsub(symbol,
                                 paste0("\\", symbol),
                                 tableheading)
          }
          ## Remove backreferences (\1) from tableheading
          tableheading <- remove_backref(tableheading)

          ## get the position of the line that has exactly the
          ## heading
          ## from is where the heading was detected
          from <- lb.txt.tablestart.pos[i]
          if (j > 1){
            for (p in 1:(j-1)){
              from <- from + length(splittxtcontent[[p]])
            }
          }

          ## to has to be the end of the page
          to <- 0
          if (j > 1){
            for (p in 1:j){
              to <- to + length(splittxtcontent[[p]])
            }
          }
          currentstartlines <- grep(splittxtcontent[[j]][lb.txt.tablestart.pos[i]],
                                    txtcontent[from:to][1]) + from - 1
          ## if the heading is on the site then add it to the
          ## table
          if (length(currentstartlines) != 0) {
            txttablelines <- rbind(txttablelines,
                                   data.frame(page = j,
                                              tableheading = tableheading,
                                              tablestart.pos = currentstartlines,
                                              tablelastline = NA, tableend.pos = NA,
                                              txtfirstline = NA,
                                              legendlastline = NA,
                                              legendend.pos = NA))
          }
        }
      }  ## end if there is a table on this page
    }  ## end go through each page j

    ## end if file has no tables (if function started
    ## then PDF file has to have search words)
  } else if (searchwords.go == TRUE && filterwords.go == TRUE &&
             integrity.indicator == TRUE && length(tablestart.pos) == 0) {
    outtable <- cbind(txtcontent, layout = "txt",
                      rownumber = 1:length(txtcontent))
    txtlines <- outtable[(outtable[, "layout"] == "txt"), ]
    txttablelines <- NULL
    htmltablelines <- NULL
  } else {
    outtable <- NULL
    txtlines <- NULL
    txttablelines <- NULL
    htmltablelines <- NULL
  }

  ## 6.3) Determine if tables were found in html, txt to both files ------------
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {
    ## sort the htmltablelines table according to page
    ## and then tablestart.pos
    htmltablelines <- htmltablelines[with(htmltablelines,
                                          order(page, tablestart.pos)), ]
    ## add information about where table was found
    if (!"detected.in" %in% colnames(htmltablelines))
      htmltablelines <- cbind(htmltablelines,
                              detected.in = NA)
    if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
      ## sort the txttablelines table according to page
      ## and then tablestart.pos
      txttablelines <- txttablelines[with(txttablelines,
                                          order(page, tablestart.pos)), ]
      if (!"detected.in" %in% colnames(txttablelines)) {
        txttablelines <- cbind(txttablelines,
                               detected.in = NA)
      }

      ## go through each line of the tables and compare
      ## which tables are in the txt only
      for (txtrow in 1:nrow(txttablelines)) {
        txttablelines[txtrow, ] <- find_similar_row(originrow = txttablelines[txtrow, ],
                                                    targettablelines = htmltablelines,
                                                    relative.match.col = "tableheading",
                                                    output.for.originrow.only = "txtonly",
                                                    output.for.match = "txtandhtml",
                                                    output.column.name = "detected.in")$originrow
        htmltablelines <- find_similar_row(originrow = txttablelines[txtrow, ],
                                           targettablelines = htmltablelines,
                                           relative.match.col = "tableheading",
                                           output.for.originrow.only = "txtonly",
                                           output.for.match = "txtandhtml",
                                           output.column.name = "detected.in")$targettablelines
      }
    }

    ## name all the rows that are only found in html
    if (nrow(htmltablelines[is.na(htmltablelines[, "detected.in"]), ]) > 0)
      htmltablelines[is.na(htmltablelines[, "detected.in"]), "detected.in"] = "htmlonly"
  }


  ## 6.4) Add positional value to tables -----
  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {

    ## add tableend.pos if column does not exist
    if (!"tableend.pos" %in% colnames(htmltablelines)){
      htmltablelines <- cbind(htmltablelines, tableend.pos = NA)
    }
    htmltablelines <-  htmltablelines[!(is.na(htmltablelines[,"page"])),]
    for (i in 1:nrow(htmltablelines)) {
      position <- as.numeric(htmltablelines[i, "tablestart.pos"])
      p <- as.numeric(htmltablelines[i, "page"])
      ## if end of page or the txt ended
      npos <- position + 7
      ## test if the table is too short for detection
      if (nrow(htmlcontent[[p]]) < npos){
        outofbound <- TRUE
      } else {
        outofbound <- FALSE
      }
      if (outofbound == FALSE) {
        if (is.null(htmlcontent[[p]][npos, "top"])){
          notop <- TRUE
        } else {
          if (is.na(htmlcontent[[p]][npos, "top"])){
            notop <- TRUE
          } else {
            notop <- FALSE
          }
        }
      }

      if ((outofbound == TRUE) || (notop == TRUE)) {
        ## end is at the end of the page
        htmltablelines[i, "tableend.pos"] <- nrow(htmlcontent[[p]])
        htmltablelines[i, "detected.in"] <- "txtonly.notabledetected"
        if (nrow(txttablelines) > 0 &&
            ncol(txttablelines) > 0) {
          ## go through each line of the tables and compare
          ## which tables are in the txt only
          txttablelines <-  txttablelines[!(is.na(txttablelines[,"page"])),]
          txttablelines <- find_similar_row(originrow = htmltablelines[i,],
                                            targettablelines = txttablelines,
                                            relative.match.col = "tableheading",
                                            output.for.originrow.only = "error",
                                            output.for.match = "txtonly.notabledetected",
                                            output.column.name = "detected.in")$targettablelines
        }
        ## else start with +5
      } else {
        ## end of the this is the start line +5
        htmltablelines[i, "tableend.pos"] <- htmltablelines[i, "tablestart.pos"] + 5
      }
    }  ## end go through each row
  }

  if (searchwords.go == TRUE && filterwords.go == TRUE &&
      integrity.indicator == TRUE && !length(tablestart.pos) == 0 &&
      nrow(as.data.frame((htmltablelines))) > 0 && ncol(as.data.frame((htmltablelines))) > 0) {
    ## determine if top is constant (when not start point +1)
    ## do for each table row
    nexti <- FALSE
    htmltablelines <-  htmltablelines[!(is.na(htmltablelines[,"page"])),]
    for (i in 1:nrow(htmltablelines)) {
      ## ignore the lines with txtonly
      if ((htmltablelines[i, "detected.in"] == "txtonly.notabledetected") ||
          (htmltablelines[i, "detected.in"] == "txtonly")) {
        if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
          ## go through each line of the tables and compare
          ## which tables are in the txt only
          txttablelines <-  txttablelines[!(is.na(txttablelines[,"page"])),]
          txttablelines <- find_similar_row(originrow = htmltablelines[i, ],
                                            targettablelines = txttablelines,
                                            relative.match.col = "tableheading",
                                            output.for.originrow.only = "error",
                                            output.for.match = "txtonly.notabledetected",
                                            output.column.name = "detected.in")$targettablelines
        }
        next
      }

      ## 6.5) Detect how many columns by evaluating how many different left values ---------------
      ## set the current line pos and page for the while loop
      currentline.pos <- htmltablelines[i, "tableend.pos"]
      currentline.page <- htmltablelines[i, "page"]
      ## initialize out.left.list
      out.left.list <- NA
      all.left.found <- FALSE
      while (all.left.found == FALSE) {
        constant.value <- NULL

        while (is.null(constant.value)) {
          ## currentline.pos is either set before the loop or at the end
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

          ## get the top information
          oritop.value <- paste0("top:", currentline["top"], "px")

          ## go to the next line
          currentline.pos <- currentline.pos + 1
          ## set the start for of the next table
          if (!is.na(htmltablelines[i + 1, "tablestart.pos"]) &&
              htmltablelines[i + 1, "page"] == htmltablelines[i, "page"])
            nextstartpos <- htmltablelines[i + 1, "tablestart.pos"] else nextstartpos <- 999999
          ## test if end of page is reached
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

          ## get the top information
          top.value <- paste0("top:",
                              currentline["top"], "px")

          ## if beyond the end of the html page
          if ((currentline.pos > nrow(htmlcontent[[currentline.page]])) ||
              (top.value == "top:NApx")) {
            # htmltablelines[i, "detected.in"] <- "txtonly.notabledetected"
            if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
              # ## go through each line of the tables and compare
              # ## which tables are in the txt only
              # txttablelines <- txttablelines[!(is.na(txttablelines[,"page"])),]
              # txttablelines <- find_similar_row(originrow = htmltablelines[i, ],
              #                                   targettablelines = txttablelines,
              #                                   relative.match.col = "tableheading",
              #                                   output.for.originrow.only = "error",
              #                                   output.for.match = "txtonly.notabledetected",
              #                                   output.column.name = "detected.in")$targettablelines
              htmltablelines[i, "tableend.pos"] <- currentline.pos - 1
              htmltablelines[i, "legendend.pos"] <- currentline.pos - 1
              ## fill the txtcontent columns
              htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
                                                                                                                 "tableend.pos"])]
              htmltablelines[i, "legendlastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
                                                                                                                  "legendend.pos"])]
              htmltablelines[i, "txtfirstline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
                                                                                                                "legendend.pos"]) + 1]

            }
            currentline.pos <- currentline.pos - 1
            all.left.found <- TRUE
            constant.value <- top.value
            # nexti <- TRUE
            break
            ## if tables overlap
          } else if (currentline.pos >= nextstartpos) {
            htmltablelines[i, "tableend.pos"] <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
            htmltablelines[i, "legendend.pos"] <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1
            ## fill the txtcontent columns
            htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
                                                                                                               "tableend.pos"])]
            htmltablelines[i, "legendlastline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
                                                                                                                "legendend.pos"])]
            htmltablelines[i, "txtfirstline"] <- txthtmlcontent[[currentline.page]][as.numeric(htmltablelines[i,
                                                                                                              "legendend.pos"]) + 1]
            currentline.pos <- as.numeric(htmltablelines[i + 1, "tablestart.pos"]) - 1

            if (nrow(txttablelines) > 0 && ncol(txttablelines) > 0) {
              txttablelines[i, "tableend.pos"] <- as.numeric(txttablelines[i + 1, "tablestart.pos"]) - 1
              txttablelines[i, "legendend.pos"] <- as.numeric(txttablelines[i + 1, "tablestart.pos"]) - 1
              txttablelines[i, "tablelastline"] <-  txtcontent[as.numeric(txttablelines[i, "tableend.pos"])]
              txttablelines[i, "legendlastline"] <- txtcontent[as.numeric(txttablelines[i, "legendend.pos"])]
              txttablelines[i, "txtfirstline"] <-  txtcontent[as.numeric(txttablelines[i, "legendend.pos"])+1]
            }
            nexti <- TRUE
            break
          }  ## end if

          if (oritop.value == top.value) {
            ## the constant value is the top
            constant.value <- top.value
          } else {
            ## use this currentline.pos as new start
            htmltablelines[i, "tableend.pos"] <- currentline.pos
          }

        }  ## end while is.null(constant.value)

        if (nexti == TRUE) break
        ## determine the left range ##

        ## go to min
        ind <- TRUE
        while (ind == TRUE) {
          currentline.pos <- currentline.pos - 1
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
          ind <- grepl(constant.value, currentline[1])
        }
        currentline.pos <- currentline.pos + 1
        currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

        left.list <- NULL
        ind <- TRUE
        while (ind == TRUE) {
          # get the left position
          left.value <- paste0("left:", currentline["left"], "px")
          left.list <- c(left.list,  left.value)
          currentline.pos <- currentline.pos + 1
          currentline <- htmlcontent[[currentline.page]][currentline.pos, ]
          ind <- grepl(constant.value,
                       currentline[1])
        }

        ## test if left.list (column list) is complete ##
        if (all(left.list %in% out.left.list)) {
          ## if the left.list if complete, end the loop
          all.left.found <- TRUE
          break
        } else {
          ## if left.list is incomplete, add the current
          ## left.list
          out.left.list <- c(out.left.list[!is.na(out.left.list)],
                             left.list)
          out.left.list <- unique(out.left.list)
        }
      }  ## end of searching for all left values

      ## if only txt table to tables beside another table
      if (nexti == TRUE) {
        nexti <- FALSE
        next
      }

      ## 6.6) Determine the end of the table through the highest top value ---------------
      ## search for the max with all the left values ##

      ## if two tables are on the same page restrict lines
      if (!is.na(htmltablelines[i + 1, "tablestart.pos"]) &&
          htmltablelines[i + 1, "page"] == htmltablelines[i, "page"]){
        nexttablestartpos <- htmltablelines[i + 1, "tablestart.pos"]
      } else {
        nexttablestartpos <- nrow(htmlcontent[[currentline.page]])
      }

      ## make a toplist
      top.list <- NULL
      out.left.list <- out.left.list[out.left.list != "left:NApx"]

      ## if out.left.list is empty go to the next table
      if (length(out.left.list) == 0) {
        nexti <- FALSE
        next
      }
      for (left.item in out.left.list) {
        max.line <- max(grep(left.item,
                             htmlcontent[[currentline.page]][1:nexttablestartpos, 1]))
        currentline <- htmlcontent[[currentline.page]][max.line, ]
        top.value <- currentline["top"]
        top.list <- c(top.list, top.value)
      }
      ## choose the max top count that is at least double
      top.value.found <- FALSE
      unq.top.list <- unique(top.list)
      match.list <- NULL
      for (ti in 1:length(unq.top.list)) {
        curr.top.value <- unq.top.list[ti]
        match.list[ti] <- 0
        for (li in 1:length(out.left.list)) {
          match.one <- intersect(grep(out.left.list[li],
                                      htmlcontent[[currentline.page]][1:nexttablestartpos, 1]),
                                 grep(paste0("top:",curr.top.value, "px;"),
                                      htmlcontent[[currentline.page]][1:nexttablestartpos, 1]))
          ## when there was a match and the left.item is not the last in the list
          if (length(match.one) > 0){
            match.list[ti] <- match.list[ti] + 1
          }
        }
      }
      max.top.value <- suppressWarnings(max(strtoi(unq.top.list[match.list > 1])))

      ## if not one value is duplicated then go with max
      if (!any(match.list > 1))max.top.value <- max(strtoi(top.list))

      ## if not one value is duplicated then go with max
      htmltablelines[i, "tableend.pos"] <- max(grep(paste0("top:",
                                                           max.top.value, "px;"),
                                                    htmlcontent[[currentline.page]][,1]))
      currentline.pos <- htmltablelines[i, "tableend.pos"]
      currentline <- htmlcontent[[currentline.page]][currentline.pos, ]

      ## add the last line to the htmltablelines ## add
      ## the last line to the htmltablelines
      htmltablelines[i, "tablelastline"] <- txthtmlcontent[[currentline.page]][currentline.pos]
      ## save the end of the table
      htmltablelines[i, "tableend.pos"] <- currentline.pos

      ## add everything below the table ##

      ## 6.7) Detect the legend by extracting all lines with a lower font that than the table -----
      ## get the current font size information ##
      pos.fontsize.start <- regexpr("font-size:",
                                    currentl