# R/obj_LargeDataSetForTexts.R

# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>

#' @title Abstract class for large data sets containing raw texts
#' @description This object stores raw texts. The data of these objects is not stored in memory directly. By using
#'   memory mapping, these objects allow working with data sets that do not fit into memory/RAM.
#'
#' @return Returns a new object of this class.
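#' @details A minimal usage sketch is shown in the examples; it assumes that
#'   the package's Python backend has already been set up.
#' @examples
#' \dontrun{
#' # Create a data set directly from an in-memory data.frame. Only the
#' # columns "id" and "text" are mandatory; missing metadata columns are
#' # filled with NA automatically.
#' texts <- data.frame(
#'   id = c("doc_1", "doc_2"),
#'   text = c("First raw text.", "Second raw text.")
#' )
#' large_ds <- LargeDataSetForText$new(init_data = texts)
#' }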
#' @export
#' @family Data Management
LargeDataSetForText <- R6::R6Class(
  classname = "LargeDataSetForText",
  inherit = LargeDataSetBase,
  public = list(
    #--------------------------------------------------------------------------
    #' @description Method for creating a new [LargeDataSetForText] instance. If `init_data` is a `data.frame`, the
    #'   instance is initialized with that data via the `add_from_data.frame()` method.
    #' @param init_data Initial `data.frame` for the data set.
    #' @return A new instance of this class, initialized with `init_data` if provided.
    initialize = function(init_data = NULL) {
      if (is.data.frame(init_data)) {
        self$add_from_data.frame(init_data)
      }
    },
    #--------------------------------------------------------------------------
    #' @description Method for adding raw texts saved within .txt files to the data set. Please note that the directory
    #'   should contain one folder for each .txt file. In order to create an informative data set, every folder can
    #'   contain the following additional files:
    #'
    #' * bib_entry.txt: containing a text version of the bibliographic information of the raw text.
    #' * license.txt: containing a statement about the license to use the raw text such as "CC BY".
    #' * url_license.txt: containing the url/link to the license in the internet.
    #' * text_license.txt: containing the license in raw text.
    #' * url_source.txt: containing the url/link to the source in the internet.
    #'
    #'   The id of every .txt file is the file name without the file extension. Please make sure that the file names
    #'   are unique. Id and raw texts are mandatory; bibliographic and license information are optional.
    #' @param dir_path Path to the directory where the files are stored.
    #' @param batch_size `int` determining the number of files to process at once.
    #' @param log_file `string` Path to the file where the log should be saved. If no logging is desired set this
    #'   argument to `NULL`.
    #' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
    #'   the log files. Only relevant if `log_file` is not `NULL`.
    #' @param log_top_value `int` indicating the current iteration of the process.
    #' @param log_top_total `int` determining the maximal number of iterations.
    #' @param log_top_message `string` providing additional information of the process.
    #' @param trace `bool` If `TRUE` information on the progress is printed to the console.
    #' @param clean_text `bool` If `TRUE`, the text is modified to improve the quality of the following analysis:
    #'  * Some special symbols are removed.
    #'  * All spaces at the beginning and the end of a row are removed.
    #'  * Multiple spaces are reduced to a single space.
    #'  * All rows with a number from 1 to 999 at the beginning or at the end are removed (headers and footers).
    #'  * Tables of contents are removed.
    #'  * Hyphenation is undone.
    #'  * Line breaks within a paragraph are removed.
    #'  * Multiple line breaks are reduced to a single line break.
    #' @return The method does not return anything. It adds new raw texts to the data set.
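    #' @examples
    #' \dontrun{
    #' # Hedged sketch of the expected directory layout and a typical call;
    #' # all folder and file names below are illustrative only.
    #' # corpus/
    #' #   doc_1/
    #' #     doc_1.txt     (raw text; the id becomes "doc_1")
    #' #     bib_entry.txt (optional)
    #' #     license.txt   (optional)
    #' #   doc_2/
    #' #     doc_2.txt
    #' ds <- LargeDataSetForText$new()
    #' ds$add_from_files_txt(dir_path = "corpus", batch_size = 100L)
    #' }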
    add_from_files_txt = function(dir_path,
                                  batch_size = 500L,
                                  log_file = NULL,
                                  log_write_interval = 2L,
                                  log_top_value = 0L,
                                  log_top_total = 1L,
                                  log_top_message = NA,
                                  clean_text = TRUE,
                                  trace = TRUE) {
      # Gather all text files
      file_paths <- private$get_file_paths(dir_path, ".txt")

      if (length(file_paths) > 0L) {
        # calculate number of batches
        n_batches <- ceiling(length(file_paths) / batch_size)

        # get indices for every batch
        batches <- get_batches_index(
          number_rows = length(file_paths),
          batch_size = batch_size
        )

        # Process every batch
        list_datasets <- list()
        last_log <- NULL
        for (i in 1L:n_batches) {
          chunk <- private$get_batch(batches[[i]],
            file_paths = file_paths,
            clean_text = clean_text
          )
          chunk_dataset <- data.frame_to_py_dataset(chunk)
          list_datasets[[i]] <- chunk_dataset
          if (trace) {
            message(
              get_time_stamp(),
              " Batch ", i, " from ", n_batches, " processed"
            )
          }
          last_log <- write_log(
            log_file = log_file,
            last_log = last_log,
            write_interval = log_write_interval,
            value_top = log_top_value,
            value_middle = i,
            value_bottom = 0L,
            total_top = log_top_total,
            total_middle = n_batches,
            total_bottom = 1L,
            message_top = log_top_message,
            message_middle = ".txt files",
            message_bottom = NA
          )
        }

        # concatenate datasets
        new_dataset <- datasets$concatenate_datasets(dsets = list_datasets, axis = 0L)

        # Add new dataset
        private$add(new_dataset)
      }
    },

    #--------------------------------------------------------------------------
    #' @description Method for adding raw texts saved within .pdf files to the data set. Please note that the directory
    #'   should contain one folder for each .pdf file. In order to create an informative data set, every folder can
    #'   contain the following additional files:
    #'
    #' * bib_entry.txt: containing a text version of the bibliographic information
    #'   of the raw text.
    #' * license.txt: containing a statement about the license to use the raw text
    #'   such as "CC BY".
    #' * url_license.txt: containing the url/link to the license in the internet.
    #' * text_license.txt: containing the license in raw text.
    #' * url_source.txt: containing the url/link to the source in the internet.
    #'
    #'   The id of every .pdf file is the file name without the file extension. Please make sure that the file names
    #'   are unique. Id and raw texts are mandatory; bibliographic and license information are optional.
    #' @param dir_path Path to the directory where the files are stored.
    #' @param batch_size `int` determining the number of files to process at once.
    #' @param trace `bool` If `TRUE` information on the progress is printed to the console.
    #' @param clean_text `bool` If `TRUE`, the text is modified to improve the quality of the following analysis:
    #'  * Some special symbols are removed.
    #'  * All spaces at the beginning and the end of a row are removed.
    #'  * Multiple spaces are reduced to a single space.
    #'  * All rows with a number from 1 to 999 at the beginning or at the end are removed (headers and footers).
    #'  * Tables of contents are removed.
    #'  * Hyphenation is undone.
    #'  * Line breaks within a paragraph are removed.
    #'  * Multiple line breaks are reduced to a single line break.
    #' @param log_top_value `int` indicating the current iteration of the process.
    #' @param log_top_total `int` determining the maximal number of iterations.
    #' @param log_top_message `string` providing additional information of the process.
    #' @param log_file `string` Path to the file where the log should be saved. If no logging is desired set this
    #'   argument to `NULL`.
    #' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
    #'   the log files. Only relevant if `log_file` is not `NULL`.
    #' @return The method does not return anything. It adds new raw texts to the data set.
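    #' @examples
    #' \dontrun{
    #' # Hedged usage sketch, analogous to add_from_files_txt(); the directory
    #' # name is illustrative only.
    #' ds <- LargeDataSetForText$new()
    #' ds$add_from_files_pdf(dir_path = "pdf_corpus", batch_size = 100L)
    #' }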
    add_from_files_pdf = function(dir_path,
                                  batch_size = 500L,
                                  log_file = NULL,
                                  log_write_interval = 2L,
                                  log_top_value = 0L,
                                  log_top_total = 1L,
                                  log_top_message = NA,
                                  clean_text = TRUE,
                                  trace = TRUE) {
      # Gather all files
      file_paths <- private$get_file_paths(dir_path, ".pdf")
      if (length(file_paths) > 0L) {
        # calculate number of batches
        n_batches <- ceiling(length(file_paths) / batch_size)

        # get indices for every batch
        batches <- get_batches_index(
          number_rows = length(file_paths),
          batch_size = batch_size
        )

        # Process every batch
        list_datasets <- list()
        last_log <- NULL

        for (i in 1L:n_batches) {
          chunk <- private$get_batch(batches[[i]],
            file_paths = file_paths,
            clean_text = clean_text
          )
          chunk_dataset <- data.frame_to_py_dataset(chunk)
          list_datasets[[i]] <- chunk_dataset
          if (trace) {
            message(
              get_time_stamp(),
              " Batch ", i, " from ", n_batches, " processed"
            )
          }
          last_log <- write_log(
            log_file = log_file,
            last_log = last_log,
            write_interval = log_write_interval,
            value_top = log_top_value,
            value_middle = i,
            value_bottom = 0L,
            total_top = log_top_total,
            total_middle = n_batches,
            total_bottom = 1L,
            message_top = log_top_message,
            message_middle = ".pdf files",
            message_bottom = NA
          )
        }

        # concatenate datasets
        new_dataset <- datasets$concatenate_datasets(dsets = list_datasets, axis = 0L)

        # Add new dataset
        private$add(new_dataset)
      }
    },

    #--------------------------------------------------------------------------
    #' @description Method for adding raw texts saved within .xlsx files to the data set. The method assumes that the
    #'   texts are stored in the rows and that the columns hold the id and the raw texts. In addition, columns for the
    #'   bibliographic information and the license can be added. The names of these columns must be specified with the
    #'   following arguments and must be the same for all .xlsx files in the chosen directory. Id and raw texts are
    #'   mandatory; bibliographic information, license, license's url, license's text, and source's url are optional.
    #'   Additional columns are dropped.
    #' @param dir_path Path to the directory where the files are stored.
    #' @param id_column `string` Name of the column storing the ids for the texts.
    #' @param text_column `string` Name of the column storing the raw text.
    #' @param bib_entry_column `string` Name of the column storing the bibliographic information of the texts.
    #' @param license_column `string` Name of the column storing information about the licenses.
    #' @param url_license_column `string` Name of the column storing information about the url to the license in the
    #'   internet.
    #' @param text_license_column `string` Name of the column storing the license as text.
    #' @param url_source_column `string` Name of the column storing information about the url to the source in the
    #'   internet.
    #' @param trace `bool` If `TRUE` prints information on the progress to the console.
    #' @param log_top_value `int` indicating the current iteration of the process.
    #' @param log_top_total `int` determining the maximal number of iterations.
    #' @param log_top_message `string` providing additional information of the process.
    #' @param log_file `string` Path to the file where the log should be saved. If no logging is desired set this
    #'   argument to `NULL`.
    #' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
    #'   the log files. Only relevant if `log_file` is not `NULL`.
    #' @return The method does not return anything. It adds new raw texts to the data set.
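    #' @examples
    #' \dontrun{
    #' # Hedged sketch of a typical call; it assumes .xlsx files whose id and
    #' # text columns are named "doc_id" and "content" (illustrative names).
    #' ds <- LargeDataSetForText$new()
    #' ds$add_from_files_xlsx(
    #'   dir_path = "xlsx_corpus",
    #'   id_column = "doc_id",
    #'   text_column = "content"
    #' )
    #' }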
    add_from_files_xlsx = function(dir_path,
                                   trace = TRUE,
                                   id_column = "id",
                                   text_column = "text",
                                   bib_entry_column = "bib_entry",
                                   license_column = "license",
                                   url_license_column = "url_license",
                                   text_license_column = "text_license",
                                   url_source_column = "url_source",
                                   log_file = NULL,
                                   log_write_interval = 2L,
                                   log_top_value = 0L,
                                   log_top_total = 1L,
                                   log_top_message = NA) {
      # Check
      check_type(object = id_column, type = "string", FALSE)
      check_type(object = text_column, type = "string", FALSE)
      check_type(object = bib_entry_column, type = "string", TRUE)
      check_type(object = license_column, type = "string", TRUE)
      check_type(object = url_license_column, type = "string", TRUE)
      check_type(object = text_license_column, type = "string", TRUE)
      check_type(object = url_source_column, type = "string", TRUE)
      check_type(object = trace, type = "bool", FALSE)
      check_type(object = dir_path, type = "string", FALSE)

      # Gather all files
      file_paths <- private$get_file_paths(dir_path, file_type = ".xlsx")
      n_batches <- length(file_paths)

      # Process every batch
      list_datasets <- list()
      last_log <- NULL
      for (i in seq_len(n_batches)) {
        chunk <- readtext::readtext(
          file = file_paths[i],
          docid_field = id_column,
          text_field = text_column,
          docvarsfrom = "metadata"
        )
        if (nrow(chunk) < 2L) {
          chunk <- readtext::readtext(
            file = file_paths[i],
            docid_field = NULL,
            text_field = text_column,
            docvarsfrom = "metadata"
          )
        } else {
          # Set correct name of id column
          index <- which(colnames(chunk) %in% "doc_id")
          colnames(chunk)[index] <- "id"
        }

        # Bib_entry column
        index <- which(colnames(chunk) %in% bib_entry_column)
        if (length(index) == 0L) {
          chunk$bib_entry <- rep(NA_character_, nrow(chunk))
        } else {
          colnames(chunk)[index] <- "bib_entry"
        }

        # License column
        index <- which(colnames(chunk) %in% license_column)
        if (length(index) == 0L) {
          chunk$license <- rep(NA_character_, nrow(chunk))
        } else {
          colnames(chunk)[index] <- "license"
        }

        # URL License column
        index <- which(colnames(chunk) %in% url_license_column)
        if (length(index) == 0L) {
          chunk$url_license <- rep(NA_character_, nrow(chunk))
        } else {
          colnames(chunk)[index] <- "url_license"
        }

        # Text License column
        index <- which(colnames(chunk) %in% text_license_column)
        if (length(index) == 0L) {
          chunk$text_license <- rep(NA_character_, nrow(chunk))
        } else {
          colnames(chunk)[index] <- "text_license"
        }

        # URL source column
        index <- which(colnames(chunk) %in% url_source_column)
        if (length(index) == 0L) {
          chunk$url_source <- rep(NA_character_, nrow(chunk))
        } else {
          colnames(chunk)[index] <- "url_source"
        }

        # Select only the necessary columns
        chunk <- chunk[c(
          "id",
          "text",
          "bib_entry",
          "license",
          "url_license",
          "text_license",
          "url_source"
        )]

        chunk_dataset <- data.frame_to_py_dataset(chunk)

        list_datasets[[i]] <- chunk_dataset
        if (trace) {
          message(
            get_time_stamp(),
            " Batch ", i, " from ", n_batches, " processed"
          )
        }
        last_log <- write_log(
          log_file = log_file,
          last_log = last_log,
          write_interval = log_write_interval,
          value_top = log_top_value,
          value_middle = i,
          value_bottom = 0L,
          total_top = log_top_total,
          total_middle = n_batches,
          total_bottom = 1L,
          message_top = log_top_message,
          message_middle = ".xlsx files",
          message_bottom = NA
        )
      }

      if (n_batches > 0L) {
        # concatenate datasets
        new_dataset <- datasets$concatenate_datasets(dsets = list_datasets, axis = 0L)

        # Add new dataset
        private$add(new_dataset)
      }
    },

    #--------------------------------------------------------------------------
    #' @description Method for adding raw texts from a `data.frame`
    #' @param data_frame Object of class `data.frame` with at least the columns "id" and "text". If "id" and/or
    #'   "text" is missing, an error occurs. The columns "bib_entry", "license", "url_license", "text_license", and
    #'   "url_source" are added with empty values (`NA`) if they are not present in the `data.frame`. Additional
    #'   columns are dropped.
    #' @return The method does not return anything. It adds new raw texts to the data set.
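    #' @examples
    #' \dontrun{
    #' # Minimal sketch of the expected input; only "id" and "text" are
    #' # mandatory, the "license" column shown here is optional.
    #' ds <- LargeDataSetForText$new()
    #' ds$add_from_data.frame(
    #'   data.frame(
    #'     id = c("a", "b"),
    #'     text = c("Text of a.", "Text of b."),
    #'     license = c("CC BY", "CC BY-SA")
    #'   )
    #' )
    #' }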
    add_from_data.frame = function(data_frame) {
      # Necessary columns
      if (!is.data.frame(data_frame)) {
        stop("Input must be of type data.frame")
      }
      if (!"id" %in% colnames(data_frame)) {
        stop("data.frame must contain a column id.")
      }
      if (!"text" %in% colnames(data_frame)) {
        stop("data.frame must contain a column text.")
      }

      data_to_add <- data_frame

      # optional columns
      if (!"bib_entry" %in% colnames(data_to_add)) {
        data_to_add["bib_entry"] <- rep(x = NA, times = nrow(data_to_add))
      }
      if (!"license" %in% colnames(data_to_add)) {
        data_to_add["license"] <- rep(x = NA, times = nrow(data_to_add))
      }
      if (!"url_license" %in% colnames(data_to_add)) {
        data_to_add["url_license"] <- rep(x = NA, times = nrow(data_to_add))
      }
      if (!"text_license" %in% colnames(data_to_add)) {
        data_to_add["text_license"] <- rep(x = NA, times = nrow(data_to_add))
      }
      if (!"url_source" %in% colnames(data_to_add)) {
        data_to_add["url_source"] <- rep(x = NA, times = nrow(data_to_add))
      }

      # Transform to a python dataset
      new_dataset <- data.frame_to_py_dataset(data_to_add[c(
        "id",
        "text",
        "bib_entry",
        "license",
        "url_license",
        "text_license",
        "url_source"
      )])

      # Add new dataset
      private$add(new_dataset)
    },
    #--------------------------------------------------------------------------
    #' @description Method for requesting all private fields and methods. Used for loading and updating an object.
    #' @return Returns a `list` with all private fields and methods.
    get_private = function() {
      return(private)
    }
  ),
  private = list(
    get_file_paths = function(dir_path, file_type) {
      file_paths <- list.files(
        path = dir_path,
        include.dirs = FALSE,
        all.files = TRUE,
        full.names = TRUE,
        recursive = TRUE,
        pattern = paste0("*", file_type)
      )
      if (length(file_paths) > 0L) {
        file_paths <- private$clean_path(file_paths)
      }
      return(file_paths)
    },
    clean_path = function(paths) {
      new_paths <- rep(NA_character_, length(paths))
      for (i in seq_len(length(paths))) {
        path <- paths[i]
        bib_entry_path <- file.path(
          dirname(path), "bib_entry.txt"
        )
        licence_path <- file.path(
          dirname(path), "license.txt"
        )
        license_text_path <- file.path(
          dirname(path), "text_license.txt"
        )
        url_licence_path <- file.path(
          dirname(path), "url_license.txt"
        )
        url_source_path <- file.path(
          dirname(path), "url_source.txt"
        )
        path_list <- c(bib_entry_path, licence_path, license_text_path, url_licence_path, url_source_path)
        if (!path %in% path_list) {
          new_paths[i] <- path
        }
      }
      return(na.omit(new_paths))
    },
    get_batch = function(batch, file_paths, clean_text = TRUE) {
      col_names <- c(
        "id",
        "text",
        "bib_entry",
        "license",
        "url_license",
        "text_license",
        "url_source"
      )
      tmp_data <- matrix(
        data = NA,
        nrow = length(batch),
        ncol = length(col_names)
      )
      colnames(tmp_data) <- col_names

      for (i in seq_len(length(batch))) {
        index <- batch[i]
        document <- readtext::readtext(file = file_paths[index])

        # ID
        tmp_data[i, 1L] <- private$remove_file_extension(document$doc_id)

        # Text
        if (clean_text) {
          tmp_text <- private$clean_text(document$text)
        } else {
          tmp_text <- document$text
        }
        tmp_data[i, 2L] <- tmp_text

        # Bib_entry
        file_path <- file.path(dirname(file_paths[index]), "bib_entry.txt")
        if (file.exists(file_path)) {
          tmp_data[i, 3L] <- paste(readLines(con = file_path, warn = FALSE), collapse = "\n")
        } else {
          tmp_data[i, 3L] <- NA
        }

        # License
        file_path <- file.path(dirname(file_paths[index]), "license.txt")
        if (file.exists(file_path)) {
          tmp_data[i, 4L] <- paste(readLines(con = file_path, warn = FALSE), collapse = "\n")
        } else {
          tmp_data[i, 4L] <- NA
        }

        # URL License
        file_path <- file.path(dirname(file_paths[index]), "url_license.txt")
        if (file.exists(file_path)) {
          tmp_data[i, 5L] <- paste(readLines(con = file_path, warn = FALSE), collapse = "\n")
        } else {
          tmp_data[i, 5L] <- NA
        }

        # Text License
        file_path <- file.path(dirname(file_paths[index]), "text_license.txt")
        if (file.exists(file_path)) {
          tmp_data[i, 6L] <- paste(readLines(con = file_path, warn = FALSE), collapse = "\n")
        } else {
          tmp_data[i, 6L] <- NA
        }

        # URL Source
        file_path <- file.path(dirname(file_paths[index]), "url_source.txt")
        if (file.exists(file_path)) {
          tmp_data[i, 7L] <- paste(readLines(con = file_path, warn = FALSE), collapse = "\n")
        } else {
          tmp_data[i, 7L] <- NA
        }
      }
      return(as.data.frame(tmp_data))
    },
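    # Note: clean_text applies a fixed sequence of regex-based normalizations.
    # Illustrative (hypothetical) example of the intended overall effect:
    #   "A hyphen-\nated word\n\n\nNext paragraph" -> "A hyphenated word\nNext paragraph"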
    clean_text = function(text) {
      # Remove some special symbols-------------------------------------------------
      text <- stringi::stri_replace_all(text, regex = "\\|", replacement = "")
      text <- stringi::stri_replace_all(text, regex = "([:blank:]*)-([:blank:]*)", replacement = "-")

      # Normalization of blank positions and new lines---------------------------------------------
      # Remove all blanks at the beginning of a line
      text <- stringi::stri_replace_all(text, regex = "\\n([:blank:]{1,})", replacement = "\n")

      # Remove all blanks at the end of a line
      text <- stringi::stri_replace_all(text, regex = "([:blank:]{1,})\\n", replacement = "\n")

      # Reduce multiple consecutive blanks to a single blank
      text <- stringi::stri_replace_all(text, regex = "[:blank:]{2,}", replacement = " ")

      # Remove running heads---------------------------------------------------------
      # Remove all lines that begin with a number up to 999 or with an outline number (e.g., 1.2)
      text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:blank:])*\\n", replacement = "\n")

      # Remove all lines that end with a number up to 999
      text <- stringi::stri_replace_all(text, regex = "\\n[:alpha:]([:digit:]|[:alpha:]|[:punct:]|[:blank:])*([:digit:]{1,10})\\n", replacement = "\n")

      # Remove tables of contents----------------------------------------------------
      # The pattern is applied three times so that consecutive entries are also
      # removed: each match consumes the line break it shares with the next entry.
      text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:digit:]|[:blank:])*(([:punct:]|[:blank:]){2,})([:digit:]{1,10})\\n", replacement = "\n")
      text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:digit:]|[:blank:])*(([:punct:]|[:blank:]){2,})([:digit:]{1,10})\\n", replacement = "\n")
      text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:digit:]|[:blank:])*(([:punct:]|[:blank:]){2,})([:digit:]{1,10})\\n", replacement = "\n")
      # text <- stringi::stri_replace_all(text, regex = "\\n(([:graph:]|[:blank]){1,})([:punct:]{5,})([:graph:]|[:blank:])*\\n", replacement = "\n")

      # Normalization of paragraphs-------------------------------------------------
      # Undo hyphenation at line breaks
      text <- stringi::stri_replace_all(text, regex = "-\\n([:space:]*)", replacement = "")

      # Remove line breaks within paragraphs
      text <- stringi::stri_replace_all(text, regex = "(?<![:space:])\\n(?![:space:])", replacement = " ")

      # Final text cleanup----------------------------------------------------------
      # Reduce multiple consecutive line breaks to a single line break
      # text <- stringi::stri_replace_all(text, regex = "\\n{2,}", replacement = "")
      text <- stringi::stri_replace_all(text, regex = "\\n([:space:]*)\\n", replacement = "\n")

      return(text)
    },
    remove_file_extension = function(file) {
      # tmp_string <- stringr::str_split_fixed(file, pattern = "\\.", n = Inf)
      tmp_string <- stringi::stri_split_fixed(file, pattern = ".", n = -1L, simplify = TRUE)
      return(paste0(tmp_string[1L, 1L:(ncol(tmp_string) - 1L)], collapse = "."))
    }
  )
)


# Register the class in the index of available data set classes
DataSetsIndex$LargeDataSetForText <- "LargeDataSetForText"
