# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>
#' @title Abstract class for large data sets containing raw texts
#' @description This object stores raw texts. The data of these objects is not held in memory directly. By using
#' memory mapping, these objects allow working with data sets that do not fit into memory/RAM.
#'
#' @return Returns a new object of this class.
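#' @examples
#' # A minimal sketch: create an empty data set; raw texts can be added later
#' # from .txt, .pdf, or .xlsx files or from a data.frame (see the method
#' # examples below). Assumes the python backend is set up.
#' \dontrun{
#' data_set <- LargeDataSetForText$new()
#' }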
#' @export
#' @family Data Management
LargeDataSetForText <- R6::R6Class(
classname = "LargeDataSetForText",
inherit = LargeDataSetBase,
public = list(
#--------------------------------------------------------------------------
#' @description Method for creating a new [LargeDataSetForText] instance. If `init_data` is a `data.frame`, the
#' instance is initialized with that data via the `add_from_data.frame()` method.
#' @param init_data Initial `data.frame` for dataset.
#' @return A new instance of this class initialized with `init_data` if passed.
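#' @examples
#' # A minimal sketch: initializing directly from a data.frame with the two
#' # mandatory columns "id" and "text". Assumes the python backend is set up.
#' \dontrun{
#' data_set <- LargeDataSetForText$new(
#'   init_data = data.frame(id = "doc_1", text = "Some raw text.")
#' )
#' }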
initialize = function(init_data = NULL) {
if (is.data.frame(init_data)) {
self$add_from_data.frame(init_data)
}
},
#--------------------------------------------------------------------------
#' @description Method for adding raw texts saved within .txt files to the data set. Please note that the directory
#' should contain one folder for each .txt file. In order to create an informative data set, every folder can
#' contain the following additional files:
#'
#' * bib_entry.txt: containing a text version of the bibliographic information of the raw text.
#' * license.txt: containing a statement about the license of the raw text, such as "CC BY".
#' * url_license.txt: containing the url/link to the license on the internet.
#' * text_license.txt: containing the license as raw text.
#' * url_source.txt: containing the url/link to the source on the internet.
#'
#' The id of every .txt file is the file name without the file extension. Please be sure to provide unique file
#' names. Id and raw texts are mandatory; bibliographic and license information are optional. See the example
#' below.
#' @param dir_path Path to the directory where the files are stored.
#' @param batch_size `int` determining the number of files to process at once.
#' @param log_file `string` Path to the file where the log should be saved. If no logging is desired set this
#' argument to `NULL`.
#' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
#' the log files. Only relevant if `log_file` is not `NULL`.
#' @param log_top_value `int` indicating the current iteration of the process.
#' @param log_top_total `int` determining the maximal number of iterations.
#' @param log_top_message `string` providing additional information of the process.
#' @param trace `bool` If `TRUE` information on the progress is printed to the console.
#' @param clean_text `bool` If `TRUE` the text is modified to improve the quality of the following analysis:
#' * Some special symbols are removed.
#' * All spaces at the beginning and the end of a row are removed.
#' * Multiple spaces are reduced to a single space.
#' * All rows with a number from 1 to 999 at the beginning or at the end are removed (header and footer).
#' * Tables of content are removed.
#' * Hyphenation is undone.
#' * Line breaks within a paragraph are removed.
#' * Multiple line breaks are reduced to a single line break.
#' @return The method does not return anything. It adds new raw texts to the data set.
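#' @examples
#' # A minimal sketch of the expected layout: one folder per .txt file, here
#' # with an optional license.txt. Paths use a temporary directory; the
#' # python backend is assumed to be set up.
#' \dontrun{
#' corpus_dir <- file.path(tempdir(), "corpus")
#' dir.create(file.path(corpus_dir, "doc_1"), recursive = TRUE)
#' writeLines("This is the raw text.", file.path(corpus_dir, "doc_1", "doc_1.txt"))
#' writeLines("CC BY", file.path(corpus_dir, "doc_1", "license.txt"))
#' data_set <- LargeDataSetForText$new()
#' data_set$add_from_files_txt(dir_path = corpus_dir)
#' }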
add_from_files_txt = function(dir_path,
batch_size = 500L,
log_file = NULL,
log_write_interval = 2L,
log_top_value = 0L,
log_top_total = 1L,
log_top_message = NA,
clean_text = TRUE,
trace = TRUE) {
# Gather all text files
file_paths <- private$get_file_paths(dir_path, ".txt")
if (length(file_paths) > 0L) {
# calculate number of batches
n_batches <- ceiling(length(file_paths) / batch_size)
# get indices for every batch
batches <- get_batches_index(
number_rows = length(file_paths),
batch_size = batch_size
)
# Process every batch
list_datasets <- list()
last_log <- NULL
for (i in 1L:n_batches) {
chunk <- private$get_batch(batches[[i]],
file_paths = file_paths,
clean_text = clean_text
)
chunk_dataset <- data.frame_to_py_dataset(chunk)
list_datasets[[i]] <- chunk_dataset
if (trace) {
message(
get_time_stamp(),
" Batch ", i, " of ", n_batches, " processed"
)
}
last_log <- write_log(
log_file = log_file,
last_log = last_log,
write_interval = log_write_interval,
value_top = log_top_value,
value_middle = i,
value_bottom = 0L,
total_top = log_top_total,
total_middle = n_batches,
total_bottom = 1L,
message_top = log_top_message,
message_middle = ".txt files",
message_bottom = NA
)
}
# concatenate datasets
new_dataset <- datasets$concatenate_datasets(dsets = list_datasets, axis = 0L)
# Add new dataset
private$add(new_dataset)
}
},
#--------------------------------------------------------------------------
#' @description Method for adding raw texts saved within .pdf files to the data set. Please note that the directory
#' should contain one folder for each .pdf file. In order to create an informative data set, every folder can
#' contain the following additional files:
#'
#' * bib_entry.txt: containing a text version of the bibliographic information of the raw text.
#' * license.txt: containing a statement about the license of the raw text, such as "CC BY".
#' * url_license.txt: containing the url/link to the license on the internet.
#' * text_license.txt: containing the license as raw text.
#' * url_source.txt: containing the url/link to the source on the internet.
#'
#' The id of every .pdf file is the file name without the file extension. Please be sure to provide unique file
#' names. Id and raw texts are mandatory; bibliographic and license information are optional. See the example
#' below.
#' @param dir_path Path to the directory where the files are stored.
#' @param batch_size `int` determining the number of files to process at once.
#' @param trace `bool` If `TRUE` information on the progress is printed to the console.
#' @param clean_text `bool` If `TRUE` the text is modified to improve the quality of the following analysis:
#' * Some special symbols are removed.
#' * All spaces at the beginning and the end of a row are removed.
#' * Multiple spaces are reduced to a single space.
#' * All rows with a number from 1 to 999 at the beginning or at the end are removed (header and footer).
#' * Tables of content are removed.
#' * Hyphenation is undone.
#' * Line breaks within a paragraph are removed.
#' * Multiple line breaks are reduced to a single line break.
#' @param log_top_value `int` indicating the current iteration of the process.
#' @param log_top_total `int` determining the maximal number of iterations.
#' @param log_top_message `string` providing additional information of the process.
#' @param log_file `string` Path to the file where the log should be saved. If no logging is desired set this
#' argument to `NULL`.
#' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
#' the log files. Only relevant if `log_file` is not `NULL`.
#' @return The method does not return anything. It adds new raw texts to the data set.
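#' @examples
#' # A minimal sketch; "pdf_corpus_dir" is a hypothetical directory containing
#' # one folder per .pdf file, laid out as described above. The python backend
#' # is assumed to be set up.
#' \dontrun{
#' data_set <- LargeDataSetForText$new()
#' data_set$add_from_files_pdf(dir_path = "pdf_corpus_dir", batch_size = 100L)
#' }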
add_from_files_pdf = function(dir_path,
batch_size = 500L,
log_file = NULL,
log_write_interval = 2L,
log_top_value = 0L,
log_top_total = 1L,
log_top_message = NA,
clean_text = TRUE,
trace = TRUE) {
# Gather all files
file_paths <- private$get_file_paths(dir_path, ".pdf")
if (length(file_paths) > 0L) {
# calculate number of batches
n_batches <- ceiling(length(file_paths) / batch_size)
# get indices for every batch
batches <- get_batches_index(
number_rows = length(file_paths),
batch_size = batch_size
)
# Process every batch
list_datasets <- list()
last_log <- NULL
for (i in 1L:n_batches) {
chunk <- private$get_batch(batches[[i]],
file_paths = file_paths,
clean_text = clean_text
)
chunk_dataset <- data.frame_to_py_dataset(chunk)
list_datasets[[i]] <- chunk_dataset
if (trace) {
message(
get_time_stamp(),
" Batch ", i, " of ", n_batches, " processed"
)
}
last_log <- write_log(
log_file = log_file,
last_log = last_log,
write_interval = log_write_interval,
value_top = log_top_value,
value_middle = i,
value_bottom = 0L,
total_top = log_top_total,
total_middle = n_batches,
total_bottom = 1L,
message_top = log_top_message,
message_middle = ".pdf files",
message_bottom = NA
)
}
# concatenate datasets
new_dataset <- datasets$concatenate_datasets(dsets = list_datasets, axis = 0L)
# Add new dataset
private$add(new_dataset)
}
},
#--------------------------------------------------------------------------
#' @description Method for adding raw texts saved within .xlsx files to the data set. The method assumes that the
#' texts are stored in the rows and that the columns store the id and the raw texts. In addition, columns for the
#' bibliographic information and the license can be added. The names of these columns must be specified with the
#' following arguments and must be the same for all .xlsx files in the chosen directory. Id and raw texts are
#' mandatory; bibliographic information, license, license's url, license's text, and source's url are optional.
#' Additional columns are dropped. See the example below.
#' @param dir_path Path to the directory where the files are stored.
#' @param id_column `string` Name of the column storing the ids for the texts.
#' @param text_column `string` Name of the column storing the raw text.
#' @param bib_entry_column `string` Name of the column storing the bibliographic information of the texts.
#' @param license_column `string` Name of the column storing information about the licenses.
#' @param url_license_column `string` Name of the column storing the url/link to the license on the
#' internet.
#' @param text_license_column `string` Name of the column storing the license as text.
#' @param url_source_column `string` Name of the column storing the url/link to the source on the
#' internet.
#' @param trace `bool` If `TRUE` prints information on the progress to the console.
#' @param log_top_value `int` indicating the current iteration of the process.
#' @param log_top_total `int` determining the maximal number of iterations.
#' @param log_top_message `string` providing additional information of the process.
#' @param log_file `string` Path to the file where the log should be saved. If no logging is desired set this
#' argument to `NULL`.
#' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
#' the log files. Only relevant if `log_file` is not `NULL`.
#' @return The method does not return anything. It adds new raw texts to the data set.
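#' @examples
#' # A minimal sketch; "xlsx_dir" is a hypothetical directory containing .xlsx
#' # files with at least the columns "id" and "text". The python backend is
#' # assumed to be set up.
#' \dontrun{
#' data_set <- LargeDataSetForText$new()
#' data_set$add_from_files_xlsx(
#'   dir_path = "xlsx_dir",
#'   id_column = "id",
#'   text_column = "text"
#' )
#' }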
add_from_files_xlsx = function(dir_path,
trace = TRUE,
id_column = "id",
text_column = "text",
bib_entry_column = "bib_entry",
license_column = "license",
url_license_column = "url_license",
text_license_column = "text_license",
url_source_column = "url_source",
log_file = NULL,
log_write_interval = 2L,
log_top_value = 0L,
log_top_total = 1L,
log_top_message = NA) {
# Check
check_type(object = id_column, type = "string", FALSE)
check_type(object = text_column, type = "string", FALSE)
check_type(object = bib_entry_column, type = "string", TRUE)
check_type(object = license_column, type = "string", TRUE)
check_type(object = url_license_column, type = "string", TRUE)
check_type(object = text_license_column, type = "string", TRUE)
check_type(object = url_source_column, type = "string", TRUE)
check_type(object = trace, type = "bool", FALSE)
check_type(object = dir_path, type = "string", FALSE)
# Gather all files
file_paths <- private$get_file_paths(dir_path, file_type = ".xlsx")
n_batches <- length(file_paths)
# Process every batch
list_datasets <- list()
last_log <- NULL
# seq_len() guards against the case of zero files
for (i in seq_len(n_batches)) {
chunk <- readtext::readtext(
file = file_paths[i],
docid_field = id_column,
text_field = text_column,
docvarsfrom = "metadata"
)
if (nrow(chunk) < 2L) {
chunk <- readtext::readtext(
file = file_paths[i],
docid_field = NULL,
text_field = text_column,
docvarsfrom = "metadata"
)
}
# Set the correct name of the id column (readtext stores it as "doc_id")
index <- which(colnames(chunk) %in% "doc_id")
colnames(chunk)[index] <- "id"
# Map the optional metadata columns onto their standard names; columns
# missing from the file are added and filled with NA
optional_columns <- list(
bib_entry = bib_entry_column,
license = license_column,
url_license = url_license_column,
text_license = text_license_column,
url_source = url_source_column
)
for (target in names(optional_columns)) {
index <- which(colnames(chunk) %in% optional_columns[[target]])
if (length(index) == 0L) {
chunk[[target]] <- rep(NA_character_, nrow(chunk))
} else {
colnames(chunk)[index] <- target
}
}
# Select only the necessary columns
chunk <- chunk[c(
"id",
"text",
"bib_entry",
"license",
"url_license",
"text_license",
"url_source"
)]
chunk_dataset <- data.frame_to_py_dataset(chunk)
list_datasets[[i]] <- chunk_dataset
if (trace) {
message(
get_time_stamp(),
" Batch ", i, " of ", n_batches, " processed"
)
}
last_log <- write_log(
log_file = log_file,
last_log = last_log,
write_interval = log_write_interval,
value_top = log_top_value,
value_middle = i,
value_bottom = 0L,
total_top = log_top_total,
total_middle = n_batches,
total_bottom = 1L,
message_top = log_top_message,
message_middle = ".xlsx files",
message_bottom = NA
)
}
if (n_batches > 0L) {
# concatenate datasets
new_dataset <- datasets$concatenate_datasets(dsets = list_datasets, axis = 0L)
# Add new dataset
private$add(new_dataset)
}
},
#--------------------------------------------------------------------------
#' @description Method for adding raw texts from a `data.frame`
#' @param data_frame Object of class `data.frame` with at least the following columns: "id", "text", "bib_entry",
#' "license", "url_license", "text_license", and "url_source". If "id" and/or "text" is missing, an error occurs.
#' If the other columns are not present in the `data.frame`, they are added with empty values (`NA`).
#' Additional columns are dropped.
#' @return The method does not return anything. It adds new raw texts to the data set.
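#' @examples
#' # A minimal sketch: only the mandatory columns "id" and "text" are
#' # provided; the remaining columns are filled with NA automatically.
#' # Assumes the python backend is set up.
#' \dontrun{
#' example_data <- data.frame(
#'   id = c("doc_1", "doc_2"),
#'   text = c("First text.", "Second text.")
#' )
#' data_set <- LargeDataSetForText$new()
#' data_set$add_from_data.frame(example_data)
#' }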
add_from_data.frame = function(data_frame) {
# Necessary columns
if (!is.data.frame(data_frame)) {
stop("Input must be of type data.frame")
}
if (!"id" %in% colnames(data_frame)) {
stop("data.frame must contain a column id.")
}
if (!"text" %in% colnames(data_frame)) {
stop("data.frame must contain a column text.")
}
data_to_add <- data_frame
# optional columns
if (!"bib_entry" %in% colnames(data_to_add)) {
data_to_add["bib_entry"] <- rep(x = NA, times = nrow(data_to_add))
}
if (!"license" %in% colnames(data_to_add)) {
data_to_add["license"] <- rep(x = NA, times = nrow(data_to_add))
}
if (!"url_license" %in% colnames(data_to_add)) {
data_to_add["url_license"] <- rep(x = NA, times = nrow(data_to_add))
}
if (!"text_license" %in% colnames(data_to_add)) {
data_to_add["text_license"] <- rep(x = NA, times = nrow(data_to_add))
}
if (!"url_source" %in% colnames(data_to_add)) {
data_to_add["url_source"] <- rep(x = NA, times = nrow(data_to_add))
}
# Transform to a python dataset
new_dataset <- data.frame_to_py_dataset(data_to_add[c(
"id",
"text",
"bib_entry",
"license",
"url_license",
"text_license",
"url_source"
)])
# Add new dataset
private$add(new_dataset)
},
#--------------------------------------------------------------------------
#' @description Method for requesting all private fields and methods. Used for loading and updating an object.
#' @return Returns a `list` with all private fields and methods.
get_private = function() {
return(private)
}
),
private = list(
get_file_paths = function(dir_path, file_type) {
file_paths <- list.files(
path = dir_path,
include.dirs = FALSE,
all.files = TRUE,
full.names = TRUE,
recursive = TRUE,
pattern = paste0("*", file_type)
)
if (length(file_paths) > 0L) {
file_paths <- private$clean_path(file_paths)
}
return(file_paths)
},
clean_path = function(paths) {
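# Drop the metadata files (bib_entry.txt, license.txt, text_license.txt,
# url_license.txt, url_source.txt) that list.files() picks up alongside
# the raw text files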
new_paths <- rep(NA_character_, length(paths))
for (i in seq_len(length(paths))) {
path <- paths[i]
bib_entry_path <- file.path(
dirname(path), "bib_entry.txt"
)
licence_path <- file.path(
dirname(path), "license.txt"
)
license_text_path <- file.path(
dirname(path), "text_license.txt"
)
url_licence_path <- file.path(
dirname(path), "url_license.txt"
)
url_source_path <- file.path(
dirname(path), "url_source.txt"
)
path_list <- c(bib_entry_path, licence_path, license_text_path, url_licence_path, url_source_path)
if (!path %in% path_list) {
new_paths[i] <- path
}
}
return(na.omit(new_paths))
},
get_batch = function(batch, file_paths, clean_text = TRUE) {
col_names <- c(
"id",
"text",
"bib_entry",
"license",
"url_license",
"text_license",
"url_source"
)
tmp_data <- matrix(
data = NA,
nrow = length(batch),
ncol = length(col_names)
)
colnames(tmp_data) <- col_names
for (i in seq_len(length(batch))) {
index <- batch[i]
document <- readtext::readtext(file = file_paths[index])
# ID
tmp_data[i, 1L] <- private$remove_file_extension(document$doc_id)
# Text
if (clean_text) {
tmp_text <- private$clean_text(document$text)
} else {
tmp_text <- document$text
}
tmp_data[i, 2L] <- tmp_text
# Optional metadata: read the corresponding .txt files from the same
# folder if they exist. Multi-line files are collapsed into a single
# string so the result fits into a single matrix cell.
meta_files <- c(
bib_entry = "bib_entry.txt",
license = "license.txt",
url_license = "url_license.txt",
text_license = "text_license.txt",
url_source = "url_source.txt"
)
for (j in seq_along(meta_files)) {
file_path <- file.path(dirname(file_paths[index]), meta_files[[j]])
if (file.exists(file_path)) {
tmp_data[i, 2L + j] <- paste(readLines(con = file_path, warn = FALSE), collapse = "\n")
} else {
tmp_data[i, 2L + j] <- NA
}
}
}
return(as.data.frame(tmp_data))
},
clean_text = function(text) {
# Remove some special symbols-------------------------------------------------
text <- stringi::stri_replace_all(text, regex = "\\|", replacement = "")
text <- stringi::stri_replace_all(text, regex = "([:blank:]*)-([:blank:]*)", replacement = "-")
# Normalization of blank positions and new lines-------------------------------
# Remove all blanks at the beginning of a line
text <- stringi::stri_replace_all(text, regex = "\\n([:blank:]{1,})", replacement = "\n")
# Remove all blanks at the end of a line
text <- stringi::stri_replace_all(text, regex = "([:blank:]{1,})\\n", replacement = "\n")
# Reduce multiple blanks to a single blank
text <- stringi::stri_replace_all(text, regex = "[:blank:]{2,}", replacement = " ")
# Remove running heads---------------------------------------------------------
# Remove all lines starting with a number up to 999 or with a section number (e.g., 1.2)
text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:blank:])*\\n", replacement = "\n")
# Remove all lines ending with a number up to 999
text <- stringi::stri_replace_all(text, regex = "\\n[:alpha:]([:digit:]|[:alpha:]|[:punct:]|[:blank:])*([:digit:]{1,10})\\n", replacement = "\n")
# Remove table of contents------------------------------------------------------
# The pattern is applied three times so that consecutive entries are caught
text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:digit:]|[:blank:])*(([:punct:]|[:blank:]){2,})([:digit:]{1,10})\\n", replacement = "\n")
text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:digit:]|[:blank:])*(([:punct:]|[:blank:]){2,})([:digit:]{1,10})\\n", replacement = "\n")
text <- stringi::stri_replace_all(text, regex = "\\n(([:digit:]|[:punct:]){1,10})([:alpha:]|[:punct:]|[:digit:]|[:blank:])*(([:punct:]|[:blank:]){2,})([:digit:]{1,10})\\n", replacement = "\n")
# Normalization of paragraphs---------------------------------------------------
# Undo hyphenation at line breaks
text <- stringi::stri_replace_all(text, regex = "-\\n([:space:]*)", replacement = "")
# Remove line breaks within paragraphs
text <- stringi::stri_replace_all(text, regex = "(?<![:space:])\\n(?![:space:])", replacement = " ")
# Final text cleaning-----------------------------------------------------------
# Reduce multiple consecutive line breaks to a single line break
text <- stringi::stri_replace_all(text, regex = "\\n([:space:]*)\\n", replacement = "\n")
return(text)
},
remove_file_extension = function(file) {
# Split on every dot and drop the last part (the file extension)
tmp_string <- stringi::stri_split_fixed(file, pattern = ".", n = -1L, simplify = TRUE)
return(paste0(tmp_string[1L, 1L:(ncol(tmp_string) - 1L)], collapse = "."))
}
)
)
# Add the data set class to the index of available classes
DataSetsIndex$LargeDataSetForText <- "LargeDataSetForText"