Nothing
# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>
#' @title Convert data.frame to arrow data set
#' @description Function for converting a data.frame into a pyarrow data set.
#'
#' @param data_frame Object of class `data.frame`.
#' @return Returns the `data.frame` as a pyarrow data set of class `datasets.arrow_dataset.Dataset`.
#'
#' @family Data Management
#' @keywords internal
#' @noRd
data.frame_to_py_dataset <- function(data_frame) {
if (nrow(data_frame) == 1) {
data_frame <- rbind(data_frame, data_frame)
data_list <- as.list(data_frame)
data_dict <- reticulate::dict(data_list)
dataset <- datasets$Dataset$from_dict(data_dict)
dataset <- dataset$select(indices = list(as.integer(0)))
} else {
data_list <- as.list(data_frame)
data_dict <- reticulate::dict(data_list)
dataset <- datasets$Dataset$from_dict(data_dict)
}
return(dataset)
}
#' @title Convert arrow data set to an arrow data set
#' @description Function for converting an arrow data set into a data set that can be used to store and process
#' embeddings.
#'
#' @param py_dataset Object of class `datasets.arrow_dataset.Dataset`.
#' @return Returns the data set of class `datasets.arrow_dataset.Dataset` with only two columns ("id","input"). "id"
#' stores the name of the cases while "input" stores the embeddings.
#'
#' @family Data Management
#' @keywords internal
#' @noRd
py_dataset_to_embeddings <- function(py_dataset) {
py_dataset$set_format("np")
embeddings <- py_dataset["input"]
rownames(embeddings) <- as.character(py_dataset["id"])
return(embeddings)
}
#' @title Convert R array for arrow data set
#' @description Function converts a R array into a numpy array that can be added to an arrow data set. The array should
#' represent embeddings.
#'
#' @param r_array `array` representing embeddings.
#' @return Returns a numpy array.
#'
#' @family Data Management
#' @keywords internal
#' @noRd
prepare_r_array_for_dataset <- function(r_array) {
tmp_np_array<-reticulate::r_to_py(
np$squeeze(np$split(reticulate::np_array(r_array), as.integer(nrow(r_array)), axis = 0L))
)
if(length(tmp_np_array$shape)==2){
tmp_np_array=np$expand_dims(
tmp_np_array,
1L
)
}
return(tmp_np_array)
}
#' @title Assign cases to batches
#' @description Function groups cases into batches.
#'
#' @param number_rows `int` representing the number of cases or rows of a matrix or array.
#' @param batch_size `int` size of a batch.
#' @param zero_based `bool` If `TRUE` the indices of the cases within each batch are zero based. One based if `FALSE`.
#' @return Returns a `list` of batches. Each entry in the list contains a `vector` of `int` representing the cases
#' belonging to that batch.
#'
#' @family Data Management
#' @keywords internal
#' @noRd
get_batches_index <- function(number_rows, batch_size, zero_based = FALSE) {
n_batches <- ceiling(number_rows / batch_size)
index_list <- NULL
for (i in 1:n_batches) {
min <- 1 + (i - 1) * batch_size
max <- min(batch_size + (i - 1) * batch_size, number_rows)
if (zero_based == FALSE) {
index_list[i] <- list(seq.int(from = min, to = max, by = 1))
} else {
index_list[i] <- list(seq.int(from = min, to = max, by = 1) - 1)
}
}
return(index_list)
}
#' @title Reduce to unique cases
#' @description Function creates an arrow data set that contains only unique cases. That is, duplicates are removed.
#'
#' @param dataset_to_reduce Object of class `datasets.arrow_dataset.Dataset`.
#' @param column_name `string` Name of the column whose values should be unique.
#' @return Returns a data set of class `datasets.arrow_dataset.Dataset` where the duplicates are removed according to
#' the given column.
#'
#' @family Data Management
#' @keywords internal
#' @noRd
reduce_to_unique <- function(dataset_to_reduce, column_name) {
selected_column <- dataset_to_reduce$data$column(column_name)
unique_values <- pyarrow$compute$unique(selected_column)
unique_indices <- pyarrow$compute$index_in(unique_values, selected_column)
reduced_dataset <- dataset_to_reduce$select(unique_indices)
return(reduced_dataset)
}
#' @title Convert class vector to arrow data set
#' @description Function converts a vector of class indices into an arrow data set.
#'
#' @param vector `vector` of class indices.
#' @return Returns a data set of class `datasets.arrow_dataset.Dataset` containing the class indices.
#'
#' @family Data Management
#' @keywords internal
#' @noRd
class_vector_to_py_dataset <- function(vector) {
data_frame <- as.data.frame(vector)
data_frame$id <- names(vector)
colnames(data_frame) <- c("labels", "id")
if (length(data_frame) == 1) {
data_frame <- rbind(data_frame, data_frame)
data_list <- as.list(data_frame)
data_dict <- reticulate::dict(data_list)
dataset <- datasets$Dataset$from_dict(data_dict)
dataset <- dataset$select(indices = list(as.integer(0)))
} else {
data_list <- as.list(data_frame)
data_dict <- reticulate::dict(data_list)
dataset <- datasets$Dataset$from_dict(data_dict)
}
return(dataset)
}
#' @title Combine embeddings in array form
#' @description Function combines two array by adding an array to another array along the
#' first dimension.
#'
#' @param ... `array` or `list` of `array`s. Arrays to be combined. All array must have row names.
#'
#' @return Returns the combined array
#'
#' @family Data Management
#' @keywords internal
#' @noRd
array_form_bind <- function(...) {
objects <- list(...)
arrays <- NULL
for (object in objects) {
if (is.list(object)) {
for (j in 1:length(object)) {
arrays[length(arrays) + 1] <- list(object[[j]])
}
} else {
arrays[length(arrays) + 1] <- list(object)
}
}
# arrays <- list(...)
if (length(arrays) > 1) {
total_rows <- 0
dimension <- dim(arrays[[1]])
for (i in 1:length(arrays)) {
total_rows <- total_rows + dim(arrays[[i]])[1]
# Check number of dimensions
if (sum(dim(arrays[[i]])[-1] != dimension[-1])) {
stop("The dimensions of the array differ.")
}
}
combined_array <- array(
data = NA,
dim = c(total_rows, dimension[2], dimension[3])
)
intercept <- 0
row_names <- NULL
for (i in 1:length(arrays)) {
index <- seq.int(
from = 1,
to = nrow(arrays[[i]]),
by = 1
) + intercept
combined_array[index, , ] <- arrays[[i]]
intercept <- intercept + length(index)
row_names <- append(x = row_names, rownames(arrays[[i]]))
}
rownames(combined_array) <- row_names
return(combined_array)
} else {
return(arrays[[1]])
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.