Nothing
#' @title DataBackend for data.table
#'
#' @description
#' [DataBackend] for \CRANpkg{data.table} which serves as an efficient in-memory data base.
#'
#' @template param_rows
#' @template param_cols
#' @template param_data_format
#' @template param_primary_key
#' @template param_na_rm
#'
#' @template seealso_databackend
#' @export
#' @examples
#' data = as.data.table(palmerpenguins::penguins)
#' data$id = seq_len(nrow(palmerpenguins::penguins))
#' b = DataBackendDataTable$new(data = data, primary_key = "id")
#' print(b)
#' b$head()
#' b$data(rows = 100:101, cols = "species")
#'
#' b$nrow
#' head(b$rownames)
#'
#' b$ncol
#' b$colnames
#'
#' # alternative construction
#' as_data_backend(palmerpenguins::penguins)
DataBackendDataTable = R6Class("DataBackendDataTable", inherit = DataBackend,
cloneable = FALSE,
public = list(
#' @field compact_seq `logical(1)`\cr
#' If `TRUE`, row ids are a natural sequence from 1 to `nrow(data)` (determined internally).
#' In this case, row lookup uses faster positional indices instead of equi joins.
compact_seq = FALSE,
#' @description
#' Creates a new instance of this [R6][R6::R6Class] class.
#'
#' Note that `DataBackendDataTable` does not copy the input data, while `as_data_backend()` calls [data.table::copy()].
#' `as_data_backend()` also takes care about casting to a `data.table()` and adds a primary key column if necessary.
#'
#' @param data ([data.table::data.table()])\cr
#' The input [data.table()].
initialize = function(data, primary_key) {
assert_data_table(data, col.names = "unique")
super$initialize(setkeyv(data, primary_key), primary_key)
ii = match(primary_key, names(data))
if (is.na(ii)) {
stopf("Primary key '%s' not in 'data'", primary_key)
}
private$.cache = set_names(replace(rep(NA, ncol(data)), ii, FALSE), names(data))
},
#' @description
#' Returns a slice of the data in the specified format.
#' Currently, the only supported formats are `"data.table"` and `"Matrix"`.
#' The rows must be addressed as vector of primary key values, columns must be referred to via column names.
#' Queries for rows with no matching row id and queries for columns with no matching column name are silently ignored.
#' Rows are guaranteed to be returned in the same order as `rows`, columns may be returned in an arbitrary order.
#' Duplicated row ids result in duplicated rows, duplicated column names lead to an exception.
data = function(rows, cols, data_format) {
rows = assert_integerish(rows, coerce = TRUE)
assert_names(cols, type = "unique")
if (!missing(data_format)) warn_deprecated("DataBackendDataTable$data argument 'data_format'")
cols = intersect(cols, colnames(private$.data))
if (self$compact_seq) {
private$.data[rows, cols, with = FALSE, nomatch = NULL]
} else {
ijoin(private$.data, rows, cols, self$primary_key)
}
},
#' @description
#' Retrieve the first `n` rows.
#'
#' @param n (`integer(1)`)\cr
#' Number of rows.
#'
#' @return [data.table::data.table()] of the first `n` rows.
head = function(n = 6L) {
head(private$.data, n)
},
#' @description
#' Returns a named list of vectors of distinct values for each column
#' specified. If `na_rm` is `TRUE`, missing values are removed from the
#' returned vectors of distinct values. Non-existing rows and columns are
#' silently ignored.
#'
#' @return Named `list()` of distinct values.
distinct = function(rows, cols, na_rm = TRUE) {
cols = intersect(cols, colnames(private$.data))
if (is.null(rows)) {
set_names(lapply(cols, function(x) distinct_values(private$.data[[x]], drop = FALSE, na_rm = na_rm)), cols)
} else {
lapply(self$data(rows, cols), distinct_values, drop = TRUE, na_rm = na_rm)
}
},
#' @description
#' Returns the number of missing values per column in the specified slice
#' of data. Non-existing rows and columns are silently ignored.
#'
#' @return Total of missing values per column (named `numeric()`).
missings = function(rows, cols) {
missind = private$.cache
missind = missind[reorder_vector(names(missind), cols)]
# update cache
ii = which(is.na(missind))
if (length(ii)) {
missind[ii] = map_lgl(private$.data[, names(missind[ii]), with = FALSE], anyMissing)
private$.cache = insert_named(private$.cache, missind[ii])
}
# query required columns
query_cols = which(missind)
insert_named(
named_vector(names(missind), 0L),
map_int(self$data(rows, names(query_cols)), count_missing)
)
}
),
active = list(
#' @field rownames (`integer()`)\cr
#' Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.
rownames = function() {
private$.data[[self$primary_key]]
},
#' @field colnames (`character()`)\cr
#' Returns vector of all column names, including the primary key column.
colnames = function() {
colnames(private$.data)
},
#' @field nrow (`integer(1)`)\cr
#' Number of rows (observations).
nrow = function() {
nrow(private$.data)
},
#' @field ncol (`integer(1)`)\cr
#' Number of columns (variables), including the primary key column.
ncol = function() {
ncol(private$.data)
}
),
private = list(
.calculate_hash = function() {
calculate_hash(self$compact_seq, private$.data)
},
.cache = NULL
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.