Nothing
# [description] get all column classes of a data.table or data.frame.
# This function collapses the result of class() into a single string
.get_column_classes <- function(df) {
return(
vapply(
X = df
, FUN = function(x) {paste0(class(x), collapse = ",")}
, FUN.VALUE = character(1L)
)
)
}
# [description] check a data frame or data table for columns tthat are any
# type other than numeric and integer. This is used by gpb.convert()
# and gpb.convert_with_rules() too warn if more action is needed by users
# before a dataset can be converted to a gpb.Dataset.
.warn_for_unconverted_columns <- function(df, function_name) {
column_classes <- .get_column_classes(df = df)
unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
if (length(unconverted_columns) > 0L) {
col_detail_string <- paste0(
paste0(
names(unconverted_columns)
, " ("
, unconverted_columns
, ")"
)
, collapse = ", "
)
msg <- paste0(
function_name
, ": "
, length(unconverted_columns)
, " columns are not numeric or integer. These need to be dropped or converted to "
, "be used in an gpb.Dataset object. "
, col_detail_string
)
warning(msg)
}
return(invisible(NULL))
}
.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)}
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)}
#' @name gpb.convert_with_rules
#' @title Data preparator for GPBoost datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{gpb.Dataset}.
#' Factor, character, and logical columns are converted to integer. Missing values
#' in factors and characters will be filled with 0L. Missing values in logicals
#' will be filled with -1L.
#'
#' This function returns and optionally takes in "rules" the describe exactly
#' how to convert values in columns.
#'
#' Columns that contain only NA values will be converted by this function but will
#' not show up in the returned \code{rules}.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#' where names are column names in \code{data} and values are named character
#' vectors whose names are column values and whose values are new values to
#' replace them with.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
#' \code{gpb.Dataset}.
#'
#' @examples
#' \donttest{
#' data(iris)
#'
#' str(iris)
#'
#' new_iris <- gpb.convert_with_rules(data = iris)
#' str(new_iris$data)
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- gpb.convert_with_rules(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
#'
#' newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(
#' Species = c(
#' "setosa" = 3L
#' , "versicolor" = 2L
#' , "virginica" = 1L
#' )
#' )
#' newest_iris <- gpb.convert_with_rules(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#' }
#' @importFrom data.table set
#' @export
gpb.convert_with_rules <- function(data, rules = NULL) {
column_classes <- .get_column_classes(df = data)
is_char <- which(column_classes == "character")
is_factor <- which(column_classes == "factor")
is_logical <- which(column_classes == "logical")
is_data_table <- data.table::is.data.table(x = data)
is_data_frame <- is.data.frame(data)
if (!(is_data_table || is_data_frame)) {
stop(
"gpb.convert_with_rules: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame or data.table"
)
}
# if user didn't provide rules, create them
if (is.null(rules)) {
rules <- list()
columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
for (i in columns_to_fix) {
col_values <- data[[i]]
# Get unique values
if (is.factor(col_values)) {
unique_vals <- levels(col_values)
unique_vals <- unique_vals[!is.na(unique_vals)]
mini_numeric <- seq_along(unique_vals) # respect ordinal
} else if (is.character(col_values)) {
unique_vals <- as.factor(unique(col_values))
unique_vals <- unique_vals[!is.na(unique_vals)]
mini_numeric <- as.integer(unique_vals) # no respect for ordinal
} else if (is.logical(col_values)) {
unique_vals <- c(FALSE, TRUE)
mini_numeric <- c(0L, 1L)
}
# don't add rules for all-NA columns
if (length(unique_vals) > 0L) {
col_name <- names(data)[i]
rules[[col_name]] <- mini_numeric
names(rules[[col_name]]) <- unique_vals
}
}
}
for (col_name in names(rules)) {
if (column_classes[[col_name]] == "logical") {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
} else {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
}
if (is_data_table) {
data.table::set(
x = data
, j = col_name
, value = unname(rules[[col_name]][data[[col_name]]])
)
data[is.na(get(col_name)), (col_name) := default_value_for_na]
} else {
data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
data[is.na(data[col_name]), col_name] <- default_value_for_na
}
}
# if any all-NA columns exist, they won't be in rules. Convert them
all_na_cols <- which(
sapply(
X = data
, FUN = function(x) {
(is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
}
)
)
for (col_name in all_na_cols) {
if (column_classes[[col_name]] == "logical") {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
} else {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
}
if (is_data_table) {
data[, (col_name) := rep(default_value_for_na, .N)]
} else {
data[[col_name]] <- default_value_for_na
}
}
.warn_for_unconverted_columns(df = data, function_name = "gpb.convert_with_rules")
return(list(data = data, rules = rules))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.