R/lgb.convert_with_rules.R

Defines functions lgb.convert_with_rules .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA .warn_for_unconverted_columns .get_column_classes

Documented in lgb.convert_with_rules

# [description] get all column classes of a data.table or data.frame.
#               This function collapses the result of class() into a single string
.get_column_classes <- function(df) {
    return(
        vapply(
            X = df
            , FUN = function(x) {paste0(class(x), collapse = ",")}
            , FUN.VALUE = character(1L)
        )
    )
}

# [description] check a data frame or data table for columns that are any
#               type other than numeric and integer. This is used by lgb.convert_with_rules()
#               to warn if more action is needed by users
#               before a dataset can be converted to a lgb.Dataset.
.warn_for_unconverted_columns <- function(df, function_name) {
    column_classes <- .get_column_classes(df = df)
    unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
    if (length(unconverted_columns) > 0L) {
        col_detail_string <- paste0(
            paste0(
                names(unconverted_columns)
                , " ("
                , unconverted_columns
                , ")"
            )
            , collapse = ", "
        )
        msg <- paste0(
            function_name
            , ": "
            , length(unconverted_columns)
            , " columns are not numeric or integer. These need to be dropped or converted to "
            , "be used in an lgb.Dataset object. "
            , col_detail_string
        )
        warning(msg)
    }
    return(invisible(NULL))
}

.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)}
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)}


#' @name lgb.convert_with_rules
#' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#'              Factor, character, and logical columns are converted to integer. Missing values
#'              in factors and characters will be filled with 0L. Missing values in logicals
#'              will be filled with -1L.
#'
#'              This function returns and optionally takes in "rules" the describe exactly
#'              how to convert values in columns.
#'
#'              Columns that contain only NA values will be converted by this function but will
#'              not show up in the returned \code{rules}.
#'
#'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#'              where names are column names in \code{data} and values are named character
#'              vectors whose names are column values and whose values are new values to
#'              replace them with.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#'         Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
#'         \code{lgb.Dataset}.
#'
#' @examples
#' \donttest{
#' data(iris)
#'
#' str(iris)
#'
#' new_iris <- lgb.convert_with_rules(data = iris)
#' str(new_iris$data)
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.convert_with_rules(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
#'
#' newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(
#'   Species = c(
#'     "setosa" = 3L
#'     , "versicolor" = 2L
#'     , "virginica" = 1L
#'   )
#' )
#' newest_iris <- lgb.convert_with_rules(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#' }
#' @importFrom data.table set
#' @export
lgb.convert_with_rules <- function(data, rules = NULL) {

    column_classes <- .get_column_classes(df = data)

    is_char <- which(column_classes == "character")
    is_factor <- which(column_classes == "factor")
    is_logical <- which(column_classes == "logical")

    is_data_table <- data.table::is.data.table(x = data)
    is_data_frame <- is.data.frame(data)

    if (!(is_data_table || is_data_frame)) {
        stop(
            "lgb.convert_with_rules: you provided "
            , paste(class(data), collapse = " & ")
            , " but data should have class data.frame or data.table"
        )
    }

    # if user didn't provide rules, create them
    if (is.null(rules)) {
        rules <- list()
        columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))

        for (i in columns_to_fix) {

          col_values <- data[[i]]

          # Get unique values
          if (is.factor(col_values)) {
              unique_vals <- levels(col_values)
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- seq_along(unique_vals) # respect ordinal
          } else if (is.character(col_values)) {
              unique_vals <- as.factor(unique(col_values))
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- as.integer(unique_vals)  # no respect for ordinal
          } else if (is.logical(col_values)) {
              unique_vals <- c(FALSE, TRUE)
              mini_numeric <- c(0L, 1L)
          }

          # don't add rules for all-NA columns
          if (length(unique_vals) > 0L) {
              col_name <- names(data)[i]
              rules[[col_name]] <- mini_numeric
              names(rules[[col_name]]) <- unique_vals
          }
        }
    }

    for (col_name in names(rules)) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
        }
        if (is_data_table) {
            data.table::set(
                x = data
                , j = col_name
                , value = unname(rules[[col_name]][data[[col_name]]])
            )
            data[is.na(get(col_name)), (col_name) := default_value_for_na]
        } else {
            data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
            data[is.na(data[col_name]), col_name] <- default_value_for_na
        }
    }

    # if any all-NA columns exist, they won't be in rules. Convert them
    all_na_cols <- which(
        sapply(
            X = data
            , FUN = function(x) {
                (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
            }
        )
    )
    for (col_name in all_na_cols) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
        }
        if (is_data_table) {
            data[, (col_name) := rep(default_value_for_na, .N)]
        } else {
            data[[col_name]] <- default_value_for_na
        }
    }

    .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")

    return(list(data = data, rules = rules))

}

Try the lightgbm package in your browser

Any scripts or data that you put into this service are public.

lightgbm documentation built on Jan. 17, 2023, 1:13 a.m.