R/utils.R
In stochtree: Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference

Documented in convertPreprocessorToJson createPreprocessorFromJson createPreprocessorFromJsonString preprocessPredictionData preprocessTrainData savePreprocessorToJsonString

#' Preprocess a parameter list, overriding defaults with any provided parameters.
#'
#' @param default_params List of parameters with default values set.
#' @param user_params (Optional) User-supplied overrides to `default_params`.
#' @noRd
#'
#' @return Parameter list with defaults overriden by values supplied in `user_params`
preprocessParams <- function(default_params, user_params = NULL) {
    # Override defaults from general_params
    if (!is.null(user_params)) {
        for (key in names(user_params)) {
            if (key %in% names(default_params)) {
                val <- user_params[[key]]
                if (!is.null(val)) default_params[[key]] <- val
            }
        }
    }

    # Return result
    return(default_params)
}

#' Preprocess covariates. DataFrames will be preprocessed based on their column 
#' types. Matrices will be passed through assuming all columns are numeric.
#'
#' @param input_data Covariates, provided as either a dataframe or a matrix
#'
#' @return List with preprocessed (unmodified) data and details on the number of each type 
#' of variable, unique categories associated with categorical variables, and the 
#' vector of feature types needed for calls to BART and BCF.
#' @export
#'
#' @examples
#' cov_mat <- matrix(1:12, ncol = 3)
#' preprocess_list <- preprocessTrainData(cov_mat)
#' X <- preprocess_list$X
preprocessTrainData <- function(input_data) {
    # Input checks
    if ((!is.matrix(input_data)) && (!is.data.frame(input_data))) {
        stop("Covariates provided must be a dataframe or matrix")
    }
    
    # Routing the correct preprocessing function
    if (is.matrix(input_data)) {
        output <- preprocessTrainMatrix(input_data)
    } else {
        output <- preprocessTrainDataFrame(input_data)
    }
    
    return(output)
}

#' Preprocess covariates. DataFrames will be preprocessed based on their column 
#' types. Matrices will be passed through assuming all columns are numeric.
#'
#' @param input_data Covariates, provided as either a dataframe or a matrix
#' @param metadata List containing information on variables, including train set 
#' categories for categorical variables
#'
#' @return Preprocessed data with categorical variables appropriately handled
#' @export
#'
#' @examples
#' cov_df <- data.frame(x1 = 1:5, x2 = 5:1, x3 = 6:10)
#' metadata <- list(num_ordered_cat_vars = 0, num_unordered_cat_vars = 0, 
#'                  num_numeric_vars = 3, numeric_vars = c("x1", "x2", "x3"))
#' X_preprocessed <- preprocessPredictionData(cov_df, metadata)
preprocessPredictionData <- function(input_data, metadata) {
    # Input checks
    if ((!is.matrix(input_data)) && (!is.data.frame(input_data))) {
        stop("Covariates provided must be a dataframe or matrix")
    }
    
    # Routing the correct preprocessing function
    if (is.matrix(input_data)) {
        X <- preprocessPredictionMatrix(input_data, metadata)
    } else {
        X <- preprocessPredictionDataFrame(input_data, metadata)
    }
    
    return(X)
}

#' Preprocess a matrix of covariate values, assuming all columns are numeric.
#' Returns a list including a matrix of preprocessed covariate values and associated tracking.
#'
#' @param input_matrix Covariate matrix.
#'
#' @return List with preprocessed (unmodified) data and details on the number of each type 
#' of variable, unique categories associated with categorical variables, and the 
#' vector of feature types needed for calls to BART and BCF.
#' @noRd
#'
#' @examples
#' cov_mat <- matrix(1:12, ncol = 3)
#' preprocess_list <- preprocessTrainMatrix(cov_mat)
#' X <- preprocess_list$X
preprocessTrainMatrix <- function(input_matrix) {
    # Input checks
    if (!is.matrix(input_matrix)) {
        stop("covariates provided must be a matrix")
    }
    
    # Unpack metadata (assuming all variables are numeric)
    names(input_matrix) <- paste0("x", 1:ncol(input_matrix))
    df_vars <- names(input_matrix)
    num_ordered_cat_vars <- 0
    num_unordered_cat_vars <- 0
    num_numeric_vars <- ncol(input_matrix)
    numeric_vars <- names(input_matrix)
    feature_types <- rep(0, ncol(input_matrix))

    # Unpack data
    X <- input_matrix

    # Aggregate results into a list
    metadata <- list(
        feature_types = feature_types, 
        num_ordered_cat_vars = num_ordered_cat_vars, 
        num_unordered_cat_vars = num_unordered_cat_vars, 
        num_numeric_vars = num_numeric_vars, 
        numeric_vars = numeric_vars, 
        original_var_indices = 1:num_numeric_vars
    )
    output <- list(
        data = X, 
        metadata = metadata
    )
    
    return(output)
}

#' Preprocess a matrix of covariate values, assuming all columns are numeric.
#'
#' @param input_matrix Covariate matrix.
#' @param metadata List containing information on variables, including train set 
#' categories for categorical variables
#'
#' @return Preprocessed data with categorical variables appropriately preprocessed
#' @noRd
#'
#' @examples
#' cov_mat <- matrix(c(1:5, 5:1, 6:10), ncol = 3)
#' metadata <- list(num_ordered_cat_vars = 0, num_unordered_cat_vars = 0, 
#'                  num_numeric_vars = 3, numeric_vars = c("x1", "x2", "x3"))
#' X_preprocessed <- preprocessPredictionMatrix(cov_mat, metadata)
preprocessPredictionMatrix <- function(input_matrix, metadata) {
    # Input checks
    if (!is.matrix(input_matrix)) {
        stop("covariates provided must be a matrix")
    }
    if (!(ncol(input_matrix) == metadata$num_numeric_vars)) {
        stop("Prediction set covariates have inconsistent dimension from train set covariates")
    }
    
    return(input_matrix)
}

#' Preprocess a dataframe of covariate values, converting categorical variables 
#' to integers and one-hot encoding if need be. Returns a list including a 
#' matrix of preprocessed covariate values and associated tracking.
#'
#' @param input_df Dataframe of covariates. Users must pre-process any 
#' categorical variables as factors (ordered for ordered categorical).
#' @noRd
#'
#' @return List with preprocessed data and details on the number of each type 
#' of variable, unique categories associated with categorical variables, and the 
#' vector of feature types needed for calls to BART and BCF.
preprocessTrainDataFrame <- function(input_df) {
    # Input checks / details
    if (!is.data.frame(input_df)) {
        stop("covariates provided must be a data frame")
    }
    df_vars <- names(input_df)
    
    # Detect ordered and unordered categorical variables
    
    # First, ordered categorical: users must have explicitly 
    # converted this to a factor with ordered = TRUE
    factor_mask <- sapply(input_df, is.factor)
    ordered_mask <- sapply(input_df, is.ordered)
    ordered_cat_matches <- factor_mask & ordered_mask
    ordered_cat_vars <- df_vars[ordered_cat_matches]
    ordered_cat_var_inds <- unname(which(ordered_cat_matches))
    num_ordered_cat_vars <- length(ordered_cat_vars)
    if (num_ordered_cat_vars > 0) ordered_cat_df <- input_df[,ordered_cat_vars,drop=FALSE]
    
    # Next, unordered categorical: we will convert character 
    # columns but not integer columns (users must explicitly 
    # convert these to factor)
    character_mask <- sapply(input_df, is.character)
    unordered_cat_matches <- (factor_mask & (!ordered_mask)) | character_mask
    unordered_cat_vars <- df_vars[unordered_cat_matches]
    unordered_cat_var_inds <- unname(which(unordered_cat_matches))
    num_unordered_cat_vars <- length(unordered_cat_vars)
    if (num_unordered_cat_vars > 0) unordered_cat_df <- input_df[,unordered_cat_vars,drop=FALSE]
    
    # Numeric variables
    numeric_matches <- (!ordered_cat_matches) & (!unordered_cat_matches)
    numeric_vars <- df_vars[numeric_matches]
    numeric_var_inds <- unname(which(numeric_matches))
    num_numeric_vars <- length(numeric_vars)
    if (num_numeric_vars > 0) numeric_df <- input_df[,numeric_vars,drop=FALSE]
    
    # Empty outputs
    X <- double(0)
    unordered_unique_levels <- list()
    ordered_unique_levels <- list()
    feature_types <- integer(0)
    original_var_indices <- integer(0)
    
    # First, extract the numeric covariates
    if (num_numeric_vars > 0) {
        Xnum <- double(0)
        for (i in 1:ncol(numeric_df)) {
            stopifnot(is.numeric(numeric_df[,i]))
            Xnum <- cbind(Xnum, numeric_df[,i])
        }
        X <- cbind(X, unname(Xnum))
        feature_types <- c(feature_types, rep(0, ncol(Xnum)))
        original_var_indices <- c(original_var_indices, numeric_var_inds)
    }
    
    # Next, run some simple preprocessing on the ordered categorical covariates
    if (num_ordered_cat_vars > 0) {
        Xordcat <- double(0)
        for (i in 1:ncol(ordered_cat_df)) {
            var_name <- names(ordered_cat_df)[i]
            preprocess_list <- orderedCatInitializeAndPreprocess(ordered_cat_df[,i])
            ordered_unique_levels[[var_name]] <- preprocess_list$unique_levels
            Xordcat <- cbind(Xordcat, preprocess_list$x_preprocessed)
        }
        X <- cbind(X, unname(Xordcat))
        feature_types <- c(feature_types, rep(1, ncol(Xordcat)))
        original_var_indices <- c(original_var_indices, ordered_cat_var_inds)
    }
    
    # Finally, one-hot encode the unordered categorical covariates
    if (num_unordered_cat_vars > 0) {
        one_hot_mats <- list()
        for (i in 1:ncol(unordered_cat_df)) {
            var_name <- names(unordered_cat_df)[i]
            encode_list <- oneHotInitializeAndEncode(unordered_cat_df[,i])
            unordered_unique_levels[[var_name]] <- encode_list$unique_levels
            one_hot_mats[[var_name]] <- encode_list$Xtilde
            one_hot_var <- rep(unordered_cat_var_inds[i], ncol(encode_list$Xtilde))
            original_var_indices <- c(original_var_indices, one_hot_var)
        }
        Xcat <- do.call(cbind, one_hot_mats)
        X <- cbind(X, unname(Xcat))
        feature_types <- c(feature_types, rep(1, ncol(Xcat)))
    }
    
    # Aggregate results into a list
    metadata <- list(
        feature_types = feature_types, 
        num_ordered_cat_vars = num_ordered_cat_vars, 
        num_unordered_cat_vars = num_unordered_cat_vars, 
        num_numeric_vars = num_numeric_vars, 
        original_var_indices = original_var_indices
    )
    if (num_ordered_cat_vars > 0) {
        metadata[["ordered_cat_vars"]] = ordered_cat_vars
        metadata[["ordered_unique_levels"]] = ordered_unique_levels
    }
    if (num_unordered_cat_vars > 0) {
        metadata[["unordered_cat_vars"]] = unordered_cat_vars
        metadata[["unordered_unique_levels"]] = unordered_unique_levels
    }
    if (num_numeric_vars > 0) metadata[["numeric_vars"]] = numeric_vars
    output <- list(
        data = X, 
        metadata = metadata
    )
    
    return(output)
}

#' Preprocess a dataframe of covariate values, converting categorical variables 
#' to integers and one-hot encoding if need be.
#'
#' @param input_df Dataframe of covariates. Users must pre-process any 
#' categorical variables as factors (ordered for ordered categorical).
#' @param metadata List containing information on variables, including train set 
#' categories for categorical variables
#'
#' @return Preprocessed data with categorical variables appropriately preprocessed
#' @noRd
#'
#' @examples
#' cov_df <- data.frame(x1 = 1:5, x2 = 5:1, x3 = 6:10)
#' metadata <- list(num_ordered_cat_vars = 0, num_unordered_cat_vars = 0, 
#'                  num_numeric_vars = 3, numeric_vars = c("x1", "x2", "x3"))
#' X_preprocessed <- preprocessPredictionDataFrame(cov_df, metadata)
preprocessPredictionDataFrame <- function(input_df, metadata) {
    if (!is.data.frame(input_df)) {
        stop("covariates provided must be a data frame")
    }
    df_vars <- names(input_df)
    num_ordered_cat_vars <- metadata$num_ordered_cat_vars
    num_unordered_cat_vars <- metadata$num_unordered_cat_vars
    num_numeric_vars <- metadata$num_numeric_vars
    
    if (num_ordered_cat_vars > 0) {
        ordered_cat_vars <- metadata$ordered_cat_vars
        ordered_cat_df <- input_df[,ordered_cat_vars,drop=FALSE]
    }
    if (num_unordered_cat_vars > 0) {
        unordered_cat_vars <- metadata$unordered_cat_vars
        unordered_cat_df <- input_df[,unordered_cat_vars,drop=FALSE]
    }
    if (num_numeric_vars > 0) {
        numeric_vars <- metadata$numeric_vars
        numeric_df <- input_df[,numeric_vars,drop=FALSE]
    }
    
    # Empty outputs
    X <- double(0)
    
    # First, extract the numeric covariates
    if (num_numeric_vars > 0) {
        Xnum <- double(0)
        for (i in 1:ncol(numeric_df)) {
            stopifnot(is.numeric(numeric_df[,i]))
            Xnum <- cbind(Xnum, numeric_df[,i])
        }
        X <- cbind(X, unname(Xnum))
    }
    
    # Next, run some simple preprocessing on the ordered categorical covariates
    if (num_ordered_cat_vars > 0) {
        Xordcat <- double(0)
        for (i in 1:ncol(ordered_cat_df)) {
            var_name <- names(ordered_cat_df)[i]
            x_preprocessed <- orderedCatPreprocess(ordered_cat_df[,i], metadata$ordered_unique_levels[[var_name]])
            Xordcat <- cbind(Xordcat, x_preprocessed)
        }
        X <- cbind(X, unname(Xordcat))
    }
    
    # Finally, one-hot encode the unordered categorical covariates
    if (num_unordered_cat_vars > 0) {
        one_hot_mats <- list()
        for (i in 1:ncol(unordered_cat_df)) {
            var_name <- names(unordered_cat_df)[i]
            Xtilde <- oneHotEncode(unordered_cat_df[,i], metadata$unordered_unique_levels[[var_name]])
            one_hot_mats[[var_name]] <- Xtilde
        }
        Xcat <- do.call(cbind, one_hot_mats)
        X <- cbind(X, unname(Xcat))
    }
    
    return(X)
}

#' Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object
#'
#' @param object List containing information on variables, including train set 
#' categories for categorical variables 
#'
#' @return wrapper around in-memory C++ JSON object
#' @export
#'
#' @examples
#' cov_mat <- matrix(1:12, ncol = 3)
#' preprocess_list <- preprocessTrainData(cov_mat)
#' preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata)
convertPreprocessorToJson <- function(object) {
    jsonobj <- createCppJson()
    if (is.null(object$feature_types)) {
        stop("This covariate preprocessor has not yet been fit")
    }
    
    # Add internal scalars
    jsonobj$add_integer("num_numeric_vars", object$num_numeric_vars)
    jsonobj$add_integer("num_ordered_cat_vars", object$num_ordered_cat_vars)
    jsonobj$add_integer("num_unordered_cat_vars", object$num_unordered_cat_vars)
    
    # Add internal vectors
    jsonobj$add_vector("feature_types", object$feature_types)
    jsonobj$add_vector("original_var_indices", object$original_var_indices)
    if (object$num_numeric_vars > 0) {
        jsonobj$add_string_vector("numeric_vars", object$numeric_vars)
    }
    if (object$num_ordered_cat_vars > 0) {
        jsonobj$add_string_vector("ordered_cat_vars", object$ordered_cat_vars)
        for (i in 1:object$num_ordered_cat_vars) {
            var_key <- names(object$ordered_unique_levels)[i]
            jsonobj$add_string(paste0("key_", i), var_key, "ordered_unique_level_keys")
            jsonobj$add_string_vector(var_key, object$ordered_unique_levels[[i]], "ordered_unique_levels")
        }
    }
    if (object$num_unordered_cat_vars > 0) {
        jsonobj$add_string_vector("unordered_cat_vars", object$unordered_cat_vars)
        for (i in 1:object$num_unordered_cat_vars) {
            var_key <- names(object$unordered_unique_levels)[i]
            jsonobj$add_string(paste0("key_", i), var_key, "unordered_unique_level_keys")
            jsonobj$add_string_vector(var_key, object$unordered_unique_levels[[i]], "unordered_unique_levels")
        }
    }
    
    return(jsonobj)
}

#' Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string
#'
#' @param object List containing information on variables, including train set 
#' categories for categorical variables  
#'
#' @return in-memory JSON string
#' @export
#'
#' @examples
#' cov_mat <- matrix(1:12, ncol = 3)
#' preprocess_list <- preprocessTrainData(cov_mat)
#' preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata)
savePreprocessorToJsonString <- function(object){
    # Convert to Json
    jsonobj <- convertPreprocessorToJson(object)
    
    # Dump to string
    return(jsonobj$return_json_string())
}

#' Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
#'
#' @param json_object in-memory wrapper around JSON C++ object containing covariate preprocessor metadata
#'
#' @returns Preprocessor object that can be used with the `preprocessPredictionData` function
#' @export
#'
#' @examples
#' cov_mat <- matrix(1:12, ncol = 3)
#' preprocess_list <- preprocessTrainData(cov_mat)
#' preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata)
#' preprocessor_roundtrip <- createPreprocessorFromJson(preprocessor_json)
createPreprocessorFromJson <- function(json_object){
    # Initialize the metadata list
    metadata <- list()
    
    # Unpack internal scalars
    metadata[["num_numeric_vars"]] <- json_object$get_integer("num_numeric_vars")
    metadata[["num_ordered_cat_vars"]] <- json_object$get_integer("num_ordered_cat_vars")
    metadata[["num_unordered_cat_vars"]] <- json_object$get_integer("num_unordered_cat_vars")
    
    # Unpack internal vectors
    metadata[["feature_types"]] <- json_object$get_vector("feature_types")
    metadata[["original_var_indices"]] <- json_object$get_vector("original_var_indices")
    if (metadata$num_numeric_vars > 0) {
        metadata[["numeric_vars"]] <- json_object$get_string_vector("numeric_vars")
    }
    if (metadata$num_ordered_cat_vars > 0) {
        metadata[["ordered_cat_vars"]] <- json_object$get_string_vector("ordered_cat_vars")
        ordered_unique_levels <- list()
        for (i in 1:metadata$num_ordered_cat_vars) {
            var_key <- json_object$get_string(paste0("key_", i), "ordered_unique_level_keys")
            ordered_unique_levels[[var_key]] <- json_object$get_string_vector(var_key, "ordered_unique_levels")
        }
        metadata[["ordered_unique_levels"]] <- ordered_unique_levels
    }
    if (metadata$num_unordered_cat_vars > 0) {
        metadata[["unordered_cat_vars"]] <- json_object$get_string_vector("unordered_cat_vars")
        unordered_unique_levels <- list()
        for (i in 1:metadata$num_unordered_cat_vars) {
            var_key <- json_object$get_string(paste0("key_", i), "unordered_unique_level_keys")
            unordered_unique_levels[[var_key]] <- json_object$get_string_vector(var_key, "unordered_unique_levels")
        }
        metadata[["unordered_unique_levels"]] <- unordered_unique_levels
    }
    
    return(metadata)
}

#' Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
#'
#' @param json_string in-memory JSON string containing covariate preprocessor metadata
#'
#' @return Preprocessor object that can be used with the `preprocessPredictionData` function
#' @export
#'
#' @examples
#' cov_mat <- matrix(1:12, ncol = 3)
#' preprocess_list <- preprocessTrainData(cov_mat)
#' preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata)
#' preprocessor_roundtrip <- createPreprocessorFromJsonString(preprocessor_json_string)
createPreprocessorFromJsonString <- function(json_string){
    # Load a `CppJson` object from string
    preprocessor_json <- createCppJsonString(json_string)
    
    # Create and return the BCF object
    preprocessor_object <- createPreprocessorFromJson(preprocessor_json)
    
    return(preprocessor_object)
}

#' Preprocess a dataframe of covariate values, converting categorical variables 
#' to integers and one-hot encoding if need be. Returns a list including a 
#' matrix of preprocessed covariate values and associated tracking.
#'
#' @param input_data Dataframe or matrix of covariates. Users may pre-process any 
#' categorical variables as factors but it is not necessary.
#' @param ordered_cat_vars (Optional) Vector of names of ordered categorical variables, or vector of column indices if `input_data` is a matrix.
#' @param unordered_cat_vars (Optional) Vector of names of unordered categorical variables, or vector of column indices if `input_data` is a matrix.
#'
#' @return List with preprocessed data and details on the number of each type 
#' of variable, unique categories associated with categorical variables, and the 
#' vector of feature types needed for calls to BART and BCF.
#' @noRd
#'
#' @examples
#' cov_df <- data.frame(x1 = 1:5, x2 = 5:1, x3 = 6:10)
#' preprocess_list <- createForestCovariates(cov_df)
#' X <- preprocess_list$X
createForestCovariates <- function(input_data, ordered_cat_vars = NULL, unordered_cat_vars = NULL) {
    if (is.matrix(input_data)) {
        input_df <- as.data.frame(input_data)
        names(input_df) <- paste0("x", 1:ncol(input_data))
        if (!is.null(ordered_cat_vars)) {
            if (is.numeric(ordered_cat_vars)) ordered_cat_vars <- paste0("x", as.integer(ordered_cat_vars))
        }
        if (!is.null(unordered_cat_vars)) {
            if (is.numeric(unordered_cat_vars)) unordered_cat_vars <- paste0("x", as.integer(unordered_cat_vars))
        }
    } else if (is.data.frame(input_data)) {
        input_df <- input_data
    } else {
        stop("input_data must be either a matrix or a data frame")
    }
    df_vars <- names(input_df)
    if (is.null(ordered_cat_vars)) ordered_cat_matches <- rep(FALSE, length(df_vars))
    else ordered_cat_matches <- df_vars %in% ordered_cat_vars
    if (is.null(unordered_cat_vars)) unordered_cat_matches <- rep(FALSE, length(df_vars))
    else unordered_cat_matches <- df_vars %in% unordered_cat_vars
    numeric_matches <- ((!ordered_cat_matches) & (!unordered_cat_matches))
    ordered_cat_vars <- df_vars[ordered_cat_matches]
    unordered_cat_vars <- df_vars[unordered_cat_matches]
    numeric_vars <- df_vars[numeric_matches]
    num_ordered_cat_vars <- length(ordered_cat_vars)
    num_unordered_cat_vars <- length(unordered_cat_vars)
    num_numeric_vars <- length(numeric_vars)
    if (num_ordered_cat_vars > 0) ordered_cat_df <- input_df[,ordered_cat_vars,drop=FALSE]
    if (num_unordered_cat_vars > 0) unordered_cat_df <- input_df[,unordered_cat_vars,drop=FALSE]
    if (num_numeric_vars > 0) numeric_df <- input_df[,numeric_vars,drop=FALSE]
    
    # Empty outputs
    X <- double(0)
    unordered_unique_levels <- list()
    ordered_unique_levels <- list()
    feature_types <- integer(0)
    
    # First, extract the numeric covariates
    if (num_numeric_vars > 0) {
        Xnum <- double(0)
        for (i in 1:ncol(numeric_df)) {
            stopifnot(is.numeric(numeric_df[,i]))
            Xnum <- cbind(Xnum, numeric_df[,i])
        }
        X <- cbind(X, unname(Xnum))
        feature_types <- c(feature_types, rep(0, ncol(Xnum)))
    }
    
    # Next, run some simple preprocessing on the ordered categorical covariates
    if (num_ordered_cat_vars > 0) {
        Xordcat <- double(0)
        for (i in 1:ncol(ordered_cat_df)) {
            var_name <- names(ordered_cat_df)[i]
            preprocess_list <- orderedCatInitializeAndPreprocess(ordered_cat_df[,i])
            ordered_unique_levels[[var_name]] <- preprocess_list$unique_levels
            Xordcat <- cbind(Xordcat, preprocess_list$x_preprocessed)
        }
        X <- cbind(X, unname(Xordcat))
        feature_types <- c(feature_types, rep(1, ncol(Xordcat)))
    }
    
    # Finally, one-hot encode the unordered categorical covariates
    if (num_unordered_cat_vars > 0) {
        one_hot_mats <- list()
        for (i in 1:ncol(unordered_cat_df)) {
            var_name <- names(unordered_cat_df)[i]
            encode_list <- oneHotInitializeAndEncode(unordered_cat_df[,i])
            unordered_unique_levels[[var_name]] <- encode_list$unique_levels
            one_hot_mats[[var_name]] <- encode_list$Xtilde
        }
        Xcat <- do.call(cbind, one_hot_mats)
        X <- cbind(X, unname(Xcat))
        feature_types <- c(feature_types, rep(1, ncol(Xcat)))
    }
    
    # Aggregate results into a list
    metadata <- list(
        feature_types = feature_types, 
        num_ordered_cat_vars = num_ordered_cat_vars, 
        num_unordered_cat_vars = num_unordered_cat_vars, 
        num_numeric_vars = num_numeric_vars
    )
    if (num_ordered_cat_vars > 0) {
        metadata[["ordered_cat_vars"]] = ordered_cat_vars
        metadata[["ordered_unique_levels"]] = ordered_unique_levels
    }
    if (num_unordered_cat_vars > 0) {
        metadata[["unordered_cat_vars"]] = unordered_cat_vars
        metadata[["unordered_unique_levels"]] = unordered_unique_levels
    }
    if (num_numeric_vars > 0) metadata[["numeric_vars"]] = numeric_vars
    output <- list(
        data = X, 
        metadata = metadata
    )
    
    return(output)
}

#' Preprocess a dataframe of covariate values, converting categorical variables 
#' to integers and one-hot encoding if need be. Returns a list including a 
#' matrix of preprocessed covariate values and associated tracking.
#'
#' @param input_data Dataframe or matrix of covariates. Users may pre-process any 
#' categorical variables as factors but it is not necessary.
#' @param metadata List containing information on variables, including train set 
#' categories for categorical variables
#'
#' @return Preprocessed data with categorical variables appropriately preprocessed
#' @noRd
#'
#' @examples
#' cov_df <- data.frame(x1 = 1:5, x2 = 5:1, x3 = 6:10)
#' metadata <- list(num_ordered_cat_vars = 0, num_unordered_cat_vars = 0, 
#'                  num_numeric_vars = 3, numeric_vars = c("x1", "x2", "x3"))
#' X_preprocessed <- createForestCovariatesFromMetadata(cov_df, metadata)
createForestCovariatesFromMetadata <- function(input_data, metadata) {
    if (is.matrix(input_data)) {
        input_df <- as.data.frame(input_data)
        names(input_df) <- paste0("x", 1:ncol(input_data))
    } else if (is.data.frame(input_data)) {
        input_df <- input_data
    } else {
        stop("input_data must be either a matrix or a data frame")
    }
    df_vars <- names(input_df)
    num_ordered_cat_vars <- metadata$num_ordered_cat_vars
    num_unordered_cat_vars <- metadata$num_unordered_cat_vars
    num_numeric_vars <- metadata$num_numeric_vars
    
    if (num_ordered_cat_vars > 0) {
        ordered_cat_vars <- metadata$ordered_cat_vars
        ordered_cat_df <- input_df[,ordered_cat_vars,drop=FALSE]
    }
    if (num_unordered_cat_vars > 0) {
        unordered_cat_vars <- metadata$unordered_cat_vars
        unordered_cat_df <- input_df[,unordered_cat_vars,drop=FALSE]
    }
    if (num_numeric_vars > 0) {
        numeric_vars <- metadata$numeric_vars
        numeric_df <- input_df[,numeric_vars,drop=FALSE]
    }
        
    # Empty outputs
    X <- double(0)

    # First, extract the numeric covariates
    if (num_numeric_vars > 0) {
        Xnum <- double(0)
        for (i in 1:ncol(numeric_df)) {
            stopifnot(is.numeric(numeric_df[,i]))
            Xnum <- cbind(Xnum, numeric_df[,i])
        }
        X <- cbind(X, unname(Xnum))
    }
    
    # Next, run some simple preprocessing on the ordered categorical covariates
    if (num_ordered_cat_vars > 0) {
        Xordcat <- double(0)
        for (i in 1:ncol(ordered_cat_df)) {
            var_name <- names(ordered_cat_df)[i]
            x_preprocessed <- orderedCatPreprocess(ordered_cat_df[,i], metadata$ordered_unique_levels[[var_name]])
            Xordcat <- cbind(Xordcat, x_preprocessed)
        }
        X <- cbind(X, unname(Xordcat))
    }
    
    # Finally, one-hot encode the unordered categorical covariates
    if (num_unordered_cat_vars > 0) {
        one_hot_mats <- list()
        for (i in 1:ncol(unordered_cat_df)) {
            var_name <- names(unordered_cat_df)[i]
            Xtilde <- oneHotEncode(unordered_cat_df[,i], metadata$unordered_unique_levels[[var_name]])
            one_hot_mats[[var_name]] <- Xtilde
        }
        Xcat <- do.call(cbind, one_hot_mats)
        X <- cbind(X, unname(Xcat))
    }
    
    return(X)
}

#' Convert a vector of unordered categorical data (either numeric or character 
#' labels) to a "one-hot" encoded matrix in which a 1 in a column indicates 
#' the presence of the relevant category. 
#' 
#' To allow for prediction on "unseen" categories in a test dataset, this 
#' procedure pads the one-hot matrix with a blank "other" column. 
#' Test set observations that contain categories not in `levels(factor(x_input))`
#' will all be mapped to this column.
#'
#' @param x_input Vector of unordered categorical data (typically either strings 
#' integers, but this function also accepts floating point data).
#'
#' @return List containing a binary one-hot matrix and the unique levels of the 
#' input variable. These unique levels are used in the BCF and BART functions.
#' @noRd
#'
#' @examples
#' x <- c("a","c","b","c","d","a","c","a","b","d")
#' x_onehot <- oneHotInitializeAndEncode(x)
oneHotInitializeAndEncode <- function(x_input) {
    stopifnot((is.null(dim(x_input)) && length(x_input) > 0))
    if (is.factor(x_input) && is.ordered(x_input)) warning("One-hot encoding an ordered categorical variable")
    x_factor <- factor(x_input)
    unique_levels <- levels(x_factor)
    Xtilde <- cbind(unname(model.matrix(~0+x_factor)), 0)
    output <- list(Xtilde = Xtilde, unique_levels = unique_levels)
    return(output)
}

#' Convert a vector of unordered categorical data (either numeric or character 
#' labels) to a "one-hot" encoded matrix in which a 1 in a column indicates 
#' the presence of the relevant category. 
#' 
#' This procedure assumes that a reference set of observations for this variable 
#' (typically a training set that was used to sample a forest) has already been
#' one-hot encoded and that the unique levels of the training set variable are 
#' available (and passed as `unique_levels`). Test set observations that contain 
#' categories not in `unique_levels` will all be mapped to the last column of 
#' this matrix
#'
#' @param x_input Vector of unordered categorical data (typically either strings 
#' integers, but this function also accepts floating point data).
#' @param unique_levels Unique values of the categorical variable used to create 
#' the initial one-hot matrix (typically a training set)
#'
#' @return Binary one-hot matrix
#' @noRd
#'
#' @examples
#' x <- sample(1:8, 100, TRUE)
#' x_test <- sample(1:9, 10, TRUE)
#' x_onehot <- oneHotEncode(x_test, levels(factor(x)))
oneHotEncode <- function(x_input, unique_levels) {
    stopifnot((is.null(dim(x_input)) && length(x_input) > 0))
    stopifnot((is.null(dim(unique_levels)) && length(unique_levels) > 0))
    num_unique_levels <- length(unique_levels)
    in_sample <- x_input %in% unique_levels
    out_of_sample <- !(x_input %in% unique_levels)
    has_out_of_sample <- sum(out_of_sample) > 0
    if (has_out_of_sample) {
        x_factor_insample <- factor(x_input[in_sample], levels = unique_levels)
        Xtilde <- matrix(0, nrow = length(x_input), ncol = num_unique_levels + 1)
        Xtilde_insample <- cbind(unname(model.matrix(~0+x_factor_insample)), 0)
        Xtilde_out_of_sample <- cbind(matrix(0, nrow=sum(out_of_sample), ncol=num_unique_levels), 1)
        Xtilde[in_sample,] <- Xtilde_insample
        Xtilde[out_of_sample,] <- Xtilde_out_of_sample
    } else {
        x_factor <- factor(x_input, levels = unique_levels)
        Xtilde <- cbind(unname(model.matrix(~0+x_factor)), 0)
    }
    return(Xtilde)
}

#' Run some simple preprocessing of ordered categorical variables, converting 
#' ordered levels to integers if necessary, and storing the unique levels of a 
#' variable.
#'
#' @param x_input Vector of ordered categorical data. If the data is not already 
#' stored as an ordered factor, it will be converted to one using the default 
#' sort order.
#'
#' @return List containing a preprocessed vector of integer-converted ordered 
#' categorical observations and the unique level of the original ordered 
#' categorical feature.
#' @noRd
#'
#' @examples
#' x <- c("1. Strongly disagree", "3. Neither agree nor disagree", "2. Disagree", 
#'        "4. Agree", "3. Neither agree nor disagree", "5. Strongly agree", "4. Agree")
#' preprocess_list <- orderedCatInitializeAndPreprocess(x)
#' x_preprocessed <- preprocess_list$x_preprocessed
orderedCatInitializeAndPreprocess <- function(x_input) {
    stopifnot((is.null(dim(x_input)) && length(x_input) > 0))
    already_ordered_factor <- (is.factor(x_input)) && (is.ordered(x_input))
    if (already_ordered_factor) {
        x_preprocessed <- as.integer(x_input)
        unique_levels <- levels(x_input)
    } else {
        x_factor <- factor(x_input, ordered = TRUE)
        x_preprocessed <- as.integer(x_factor)
        unique_levels <- levels(x_factor)
    }
    return(list(x_preprocessed = x_preprocessed, unique_levels = unique_levels))
}

#' Run some simple preprocessing of ordered categorical variables, converting 
#' ordered levels to integers if necessary, and storing the unique levels of a 
#' variable.
#'
#' @param x_input Vector of ordered categorical data. If the data is not already 
#' stored as an ordered factor, it will be converted to one using the default 
#' sort order.
#' @param unique_levels Vector of unique levels for a categorical feature.
#' @param var_name (Optional) Name of variable.
#'
#' @return List containing a preprocessed vector of integer-converted ordered 
#' categorical observations and the unique level of the original ordered 
#' categorical feature.
#' @noRd
#'
#' @examples
#' x_levels <- c("1. Strongly disagree", "2. Disagree", 
#'               "3. Neither agree nor disagree", 
#'               "4. Agree", "5. Strongly agree")
#' x <- c("1. Strongly disagree", "3. Neither agree nor disagree", "2. Disagree", 
#'        "4. Agree", "3. Neither agree nor disagree", "5. Strongly agree", "4. Agree")
#' x_processed <- orderedCatPreprocess(x, x_levels)
orderedCatPreprocess <- function(x_input, unique_levels, var_name = NULL) {
    stopifnot((is.null(dim(x_input)) && length(x_input) > 0))
    stopifnot((is.null(dim(unique_levels)) && length(unique_levels) > 0))
    already_ordered_factor <- (is.factor(x_input)) && (is.ordered(x_input))
    if (already_ordered_factor) {
        # Run time checks
        levels_not_in_reflist <- !(levels(x_input) %in% unique_levels)
        if (sum(levels_not_in_reflist) > 0) {
            if (!is.null(var_name)) warning_message <- paste0("Variable ", var_name, " includes ordered categorical levels not included in the original training set")
            else warning_message <- paste0("Variable includes ordered categorical levels not included in the original training set")
            warning(warning_message)
        }
        # Preprocessing
        x_string <- as.character(x_input)
        x_factor <- factor(x_string, unique_levels, ordered = TRUE)
        x_preprocessed <- as.integer(x_factor)
        x_preprocessed[is.na(x_preprocessed)] <- length(unique_levels) + 1
    } else {
        x_factor <- factor(x_input, ordered = TRUE)
        # Run time checks
        levels_not_in_reflist <- !(levels(x_factor) %in% unique_levels)
        if (sum(levels_not_in_reflist) > 0) {
            if (!is.null(var_name)) warning_message <- paste0("Variable ", var_name, " includes ordered categorical levels not included in the original training set")
            else warning_message <- paste0("Variable includes ordered categorical levels not included in the original training set")
            warning(warning_message)
        }
        # Preprocessing
        x_string <- as.character(x_input)
        x_factor <- factor(x_string, unique_levels, ordered = TRUE)
        x_preprocessed <- as.integer(x_factor)
        x_preprocessed[is.na(x_preprocessed)] <- length(unique_levels) + 1
    }
    return(x_preprocessed)
}
Any scripts or data that you put into this service are public.
stochtree documentation built on April 4, 2025, 2:11 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
stochtree
Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference

R/utils.R
In stochtree: Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference

Documented in convertPreprocessorToJson createPreprocessorFromJson createPreprocessorFromJsonString preprocessPredictionData preprocessTrainData savePreprocessorToJsonString

Try the stochtree package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

stochtree Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference

R/utils.R In stochtree: Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference

Documented in convertPreprocessorToJson createPreprocessorFromJson createPreprocessorFromJsonString preprocessPredictionData preprocessTrainData savePreprocessorToJsonString

Try the stochtree package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

stochtree
Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference

R/utils.R
In stochtree: Stochastic Tree Ensembles (XBART and BART) for Supervised Learning and Causal Inference