R/R_preprocessing.R
In Rforestry: Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability

Documented in preprocess_testing preprocess_training scale_center unscale_uncenter

# --header ---------------------------------------------------------------------
# globally import the ability to create classes methods etc
#' @import methods
NULL
# This is just here, because methods will be used in many places for creating
# classes, methods etc etc.


# -- Methods for Preprocessing Data --------------------------------------------
#' @title preprocess_training
#' @description Perform preprocessing for the training data, including
#'   converting data to dataframe, and encoding categorical data into numerical
#'   representation.
#' @inheritParams forestry
#' @return A list of two datasets along with necessary information that encodes
#'   the preprocessing.
preprocess_training <- function(x, y) {
  x <- as.data.frame(x)
  # Check if the input dimension of x matches y
  if (nrow(x) != length(y)) {
    stop("The dimension of input dataset x doesn't match the output vector y.")
  }

  # Track the order of all features
  featureNames <- colnames(x)
  if (is.null(featureNames)) {
    warning("No names are given for each column.")
  }

  # Track all categorical features (both factors and characters)
  featureFactorCols <- which(sapply(x, is.factor) == TRUE)
  featureCharacterCols <- which(sapply(x, is.character) == TRUE)
  if (length(featureCharacterCols) != 0) {
    stop("Character value features must be cast to factors.")
  }
  categoricalFeatureCols <-
    c(featureFactorCols, featureCharacterCols)
  if (length(categoricalFeatureCols) == 0) {
    categoricalFeatureCols <- list()
  } else {
    categoricalFeatureCols <- list(categoricalFeatureCols)
  }

  # For each categorical feature, encode x into numeric representation and
  # save the encoding mapping
  categoricalFeatureMapping <- list()
  dummyIndex <- 1
  for (categoricalFeatureCol in unlist(categoricalFeatureCols)) {
    # drop unused levels
    x[, categoricalFeatureCol] <- factor(x[, categoricalFeatureCol])
    categoricalFeatureMapping[[dummyIndex]] <- list(
      "categoricalFeatureCol" = categoricalFeatureCol,
      "uniqueFeatureValues" = levels(x[, categoricalFeatureCol]),
      "numericFeatureValues" = 1:length(levels(x[, categoricalFeatureCol]))
    )
    x[, categoricalFeatureCol] <- as.integer(x[, categoricalFeatureCol])
    dummyIndex <- dummyIndex + 1
  }

  # Return transformed data and encoding information
  return(
    list(
      "x" = x,
      "categoricalFeatureCols" = categoricalFeatureCols,
      "categoricalFeatureMapping" = categoricalFeatureMapping
    )
  )
}

#' @title preprocess_testing
#' @description Perform preprocessing for the testing data, including converting
#'   data to dataframe, and testing if the columns are consistent with the
#'   training data and encoding categorical data into numerical representation
#'   in the same way as training data.
#' @inheritParams forestry
#' @param categoricalFeatureCols A list of index for all categorical data. Used
#'   for trees to detect categorical columns.
#' @param categoricalFeatureMapping A list of encoding details for each
#'   categorical column, including all unique factor values and their
#'   corresponding numeric representation.
#' @return A preprocessed training dataaset x
preprocess_testing <- function(x,
                               categoricalFeatureCols,
                               categoricalFeatureMapping) {
  x <- as.data.frame(x)

  # Track the order of all features
  testingFeatureNames <- colnames(x)
  if (is.null(testingFeatureNames)) {
    warning("No names are given for each column.")
  }

  # Track all categorical features (both factors and characters)
  featureFactorCols <- which(sapply(x, is.factor) == TRUE)
  featureCharacterCols <- which(sapply(x, is.character) == TRUE)
  testingCategoricalFeatureCols <-
    c(featureFactorCols, featureCharacterCols)
  if (length(testingCategoricalFeatureCols) == 0) {
    testingCategoricalFeatureCols <- list()
  } else {
    testingCategoricalFeatureCols <- list(testingCategoricalFeatureCols)
  }

  if (length(setdiff(categoricalFeatureCols,
                     testingCategoricalFeatureCols)) != 0) {
    stop("Categorical columns are different between testing and training data.")
  }

  # For each categorical feature, encode x into numeric representation
  for (categoricalFeatureMapping_ in categoricalFeatureMapping) {
    categoricalFeatureCol <-
      categoricalFeatureMapping_$categoricalFeatureCol
    # Get all unique feature values
    testingUniqueFeatureValues <- unique(x[, categoricalFeatureCol])
    uniqueFeatureValues <-
      categoricalFeatureMapping_$uniqueFeatureValues
    numericFeatureValues <-
      categoricalFeatureMapping_$numericFeatureValues

    # If testing dataset contains more, adding new factors to the mapping list
    diffUniqueFeatureValues <- setdiff(testingUniqueFeatureValues,
                                       uniqueFeatureValues)
    if (length(diffUniqueFeatureValues) != 0) {
      uniqueFeatureValues <-
        c(uniqueFeatureValues, diffUniqueFeatureValues)
      numericFeatureValues <- 1:length(uniqueFeatureValues)
    }

    x[, categoricalFeatureCol] <- match(
      as.character(x[, categoricalFeatureCol]),
      uniqueFeatureValues
    )
  }

  # Return transformed data and encoding information
  return(x)
}


#' @title scale_center
#' @description Given a dataframe, scale and center the continous features
#' @param x A dataframe in order to be processed.
#' @param categoricalFeatureCols A vector of the categorical features, we
#'   don't want to scale/center these. Should be 1-indexed.
#' @param colMeans A vector of the means to center each column.
#' @param colSd A vector of the standard deviations to scale each column with.
#' @return A scaled and centered  dataset x
scale_center <- function(
  x,
  categoricalFeatureCols,
  colMeans,
  colSd
) {
  # Center and scale continous features and outcome
  for (col_idx in 1:ncol(x)) {
    if (col_idx %in% categoricalFeatureCols) {
      next
    } else {
      if (colSd[col_idx] != 0) {
        x[,col_idx] <- ( x[,col_idx] - colMeans[col_idx]) / colSd[col_idx]
      } else {
        x[,col_idx] <- ( x[,col_idx] - colMeans[col_idx])
      }
    }
  }
  return(x)
}

#' @title unscale_uncenter
#' @description Given a dataframe, un scale and un center the continous features
#' @param x A dataframe in order to be processed.
#' @param categoricalFeatureCols A vector of the categorical features, we
#'   don't want to scale/center these. Should be 1-indexed.
#' @param colMeans A vector of the means to add to each column.
#' @param colSd A vector of the standard deviations to rescale each column with.
#' @return A dataset x in it's original scaling
unscale_uncenter <- function(
  x,
  categoricalFeatureCols,
  colMeans,
  colSd
) {
  # Center and scale continous features and outcome
  for (col_idx in 1:ncol(x)) {
    if (col_idx %in% categoricalFeatureCols) {
      next
    } else {
      if (colSd[col_idx] != 0) {
        x[,col_idx] <- ( x[,col_idx]* colSd[col_idx] + colMeans[col_idx])
      } else {
        x[,col_idx] <- ( x[,col_idx] + colMeans[col_idx])
      }
    }
  }
  return(x)
}

Any scripts or data that you put into this service are public.

Rforestry documentation built on March 31, 2023, 11:33 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Rforestry
Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability

R/R_preprocessing.R
In Rforestry: Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability

Defines functions unscale_uncenter scale_center preprocess_testing preprocess_training

Documented in preprocess_testing preprocess_training scale_center unscale_uncenter

Try the Rforestry package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Rforestry Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability

R/R_preprocessing.R In Rforestry: Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability

Defines functions unscale_uncenter scale_center preprocess_testing preprocess_training

Documented in preprocess_testing preprocess_training scale_center unscale_uncenter

Try the Rforestry package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Rforestry
Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability

R/R_preprocessing.R
In Rforestry: Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability