# --header ---------------------------------------------------------------------
# globally import the ability to create classes methods etc
#' @import methods
NULL
# This is just here, because methods will be used in many places for creating
# classes, methods etc etc.
# -- Methods for Preprocessing Data --------------------------------------------
#' @title preprocess_training
#' @description Perform preprocessing for the training data, including
#' converting data to dataframe, and encoding categorical data into numerical
#' representation.
#' @inheritParams forestry
#' @import plyr
#' @return A list of two datasets along with necessary information that encoding
#' the preprocessing.
#' @noRd
preprocess_training <- function(x, y) {
x <- as.data.frame(x)
# Check if the input dimension of x matches y
if (nrow(x) != length(y)) {
stop("The dimension of input dataset x doesn't match the output vector y.")
}
# Track the order of all features
featureNames <- colnames(x)
if (is.null(featureNames)) {
warning("No names are given for each column.")
}
# Track all categorical features (both factors and characters)
featureFactorCols <- which(sapply(x, is.factor) == TRUE)
featureCharacterCols <- which(sapply(x, is.character) == TRUE)
if (length(featureCharacterCols) != 0) {
stop("Character value features must be cast to factors.")
}
categoricalFeatureCols <-
c(featureFactorCols, featureCharacterCols)
if (length(categoricalFeatureCols) == 0) {
categoricalFeatureCols <- list()
} else {
categoricalFeatureCols <- list(categoricalFeatureCols)
}
# For each categorical feature, encode x into numeric representation and
# save the encoding mapping
categoricalFeatureMapping <- list()
dummyIndex <- 1
for (categoricalFeatureCol in unlist(categoricalFeatureCols)) {
uniqueFeatureValues <- unique(x[, categoricalFeatureCol])
numericFeatureValues <- 1:length(uniqueFeatureValues)
x[, categoricalFeatureCol] <-
plyr::mapvalues(x = x[, categoricalFeatureCol],
from = uniqueFeatureValues,
to = numericFeatureValues)
categoricalFeatureMapping[[dummyIndex]] <- list(
"categoricalFeatureCol" = categoricalFeatureCol,
"uniqueFeatureValues" = uniqueFeatureValues,
"numericFeatureValues" = numericFeatureValues
)
dummyIndex <- dummyIndex + 1
}
# Return transformed data and encoding information
return(
list(
"x" = x,
"categoricalFeatureCols" = categoricalFeatureCols,
"categoricalFeatureMapping" = categoricalFeatureMapping
)
)
}
#' @title preprocess_testing
#' @description Perform preprocessing for the testing data, including converting
#' data to dataframe, and testing if the columns are consistent with the
#' training data and encoding categorical data into numerical representation
#' in the same way as training data.
#' @inheritParams forestry
#' @param featureNames A vector of column names in training data.
#' @param categoricalFeatureCols A list of index for all categorical data. Used
#' for trees to detect categorical columns.
#' @param categoricalFeatureMapping A list of encoding details for each
#' categorical column, including all unique factor values and their
#' corresponding numeric representation.
#' @import plyr
#' @return A preprocessed training dataaset x
#' @noRd
preprocess_testing <- function(x,
featureNames,
categoricalFeatureCols,
categoricalFeatureMapping) {
x <- as.data.frame(x)
# Track the order of all features
testingFeatureNames <- colnames(x)
if (is.null(testingFeatureNames)) {
warning("No names are given for each column.")
}
if (!(identical((featureNames), testingFeatureNames))) {
stop("Training data and testing data column names must be the same.")
}
# Track all categorical features (both factors and characters)
featureFactorCols <- which(sapply(x, is.factor) == TRUE)
featureCharacterCols <- which(sapply(x, is.character) == TRUE)
testingCategoricalFeatureCols <-
c(featureFactorCols, featureCharacterCols)
if (length(testingCategoricalFeatureCols) == 0) {
testingCategoricalFeatureCols <- list()
} else {
testingCategoricalFeatureCols <- list(testingCategoricalFeatureCols)
}
if (length(setdiff(categoricalFeatureCols,
testingCategoricalFeatureCols)) != 0) {
stop("Categorical columns are different between testing and training data.")
}
# For each categorical feature, encode x into numeric representation
for (categoricalFeatureMapping_ in categoricalFeatureMapping) {
categoricalFeatureCol <-
categoricalFeatureMapping_$categoricalFeatureCol
# Get all unique feature values
testingUniqueFeatureValues <- unique(x[, categoricalFeatureCol])
uniqueFeatureValues <-
categoricalFeatureMapping_$uniqueFeatureValues
numericFeatureValues <-
categoricalFeatureMapping_$numericFeatureValues
# If testing dataset contains more, adding new factors to the mapping list
diffUniqueFeatureValues <- setdiff(testingUniqueFeatureValues,
uniqueFeatureValues)
if (length(diffUniqueFeatureValues) != 0) {
uniqueFeatureValues <-
c(uniqueFeatureValues, diffUniqueFeatureValues)
numericFeatureValues <- 1:length(uniqueFeatureValues)
}
x[, categoricalFeatureCol] <-
plyr::mapvalues(x = x[, categoricalFeatureCol],
from = uniqueFeatureValues,
to = numericFeatureValues)
}
# Return transformed data and encoding information
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.