#' @include 03PreprocessorClass.R
NULL
## REPORT THE OUTCOME OF DATA VALIDATION
reportexitstatus <- function(preprocesseddatasets){
if(class(preprocesseddatasets)!="list"){stop("Argument 'preprocesseddatasets' to function 'reportexitstatus' must of a list.")}
variance <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "variance")))
finite <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "finite")))
completeobs <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "completeobs")))
classbalance <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "classbalance")))
ntopratiotwoplus <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "ntopratiotwoplus")))
mindimensions <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "mindimensions")))
check <- all(c(variance, finite, completeobs, classbalance, ntopratiotwoplus, mindimensions))
return(check)
}
## CREATE GRID OF PREPROCESSING COMBINATIONS FROM FROM PHASES
creategrid <- function(phases){
if(class(phases)!="list"){stop("Argument 'phases' to function 'creategrid' must of a list.")}
grid <- expand.grid(lapply(phases, function(x) eval(as.name(x))@preprotransformations))
colnames(grid) <- unlist(phases)
return(grid)
}
## PREPROCESS DATA FOR THE WHOLE GRID
executepreprocessing <- function(grid, dataobject){
if(class(grid)!="data.frame"){stop("Argument 'grid' must of a data frame of a GridClass object.")}
if(class(dataobject)!="DataClass"){stop("Argument 'dataobject' must of a DataClass object.")}
out_preprocesseddatasets <- vector(mode="list", nrow(grid))
firstcolumningrid <- 1
for (rowingrid in 1:nrow(grid))
{
# PREPROCESS INPUT DATA BY THE FIRST COLUMN OF THE GRID
out_preprocesseddatasets[[rowingrid]] <- initializedataslot(as.character(grid[rowingrid, firstcolumningrid]), dataobject) # first column of grid
if (ncol(grid) > 1){
# PREPROCESS CONSEQUENT COLUMNS
for (columningrid in 2:ncol(grid))
{
out_preprocesseddatasets[[rowingrid]] <- initializedataslot(as.character(grid[rowingrid,columningrid]), out_preprocesseddatasets[[rowingrid]]@data)
}
}
}
# VALIDATE AND REPORT PREPROCESSED DATA MODEL FITTING STATUS
out_preprocesseddatasets <- lapply(out_preprocesseddatasets, function(x) slot(x, "data"))
out_preprocesseddatasets <- lapply(out_preprocesseddatasets, validatedata)
return(out_preprocesseddatasets)
}
## GRID
#' An S4 class representing (unevaluated) preprocessing combinations
#'
#' Preprocessing techniques defined with setpreprocessor() can be combined to a phase.
#' Phases defined with setphase() can be combined to a grid of combinations with setgrid().
#' The main programmatic use with preprocomb() takes a GridClass object as argument.
#'
#' GridClass is also an interface for extending the system to package 'metaheur', which takes
#' a GridClass object to find near-optiomal combinations fast.
#'
#' @slot grid (data frame) preprocessor combinations
#' @slot data (list) DataClass objects
#' @slot validation (data frame) validation results
#' @export
setClass("GridClass", representation(grid="data.frame", data="list", validation="data.frame"))
#' constructor function for creating preprocessing combinations
#'
#' setgrid takes the preprocessing phases, which contain preprocessors and creates
#' the combinations of them as a grid. It then computes and stores the transformed
#' data sets for each combination. setgrid initializes a GridClass object.
#
#' @param phases (character) vector of phases
#' @param data (data frame)
#' @return a GridClass object
#' @examples
#' grid <- setgrid(phases=c("outliers", "irrelfeatures"), data=iris)
#' @details If there are missing values, imputation phase must be set as first phase.
#' Default phase "sampling" can only be used with data, which has binary class labels.
#' @export
setgrid <- function(phases, data){
# Validate arguments
if(class(phases)!="character"){stop("Argument 'phases' must be a character vector.")}
if(class(data)!="data.frame"){stop("Argument 'data' must of a data frame.")}
phases <- as.list(phases)
if(!all(lapply(phases, function(x) class(eval(as.name(x))))=="PhaseClass")){
stop("All elements in argument 'phases' must point to PhaseClass objects.")}
# Initialize objects
dataclassobject <- initializedataclassobject(data)
issamplingincluded <- "sampling" %in% phases
hasmorethantwolevels <- length(levels(dataclassobject@y)) > 2
if (issamplingincluded==TRUE & hasmorethantwolevels==TRUE) {stop("Default phase 'sampling' can only be used with data, which has binary class labels.")}
gridclassobject <- new("GridClass")
# Create grid
gridclassobject@grid <- creategrid(phases)
# Test preprocessors
# Preprocess data
gridclassobject@data <- executepreprocessing(gridclassobject@grid, dataclassobject)
# Collect validation results on combinations
validationresults <- data.frame(gridclassobject@grid, data.frame(t(data.frame(lapply(gridclassobject@data, extract)))))
row.names(validationresults) <- NULL
gridclassobject@validation <- validationresults
return(gridclassobject)
}
#' get preprocessing combinations from a grid
#' @param gridobject (grid class) object
#' @export
#' @examples
#' head(getcombinations(examplecombgrid))
getcombinations <- function(gridobject){
gridobject@grid
}
#' get preprocessed data set from a grid
#' @param gridobject (grid class object)
#' @param nro (integer) combination number
#' @param type (character) default "dataframe", option "summary"
#' @export
#' @examples
#' str(getpreprocombdf(examplecombgrid, 3))
getpreprocombdf <- function(gridobject,nro, type="dataframe"){
if (type=="dataframe") {
tempdata <- gridobject@data[[nro]]
df <- data.frame(tempdata@x, class=tempdata@y)
}
if (type=="summary") {
tempdata <- gridobject@data[[nro]]
df <- t(data.frame(lapply(tempdata@x, dfsummary)))
}
return(df)
}
dfsummary <- function(x){
c(mean=mean(x, na.rm=TRUE), sd=sd(x, na.rm=TRUE), min=min(x, na.rm=TRUE), max=max(x, na.rm=TRUE))
}
setMethod("show", signature(object = "GridClass"), function(object){
cat("GRIDCLASS OBJECT", "\n")
cat("Number of preprocecessing phases:", ncol(object@grid), "\n")
cat("Number of preprocecessing combinations:", nrow(object@grid), "\n")
cat("NUMBER OF COMBINATIONS HAVING", "\n")
cat("variance (not near zero): ", sum(object@validation$variance.1), "\n")
cat("finite values: ", sum(object@validation$finite), "\n")
cat("complete observations: ", sum(object@validation$completeobs), "\n")
cat("ntop ratio more than two: ", sum(object@validation$ntopratiotwoplus), "\n")
cat("minimum dimensions (20+ observations, 3+ variables): ", sum(object@validation$mindimensions), "\n")
cat("classes not imbalanced: ", sum(object@validation$classbalance), "\n")
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.