R/04GridClass.R

Defines functions reportexitstatus creategrid executepreprocessing setgrid getcombinations getpreprocombdf dfsummary

Documented in getcombinations getpreprocombdf setgrid

#' @include 03PreprocessorClass.R
NULL

## REPORT THE OUTCOME OF DATA VALIDATION

reportexitstatus <- function(preprocesseddatasets){

  if(class(preprocesseddatasets)!="list"){stop("Argument 'preprocesseddatasets' to function 'reportexitstatus' must of a list.")}

  variance <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "variance")))
  finite <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "finite")))
  completeobs <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "completeobs")))
  classbalance <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "classbalance")))
  ntopratiotwoplus <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "ntopratiotwoplus")))
  mindimensions <- unlist(lapply(preprocesseddatasets, function(x) slot(x, "mindimensions")))

  check <- all(c(variance, finite, completeobs, classbalance, ntopratiotwoplus, mindimensions))

  return(check)

}

## CREATE GRID OF PREPROCESSING COMBINATIONS FROM FROM PHASES

creategrid <- function(phases){
  if(class(phases)!="list"){stop("Argument 'phases' to function 'creategrid' must of a list.")}
  grid <- expand.grid(lapply(phases, function(x) eval(as.name(x))@preprotransformations))
  colnames(grid) <- unlist(phases)
  return(grid)
}



## PREPROCESS DATA FOR THE WHOLE GRID

executepreprocessing <- function(grid, dataobject){

  if(class(grid)!="data.frame"){stop("Argument 'grid' must of a data frame of a GridClass object.")}
  if(class(dataobject)!="DataClass"){stop("Argument 'dataobject' must of a DataClass object.")}

  out_preprocesseddatasets <- vector(mode="list", nrow(grid))
  firstcolumningrid <- 1

  for (rowingrid in 1:nrow(grid))
  {

    # PREPROCESS INPUT DATA BY THE FIRST COLUMN OF THE GRID

    out_preprocesseddatasets[[rowingrid]] <- initializedataslot(as.character(grid[rowingrid, firstcolumningrid]), dataobject) # first column of grid

    if (ncol(grid) > 1){

      # PREPROCESS CONSEQUENT COLUMNS

      for (columningrid in 2:ncol(grid))
      {
        out_preprocesseddatasets[[rowingrid]] <- initializedataslot(as.character(grid[rowingrid,columningrid]), out_preprocesseddatasets[[rowingrid]]@data)
      }

    }

  }

  # VALIDATE AND REPORT PREPROCESSED DATA MODEL FITTING STATUS
  out_preprocesseddatasets <- lapply(out_preprocesseddatasets, function(x) slot(x, "data"))
  out_preprocesseddatasets <- lapply(out_preprocesseddatasets, validatedata)
  return(out_preprocesseddatasets)
}


## GRID

#' An S4 class representing (unevaluated) preprocessing combinations
#'
#' Preprocessing techniques defined with setpreprocessor() can be combined to a phase.
#' Phases defined with setphase() can be combined to a grid of combinations with setgrid().
#' The main programmatic use with preprocomb() takes a GridClass object as argument.
#'
#' GridClass is also an interface for extending the system to package 'metaheur', which takes
#' a GridClass object to find near-optiomal combinations fast.
#'
#' @slot grid (data frame) preprocessor combinations
#' @slot data (list) DataClass objects
#' @slot validation (data frame) validation results
#' @export

setClass("GridClass", representation(grid="data.frame", data="list", validation="data.frame"))

#' constructor function for creating preprocessing combinations
#'
#' setgrid takes the preprocessing phases, which contain preprocessors and creates
#' the combinations of them as a grid. It then computes and stores the transformed
#' data sets for each combination. setgrid initializes a GridClass object.
#
#' @param phases (character) vector of phases
#' @param data (data frame)
#' @return a GridClass object
#' @examples
#' grid <- setgrid(phases=c("outliers", "irrelfeatures"), data=iris)
#' @details If there are missing values, imputation phase must be set as first phase.
#' Default phase "sampling" can only be used with data, which has binary class labels.
#' @export

setgrid <- function(phases, data){

# Validate arguments
if(class(phases)!="character"){stop("Argument 'phases' must be a character vector.")}
if(class(data)!="data.frame"){stop("Argument 'data' must of a data frame.")}

phases <- as.list(phases)
if(!all(lapply(phases, function(x) class(eval(as.name(x))))=="PhaseClass")){
stop("All elements in argument 'phases' must point to PhaseClass objects.")}

# Initialize objects

dataclassobject <- initializedataclassobject(data)

issamplingincluded <- "sampling" %in% phases
hasmorethantwolevels <- length(levels(dataclassobject@y)) > 2
if (issamplingincluded==TRUE & hasmorethantwolevels==TRUE) {stop("Default phase 'sampling' can only be used with data, which has binary class labels.")}

gridclassobject <- new("GridClass")

# Create grid

gridclassobject@grid <- creategrid(phases)

# Test preprocessors

# Preprocess data

gridclassobject@data <- executepreprocessing(gridclassobject@grid, dataclassobject)

# Collect validation results on combinations
validationresults <- data.frame(gridclassobject@grid, data.frame(t(data.frame(lapply(gridclassobject@data, extract)))))
row.names(validationresults) <- NULL
gridclassobject@validation <- validationresults

return(gridclassobject)
}


#' get preprocessing combinations from a grid
#' @param gridobject (grid class) object
#' @export
#' @examples
#' head(getcombinations(examplecombgrid))

getcombinations <- function(gridobject){
  gridobject@grid
}

#' get preprocessed data set from a grid
#' @param gridobject (grid class object)
#' @param nro (integer) combination number
#' @param type (character) default "dataframe", option "summary"
#' @export
#' @examples
#' str(getpreprocombdf(examplecombgrid, 3))

getpreprocombdf <- function(gridobject,nro, type="dataframe"){

  if (type=="dataframe") {
  tempdata <- gridobject@data[[nro]]
  df <- data.frame(tempdata@x, class=tempdata@y)
  }

  if (type=="summary") {
    tempdata <- gridobject@data[[nro]]
    df <- t(data.frame(lapply(tempdata@x, dfsummary)))
  }

  return(df)
}

dfsummary <- function(x){
  c(mean=mean(x, na.rm=TRUE), sd=sd(x, na.rm=TRUE), min=min(x, na.rm=TRUE), max=max(x, na.rm=TRUE))
}



setMethod("show", signature(object = "GridClass"), function(object){
  cat("GRIDCLASS OBJECT", "\n")
  cat("Number of preprocecessing phases:", ncol(object@grid), "\n")
  cat("Number of preprocecessing combinations:", nrow(object@grid), "\n")
  cat("NUMBER OF COMBINATIONS HAVING", "\n")
  cat("variance (not near zero): ", sum(object@validation$variance.1), "\n")
  cat("finite values: ", sum(object@validation$finite), "\n")
  cat("complete observations: ", sum(object@validation$completeobs), "\n")
  cat("ntop ratio more than two: ", sum(object@validation$ntopratiotwoplus), "\n")
  cat("minimum dimensions (20+ observations, 3+ variables): ", sum(object@validation$mindimensions), "\n")
  cat("classes not imbalanced: ", sum(object@validation$classbalance), "\n")
})
mvattulainen/preprocomb documentation built on May 23, 2019, 10:54 a.m.