R/ModelGeneration.R

Defines functions CreateModelComparison GetAccuracy summary.ModelComparison predict.ModelComparison GetPredType prepData BuildModels GetBuildFlags GetTrainingInfo GetModelComparisons ModelComparison

Documented in BuildModels CreateModelComparison GetAccuracy GetBuildFlags GetModelComparisons GetPredType GetTrainingInfo ModelComparison predict.ModelComparison prepData summary.ModelComparison

#' A helper function used by GetModelComparison and ModelComparison to create the
#' ModelComparison object. It gives the object it's class and assigns it it's members
#'
#' @param model.list a list of models that are already trained.  The name of the model.list should
#' be accurate to the names of the models, unless other names are given in diff.names
#' @param multi.class a boolean value of whether the classification output is more than two outputs
#' @param force.prepped a boolean value describing whether the dataset was forced prepared into
#' one hot encoding for training.  Only applicable with categorical data
#' @param diff.names a vector or list containing the names of the model.list models, in that
#' respective order.  This is optional if the names are already attatched to the model.list
#'  and will default to NULL.
#'
#' @return the fully created ModelComparison object
#'
#' @examples
#' # This function is used by GetModelComparison and ModelComparison to create the
#' # ModelComparison object. The follow code is what is used by ModelComparison:
#' # CreateModelComparison(model.list, multi.class, F, names(model.list))
#'
#' @keywords internal
#' @export
CreateModelComparison <- function(model.list, multi.class, force.prepped, diff.names=NULL) {
  # we can add our own integrity checks
  comparison <- list()
  if (class(diff.names) == "NULL") {
    model.list = model.list
  } else {
    model.list = model.list
    names(model.list) <- diff.names
  }
  # give object its members
  comparison$model.list <- list()
  comparison$model.list <- as.list(model.list)
  comparison$force.prepared = force.prepped
  comparison$.multi.class <- multi.class
  class(comparison) <- "ModelComparison"
  comparison$accuracy.list <- GetAccuracy(comparison)
  return(comparison)
}

#' Gives a summary of the models used in the ModelComparison object
#'
#' @param object the ModelComparison object to get the accuracies from
#'
#' @return a list of numeric values that are the accuracies of the models
#'
#' @examples
#' # see the CreateModelComparison function for it's use as follows:
#' #  comparison$accuracy.list <- GetAccuracy(comparison)
#'
#' @keywords internal
#' @export
GetAccuracy <- function(object) {
  out <- tryCatch(
    {
      accuracy.list <- list()
      i = 0
      for (model in object$model.list) {
        if (class(model)[[1]] == "Ensemble") {
          # TODO implement this
          return(NULL)
        }
        i = i + 1
        accuracy.list[[i]] <- model$results["Accuracy"]
      }
      names(accuracy.list) <- names(object$model.list)
      return(accuracy.list)
    },
    error=function(cond) {
      # model's accuracy was not given
      return(NULL)
    }
  )
  return(out)

}

#' Gives a summary of the models used in the ModelComparison object
#' @param object the ModelComparison object to summarize
#' @param extra a boolean value on whether accuracy values will be returned, if present.
#' By default this value is TRUE
#' @param ... Other arguments, if needed
#'
#' @return NULL, only prints out
#'
#' @examples
#' # load the csv file for the dataset "titanic"
#' # titanic <- PrepareNumericTitanic()
#' # create the ModelComparison object by passing in the training set and training labels
#' # comp <- GetModelComparisons(titanic[, -1], titanic[, 1])
#' # predict by passing in the new df for the object to predict on
#' # summary(comp)
#'
#' @export
summary.ModelComparison <- function(object, extra=TRUE, ...) {
  start <- "This object is a comparison of the following models:"
  cat(start)
  cat("\n")
  models <- paste(names(object$model.list), collapse = ", ")
  cat(models)
  if (any(!is.na(object$accuracy.list)) && extra) {
    cat("\n\n")
    cat("with respective accuracies of:")
    cat("\n")
    for (acc in object$accuracy.list) {
      cat(as.character(acc[[1]]))
      cat(" ")
    }
  }
  if (any(!is.na(object$auc.list))) {
    cat("\n\n")
    cat("and with respective AUC's of: ")
    cat("\n")
    for (auc in object$auc.list) {
      cat(as.character(auc[[1]]))
      cat(" ")
    }
  }
}

#' Predicts values for a ModelComparison object.
#'
#' @param object The ModelComparison object whose models will be predicted on.
#' @param newdata A dataframe of new data that the models will use to predict.
#' @param ... Other arguments, if needed
#'
#' @return A list of dataframes, where each dataframe is the predictions from the respective
#'            model for the two output classes.
#'
#' @examples
#' # load the csv file for the dataset "titanic"
#' # titanic <- PrepareNumericTitanic()
#' # create the ModelComparison object by passing in the training set and training labels
#' # comp <- GetModelComparisons(titanic[, -1], titanic[, 1])
#' # predict by passing in the new df for the object to predict on
#' # pred.list <- predict(comp, titanic[, -1])
#'
#' @export
predict.ModelComparison <- function(object, newdata, ...) {
  # # check to see if function is good
  # is.prepped <- sapply(training.data, function(x) (is.numeric(x) || length(levels(x)) <= 2))
  #
  # # Data is not in one hot encoding - try to do it
  # if (sum(is.prepped) != ncol(training.data)) {
  #   training.data = prepData(training.data)
  #   forced.prepared = T
  # } else {
  #   forced.prepared = F
  # }


  predictions.list = list()
  i = 0
  if (object$.multi.class) {
    # do something TODO
    message("predicting on a multi-class outcome")
    for (model in object$model.list) {
      i = i + 1
    }
  } else {
    # predict over the list of models
    pred.basic <- list()
    for (model in object$model.list) {
      i = i + 1
      pred.basic[[i]] = GetPredType(model, newdata)
    }
    .getClass <- function(model) { return(class(model)[[1]])    }
    names(pred.basic) <- sapply(object$model.list, .getClass)
    return(pred.basic)
  }
}

#' A helper function used by the predict function. Since the many types of models are not
#' uniform in how they are called, this function determines what values to pass into "type"
#' and then returns the predictions.
#'
#' @param model The model to be used for prediction
#' @param newdata A dataframe for the model to predict on
#'
#' @return The predictions for the model on the given data, returned in dataframe format
#'
#' @examples
#' # This function is used by predict.ModelComparison. It's usage is as follows:
#' # pred.basic[[i]] = GetPredType(model, newdata)
#'
#' @keywords internal
#' @export
GetPredType <- function(model, newdata) {
  if (class(model)[1] == "naiveBayes" || class(model)[1] == "nnet.formula" ||
      class(model)[1] == "nnet") {
    return(predict(model, newdata, type="raw", prob=TRUE))

  } else if (class(model)[[1]] == "lognet" || class(model)[[1]] == "glmnet") {
    return (predict(model, data.matrix(newdata), type="response", prob=TRUE))

    # SVM's require more work to get the probabilites
  } else if (class(model)[[1]] == "svm.formula" || class(model)[[1]] == "svm") {
    svm.pred <- predict(model, data.matrix(newdata), type="response", decision.values=TRUE)
    #return (svm.pred[1:nrow(newdata)])
    return(attr(svm.pred, "decision.values"))

  } else {
    return (predict(model, newdata, type="prob", prob=TRUE))
  }
}

#' A helper function used by the GetModelComparison function. If the data given is categorical
#' and not in one-hot encoding the model training will fail.  This will attempt to conver the
#' dataset into that format, or, to send an error if it fails to do so.
#'
#' @param training.set the training data that is in the wrong format. This is the same
#' dataframe as in GetModelComparison parameters.
#'
#' @return The dataframe in one-hot encoding.
#'
#' @examples
#' # This function is used by GetModelComparison. It's usage is as follows:
#' # training.set = prepData(training.set)
#'
#' @keywords internal
#' @export
prepData <- function(training.set) {
    out <- tryCatch(
      {
        dmy <- caret::dummyVars(" ~ .", data = training.set)
        training.set <- data.frame(predict(dmy, newdata = training.set))
        return(training.set)
      },
      error=function(cond) {
        message("Data set is not numeric or in one hot encoding.  Will try to convert")
        message(cond)
        # Choose a return value in case of error
        return(NA)
      }
    )
  return(out)
}

#' A helper function used by the GetModelComparison function. It will take the given inputs
#' and build the models for the ModelComparison object.
#'
#' @param training.data The training data to build the models.
#' @param training.classes The labels for the training data.
#' @param trctrl The trctrl used for the caret package to train the models.
#' @param tune.length The amound of time to tune to the model. Defaults to 1.
#' @param multi.class A boolean value of whether the model has multiple classification outputs
#' @param build.flags A vector of flags that decide what models to build.
#' @param force.prepared A boolean value describing whether the dataset was forced prepared into
#' one hot encoding for training.  Only applicable with categorical data
#'
#' @return A vector of the models created.
#'
#' @examples
#' # This function is used by GetModelComparison. It's usage is as follows:
#' # modelVec = BuildModels(training.data, training.classes, trctrl,
#' #            tune.length, multi.class, build.flags, force.prepared = forced.prepared)
#'
#' @keywords internal
#' @export
BuildModels <- function(training.data, training.classes, trctrl,
                     tune.length, multi.class, build.flags, force.prepared = F ) {
  modelVec <- list()
  out <- tryCatch(
    {
      message("Building...")
      if (build.flags["svmlinear"]) {
        svmLinear <- caret::train(training.data, training.classes, method = "svmLinear",
                                  trControl=trctrl,
                                  preProcess = c("center", "scale"),
                                  tuneLength = tune.length)

        modelVec[["svmLinear"]] = svmLinear
      }
      if (build.flags["neuralnet"]) {
        # capture output to reduce the excessive output this package gives in training
        capture.output(neuralNet <- caret::train(training.data, training.classes,
                                                 method = "nnet",
                                                 trControl=trctrl,
                                                 preProcess = c("center", "scale"),
                                                 tuneLength = tune.length))
        modelVec[["neuralNet"]] = neuralNet
      }

      if (build.flags["glmnet"]) {
        # prepare data for a GLMNET
        train <- data.frame(training.data, training.classes)
        x.m <- model.matrix( ~.+0, training.data)
        glmnet <- caret::train(x.m, training.classes,  method = "glmnet",
                               trControl=trctrl,
                               preProcess = c("center", "scale"),
                               tuneLength = tune.length)
        modelVec[["glmnet"]] = glmnet

      }

      if (build.flags["randomforest"]) {
        randomForest <- caret::train(training.data, training.classes, method = "rf",
                                     trControl=trctrl,
                                     preProcess = c("center", "scale"),
                                     tuneLength = tune.length)
        modelVec[["randomForest"]] = randomForest
      }

      if (build.flags["svmradial"]) {
        svmRadial <- caret::train(training.data, training.classes, method = "svmRadial",
                                     trControl=trctrl,
                                     preProcess = c("center", "scale"),
                                     tuneLength = tune.length)
        modelVec[["svmRadial"]] = svmRadial
      }

      if (build.flags["knn"]) {
        knn <- caret::train(training.data, training.classes, method = "knn",
                                     trControl=trctrl,
                                     preProcess = c("center", "scale"),
                                     tuneLength = tune.length)
        modelVec[["knn"]] = knn
      }

      if (!multi.class) {
        if (build.flags["glm"]) {
          glm <- caret::train(training.data, as.factor(training.classes), method = "glm",
                                    trControl=trctrl,
                                    preProcess = c("center", "scale"),
                                    tuneLength = tune.length)
          modelVec[["glm"]] = glm

        }
      } else {
        if (build.flags["glm"]) {
          glm = NULL
          # TODO build a multiclass
        }
      }

      message("Models are done building")
      return(modelVec)

    },
    error=function(cond) {
      if (force.prepared) {
        message("Forced conversion to one hot encoding did not work - convert and try again")
        message(cond)
      } else {
        message("Error in building models: ")
        message(cond)
      }
      # Choose a return value in case of error
      return(NA)
    }
  )
  return(out)
}

#' A helper function used by the GetModelComparison function. It takes the inputs and parses
#' it to see what models to build
#'
#' @param model.list The input parameters from GetModelComparison on what models to create.
#'
#' @return A vector of flags that contain what models to build.  To be used in BuildModels
#' function
#'
#' @examples
#' # This function is used by GetModelComparison. It's usage is as follows:
#' # build.flags <- GetBuildFlags(model.list)
#'
#' @keywords internal
#' @export
GetBuildFlags <- function(model.list) {
  # Flags include:
  # neuralnet, svmlinear, svmradial, knn, randomforest, glmnet, glm
  # keywords include "fast", "all", "expensive"

  build.glm = FALSE
  build.glmnet = FALSE
  build.randomforest = FALSE
  build.knn = FALSE
  build.svmradial = FALSE
  build.svmlinear = FALSE
  build.neuralnet = FALSE
  # turn all to lowercase
  model.list <- sapply(model.list, tolower)
  if (sum(is.element(model.list, "fast"))) {
    build.glmnet = TRUE
    build.svmlinear = TRUE
  } else if (sum(is.element(model.list, "all"))) {
    build.glm = TRUE
    build.svmlinear = TRUE
    build.svmradial = TRUE
    build.neuralnet = TRUE
    build.knn = TRUE
    build.glmnet = TRUE
    build.randomforest = TRUE
  } else if (sum(is.element(model.list, "expensive"))) {
    build.svmradial = TRUE
    build.neuralnet = TRUE
    build.glmnet = TRUE
    build.randomforest = TRUE
  }
  if (sum(is.element(model.list, "neuralnet"))) {
    build.neuralnet = TRUE
  }
  if (sum(is.element(model.list, "svmlinear"))) {
    build.svmlinear = TRUE
  }
  if (sum(is.element(model.list, "svmradial"))) {
    build.svmradial = TRUE
  }
  if (sum(is.element(model.list, "knn"))) {
    build.knn = TRUE
  }
  if (sum(is.element(model.list, "randomforest"))) {
    build.randomforest = TRUE
  }
  if (sum(is.element(model.list, "glmnet"))) {
    build.glmnet = TRUE
  }
  if (sum(is.element(model.list, "glm"))) {
    build.glm = TRUE
  }
  # assign flags to vector
    flag.vector <- vector()
  flag.vector["glm"] = build.glm
  flag.vector["glmnet"] = build.glmnet
  flag.vector["randomforest"] = build.randomforest
  flag.vector["knn"] = build.knn
  flag.vector["svmradial"] = build.svmradial
  flag.vector["svmlinear"] = build.svmlinear
  flag.vector["neuralnet"] = build.neuralnet
  return(flag.vector)
}

#' A helper function used by the GetModelComparison function.
#'
#' It takes the given inputs, data, and trctrl needed to build the models.
#'
#' @param training.set The training data to build the models.
#' @param training.classes.input The labels for the training data.
#' @param validation The type of model validation: cross-validation ("cv"), or a training
#' split (in the form XX/XX where XX is a two digit percent).
#' @param trctrl.given The trctrl given in GetModelComparison parameters for custom training.
#'
#' @return A vector containg the following for training the models. It is given in this order:
#' training.data, training.classes, trctrl
#'
#' @examples
#' # This function is used by GetModelComparison. It's usage is as follows:
#' # training.info <- GetTrainingInfo(training.set, training.classes.input, validation, trctrl)
#'
#' @keywords internal
#' @export
GetTrainingInfo <- function(training.set, training.classes.input, validation, trctrl.given) {
  # Get the method of validation and prepare testing and training sets
  split <- regmatches(validation, regexpr("[0-9][0-9]/[0-9][0-9]",validation))
  if (!identical(split, character(0))) {
    # take the first two characters and turn them into a percent
    percent = as.numeric(substr(split, start = 1, stop = 2)) / 100
    # partition the data into training and testing data stratified by class
    trainIndex <- caret::createDataPartition(training.classes.input, p=percent, list=F)
    # get the dataframes
    training.data <- training.set[trainIndex,]
    testing.data <- training.set[-trainIndex,]
    # get the labels
    training.classes <- training.classes.input[trainIndex]
    testing.classes <- training.classes.input[-trainIndex]
    # we will take care of the validation
    trctrl <- caret::trainControl(savePredictions = "final", classProbs =  TRUE)

  } else {
    # we don't need a specific testing set
    training.data = training.set
    testing.data <- training.set
    training.classes <- training.classes.input
    if (validation == "cv") {
      trctrl <- caret::trainControl(method = "cv", savePredictions = "final", classProbs=TRUE)
    } else {
      trctrl <- caret::trainControl(savePredictions = "final", classProbs=TRUE)
    }
  }
  if (class(trctrl.given) != "character") {
    trctrl = trctrl.given
  }
  return(list(training.data, training.classes, trctrl))
}

#' A function to build and return a ModelComparison object.
#'
#' This function does not take any
#' pre-built models, instead it creates them based on input to model.list.
#'
#' @param training.set The training data to build the models.
#' @param training.classes.input The labels for the training data.
#' @param validation The type of model validation: cross-validation ("cv"), or a training
#' split (in the form XX/XX where XX is a two digit percent).
#' @param model.list A vector or list of characters that describe what models to build.
#' Models include "neuralnet", "svmlinear", "svmradial", "knn", "randomforest", 'glmnet",
#' and 'glm." Keywords include "fast" (glmnet, svmlinear), "all", and "expensive"
#' (neuralnet, svmradial, glmnet, and randomforest)
#' @param trctrl The trctrl used for the caret package to train the models. Defaults to a basic
#' version. Used for customized training options including parallelization, etc.
#'
#' @return A fully created ModelComparison object.
#'
#' @examples
#' titanic <- PrepareNumericTitanic()
#'
#' # create the models
#' comp <- GetModelComparisons(titanic[, -1], titanic[, 1], validation="80/20",
#'                             model.list="fast")
#'
#' @export
GetModelComparisons <-function(training.set, training.classes.input, validation="80/20",
                               model.list="fast", trctrl="none") {
  # check to see if function is good
  is.prepped <- sapply(training.set, function(x) (is.numeric(x) || length(levels(x)) <= 2))

  # Data is not in one hot encoding - try to do it
  if (sum(is.prepped) != ncol(training.set)) {
    training.set = prepData(training.set)
    forced.prepared = T
  } else {
    forced.prepared = F
  }

  # prep the data if not already a factor
  training.classes.input = as.factor(training.classes.input)
  set.seed(sample(1:9999999, 1))
  multi.class = (nlevels(training.classes.input) > 2)

  training.info <- GetTrainingInfo(training.set, training.classes.input, validation, trctrl)
  training.data = training.info[[1]]
  training.classes = training.info[[2]]
  trctrl = training.info[[3]]

  build.flags <- GetBuildFlags(model.list)

  message("Settings configured successfully")
  tune.length = 1

  modelVec = BuildModels(training.data, training.classes, trctrl,
                       tune.length, multi.class, build.flags, force.prepared = forced.prepared)
  modelComp <- CreateModelComparison(modelVec, multi.class, forced.prepared)
  return(modelComp)

}

#' A function to return a ModelComparison object.
#'
#'   This function take pre-built models, and puts
#' them into a ModelComparison object.
#'
#' @param model.list The training data to build the models.
#' @param multi.class A boolean of whether the models predict a multi-outcome response
#'
#' @return A ModelComparison object.
#'
#' @examples
#' # models <- list(model1, model2, model3)
#' # names(models) <- c("NeuralNet", "K-NN", "SVM")
#' # comp <- ModelComparison(models, F)
#'
#' @export
ModelComparison <- function(model.list, multi.class) {
  if (anyNA(model.list) || anyNA(names(model.list))) {
    stop("One of the models or model names is NA.  Please fix that and try again")
  }
  return (CreateModelComparison(model.list, multi.class, F, names(model.list)))
}
orionw/BestModel documentation built on Aug. 17, 2019, 7:29 p.m.