knitr::opts_chunk$set(echo = TRUE, cache=FALSE,
                      warning = FALSE, message = FALSE,
                      results = 'show',
                      eval = TRUE)  ## flag eval = false for quick text edits
# install.packages("mlr", dependencies = TRUE)
library(mlr)
library(tidyverse)

## Load the data
modDat <- params$traindat
target <- params$target

## define output directory
outDir <- params$outDir

Response variable: r target

These are the response variable classes including the number in each class that we want to predict.

table(modDat[, target])

Begin modeling

Define the Task and Learner

## use or create a random number seed -- this can be used to repeat results in future.
if (!is.na(rseed)) {
  set.seed(rseed)
  print(paste("Random number generator seed set to:", rseed))
  } else {
  rseed <- as.integer(Sys.time())
  print(paste("Random number generator seed set to:", rseed))
}
## Create task
tsk <- makeClassifTask(data = modDat, target = target)

## Define Learner -- here we create an ensemble learner
esbLrns <- c("classif.ranger", "classif.glmnet", "classif.xgboost", "classif.nnTrain")

## Creates a list of learners using producing probability
lrns <- list(mlr::makeLearner(esbLrns[1]),
            mlr::makeLearner(esbLrns[2]),
            mlr::makeLearner(esbLrns[3], verbose=1),
            mlr::makeLearner(esbLrns[4]))
lrns <- lapply(lrns, setPredictType, "prob") ## use prob as the type 


## converts the list of learners to a ensemble 
lrn <- mlr::makeStackedLearner(base.learners = lrns, predict.type = "prob", 
                               method = "stack.cv", 
                               super.learner = "classif.glmnet")

Complete Validataion

This is done by creating a test and training data sets from the data provided. Twenty percent of the data is set asside for testing (1 in 5). Note the current script selects the 20% in a regualar sequence ... this should be changed to a random draw.

### Not working with Ensemble model -- this section is skipped.
## Defines the validation method 
# resp <- makeResampleDesc("RepCV",     ## repeated cross fold
#                          folds = 10,   ## k-folds 5 or 10 as default.  Ideally all folds should be equal size.
#                          reps  = 3)   ## note this will mean 5 x 3 iterations through the data
#    ## note: 5 fold 3 repeats is a little low.  I would prefer 10 x 10 but that takes additional time...
# 
# ## Execute cross validation
# cv <- mlr::resample(learner = lrn,
#                resampling = resp)
# 
# saveRDS(cv, file = paste(outDir, "cv_results.rds", sep = "/"))
#      task = tsk,
#           


## specified validation -- test and training sets 
## seperate the test and training datasets 
test <- seq(5, nrow(modDat), by = 5) ## create a sequence to select every 5th row 

test.dat  <- modDat[test,]
train.dat <- modDat[-test,]

## define task, train, and then predict
## task defined 
train.tsk  <- mlr::makeClassifTask(data = train.dat,  
                                   target = target) 

## train 
parallelMap::parallelStartSocket(parallel::detectCores()-1)
  train.mod  <- mlr::train(lrn, train.tsk)              
parallelMap::parallelStop()

## Predict 
pred <- predict(train.mod, newdata = test.dat)

pred$data$truth <- test.dat[,1]


### performance metrics 
val <- mlr::performance(pred, measures = list(acc,logloss))

Resampling results

Confusion Matrices

_not currently available ... _

Train the model

The model is trained using all the data and then saved.

parallelMap::parallelStartSocket(parallel::detectCores()-1)
  mod <- train(lrn, tsk)
parallelMap::parallelStop()
saveRDS(mod, paste(outDir, "model.rds", sep = "/"))

Variable importance

not available for this type of model

# var_imp <- as.data.frame(mod$learner.model$variable.importance) %>%
#     rownames_to_column()
#   names(var_imp) <- c("name", "VaribleImportance")
# 
# knitr::kable(var_imp %>% arrange(desc(VaribleImportance)) %>% head(., 20))

Complete

Congratulations your model has been generated.

Files are saved:

# outDir <- "e:/tmp/model_gen_test/" ## Testing
fn <- as.data.frame(list.files(outDir, full.names = TRUE))
names(fn) <- "Files"
knitr::kable(fn)


ColinChisholm/pemGeneratr documentation built on March 18, 2023, 1:45 a.m.