knitr::opts_chunk$set(echo = TRUE, cache=FALSE,
                      warning = FALSE, message = FALSE,
                      results = 'show',
                      eval = TRUE)  ## set eval = FALSE for quick text-only edits


# install.packages("mlr", dependencies = TRUE)
library(mlr)
library(tidyverse)
library(knitr)
# library(sf)
# 
# 
# datSets <- data.frame(
#   Name = c("Aleza", "Eagle Hills", "Eagle Hills", "Deception", "Deception"),
#   Subzone = c("SBSwk1","IDFxh", "IDFdk", "ESSFmc", "SBSmc2"),
#   DatLoc  = c("/home/rstudio/data/FLNR_PEM/Covariates_Paper/AlezaLake/training_pts/aleza_tpts.gpkg",
#               "/home/rstudio/data/FLNR_PEM/Covariates_Paper/EagleHills/training_pts/IDFxh2.gpkg",
#               "/home/rstudio/data/FLNR_PEM/Covariates_Paper/EagleHills/training_pts/IDFdk1.gpkg",
#               "/home/rstudio/data/FLNR_PEM/Covariates_Paper/Deception/training_pts/ESSFmc.gpkg",
#               "/home/rstudio/data/FLNR_PEM/Covariates_Paper/Deception/training_pts/SBSmc2.gpkg"),
#   outDir = c("mods/SBSwk1_reduced","mods/IDFxh2_reduced", "mods/IDFdk1_reduced",
#              "mods/ESSFmc_reduced", "mods/SBSmc2_reduced")
#   )
# 
# 
# 
# 
# i <- 1
# 
# 
# dat <- st_read(datSets$DatLoc[i]) %>% as.data.frame()
# dat <- dat[1:11]
# dat <- dat[complete.cases(dat),]
# 
# traindat <- dat
# modDat   <- traindat
# target   <- "SiteSeries"
# trees    <- 10
# outDir   <- datSets$outDir[i]
# rseed    <- 220228
# 
## Load the data
modDat <- params$traindat
target <- params$target

trees <- params$trees

rseed <- params$rseed

## define output directory
outDir <- params$outDir
# 
# print(paste("Target:", target))
# print(paste("saving to:", outDir))
# print(names(modDat))
# print(head(modDat))
# print(paste("Growing", trees ,"trees"))
# print(paste("Random seed", rseed))
# ## Load the data
# modDat <- sf::st_read("e:/workspace/2020/PEM/ALRF_PEMv2/spatialData/SamplePoints_with_rast_values.gpkg",
#   quiet = TRUE) %>%
#   dplyr::select(SiteSeries, aspect_025:VerticalDistance_25m) %>%
#   dplyr::select(-c(Vertical_distance_025:VerticalDistance_25m,   ## remove problematic covariates
#                    overland_flow_025:OverlandFlowDistance_25m)) %>%
#   as.data.frame()
# modDat <- modDat[, -length(modDat)]
# modDat <- modDat[!is.na(modDat$SiteSeries),]
# target <- "SiteSeries"
# 
# ## define output directory
# outDir <- "e:/tmp/mlr_5m_200323/"
# rseed <- NA
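
The chunk above reads its inputs from params, so this report is intended to be rendered as a parameterized R Markdown document. A minimal sketch of a call that could drive it follows; the .Rmd file name, data path, and parameter values are illustrative assumptions drawn from the commented-out examples above, not part of the package.

## Illustrative only: file name and data path are assumptions
library(rmarkdown)
library(sf)

traindat <- as.data.frame(sf::st_read("training_pts/SBSmc2.gpkg", quiet = TRUE))

rmarkdown::render("model_gen.Rmd",
                  params = list(traindat = traindat,
                                target   = "SiteSeries",
                                trees    = 500,
                                outDir   = "mods/SBSmc2",
                                rseed    = 220228))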

Response variable: `r target`

These are the response variable classes we want to predict, along with the number of observations in each class.

table(modDat[, target])

Begin modeling

Define the Task and Learner

PERHAPS -- this should be split out into another small function, or sourced so that other options can be supplied.

## use or create a random number seed -- this allows results to be reproduced in future runs
if (!is.na(rseed)) {
  set.seed(rseed)
  print(paste("Random number generator seed set to:", rseed))
} else {
  rseed <- as.integer(Sys.time())
  set.seed(rseed)  ## set the generated seed so this run can also be repeated
  print(paste("Random number generator seed set to:", rseed))
}
## Create task
tsk <- makeClassifTask(data = modDat, target = target)

## Define Learner
lrn <- makeLearner("classif.ranger",
                   num.trees = trees,                       ## number of trees (ranger default: 500)
                   mtry = round(sqrt(ncol(modDat) - 1)),    ## square root of the number of predictors (a common default)
                   num.threads = parallel::detectCores()*2, ## CAUTION: may oversubscribe -- check how many threads your machine has
                   importance = "impurity",                 ## collect variable importance data
                   predict.type = "prob")                   ## return class probabilities for multi-class prediction
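
Before resampling, a quick sanity check of the task and learner can be worthwhile; an optional sketch using mlr's own accessors:

## Optional sanity checks (not part of the original workflow)
print(tsk)          ## task summary: observations, features, class counts
getHyperPars(lrn)   ## hyper-parameters set above: num.trees, mtry, num.threads, ...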

Complete repeated cross-validation

PERHAPS -- this should be split out into another small function, or sourced so that other options can be supplied.

## Define the validation method
resp <- makeResampleDesc("RepCV",      ## repeated cross-validation
                         folds = 10,   ## k folds; 5 or 10 are typical, ideally all folds equal in size
                         reps  = 5)    ## 10 folds x 5 repeats = 50 iterations through the data
   ## note: more repeats (e.g. 10 x 10) would be preferable but takes additional time

## Execute cross validation
cv <- mlr::resample(learner = lrn,
               task = tsk,
               resampling = resp)

saveRDS(cv, file = paste0(outDir, "/cv_results.rds"))
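
Only the default measure (mmce) is collected above. If other measures are of interest they can be requested in the same call; a sketch using mlr's built-in acc and kappa measures (this re-runs all 50 resampling iterations, so it is optional):

## Optional: collect accuracy and kappa alongside mmce (re-runs the resampling)
cv_multi <- mlr::resample(learner    = lrn,
                          task       = tsk,
                          resampling = resp,
                          measures   = list(mmce, acc, kappa))
cv_multi$aggr   ## aggregated performance across all folds and repeats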

Resampling results

The mean misclassification error is `r mean(cv$measures.test$mmce)`.

Confusion Matrices

cf_matrix <- calculateConfusionMatrix(cv$pred,
                                      relative = TRUE,
                                      sums = TRUE)
Absolute matrix

Relative matrix
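
The rendered tables did not survive extraction; a minimal sketch of how the two matrices above could be displayed, assuming the ConfusionMatrix object returned by mlr exposes $result (absolute counts with sums) and $relative.row (row-wise proportions):

## Sketch: display the confusion matrices captured in cf_matrix
knitr::kable(cf_matrix$result)                    ## absolute counts (with row/column sums)
knitr::kable(round(cf_matrix$relative.row, 2))    ## relative, normalised by true class (rows)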

Train the model

The model is trained using all the data and then saved.

mod <- train(lrn, tsk)

saveRDS(mod, paste(outDir, "model.rds", sep = "/"))
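
To illustrate how the saved model might later be applied: a hedged sketch, where new_covariates is a hypothetical data frame containing the same covariate columns used in training.

## Illustrative only: new_covariates is a hypothetical covariate data frame
mod  <- readRDS(file.path(outDir, "model.rds"))
pred <- predict(mod, newdata = new_covariates)    ## mlr Prediction object
head(getPredictionProbabilities(pred))            ## class probabilities (predict.type = "prob")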

Variable importance

var_imp <- as.data.frame(mod$learner.model$variable.importance) %>%
  rownames_to_column()
names(var_imp) <- c("name", "VariableImportance")

knitr::kable(var_imp %>% arrange(desc(VariableImportance)) %>% head(20))
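
A bar chart of the top covariates can complement the table; an optional sketch using ggplot2 (loaded with tidyverse above), with the same top-20 cutoff as the table:

## Optional: plot the 20 most important covariates (same cutoff as the table)
var_imp %>%
  arrange(desc(VariableImportance)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(name, VariableImportance), y = VariableImportance)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Variable importance (impurity)")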

Complete

Congratulations, your model has been generated.

The following files have been saved:

# outDir <- "e:/tmp/model_gen_test/" ## Testing
fn <- as.data.frame(list.files(outDir, full.names = TRUE))
names(fn) <- "Files"
knitr::kable(fn)

