knitr::opts_chunk$set(echo = TRUE, cache=FALSE, warning = FALSE, message = FALSE, results = 'show', eval = TRUE) ## flag eval = false for quick text edits # install.packages("mlr", dependencies = TRUE) library(mlr) library(tidyverse) library(knitr)
# library(sf) # # # datSets <- data.frame( # Name = c("Aleza","Eagle Hills", "Eagle Hills,", "Deception", "Deception"), # Subzone = c("SBSwk1","IDFxh", "IDFdk", "ESSFmc", "SBSmc2"), # DatLoc = c("/home/rstudio/data/FLNR_PEM/Covariates_Paper/AlezaLake/training_pts/aleza_tpts.gpkg", # "/home/rstudio/data/FLNR_PEM/Covariates_Paper/EagleHills/training_pts/IDFxh2.gpkg", # "/home/rstudio/data/FLNR_PEM/Covariates_Paper/EagleHills/training_pts/IDFdk1.gpkg", # "/home/rstudio/data/FLNR_PEM/Covariates_Paper/Deception/training_pts/ESSFmc.gpkg", # "/home/rstudio/data/FLNR_PEM/Covariates_Paper/Deception/training_pts/SBSmc2.gpkg"), # outDir = c("mods/SBSwk1_reduced","mods/IDFxh2_reduced", "mods/IDFdk1_reduced", # "mods/ESSFmc_reduced", "mods/SBSmc2_reduced") # ) # # # # # i <- 1 # # # dat <- st_read(datSets$DatLoc[i]) %>% as.data.frame() # dat <- dat[1:11] # dat <- dat[complete.cases(dat),] # # traindat <- dat # modDat <- traindat # target <- "SiteSeries" # trees <- 10 # outDir <- datSets$outDir[i] # rseed <- 220228 #
## Load the data modDat <- params$traindat target <- params$target trees <- params$trees rseed <- params$rseed ## define output directory outDir <- params$outDir
# # print(paste("Target:", target)) # print(paste("saving to:", outDir)) # print(names(modDat)) # print(head(modDat)) # print(paste("Growing", trees ,"trees")) # print(paste("Random seed", rseed))
# ## Load the data # modDat <- sf::st_read("e:/workspace/2020/PEM/ALRF_PEMv2/spatialData/SamplePoints_with_rast_values.gpkg", # quiet = TRUE) %>% # dplyr::select(SiteSeries, aspect_025:VerticalDistance_25m) %>% # dplyr::select(-c(Vertical_distance_025:VerticalDistance_25m, ## remove problematic covariates # overland_flow_025:OverlandFlowDistance_25m)) %>% # as.data.frame() # modDat <- modDat[, -length(modDat)] # modDat <- modDat[!is.na(modDat$SiteSeries),] # target <- "SiteSeries" # # ## define output directory # outDir <- "e:/tmp/mlr_5m_200323/" # rseed <- NA
r target
These are the response variable classes including the number in each class that we want to predict.
table(modDat[, target])
PERHAPS -- this should be another small function or sourced other options
## use or create a random number seed -- this can be used to repeat results in future. if (!is.na(rseed)) { set.seed(rseed) print(paste("Random number generator seed set to:", rseed)) } else { rseed <- as.integer(Sys.time()) print(paste("Random number generator seed set to:", rseed)) }
## Create task tsk <- makeClassifTask(data = modDat, target = target) ## Define Learner lrn <- makeLearner("classif.ranger", num.trees = trees, ## number of trees DEFAULT: 500 mtry = round(sqrt(ncol(modDat)-1)), ## someone showed me to declare mtry this way num.threads = parallel::detectCores()*2, ## CAUTION HERE: how many threads does your machine have? importance = "impurity", ## collect var importance data predict.type = "prob") ## model will generate prob. and multi-class
PERHAPS -- this should be another small function or sourced other options
## Defines the validation method resp <- makeResampleDesc("RepCV", ## repeated cross fold folds = 10, ## k-folds 5 or 10 as default. Ideally all folds should be equal size. reps = 5) ## note this will mean 10 x 5 iterations through the data ## note: 5 fold 3 repeats is a little low. I would prefer 10 x 10 but that takes additional time... ## Execute cross validation cv <- mlr::resample(learner = lrn, task = tsk, resampling = resp) saveRDS(cv, file = paste0(outDir, "/cv_results.rds"))
The mean mis-classification error is r mean(cv$measures.test$mmce)
.
cf_matrix <- calculateConfusionMatrix(cv$pred, relative = TRUE, sums = TRUE)
The model is trained using all the data and then saved.
mod <- train(lrn, tsk) saveRDS(mod, paste(outDir, "model.rds", sep = "/"))
var_imp <- as.data.frame(mod$learner.model$variable.importance) %>% rownames_to_column() names(var_imp) <- c("name", "VaribleImportance") knitr::kable(var_imp %>% arrange(desc(VaribleImportance)) %>% head(., 20))
Congratulations your model has been generated.
Files are saved:
# outDir <- "e:/tmp/model_gen_test/" ## Testing fn <- as.data.frame(list.files(outDir, full.names = TRUE)) names(fn) <- "Files" knitr::kable(fn)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.