The following gives a short overview of how to use gpm for prediction. For this example, the iris dataset is used.

library(gpm)
head(iris)

First, the gpm object is created by defining the meta information and then passing the dataset together with this meta information to the gpm class.

# Add an artificial plot ID plus a categorical and a numerical selector column
iris$Plots = paste0(c("a", "b", "c", "d", "e"), sample(seq(30), nrow(iris), replace = TRUE))
iris$SelCat = substr(iris$Plots, 1, 1)
iris$SelNbr = as.numeric(substr(iris$Plots, 2, nchar(iris$Plots)))
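
A quick look at the new columns (plain base R, just for illustration):

# Artificial plot IDs and the derived selector columns
head(iris[, c("Plots", "SelCat", "SelNbr")])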

meta <- createGPMMeta(iris, type = "input",
                      selector = which(names(iris) %in% c("SelCat", "SelNbr")), response = 5, 
                      predictor = c(1:4), meta = NULL)
iris_gpm <- gpm(iris, meta)
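
The resulting object keeps the data and the meta information in separate slots, which can be inspected with standard R functions; a minimal sketch using the data and meta accessors that also appear later in this example:

# Inspect the slots of the new gpm object with base R tools
slotNames(iris_gpm)
str(iris_gpm@data$input)
str(iris_gpm@meta$input, max.level = 1)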

For a first model, resampling folds without nested cross-validation are created and a PLS model is trained with forward feature selection ("ffs").

iris_gpm = createIndexFolds(x = iris_gpm, nested_cv = FALSE)

iris_gpm = trainModel(x = iris_gpm,
                      metric = "Accuracy",
                      n_var = NULL, 
                      mthd = "pls",
                      mode = "ffs",
                      seed_nbr = 11, 
                      cv_nbr = NULL,
                      var_selection = "indv",
                      filepath_tmp = NULL)

# The trained models are stored in the model slot of the gpm object; inspect the
# tuning results and one of the fitted models.
iris_gpm@model$pls_ffs[[1]][[1]]$model$results
iris_gpm@model$pls_ffs[[1]][[2]]$model

# Optional check: class distribution of the hold-out samples in each fold
# for(i in seq(10)){
#   print(table(iris_gpm@data$input$Species[iris_gpm@meta$input$TRAIN_TEST[[1]][[1]]$training_indexOut[[i]]]))
# }

Alternatively, the species column itself can be used as selector. In that case, the meta information and the gpm object are created as follows, and the folds and the model are computed as above.

meta <- createGPMMeta(iris, type = "input",
                      selector = which(names(iris) %in% c("Species")), response = 5, 
                      predictor = c(1:4), meta = NULL)
iris_gpm <- gpm(iris, meta)

The gpm object can also be used as input for computing resamples and for splitting them into training and test datasets, here with nested cross-validation.

iris_gpm <- createFolds(x = iris_gpm, nbr = 10, nested_cv = FALSE)
iris_gpm = createIndexFolds(x = iris_gpm, nested_cv = TRUE)
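
The resampling information is stored in the meta slot of the gpm object; a minimal sketch for checking the fold structure, assuming the TRAIN_TEST paths used later in this document:

# Number of folds and size of the hold-out set per fold (paths follow the
# TRAIN_TEST accessors used elsewhere in this example)
trte <- iris_gpm@meta$input$TRAIN_TEST[[1]][[1]]
length(trte$training_index)
sapply(trte$training_indexOut, length)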

Once the resampling has been defined, model tuning and feature selection are handled by the trainModel function; here a PLS model is trained with recursive feature elimination ("rfe").

iris_gpm = trainModel(x = iris_gpm,
                    n_var = NULL, 
                    mthd = "pls",
                    mode = "rfe",
                    seed_nbr = 11, 
                    cv_nbr = 5,
                    var_selection = "indv",
                    filepath_tmp = NULL)
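
Assuming the trained models are stored under a name that combines the method and the mode (as with pls_ffs above), the rfe results can be inspected in the same way; the pls_rfe slot name below is an assumption based on that pattern.

# Tuning results of the first rfe model (the pls_rfe name is assumed by analogy
# with the pls_ffs slot shown earlier)
iris_gpm@model$pls_rfe[[1]][[1]]$model$results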

Since some information related to the model and its performance is commonly required, the following functions compute the unscaled or scaled importance of the individual explanatory (i.e. independent) variables. Only those variables that have been included at least once in a final model are considered.

# "models" refers to the list of trained models; with the calls above they are
# stored in the model slot of the gpm object (the exact list level expected by
# compVarImp is assumed here).
models <- iris_gpm@model$pls_ffs
var_imp <- compVarImp(models, scale = FALSE)
var_imp_scale <- compVarImp(models, scale = TRUE)
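
To see what compVarImp returned before plotting, the structure of the importance object can be examined with str(); this is a generic base R check, not a gpm function:

# Overview of the variable importance structure returned by compVarImp
str(var_imp, max.level = 2)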

The statistics can further be used to create some plots.

plotVarImp(var_imp)

plotVarImpHeatmap(var_imp_scale, xlab = "Species", ylab = "Variable")

Finally, performance statistics can be computed for regression or, as in this example, classification models.

# Performance statistics of the cross-validated models
tstat <- compContTests(models, mean = TRUE)
summary(tstat[[2]])

# Internal (out-of-bag) error and Kappa of the individual model fits; err.rate,
# ntree and confusion are components of random forest fits, so this part only
# applies if the models were trained with a random forest method.
oob <- unlist(lapply(models[[1]], function(x){x$model$fit$err.rate[x$model$fit$ntree,1]}))
kappa_int <- unlist(lapply(models[[1]], function(x){helpCalcKappa(x$model$fit$confusion)}))
kappa_int <- kappa_int[names(kappa_int) == "Kappa"]

summary(kappa_int)
summary(tstat[[2]]$Kappa)
summary(oob)

boxplot(kappa_int, tstat[[2]]$Kappa, (1-oob),
        names = c("Kappa internal", "Kappa CV", "OOB"),
        sub = "10-fold resampling of the iris data.")

The same workflow can be applied to other datasets. The remaining code is a separate, local example: a previously saved gpm object is loaded (note the machine-specific file path), reduced to its first response variable and to complete cases, and a PLS model is trained with forward feature selection using RMSE as tuning metric.

mrg_tbl_gpm = readRDS("C:/Users/tnauss/permanent/plygrnd/insa/mrg_tbl_gpm_ffs_all_taxa.rds")
responses = mrg_tbl_gpm@meta$input$RESPONSE_FINAL
mrg_tbl_gpm@meta$input$RESPONSE_FINAL = responses[1]
mrg_tbl_gpm@data$input = mrg_tbl_gpm@data$input[complete.cases(mrg_tbl_gpm@data$input[,mrg_tbl_gpm@meta$input$RESPONSE_FINAL]), ]
mrg_tbl_gpm = createIndexFolds(x = mrg_tbl_gpm, nbr = 1)

mrg_tbl_gpm_ffs_model = trainModel(x = mrg_tbl_gpm,
                                  metric = "RMSE",
                                  n_var = NULL,
                                  mthd = "pls",
                                  mode = "ffs",
                                  seed_nbr = 11,
                                  cv_nbr = 5,
                                  var_selection = "indv") 


# Inspect the training data of the first fold: response name, in-fold and
# hold-out sample indices, and a summary of the predictor variables.
response = mrg_tbl_gpm@meta$input$TRAIN_TEST[[1]][[1]]$training$RESPONSE
index = mrg_tbl_gpm@meta$input$TRAIN_TEST[[1]][[1]]$training_index[[1]]
indexOut = mrg_tbl_gpm@meta$input$TRAIN_TEST[[1]][[1]]$training_indexOut[[1]]
mrg_tbl_gpm@data$input[index, response]
class(mrg_tbl_gpm@data$input[index, response])
summary(mrg_tbl_gpm@data$input[index, mrg_tbl_gpm@meta$input$PREDICTOR_FINAL])
class(mrg_tbl_gpm@data$input[index, mrg_tbl_gpm@meta$input$PREDICTOR_FINAL])

