R/gcForest.R
In gcForest: Deep Forest Model

Documented in gcdata gcforest model_load model_save req_py

#' @title R for Deep Forest Model (gcForest)
#' @description gcforest() is based on a Python Deep Forest application programming interface (API). Reference: \url{https://github.com/pylablanche/gcForest}.
#'
#' @name gcforest
#' @aliases gcforest
#' @author Xu Jing
#' @usage gcforest(shape_1X=NA, n_mgsRFtree=30L, window=NA, stride=1L,
#'     cascade_test_size=0.2, n_cascadeRF=2L, n_cascadeRFtree=101L,
#'     cascade_layer=Inf,min_samples_mgs=0.1, min_samples_cascade=0.05,
#'     tolerance=0.0)
#'
#' @param shape_1X int or tuple list or np.array (default=None). Shape of a single sample element [n_lines, n_cols]. Required when calling mg_scanning! For sequence data a single int can be given.
#' @param n_mgsRFtree int (default=30) Number of trees in a Random Forest during Multi Grain Scanning.
#' @param window int (default=None). List of window sizes to use during Multi Grain Scanning. If 'None' no slicing will be done.
#' @param stride int (default=1). Step used when slicing the data.
#' @param cascade_test_size float or int (default=0.2) Split fraction or absolute number for cascade training set splitting.
#' @param n_cascadeRF int (default=2). Number of Random Forests in a cascade layer. For each pseudo Random Forest a complete Random Forest is created, hence the total number of Random Forests in a layer will be 2*n_cascadeRF.
#' @param n_cascadeRFtree int (default=101) Number of trees in a single Random Forest in a cascade layer.
#' @param min_samples_mgs float or int (default=0.1) Minimum number of samples in a node to perform a split during the training of Multi-Grain Scanning Random Forest. If int number_of_samples = int. If float, min_samples represents the fraction of the initial n_samples to consider.
#' @param min_samples_cascade float or int (default=0.05) Minimum number of samples in a node to perform a split during the training of Cascade Random Forest. If int number_of_samples = int. If float, min_samples represents the fraction of the initial n_samples to consider.
#' @param cascade_layer int (default=np.inf) Maximum number of cascade layers allowed. Useful to limit the construction of the cascade.
#' @param tolerance float (default=0.0) Accuracy tolerance for the cascade growth. If the improvement in accuracy is not better than the tolerance the construction is stopped.
#'
#' @details gcForest provides several important function interfaces, just like the style of Python sklearn.
#' \enumerate{
#'     \item \strong{fit(X,y)} Training the gcForest on input data X and associated target y;
#'     \item \strong{predict(X)} Predict the class of unknown samples X;
#'     \item \strong{predict_proba(X)} Predict the class probabilities of unknown samples X;
#'     \item \strong{mg_scanning(X, y=None)} Performs a Multi Grain Scanning on input data;
#'     \item \strong{window_slicing_pred_prob(X, window, shape_1X, y=None)} Performs a window slicing of the input data and send them through Random Forests. If target values 'y' are provided sliced data are then used to train the Random Forests;
#'     \item \strong{cascade_forest(X, y=None)} Perform (or train if 'y' is not None) a cascade forest estimator;
#'     }
#'
#' @import reticulate
#' @import pkgdown
#' @import crayon
#' @import cli
#' @import utils
#'
#' @examples
#'
#' have_numpy <- reticulate::py_module_available("numpy")
#' have_sklearn <- reticulate::py_module_available("sklearn")
#'
#' if(have_numpy && have_sklearn){
#'     library(gcForest)
#'     req_py()
#'
#'     sk <- NULL
#'
#'     .onLoad <- function(libname, pkgname) {
#'         sk <<- reticulate::import("sklearn", delay_load = TRUE)
#'       }
#'
#'     sk <<- reticulate::import("sklearn", delay_load = TRUE)
#'     train_test_split <- sk$model_selection$train_test_split
#'
#'     data <- sk$datasets$load_iris
#'     iris <- data()
#'     X = iris$data
#'     y = iris$target
#'     data_split = train_test_split(X, y, test_size=0.33)
#'
#'     X_tr <- data_split[[1]]
#'     X_te <- data_split[[2]]
#'     y_tr <- data_split[[3]]
#'     y_te <- data_split[[4]]
#'
#'     gcforest_m <- gcforest(shape_1X=4L, window=2L, tolerance=0.0)
#'
#'     gcforest_m$fit(X_tr, y_tr)
#'
#'     pred_X = gcforest_m$predict(X_te)
#'     print(pred_X)
#' }else{
#'     print('You should have the Python testing environment!')
#' }
#'
#'
#' @export gcforest
gcforest <- function(shape_1X=NA, n_mgsRFtree=30L, window=NA, stride=1L,
                     cascade_test_size=0.2, n_cascadeRF=2L, n_cascadeRFtree=101L, cascade_layer=Inf,
                     min_samples_mgs=0.1, min_samples_cascade=0.05, tolerance=0.0){

  # Check the Python requirements before touching the Python side.
  req_py()

  # Locate the Python implementation shipped with the installed package.
  # system.file() returns "" when the file cannot be found, so fail early with
  # a readable message instead of letting source_python() choke on "".
  py_path <- system.file("python", "GCForest.py", package = "gcForest")
  if (!nzchar(py_path)) {
    stop("Cannot find 'python/GCForest.py' in the installed gcForest package.",
         call. = FALSE)
  }

  # Source the Python definitions into a private environment rather than the
  # user's global environment, so calling gcforest() does not create or
  # overwrite objects in the caller's workspace. The sourced file is expected
  # to define the Python class `gcForest`, which is instantiated below.
  py_env <- new.env(parent = emptyenv())
  reticulate::source_python(py_path, envir = py_env)

  message(paste0(crayon::green(cli::symbol$tick),crayon::blue(" Welcome to use gcForest packages! \n"),
                 paste0(crayon::green(cli::symbol$tick),crayon::blue(" read the docs form https://github.com/DataXujing/gcForest_r"))))

  # Forward every argument unchanged to the Python constructor; n_jobs is
  # fixed to 1L and is not exposed through the R interface.
  gcf <- py_env$gcForest(shape_1X=shape_1X, n_mgsRFtree=n_mgsRFtree, window=window, stride=stride,
                         cascade_test_size=cascade_test_size, n_cascadeRF=n_cascadeRF,
                         n_cascadeRFtree=n_cascadeRFtree, cascade_layer=cascade_layer,
                         min_samples_mgs=min_samples_mgs, min_samples_cascade=min_samples_cascade,
                         tolerance=tolerance, n_jobs=1L)
  gcf
}


#' @title Detect Python Module
#' @description  A function to detect Python module.
#' @name req_py
#' @aliases req_py
#' @author Xu Jing
#' @usage req_py()
#'
#' @import reticulate
#' @import pkgdown
#' @import crayon
#' @import cli
#'
#' @export req_py
req_py <- function() {
  # Map each Python import name to the distribution name used for installing.
  # The module is imported as 'sklearn' but is distributed as 'scikit-learn'.
  required <- c(numpy = "numpy", sklearn = "scikit-learn")

  # Check every required module individually so that a single missing module
  # is detected (previously installation only ran when BOTH were missing).
  available <- vapply(names(required), reticulate::py_module_available,
                      logical(1))
  missing_pkgs <- unname(required[!available])

  if (length(missing_pkgs) > 0L) {
    message(paste0(crayon::green(cli::symbol$tick),crayon::blue(" Install the requirement Python module! \n")))
    # Install only what is actually missing.
    reticulate::py_install(missing_pkgs)
  } else {
    message(paste0(crayon::green(cli::symbol$tick),crayon::blue(" It is detected that you have configured the necessary Python module.! \n")))
  }

  # Called for its side effects only.
  invisible(NULL)
}


#' @title R Data Transform to Python Data
#' @description  A function to transform R data structure to Python data structure,
#' which based on the reticulate package.
#' @name gcdata
#' @aliases gcdata
#' @author Xu Jing
#' @usage gcdata(x)
#'
#' @param x  The R object to convert, such as a data.frame, vector, array, etc.
#'
#' @import reticulate
#' @import pkgdown
#' @import crayon
#' @import cli
#'
#' @examples
#'
#' have_numpy <- reticulate::py_module_available("numpy")
#' have_sklearn <- reticulate::py_module_available("sklearn")
#'
#' if(have_numpy && have_sklearn){
#'
#'     library(gcForest)
#'     req_py()
#'
#'     r_dat <- data.frame('x1'=c(1L,2L,3L),'x2'=c(2L,3L,4L))
#'     py_dat <- gcdata(r_dat)
#'     class(py_dat)
#'
#'     r_vec <- c('a','b','c')
#'     py_vec <- gcdata(r_vec)
#'     class(py_vec)
#' }else{
#'     print('You should have the Python testing environment!')
#' }
#'
#'
#' @export gcdata
gcdata <- function(x){
  # Convert the R object to its Python counterpart via reticulate and return
  # the converted object.
  #
  # On failure, raise an informative error that includes the underlying
  # cause. Previously a `finally = return(y)` clause ran even when the
  # conversion failed and `y` had never been assigned, so callers saw
  # "object 'y' not found" and the handler's message was discarded.
  tryCatch(
    reticulate::r_to_py(x),
    error = function(e) {
      stop("Your input data is wrongful, please check! ",
           conditionMessage(e), call. = FALSE)
    }
  )
}





#' @title gcForest Model Persistence Function
#' @description  A wrapper around the scikit-learn (joblib) persistence API that saves a trained model
#' to disk so that it can be loaded later for prediction from R. See also \code{\link{model_load}}
#' @name model_save
#' @aliases model_save
#' @author Xu Jing
#' @usage model_save(model,path)
#'
#' @param model The train model,like gcforest(see also \code{\link{gcforest}}).
#' @param path The path to save model.
#'
#' @import reticulate
#' @import pkgdown
#' @import crayon
#' @import cli
#'
#' @examples
#'
#' have_numpy <- reticulate::py_module_available("numpy")
#' have_sklearn <- reticulate::py_module_available("sklearn")
#'
#' if(have_numpy && have_sklearn){
#'     library(gcForest)
#'     req_py()
#'
#'     sk <- NULL
#'
#'     .onLoad <- function(libname, pkgname) {
#'         sk <<- reticulate::import("sklearn", delay_load = TRUE)
#'      }
#'     sk <<- reticulate::import("sklearn", delay_load = TRUE)
#'     train_test_split <- sk$model_selection$train_test_split
#'
#'     data <- sk$datasets$load_iris
#'     iris <- data()
#'     X = iris$data
#'     y = iris$target
#'     data_split = train_test_split(X, y, test_size=0.33)
#'
#'     X_tr <- data_split[[1]]
#'     X_te <- data_split[[2]]
#'     y_tr <- data_split[[3]]
#'     y_te <- data_split[[4]]
#'
#'     gcforest_m <- gcforest(shape_1X=4L, window=2L, tolerance=0.0)
#'     gcforest_m$fit(X_tr, y_tr)
#'     gcf_model <- model_save(gcforest_m,'gcforest_model.model')
#'
#'     gcf <- model_load('gcforest_model.model')
#'     gcf$predict(X_te)
#'
#' }else{
#'     print('You should have the Python testing environment!')
#' }
#'
#'
#'
#' @export model_save
model_save <- function(model,path){

  # Prefer the standalone 'joblib' module: recent scikit-learn releases no
  # longer ship 'sklearn.externals.joblib'. Fall back to the legacy location
  # only when standalone joblib is not importable.
  joblib_r <- if (reticulate::py_module_available("joblib")) {
    reticulate::import("joblib")
  } else {
    reticulate::import("sklearn")$externals$joblib
  }

  # Write the model first so the success message is only printed when the
  # dump actually succeeded.
  saved <- joblib_r$dump(model, path)

  # '\n' (not '/n') so the message ends with a real newline.
  cat(paste0(crayon::green(cli::symbol$tick)," Model has been saved in: ",path,'\n'))

  # Return whatever joblib's dump() returned, as before.
  saved
}



#' @title gcForest Model Persistence Function
#' @description  A wrapper around the scikit-learn (joblib) persistence API that loads a previously saved
#' model so that it can be used for prediction from R. See also \code{\link{model_save}}
#' @name model_load
#' @aliases model_load
#' @author Xu Jing
#' @usage model_load(path)
#'
#' @param path The path of the saved model to load (see also \code{\link{model_save}}).
#'
#' @import reticulate
#' @import pkgdown
#' @import crayon
#' @import cli
#'
#' @examples
#'
#' have_numpy <- reticulate::py_module_available("numpy")
#' have_sklearn <- reticulate::py_module_available("sklearn")
#'
#' if(have_numpy && have_sklearn){
#'     library(gcForest)
#'     req_py()
#'
#'     sk <- NULL
#'
#'     .onLoad <- function(libname, pkgname) {
#'         sk <<- reticulate::import("sklearn", delay_load = TRUE)
#'       }
#'     sk <<- reticulate::import("sklearn", delay_load = TRUE)
#'     train_test_split <- sk$model_selection$train_test_split
#'
#'     data <- sk$datasets$load_iris
#'     iris <- data()
#'     X = iris$data
#'     y = iris$target
#'     data_split = train_test_split(X, y, test_size=0.33)
#'
#'     X_tr <- data_split[[1]]
#'     X_te <- data_split[[2]]
#'     y_tr <- data_split[[3]]
#'     y_te <- data_split[[4]]
#'
#'     gcforest_m <- gcforest(shape_1X=4L, window=2L, tolerance=0.0)
#'     gcforest_m$fit(X_tr, y_tr)
#'     gcf_model <- model_save(gcforest_m,'gcforest_model.model')
#'
#'     gcf <- model_load('gcforest_model.model')
#'     gcf$predict(X_te)
#'
#' }else{
#'     print('You should have the Python testing environment!')
#' }
#'
#'
#'
#' @export model_load
model_load <- function(path){
  # Prefer the standalone 'joblib' module: recent scikit-learn releases no
  # longer ship 'sklearn.externals.joblib'. Fall back to the legacy location
  # only when standalone joblib is not importable.
  joblib_r <- if (reticulate::py_module_available("joblib")) {
    reticulate::import("joblib")
  } else {
    reticulate::import("sklearn")$externals$joblib
  }

  # Load first so the confirmation is only printed when loading succeeded.
  loaded <- joblib_r$load(path)

  # '\n' (not '/n') so the message ends with a real newline.
  cat(paste0(crayon::green(cli::symbol$tick)," Model has been load from: ",path,'\n'))

  # Return the deserialized model object.
  loaded
}