# 25 Oct 2019: 'fitted', 'features.include' and 'transformer' added to reserved_words
# Todos:
# 1- Add imputer module to config to impute for missing values
# look at Recipes: https://conf20-intro-ml.netlify.app/slides/06-recipes.html#2
# 2- Add upsampling and downsampling modules
library(magrittr)
#' Abstract Model Class
#' @description Reference Class \code{MODEL} is at the highest level of the hierarchy in a series of classes defined in the \code{rml} package.
#' All transformers, classifiers, regressors and survival models inherit from this class.
#'
#' @field name character containing name of the model
#'
#' @field type character containing the model type; for transformers, specifies the transformer type.
#' @field package character specifying the name of the package the model uses.
#' @field package_language character specifying the programming language of the package, either R or Python.
#' @field description character describing what the model does.
#' @field config list containing all parameters and settings of the model. Values depend on the model class.
#' @field fitted logical specifies if the model is fitted (trained) or not
#'
#' @field transformers list containing all transformer objects. Each transformer is an instance inheriting from MODEL class.
#' @field gradient_transformers list containing all gradient transformer objects. Each gradient transformer is an instance inheriting from MODEL class.
#' @field yin_transformers list of functions applied to the target variable before fitting.
#' @field yout_transformers list of functions applied to the model output after prediction.
#' @field objects list containing additional data
#' @field objects$features data.frame containing list of features and their characteristics. This table is created after fitting the model.
#' @field objects$fitting_time POSIXct containing time when the model's training is complete. Created after the model is fitted.
#'
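#' @examples
#' \dontrun{
#' # A minimal sketch of intended usage (MODEL itself is abstract; concrete
#' # subclasses implement model.fit and model.predict). Extra named settings
#' # passed via ... are collected into config. X_train, y_train and X_test
#' # are hypothetical datasets:
#' mdl = MODEL(name = 'my_model', metrics = 'pearson')
#' mdl$fit(X_train, y_train)
#' yhat = mdl$predict(X_test)
#' }
#'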
#' @export MODEL
#' @exportClass MODEL
MODEL = setRefClass('MODEL',
fields = list(name = "character", type = "character", package = "character", package_language = "character", packages_required = "character", description = "character", reserved_words = "character",
config = "list", fitted = 'logical', transformers = 'list', gradient_transformers = 'list', yin_transformers = 'list', yout_transformers = 'list', objects = "list"),
methods = list(
initialize = function(..., name = character(), transformers = list(), gradient_transformers = list(), mother = NULL, features = NULL, pupils = NULL){
callSuper(name = name)
settings = list(...)
ns = names(settings)
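# If a 'config' list was passed among the settings, merge its elements into the
# top-level settings (via the list-merge operator %<==>%) and drop the nested list: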
for (sn in ns){
set = settings[[sn]]
if(inherits(set, 'list') & sn == 'config') {
settings = settings %<==>% set
settings[[sn]] <- NULL
}
}
packages_required <<- c('magrittr', 'dplyr', 'rutils', 'rml', 'rbig', 'reticulate')
reserved_words <<- c('keep_columns', 'keep_features',
'name_in_output', 'metrics',
'cv.ntrain', 'cv.ntest', 'cv.test_ratio','cv.train_ratio', 'cv.split_method',
'cv.reset_transformer', 'cv.restore_model', 'cv.set',
'sfs.enabled',
'fe.enabled','fe.recursive', 'fe.importance_threshold', 'fe.quantile',
'mc.enabled', 'mc.num_cores',
'remove_failed_transformers',
'pp.coerce_integer_features',
'pp.trim_outliers', 'pp.trim_outliers.adaptive', 'pp.trim_outliers.recursive', 'pp.trim_outliers.sd_threshold',
'pp.mask_missing_values',
'model.class', 'model.config', 'model.module',
'eda.enabled',
'verbose', 'pass_columns', 'remove_columns', 'name', 'column_filters',
# smp.enabled: logical parameter, default FALSE. Enables sampling of the training rows.
# If FALSE (default), the model is trained on the entire training data table (X) unchanged.
'smp.enabled', 'smp.class_ratio', 'smp.sample_ratio', 'smp.num_rows', 'smp.method', 'smp.config',
'features.include', 'features.include.at', 'features.exclude', 'features.exclude.at',
'ts.enabled', 'ts.id_col', 'ts.time_col',
'return_features', 'feature_subset_size', 'gradient_transformers_aggregator', 'save_predictions',
'yin_transformer_function', 'yin_transformer_arguments', 'yout_transformer_function', 'yout_transformer_arguments',
'metric', 'transformers', 'fitted')
for(pn in c('keep_columns', 'keep_features', 'cv.restore_model', 'fe.enabled', 'mc.enabled',
'pp.coerce_integer_features', 'pp.trim_outliers',
'eda.enabled', 'smp.enabled', 'ts.enabled', 'save_predictions')){
settings[[pn]] <- verify(settings[[pn]], 'logical', lengths = 1, domain = c(T,F), default = F)
}
# Note: if a transformer fails to fit, config parameter 'remove_failed_transformers' determines whether the failed transformer is removed.
# If it is set to FALSE, the failed transformer remains among the transformers.
# Then, as part of the fitting procedure, the predict() method of the failed transformer is called to transform the training set.
# This call fails, but it does not stop the process; it simply generates no columns in the transformed output.
# If another transformer has fitted successfully, the transformed table still contains some columns and the main model is fitted on those.
# At prediction time, since the failed transformer still exists and is not fitted, its fit() method is called as part of the
# predict procedure of the main model. The failed transformer is then fitted on the test set, which is not right; however,
# even if it fits successfully and generates columns in the final output, those columns are not read by the main model
# at prediction because it only reads the columns specified in the objects$features table, so no problem arises.
# The advantage of setting this parameter to FALSE is that after resetting the model and fitting it on a different dataset,
# the failed transformer may succeed this time and be used by the main model.
for(pn in c('cv.reset_transformer', 'name_in_output', 'remove_failed_transformers')){
settings[[pn]] <- verify(settings[[pn]], 'logical', lengths = 1, domain = c(T,F), default = T)
}
settings$verbose <- verify(settings$verbose, c('numeric', 'integer'), lengths = 1, default = 1) %>% as.integer
settings$column_filters <- verify(settings$column_filters, 'list', default = list(list(column = 'n_unique', filter = ' > 1')))
settings$metrics <- verify(settings$metrics, 'character', default = 'pearson')
settings$cv.split_method <- verify(settings$cv.split_method, 'character', lengths = 1, default = 'shuffle')
settings$cv.ntest <- verify(settings$cv.ntest , c('numeric', 'integer'), lengths = 1, domain = c(1, Inf), default = 5) %>% as.integer
settings$cv.ntrain <- verify(settings$cv.ntrain, c('numeric', 'integer'), lengths = 1, domain = c(1, Inf), default = 1) %>% as.integer
settings$cv.train_ratio <- verify(settings$cv.train_ratio, c('numeric', 'integer'), lengths = 1, domain = c(0, 1), default = 0.5) %>% as.numeric
settings$cv.test_ratio <- verify(settings$cv.test_ratio, c('numeric', 'integer') , lengths = 1, domain = c(0, 1), default = 0.2) %>% as.numeric
settings$fe.importance_threshold <- verify(settings$fe.importance_threshold, c('numeric', 'integer') , lengths = 1, default = 0) %>% as.numeric
# if(is.null(settings$ts.id_col)){settings$ts.id_col = 'caseID'}
# if(is.null(settings$ts.time_col)){settings$ts.time_col = 'time'}
if(is.null(settings$gradient_transformers_aggregator)){settings$gradient_transformers_aggregator = mean}
config <<- settings
fitted <<- FALSE
transformers <<- transformers %>% {if(inherits(.,'list')) . else list(.)}
gradient_transformers <<- gradient_transformers %>% {if(inherits(.,'list')) . else list(.)}
objects$mother <<- mother
objects$features <<- features
objects$pupils <<- pupils
},
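# Persistence helpers: keep_model() and retrieve_model() are no-ops here and are
# presumably overridden by subclasses that offload a large fitted object to a
# temporary file and restore it; release_model() deletes any temporary file
# referenced by objects$model_filename. All three recurse into (gradient) transformers.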
retrieve_model = function(){
for(tr in transformers){tr$retrieve_model()}
for(tr in gradient_transformers){tr$retrieve_model()}
},
keep_model = function(){
for(tr in transformers){tr$keep_model()}
for(tr in gradient_transformers){tr$keep_model()}
},
# deletes temporary file if model object is temporarily saved
release_model = function(){
if(!is.null(objects$model_filename)){
if(file.exists(objects$model_filename)){
unlink(objects$model_filename)
}
objects$model_filename <<- NULL
}
for(tr in transformers){tr$release_model()}
for(tr in gradient_transformers){tr$release_model()}
},
reset = function(reset_transformers = T, reset_gradient_transformers = T, set_features.include = T){
fitted <<- FALSE
if(set_features.include & (length(transformers) == 0)){
config$features.include <<- objects$features$fname
}
objects$features <<- NULL
objects$model <<- NULL
if(!is.null(objects$model_filename)){
unlink(objects$model_filename)
objects$model_filename <<- NULL
}
objects$saved_pred <<- NULL
if (reset_transformers & !is.empty(transformers)){
for (transformer in transformers) transformer$reset(reset_transformers = T, reset_gradient_transformers = reset_gradient_transformers, set_features.include = set_features.include)
}
if (reset_gradient_transformers & !is.empty(gradient_transformers)){
for (transformer in gradient_transformers) transformer$reset(reset_transformers = reset_transformers, reset_gradient_transformers = T, set_features.include = set_features.include)
}
},
info.features = function(include_transformers = T){
fet = objects$features
if(include_transformers){
for(tr in transformers){
fet %<>% rbind(tr$info.features())
}
}
return(fet)
},
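# Prediction caching: keep_prediction() stores a cheap fingerprint of the numeric
# feature table (column sums, row sums and a random sample of rows) together with the
# last output; canuse_saved_prediction() returns TRUE only if a new input matches all
# three, in which case predict() reuses the saved output instead of re-scoring.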
canuse_saved_prediction = function(X){
permit = F
if(!is.null(objects$saved_pred)){
X = X[numerics(X)]
permit = identical(colSums(X), objects$saved_pred$CSUMS)
permit = permit & identical(rowSums(X), objects$saved_pred$RSUMS)
permit = permit & identical(X[objects$saved_pred$RNDM, ], objects$saved_pred$XSAMP)
}
return(permit)
},
keep_prediction = function(X, Y){
X = X[numerics(X)]
rnd = sequence(nrow(X)) %>% sample(size = ncol(X), replace = T)
objects$saved_pred <<- list(CSUMS = colSums(X), RSUMS = rowSums(X), XOUT = Y, RNDM = rnd, XSAMP = X[rnd,])
},
predict = function(X){
if(!fitted) stop(paste('from', name, 'of type', type, ':', 'Model not fitted!', '\n'))
if(inherits(X, 'matrix')){X %<>% as.data.frame}
XORG = transform_x(X)
zero = objects$features$fname %-% rbig::colnames(XORG)
if(length(zero) > 0){
for(z in zero) XORG[,z] <- 0
cat('Warning from ', name, ': Features ', paste(zero, collapse = ','), ' were not generated by transformers! Filled with zero.', '\n')
}
XFET = XORG[objects$features$fname]
cusp = config$save_predictions
if(cusp){
cusp = canuse_saved_prediction(XFET)
}
if(cusp){
XOUT = objects$saved_pred$XOUT
} else {
# .self$retrieve_model()
if(!is.null(config$pp.mask_missing_values)){
# This will edit the table, so WideTables cannot be used yet.
if(inherits(XFET, 'WIDETABLE')){XFET = rbig::as.data.frame(XFET)}
for(i in sequence(ncol(XFET))){
wna = which(is.na(XFET[[i]]))
if(length(wna) > 0){
XFET[wna, i] <- config$pp.mask_missing_values[1] %>% rutils::coerce(class(XFET[wna, i])[1])
}
}
}
XOUT = .self$model.predict(XFET)
if(config$save_predictions){
keep_prediction(XFET, XOUT)
}
}
XOUT = transform_yout(X, XOUT)
colnames(XOUT)[which(colnames(XOUT) %in% c('.'))] <- ''
if((ncol(XOUT) > 0) & config$name_in_output) colnames(XOUT) <- name %>% paste(colnames(XOUT), sep = ifelse(colnames(XOUT) == '', '', '_'))
objects$n_output <<- ncol(XOUT)
treat(XOUT, XFET, XORG)
},
treat = function(out, fet, org){
if(!is.null(config$pass_columns)) org = org[config$pass_columns]
if(!is.null(config$remove_columns)) org = org[colnames(org) %-% config$remove_columns]
if(config$keep_columns){
if(config$keep_features){
return(cbind(org, out))
} else {
return(cbind(org %>% column_drop(colnames(fet)), out))
}
} else if (config$keep_features){
# if(!is.null(config$features.include)){
# extra = config$features.include %-% colnames(fet)
# return(cbind(fet, org[extra], out))
# } else return(cbind(fet, out))
return(cbind(fet, out))
} else {
return(out)
}
},
# Fitting with feature elimination (FE). Works only for model fitters which generate feature importance
fit.fe = function(X, y){
.self$model.fit(X, y)
if('importance' %in% colnames(objects$features)){
if(!is.null(config$fe.quantile)){
threshold = objects$features$importance %>% quantile(probs = config$fe.quantile)
} else {
threshold = config$fe.importance_threshold
}
fns = objects$features$fname
ftk = fns[which(objects$features$importance > threshold)] # features to keep
fte = fns %-% ftk
while(length(fte) > 0){
objects$features <<- objects$features %>% filter(fname %in% ftk)
if(is.empty(objects$features)){
# .self$fit.distribution(X, y); fte = c()
stop('No features left for training the model!')
} else {
.self$model.fit(X[objects$features$fname], y)
fns = objects$features$fname
ftk = fns[which(objects$features$importance > threshold)] # features to keep
fte = chif(config$fe.recursive, fns %-% ftk, c())
}
}
if(is.empty(objects$features)){
cat(name, ': ', 'No features left after elimination! Distribution fitted for output variable.', '\n')
}
}
},
fit.quad = function(X, y){
fit.fe(X, y)
clmns = rbig::colnames(X)
XP = X[objects$features$fname]
for(i in 1:ncol(X)){
v = X[,i]
Xi <- X %>% as.matrix %>% apply(2, function(u) u*v) %>% as.data.frame %>% {colnames(.) <- clmns %>% paste(clmns[i], sep = 'X');.}
XP = XP %>% cbind(Xi)
objects$features <<- rbig::colnames(XP) %>% sapply(function(i) XP %>% pull(i) %>% class) %>% as.data.frame %>% {colnames(.)<-'fclass';.} %>% rownames2Column('fname') %>% mutate(fname = as.character(fname), fclass = as.character(fclass))
fit.fe(XP, y)
XP = XP[objects$features$fname]
}
},
fit.distribution = function(X = NULL, y){
out = outliers(y); while(!is.empty(out)){y = y[-out]; out = outliers(y)}
objects$model <<- list(family = 'normal', mean = mean(y, na.rm = T), sd = sd(y, na.rm = T))
},
predict.distribution = function(X){
N = try(nrow(X), silent = T)
if(!inherits(N, 'integer')) N = as.integer(N)
# rnorm(N, objects$model$mean, objects$model$sd) %>% as.data.frame
rep(objects$model$mean, N) %>% as.data.frame
},
fit = function(X, y = NULL){
if(!fitted){
if(inherits(X, 'matrix')){X %<>% as.data.frame}
y = transform_yin(X, y)
if(!is.null(config[['features.include']])){X = X[config$features.include %^% rbig::colnames(X)]}
if(!is.null(config[['features.exclude']])){X = X[rbig::colnames(X) %-% config$features.exclude]}
X = transform_x(X, y)
if(!is.null(config[['features.include.at']])){X = X[config$features.include.at %^% rbig::colnames(X)]}
if(!is.null(config[['features.exclude.at']])){X = X[rbig::colnames(X) %-% config$features.exclude.at]}
if(config$smp.enabled){
smp_method <- verify(config$smp.method, 'character', domain = c('upsample', 'downsample', 'smote'), default = 'upsample')
actual_ratio = mean(y)
n_train_rows = length(y)
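# Sampling counts below follow from the target positive-class ratio rr = n1/(n0 + n1):
# solving for the opposite class gives n0 = n1*(1 - rr)/rr and n1 = n0*rr/(1 - rr),
# which is exactly what the downsampling/upsampling branches draw.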
if(!is.null(verify(config$smp.class_ratio, 'numeric', domain = c(.Machine$double.eps, 1 - .Machine$double.eps), null_allowed = T))){
if(smp_method == 'downsample'){
if(config$smp.class_ratio >= actual_ratio){
# downsample negative class, keep all positive class:
w1 = which(y == 1)
n1 = length(w1)
rr = config$smp.class_ratio
w0 = which(y == 0) %>% sample(as.integer(n1*(1.0 - rr)/rr))
} else {
# downsample positive class, keep all negative class:
w0 = which(y == 0)
n0 = length(w0)
rr = config$smp.class_ratio
w1 = which(y == 1) %>% sample(as.integer(n0*(1.0 - rr)/rr))
}
} else if(smp_method == 'upsample'){
if(config$smp.class_ratio >= actual_ratio){
# upsample positive class, keep all negative class:
w0 = which(y == 0)
n0 = length(w0)
rr = config$smp.class_ratio
w1 = which(y == 1) %>% sample(as.integer(n0*rr/(1.0 - rr)), replace = T)
} else {
# upsample negative class, keep all positive class:
w1 = which(y == 1)
n1 = length(w1)
rr = config$smp.class_ratio
w0 = which(y == 0) %>% sample(as.integer(n1*(1.0 - rr)/rr), replace = T)
}
} else if (smp_method == 'smote'){
sv = reticulate::import('smote_variants')
smote_model = verify(config$smp.config[['model']], 'character', domain = names(sv), lengths = 1, default = 'distance_SMOTE')
config$smp.config[['proportion']] <<- verify(config$smp.config[['proportion']], 'numeric', lengths = 1, default = config$smp.class_ratio/(1.0 - config$smp.class_ratio))
oversampler = do.call(sv[[smote_model]], config$smp.config %>% list.remove('model'))
res = oversampler$sample(X %>% as.matrix, y)
X = res[[1]] %>% as.data.frame %>% {colnames(.) <- colnames(X);.}
y = res[[2]] %>% as.numeric
w0 = which(y == 0)
w1 = which(y == 1)
} else {stop('Unknown sampling method specified!')}
ind = c(w0, w1) %>% sample(length(c(w0, w1)))
# X = X[ind,]; y = y[ind]
n_train_rows = length(ind)
} else {ind = sequence(length(y))}
n_rows = verify(config$smp.num_rows, c('numeric', 'integer'), lengths = 1, domain = c(1, Inf), default = n_train_rows) %>%
min(n_train_rows) %>% {.*verify(config$smp.sample_ratio, 'numeric', lengths = 1, domain = c(0, 1), default = 1.0)} %>% as.integer
ind_2 = n_train_rows %>% sequence %>% sample(n_rows)
X = X[ind[ind_2],]; y = y[ind[ind_2]]
}
# This code chunk must remain exactly here; do not relocate it.
if(config$pp.coerce_integer_features){
X %<>% int_ordinals
}
assert(ncol(X) > 0, 'No column found in the training dataset!')
if(inherits(X, 'WIDETABLE')){
objects$features <<- X$meta %>% distinct(column, .keep_all = T) %>% select(fname = column, fclass = class, n_unique = n_unique)
} else {
objects$features <<- colnames(X) %>% sapply(function(i) X %>% pull(i) %>% class) %>% as.data.frame %>% {colnames(.)<-'fclass';.} %>% rownames2Column('fname') %>% mutate(fname = as.character(fname), fclass = as.character(fclass))
}
nums = objects$features$fclass %in% c('numeric', 'integer')
cats = objects$features$fclass %in% c('character', 'integer', 'factor')
if((sum(nums) > 0) & (config$eda.enabled)){
numinfo = feature_info_numeric(X)
objects$features <<- objects$features %>% left_join(numinfo, by = 'fname')
}
if((sum(cats) > 0) & (config$eda.enabled)){
catinfo = feature_info_categorical(X)
objects$features <<- objects$features %>% merge(catinfo, all = T)
}
if(inherits(config$column_filters, 'list')){
for(fitem in config$column_filters){
verify(fitem, 'list', names_include = c('column', 'filter'))
assert(is.character(fitem$column), "Element 'column' must be character. %s was of class %s!" %>% sprintf(fitem$column, class(fitem$column)[1]))
assert(is.character(fitem$filter), "Element 'filter' must be character. %s was of class %s!" %>% sprintf(fitem$filter, class(fitem$filter)[1]))
if((fitem$column == 'n_unique') & !('n_unique' %in% colnames(objects$features))){
objects$features$n_unique <<- rbig::colnames(X) %>% sapply(function(x) X %>% pull(x) %>% unique %>% length) %>% unlist
}
script = 'objects$features'
script %<>% paste("filter(%s %s)" %>% sprintf(fitem$column, fitem$filter), sep = " %>% ")
objects$features <<- parse(text = script) %>% eval
}
X = X[objects$features$fname]
}
assert(nrow(objects$features) > 0, 'No features left for training!')
# Apply preprocessing. (Do not move preprocessing into the transform_x method: it causes problems with WideTables, and these steps should not be applied to the test set. mask_missing_values is the exception; it is applied to the test set in the predict method.)
# todo: missing values should be treated differently for each column or groups of columns or classes of columns
# todo: missing values could be imputed by aggregations of non-missing values like mean, median or most_frequent
if(!is.null(config$pp.mask_missing_values)){
# This will edit the table, so WideTables cannot be used yet.
if(inherits(X, 'WIDETABLE')){X = rbig::as.data.frame(X)}
for(i in sequence(ncol(X))){
wna = which(is.na(X[[i]]))
if(length(wna) > 0){
X[wna, i] <- config$pp.mask_missing_values[1] %>% rutils::coerce(class(X[wna, i])[1])
}
}
}
# todo: remove outliers can be considered as a transformer as well.
# Add transformers (transformer type: preprocessor)
# This way of outlier trimming is applied only on the training set and not on the test set
# If you use outlier trimmer as a transformer, it will be applied on both test and train
if(config$pp.trim_outliers){
adapt = verify(config$pp.trim_outliers.adaptive, 'logical', default = F)
recur = verify(config$pp.trim_outliers.recursive, 'logical', default = F)
sdcut = verify(config$pp.trim_outliers.sd_threshold, 'numeric', domain = c(0,Inf), default = 4)
X %<>% trim_outliers(sd_threshold = sdcut, adaptive = adapt, recursive = recur)
}
if(config$fe.enabled) {
config$fe.recursive <<- verify(config$fe.recursive, 'logical', lengths = 1, domain = c(T,F), default = F)
fit.fe(X, y)
} else {.self$model.fit(X, y)}
# todo: if(config$quad.enabled) {fit.quad(X, y)}
}
fitted <<- TRUE
objects$fitting_time <<- Sys.time()
},
transform_x = function(X, y = NULL){
nt = length(transformers)
if(nt > 0){
## Fitting:
transformers <<- fit_models(transformers, X = X, y = y, num_cores = config$mc.num_cores, verbose = config$verbose,
remove_failed_models = config$remove_failed_transformers)
# # (uft: unfitted transformers)
# uft = transformers %>% lapply(function(x) {!x$fitted}) %>% unlist %>% which
# num_cores = 1
# if(config$mc.enabled){
# ncores_available = parallel::detectCores()
# num_cores <- verify(config$mc.num_cores, c('numeric', 'integer'), lengths = 1, domain = c(1, ncores_available), default = ncores_available - 1) %>% as.integer
# }
# if(config$mc.enabled & (num_cores > 1) & (length(uft) > 1)){
# requirements = transformers %>% lapply(function(x) x$packages_required) %>% unlist %>% unique
# cl = rutils::setup_multicore(n_jobs = num_cores)
# if(!config$silent){cat('\n', 'Fitting %s transformers ... ' %>% sprintf(length(uft)))}
# transformers <<- foreach(transformer = transformers,
# .combine = c, .packages = requirements,
# .errorhandling = rutils::chif(config$remove_failed_transformers, 'remove', 'stop')) %dopar% {
# if(!transformer$fitted){
# transformer$fit(X, y)
# }
# gc()
# list(transformer)
# }
# stopCluster(cl)
# gc()
# if(!config$silent){cat('Done!')}
# } else {
# for(i in uft){
# transformer = transformers[[i]]
# if(!config$silent){
# cat('\n', 'Fitting transformer %s of type %s: %s ... ' %>% sprintf(transformer$name, transformer$type, transformer$description))
# }
# res = try(transformer$fit(X, y), silent = config$remove_failed_transformers)
# if(!config$silent){
# rutils::warnif(inherits(res, 'try-error'), sprintf("Transformer %s failed to fit!", transformer$name), as.character(res))
# cat('Done!')
# }
# }
# }
# ## Remove unfitted transformers:
# # ft: fitted transformers
# ft = transformers %>% lapply(function(x) {x$fitted}) %>% unlist %>% which
# nt = length(ft)
# warnif(nt < length(transformers), sprintf("%s transformers failed to fit and removed!", length(transformers) - nt))
# transformers <<- transformers %>% list.extract(ft)
# assert(nt == length(transformers))
nt = length(transformers)
}
if(nt > 0){
## Prediction:
XT = predict_models(transformers, X = X, num_cores = config$mc.num_cores, verbose = config$verbose)
# if(config$mc.enabled & (num_cores > 1) & (nt > 1)){
# requirements = transformers %>% lapply(function(x) x$packages_required) %>% unlist %>% unique
# cl = rutils::setup_multicore(n_jobs = num_cores)
# if(!config$silent){cat('\n', 'Generate transformed columns from %s transformers ... ' %>% sprintf(nt))}
# XT = foreach(transformer = transformers, .combine = cbind, .packages = requirements,
# .errorhandling = rutils::chif(config$remove_failed_transformers, 'remove', 'stop')) %dopar% {
# gc()
# transformer$predict(X)
# }
# stopCluster(cl)
# gc()
# if(!config$silent){
# cat('Done!')
# }
# } else {
# for(i in sequence(nt)){
# transformer = transformers[[i]]
# if(!config$silent){
# cat('\n', 'Generate transformed columns from transformer %s ... ' %>% sprintf(transformer$name))
# }
# if(i == 1){
# XT = transformer$predict(X)
# } else {
# XT = cbind(XT, transformer$predict(X) %>% {.[colnames(.) %-% colnames(XT)]})
# }
# if(!config$silent){
# cat('Done!')
# }
# }
} else {XT = X}
return(XT)
},
y_gradient = function(X, y = NULL){
nt = length(gradient_transformers)
if(nt > 0){
for(i in sequence(nt)){
transformer = gradient_transformers[[i]]
if(inherits(transformer, 'character')){
assert(transformer %in% rbig::colnames(X))
if(i == 1){
YT = X[transformer]
} else {
YT = cbind(YT, X[transformer])
}
} else if (inherits(transformer, 'MODEL')){
if(!transformer$fitted) {transformer$fit(X, y)}
if(i == 1){
YT = transformer$predict(X)
} else {
YT = cbind(YT, transformer$predict(X) %>% {.[colnames(.) %-% colnames(YT)]})
}
}
yt = YT %>% as.matrix %>% apply(1, config$gradient_transformers_aggregator)
}
} else {yt = 0}
return(yt)
},
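# Gradient-transformer flow: transform_yin() attaches the row-wise aggregated gradient
# (see config$gradient_transformers_aggregator, default mean) as an attribute on y
# before fitting, and transform_yout() adds the same gradient back onto every numeric
# output column at prediction time (a residual-style correction).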
transform_yin = function(X, y){
grad = y_gradient(X, y)
if(sum(abs(grad)) > .Machine$double.eps){
attr(y, 'gradient') <- grad
}
if(!is.null(config$yin_transformer_function)){
y = do.call(what = config$yin_transformer_function, args = list(y, config$yin_transformer_arguments))
}
return(y)
},
transform_yout = function(X, Y){
if(!is.null(config$yout_transformer_function)){
Y = do.call(what = config$yout_transformer_function, args = list(Y, config$yout_transformer_arguments))
}
if(length(gradient_transformers) > 0){
grad = y_gradient(X)
for(i in numerics(Y)){
Y[, i] <- Y[, i] + grad
}
# todo: apply y_transformers (list of functions)
}
return(Y)
},
performance.fit = function(){},
info.transformer_count = function(){
cnt = 1
for(tr in transformers){
cnt = cnt + tr$info.transformer_count()
}
return(cnt)
},
info.transformer_names = function(){
mdlns = name
for(tr in transformers){
mdlns = c(mdlns, tr$info.transformer_names())
}
return(mdlns %>% unique)
},
info.edge_list = function(first = T){
edgelist = NULL
if(is.empty(transformers)){
edgelist %<>% rbind(c('INPUT', name))
} else {
for(tr in transformers){
edgelist %<>% rbind(c(tr$name, name))
edgelist %<>% rbind(tr$info.edge_list(first = F))
}
}
if(first){
edgelist %<>% rbind(c(name, 'OUTPUT'))
}
return(edgelist)
},
info.model = function(){
info = list(name = name, type = type, class = class(.self)[1], description = description, package = package,
language = package_language, outputs = objects$n_output, fitted = fitted) %<==>%
list.extract(config, c('keep_columns', 'keep_features', 'smp.enabled', 'smp.class_ratio', 'smp.sample_ratio',
'smp.method', 'fe.enabled', 'metric')) %>% list.clean
return(info)
},
info.transformers = function(){
tbl = do.call(data.frame, info.model() %>% list.add(stringsAsFactors = F))
for(tr in transformers){
tbl_tr = do.call(data.frame, tr$info.model() %>% list.add(stringsAsFactors = F))
tbl %<>% rbind(tbl_tr)
}
return(tbl)
},
plot.network = function(plotter = 'visNetwork', ...){
nodes = .self$info.transformers()
rownames(nodes) <- nodes$name
nodes['INPUT', 'name'] <- 'INPUT'
nodes['INPUT', 'type'] <- 'Data'
nodes['INPUT', 'class'] <- 'INPUT'
nodes['INPUT', 'description'] <- 'Features'
nodes['OUTPUT', 'name'] <- 'OUTPUT'
nodes['OUTPUT', 'type'] <- 'Final Prediction'
nodes['OUTPUT', 'class'] <- 'OUTPUT'
nodes['OUTPUT', 'description'] <- ''
nodes$outputs[is.na(nodes$outputs)] <- '?'
nodes[is.na(nodes)] <- 'NULL'
nodes$type %<>% as.factor
nodes$description = paste(nodes$class, nodes$description, nodes$type, sep = '\n')
links = .self$info.edge_list() %>% {colnames(.) <- c('source', 'target');.} %>% as.data.frame %>%
mutate(source = as.character(source), target = as.character(target)) %>%
left_join(nodes %>% select(source = name, outputs), by = 'source')
rvis::viserPlot(list(nodes = nodes, links = links), source = 'source', target = 'target', linkWidth = 'outputs', key = 'name', label = 'description',
color = 'type', config = list(...), linkLabel = 'outputs', type = 'graph', plotter = plotter)
},
model.save = function(path = getwd()){
if(!file.exists(path)) {dir.create(path)}
for(tr in transformers){
tr$model.save(path)
}
for(gtr in gradient_transformers){
gtr$model.save(path)
}
},
model.load = function(path = getwd()){
for(tr in transformers){
tr$model.load(path)
}
for(gtr in gradient_transformers){
gtr$model.load(path)
}
},
# if name_suffix is specified, it will be added to the model names and all its transformers and gradient transformers
deep_copy = function(name_suffix = NULL){
obj = .self$copy()
if(!is.null(name_suffix)){obj$name = obj$name %>% paste0(name_suffix)}
for (i in sequence(length(transformers))){
obj$transformers[[i]] <- transformers[[i]]$deep_copy(name_suffix = name_suffix)
}
for (i in sequence(length(gradient_transformers))){
obj$gradient_transformers[[i]] <- gradient_transformers[[i]]$deep_copy(name_suffix = name_suffix)
}
return(obj)
},
# todo: add k-fold, chronological shuffle, chronological split
performance.cv = function(X, y, metrics = config$metrics, ...){
cvmodel = .self$copy()
# Split by shuffling: todo: support other splitting methods(i.e.: chronological)
N = nrow(X)
scores = list()
for (i in sequence(cvmodel$config$cv.ntrain)){
ind_train = N %>% sequence %>% sample(size = floor(config$cv.train_ratio*N), replace = F)
X_train = X[ind_train, ,drop = F]
y_train = y[ind_train]
cvmodel$reset(config$cv.reset_transformer)
cvmodel$fit(X_train, y_train)
if(is.null(config$cv.set)){
for(j in sequence(config$cv.ntest)){
N2 = N - length(ind_train)
ind_test = sequence(N) %>% setdiff(ind_train) %>% sample(size = floor(config$cv.test_ratio*N2), replace = F)
X_test = X[ind_test, , drop = F]
y_test = y[ind_test]
scores[[length(scores) + 1]] <- cvmodel$performance(X_test, y_test, metrics = metrics, ...)
}
} else {
for(vset in config$cv.set){
scores = c(scores, cvmodel$performance(vset$X, vset$y, metrics = config$metrics))
}
}
}
# while((length(scores) == 1) & inherits(scores, 'list')) scores = scores[[1]]
# if(inherits(scores, 'list')) {
# cls = scores %>% lapply(class) %>% unlist
# if(length(unique(cls)) == 1) {scores %<>% unlist}
# }
return(scores)
},
# get.performance = function(X_train, y_train, X_test, y_test){
#
#
# reset(config$cv.reset_transformer)
# .self$fit(X_train, y_train)
# yhat = predict(X_test) # this has error! Fix it!
# objects$model <<- keep
# return(list(y_pred = yhat, y_true = y_test))
# },
info.size = function(){
t_size = list(config = 0, objects = 0, transformers = 0)
for(tr in transformers){
slist = tr$info.size()
t_size$config = t_size$config + slist$config
t_size$objects = t_size$objects + slist$objects
t_size$transformers = t_size$transformers + slist$transformers
}
list(config = object.size(config),
objects = object.size(objects),
transformers = object.size(transformers) + t_size$config + t_size$objects + t_size$transformers)
},
get.parameters = function(){},
get.expert.predictor = function(){},
get.expert.features = function(){}
))
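# --------------------------------------------------------------------------------
# A minimal sketch of a concrete subclass (illustrative only; MEAN.REGRESSOR is not
# part of rml). It shows the contract MODEL expects from subclasses: model.fit(X, y)
# trains and stores the underlying model in objects$model, and model.predict(X)
# returns a data.frame of predictions for the features listed in objects$features.
MEAN.REGRESSOR = setRefClass('MEAN.REGRESSOR', contains = 'MODEL',
  methods = list(
    initialize = function(...){
      callSuper(...)
      type             <<- 'Regressor'
      description      <<- 'Baseline regressor predicting the training mean of y'
      package          <<- 'stats'
      package_language <<- 'R'
      if(is.empty(name)){name <<- 'MEANREG'}
    },
    model.fit = function(X, y){
      # Ignore the features; remember only the mean of the target:
      objects$model <<- list(mean = mean(y, na.rm = TRUE))
    },
    model.predict = function(X){
      # One constant prediction per input row:
      data.frame(output = rep(objects$model$mean, nrow(X)))
    }
  ))
# Hypothetical usage:
#   reg = MEAN.REGRESSOR()
#   reg$fit(X = mtcars[-1], y = mtcars$mpg)
#   reg$predict(mtcars[-1])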