# Header
# Filename: regressors.R
# Version History:
# Version   Date                Action
# ---------------------------------------------------------------
# 1.0.0     16 September 2020   Initial Issue.
# 1.0.1     26 August 2024      maler_words renamed to reserved_words
REGRESSOR = setRefClass('REGRESSOR', contains = "MODEL",
  methods = list(
    initialize = function(...){
      callSuper(...)
      type <<- 'Abstract Regressor'
      config$sig_level   <<- config$sig_level %>% verify('numeric', domain = c(0, 1), default = 0.05)
      config$sfs.enabled <<- config$sfs.enabled %>% verify('logical', domain = c(F, T), default = F)
      if(is.null(config$metrics)){config$metrics <<- 'mae'}
    },

    performance = function(X, y, metrics = config$metrics, ...){
      yp = predict(X)[, 1]
      # Transform y only if yin has a transformer function and yout does not. This means y in the
      # training set was transformed, so the input y needs the same transformation to be comparable
      # to the original label. If yout has a transformer function as well, then yp is already
      # back-transformed and should be compared to y directly, not to the transformed y.
      if(!is.null(config$yin_transformer_function) && is.null(config$yout_transformer_function)){
        y = do.call(what = config$yin_transformer_function, args = list(y, config$yin_transformer_arguments))
      }
      correlation(yp, y, metrics = metrics, ...)
    },

    transform_yin = function(X, y){
      y = callSuper(X, y)
      # Apply gradient transformers directly by subtracting from y in regression:
      if(!is.null(attr(y, 'gradient'))){
        y = y - attr(y, 'gradient')
      }
      return(y)
    }
  )
)
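# Illustrative sketch (commented out, not part of the original API): how the yin/yout
# transformer configs are expected to interact with performance(). Assumes a concrete
# subclass such as REG.STATS.LNR below, the fit() wrapper from the MODEL base class, and
# hypothetical data frames X_train/y_train/X_test/y_test.
# reg = new('REG.STATS.LNR', config = list(
#   yin_transformer_function  = function(v, args) log1p(v),  # labels are log-transformed before fitting
#   yin_transformer_arguments = list()))
# reg$fit(X_train, y_train)
# # Because only yin (not yout) has a transformer, performance() applies the same
# # log1p to the raw test labels before comparing them with the predictions:
# reg$performance(X_test, y_test, metrics = 'mae')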
# A simple linear regression model. Features with a linear dependency on other features are removed
# to avoid singularity. Feature importances are based on the p-values of the coefficients.
#' @export REG.STATS.LNR
REG.STATS.LNR = setRefClass('REG.STATS.LNR', contains = "REGRESSOR",
  methods = list(
    initialize = function(...){
      callSuper(...)
      config$sig_level <<- config$sig_level %>% verify('numeric', domain = c(0, 1), default = 0.1)
      type             <<- 'Regressor'
      description      <<- 'Linear Regression'
      package          <<- 'stats'
      package_language <<- 'R'
      if(is.empty(name)){name <<- 'LNR' %>% paste0(sample(1000:9999, 1))}
    },
    model.fit = function(X, y){
      objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
      X = X[objects$features$fname] %>% na2zero
      if(ncol(X) == 0){fit.distribution(X, y); return(NULL)}
      if(config$sfs.enabled){
        D   = cbind(X, Y = y) %>% as.matrix
        res = evaluate(D)
        objects$features <<- objects$features %>% filter(fname %in% res$sig.feature.names)
      }
      # forml = as.formula('y ~ ' %>% paste(paste(colnames(X), collapse = ' + ')))
      objects$model <<- stats::lm(y ~ ., data = cbind(X, y))
      # Coefficients that come back NA indicate linearly dependent (singular) features;
      # drop them and refit until none remain:
      singulars = is.na(objects$model$coefficients) %>% which %>% names
      while(length(singulars) > 0){
        objects$features <<- objects$features %>% filter(!(fname %in% singulars))
        model.fit(X, y)
        singulars = is.na(objects$model$coefficients) %>% which %>% names
      }
      gw = get.features.weight()
      objects$features$importance <<- gw[objects$features$fname]
    },
    get.features.weight = function(){
      objects$model.summary <<- summary(objects$model)
      pv = objects$model.summary$coefficients[-1, 'Pr(>|t|)'] %>% na2zero
      # NA p-values usually appear when there is a perfect fit (100% R-squared),
      # so each such feature is treated as important (na2zero maps NA to p-value 0).
      # Map p-values linearly to weights: p >= sig_level gives 0, p = 0 gives 1
      # (e.g. with sig_level = 0.1, p = 0.02 gives weight 0.8):
      keep           = (pv < config$sig_level)
      weights        = pv
      weights[!keep] <- 0
      weights[keep]  <- 1.0 - weights[keep]/config$sig_level
      return(weights)
    },
    model.predict = function(X){
      if(inherits(X, 'WIDETABLE')){X = rbig::as.data.frame(X)}
      objects$model %>% stats::predict(X %>% na2zero) %>% as.data.frame
    },

    get.performance.fit = function(){
      if(is.null(objects$model.summary)){objects$model.summary <<- summary(objects$model)}
      return(objects$model.summary$adj.r.squared)
    },

    # lazy
    get.parameters = function(){},

    # lazy
    get.predictor = function(){
      function(inputs, params){
        params$model$predict(inputs %>% as.data.frame)
      }
    },

    get.predictor.gradient = function(){
      function(inputs, params, wrt){
        params$coefficient[wrt]
      }
    },
    get.expert.predictor = function(X, y){
      # Build a FUNCTION expert object wrapping this model's predictor:
      expert = new('FUNCTION', name = name %>% paste('predictor', sep = '.'), inputs = get.features.name() %>% {names(.) <- .; .})
      # congrats: the baby is now born! Now it's time for the kid to be trained in school:
      # train the expert and find input weights and performances:
      fit(X, y)
      expert$params                  = get.parameters()
      expert$objects$weights         = get.features.weight()
      expert$objects$parents         = get.features.name()
      expert$objects$performance.fit = get.performance.fit()
      expert$objects$performance.cv  = get.performance.cv(X, y)
      expert$rule.output =
        function(inputs, params){
          params$model$predict(inputs %>% as.data.frame)
        }
      expert$rule.gradient =
        function(inputs, params, wrt){
          params$coefficient[wrt]
        }
      return(expert)
    }
  )
)
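# Minimal usage sketch (commented out; assumes the fit()/predict() wrappers inherited
# from the MODEL base class and uses the built-in mtcars data purely for illustration):
# lnr = new('REG.STATS.LNR', config = list(sig_level = 0.1))
# lnr$fit(mtcars[, c('wt', 'hp', 'disp')], mtcars$mpg)
# lnr$objects$features          # per-feature importance derived from coefficient p-values
# lnr$predict(mtcars[, c('wt', 'hp', 'disp')])
# lnr$get.performance.fit()     # adjusted R-squared of the fit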
#' @export REG.XGBOOST
REG.XGBOOST = setRefClass('REG.XGBOOST', contains = "REGRESSOR",
  methods = list(
    initialize = function(...){
      callSuper(...)
      packages_required <<- c(packages_required, 'xgboost')
      type             <<- 'Regressor'
      description      <<- 'XGBoost Regression'
      package          <<- 'xgboost'
      package_language <<- 'R'
      if(is.empty(name)){name <<- 'XGBREG' %>% paste0(sample(1000:9999, 1))}
      if(is.null(config$nrounds)){config$nrounds <<- 100}
      # nrounds is passed as a separate argument, not within argument params:
      reserved_words <<- c(reserved_words, 'nrounds', 'sig_level')
    },

    model.fit = function(X, y){
      objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
      X = X[objects$features$fname] %>% na2zero
      if(ncol(X) == 0){fit.distribution(X, y); return(NULL)}
      if(config$sfs.enabled){
        # Not supported yet
      }
      xgb_train = xgboost::xgb.DMatrix(data = X %>% as.matrix, label = y)
      objects$model <<- xgboost::xgb.train(
        data    = xgb_train,
        nrounds = config$nrounds,
        # watchlist = watchlist,
        params  = config %>% list.remove(reserved_words))
      imp = try(xgboost::xgb.importance(model = objects$model) %>% dplyr::select(fname = Feature, importance = Gain), silent = T)
      if(!inherits(imp, 'try-error')){
        if(!is.null(objects$features$importance)) objects$features$importance <<- NULL
        objects$features %>% dplyr::left_join(imp, by = 'fname') %>% na2zero ->> objects$features
      } else if(is.null(objects$features$importance)){
        objects$features$importance <<- 1.0/nrow(objects$features)
      }
    },

    model.predict = function(X){
      objects$model %>% stats::predict(X %>% as.matrix) %>% as.data.frame
    }
  )
)
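# Usage sketch (commented out): everything in config except the reserved words is
# forwarded to xgboost::xgb.train() as params, so native xgboost parameters such as
# eta and max_depth can be set directly; nrounds is split out as its own argument.
# Assumes the fit()/predict() wrappers from the MODEL base class and hypothetical
# data frames X_train/y_train/X_test.
# xreg = new('REG.XGBOOST', config = list(nrounds = 200, eta = 0.1, max_depth = 4))
# xreg$fit(X_train, y_train)
# xreg$predict(X_test)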
#' @export REG.SKLEARN.XGB
REG.SKLEARN.XGB = setRefClass('REG.SKLEARN.XGB', contains = "REGRESSOR",
  methods = list(
    initialize = function(...){
      callSuper(...)
      packages_required <<- c(packages_required, 'reticulate')
      type             <<- 'Regressor'
      description      <<- 'Extreme Gradient Boosting for Regression'
      package          <<- 'xgboost'
      package_language <<- 'Python'
      if(is.empty(name)){name <<- 'SKXGBREG' %>% paste0(sample(1000:9999, 1))}
      module_xgb = reticulate::import('xgboost')
      objects$model <<- do.call(module_xgb$XGBRegressor, config %>% list.remove(reserved_words))
    },

    model.fit = function(X, y){
      objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
      X = X[objects$features$fname]
      objects$model$fit(X %>% data.matrix, y)
      imp = try(objects$model$feature_importances_ %>% as.numeric, silent = T)
      if(inherits(imp, 'numeric')) objects$features$importance <<- imp
    },

    model.predict = function(X){
      objects$model$predict(X %>% data.matrix) %>% as.data.frame
    }
  )
)
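# Note: REG.SKLEARN.XGB calls the Python xgboost package through reticulate, so a Python
# environment with xgboost installed must be available before the class is instantiated.
# A sketch of one way to point reticulate at such an environment ('my_env' is hypothetical):
# reticulate::use_virtualenv('my_env', required = TRUE)
# skreg = new('REG.SKLEARN.XGB', config = list(n_estimators = 200L, max_depth = 4L))
# skreg$fit(X_train, y_train)   # hypothetical data frames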
# REG.TAYLOR = setRefClass('REG.TAYLOR', contains = 'MODEL', methods = list(
#   initialize = function(...){
#     callSuper(...)
#     type <<- 'Taylor Booster'
#     config$model_class <<- 'REG.LM'
#   },
#
#   model.fit = function(X, y){
#     objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
#     X = X[objects$features$fname]
#
#     mdl <- new(config$model_class, config = config$model_config, rfe.enabled = T, sfs.enabled = T)
#     mdl$fit(X, y)
#     Xp = X[mdl$objects$features$fname]
#     for(col in mdl$objects$features$fname){
#       # Second-order (interaction) terms: multiply every feature by the selected column.
#       # The block must return the renamed matrix itself, not the new column names:
#       Xi = X %>% as.matrix %>% apply(2, function(v) X[, col]*v) %>% {colnames(.) <- colnames(.) %>% paste('X', col, sep = '_'); .}
#       Xi = cbind(Xp, Xi)
#       mdl <- new(config$model_class, config = config$model_config, rfe.enabled = T, sfs.enabled = T)
#       mdl$fit(Xi, y)
#       Xp = Xi[mdl$objects$features$fname]
#     }
#   }
# ))