R/regressors.R

# Header
# Filename:     regressors.R
# Version History:
# Version   Date                 Action
# ----------------------------------
# 1.0.0     16 September 2020    Initial Issue.
# 1.0.1     26 August 2024       maler_words renamed to reserved_words

REGRESSOR = setRefClass('REGRESSOR', contains = "MODEL",
 methods = list(
   initialize = function(...){
     callSuper(...)
     type               <<- 'Abstract Regressor'
     config$sig_level   <<- config$sig_level   %>% verify('numeric', domain = c(0,1), default = 0.05)
     config$sfs.enabled <<- config$sfs.enabled %>% verify('logical', domain = c(F,T), default = F)
     if(is.null(config$metrics)){config$metrics <<- 'mae'}
   },
   
   performance = function(X, y, metrics = config$metrics, ...){
     yp = predict(X)[, 1]
      # Transform y only when a yin transformer exists and no yout transformer does: in that case
      # the training labels were transformed, so the given y must get the same transformation to be
      # comparable with the predictions. If a yout transformer exists as well, yp has already been
      # transformed back and should be compared to y directly, not to the transformed y.
      # (A commented sketch follows this class definition.)
      if(!is.null(config$yin_transformer_function) && is.null(config$yout_transformer_function)){
       y = do.call(what = config$yin_transformer_function, args = list(y, config$yin_transformer_arguments))
     }
     correlation(yp, y, metrics = metrics, ...)
   },
   
   transform_yin = function(X, y){
     y = callSuper(X, y)
     # Apply gradient transformers directly by subtracting from y in regression
     if(!is.null(attr(y, 'gradient'))){
       y = y - attr(y, 'gradient')
     }
     return(y)
   }
   
 )
)
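
# A minimal sketch of the yin/yout transformer logic in performance() above, kept as a comment.
# Names here are illustrative ('reg' stands for any concrete REGRESSOR subclass), and the
# transformer follows the function(y, args) signature implied by the do.call in performance():
#
#   reg = new('REG.STATS.LNR', config = list(
#     yin_transformer_function  = function(y, args) log1p(y),
#     yin_transformer_arguments = NULL))
#   reg$fit(X_train, y_train)        # training labels stored as log1p(y_train)
#   reg$performance(X_test, y_test)  # compares predict(X_test) against log1p(y_test)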


# A simple linear regression model. Features with a linear dependency on other features are removed to avoid singularity.
# Feature importances are based on the p-values of the coefficients.
#' @export REG.STATS.LNR
REG.STATS.LNR = setRefClass('REG.STATS.LNR', contains = "REGRESSOR",
   methods = list(
     initialize = function(...){
       callSuper(...)
        config$sig_level <<- config$sig_level %>% verify('numeric', domain = c(0,1), default = 0.1)
       type             <<- 'Regressor'
       description      <<- 'Linear Regression'
       package          <<- 'stats'
       package_language <<- 'R'
       
       if(is.empty(name)){name <<- 'LNR' %>% paste0(sample(1000:9999, 1))}
     },

     model.fit = function(X, y){
       objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
       X = X[objects$features$fname] %>% na2zero

       if(ncol(X) == 0){fit.distribution(X, y); return(NULL)}
       if(config$sfs.enabled){
         D   = cbind(X, Y = y) %>% as.matrix
         res = evaluate(D)
         objects$features <<- objects$features %>% filter(fname %in% res$sig.feature.names)
       }
       # forml = as.formula('y ~ ' %>% paste(paste(colnames(X), collapse = ' + ')))
       objects$model <<- stats::lm(y ~ ., data = cbind(X, y))
       singulars = is.na(objects$model$coefficients) %>% which %>% names
       while(length(singulars) > 0){
         objects$features <<- objects$features %>% filter(!(fname %in% singulars))
         model.fit(X, y)
         singulars = is.na(objects$model$coefficients) %>% which %>% names
       }
       gw = get.features.weight()
       objects$features$importance <<- gw[objects$features$fname]
     },

     get.features.weight = function(){
       objects$model.summary <<- summary(objects$model)
       pv   = objects$model.summary$coefficients[-1, 'Pr(>|t|)'] %>% na2zero
        # NA p-values usually appear when the fit is perfect (100% R-squared), so each such feature
        # is treated as important (NA becomes 0 via na2zero, which maps to the maximum weight).
        # A worked example follows this method.
        keep = (pv < config$sig_level)
        weights = pv
        weights[!keep] <- 0
        weights[keep]  <- 1.0 - weights[keep]/config$sig_level
       return(weights)
     },
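
      # Worked example of the p-value-to-weight mapping above (illustrative numbers): with the
      # default sig_level = 0.1, pv = 0.02 gives weight 1 - 0.02/0.1 = 0.8, pv = 0.09 gives 0.1,
      # and any pv >= 0.1 is zeroed out; an NA p-value becomes 0 via na2zero and maps to weight 1.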

     model.predict = function(X){
       if(inherits(X, 'WIDETABLE')){X = rbig::as.data.frame(X)}
       objects$model %>% stats::predict(X %>% na2zero) %>% as.data.frame
     },

     get.performance.fit = function(){
       if(is.null(objects$model.summary)){objects$model.summary <<- summary(objects$model)}
       return(objects$model.summary$adj.r.squared)
     },

     # lazy
     get.parameters = function(){},

     # lazy
     get.predictor = function(){
       function(inputs, params){
         params$model$predict(inputs %>% as.data.frame)
       }
     },

     get.predictor.gradient = function(){
       function(inputs, params, wrt){
         params$coefficient[wrt]
       }
     },
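
      # For a linear model y = b0 + sum(b_i * x_i), the partial derivative of the prediction with
      # respect to feature x_i is its coefficient b_i, which is why the gradient rule above simply
      # looks up the coefficient of the requested feature (wrt).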

     get.expert.predictor = function(X, y){
        # build the expert FUNCTION object, with the model's feature names as its named inputs
       expert = new('FUNCTION', name = name %>% paste('predictor', sep = '.'), inputs = get.features.name() %>% {names(.) <- .;.})
        # congrats: the baby is now born! Now it's time for the kid to be trained at school:

       # train the expert and find input weights and performances:
       fit(X, y)
       expert$params          = get.parameters()
       expert$objects$weights = get.features.weight()
       expert$objects$parents = get.features.name()
        expert$objects$performance.fit = get.performance.fit()
       expert$objects$performance.cv  = get.performance.cv(X, y)

       expert$rule.output   =
         function(inputs, params){
           params$model$predict(inputs %>% as.data.frame)
         }

       expert$rule.gradient =
         function(inputs, params, wrt){
           params$coefficient[wrt]
         }
       return(expert)
     }
   )
)
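
# A minimal usage sketch for REG.STATS.LNR, assuming the fit(X, y)/predict(X) interface used
# elsewhere in this file (kept as a comment so the package source is unchanged):
#
#   reg = new('REG.STATS.LNR', config = list(sig_level = 0.05))
#   reg$fit(X = mtcars[c('wt', 'hp', 'disp')], y = mtcars$mpg)
#   head(reg$predict(mtcars[c('wt', 'hp', 'disp')]))   # one-column data.frame of predictions
#   reg$objects$features                               # importances from coefficient p-values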


#' @export REG.XGBOOST
REG.XGBOOST = setRefClass('REG.XGBOOST', contains = "REGRESSOR",
                     methods = list(
                       initialize = function(...){
                         callSuper(...)
                         packages_required <<- c(packages_required, 'xgboost')
                         type             <<- 'Regressor'
                         description      <<- 'XGBoost Regression'
                         package          <<- 'xgboost'
                         package_language <<- 'R'
                         
                         
                         if(is.empty(name)){name <<- 'XGBREG' %>% paste0(sample(1000:9999, 1))}
                         if(is.null(config$nrounds)){config$nrounds <<- 100}
                         
                          # nrounds is passed to xgb.train as a separate argument, not inside params:
                         reserved_words <<- c(reserved_words, 'nrounds', 'sig_level')
                       },
                       
                       model.fit = function(X, y){
                         objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
                         X = X[objects$features$fname] %>% na2zero
                         
                         if(ncol(X) == 0){fit.distribution(X, y); return(NULL)}
                         if(config$sfs.enabled){
                           # Not supported yet
                         }

                         xgb_train = xgboost::xgb.DMatrix(data = X %>% as.matrix, label = y)
                         objects$model <<- xgboost::xgb.train(
                           data = xgb_train, 
                           nrounds = config$nrounds, 
                           # watchlist = watchlist,
                           params = config %>% list.remove(reserved_words))
                         
                          imp = try(xgboost::xgb.importance(model = objects$model) %>% dplyr::select(fname = Feature, importance = Gain), silent = T)
                         if(!inherits(imp, 'try-error')){
                           if(!is.null(objects$features$importance)) objects$features$importance <<- NULL
                           objects$features %>% dplyr::left_join(imp, by = 'fname') %>% na2zero ->> objects$features
                         } else if(is.null(objects$features$importance)){
                           objects$features$importance <<- 1.0/nrow(objects$features)
                         }
                       },

                       model.predict = function(X){
                         objects$model %>% stats::predict(X %>% as.matrix) %>% as.data.frame
                       }
                       
                     )
)
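
# Sketch of how config reaches xgboost::xgb.train() above, assuming list.remove() drops the named
# elements: nrounds is passed as its own argument, while every other non-reserved config entry is
# forwarded inside params as an xgboost hyperparameter:
#
#   xgb = new('REG.XGBOOST', config = list(nrounds = 200, max_depth = 4L, eta = 0.1))
#   xgb$fit(X_train, y_train)      # max_depth and eta go into params; nrounds stays separate
#   pred = xgb$predict(X_test)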


#' @export REG.SKLEARN.XGB
REG.SKLEARN.XGB = setRefClass('REG.SKLEARN.XGB', contains = "REGRESSOR",
                             methods = list(
                               initialize = function(...){
                                 callSuper(...)
                                  type               <<- 'Regressor'
                                  description        <<- 'Extreme Gradient Boosting for Regression'
                                 if(is.empty(name)){name <<- 'SKXGBREG' %>% paste0(sample(1000:9999, 1))}
                                 module_xgb = reticulate::import('xgboost')
                                 objects$model     <<- do.call(module_xgb$XGBRegressor, config %>% list.remove(reserved_words))
                               },
                               
                               model.fit = function(X, y){
                                 objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
                                 X = X[objects$features$fname]
                                 
                                 objects$model$fit(X %>% data.matrix, y)
                                 imp = try(objects$model$feature_importances_ %>% as.numeric, silent = T)
                                 if(inherits(imp, 'numeric')) objects$features$importance <<- imp
                               },

                               model.predict = function(X){
                                 objects$model$predict(X %>% data.matrix) %>% as.data.frame
                               }
                               
                             )
)
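
# Usage sketch for REG.SKLEARN.XGB, assuming reticulate can import a python 'xgboost' module that
# exposes XGBRegressor in the active environment; non-reserved config entries are passed to the
# XGBRegressor constructor:
#
#   skxgb = new('REG.SKLEARN.XGB', config = list(n_estimators = 300L, max_depth = 4L))
#   skxgb$fit(X_train, y_train)
#   pred = skxgb$predict(X_test)   # one-column data.frame
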
# REG.TAYLOR = setRefClass('REG.TAYLOR', contains = 'MODEL', methods = list(
#   initialize = function(...){
#     callSuper(...)
#     type               <<- 'Taylor Booster'
#     config$model_class <<- 'REG.LM'
#   },
#   
#   model.fit = function(X, y){
#     objects$features <<- objects$features %>% filter(fclass %in% c('numeric', 'integer'))
#     X = X[objects$features$fname]
#     
#     mdl <- new(config$model_class, config = config$model_config, rfe.enabled = T, sfs.enabled = T)
#     mdl$fit(X, y)
#     Xp = X[mdl$objects$features$fname]
#     for(col in mdl$objects$features$fname){
#       Xi = X %>% as.matrix %>% apply(2, function(v) X[,col]*v) %>% {colnames(.) <- colnames(.) %>% paste('X', col, sep = '_'); .}
#       Xi = cbind(Xp, Xi)
#       mdl <- new(config$model_class, config = config$model_config, rfe.enabled = T, sfs.enabled = T)
#       mdl$fit(Xi, y)
#       Xp = Xi[mdl$objects$features$fname]
#     }
#     object
#   }
# ))