#' Linear Models Trainer
#' @description Trains linear models such as Logistic, Lasso or Ridge regression model. It is built on glmnet R package.
#' This class provides fit, predict, cross valdidation functions.
#' @format \code{\link{R6Class}} object.
#' @section Usage:
#' For usage details see \bold{Methods, Arguments and Examples} sections.
#' \preformatted{
#' bst = LMTrainer$new(family, weights, alpha, lambda=100, standardize.response=FALSE)
#' bst$fit(X_train, "target")
#' prediction <- bst$predict(X_test)
#' bst$cv_model(X_train, "target", nfolds=4, parallel=TRUE)
#' cv_prediction <- bst$cv_predict(X_test)
#' }
#' @section Methods:
#' \describe{
#' \item{\code{$new()}}{Initialises an instance of random forest model}
#' \item{\code{$fit()}}{fit model to an input train data (data frame or data table) and trains the model.}
#' \item{\code{$predict()}}{returns predictions by fitting the trained model on test data.}
#' \item{\code{$cv_model()}}{Using k-fold cross validation technique, finds the best value of lambda. type.measure is the loss to use for cross validation.}
#' \item{\code{$cv_predict()}}{Using the best value of lambda, makes predictions on the test data}
#' \item{\code{$get_importance()}}{Returns a matrix of feature coefficients as generated by Lasso}
#' }
#' @section Arguments:
#' \describe{
#' \item{family}{type of regression to perform, values can be "gaussian" ,"binomial", "multinomial","mgaussian"}
#' \item{weights}{observation weights. Can be total counts if responses are proportion matrices. Default is 1 for each observation}
#' \item{alpha}{The elasticnet mixing parameter, alpha=1 is the lasso penalty, and alpha=0 the ridge penalty.}
#' \item{nlambda}{the number of lambda values - default is 100}
#' \item{standardize.response}{normalise the dependent variable between 0 and 1, default = FALSE}
#' }
#' @export
#' @examples
#' LINK <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
#' housing <- read.table(LINK)
#' names <- c("CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS",
#' "RAD","TAX","PTRATIO","B","LSTAT","MEDV")
#' names(housing) <- names
#' lf <- LMTrainer$new(family = 'gaussian', alpha=1)
#' lf$fit(X = housing, y = 'MEDV')
#' predictions <- lf$predict(df = housing)
#'
#'
#' # cross validation model
#' lf$cv_model(X = housing, y = 'MEDV', nfolds = 5, parallel = FALSE)
#' predictions <- lf$cv_predict(df = housing)
#' coefs <- lf$get_importance()
LMTrainer <- R6Class("LMTrainer", public = list(
family = NULL,
weights = NULL,
# 1 = lasso (variable selection)
# 0 = Ridge
# NULL = Simple Gaussian Regression
alpha = NULL,
lambda = 100,
standardize = FALSE,
standardize.response = FALSE,
model = NULL,
cvmodel = NULL,
Flag = FALSE,
is_lasso = FALSE,
iid_names = NULL,
initialize = function(family,
weights,
alpha,
lambda,
standardize.response){
if(!(missing(family))) self$family <- family
if(!(missing(weights))) self$weights <- weights
if(!(missing(alpha))) self$alpha <- alpha
if(!(missing(lambda))) self$lambda <- lambda
if(!(missing(standardize.response))) {
self$standardize.response <- standardize.response
}
superml::check_package("glmnet")
},
fit = function(X, y){
superml::testdata(X, y)
self$iid_names <- setdiff(colnames(X), y)
# set default value for weights
self$weights <- rep(1, nrow(X))
if(is.null(self$alpha)){
# run simple gaussian regression
f <- as.formula(paste(y , paste("~ .")))
self$model <- stats::glm(formula = f
,data = X
,weights = self$weights
,family = self$family)
} else {
DX <- as.matrix(setDT(X)[, self$iid_names, with=F])
self$model <- glmnet::glmnet(x = DX,
y = X[[y]],
family = self$family,
weights = self$weights,
alpha = self$alpha,
nlambda = self$lambda,
standardize = self$standardize,
standardize.response = self$standardize.response)
if(self$alpha == 1) self$is_lasso <- TRUE
}
},
predict = function(df, lambda = NULL){
in_type <- ifelse(self$family == "binomial", "response","link")
if(is.null(self$alpha)){
return(stats::predict.glm(object = self$model,
newdata = df,
type = "response"))
} else {
if(is.null(lambda)) lambda <- min(self$model$lambda)
return(glmnet::predict.glmnet(object=self$model,
newx = as.matrix(setDT(df)[, c(self$iid_names), with=F]),
s = lambda,
type = "link"))
}
},
cv_model = function(X, y, nfolds, parallel, type.measure="deviance"){
superml::testdata(X, y)
self$iid_names <- setdiff(colnames(X), y)
if(isTRUE(parallel)){
cl <- makeCluster(detectCores())
doParallel::registerDoParallel(cl)
message("Starting parallel clusters.")
}
self$weights <- rep(1, nrow(X))
DX <- as.matrix(setDT(X)[, self$iid_names, with=FALSE])
self$cvmodel <- glmnet::cv.glmnet(x = DX
,y = X[[y]]
,weights = self$weights
,lambda = NULL
,nfolds = nfolds
,parallel = parallel
,type.measure = type.measure
)
# this flag is for variable importance
self$Flag <- TRUE
message("Computation done.")
if(isTRUE(parallel)){
message("Stopping clusters.")
stopCluster(cl)
}
},
cv_predict = function(df, lambda=NULL){
if(is.null(lambda)){
if(isTRUE(self$Flag)) lambda <- min(self$cvmodel$lambda.1se)
else print("Please run the cv_model function.")
}
DX <- as.matrix(setDT(df)[, c(self$iid_names), with=FALSE])
return(glmnet::predict.cv.glmnet(object=self$cvmodel,
newx = DX,
s = lambda,
type="response"))
},
get_importance = function(){
if(self$Flag){
return(as.matrix(coef(self$cvmodel)))
} else {
print("Feature selection only happens in Lasso Regression.
Please run the model with alpha=1.")
}
})
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.