R/LMTrainer.R

#' Linear Models Trainer
#' @description Trains linear models such as Logistic, Lasso or Ridge regression model. It is built on glmnet R package.
#'              This class provides fit, predict, cross valdidation functions.
#' @format \code{\link{R6Class}} object.
#' @section Usage:
#' For usage details see \bold{Methods, Arguments and Examples} sections.
#' \preformatted{
#' bst = LMTrainer$new(family, weights, alpha, lambda=100, standardize.response=FALSE)
#' bst$fit(X_train, "target")
#' prediction <- bst$predict(X_test)
#' bst$cv_model(X_train, "target", nfolds=4, parallel=TRUE)
#' cv_prediction <- bst$cv_predict(X_test)
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{$new()}}{Initialises an instance of random forest model}
#'   \item{\code{$fit()}}{fit model to an input train data (data frame or data table) and trains the model.}
#'   \item{\code{$predict()}}{returns predictions by fitting the trained model on test data.}
#'   \item{\code{$cv_model()}}{Using k-fold cross validation technique, finds the best value of lambda. type.measure is the loss to use for cross validation.}
#'   \item{\code{$cv_predict()}}{Using the best value of lambda, makes predictions on the test data}
#'   \item{\code{$get_importance()}}{Returns a matrix of feature coefficients as generated by Lasso}
#' }
#' @section Arguments:
#' \describe{
#'  \item{family}{type of regression to perform, values can be "gaussian" ,"binomial", "multinomial","mgaussian"}
#'  \item{weights}{observation weights. Can be total counts if responses are proportion matrices. Default is 1 for each observation}
#'  \item{alpha}{The elasticnet mixing parameter, alpha=1 is the lasso penalty, and alpha=0 the ridge penalty.}
#'  \item{nlambda}{the number of lambda values - default is 100}
#'  \item{standardize.response}{normalise the dependent variable between 0 and 1, default = FALSE}
#' }
#' @export
#' @examples
#' LINK <- "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
#' housing <- read.table(LINK)
#' names <- c("CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS",
#'            "RAD","TAX","PTRATIO","B","LSTAT","MEDV")
#' names(housing)  <-  names

#' lf <- LMTrainer$new(family = 'gaussian', alpha=1)
#' lf$fit(X = housing, y = 'MEDV')
#' predictions <- lf$predict(df = housing)
#'
#'
#' # cross validation model
#' lf$cv_model(X = housing, y = 'MEDV', nfolds = 5, parallel = FALSE)
#' predictions <- lf$cv_predict(df = housing)
#' coefs <- lf$get_importance()
LMTrainer <- R6Class("LMTrainer", public = list(

    family = NULL,
    weights = NULL,
    # 1 = lasso (variable selection)
    # 0 = Ridge
    # NULL = Simple Gaussian Regression
    alpha = NULL,
    lambda = 100,
    standardize = FALSE,
    standardize.response = FALSE,
    model = NULL,
    cvmodel = NULL,
    Flag = FALSE,
    is_lasso = FALSE,
    iid_names = NULL,

    initialize = function(family,
                          weights,
                          alpha,
                          lambda,
                          standardize.response){
        if(!(missing(family))) self$family <- family
        if(!(missing(weights))) self$weights <- weights
        if(!(missing(alpha))) self$alpha <- alpha
        if(!(missing(lambda))) self$lambda <- lambda
        if(!(missing(standardize.response))) {
            self$standardize.response <- standardize.response
        }
        superml::check_package("glmnet")

    },

    fit = function(X, y){

        superml::testdata(X, y)
        self$iid_names <- setdiff(colnames(X), y)

        # set default value for weights
        self$weights <- rep(1, nrow(X))

        if(is.null(self$alpha)){

            # run simple gaussian regression
            f <- as.formula(paste(y , paste("~ .")))
            self$model <- stats::glm(formula = f
                             ,data = X
                             ,weights = self$weights
                             ,family = self$family)

        } else {
            DX <- as.matrix(setDT(X)[, self$iid_names, with=F])
            self$model <- glmnet::glmnet(x = DX,
                             y = X[[y]],
                             family = self$family,
                             weights = self$weights,
                             alpha = self$alpha,
                             nlambda = self$lambda,
                             standardize = self$standardize,
                             standardize.response = self$standardize.response)

            if(self$alpha == 1) self$is_lasso <- TRUE
        }
    },

    predict = function(df, lambda = NULL){

        in_type <- ifelse(self$family == "binomial", "response","link")

        if(is.null(self$alpha)){
            return(stats::predict.glm(object = self$model,
                                  newdata = df,
                                  type = "response"))
        } else {
            if(is.null(lambda)) lambda <- min(self$model$lambda)
            return(glmnet::predict.glmnet(object=self$model,
                    newx = as.matrix(setDT(df)[, c(self$iid_names), with=F]),
                    s = lambda,
                    type = "link"))
        }
    },

    cv_model = function(X, y, nfolds, parallel, type.measure="deviance"){

        superml::testdata(X, y)
        self$iid_names <- setdiff(colnames(X), y)
        if(isTRUE(parallel)){
            cl <- makeCluster(detectCores())
            doParallel::registerDoParallel(cl)
            message("Starting parallel clusters.")
        }

        self$weights <- rep(1, nrow(X))
        DX <- as.matrix(setDT(X)[, self$iid_names, with=FALSE])

        self$cvmodel <- glmnet::cv.glmnet(x = DX
                                  ,y = X[[y]]
                                  ,weights = self$weights
                                  ,lambda = NULL
                                  ,nfolds = nfolds
                                  ,parallel = parallel
                                  ,type.measure = type.measure
                                  )

        # this flag is for variable importance
        self$Flag <- TRUE
        message("Computation done.")
        if(isTRUE(parallel)){
            message("Stopping clusters.")
            stopCluster(cl)
        }

    },

    cv_predict = function(df, lambda=NULL){

        if(is.null(lambda)){
            if(isTRUE(self$Flag)) lambda <- min(self$cvmodel$lambda.1se)
            else print("Please run the cv_model function.")
        }
        DX <- as.matrix(setDT(df)[, c(self$iid_names), with=FALSE])

        return(glmnet::predict.cv.glmnet(object=self$cvmodel,
                              newx = DX,
                              s = lambda,
                              type="response"))
    },

    get_importance = function(){

        if(self$Flag){
            return(as.matrix(coef(self$cvmodel)))
        } else {
            print("Feature selection only happens in Lasso Regression.
                  Please run the model with alpha=1.")
        }

    })

)
ssi-ashraf/superml documentation built on Nov. 5, 2019, 9:18 a.m.