#' @export
GLM <- R6::R6Class(
'LinearRegression', inherit=PipeComponent,
public = list(
object_cv = NULL,
family='gaussian', intercept=TRUE, standardize=TRUE,
offset_index=0,
# regularization related
alpha=1, lambda=0.01,
choose_lambda=FALSE, # if true, fit function invokes cross validation
# cross validation controls.
# - lambda_candidates are optional user supplied candidate values
# - lambda_min_ratio is the minimum lambda as fraction to the max (auto-determined)
# - nlambda controls number of lambdas to check
lambda_candidates=NULL, lambda_min_ratio=0.0001, nlambda=100,
nfolds=10, parallel=FALSE, loss='deviance',
# decided not to support:
# - foldid
# - grouped
# - keep
group_multinom = FALSE,
modified_newton = FALSE,
standardize_response = FALSE,
lower=-Inf, upper=+Inf,
max_included=NULL, force_exclude=integer(0),
tol=1e-7, maxit=1e+5,
fit = function(x, y)
{
if (self$choose_lambda) {
private$.fit_cv(x, y)
} else {
private$.fit_ncv(x, y)
}
invisible(self)
},
predict = function(x, ...)
{
tmp <- private$.format_x(x)
x <- tmp$x
os <- tmp$offset
#lambda <- if (is.null(lambda)) self$lambda else lambda
ty <- if (self$family %in% c('binomial', 'multinomial')) 'class' else 'response'
if (is.null(os)) {
out <- predict(self$object, x, s=self$lambda, type=ty, ...)
} else {
out <- predict(self$object, x, s=self$lambda, type=ty, offset=os, ...)
}
drop(out)
},
predict_proba = function(x, ...)
{
tmp <- private$.format_x(x)
x <- tmp$x
os <- tmp$offset
#lambda <- if (is.null(lambda)) self$lambda else lambda
if (is.null(os)) {
out <- predict(self$object, x, s=self$lambda, type='response', ...)
} else {
out <- predict(self$object, x, s=self$lambda, type='response', offset=os, ...)
}
if (!(self$family %in% c('binomial', 'multinomial'))) {
warning('probability prediction may not be appropriate for family ', self$family)
}
drop(out)
},
get_coef = function(nonzero_only=FALSE, ...)
{
ty <- if (nonzero_only) 'nonzero' else 'coefficients'
predict(self$object, s=self$lambda, type=ty)
},
initialize = function(...)
{
self$not_parameters <- c(self$not_parameters, 'object_cv')
self$set_parameters(...)
invisible(self)
},
mse = function(x, y)
{
y <- private$.format_y(y)
p <- self$predict(x)
mean((p-y)^2, na.rm=TRUE)
},
cross_entropy = function(x, y)
{
if (!(self$family %in% c('binomial', 'multinomial'))) {
stop('cross entropy is only available for classification models')
}
y <- private$.format_y(y)
p <- self$predict_proba(x) %>% drop()
if (any(p >= 1) || any(p <= 0)) stop('predicted probability not in (0,1)')
if (self$family == 'binomial') p <- cbind(1-p, p)
-mean(rowSums(y*log(p), na.rm=TRUE), na.rm=TRUE)
},
accuracy = function(x, y)
{
if (!(self$family %in% c('binomial', 'multinomial'))) {
stop('accuracy is only available for classification models')
}
y <- private$.format_y(y)
p <- self$predict(x) %>% drop() %>% private$.format_y()
mean(max.col(y) == max.col(p), na.rm=TRUE)
}
),
private = list(
.format_x = function(x)
{
# pick offset. there may be multiple offsets
index <- self$offset_index
index <- index[index >= 1 & index <= ncol(x)]
if (length(index) == 0) return(list(x=x, offset=NULL))
list(x=x[, -index, drop=FALSE], offset=x[, index])
},
.format_y = function(y)
{
if (self$family %in% c('binomial', 'multinomial')) {
y <- drop(y)
if (is.factor(y) || is.vector(y)) y <- nnet::class.ind(y)
}
y
},
.fit_cv = function(x, y)
{
# called by fit when choose_lambda is true
tmp <- private$.format_x(x)
x <- tmp$x
os <- tmp$offset
y <- private$.format_y(y)
res <- glmnet::cv.glmnet(
x, y, offset=os,
lambda=self$lambda_candidates,
type.measure=self$loss, nfolds=self$nfolds, parallel=self$parallel,
# parameters for glmnet
family=self$family, alpha=self$alpha,
intercept=self$intercept, standardize=self$standardize,
nlambda=self$nlambda, lambda.min.ratio=self$lambda_min_ratio,
thresh=self$tol, maxit=self$maxit,
lower.limits=self$lower, upper.limits=self$upper,
type.logistic=if (self$modified_newton) 'modified.Newton' else 'Newton',
type.multinomial=if (self$group_multinom) 'grouped' else 'ungrouped',
standardize.response=self$standardize_response,
pmax=if (is.null(self$max_included)) ncol(x)+1 else self$max_included,
exclude=self$force_exclude
)
self$object_cv <- res
self$lambda <- res$lambda.min # best lambda
self$object <- res$glmnet.fit # full model
invisible(self)
},
.fit_ncv = function(x, y)
{
# called by fir when choose_lambda is false
tmp <- private$.format_x(x)
x <- tmp$x
os <- tmp$offset
y <- private$.format_y(y)
#lambda <- if (is.null(self$lambda)) self$lambda_candidates else self$lambda
if (!is.numeric(self$lambda) || length(self$lambda) != 1) {
stop('lambda must be a single numeric value')
}
res <- glmnet::glmnet(
x, y, family=self$family, offset=os,
intercept=self$intercept, standardize=self$standardize,
alpha=self$alpha,
lambda=self$lambda, #nlambda=self$nlambda, lambda.min.ratio=self$lambda_min_ratio,
thresh=self$tol, maxit=self$maxit,
lower.limits=self$lower, upper.limits=self$upper,
type.logistic=if (self$modified_newton) 'modified.Newton' else 'Newton',
type.multinomial=if (self$group_multinom) 'grouped' else 'ungrouped',
standardize.response=self$standardize_response,
pmax=if (is.null(self$max_included)) ncol(x)+1 else self$max_included,
exclude=self$force_exclude
)
self$object <- res
invisible(self)
}
)
)
#' Generalized linear model
#'
#' Fit a generalized linear model, possibly with L1 or elastic net regularization.
#' Supports linear regression for continous outputs, poisson regression for count outputs, logistic and multinomial regression for classification.
#' Currently not supports cox regression model.
#'
#' @name GLM
#' @aliases linear_regression linear_regression_multi logistic_regression multinomial_regression poisson_regression
#'
#' @section Usage:
# #' \preformatted{glm_fit(family = 'gaussian', intercept = TRUE, standardize = TRUE,
# #' offset_index = 0, alpha = 1, lambda = 0.01, choose_lambda = FALSE,
# #' lambda_candidates = NULL, lambda_min_ratio = 0.0001, nlambda = 100,
# #' nfolds = 10, parallel = FALSE, loss = 'deviance',
# #' group_multinom = FALSE, modified_newton = FALSE, standardize_response = FALSE,
# #' lower = -Inf, upper = +Inf, max_included = NULL, force_exclude=integer(0),
# #' tol = 1e-7, maxit = 1e+5)}
#' \preformatted{linear_regression(intercept = TRUE, standardize = TRUE,
#' offset_index = 0, alpha = 1, lambda = 0.01, choose_lambda = FALSE,
#' lambda_candidates = NULL, lambda_min_ratio = 0.0001, nlambda = 100,
#' nfolds = 10, parallel = FALSE, loss = 'deviance',
#' group_multinom = FALSE, modified_newton = FALSE, standardize_response = FALSE,
#' lower = -Inf, upper = +Inf, max_included = NULL, force_exclude=integer(0),
#' tol = 1e-7, maxit = 1e+5)}
#' \preformatted{logistic_regression(intercept = TRUE, standardize = TRUE,
#' offset_index = 0, alpha = 1, lambda = 0.01, choose_lambda = FALSE,
#' lambda_candidates = NULL, lambda_min_ratio = 0.0001, nlambda = 100,
#' nfolds = 10, parallel = FALSE, loss = 'deviance',
#' group_multinom = FALSE, modified_newton = FALSE, standardize_response = FALSE,
#' lower = -Inf, upper = +Inf, max_included = NULL, force_exclude=integer(0),
#' tol = 1e-7, maxit = 1e+5)}
#' \preformatted{poisson_regression(intercept = TRUE, standardize = TRUE,
#' offset_index = 0, alpha = 1, lambda = 0.0001, choose_lambda = FALSE,
#' lambda_candidates = NULL, lambda_min_ratio = 0.01, nlambda = 100,
#' nfolds = 10, parallel = FALSE, loss = 'deviance',
#' group_multinom = FALSE, modified_newton = FALSE, standardize_response = FALSE,
#' lower = -Inf, upper = +Inf, max_included = NULL, force_exclude=integer(0),
#' tol = 1e-7, maxit = 1e+5)}
#' \preformatted{multinomial_regression(intercept = TRUE, standardize = TRUE,
#' offset_index = 0, alpha = 1, lambda = 0.0001, choose_lambda = FALSE,
#' lambda_candidates = NULL, lambda_min_ratio = 0.01, nlambda = 100,
#' nfolds = 10, parallel = FALSE, loss = 'deviance',
#' group_multinom = FALSE, modified_newton = FALSE, standardize_response = FALSE,
#' lower = -Inf, upper = +Inf, max_included = NULL, force_exclude=integer(0),
#' tol = 1e-7, maxit = 1e+5)}
#' \preformatted{linear_regression_multi(intercept = TRUE, standardize = TRUE,
#' offset_index = 0, alpha = 1, lambda = 0.01, choose_lambda = FALSE,
#' lambda_candidates = NULL, lambda_min_ratio = 0.0001, nlambda = 100,
#' nfolds = 10, parallel = FALSE, loss = 'deviance',
#' group_multinom = FALSE, modified_newton = FALSE, standardize_response = FALSE,
#' lower = -Inf, upper = +Inf, max_included = NULL, force_exclude=integer(0),
#' tol = 1e-7, maxit = 1e+5)}
#'
#' @section Arguments:
#' \describe{
# #' \item{\code{family}}{Reponse type. \itemize{
# #' \item{\code{'gaussian'} for linear regression}
# #' \item{\code{'mgaussian'} for linear regression with multiple outputs}
# #' \item{\code{'binomial'} for logistic regression}
# #' \item{\code{'multinomial'} for multinomial regression}
# #' \item{\code{'poisson'} for poisson regression}
# #' \item{\code{'cox'} for cox model}
# #' }}
#' \item{\code{intercept}}{logical indicating if the model has constant term}
#' \item{\code{standardize}}{lgical indicating if the explanatory variables should be standardized before fitted}
#' \item{\code{offset_index}}{integer, specifies which column of \code{x} is used as the offset. This is often used for poisson regression, to accomodate the difference in the periods across observations. Note that \code{family=='multinomial'} requires multiple offset variables.}
#' \item{\code{alpha}}{numeric value between 0 and 1, specifies the weight for the L1 regularization versus L2. \code{alpha=1} means L1 regularization, whle \code{alpha=0} is L2 regularization.}
#' \item{\code{lambda}}{initial values of lambda, possibly multiple values. if NULL, automatically chosen}
#' \item{\code{choose_lambda}}{if TRUE, lambda is chosen by cross validation when fitted}
#' \item{\code{lambda_candidates}}{numeric vector, if specified, used as the lambda values with which models are fitted. if NULL, automatically chosen}
#' \item{\code{lambda_min_ratio}}{smallest value of lambda as a fraction to the maximum, which is computed automatically.}
#' \item{\code{nlambda}}{number of lambda values evaluated}
#' \item{\code{nfolds}}{number of cross validation folds, used if \code{choose_lambda=TRUE}}
#' \item{\code{parallel}}{logical indicating parallel computation when conducting cross validation, used if \code{choose_lambda=TRUE}}
#' \item{\code{loss}}{loss function used for cross validation. either 'deviance', 'class', 'auc', 'mse', or 'mae', used if \code{choose_lambda=TRUE}}
#' \item{\code{modified_newton}}{if TRUE, uses an upper bound on the hessian instead of the exact}
#' \item{\code{group_multinom}}{if TRUE, uses group lasso for a variable in multinomial model}
#' \item{\code{standardize_response}}{if TRUE, output variables are standardized in multiple gaussian model}
#' \item{\code{tol}}{numeric value of convergence criterion}
#' \item{\code{maxit}}{maximum number of iteration}
#'
#' }
#' @section Value:
#' \code{GLM} class object
#'
#' @section Class Methods:
#' \describe{
#' \item{\preformatted{fit(x, y)}}{fit the model}
#' \item{\preformatted{predict(x, ...)}}{return predicted values}
#' \item{\preformatted{incr_fit(x, y)}}{not implemented}
#' \item{\preformatted{predict_proba(x, ...)}}{return probability prediction}
#'
#' \item{\preformatted{get_coef(lambda = NULL, nonzero_only = FALSE)}}{return the coefficients}
#' \item{\preformatted{mse(x, y)}}{return the mean squared error}
#' \item{\preformatted{cross_entropy(x, y)}}{return the cross entropy loss if appropriate}
#' \item{\preformatted{accuracy(x, y)}}{return the classification accuracy if appropriate}
#' }
#'
#' @section Details:
#' uses \code{\link[glmnet]{glmnet}} and \code{\link[glmnet]{cv.glmnet}} as the backend
#'
#' @examples
#' set.seed(123)
#' x <- matrix(rnorm(100*20),100,20)
#' y <- rnorm(100)
#' g <- linear_regression()
#' g$fit(x, y)
#' cor(y, g$predict(x))
#' g$mse(x, y)
#'
#' # setting lambda=0 is equivalent to lm
#' g <- linear_regression(lambda=0)
#' g$fit(x, y)
#' lmfit <- lm(y ~ x)
#' cbind(g$get_coef(), coefficients(lmfit))
#'
#' # logistic regression
#' y <- sample(0:1, 100, replace=TRUE)
#' g <- logistic_regression()
#' g$fit(x, y)
#' table(g$predict(x), y)
#' g$accuracy(x, y)
#' g$cross_entropy(x, y)
#' # y can be factor or character
#' y <- sample(c('A', 'B'), 100, replace=TRUE)
#' g$fit(x, y)
#' table(g$predict(x), y)
#' g$accuracy(x, y)
#' g$cross_entropy(x, y)
#'
#' # multinomial regression
#' y <- sample(c('l', 'm', 's'), 100, replace=TRUE)
#' g <- multinomial_regression()
#' g$fit(x, y)
#' table(g$predict(x), y)
#' g$accuracy(x, y)
#' g$cross_entropy(x, y)
#'
#' # poisson regression
#' y <- sample(0:5, 100, prob=c(2,3,3,2,1,1), replace=TRUE)
#' g <- poisson_regression()
#' g$fit(x, y)
#' cor(y, g$predict(x))
NULL
# #' @export
#glm_fit <- GLM$new
#' @export
linear_regression <- function(...) { GLM$new(family='gaussian', ...) }
#' @export
linear_regression_multi <- function(...) { GLM$new(family='mgaussian', ...) }
#' @export
logistic_regression <- function(...) { GLM$new(family='binomial', ...) }
#' @export
multinomial_regression <- function(...) { GLM$new(family='multinomial', ...) }
#' @export
poisson_regression <- function(...) { GLM$new(family='poisson', ...) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.