R/cvaGlmnetFormula.r

Defines functions minlossplot.cva.glmnet minlossplot plot.cva.glmnet print.cva.glmnet.formula coef.cva.glmnet predict.cva.glmnet.formula predict.cva.glmnet cva.glmnet.formula cva.glmnet.default cva.glmnet

Documented in coef.cva.glmnet cva.glmnet cva.glmnet.default cva.glmnet.formula minlossplot minlossplot.cva.glmnet plot.cva.glmnet predict.cva.glmnet predict.cva.glmnet.formula print.cva.glmnet.formula

#' @include glmnetUtils.r
NULL

#' @name cva.glmnet
#' @export
cva.glmnet <- function(x, ...)
UseMethod("cva.glmnet")

#' Do elastic net cross-validation for alpha and lambda simultaneously
#'
#' @param x A matrix of predictor variables; or for the plotting methods, an object returned by `cva.glmnet`.
#' @param y A response vector or matrix (for a multinomial response).
#' @param alpha A vector of alpha values for which to do cross-validation. The default is a sequence of 11 values more closely spaced around alpha = 0. For the `predict` and `coef` methods, the specific value of alpha for which to return predictions/regression coefficients.
#' @param nfolds The number of cross-validation folds to use. Defaults to 10.
#' @param foldid Vector of fold IDs for cross-validation. See [glmnet::cv.glmnet].
#' @param outerParallel Method of parallelising the outer loop over alpha. See 'Details' below. If `NULL`, the loop is run sequentially.
#' @param checkInnerParallel If the outer loop is run in parallel, check that the inner loop over lambda will not be in contention for cores.
#'
#' @details
#' The `cva.glmnet` function does simultaneous cross-validation for both the alpha and lambda parameters in an elastic net model. The procedure is as outlined in the documentation for [glmnet::cv.glmnet]: it creates a vector `foldid` allocating the observations into folds, and then calls `cv.glmnet` in a loop over different values of alpha, but the same values of `foldid` each time.
#'
#' Optionally this loop over alpha can be parallelised; currently, `cva.glmnet` knows about two methods of doing so:
#'
#' * Via [parLapply] in the parallel package. To use this, set `outerParallel` to a valid cluster object created by [makeCluster].
#' * Via `rxExec` as supplied by Microsoft R Server's RevoScaleR package. To use this, set `outerParallel` to a valid compute context created by `RxComputeContext`, or a character string specifying such a context.
#'
#' If the outer loop is run in parallel, `cva.glmnet` can check if the inner loop (over lambda) is also set to run in parallel, and disable this if it would lead to contention for cores. This is done if it is likely that the parallelisation is local on a multicore machine, ie if `outerParallel` is a `SOCKcluster` object running on `"localhost"`, or if the RevoScaleR compute context is local parallel.
#'
#' @seealso
#' [glmnet::cv.glmnet]
#' @rdname cva.glmnet
#' @method cva.glmnet default
#' @importFrom parallel parLapply
#' @importFrom glmnet cv.glmnet
#' @export
cva.glmnet.default <- function(x, y, alpha=seq(0, 1, len=11)^3, nfolds=10,
                               foldid=sample(rep(seq_len(nfolds), length=nrow(x))),
                               ..., outerParallel=NULL, checkInnerParallel=TRUE)
{
    cl <- match.call()

    .cvfunc <- function(a, xmat, y, nfolds, foldid, ...)
    {
        glmnet::cv.glmnet(x, y, alpha=a, nfolds=nfolds, foldid=foldid, ...)
    }

    .chkPar <- function()
    {
        if(checkInnerParallel && isTRUE(dotargs$parallel))
        {
            warning("Setting parallel to FALSE for inner loop over lambda", call.=FALSE)
            dotargs$parallel <<- FALSE
        }
    }

    if(length(foldid) != nrow(x) || !is.numeric(foldid))
        stop("invalid foldid specified")
    dotargs <- list(...)

    # do not explicitly import RevoScaleR; this allows use with non-Microsoft R installs
    # hide Revo from R CMD check, in case we ever want to put this on CRAN
    rxContexts <- c("RxLocalSeq", "local", "RxLocalParallel", "localpar", "RxHadoopMR", "hadoopmr", "RxSpark", "spark",
                    "RxInTeradata", "teradata", "RxForeachDoPar", "dopar", "RxInSqlServer", "sqlserver")
    if(is.character(outerParallel) && (outerParallel %in% rxContexts) && eval(parse(text="require(RevoScaleR)")))
        outerParallel <- eval(parse(text="RevoScaleR::RxComputeContext"))(outerParallel)

    lst <- if(inherits(outerParallel, "cluster"))
    {
        if(inherits(outerParallel, "SOCKcluster") && identical(outerParallel[[1]]$host, "localhost"))
            .chkPar()
        do.call(parallel::parLapply, c(list(outerParallel, alpha, .cvfunc, xmat=x, y=y, nfolds=nfolds, foldid=foldid),
                dotargs))
    }
    else if(inherits(outerParallel, "RxComputeContext"))  # again hide Revo from checks
    {
        eval(parse(text='
        oldContext <- rxSetComputeContext(outerParallel)
        on.exit(rxSetComputeContext(oldContext))
        if(is(outerParallel, "RxLocalParallel"))
            .chkPar()
        do.call(rxExec, c(list(.cvfunc, a=rxElemArg(alpha), xmat=x, y=y, nfolds=nfolds,
                foldid=foldid), dotargs))'))
    }
    else if(is.null(outerParallel))
    {
        lapply(alpha, .cvfunc, xmat=x, y=y, nfolds=nfolds, foldid=foldid, ...)
    }
    else stop("unknown value for outerParallel")

    out <- list(alpha=alpha, nfolds=nfolds, modlist=lst, call=cl)
    class(out) <- "cva.glmnet"
    out
}

#' @param formula A model formula; interaction terms are allowed and will be expanded per the usual rules for linear models.
#' @param data A data frame or matrix containing the variables in the formula.
#' @param weights An optional vector of case weights to be used in the fitting process. If missing, defaults to an unweighted fit.
#' @param offset An optional vector of offsets, an _a priori_ known component to be included in the linear predictor.
#' @param subset An optional vector specifying the subset of observations to be used to fit the model.
#' @param na.action A function which indicates what should happen when the data contains missing values. For the `predict` method, `na.action = na.pass` will predict missing values with `NA`; `na.omit` or `na.exclude` will drop them.
#' @param drop.unused.levels Should factors have unused levels dropped? Defaults to `FALSE`.
#' @param xlev A named list of character vectors giving the full set of levels to be assumed for each factor.
#' @param sparse Should the model matrix be in sparse format? This can save memory when dealing with many factor variables, each with many levels.
#' @param use.model.frame Should the base [model.frame] function be used when constructing the model matrix? This is the standard method that most R modelling functions use, but has some disadvantages. The default is to avoid `model.frame` and construct the model matrix term-by-term; see [discussion][glmnet.model.matrix].
#'
#' @details
#' There are two ways in which the matrix of predictors can be generated. The default, with `use.model.frame = FALSE`, is to process the additive terms in the formula independently. With wide datasets, this is much faster and more memory-efficient than the standard R approach which uses the `model.frame` and `model.matrix` functions. However, the resulting model object is not exactly the same as if the standard approach had been used; in particular, it lacks a bona fide [terms] object. If you require interoperability with other packages that assume the standard model object structure, set `use.model.frame = TRUE`. See [discussion][glmnet.model.matrix] for more information on this topic.
#'
#' @section Value:
#' For `cva.glmnet.default`, an object of class `cva.glmnet`. This is a list containing the following:
#'
#' * `alpha` The vector of alpha values
#' * `nfolds` The number of folds
#' * `modlist` A list of `cv.glmnet` objects, containing the cross-validation results for each value of alpha
#'
#' The function `cva.glmnet.formula` adds a few more components to the above, to facilitate working with formulas.
#'
#' @examples
#' cva <- cva.glmnet(mpg ~ ., data=mtcars)
#' predict(cva, mtcars, alpha=1)
#'
#' \dontrun{
#'
#' # Leukemia example dataset from Trevor Hastie's website
#' download.file("https://web.stanford.edu/~hastie/glmnet/glmnetData/Leukemia.RData",
#'               "Leukemia.RData")
#' load("Leukemia.Rdata")
#' leuk <- do.call(data.frame, Leukemia)
#' leuk.cva <- cva.glmnet(y ~ ., leuk, family="binomial")
#' leuk.pred <- predict(leuk.cva, leuk, which=6)
#' }
#' @rdname cva.glmnet
#' @method cva.glmnet formula
#' @export
cva.glmnet.formula <- function(formula, data, ..., weights=NULL, offset=NULL, subset=NULL,
                               na.action=getOption("na.action"), drop.unused.levels=FALSE, xlev=NULL,
                               sparse=FALSE, use.model.frame=FALSE)
{
    # must use NSE to get model.frame emulation to work
    cl <- match.call(expand.dots=FALSE)
    cl[[1]] <- if(use.model.frame)
        makeModelComponentsMF
    else makeModelComponents
    xy <- eval.parent(cl)

    model <- cva.glmnet.default(xy$x, xy$y, weights=xy$weights, offset=xy$offset, ...)
    model$call <- match.call()
    model$terms <- xy$terms
    model$xlev <- xy$xlev
    model$sparse <- sparse
    model$use.model.frame <- use.model.frame
    model$na.action <- na.action
    class(model) <- c("cva.glmnet.formula", class(model))
    model
}


#' @param object For the `predict` and `coef` methods, an object returned by `cva.glmnet`.
#' @param newx For the `predict` method, a matrix of predictor variables.
#' @param which An alternative way of specifying alpha; the index number of the desired value within the alpha vector. If both `which` and `alpha` are supplied, the former takes precedence.
#' @param ... Further arguments to be passed to lower-level functions. In the case of `cva.glmnet`, these arguments are passed to `cv.glmnet`; for `predict` and `coef`, they are passed to `predict.cv.glmnet`; and for `plot` and `minlossplot`, to `plot`.
#'
#' @details
#' The `predict` method computes predictions for a specific alpha value given a `cva.glmnet` object. It looks up the supplied alpha (possibly supplied indirectly via the `which` argument) in the object's stored `alpha` vector, and calls `glmnet:::predict.cv.glmnet` on the corresponding `cv.glmnet` fit. All the arguments to that function are (or should be) supported.
#'
#' @seealso
#' [glmnet::predict.cv.glmnet], [glmnet::coef.cv.glmnet]
#'
#' @method predict cva.glmnet
#' @rdname cva.glmnet
#' @export
predict.cva.glmnet <- function(object, newx, alpha, which=match(TRUE, abs(object$alpha - alpha) < 1e-8), ...)
{
    if(is.na(which))
        stop("supplied alpha value not found")
    mod <- object$modlist[[which]]
    predict(mod, newx, ...)
}


#' @param newdata For the `predict` and `coef` methods, a data frame containing the observations for which to calculate predictions.
#'
#' @section Value:
#' For the `predict` method, a vector or matrix of predicted values.
#'
#' @method predict cva.glmnet.formula
#' @rdname cva.glmnet
#' @export
predict.cva.glmnet.formula <- function(object, newdata, alpha, which=match(TRUE, abs(object$alpha - alpha) < 1e-8),
                                           na.action=na.pass, ...)
{
    # must use NSE to get model.frame emulation to work
    cl <- match.call(expand.dots=FALSE)
    cl$formula <- delete.response(object$terms)
    cl$data <- cl$newdata
    cl$newdata <- NULL
    cl$xlev <- object$xlev
    cl[[1]] <- if(object$use.model.frame)
        makeModelComponentsMF
    else makeModelComponents

    xy <- eval.parent(cl)
    x <- xy$x
    offset <- xy$offset

    predict.cva.glmnet(object, x, which=which, ...)
}


#' @details
#' The `coef` method is similar, returning the coefficients for the selected alpha value via `glmnet:::coef.cv.glmnet`.
#'
#' @section Value:
#' For the `coef` method, a vector of regularised regression coefficients.
#'
#' @method coef cva.glmnet
#' @rdname cva.glmnet
#' @export
coef.cva.glmnet <- function(object, alpha, which=match(TRUE, abs(object$alpha - alpha) < 1e-8), ...)
{
    if(is.na(which))
        stop("supplied alpha value not found")
    mod <- object$modlist[[which]]
    coef(mod, ...)
}


#' @rdname cva.glmnet
#' @method print cva.glmnet.formula
#' @export
print.cva.glmnet.formula <- function(x, ...)
{
    cat("Call:\n")
    dput(x$call)
    cat("\nModel fitting options:")
    cat("\n    Sparse model matrix:", x$sparse)
    cat("\n    Use model.frame:", x$use.model.frame)
    cat("\n    Alpha values:", x$alpha)
    cat("\n    Number of crossvalidation folds for lambda:", x$nfolds)
    cat("\n")
    invisible(x)
}


#' @param log.x Whether to plot the X-axis (lambda) on the log scale. Defaults to TRUE, which for most lambda sequences produces a more reasonable looking plot. If your lambda sequence includes zero, set this to FALSE.
#' @param legend.x,legend.y Location for the legend. Defaults to the top-left corner of the plot. Set either of these to NULL to omit the legend.
#' @details
#' The plot method for `cva.glmnet` objects plots the average cross-validated loss by lambda, for each value of alpha. Each line represents one `cv.glmnet` fit, corresponding to one value of alpha. Note that the specific lambda values can vary substantially by alpha.
#'
#' @seealso
#' [cva.glmnet], [glmnet::cv.glmnet], [plot]
#' @method plot cva.glmnet
#' @rdname cva.glmnet
#' @export
plot.cva.glmnet <- function(x, ..., legend.x=xlim[1], legend.y=xlim[2], log.x=TRUE)
{
    n <- length(x$modlist)
    cvm <- sapply(x$modlist, "[[", "cvm", simplify=FALSE)
    oldPal <- palette(topo.colors(n))
    on.exit(palette(oldPal))
    ylab <- x$modlist[[1]]$name
    xlst <- lapply(x$modlist, "[[", "lambda")
    ylst <- lapply(x$modlist, "[[", "cvm")
    xlim <- if(log.x) log(range(xlst)) else range(xlst)
    ylim <- range(ylst)
    xlab <- if(log.x) "log Lambda" else "Lambda"
    plot(NA, xlim=xlim, ylim=ylim, xlab=xlab, ylab=x$modlist[[1]]$name, type="n", ...)
    for(i in seq_along(cvm))
    {
        xvals <- if(log.x) log(xlst[[i]]) else xlst[[i]]
        lines(xvals, ylst[[i]], col=i)
    }
    if(!is.null(legend.x) && !is.null(legend.y))
        graphics::legend(xlim[1], ylim[2], x$alpha, col=seq_along(x$alpha), lty=1)
    invisible(x)
}


#' @rdname cva.glmnet
#' @export
minlossplot <- function(x, ...)
UseMethod("minlossplot")


#' @param cv.type For `minlossplot`, which cross-validated loss value to plot for each value of alpha. This can be either `"min"` which is the minimum loss, or `"1se"` which is the highest loss within 1 standard error of the minimum. The default is `"1se"`.
#'
#' @details
#' The `minlossplot` function gives the best (lowest) cross-validated loss for each value of alpha.
#'
#' @rdname cva.glmnet
#' @export
minlossplot.cva.glmnet <- function(x, ..., cv.type=c("1se", "min"))
{
    alpha <- x$alpha
    cv.type <- match.arg(cv.type)
    cv.type <- paste0("lambda.", cv.type)
    cvm <- sapply(x$modlist, function(mod) {
        mod$cvm[mod$lambda == mod[[cv.type]]]
    })
    plot(alpha, cvm, ylab="CV loss", ...)
    invisible(x)
}

Try the glmnetUtils package in your browser

Any scripts or data that you put into this service are public.

glmnetUtils documentation built on Sept. 10, 2023, 5:06 p.m.