R/fast_big_lm.R
In bigFastlm: Fast Linear Models for Objects from the 'bigmemory' Package

Documented in bigLm bigLm.default bigLmPure predict.bigLm print.bigLm print.summary.bigLm summary.bigLm

## bigLm.R: Rcpp/Eigen/bigmemory implementation of lm()
##
## Copyright (C)  2011 - 2015  Douglas Bates, Dirk Eddelbuettel and Romain Francois
##
## Copyright (C)  2016  Jared Huling
##
## This file is based off of code from RcppEigen.
##
## RcppEigen is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 2 of the License, or
## (at your option) any later version.
##
## RcppEigen is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.



#' fast and memory efficient linear model fitting
#'
#' @param X input model matrix. must be a big.matrix object (type = 8 for double)
#' @param y numeric response vector of length nobs.
#' @param method an integer scalar with value 0 for the LLT Cholesky or 1 for the LDLT Cholesky
#' @param gigs double scalar. maximum number of gigs of memory available. Used to figure out how to break up calculations
#' involving the design matrix X
#' @param nslices integer scalar, defaults to NULL, which defers to the \code{gigs} argument to determine the number of slices
#' required. If specified, nslices determines the number of slices to break up computation of X'X into.
#' @return A list with the elements
#' \item{coefficients}{a vector of coefficients}
#' \item{se}{a vector of the standard errors of the coefficient estimates}
#' \item{rank}{a scalar denoting the computed rank of the model matrix}
#' \item{df.residual}{a scalar denoting the degrees of freedom in the model}
#' \item{residuals}{the vector of residuals}
#' \item{s}{a numeric scalar - the root mean square for residuals}
#' \item{fitted.values}{the vector of fitted values}
#' @export
#' @examples
#'
#' library(bigmemory)
#'
#' nrows <- 50000
#' ncols <- 50
#' bkFile <- "bigmat2.bk"
#' descFile <- "bigmatk2.desc"
#' bigmat <- filebacked.big.matrix(nrow=nrows, ncol=ncols, type="double",
#'                                 backingfile=bkFile, backingpath=".",
#'                                 descriptorfile=descFile,
#'                                 dimnames=c(NULL,NULL))
#'
#' # Each column value with be the column number multiplied by
#' # samples from a standard normal distribution.
#' set.seed(123)
#' for (i in 1:ncols) bigmat[,i] = rnorm(nrows)*i
#'
#' y <- rnorm(nrows) + bigmat[,1]
#'
#' system.time(lmr1 <- bigLmPure(bigmat, y))
#'
#' system.time(lmr2 <- lm.fit(x = bigmat[,], y = y))
#'
#' max(abs(coef(lmr1) - coef(lmr2)))
#'
#'
bigLmPure <- function(X, y, method = 0L, gigs = 2.0, nslices = NULL) {


    method <- as.integer(method[1])

    # calculate number of slices if
    # nslices not specified
    if (is.null(nslices))
    {
        xgigs   <- 8.0 * nrow(X) * ncol(X) / (10 ^ 9)
        nslices <- ceiling(xgigs / gigs)
    } else
    {
        nslices <- as.integer(nslices[1])
    }

    if (nslices >= nrow(X) * 0.5) stop("nslices must be less than 1/2 of the number of observations")

    if (method > 1L)
    {
        stop("invalid method")
    }

    cnames <- colnames(X)
    if (is.null(cnames)) cnames <- paste0("X", 1:ncol(X))

    stopifnot( is.big.matrix(X), is.numeric(y), NROW(y) == nrow(X) )

    res <- .Call("bigLm_Impl", X@address, y, method, nslices, PACKAGE="bigFastlm")
    names(res$coefficients) <- cnames
    res
}

#' fast and memory efficient linear model fitting
#'
#' @param X input model matrix. must be a big.matrix object (type = 8 for double)
#' @param y numeric response vector of length nobs.
#' @param method an integer scalar with value 0 for the LLT Cholesky or 1 for the LDLT Cholesky
#' @param gigs double scalar. maximum number of gigs of memory available. Used to figure out how to break up calculations
#' involving the design matrix X
#' @param nslices integer scalar, defaults to NULL, which defers to the \code{gigs} argument to determine the number of slices
#' required. If specified, nslices determines the number of slices to break up computation of X'X into.
#' @return A list object with S3 class "bigLm" with the elements
#' \item{coefficients}{a vector of coefficients}
#' \item{se}{a vector of the standard errors of the coefficient estimates}
#' \item{rank}{a scalar denoting the computed rank of the model matrix}
#' \item{df.residual}{a scalar denoting the degrees of freedom in the model}
#' \item{residuals}{the vector of residuals}
#' \item{s}{a numeric scalar - the root mean square for residuals}
#' \item{fitted.values}{the vector of fitted values}
#' @useDynLib bigFastlm
#' @import Rcpp
#' @export
#' @examples
#'
#' library(bigmemory)
#'
#' nrows <- 50000
#' ncols <- 50
#' bkFile <- "bigmat.bk"
#' descFile <- "bigmatk.desc"
#' bigmat <- filebacked.big.matrix(nrow=nrows, ncol=ncols, type="double",
#'                                 backingfile=bkFile, backingpath=".",
#'                                 descriptorfile=descFile,
#'                                 dimnames=c(NULL,NULL))
#'
#' # Each column value with be the column number multiplied by
#' # samples from a standard normal distribution.
#' set.seed(123)
#' for (i in 1:ncols) bigmat[,i] = rnorm(nrows)*i
#'
#' y <- rnorm(nrows) + bigmat[,1]
#'
#' system.time(lmr1 <- bigLm(bigmat, y))
#'
#' system.time(lmr2 <- lm.fit(x = bigmat[,], y = y))
#'
#' max(abs(coef(lmr1) - coef(lmr2)))
#'
#'
bigLm <- function(X, ...) UseMethod("bigLm")



#' summary method for bigLm fitted objects
#'
#' @param object bigLm fitted object
#' @param ... not used
#' @return a summary.bigLm object
#' @rdname summary
#' @method summary bigLm
#' @export
#' @examples
#'
#' library(bigmemory)
#'
#' nrows <- 50000
#' ncols <- 15
#' bkFile <- "bigmat4.bk"
#' descFile <- "bigmatk4.desc"
#' bigmat <- filebacked.big.matrix(nrow=nrows, ncol=ncols, type="double",
#'                                 backingfile=bkFile, backingpath=".",
#'                                 descriptorfile=descFile,
#'                                 dimnames=c(NULL,NULL))
#'
#' # Each column value with be the column number multiplied by
#' # samples from a standard normal distribution.
#' set.seed(123)
#' for (i in 1:ncols) bigmat[,i] = rnorm(nrows)*i
#'
#' y <- rnorm(nrows) + bigmat[,1] - bigmat[,2]
#'
#' system.time(lmr1 <- bigLm(bigmat, y))
#'
#' summary(lmr1)
#'
#'
summary.bigLm <- function(object, ...)
{
    coef <- object$coefficients
    se   <- object$se
    tval <- coef/se

    object$coefficients <- cbind(Estimate     = coef,
                                 "Std. Error" = se,
                                 "t value"    = tval,
                                 "Pr(>|t|)"   = 2*pt(-abs(tval), df=object$df))

    ## cf src/stats/R/lm.R and case with no weights and an intercept
    f <- object$fitted.values
    r <- object$residuals
    #mss <- sum((f - mean(f))^2)
    mss <- if (object$intercept) sum((f - mean(f))^2) else sum(f^2)
    rss <- sum(r^2)

    object$r.squared <- mss/(mss + rss)
    df.int <- if (object$intercept) 1L else 0L
    n <- length(f)
    rdf <- object$df
    object$adj.r.squared <- 1 - (1 - object$r.squared) * ((n - df.int)/rdf)
    class(object) <- "summary.bigLm"
    object
}


#' bigLm default
#'
#' @param ... not used
#' @rdname bigLm
#' @method bigLm default
#' @export
bigLm.default <- function(X, y, method = 0L, gigs = 2.0, nslices = NULL, ...) {

    y <- as.numeric(y)

    res <- bigLmPure(X, y, method, gigs, nslices)
    res$call <- match.call()
    res$intercept <- any(big.colMax(X) == big.colMin(X))

    class(res) <- "bigLm"
    res
}

#' print method for bigLm objects
#'
#' @rdname print
#' @method print bigLm
#' @export
print.bigLm <- function(x, ...) {
    cat("\nCall:\n")
    print(x$call)
    cat("\nCoefficients:\n")
    print(x$coefficients, digits=5)
}

#' print method for summary.bigLm objects
#'
#' @param x a "summary.bigLm" object
#' @param ... not used
#' @rdname print
#' @method print summary.bigLm
#' @export
print.summary.bigLm <- function(x, ...) {
    cat("\nCall:\n")
    print(x$call)
    cat("\nResiduals:\n")
    digits <- max(3, getOption("digits") - 3)
    print(summary(x$residuals, digits=digits)[-4])
    cat("\n")

    printCoefmat(x$coefficients, P.values=TRUE, has.Pvalue=TRUE, ...)
    cat("\nResidual standard error: ", formatC(x$s, digits=digits), " on ",
        formatC(x$df), " degrees of freedom\n", sep="")
    cat("Multiple R-squared: ", formatC(x$r.squared, digits=digits),
        ",\tAdjusted R-squared: ",formatC(x$adj.r.squared, digits=digits),
        "\n", sep="")
    invisible(x)
}

#' Prediction method for bigLm fitted objects
#'
#' @param object fitted "bigLm" model object
#' @param newdata big.matrix object. If NULL, then fitted values are returned
#' @param ... not used
#' @return An object depending on the type argument
#' @importFrom bigmemory is.big.matrix as.big.matrix
#' @importFrom stats coef fitted pt printCoefmat
#' @export
#' @examples
#'
#' library(bigmemory)
#'
#' nrows <- 50000
#' ncols <- 50
#' bkFile <- "bigmat3.bk"
#' descFile <- "bigmatk3.desc"
#' bigmat <- filebacked.big.matrix(nrow=nrows, ncol=ncols, type="double",
#'                                 backingfile=bkFile, backingpath=".",
#'                                 descriptorfile=descFile,
#'                                 dimnames=c(NULL,NULL))
#'
#' # Each column value with be the column number multiplied by
#' # samples from a standard normal distribution.
#' set.seed(123)
#' for (i in 1:ncols) bigmat[,i] = rnorm(nrows)*i
#'
#' y <- rnorm(nrows) + bigmat[,1]
#'
#' system.time(lmr1 <- bigLm(bigmat, y))
#'
#'
#' preds <- predict(lmr1, newdata = bigmat)
#'
predict.bigLm <- function(object, newdata = NULL, ...) {
    if (is.null(newdata)) {
        y <- fitted(object)
    } else {
        stopifnot(is.big.matrix(newdata))
        y <- as.vector(newdata %*% coef(object))
    }
    y
}

Any scripts or data that you put into this service are public.

bigFastlm documentation built on May 30, 2017, 7:20 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

bigFastlm
Fast Linear Models for Objects from the 'bigmemory' Package

R/fast_big_lm.R
In bigFastlm: Fast Linear Models for Objects from the 'bigmemory' Package

Defines functions bigLmPure bigLm summary.bigLm bigLm.default print.bigLm print.summary.bigLm predict.bigLm

Documented in bigLm bigLm.default bigLmPure predict.bigLm print.bigLm print.summary.bigLm summary.bigLm

Try the bigFastlm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bigFastlm Fast Linear Models for Objects from the 'bigmemory' Package

R/fast_big_lm.R In bigFastlm: Fast Linear Models for Objects from the 'bigmemory' Package

Defines functions bigLmPure bigLm summary.bigLm bigLm.default print.bigLm print.summary.bigLm predict.bigLm

Documented in bigLm bigLm.default bigLmPure predict.bigLm print.bigLm print.summary.bigLm summary.bigLm

Try the bigFastlm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bigFastlm
Fast Linear Models for Objects from the 'bigmemory' Package

R/fast_big_lm.R
In bigFastlm: Fast Linear Models for Objects from the 'bigmemory' Package