R/oomglm.R

Defines functions init_oomglm glm_adjust init_weight weight end_weight fit.oomglm iter_weight oomglm update_oomglm print.oomglm

Documented in end_weight fit.oomglm glm_adjust init_oomglm init_weight iter_weight oomglm print.oomglm update_oomglm weight

#' Internal. Initialize Out of memory Generalized Linear Regression Model
#' 
#' @md
#' @param formula A symbolic description of the model to be fitted of class `formula`.
#' @param family A `family` object.
#' @param weights A one-sided, single term `formula` specifying weights. 
#' @param start Starting values for the parameters in the linear predictor.
#' 
#' @keywords internal
init_oomglm <- function(formula, family, weights, start) {
  
  if(is.character(family)) {
    family <- get(family, mode = "function", envir = parent.frame())
  }
  if(is.function(family)) {
    family <- family()
  }
  if(is.null(family$family)) {
    print(family)
    stop("'family' not recognized")
  }
  
  if(!is.null(weights) && !inherits(weights, "formula")) {
    stop("`weights` must be a formula")
  }
  
  irls <- list(
    beta       = start,
    pre_beta   = NULL,
    rss        = 0.0,
    deviance   = 0.0
  )
  
  object <- list(
    converged     = FALSE,
    iter          = 0,
    n             = 0,
    df.residual   = NULL,
    df.null       = NULL,
    formula       = formula,
    family        = family,
    terms         = terms(formula),
    weights       = weights,
    call          = sys.call(-1),
    qr            = NULL,
    names         = NULL,
    assign        = NULL,
    irls          = irls
  )
  
  class(object) <- c("oomglm", "oomlm")
  object
  
}


#' Internal. Apply Generalized Linear Model transformations to `chunk`
#' 
#' @md
#' @param object An `oomglm` model.
#' @param chunk List created by `unpack_oomchunk()`.
#' @keywords internal
glm_adjust <- function(object, chunk) {

  mm     <- chunk$data
  y      <- chunk$response
  w      <- chunk$weights
  offset <- chunk$offset

  fam    <- object$family
  beta   <- object$irls$beta
  rss    <- object$irls$rss
  dev    <- object$irls$deviance

  if(is.null(beta)) {
    eta <- rep(0.0, nrow(mm)) + offset
  } else {
    eta <- mm %*% beta + offset
  }

  g       <- fam$linkinv(eta)
  gprime  <- fam$mu.eta(eta)
  z       <- eta + (y - g) / gprime
  fam_var <- fam$variance(g)
  ww      <- w * gprime ^ 2 / fam_var

  if(!is.null(beta)) {
    rss <-
      rss + sum((y - g) ^ 2 / (w * fam_var)) * (sum(w) / length(w))
    dev <-
      dev + sum(fam$dev.resids(y, g, w))
  }

  list(
    z = z,
    w = ww,
    g = g,
    deviance = dev,
    rss      = rss
  )

}



#' @export
#' @rdname init_weight
init_weight <- function(object) {
  
  if(object$iter > 0) {
    object$irls$beta <- coef(object)
  }
  
  object$irls$rss      <- 0.0
  object$irls$deviance <- 0.0
  object$qr   <- NULL
  object$n    <- 0L

  if(!is.null(object$sandwich)) {
    object$sandwich <- list(xy = NULL)
  }
  
  object
 
}


#' @export
#' @rdname init_weight
weight <- function(object, data) {
  
  if(!inherits(data, c("oomdata", "function", "data.frame"))) {
    stop("class of `data` not recognized")
  }
  
  update_oomglm(object, data)
  
}


#' @export
#' @rdname init_weight
end_weight <- function(object,
                       tolerance = 1e-8) {

  object$iter <- object$iter + 1L 
  beta_old    <- object$irls$beta
  
  if(is.null(beta_old)) {
    return(object)
  }
  
  beta  <- coef(object)
  delta <- (beta_old - beta) / sqrt(diag(vcov(object)))
  object$converged <- max(abs(delta)) < tolerance
  
  object
  
} 


#' Fit `oomglm` model via Iteratively Reweighted Least Squares (IRLS).
#' 
#' @md
#' @param object `oomglm` model.
#' @param data an optional `oomdata_tbl`, `oomdata_dbi`, `oomdata_con`,
#'   `tibble`, `data.frame`, or `list` of observations to fit
#' @param times Maximum number of IRLS iterations to perform. Will 
#'   stop iterating if model converges before `times` iterations.
#' @param tolerance Tolerance used to determine convergence. Represents
#'  change in coefficient as a multiple of standard error.
#' @param ... ignored
#' 
#' @return `oomglm` object after performing `times` IRLS iterations on
#'  `data`.
#' 
#' @method fit oomglm
#' @seealso [oomglm()]
#' @export
#' @rdname fit.oomglm
fit.oomglm <- function(object, data, times = 2L, tolerance = 1e-8, ...) {
  iter_weight(object, data, times, tolerance)
}


#' Fit `oomglm` model via Iteratively Reweighted Least Squares (IRLS).
#' 
#' @md
#' @param object `oomglm` model.
#' @param data an optional `oomdata_tbl`, `oomdata_dbi`, `oomdata_con`,
#'   `tibble`, `data.frame`, or `list` of observations to fit
#' @param times Maximum number of IRLS iterations to perform. Will 
#'   stop iterating if model converges before `times` iterations.
#' @param tolerance Tolerance used to determine convergence. Represents
#'  change in coefficient as a multiple of standard error.
#' @param ... ignored
#' 
#' @return `oomglm` object after performing `times` IRLS iterations on
#'  `data`.
#' 
#' @seealso [fit()]
#' @export
iter_weight <- function(object, data,
                        times = 2L,
                        tolerance = 1e-8) {

  for(i in 1:times) {
    
    if(object$converged) {
      break
    }
    
    object <- init_weight(object)
    object <- weight(object, data)
    object <- end_weight(object, tolerance)
    
  }
  
  object
  
}


#' Out of memory Generalized Linear model
#' 
#' Perform memory-efficient generalized linear regression using 
#' the AS274 bounded memory QR factorization algorithm.
#'   
#' @md
#' @param formula A symbolic description of the model to be fitted of class
#'  `formula`.
#' @param family A `family` object.
#' @param weights A one-sided, single term `formula` specifying weights.
#' @param start Starting values for the parameters in the linear predictor.
#' @param ... Ignored.
#' @details
#'  An `oomglm` model can be in various states of fit depending on the number
#'  of seen observations and rounds of IRLS that have been performed.
#'  It is important to view the model within the context of:
#'  the number of observations processed per round of IRLS (`n`);
#'  the number of IRLS iterations that have been performed (`iter`);
#'  and if the IRLS algorithm has converged (`converged`).
#'
#' @return It is up to the user to know when fitting is complete.
#'  Therefore, only basic model characteristics are provided as values. 
#'  Statistics are available on demand via summary and extractor functions.
#'
#' \item{converged}{Indicates if the IRLS algorithm has converged.}
#' \item{iter}{The number of iterations of IRLS performed.}
#' \item{n}{The number observations processed per round of IRLS.}
#' \item{df.residual}{The residual degrees of freedom.}
#' \item{df.null}{The residual degrees of freedom.}
#' \item{formula}{the `formula` object specifying the linear model.}
#' \item{family}{a `family` object describing the error distribution
#'  and link function used in the model.}
#' \item{terms}{The `terms` object used.}
#' \item{weights}{The weights `formula` provided to the model.}
#' \item{call}{The matched call.}
#' @seealso [oomlm()]
#' @aliases print.oomglm print.summary.oomglm summary.oomglm logLik.oomglm
#' @export
#' @name oomglm
#' @examples \donttest{
#' # The `oomglm()` function employs Iteratively Weighted Least Squares (IWLS).
#' # The IWLS iterations are performed by the function `fit()` which
#' # makes passes over the data until convergence.
#' 
#' # reweight 4 times or until convergence
#' x <- oomglm(mpg ~ cyl + disp)
#' x <- fit(x, mtcars, times = 4)
#' 
#' tidy(x)
#' 
#' # To fit in-memory data in chunks, use `oomdata_tbl()`:
#' 
#' y      <- oomglm(mpg ~ cyl + disp)
#' chunks <- oomdata_tbl(mtcars, chunk_size = 10)
#' y      <- fit(y, chunks, times = 4)
#' 
#' tidy(y)
#'
#' }
oomglm <- function(formula,
                   family   = gaussian(),
                   weights  = NULL,
                   start    = NULL, ...) {

  init_oomglm(formula, family, weights, start)

}


#' Internal. Update `oomglm` object with additional observations
#' 
#' This function is typically called via [fit()] or [weight()].
#' 
#' @param object An `oomglm` model.
#' @param data Observations to fit.
#' @param ... Ignored.
#'
#' @keywords internal
update_oomglm <- function(object, data, ...) {
  
  updater <- function(object, data, ...) {
    
    chunk <- unpack_oomchunk(object, data)
    
    if(is.null(object$assign)) {
      object$assign <- chunk$assign
      object$names  <- colnames(chunk$data)
    }
    
    if(is.null(object$qr)) {
      qr <- new_bounded_qr(chunk$p)
    } else {
      qr <- object$qr
    }
    
    trans  <- glm_adjust(object, chunk)
    
    object$qr <- update(qr,
                        chunk$data,
                        trans$z - chunk$offset,
                        trans$w)
    
    # if(!is.null(object$sandwich)) {
    #   object$sandwich$xy <-
    #     update_sandwich(object$sandwich$xy,
    #                     chunk$data,
    #                     chunk$n,
    #                     chunk$p,
    #                     trans$z,
    #                     chunk$offset,
    #                     trans$w)
    # }
    
    intercept <- attr(object$terms, "intercept") > 0L
    
    object$n             <- object$qr$num_obs
    object$df.residual   <- object$n - chunk$p
    object$df.null       <- object$n - as.integer(intercept)
    object$iwls$rss      <- trans$rss
    object$iwls$deviance <- trans$deviance
    
    object
    
  }
  
  if(inherits(data, what = "oomdata")) {
    while(!is.null(chunk <- data())) {
      object <- updater(object, chunk, ...)
    }
    object
  } else {
    updater(object, data, ...)
  }

}


#' @method print oomglm
#' @export
print.oomglm <- function(x,
                         digits = max(3L, getOption("digits") - 3L), ...) {
  
  cat("\nCall:  ",
      paste(deparse(x$call), sep = "\n", collapse = "\n"),
      "\n\n", sep = "")
  
  if(!is.null(x$se_type)) {
    cat(paste("Standard error type:", x$se_type), "\n\n")  
  }
  
  beta <- coef(x)
  if(length(beta)) {
    cat("Coefficients:\n")
    print.default(
      format(beta, digits = digits),
      print.gap = 2L,
      quote     = FALSE)
  } else {
    cat("No coefficients\n")
  }
  
  cat("\nObservations included: ", x$n, "\n")
  cat("Degrees of Freedom:", x$df.null, "Total (i.e. Null); ",
      x$df.residual, "Residual\n")
  
  cat("Residual Deviance:", format(signif(deviance(x), digits)),
      "\tAIC:", format(signif(AIC(x), digits)), "\n\n")
  
  cat("Converged:", x$converged,
      "\nNumber of Fisher Scoring iterations:", x$iter, "\n")
  
  invisible(x)
  
}
blakeboswell/ploom documentation built on May 25, 2019, 3:24 p.m.