R/demeanlist.R
In lfe: Linear Group Fixed Effects

Documented in demeanlist

#' Centre vectors on multiple groups
#'
#' @description
#'  Uses the method of alternating projections to centre
#'  a (model) matrix on multiple groups, as specified by a list of factors.
#'  This function is called by [felm()], but it has been
#'  made available as standalone in case it's needed. In particular, if
#' one does not need transformations provided by R-formulas but have the covariates present
#' as a matrix or a data.frame, a substantial amount of time can be saved in the centering.
#' @export
# \usage{
# demeanlist(mtx, fl, icpt=0, eps=getOption('lfe.eps'),
#           threads=getOption('lfe.threads'),
#           progress=getOption('lfe.pint'),
#           accel=getOption('lfe.accel'),
#           randfact=TRUE,
#           means=FALSE,
#           weights=NULL,
#           scale=TRUE,
#           .inplace=FALSE)
# }

#' @param mtx matrix whose columns form vectors to be group-centred. mtx
#'  can also be a list of vectors or matrices, such as a data frame.
#' @param fl list of factors defining the grouping structure.
#' @param icpt the position of the intercept, this column is removed from
#' the result matrix.
#' @param eps a tolerance for the centering.
#' @param threads an integer specifying the number of threads to use.
#' @param progress integer. If positive, make progress reports (whenever a
#' vector is centered, but not more often than every `progress` minutes).
#' @param accel integer. Set to 1 if Gearhart-Koshy acceleration should be done.
#' @param randfact logical. Should the order of the factors be randomized?
#' This may improve convergence.
#' @param means logical. Should the means instead of the demeaned matrix be
#'  returned? Setting `means=TRUE` will return `mtx -  demeanlist(mtx,...)`,
#' but without the extra copy.
#' @param weights numeric. For weighted demeaning.
#' @param scale logical. Specify scaling for weighted demeaning.
#' @param na.rm logical which indicates what should happen when the data
#' contain `NA`s. If TRUE, rows in the input `mtx` are removed
#' prior to centering. If FALSE, they are kept, leading to entire groups becoming NA
#' in the output.
#' @param attrs list. List of attributes which should be attached to the output.
#' Used internally.
#'
#' @details
#' For each column `y` in `mtx`, the equivalent of the
#' following centering is performed, with `cy` as the result.
#' \preformatted{
#' cy <- y; oldy <- y-1
#' while(sqrt(sum((cy-oldy)**2)) >= eps) {
#'  oldy <- cy
#'  for(f in fl) cy <- cy - ave(cy,f)
#' }
#' }
#'
#' Each factor in `fl` may contain an
#' attribute `'x'` which is a numeric vector of the same length as
#' the factor. The centering is then not done on the means of each group,
#' but on the projection onto the covariate in each group.  That is, with a
#' covariate `x` and a factor `f`, it is like projecting out the
#' interaction `x:f`.  The `'x'` attribute can also be a matrix of column
#' vectors, in this case it can be beneficial to orthogonalize the columns,
#' either with a stabilized Gram-Schmidt method, or with the simple
#' method `x \%*\% solve(chol(crossprod(x)))`.
#'
#' The `weights` argument is used if a weighted projection is
#' computed.  If \eqn{W} is the diagonal matrix with `weights` on the
#' diagonal, `demeanlist` computes \eqn{W^{-1} M_{WD} W x} where \eqn{x} is
#' the input vector, \eqn{D} is the matrix of dummies from `fl` and
#' \eqn{M_{WD}} is the projection on the orthogonal complement of
#' the range of the matrix \eqn{WD}.  It is possible to implement the
#' weighted projection with the `'x'` attribute mentioned above, but
#' it is a separate argument for convenience.
#' If `scale=FALSE`, `demeanlist` computes \eqn{M_{WD} x} without
#' any \eqn{W} scaling.
#' If `length(scale) > 1`, then `scale[1]` specifies whether
#' the input should be scaled by \eqn{W}, and `scale[2]` specifies
#' whether the output should be scaled by \eqn{W^{-1}}.  This is just
#' a convenience to save some memory copies in other functions in the package.
#'
#' Note that for certain large datasets the overhead in [felm()]
#' is large compared to the time spent in `demeanlist`. If the data
#' are present directly without having to use the formula-interface to
#' `felm` for transformations etc, it is possible to run
#' `demeanlist` directly on a matrix or `"data.frame"` and do the
#' OLS "manually", e.g. with something like
#' `cx <- demeanlist(x,...); beta <- solve(crossprod(cx), crossprod(cx,y))`
#'
#' In some applications it is known that a single centering iteration is
#' sufficient. In particular, if `length(fl)==1` and there is no
#' interaction attribute `x`.  In this case the centering algorithm is
#' terminated after the first iteration. There may be other cases, e.g. if
#' there is a single factor with a matrix `x` with orthogonal columns. If
#' you have such prior knowledge, it is possible to force termination after
#' the first iteration by adding an attribute `attr(fl, 'oneiter') <-
#' TRUE`.  Convergence will be reached in the second iteration anyway, but
#' you save one iteration, i.e. you double the speed.
#'
#' @return
#' If `mtx` is a matrix, a matrix of the same shape, possibly with
#' column `icpt` deleted.
#'
#' If `mtx` is a list of vectors and matrices, a list of the same
#' length is returned, with the same vector and matrix-pattern, but the
#' matrices have the column `icpt` deleted.
#'
#' If `mtx` is a `'data.frame'`, a `'data.frame'`
#' with the same names is returned; the `icpt` argument is ignored.
#'
#' If `na.rm` is specified, the return value has an attribute `'na.rm'` with a vector of
#' row numbers which has been removed. In case the input is a matrix or list, the same rows
#' are removed from all of them. Note that removing NAs incurs a copy of the input, so if
#' memory usage is an issue and many runs are done, one might consider removing NAs from the data set entirely.
#' @note
#' The `accel` argument enables Gearhart-Koshy acceleration as
#' described in Theorem 3.16 by Bauschke, Deutsch, Hundal and Park in "Accelerating the
#' convergence of the method of alternating projections",
#' Trans. Amer. Math. Soc. 355 pp 3433-3461 (2003).
#'
#' `demeanlist` will use an in place transform to save memory, provided the `mtx`
#' argument is unnamed. Thus, as always in R, you shouldn't use temporary variables
#' like `tmp <- fun(x[v,]); bar <- demeanlist(tmp,...); rm(tmp)`, it will be much better to
#' do `bar <- demeanlist(fun(x[v,]),...)`. However, demeanlist allows a construction like
#' `bar <- demeanlist(unnamed(tmp),...)` which will use an in place transformation, i.e. tmp
#' will be modified, quite contrary to the usual semantics of R.
#'
#' @examples
#' oldopts <- options("lfe.threads")
#' options(lfe.threads = 2)
#' ## create a matrix
#' mtx <- data.frame(matrix(rnorm(999), ncol = 3))
#' # a list of factors
#' rgb <- c("red", "green", "blue")
#' fl <- replicate(4, factor(sample(rgb, nrow(mtx), replace = TRUE)), simplify = FALSE)
#' names(fl) <- paste("g", seq_along(fl), sep = "")
#' # centre on all means
#' mtx0 <- demeanlist(mtx, fl)
#' head(data.frame(mtx0, fl))
#' # verify that the group means for the columns are zero
#' lapply(fl, function(f) apply(mtx0, 2, tapply, f, mean))
#' options(oldopts)
demeanlist <- function(mtx, fl, icpt = 0L, eps = getOption("lfe.eps"),
                       threads = getOption("lfe.threads"),
                       progress = getOption("lfe.pint"),
                       accel = getOption("lfe.accel"),
                       randfact = TRUE,
                       means = FALSE,
                       weights = NULL,
                       scale = TRUE,
                       na.rm = FALSE,
                       attrs = NULL) {
  if (length(fl) == 0) {
    if (means) {
      foo <- unlist(utils::as.relistable(mtx))
      foo[] <- 0
      return(utils::relist(foo))
    }
    return(eval.parent(substitute(mtx)))
    #    return(mtx)
  }

  # Here we used to just .Call(C_demeanlist, mtx, fl,...)
  # However, we have rewritten this part because C_demeanlist checks the NAMED
  # status of mtx, to avoid copying if possible. But mtx is certainly NAMED > 0 inside
  # this function.  So instead, we evaluate C_demeanlist in the parent environment, without
  # touching the actual arguments in here
  mf <- match.call()
  ff <- formals(sys.function())
  # This is the argument sent to C_demeanlist, in this order:
  ff <- ff[match(
    c("mtx", "fl", "icpt", "eps", "threads", "progress", "accel", "means", "weights", "scale", "attrs"),
    names(ff), 0L
  )]
  m <- names(ff)[names(ff) %in% names(mf)]
  ff[m] <- mf[m]
  # make a new environment to store our reordered fl, enclosed by our caller's frame
  # This is just to avoid clutter in case of error messages
  env <- new.env(parent = parent.frame())
  assign("C_demeanlist", C_demeanlist, envir = env)
  assign("unnamed", unnamed, envir = env)
  assign(".fl", eval.parent(ff[["fl"]]), envir = env)
  .fl <- NULL # avoid check warning
  ff[["fl"]] <- quote(.fl)
  if (randfact && length(get(".fl", env)) > 2) {
    # This is delayed for the bizarre reason that before we rewrote this code
    # mtx was forced before reordering of fl, so we just keep it that way to avoid
    # changing the test output.
    delayedAssign("..fl", .fl[order(runif(length(.fl)))], env, env)
    ff[["fl"]] <- quote(..fl)
  }

  # Then some NA-handling, at the request of David Hugh-Jones, June 11, 2018
  badrows <- NULL
  delist <- FALSE
  isDT <- FALSE
  if (isTRUE(na.rm)) {
    # we need to touch and copy the mtx and fl
    # mtx is either a vector, a matrix or a list (data.frame/data.table) of such
    mtx <- eval.parent(ff[["mtx"]])
    if (!is.list(mtx)) {
      mtx <- list(mtx)
      delist <- TRUE
    }
    # find bad rows in any of the elements of mtx
    badrows <- NULL
    for (i in seq_along(mtx)) {
      object <- mtx[[i]]
      d <- dim(object)
      if (length(d) > 2L) next
      bad <- seq_along(object)[is.na(object)]
      if (length(bad) == 0L) next
      if (length(d)) {
        bad <- unique(((bad - 1) %% d[1L]) + 1L)
      }
      badrows <- union(badrows, bad)
    }
    if (length(badrows) > 0) {
      isDF <- is.data.frame(mtx)
      newmtx <- lapply(mtx, function(m) if (length(dim(m)) > 1) m[-badrows, , drop = FALSE] else m[-badrows])
      rm(mtx)
      if (isDF) newmtx <- as.data.frame(newmtx)
      if (delist) newmtx <- newmtx[[1]]
      fl <- eval(ff[["fl"]], env)
      N <- length(fl[[1]])
      newfl <- lapply(fl, function(f) factor(f[-badrows]))
      assign(".mtx", newmtx, envir = env)
      assign(".fl", newfl, envir = env)
      ff[["fl"]] <- quote(.fl)
      rm(newfl)
      ff[["attrs"]] <- c(ff[["attrs"]], list(na.rm = sort(badrows)))
      ff[["mtx"]] <- quote(unnamed(.mtx))
    } else {
      assign(".mtx", mtx, envir = env)
      ff[["mtx"]] <- quote(.mtx)
    }
  }
  eval(as.call(c(list(quote(.Call), quote(C_demeanlist)), ff)), env)
}