Nothing
#### Utilities for Sparse Model Matrices
## > from <- factor(sample(c(1:100, NA), size = 1e+04, replace = TRUE,
## + prob = rep.int(c(1, 20), c(100L, 1L))))
## > microbenchmark::microbenchmark(
## + Matrix:::fac2sparse1(from, drop.unused.levels = FALSE),
## + Matrix:::fac2sparse2(from, drop.unused.levels = FALSE),
## + Matrix:::fac2sparse (from, drop.unused.levels = FALSE),
## + times = 10000L)
## Unit: microseconds
## expr min lq mean median uq max neval
## Matrix:::fac2sparse1(from, drop.unused.levels = FALSE) 94.997 105.001 118.6117 107.789 110.413 4439.111 10000
## Matrix:::fac2sparse2(from, drop.unused.levels = FALSE) 538.289 586.341 643.7947 595.197 602.700 43128.556 10000
## Matrix:::fac2sparse (from, drop.unused.levels = FALSE) 163.139 185.115 221.7241 190.117 193.889 44887.784 10000
if(FALSE) {
## The "first" version, no longer used:
fac2sparse1 <- function(from,
to = c("d", "i", "l", "n", "z"),
drop.unused.levels = FALSE)
{
## factor(-like) --> sparseMatrix {also works for integer, character}
fact <- if (drop.unused.levels) factor(from) else as.factor(from)
levs <- levels(fact)
n <- length(fact)
to <- match.arg(to)
## MM: using new() and then assigning slots has efficiency "advantage"
## of *not* validity checking
res <- new(paste0(to, "gCMatrix"))
res@i <- as.integer(fact) - 1L # 0-based
res@p <- 0:n
res@Dim <- c(length(levs), n)
res@Dimnames <- list(levs, NULL)
if(to != "n")
res@x <- rep.int(switch(to,
"d" = 1., "i" = 1L, "l" = TRUE, "z" = 1+0i),
n)
res
}
## The "second" version, no longer used:
## * this one handles NAs correctly but may be slightly less efficient
## than fac2sparse1() above
fac2sparse2 <- function(from,
to = c("d", "i", "l", "n", "z"),
drop.unused.levels = TRUE,
repr = c("C", "T", "R"),
giveCsparse)
{
## factor(-like) --> sparseMatrix {also works for integer, character}
fact <- if (drop.unused.levels) factor(from) else as.factor(from)
levs <- levels(fact)
n <- length(fact)
to <- match.arg(to)
i <- as.integer(fact) - 1L ## 0-based indices
df <- data.frame(i = i, j = if(n) 0:(n-1L) else integer())[!is.na(i),]
if(to != "n")
df$x <- rep.int(switch(to,
"d" = 1., "i" = 1L, "l" = TRUE, "z" = 1+0i),
nrow(df))
T <- do.call(new, c(list(Class = paste0(to, "gTMatrix"),
Dim = c(length(levs), n),
Dimnames = list(levs, names(fact))), df))
## silent, back compatible (not yet warning about 'giveCsparse' deprecation):
repr <- if(missing(repr) && !missing(giveCsparse))
if(giveCsparse) "C" else "T"
else match.arg(repr)
switch(repr,
"C" = .M2C(T),
"T" = T,# TsparseMatrix
"R" = .M2R(T))
}
} # if(FALSE)
## The "third" version, dealing with NAs _and_ fast:
fac2sparse <- function(from,
to = c("d", "l", "n"), # no "i" or "z" yet
drop.unused.levels = TRUE,
repr = c("C", "R", "T"),
giveCsparse)
{
cl <- ".g.Matrix"
substr(cl, 1L, 1L) <- to <- match.arg(to)
substr(cl, 3L, 3L) <- repr <-
if(!missing(repr) || missing(giveCsparse))
match.arg(repr)
else if(giveCsparse)
"C"
else "T"
## Arguments are valid: _now_ allocate
from <-
if(drop.unused.levels && is.factor(from))
factor(from)
else as.factor(from)
n <- length(from)
lv <- levels(from)
nlv <- length(lv)
res <- new(cl)
res@Dim <- c(nlv, n)
res@Dimnames <- list(lv, names(from))
i. <-
switch(repr,
"C" =
{
not.na <- !is.na(from)
res@p <- c(0L, cumsum(not.na))
res@i <- as.integer(from)[not.na] - 1L
},
"R" =
{
res@p <- c(0L, cumsum(tabulate(from, nlv)))
res@j <- order(from, na.last = NA) - 1L
},
"T" =
{
which.not.na <- which(!is.na(from))
res@i <- as.integer(from)[which.not.na] - 1L
res@j <- which.not.na - 1L
}
)
if(to != "n")
res@x <- rep.int(switch(to, "d" = 1, "l" = TRUE, "i" = 1L, "z" = 1+0i),
length(i.))
res
}
setAs("factor", "sparseMatrix", function(from) fac2sparse(from, to = "d"))
## FIXME? support factor->[dlnCRT]sparseMatrix?
##' fac2Sparse() := fac2sparse w/ contrasts
##'
##' @param from factor of which we want the "contrasted" (indicator)
##' design matrix
##' @param to character string specifying the response type
##' @param drop.unused.level logical indicating if non-present factor
##' levels should be dropped, via factor(from)
##' @param factorPatt12 logical vector fp[] of length 2
##' fp[1] : give contrasted t(X); fp[2] : give "dummy" t(X) [=fac2sparse()]
##' @param contrasts.arg character string or NULL or (coercible to)
##' sparseMatrix, specifying the contrast
##'
##' @return a list of length two, each with the corresponding t(model matrix),
##' when the corresponding factorPatt12 is true.
fac2Sparse <- function(from,
to = c("d", "l", "n"), # no "i" or "z" yet
drop.unused.levels = TRUE,
repr = c("C", "R", "T"),
giveCsparse,
factorPatt12,
contrasts.arg = NULL)
{
stopifnot(is.logical(factorPatt12), length(factorPatt12) == 2L)
if(!any(factorPatt12))
return(list(NULL, NULL)) # nothing to do
m <- fac2sparse(from, to = to, drop.unused.levels = drop.unused.levels,
repr = repr, giveCsparse = giveCsparse)
list(if(factorPatt12[1L]) {
## contrasted coding, using 'contrasts.arg'
if(is.null(contrasts.arg))
contrasts.arg <- getOption("contrasts")[[if(is.ordered(from))
"ordered"
else "unordered"]]
crossprod(if(is.character(contrasts.arg)) {
## calling contr.*() with level names directly:
contr <- get(contrasts.arg, mode = "function")
contr(m@Dimnames[[1L]], sparse = TRUE)
} else as(contrasts.arg, "sparseMatrix"),
m)
},
if(factorPatt12[2L])
## uncontrasted ("dummy") coding
m
)
}
## Cut and paste from stats:::deparse2() in stats/R/models.R
deparse2 <- function(x)
paste(deparse(x, width.cutoff = 500L,
backtick = !is.symbol(x) && is.language(x)),
collapse = " ")
## Cut and paste from stats:::model.matrix.default() in stats/R/models.R,
## with some adaptation, most notably at the very end where we do _not_
## call the C-level utility of 'stats'
sparse.model.matrix <- function(object,
data = environment(object),
contrasts.arg = NULL,
xlev = NULL,
transpose = FALSE,
drop.unused.levels = FALSE,
row.names = TRUE,
sep = "",
verbose = FALSE,
...)
{
t <- if(missing(data)) terms(object) else terms(object, data=data)
if (is.null(attr(data, "terms")))
data <- model.frame(object, data, xlev=xlev)
else {
reorder <- match(vapply(attr(t, "variables"), deparse2, "")[-1L],
names(data))
if (anyNA(reorder))
stop("model frame and formula mismatch in sparse.model.matrix()")
if(!identical(reorder, seq_len(ncol(data))))
data <- data[,reorder, drop=FALSE]
}
int <- attr(t, "response")
if(length(data)) {
contr.funs <- as.character(getOption("contrasts"))
namD <- names(data)
## turn any character columns into factors
for(i in namD)
if(is.character(data[[i]]))
data[[i]] <- factor(data[[i]])
isF <- vapply(data, function(x) is.factor(x) || is.logical(x), NA)
isF[int] <- FALSE
isOF <- vapply(data, is.ordered, NA)
for(nn in namD[isF]) # drop response
if(is.null(attr(data[[nn]], "contrasts")))
contrasts(data[[nn]]) <- contr.funs[1 + isOF[nn]]
## it might be safer to have numerical contrasts:
## get(contr.funs[1 + isOF[nn]])(nlevels(data[[nn]]))
if (!is.null(contrasts.arg)) {
if (!is.list(contrasts.arg))
warning("non-list contrasts argument ignored")
else { ## contrasts.arg is a list
if (is.null(namC <- names(contrasts.arg)))
stop("'contrasts.arg' argument must be named")
for (nn in namC) {
if (is.na(ni <- match(nn, namD)))
warning(gettextf("variable '%s' is absent, its contrast will be ignored", nn),
domain = NA)
else {
ca <- contrasts.arg[[nn]]
## contrasts(*, ncol(m)) <- m works also
## for function||character 'm' in R >= 4.2,
## which supports how.many=NULL
if(is.matrix(ca) || is(ca, "Matrix"))
contrasts(data[[ni]], ncol(ca)) <- ca
else contrasts(data[[ni]]) <- ca
}
}
}
} ## non-null contrasts.arg
} else { # no rhs terms ('~1', or '~0'): internal model.matrix needs some variable
isF <- FALSE
data[["x"]] <- raw(nrow(data))
}
## <stats>
## ans <- .External2(C_modelmatrix, t, data)
## if(any(isF))
## attr(ans, "contrasts") <- lapply(data[isF], attr, "contrasts")
## </stats>
## <Matrix>
if(verbose) {
cat("model.spmatrix(t, data, ...) with t =\n")
str(t, give.attr = FALSE)
}
ans <- model.spmatrix(trms = t,
mf = data,
transpose = transpose,
drop.unused.levels = drop.unused.levels,
row.names = row.names,
sep = sep,
verbose = verbose)
## MJ: hmm ... our tests require that this "slot" exists,
## even in the empty case, i.e., !any(isF) ... why?
attr(ans, "contrasts") <- lapply(data[isF], attr, "contrasts")
## </Matrix>
ans
} # sparse.model.matrix
##' Produce the t(Z); Z = "design matrix" of (X : Y), where
##' --- t(Z) : aka rowwise -version : "r"
##'
##' @title sparse model matrix for 2-way interaction
##' @param X and Y either are numeric matrices {maybe 1-column}
##' @param Y or "as(<factor>, sparseM)" {dgCMatrix}
##' @param do.names logical
##' @param forceSparse logical
##' @return
##' @author Martin Maechler
.sparse.interaction.2 <- # formerly sparse2int()
function(X, Y, do.names = TRUE, forceSparse = FALSE, verbose = FALSE)
{
### FIXME -- the X[rep(..), ] * Y[rep(..), ] construct can become HUGE, even for sparse X[],Y[]
### ----- --> Matrix bug #1330 and ~/R/MM/Pkg-ex/Matrix/sparse-matrix-fix.R
## MJ: Moreover, as(<factor>, "sparseMatrix") is a dgCMatrix,
## for which row-indexing is already rather inefficient ... FIXME?
sx <- isS4(X)
sy <- isS4(Y)
nx <- (dx <- dim(X))[1L]
ny <- (dy <- dim(Y))[1L]
if(verbose)
cat(sprintf(".sparse.interaction.2(%s[%d], %s[%d])\n",
if(sx) "<sparse>" else "<N>", nx,
if(sy) "<sparse>" else "<N>", ny))
if(do.names) {
dnx <- dimnames(X)
dny <- dimnames(Y)
}
dimnames(X) <- dimnames(Y) <- list(NULL, NULL)
r <-
if(sx && sy) {
## 'X' and 'Y' are dgCMatrix
(if(ny == 1L) X else X[rep.int(seq_len(nx), times = ny), ]) *
(if(nx == 1L) Y else Y[rep (seq_len(ny), each = nx), ])
} else if (sx) {
## 'X' is a dgCMatrix, 'Y' is a numeric matrix
if(ny <= 1L) {
## FIXME: a similar trick would be applicable for ny > 1
r <- X
dp <- X@p[-1L] - X@p[-(dx[2L]+1L)]
## stopifnot(all(dp %in% 0:1))
r@x <- Y[dp == 1L] * X@x
r
} else {
X[rep.int(seq_len(nx), times = ny), ] *
(if(nx == 1L) Y else Y[rep (seq_len(ny), each = nx), ])
}
} else if(sy) {
## 'X' is a numeric matrix, 'Y' is a dgCMatrix
if(nx <= 1L) {
## FIXME: a similar trick would be applicable for nx > 1
r <- Y
dp <- Y@p[-1L] - Y@p[-(dy[2L]+1L)]
## stopifnot(all(dp %in% 0:1))
r@x <- X[dp == 1L] * Y@x
r
} else {
(if(ny == 1L) X else X[rep.int(seq_len(nx), times = ny), ]) *
Y[rep (seq_len(ny), each = nx), ]
}
} else {
## 'X' and 'Y' are numeric matrices
r <- (if(ny == 1L) X else X[rep.int(seq_len(nx), times = ny), ]) *
(if(nx == 1L) Y else Y[rep (seq_len(ny), each = nx), ])
if(forceSparse) .m2sparse(r, "dgC") else r
}
## FIXME: This 'names' business needs a good solution ...
## but maybe "up in the caller" ...
if(do.names &&
!is.null(dim(r)) &&
!is.null(rnx <- dnx[[1L]]) &&
!is.null(rny <- dny[[1L]]))
dimnames(r)[[1L]] <- outer(rnx, rny, paste, sep = ":")
r
} # .sparse.interaction.2
##' Sparse Model Matrix for a (high order) interaction term A:B:x:C
##'
##' @param rList list(.) of (transposed) single-factor model matrices,
##' belonging to, say, factors a, b, c,...
##' @param do.names
##' @param forceSparse
##' @param verbose
##' @return the model matrix corresponding to a:b:...
.sparse.interaction.N <- # formerly sparseInt.r()
function(rList, do.names = TRUE, forceSparse = FALSE, verbose = FALSE)
{
if((n <- length(rList)) == 0L)
return(NULL) # caller beware
if(verbose)
cat(sprintf(".sparse.interaction.N(<list>[%d], fS=%s): is.mat=(%s)\n",
n, forceSparse, paste0(symnum(vapply(rList, is.matrix, NA)),
collapse = "")),
sep = "")
r <- rList[[1L]]
if(n > 1L)
for(i in 2:n)
r <- .sparse.interaction.2(r, rList[[i]],
forceSparse = forceSparse,
do.names = do.names,
verbose = verbose)
if(!forceSparse)
r
else if(!isS4(r))
.m2sparse(r, "dgC")
else if(is(r, "denseMatrix"))
.dense2sparse(r, "C")
else r
} # .sparse.interaction.N
## MJ: unused
if(FALSE) {
is.model.frame <- function(x)
{
## Purpose: check if x is a "valid" model.frame
## ------------------------------------------------------------
## Author: Martin Maechler, Date: 3 Jul 2009
is.data.frame(x) &&
!is.null(tms <- attr(x, "terms")) &&
inherits(tms, "terms") && ## is.terms() would be better
inherits(tms, "formula") &&
is.matrix(attr(tms, "factors")) &&
is.language(vv <- attr(tms, "variables")) &&
vv[[1]] == as.symbol("list") &&
all(vapply(as.list(vv[-1]), as.character, "") %in% colnames(x))
## all((vars <- sapply(as.list(vv[-1]), as.character)) %in% colnames(x))
## and we could go on testing vars
}
} ## MJ
##' Create a sparse model matrix from a model frame.
##'
##' @title Sparse Model Matrix from Model Frame
##' @param trms a "terms" object
##' @param mf a data frame, typically resulting from model.frame()
##' @param transpose logical indicating if X' = t(X) {is faster!}
##' or X should be returned
##' @param drop.unused.levels logical indicating if unused factor
##' levels should be dropped
##' @param row.names
##' @return sparse matrix (class "dgCMatrix")
##' @author Martin Maechler
model.spmatrix <- function(trms, mf, transpose=FALSE,
drop.unused.levels = FALSE, row.names=TRUE, sep="",
verbose=FALSE)
{
## Author: Martin Maechler, Date: 7 Jul 2009
## mf is a model frame or a "simple" data.frame [after reorder !]
stopifnot(is.data.frame(mf))
n <- nrow(mf)
if(row.names)
rnames <- row.names(mf)
## mf: make into list, dropping all attributes (but the names)
### FIXME: for poly(., 5) mf has a 5-column matrix as "one column" => looses names here
fnames <- names(mf <- unclass(mf))
attributes(mf) <- list(names = fnames)
if(length(factorPattern <- attr(trms, "factors"))) {
d <- dim(factorPattern)
nVar <- d[1]
nTrm <- d[2]
n.fP <- dimnames(factorPattern)
fnames <- n.fP[[1]] # == names of variables {incl. "F(var)"} in the model
Names <- n.fP[[2]] # == colnames == names of terms: "a", "b:c", ...
} else { ## degenerate, e.g. 'Y ~ 1'
nVar <- nTrm <- 0L
fnames <- Names <- character(0)
}
## all the "variables in the model" are also in "mf", including "sin(x)";
## actually, ..../src/main/model.c even assumes
stopifnot((m <- length(mf)) >= nVar)
if(verbose)
cat(sprintf("model.spmatrix(): (n=%d, nVar=%d (m=%d), nTrm=%d)\n",
n, nVar,m, nTrm))
if(m > nVar) mf <- mf[seq_len(nVar)]
stopifnot(fnames == names(mf), allow.logical0 = TRUE)
noVar <- nVar == 0
##>> this seems wrong; we use 1:nVar for indexing mf[] below ..
##>> if(noVar) nVar <- 1L # (as in ~/R/D/r-devel/R/src/main/model.c)
## Note: "character" variables have been changed to factor in the caller;
## hence: both factor and *logical* should be dealt as factor :
is.f <- if(noVar) logical(0) else vapply(mf, function(.)
is.factor(.) | is.logical(.), NA)
indF <- which(is.f)
if(verbose) { cat(" --> indF =\n"); print(indF) }
hasInt <- attr(trms, "intercept") == 1
## the degree of interaction:
## intOrder <- attr(trms, "order")
##
if(!hasInt && length(indF)) {
## change the '1' of the first factor into a '2' :
if(any(i1 <- factorPattern[indF, ] == 1))
## replace at the first '1' location:
factorPattern[indF,][which.max(i1)] <- 2L
else {}
## nothing to do
}
## Convert "factors" to "Rowwise- sparseMatrix ("dummy"-matrix) -----------
## Result: a list of sparse model matrices for the "factor"s :
f.matr <- structure(vector("list", length = length(indF)),
names = fnames[indF])
i.f <- 0
## ---- For each variable in the model -------------------
for(i in seq_len(nVar)) {
nam <- fnames[i]
f <- mf[[i]]
if(is.f[i]) {
fp <- factorPattern[i,] ## == factorPattern[nam,]
contr <- attr(f, "contrasts")
f.matr[[(i.f <- i.f + 1)]] <- # a list of 2
lapply(fac2Sparse(f, to = "d",
drop.unused.levels=drop.unused.levels,
factorPatt12 = 1:2 %in% fp,
contrasts.arg = contr),
function(s) {
if(!is.null(s)) {
## for some contr.*(), have lost rownames; hmm..
if(is.null(rn <- rownames(s)))
rn <- seq_len(nrow(s))
rownames(s) <- paste(nam, rn, sep = sep)
}
s
})
} else { ## continuous variable --> "matrix" - for all of them
if(any(iA <- (cl <- class(f)) == "AsIs")) # drop "AsIs" class
class(f) <- if(length(cl) > 1L) cl[!iA]
nr <- if(is.matrix(f)) nrow(f <- t(f)) else (dim(f) <- c(1L, length(f)))[1]
if(is.null(rownames(f)))
rownames(f) <- if(nr == 1) nam else paste(nam, seq_len(nr), sep=sep)
mf[[i]] <- f
}
}
if(verbose) {
cat(" ---> f.matr list :\n")
str(f.matr, max.level = as.integer(verbose))
fNms <- format(dQuote(Names))
dim.string <- gsub('5', as.character(floor(1+log10(n))),
" -- concatenating (r, rj): dim = (%5d,%5d) | (%5d,%5d)\n")
}
## FIXME: do all this in C --
getR <- function(N) # using 'nm'
if(!is.null(r <- f.matr[[N]])) r[[factorPattern[N, nm]]] else mf[[N]]
vNms <- "(Intercept)"[hasInt]
counts <- integer(nTrm)
r <-
if(hasInt) ## column of 1's - as sparse
new("dgCMatrix", i = 0:(n-1L), p = c(0L, n),
Dim = c(n, 1L), x = rep.int(1, n))
else new("dgCMatrix", Dim = c(n, 0L))
if(transpose) r <- t(r)
iTrm <- seq_len(nTrm)
for(j in iTrm) { ## j-th term
nm <- Names[j]
if(verbose) cat(sprintf("term[%2d] %s .. ", j, fNms[j]))
nmSplits <- strsplit(nm, ":", fixed=TRUE)[[1]]
## NOTA BENE: This can be very slow when many terms are involved
## FIXME ??? why does it use *much* memory in those cases ??
rj <- .sparse.interaction.N(lapply(nmSplits, getR),
do.names = TRUE,
forceSparse = TRUE,
verbose = verbose) # or just (verbose >= 2)
if(verbose) cat(sprintf(dim.string, nrow(r), ncol(r), nrow(rj),ncol(rj)))
## fast version of cbind2() / rbind2(), w/o checks, dimnames, etc
r <- if(transpose) rbind.Matrix(r, rj )
else cbind.Matrix(r, t(rj))
## if(verbose) cat(" [Ok]\n")
vNms <- c(vNms, dimnames(rj)[[1]])
counts[j] <- nrow(rj)
}
rns <- if(row.names) rnames
dimnames(r) <- if(transpose) list(vNms, rns) else list(rns, vNms)
attr(r, "assign") <- c(if(hasInt) 0L, rep(iTrm, counts))
r
} ## model.spmatrix()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.