#' makeSparseMat
#'
#' Function to create a sparse design matrix of basis functions
#' based on a matrix of predictors.
#'
#' @param X A \code{data.frame} of predictors
#' @param newX Optional \code{data.frame} on which to return predicted values
#' @param verbose A \code{boolean} indicating whether to print output on functions progress
#' @importFrom plyr llply alply
#' @export
# TODO: don't always have a newX.
makeSparseMat <- function(X, newX = X, verbose = TRUE) {
if (is.vector(X)) X <- matrix(X, ncol = 1)
if (is.vector(newX)) newX <- matrix(newX, ncol = 1)
d <- ncol(X)
stopifnot(ncol(X) == ncol(newX))
nX <- length(X[, 1])
n <- length(newX[, 1])
# numbers used to correct column indices later
colStart <- 1
colEnd <- d
# Start by creating a list of univariate indicators.
# The length of the list is d and the entries are matrices of row and column
# indices for a design matrix based only on that covariate, i.e. columns in
# each list entry run from 1:n, so we can use intersect later for higher order
# terms.
if (verbose) cat("Making", d, "basis functions of dimension 1\n")
# The outer alply loop is over dimension, the inner alply loop is over
# the observed values of data. Given variable x (e.g., the first observed
# predictor variable), we loop over each observed value of that variable
# (the inner loop, running over y) and save the indices of observations
# that are smaller than y. These results are saved in variable j. j are the
# column indices of non-zero observations in the design matrix of indicator
# basis functions. We then make a vector of the same length as j indicating the
# row indices of non-zero observations in the design matrix.
uni <- plyr::alply(matrix(1:d), 1, function(x) {
j <- plyr::alply(matrix(newX[, x]), 1, function(y) {
which(X[, x] <= y)
})
i <- rep(1:n, unlist(lapply(j, length), use.names = FALSE))
cbind(unlist(i, use.names = FALSE), unlist(j, use.names = FALSE))
})
# number of 1's for each variable -- for variables with
# length(unique(x)) == length(x) will be n*(n+1)/2, but if there
# are ties, the length will be different
nperuni <- lapply(uni, nrow)
# rbind all of the univariate results together
uni.ij <- Reduce("rbind", uni)
# currently the column indices (i.e., uni.ij[,2]) run from 1:n,
# whereas (if there were no ties) we would like them to run from
# 1:(n*(2^d -1)). This line adds the correct amount to each of the
# indices to correct for this.
uni.ij[, 2] <- uni.ij[, 2] +
rep.int((colStart:colEnd) - 1, times = unlist(nperuni, use.names = FALSE)) *
nX
# i = row indices, j = column indices
i <- uni.ij[, 1]
j <- uni.ij[, 2]
# Now that we have created a sparse matrix representation of
# the univariate terms, we can loop to create higher order terms.
if (d > 1) {
for (k in 2:d) {
# Matrix of all d choose k combinations.
combos <- utils::combn(d, k)
if (verbose) cat("Making", ncol(combos), "basis functions of dimension", k, "\n")
# This is a running count of where columns of the design matrix
# start and end. Each time we loop, we adjust the starting and
# ending point of the column indices.
colStart <- colEnd + 1L
colEnd <- (colStart - 1L) + ncol(combos)
# !!! This is the primary cause of execution time and memory usage. !!!
# Here we loop over the various combinations of variables to create
# higher order indicator basis functions. In each iteration a is going
# to be of length k and uni[a] will be a list of the sparse matrix representation
# of the univariate basis functions based on X[,a]. Recall, that this means
# each component of uni[a] will contain a 2 column matrix with row and column
# indices for non-zero observations. The sparse matrix representation of the
# columns of the higher order basis function will be the intersection of the column
# indices in uni[a]. getIntersect is an ugly function that obtains the intersection
# of these indices.
j.list <- plyr::alply(combos, 2L, function(a) {
getIntersect(uni[a])
})
# as above we need to now compute the row indices corresponding to the
# observations with non-zero higher order interaction terms.
# list of length d choose k, each entry containing
# n indices of rows corresponding to subjects
i.list <- plyr::llply(j.list, function(x) {
rep(as.numeric(names(x)), unlist(lapply(x, length), use.names = FALSE))
})
# number of 1's for each combination
nper <- lapply(i.list, length)
# unlist column numbers
j.list <- lapply(j.list, unlist, use.names = FALSE)
# unlist rows and columns
thisi <- unlist(i.list, use.names = FALSE)
thisj <- unlist(j.list, use.names = FALSE)
# fix up the column number
thisj <- thisj +
rep.int((colStart:colEnd) - 1, times = unlist(nper, use.names =
FALSE)) * nX
# Put it together
# CK: this is dynamic memory allocation - pre-allocating would be much better if possible.
# Can we determine what the size will be in advance, or no?
# DB: I don't think we could pre-determine exact size, but might be able to
# determine an upper bound on the size. However, this doesn't seem to be exactly
# straightforward; it might involve some hard combinatorics.
i <- c(i, thisi)
j <- c(j, thisj)
}
}
# make the sparseMatrix
grbg <-
Matrix::sparseMatrix(
i = i[order(i)],
j = j[order(i)],
x = 1,
dims = c(n, nX * (2 ^ d - 1))
)
return(grbg)
}
#' myIntersect
#'
#' Helper function for higher order interaction basis functions.
#'
#' @param ... Arguments passed to intersect
#'
myIntersect <- function(...) {
Reduce(intersect, list(...))
}
#' getIntersect
#'
#' Heper function for higher order interaction basis functions.
#'
#' @param ... Arguments passed to \code{lapply}
getIntersect <- function(...) {
# make a list of column indices split by row index
tmp <- lapply(..., function(b) {
split(b[, 2], b[, 1])
})
tmpNames <- lapply(tmp, function(l) {
as.numeric(names(l))
})
overlap <- Reduce(intersect, tmpNames)
# indices of tmp that overlap
newtmp <- lapply(tmp, function(b) {
b[paste(overlap)]
})
# get intersection
out <- eval(parse(text = paste0(
paste0("mapply(myIntersect,"),
paste0("newtmp[[", 1:length(tmp), "]]", collapse = ","),
",SIMPLIFY=FALSE)"
)))
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.