Nothing
# TODO: remove any ids in columns/rows that don't appear in the triplet form as
# this breaks the CSR construction algorithm.
# Alternate test
# rows <- paste('row',1:10000, sep='.')
# cols <- paste('col',1:10000, sep='.')
# mat <- cbind(sample(rows, 5000), sample(cols, 5000), rnorm(5000))
# write.csv(mat, file='triplet.csv', row.names=FALSE, col.names=FALSE, quote=FALSE)
.fileConnection <- function(file)
{
if ('connection' %in% class(file)) { zz <- file }
else if (length(grep("\\.gz(ip)?$", file)) > 0) { zz <- gzfile(file) }
else { zz <- file(file) }
zz
}
.processHeader <- function(file, row.idx, col.idx)
{
zz <- .fileConnection(file)
flog.debug('Loading ids')
count <- 0
if (! is.null(row.idx)) count <- count + 1
if (! is.null(col.idx)) count <- count + 1
all.ids <- strsplit(readLines(zz, n=count), ',', fixed=TRUE)
idx <- 0
if (! is.null(row.idx))
{
row.ids <- all.ids[[idx]]
row.ids <- row.ids[row.ids != '']
idx <- idx + 1
}
if (! is.null(col.idx))
{
col.ids <- all.ids[[idx]]
col.ids <- col.ids[col.ids != '']
idx <- idx + 1
}
list(row.ids=row.ids, col.ids=col.ids, skip=idx)
}
# About 6.7s to load a 1000 x 1000 sparse matrix with 2500 values, 90MB memory
# About 4.9s to load 10000 x 10000 sparse matrixm 600 MB memory
# rows <- paste('row', 1:10000, sep='.')
# cols <- paste('col', 1:10000, sep='.')
# m <- read.matrix('triplet.csv', row.ids=rows, col.ids=cols)
#
# Compare to slam ~18s, 2.3GB memory
# read.slam <- function(file)
# {
# m <- read.csv(file, header=FALSE, skip=1)
# m[,1] <- sub('row.|col.','', m[,1])
# m[,2] <- sub('row.|col.','', m[,2])
# m1 <- simple_triplet_matrix(m[,1], m[,2], m[,3], nrow=10000, ncol=10000)
# as.matrix(m1)
# }
# system.time(m <- read.slam('triplet.csv'))
#' Read a sparse matrix from a file and return a matrix
#'
#' Reading matrices from files can be time consuming depending on the size of
#' the matrix. \code{read.matrix} implements a fairly efficient routine to
#' read in sparse matrices and return dense matrix counterparts.
#'
#' Matrices that have dimensions on the order of thousands can be slow to load
#' into R. 'read.matrix' provides an efficient implementation for reading sparse
#' matrices in triplet form from a file or other connection. This version
#' removes dependencies from other packages and shows a speed improvement over
#' those methods.
#'
#' The primary benefit of this function is that named rows and columns can be
#' used as opposed to integer indexes, as compared to the \code{slam} package.
#' The other main motivation is that if the memory is available, dense matrix
#' calculations can be faster than their sparse counterparts, not to mention
#' having a wider range of operators available.
#'
#' When header == TRUE, the row names and/or column names are read from the
#' file. The names are expected to be comma separated in a single line.
#'
#' Various methods can be used to construct a sparse matrix representation
#' that is used as the basis for constructing the dense matrix. Currently only
#' the \code{assign_matrix_dense} function is available, which works well for
#' matrices in triplet form.
#'
#'
#' @param file A file or connection to read from
#' @param header Whether header lines exist defining all possible rows an
#' columns. If this is false, then the defined triplet elements will produce
#' the complete set of rows and columns.
#' @param skip The number of rows to skip. This assumes there is a single
#' header line, which is skipped.
#' @param row.ids If header is TRUE, the row number that defines the row.ids
#' If header == FALSE, the row.ids to use for the matrix
#' @param col.ids If header is TRUE, the col number that defines the col.ids
#' If header == FALSE, the col.ids to use for the matrix
#' @param colClasses The classes to use for the columns in the triplet file
#' @param assign.fn The function to use to construct the sparse representation
#' that is then converted to a dense matrix
#' @param filter.fn An optional function used to filter/clean the input data
#' and/or row/column ids. The signature of filter.fn must have arguments for
#' data, row.ids, and col.ids
#' @param ... Additional arguments to pass to the construction portion of the
#' implementation
#' @return A matrix object generated from sparse triplet data
#' @author Brian Lee Yung Rowe
#' @keywords array
#' @examples
#' \dontrun{
#' path <- system.file('sample-data/triplet.csv', package='futile.matrix')
#' m <- read.matrix(path)
#'
#' rows <- paste('row', 1:10000, sep='.')
#' cols <- paste('col', 1:10000, sep='.')
#' n <- read.matrix(path, row.ids=rows, col.ids=cols)
#' }
read.matrix <- function(file, header=FALSE, skip=1,
row.ids=NULL, col.ids=NULL,
colClasses=c('character','character','numeric'),
assign.fn=assign_matrix_dense, filter.fn=NULL, ...)
{
if (header)
{
h <- .processHeader(file, row.ids, col.ids)
row.ids <- h$row.ids
col.ids <- h$col.ids
skip <- h$skip
}
flog.debug('Reading triplet representation of %s', file)
zz <- .fileConnection(file)
pts <- read.csv(zz, colClasses=colClasses, skip=skip, header=FALSE)
colnames(pts) <- c('row.id','col.id','value')
flog.debug('Got raw %s: [%s,%s]', file, nrow(pts), ncol(pts))
if (is.null(row.ids)) row.ids <- unique(pts[,1])
if (is.null(col.ids)) col.ids <- unique(pts[,2])
if (!is.null(filter.fn))
{
flog.info("Applying filter to raw data")
pts <- filter.fn(pts)
row.ids <- filter.fn(row.ids)
flog.debug('Filtered %s: [%s,%s]', file, nrow(pts), ncol(pts))
}
#tryCatch(close(zz), finally=flog.debug("Closed file"))
# This guarantees that the computed ias are monotonically increasing
row.ids <- row.ids[order(row.ids)]
col.ids <- col.ids[order(col.ids)]
pts <- pts[order(pts$row.id,pts$col.id),]
#m <- assignMatrixSparse(pts, row.ids, col.ids, ...)
m <- assign.fn(pts, row.ids, col.ids, ...)
flog.debug('Assigned values to %s', file)
flog.debug('Converting to dense matrix')
m <- as.matrix(m)
flog.debug('Attaching names')
colnames(m) <- toupper(col.ids)
rownames(m) <- toupper(row.ids)
flog.debug('Done with matrix')
m
}
# Creates a sparse matrix in CSR format based on a triplet input
# Note that inputs must all be ordered otherwise this will violate the CSR
# spec during construction
# This is failing on the as.matrix for some reason, so this is alwo going away
# to streamline the package.
#assignMatrixSparse <- function(source, row.ids, col.ids,
# row.block=500000, col.block=500000)
#{
# require(SparseM)
# msg.col <- "Calculating column indexes for %s elements (map size: %s)"
# msg.row <- "Calculating row indexes for %s elements (map size: %s)"
# msg.lookup <- "Looking up block %s/%s [%s:%s] of source"
#
# # Remove records in id lists that are not present in the actual data body.
# # This is necessary to conform to CSR construction rules.
# row.ids <- row.ids[row.ids %in% source$row.id]
# col.ids <- col.ids[col.ids %in% source$col.id]
# flog.debug("Output matrix will be [%s,%s]", length(row.ids),length(col.ids))
#
# n <- length(row.ids)
# m <- length(col.ids)
#
# # This blocking is done for performance reasons. There seems to be some
# # cutoff in vector size in certain operations that causes serious slowdowns.
# block.size <- col.block
# num.pieces <- nrow(source) %/% block.size + 1
# lookup <- function(idx, map, source)
# {
# inf <- block.size * (idx - 1) + 1
# sup <- ifelse(idx == num.pieces, length(source), idx * block.size)
# flog.debug(msg.lookup, idx, num.pieces, inf, sup)
# map[source[inf:sup]]
# }
#
# flog.debug(msg.col, nrow(source), m)
# col.map <- as.integer(1:length(col.ids))
# names(col.map) <- col.ids
# col.idx.list <- apply(array(1:num.pieces), 1, lookup, col.map, source$col.id)
# if (! class(col.idx.list) %in% 'list') col.idx.list <- list(col.idx.list)
# # This is the map of (s)ids to their corresponding column number in the
# # column definition. Values are the index position and the names are the
# # (s)ids. The ordering needs to be consistent with the row.idx
# col.idx <- do.call(c, col.idx.list)
#
# flog.debug(msg.row, nrow(source), n)
# block.size <- row.block
# num.pieces <- nrow(source) %/% block.size + 1
# row.map <- as.integer(1:length(row.ids))
# names(row.map) <- row.ids
# row.idx.list <- apply(array(1:num.pieces), 1, lookup, row.map, source$row.id)
# if (! class(row.idx.list) %in% 'list') row.idx.list <- list(row.idx.list)
# # This is the map of (s)ids to their corresponding row number in the row
# # definition. Values are the index position and the names are the (s)ids.
# # The ordering needs to be consistent with the row.idx
# row.idx <- do.call(c, row.idx.list)
#
# flog.debug("Calculating ias", n)
# idxes <- rep(0, n+1)
# idxes[1] <- 1
# idx <- 0
# prev <- 1
# # The idxes become the ias in the CSR format. For this to work, the row ids
# # need to be ordered otherwise the position values are not monotonically
# # increasing which violates the CSR spec.
# tryCatch(
# for (r in row.idx)
# {
# idx <- idx + 1
# if (is.na(r))
# {
# flog.warn("Skipping bad row match at index %s", idx)
# next
# }
# if (r == prev) next
# if (idx <= 0)
# {
# flog.warn("Bad index value encountered: %s", idx)
# }
# idxes[r] <- idx
# prev <- r
# }, finally=flog.debug("prev=%s, r=%s",prev,r))
# idxes[length(idxes)] <- nrow(source) + 1
#
# flog.debug("Creating sparse matrix with dimensions [%s,%s]", n,m)
# m.ra <- source$value
# m.ja <- as.integer(col.idx)
# m.ia <- as.integer(idxes)
# d <- as.integer(c(n,m))
#
# new('matrix.csr', ra=m.ra, ja=m.ja, ia=m.ia, dimension=d)
#}
# Assign the matrix as though it were dense
# Currently unused
.assignMatrixDenseNaive <- function(source, row.ids, col.ids)
{
num.cols <- length(col.ids)
source.rows <- unique(source$row.id)
fn <- function(row.id, source)
{
row <- rep(0, num.cols)
if (! row.id %in% source.rows) return(row)
items <- source[source$row.id == row.id,]
row[col.ids %in% items$col.id] <- items$value
row
}
flog.debug('Creating raw rows for matrix')
all.rows <- lapply(row.ids, fn, source)
flog.debug('Binding %s rows to create matrix', length(all.rows))
m <- matrix(do.call(rbind, all.rows), nrow=length(all.rows), byrow=TRUE)
rownames(m) <- row.ids
colnames(m) <- col.ids
m
}
# Assign the matrix using the SLAM triplet construction
# This is blowing up the memory, so we aren't going to use this anymore
#assignMatrixTriplet <- function(source, row.ids, col.ids, ...)
#{
# require(slam)
#
# row.idx <- 1:length(row.ids)
# names(row.idx) <- row.ids
# col.idx <- 1:length(col.ids)
# names(col.idx) <- col.ids
#
# i <- row.idx[source$row.id]
# j <- col.idx[source$col.id]
# v <- source$value
# simple_triplet_matrix(i,j,v, nrow=length(row.ids), ncol=length(col.ids), ...)
#}
#' Create a matrix from a triplet representation
#'
#' A somewhat optimized constructor of dense matrices from triplets
#'
#' @param raw data.frame in triplet form where each row represents a triplet
#' @param row.ids The row ids
#' @param col.ids The col ids
#' @return A matrix
assign_matrix_dense <- function(raw, row.ids, col.ids)
{
#u.rows <- unique(raw[,1])
#rnames <- u.rows[order(u.rows)]
rows <- 1:length(row.ids)
names(rows) <- row.ids
#u.cols <- unique(raw[,2])
#rnames <- u.cols[order(u.cols)]
cols <- 1:length(col.ids)
names(cols) <- col.ids
out <- rep(0, length(rows) * length(cols))
for (x in 1:nrow(raw))
{
idx <- (cols[raw[x,2]]-1) * length(rows) + rows[raw[x,1]]
out[idx] <- raw[x,3]
NULL
}
matrix(out, ncol=length(cols), dimnames=list(row.ids, col.ids))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.