Nothing
#' @importFrom methods as new .hasSlot
#' @importFrom Rcpp evalCpp
#' @importClassesFrom Matrix RsparseMatrix dgRMatrix ngRMatrix
#' @useDynLib readsparse, .registration=TRUE
check.bool <- function(var, name) {
if (NROW(var) != 1)
stop(sprintf("Error: %s must be a single boolean/logical.", name))
if (typeof(var) != "logical")
var <- as.logical(var)
if (is.na(var))
stop(sprintf("%s cannot be NA.", name))
return(var)
}
check.int <- function(var, name) {
if (NROW(var) != 1)
stop(sprintf("Error: %s must be a single non-negative integer.", name))
if (typeof(var) != "integer")
var <- as.integer(var)
if (is.na(var))
stop(sprintf("%s cannot be NA.", name))
if (is.infinite(var))
stop(sprintf("%s cannot be infinite.", name))
if (var < 0L)
stop(sprintf("%s must be non-negative."))
return(var)
}
cast.df <- function(df) {
if (inherits(df, c("tibble", "data.table")))
df <- as.data.frame(df)
if (inherits(df, "data.frame"))
df <- as.matrix(df)
return(df)
}
cast.dense.vec <- function(X) {
if (inherits(X, c("numeric", "integer", "float32"))) {
if (typeof(X) != "double")
X <- as.numeric(X)
X <- matrix(X, nrow=1L)
}
return(X)
}
cast.sparse.vec <- function(X) {
if (inherits(X, "sparseVector")) {
X.csr <- new("dgRMatrix")
X.csr@Dim <- c(1L, X@length)
X.csr@p <- c(0L, length(X@x))
X.csr@j <- X@i - 1L
if (.hasSlot(X, "x")) {
X.csr@x <- as.numeric(X@x)
} else {
X.csr@x <- rep(1., length(X@i))
}
X <- X.csr
}
return(X)
}
check.for.overflow <- function(lst) {
if (length(lst) == 1L) {
if ((lst$err == 1) && ("err" %in% names(lst)))
stop("Number of rows exceeds INT_MAX.")
else if (lst$err == 2)
stop("Number of columns exceeds INT_MAX.")
else if (lst$err == 3)
stop("Number of classes exceeds INT_MAX.")
}
}
process.file.name <- function(fname) {
fname <- path.expand(fname)
if (take_as_utf8())
fname <- enc2utf8(fname)
# else if (Encoding(fname) != "unknown")
# fname <- enc2native(fname)
return(fname)
}
#' @title Read Sparse Matrix from Text File
#' @description Read a labelled sparse CSR matrix in text format as used by libraries
#' such as SVMLight, LibSVM, ThunderSVM, LibFM, xLearn, XGBoost, LightGBM, and more.
#'
#' The format is as follows:
#'
#' \code{<label(s)> <column>:<value> <column>:<value> ...}
#'
#' with one line per observation/row.
#'
#' Example line (row):
#'
#' \code{1 1:1.234 3:20}
#'
#' This line denotes a row with label (target variable) equal to 1, a value
#' for the first column of 1.234, a value of zero for the second column (which is
#' missing), and a value of 20 for the third column.
#'
#' The labels might be decimal (for regression), and each row might contain more
#' than one label (must be integers in this case), separated by commas \bold{without}
#' spaces inbetween - e.g.:
#'
#' \code{1,5,10 1:1.234 3:20}
#'
#' This line indicates a row with labels 1, 5, and 10 (for multi-class classification).
#' If the line has no labels, it should still include a space before the features.
#'
#'
#' The rows might additionally contain a `qid` parameter as used in ranking algorithms,
#' which should always lay inbetween the labels and the features and must be an integer - e.g.:
#'
#' \code{1 qid:2 1:1.234 3:20}
#'
#'
#' The file might optionally contain a header as the first line with metadata
#' (number of rows, number of columns, number of classes). Presence of a header will be
#' automatically detected, and is recommended to include it for speed purposes. Datasets
#' from the extreme classification repository (see references) usually include such a header.
#'
#' Lines might include comments, which start after a `#` character. Lines consisting
#' of only a `#` will be ignored. When reading from a file, such file might have a
#' BOM (information about encoding uses in Windows sytems), which will be automatically
#' skipped.
#'
#' @details Note that this function:\itemize{
#' \item Will not make any checks for negative column indices.
#' \item Has a precision of C type `int` for column indices and integer labels
#' (the maximum value that this type can hold can be checked in `.Machine$integer.max`).
#' \item Will fill missing labels with NAs when passing `multilabel=FALSE`.
#' \item Will fill with zeros (empty values) the lines that are empty (that is,
#' they generate a row in the data), but will ignore (that is, will not generate
#' a row in the data) the lines that start with `#`.
#' }
#'
#' Be aware that the data is represented as a CSR matrix with index pointer of
#' class C `int`, thus the number of rows/columns/non-zero-elements cannot exceed
#' `.Machine$integer.max`.
#'
#' On 64-bit Windows systems, if compiling the library with a compiler other than MinGW
#' or MSVC, it will not be able to read files larger than 2GB. This should not be a concern
#' if installing it from CRAN or from R itself, as the Windows version at the time
#' of writing can only be compiled with MinGW.
#'
#' If the file contains a header, and this header denotes a larger number of columns
#' or of labels than the largest index in the data, the resulting object will have
#' this dimension set according to the header. The third entry in the header (number
#' of classes/labels) will be ignored when passing `multilabel=FALSE`.
#'
#' The function uses different code paths when reading from a file or from a string,
#' and there might be slight differences between the obtained results from them.
#' For example, reading from a file might produce the desired output if the file
#' uses tabs as separators instead of spaces (not supported by most other software
#' and not standard), whereas reading from a string will not.
#' If any such difference is encountered, please submit a bug report in the
#' package's GitHub page.
#' @param file Either a file path from which the data will be read, or a string
#' (`character` variable) containing the text from which the data will be read.
#' In the latter case, must pass `from_string=TRUE`.
#' @param multilabel Whether the input file can have multiple labels per observation.
#' If passing `multilabel=FALSE` and it turns out to have multiple labels, will only
#' take the first one for each row. If the labels are non-integers or have decimal point,
#' the results will be invalid.
#' @param has_qid Whether the input file has `qid` field (used for ranking). If passing
#' `FALSE` and the file does turns out to have `qid`, the features will not be read for any
#' observations.
#' @param integer_labels Whether to output the observation labels as integers.
#' @param index1 Whether the input file uses numeration starting at 1 for the column
#' numbers (and for the label numbers when passing `multilabel=TRUE`). This is usually
#' the case for files downloaded from the repositories in the references. The function
#' will check for whether any of the column indices is zero, and will ignore this
#' option if so (i.e. will assume it is `FALSE`).
#' @param sort_indices Whether to sort the indices of the columns after reading the data.
#' These should already be sorted in the files from the repositories in the references.
#' @param ignore_zeros Whether to avoid adding features which have a value of zero.
#' If the zeros are caused due to numerical rounding in the software that wrote the
#' input file, they can be post-processed by passing `ignore_zeros=FALSE` and then
#' something like `X@x[X@x == 0] = 1e-8`.
#' @param min_cols Minimum number of columns that the output `X` object should have,
#' in case some columns are all missing in the input data.
#' @param min_classes Minimum number of columns that the output `y` object should have,
#' in case some columns are all missing in the input data. Only used when passing
#' `multilabel=TRUE`.
#' @param limit_nrows Maximum number of rows to read from the data. If there are more
#' than this number of rows, it will only read the first 'limit_nrows' rows.
#' If passing zero (the default), there will be no row limit.
#' @param use_altrep Whether to use R's ALTREP system to return C++ vector objects
#' without generating extra data copies. If passing `FALSE`, each piece of data will
#' be copied into a an R-allocated vector and returned as such.
#'
#' Passing `TRUE` is faster and uses less memory (as there are no redundant data copies),
#' but these ALTREP'd objects can potentially result in some functions/methods running
#' slower on them than on R objects (for example, manually sub-setting the vectors in
#' the S4 Matrix classes that are returned can potentially be slower by some microseconds).
#' @param from_string Whether to read the data from a string variable instead of a file.
#' If passing `from_string=TRUE`, then `file` is assumed to be a variable with the
#' data contents on it.
#' @return A list with the following entries:\itemize{
#' \item `X`: the features, as a CSR Matrix from package `Matrix` (class `dgRMatrix`).
#' \item `y`: the labels. If passing `multilabel=FALSE` (the default), will be a vector
#' (class `numeric` when passing `integer_labels=FALSE`, class `integer` when passing
#' `integer_labels=TRUE`), otherwise will be a binary CSR Matrix (class `ngRMatrix`).
#' \item `qid`: the query IDs used for ranking, as an integer vector.
#' This entry will \bold{only} be present when passing `has_qid=TRUE`.
#' }
#' These can be easily transformed to other sparse matrix types through e.g.
#' `X <- as(X, "CsparseMatrix")`.
#' @seealso \link{write.sparse}
#' @export
#' @examples
#' library(Matrix)
#' library(readsparse)
#'
#' ### Example input file
#' "1 2:1.21 5:2.05
#' -1 1:0.45 3:0.001 4:-10" -> coded.matrix
#'
#' r <- read.sparse(coded.matrix, from_string=TRUE)
#' print(r)
#'
#' ### Convert it back to text
#' recoded.matrix <- write.sparse(file=NULL, X=r$X, y=r$y, to_string=TRUE)
#' cat(recoded.matrix)
#'
#' ### Example with real file I/O
#' ## generate a random sparse matrix and labels
#' set.seed(1)
#' X <- rsparsematrix(nrow=5, ncol=10, nnz=8)
#' y <- rnorm(5)
#'
#' ## save into a text file
#' temp_file <- file.path(tempdir(), "matrix.txt")
#' write.sparse(temp_file, X, y, integer_labels=FALSE)
#'
#' ## inspect the text file
#' cat(paste(readLines(temp_file), collapse="\n"))
#'
#' ## read it back
#' r <- read.sparse(temp_file)
#' print(r)
#'
#' ### (Note that columns with all-zeros are discarded,
#' ### this behavior can be avoided with 'add_header=TRUE')
#' @references Datasets in this format can be found here:\itemize{
#' \item LibSVM Data: \url{https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/}
#' \item Extreme Classification Repository: \url{http://manikvarma.org/downloads/XC/XMLRepository.html}
#' }
#'
#' The format is also described at the SVMLight webpage: \url{http://svmlight.joachims.org}.
read.sparse <- function(file, multilabel=FALSE, has_qid=FALSE, integer_labels=FALSE,
index1=TRUE, sort_indices=TRUE, ignore_zeros=TRUE,
min_cols=0L, min_classes=0L, limit_nrows=0L,
use_altrep=TRUE, from_string=FALSE) {
multilabel <- check.bool(multilabel, "multilabel")
has_qid <- check.bool(has_qid, "has_qid")
integer_labels <- check.bool(integer_labels, "integer_labels")
index1 <- check.bool(index1, "index1")
sort_indices <- check.bool(sort_indices, "sort_indices")
ignore_zeros <- check.bool(ignore_zeros, "ignore_zeros")
from_string <- check.bool(from_string, "from_string")
use_altrep <- check.bool(use_altrep, "use_altrep")
min_cols <- check.int(min_cols, "min_cols")
min_classes <- check.int(min_classes, "min_classes")
limit_nrows <- check.int(limit_nrows, "limit_nrows")
if (inherits(file, "connection")) {
file <- paste(readLines(file), collapse="\n")
from_string <- TRUE
}
if (!from_string) {
file <- process.file.name(file)
if (multilabel) {
read_func <- read_multi_label_R
} else {
read_func <- read_single_label_R
}
} else {
if (typeof(file) != "character")
stop("Must pass a character/string variable as input.")
if (multilabel) {
read_func <- read_multi_label_from_str_R
} else {
read_func <- read_single_label_from_str_R
}
}
r <- read_func(
file,
ignore_zeros,
sort_indices,
index1,
!has_qid,
limit_nrows,
use_altrep
)
if (!length(r))
stop("Error: could not read file successfully.")
check.for.overflow(r)
r$ncols <- as.integer(max(c(r$ncols, min_cols)))
r$nclasses <- as.integer(max(c(r$nclasses, min_classes)))
features <- new("dgRMatrix")
features@Dim <- c(r$nrows, r$ncols)
features@p <- r$indptr
features@j <- r$indices
features@x <- r$values
if (multilabel) {
labels <- new("ngRMatrix")
labels@Dim <- c(r$nrows, r$nclasses)
labels@p <- r$indptr_lab
labels@j <- r$indices_lab
} else {
labels <- r$labels
if (integer_labels)
labels <- as.integer(labels)
}
if (!has_qid && length(r$indptr) > 1L &&
r$indptr[1L] == r$indptr[length(r$indptr)]) {
warning("Data has empty features. Perhaps the file has 'qid' field?")
}
if (!has_qid)
return(list(X = features, y = labels))
else
return(list(X = features, y = labels, qid = r$qid))
}
#' @title Write Sparse Matrix in Text Format
#' @description Write a labelled sparse matrix into text format as used by software
#' such as SVMLight, LibSVM, ThunderSVM, LibFM, xLearn, XGBoost, LightGBM, and others - i.e.:
#'
#' \code{<labels(s)> <column:value> <column:value> ...}
#'
#' For more information about the format and usage examples, see \link{read.sparse}.
#'
#' Can write labels for regression, classification (binary, multi-class, and multi-label),
#' and ranking (with `qid`), but note that most software that makes use of this data
#' format supports only regression and binary classification.
#' @details Be aware that writing sparse matrices to text is not a lossless operation
#' - that is, some information might be lost due to numeric precision, and metadata such
#' as row and column names will not be saved. It is recommended to use `saveRDS` or similar
#' for saving data between R sessions, or to use binary formats for passing between
#' different software such as R->Python.
#'
#' The option `ignore_zeros` is implemented heuristically, by comparing
#' `abs(x) >= 10^(-decimal_places)/2`, which might not match exactly with
#' the rounding that is done implicitly in string conversions in the libc/libc++
#' functions - thus there might still be some corner cases of all-zeros written into
#' features if the (absolute) values are very close to the rounding threshold.
#'
#' While R uses C `double` type for numeric values, most of the software that is able to
#' take input data in this format uses `float` type, which has less precision.
#'
#' The function uses different code paths when writing to a file or to a string,
#' and there might be slight differences between the generated texts from them.
#' If any such difference is encountered, please submit a bug report in the
#' package's GitHub page.
#' @param file Output file path into which to write the data.
#' Will be ignored when passing `to_string=TRUE`.
#' @param X Sparse data to write. Can be a sparse matrix from package `Matrix`
#' (classes: `dgRMatrix`, `dgTMatrix`, `dgCMatrix`, `ngRMatrix`, `ngTMatrix`, `ngCMatrix`)
#' or from package `SparseM` (classes: `matrix.csr`, `matrix.coo`, `matrix.csc`),
#' or a dense matrix of all numeric values, passed either as a `matrix` or as a `data.frame`.
#'
#' If `X` is a vector (classes `numeric`, `integer`, `dsparseVector`), will be assumed to
#' be a row vector and will thus write one row only.
#'
#' Note that the data will be casted to `dgRMatrix` in any case.
#' @param y Labels for the data. Can be passed as a vector (`integer` or `numeric`)
#' if each observation has one label, or as a sparse or dense matrix (same format as `X`)
#' if each observation can have more than 1 label. In the latter case, only the non-missing
#' column indices will be written, while the values are ignored.
#' @param qid Secondary label information used for ranking algorithms. Must be an integer vector
#' if passed. Note that not all software supports this.
#' @param integer_labels Whether to write the labels as integers. If passing `FALSE`, they will
#' have a decimal point regardless of whether they are integers or not. If the file is meant
#' to be used for a classification algorithm, one should pass `TRUE` here (the default).
#' For multilabel classification, the labels will always be written as integers.
#' @param index1 Whether the column and label indices (if multi-label) should have numeration
#' starting at 1. Most software assumes this is `TRUE`.
#' @param sort_indices Whether to sort the indices of `X` (and of `y` if multi-label) before
#' writing the data. Note that this will cause in-place modifications if either `X` or `y`
#' are passed as CSR matrices from the `Matrix` package.
#' @param ignore_zeros Whether to ignore (not write) features with a value of zero
#' after rounding to the specified decimal places.
#' @param add_header Whether to add a header with metadata as the first line (number of rows,
#' number of columns, number of classes). If passing `integer_label=FALSE` and `y` is a
#' vector, will write zero as the number of labels. This is not supported by most software.
#' @param decimal_places Number of decimal places to use for numeric values. All values
#' will have exactly this number of places after the decimal point. Be aware that values
#' are rounded and might turn to zeros (will be skipped by default) if they are too small
#' (one can do something like
#' `X@x <- ifelse(X@x >= 0, pmin(X@x, 1e-8), pmax(X@x, -1e-8))` to avoid this).
#' @param append Whether to append text at the end of the file instead of overwriting or
#' creating a new file. Ignored when passing `to_string=TRUE`.
#' @param to_string Whether to write the result into a string (which will be returned
#' from the function) instead of into a file.
#' @return If passing `to_string=FALSE` (the default), will not return anything
#' (`invisible(NULL)`). If passing `to_string=TRUE`, will return a `character`
#' variable with the data contents written into it.
#' @seealso \link{read.sparse}
#' @export
write.sparse <- function(file, X, y, qid=NULL, integer_labels=TRUE,
index1=TRUE, sort_indices=TRUE, ignore_zeros=TRUE,
add_header=FALSE, decimal_places=8L,
append=FALSE, to_string=FALSE) {
if (is.null(X) || is.null(y))
stop("Must pass 'X' and 'y'.")
if (NROW(y) != NROW(X))
stop("'X' and 'y' must have the same number of rows.")
if (!to_string) {
if ((typeof(file) != "character") || NROW(file) == 0L)
stop("'file' must be a character variable containing a file path.")
}
if (NROW(decimal_places) != 1L)
stop("'decimal_places' must be a single integer.")
decimal_places <- as.integer(decimal_places)
if (is.na(decimal_places) || is.infinite(decimal_places) || (decimal_places < 0L))
stop("Invalid 'decimal_places'.")
if (decimal_places > 20L) {
warning("'decimal_places' is greater than 20.")
}
integer_labels <- check.bool(integer_labels, "integer_labels")
index1 <- check.bool(index1, "index1")
sort_indices <- check.bool(sort_indices, "sort_indices")
ignore_zeros <- check.bool(ignore_zeros, "ignore_zeros")
add_header <- check.bool(add_header, "add_header")
X <- cast.df(X)
y <- cast.df(y)
X <- cast.dense.vec(X)
X <- cast.sparse.vec(X)
if (nrow(X) == 1L) {
y <- cast.sparse.vec(y)
} else if (inherits(y, "dsparseVector")) {
y <- as.numeric(y)
}
has_MatrixExtra <- requireNamespace("MatrixExtra", quietly=TRUE)
### Note: this check should be made right here due to potential
### in-place modifications of the data which would render the input unusable
### TODO: this here could benefit from MatrixExtra
if (sort_indices) {
if (has_MatrixExtra) {
if (inherits(X, "RsparseMatrix")) {
X <- MatrixExtra::as.csr.matrix(X)
X <- MatrixExtra::sort_sparse_indices(X, copy=TRUE)
sort_indices <- FALSE
}
} else {
if (inherits(y, "RsparseMatrix") && !inherits(y, "ngRMatrix")) {
y@j <- deepcopy_int(y@j)
}
if (inherits(X, "RsparseMatrix") && !inherits(X, "dgRMatrix")) {
X@j <- deepcopy_int(X@j)
if (inherits(X, "lsparseMatrix")) {
X@x <- deepcopy_log(X@x)
} else if (inherits(X, "dsparseMatrix")) {
X@x <- deepcopy_num(X@x)
}
}
}
}
if (has_MatrixExtra) {
X <- MatrixExtra::as.csr.matrix(X)
} else {
if (!inherits(X, "RsparseMatrix"))
X <- as (X, "RsparseMatrix")
if (!inherits(X, "generalMatrix"))
X <- as(X, "generalMatrix")
if (!inherits(X, "dgRMatrix")) {
### Note: casting from ngRMatrix to dgRMatrix is broken in Matrix==1.3.0
X.csr <- new("dgRMatrix")
X.csr@Dim <- X@Dim
X.csr@p <- X@p
X.csr@j <- X@j
if (.hasSlot(X, "x"))
X.csr@x <- as.numeric(X@x)
else
X.csr@x <- rep(1., length(X@j))
X <- X.csr
}
}
allowed_y_multi <- c("matrix", "matrix.coo", "matrix.csr", "matrix.csc",
"dgRMatrix", "dgCMatrix", "dgTMatrix",
"ngRMatrix", "ngCMatrix", "ngTMatrix",
"lgRMatrix", "lgCMatrix", "lgTMatrix")
allowed_y_single <- c("integer", "numeric", "factor", "float32")
allowed_y <- c(allowed_y_multi, allowed_y_single)
if (!inherits(y, allowed_y))
stop(sprintf("Invalid 'y' - allowed types: %s", paste0(allowed_y, collapse=",")))
if (inherits(y, allowed_y_multi)) {
y_is_multi <- TRUE
if (!inherits(y, "RsparseMatrix")) {
if (has_MatrixExtra) {
y <- MatrixExtra::as.csr.matrix(y, binary=TRUE)
} else {
y <- as(y, "RsparseMatrix")
}
if (!inherits(y, "dgRMatrix"))
y <- as(y, "generalMatrix")
}
} else {
if (inherits(y, "float32") && ncol(y) != 1L)
stop("'float32' objects not supported for multi-label 'y'.")
y_is_multi <- FALSE
if (integer_labels) {
if (typeof(y) != "integer")
y <- as.integer(y)
} else {
if (typeof(y) != "double")
y <- as.numeric(y)
}
}
if (!NROW(qid)) {
qid <- integer()
} else {
if (NROW(qid) != NROW(X))
stop("'X' and 'qid' must have the same number of rows.")
if (typeof(qid) != "integer")
qid <- as.integer(qid)
}
if (integer_labels) {
if (add_header) {
if (y_is_multi) {
n_classes <- ncol(y)
} else if (any(is.infinite(y)) || all(is.na(y))) {
n_classes <- 0L
} else if (any(y < 0)) {
n_classes <- length(unique(y))
} else {
n_classes <- max(y, na.rm=TRUE)
if (0L %in% y)
n_classes <- n_classes + 1L
n_classes <- max(n_classes, 1L)
if (is.na(n_classes) || is.infinite(n_classes))
nclasses <- 0L
if (typeof(n_classes) != "integer")
n_classes <- as.integer(n_classes)
}
} else
n_classes <- 0L
} else {
n_classes <- 0L
}
if (!to_string) {
file <- process.file.name(file)
if (add_header && append && file.exists(file))
warning("Warning: adding header to existing file with 'append=TRUE'.")
if (y_is_multi) {
success <- write_multi_label_R(
file,
X@p,
X@j,
X@x,
y@p,
y@j,
qid,
ncol(X),
ncol(y),
ignore_zeros,
sort_indices,
index1,
add_header,
decimal_places,
append
)
} else {
if (integer_labels) {
success <- write_single_label_integer_R(
file,
X@p,
X@j,
X@x,
y,
qid,
ncol(X),
n_classes,
ignore_zeros,
sort_indices,
index1,
add_header,
decimal_places,
append
)
} else {
n_classes <- 0L
success <- write_single_label_numeric_R(
file,
X@p,
X@j,
X@x,
y,
qid,
ncol(X),
n_classes,
ignore_zeros,
sort_indices,
index1,
add_header,
decimal_places,
append
)
}
}
if (!success)
stop("Error: file write failed.")
return(invisible(NULL))
} else {
if (y_is_multi) {
res <- write_multi_label_to_str_R(
X@p,
X@j,
X@x,
y@p,
y@j,
qid,
ncol(X),
ncol(y),
ignore_zeros,
sort_indices,
index1,
add_header,
decimal_places
)
} else {
if (integer_labels) {
res <- write_single_label_integer_to_str_R(
X@p,
X@j,
X@x,
y,
qid,
ncol(X),
n_classes,
ignore_zeros,
sort_indices,
index1,
add_header,
decimal_places
)
} else {
n_classes <- 0L
res <- write_single_label_numeric_to_str_R(
X@p,
X@j,
X@x,
y,
qid,
ncol(X),
n_classes,
ignore_zeros,
sort_indices,
index1,
add_header,
decimal_places
)
}
}
if (!length(res))
stop("Error: string write failed.")
return(res$str)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.