# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# The bigMap Package for R.
# Copyright (c) 2018, Joan Garriga <jgarriga@ceab.csic.es>, Frederic Bartumeus <fbartu@ceab.csic.es> (Blanes Centre for Advanced Studies, CEAB-CSIC).
# bigMap is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
# bigMap is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses.
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Process raw input data
# NOTE:
# The maximum object size in R (< 3.0.0) is 2^31 elements, which bounds
# the input matrix size (max. rows * max. columns) as follows:
# 10000 * 214748
# 100000 * 21474
# 1M * 2147
# 10M * 214
# DEPRECATED!!!
# Any data pre-processing is left to the user
# Wrap raw input data into the list structure (inp.data, whiten, input.dim).
#
# Arguments:
#   raw.data    : input data. A character object is returned untouched; a
#                 big.matrix is returned untouched when whiten == 0; anything
#                 else is column-subset and converted with as.big.matrix().
#   whiten      : pre-processing mode. 0 = none; any other value is forwarded
#                 to data.get().
#   input.dim   : number of leading columns (or components) to keep. When NULL
#                 it defaults to 1 for distance input and to ncol(raw.data)
#                 otherwise.
#   is.distance : TRUE if raw.data is a distance matrix (here it only affects
#                 the default value of input.dim).
#   is.sparse   : not used by this function; kept for interface compatibility.
#   quiet       : if FALSE, a progress line is printed.
#
# Value:
#   list(inp.data, whiten, input.dim), or whatever data.get() returns when
#   whiten != 0.
bd_.data <- function(raw.data, whiten = 4, input.dim = NULL, is.distance = FALSE, is.sparse = FALSE, quiet = TRUE)
{
  if (!quiet) cat('+++ processing data \n')

  if (is.null(input.dim)) {
    input.dim <- if (is.distance) 1 else ncol(raw.data)
  }

  # character input is passed through as is, with no whitening
  if (inherits(raw.data, 'character')) {
    return(list(inp.data = raw.data, whiten = 0, input.dim = input.dim))
  }

  # any requested pre-processing is delegated
  if (whiten != 0) {
    return(data.get(raw.data, whiten, input.dim))
  }

  # already a big.matrix: nothing to convert
  if (inherits(raw.data, 'big.matrix')) {
    return(list(inp.data = raw.data, whiten = 0, input.dim = input.dim))
  }

  # drop = FALSE keeps a 2-d object even when a single column is selected,
  # so as.big.matrix() never receives a bare vector
  kept <- raw.data[, seq_len(input.dim), drop = FALSE]
  list(inp.data = as.big.matrix(kept, type = 'double'), whiten = 0, input.dim = input.dim)
}
# -----------------------------------------------------------------------------
# +++ Preprocessing of input-data
# -----------------------------------------------------------------------------
# Pre-process raw.data according to the whiten mode.
#
# Arguments:
#   raw.data  : 2-d data object supporting raw.data[ , ] extraction
#               (observations in rows, features in columns).
#   whiten    : 1 = centering; 2 = centering & scaling;
#               3 = PCA; 4 = PCA + whitening.
#   input.dim : number of principal components kept for whiten 3/4. It is not
#               applied for whiten 1/2 (all columns are kept) but is still
#               returned in the output list.
#
# Value:
#   list(inp.data = <big.matrix>, whiten, input.dim). For whiten == 2, if the
#   scaling yields NA/NaN values a message is emitted and NULL is returned
#   invisibly.
data.get <- function(raw.data, whiten, input.dim)
{
  # filter out all irrelevant features
  # (makes sense in datasets like the mnist.optical.digits where some features might have zeros for all observations)
  # X <- X[, which(apply(X, 2, sum) != 0)]
  # input.dim <- min(input.dim, ncol(X))
  if (whiten == 1) {
    # centering
    X <- scale(raw.data[ , ], center = TRUE, scale = FALSE)
  } else if (whiten == 2) {
    # centering & scaling
    X <- scale(raw.data[ , ], center = TRUE, scale = TRUE)
    if (anyNA(X)) {
      # e.g. constant columns have zero standard deviation
      message('+++ Error: scaling return NaNs !!!')
      return(invisible(NULL))
    }
  } else if (whiten == 3 || whiten == 4) {
    # PCA/whitening
    # TODO: consider using bigalgebra::bigPCA()
    # centred data, transposed: features in rows, observations in columns
    Xc <- t(scale(raw.data[ , ], center = TRUE, scale = FALSE))
    # covariance matrix; ncol(Xc) is the number of observations
    V <- tcrossprod(Xc) / (ncol(Xc) - 1)
    # singular value decomposition, keeping the first input.dim left vectors
    s <- svd(V, nu = input.dim, nv = 0)
    # PCA projection (input.dim x features)
    K <- t(s$u)
    if (whiten == 4) {
      # whitening: divide row i of K by sqrt(d[i]). Row-wise division is used
      # instead of diag(v) %*% K because diag() on a length-one vector builds
      # an identity matrix of that size rather than a 1x1 scaling matrix.
      K <- K / sqrt(s$d[seq_len(input.dim)])
    }
    # project and go back to observations in rows
    X <- t(K %*% Xc)
  } else {
    stop('unsupported whiten value: ', whiten, call. = FALSE)
  }
  list(inp.data = as.big.matrix(X, type = 'double'), whiten = whiten, input.dim = input.dim)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.