R/bdm_main.R

Defines functions bdm.labels bdm.wtt bdm.pakde bdm.ptsne bdm.scp bdm.save bdm.fName bdm.init bdm.example

Documented in bdm.example bdm.fName bdm.init bdm.labels bdm.pakde bdm.ptsne bdm.save bdm.scp bdm.wtt

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# The bigMap Package for R.

# Copyright (c) 2018, Joan Garriga <jgarriga@ceab.csic.es>, Frederic Bartumeus <fbartu@ceab.csic.es> (Blanes Centre for Advanced Studies, CEAB-CSIC).

# bigMap is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

# bigMap is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses.
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#' Example dataset
#'
#' Loads an example of a mapping of a dataset.
#'
#' @return An example \var{bdm} instance named \var{exMap}.
#'
#' @details A \var{bdm} instance is a list with elements: \var{$dSet} a name identifying the dataset (\code{bdm.fName()} use this name to generate a default file name); \var{$data} a matrix with raw data; \var{$lbls} a vector of datapoint labels (in case they are known); \var{$N} the dataset size; \var{$is.distance} a logical value that is set to TRUE when the raw data is a distance matrix. Downstream steps of the mapping protocol will add more elements to the list.
#'
#' This example is based on a small synthetic dataset with \code{n = 5000} observations drawn from a 4-variate Gaussian Mixture Model (GMM) with 16 Gaussian components.
#'
#' @examples
#'
#' # --- load example dataset
#' bdm.example()
#' str(exMap)

bdm.example <- function()
{
	load(paste(paste(system.file('extdata', package = 'bigMap'), '/', sep = ''), 'exMap.RData', sep=''), envir=parent.frame(), verbose=T)
}


#' Create \var{bdm} instance
#'
#' Creates a \var{bdm} instance.
#'
#' @param dSet.name The name given to the input dataset. This name will be used to automatically generate a name to save the output as an \var{.Rdata} file.
#'
#' @param dSet.data A \var{data.frame} or \var{matrix} with raw input-data. The dataset must not have duplicated rows.
#'
#' @param labels If available, labels can be included as a separate vector of length equal to \code{nrow(dSet.data)}. Label values are factorized as \code{as.numeric(as.factor(labels))}.
#'
#' @param is.distance A logical value (FALSE by default). TRUE indicates that the raw data is indeed a distance matrix.
#'
#' @param check.duplicates If set to TRUE (default value) the dataset is checked for duplicated rows. Checking for duplicates in big datasets can take some time. If the dataset is known to have no duplicates disabling this option will save time.
#'
#' @return A \var{bdm} instance. A \var{bdm} instance is initially a list with a few elements to which new elements are added at each step of the mapping protocol.
#'
#' @examples
#'
#' # --- get a matrix with raw-data
#' mydata <- matrix(rnorm(10000, mean = 0, sd = 3),  ncol = 2)
#' mylabels <- apply(mydata, 1, function(row) round(sqrt(sum(row**2)), 0))
#' # --- create a \var{bdm} instance with our raw-data matrix
#' mybdm <- bdm.init('mydataset', mydata, labels = mylabels)
#' str(mybdm)

bdm.init <- function(dSet.name, dSet.data, labels = NULL, is.distance = F, check.duplicates = T)
{
	# check duplicates
	if (check.duplicates && anyDuplicated(dSet.data) > 0){
		return(message('+++ Found duplicated rows in data !! \n'))
	}
	bdm <- list()
	bdm$dSet <- dSet.name
	bdm$data <- as.matrix(dSet.data)
	if (!is.null(labels)){
		bdm$lbls <- as.numeric(as.factor(labels))
	}
	bdm$is.distance <- is.distance
	return(bdm)
}


#' Default \var{bdm} file name
#'
#' Generates a default file name. The default file name is intended for functions \code{bdm.save()} and \code{bdm.scp()} to ease the task of working/organizing multiple runs on the same dataset.
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @details The file name is generated based on \code{bdm$dSet} and main ptSNE parameters (threads, layers, rounds, boost and perplexity). In case that \code{bdm.wtt()} has been performed on any of the layers, the number of clusters in the first not null layer of bdm$wtt is also included.
#'
#' @return A \var{*.RData} file name based on \var{bdm$dSet} and main \var{bdm} parameters.
#'
#' @examples
#'
#' bdm.example()
#' str(exMap$dSet)
#' str(exMap$ptsne)
#' bdm.fName(exMap)

bdm.fName <- function(bdm)
{
	fName <- paste(bdm.mybdm(), bdm$dSet, sep = '')
	if (!is.null(bdm$ptsne))
	{
		z <- paste('_z', bdm$ptsne$threads, sep = '')
		l <- paste('_l', bdm$ptsne$layers, sep = '')
		b <- paste('_b', bdm$ptsne$boost, sep = '')
		r <- paste('_r', bdm$ptsne$rounds, sep = '')
		p <- paste('_p', bdm$Xbeta$ppx, sep = '')
		fName <- paste(fName, z, l, b, r, p, sep = '')
		if (!is.null(bdm$wtt))
		{
			if (!is.null(bdm$merge)) {
				s <- length(unique(bdm$merge$C))
				k <- paste('_m', formatC(s, width = 2, flag = '0'), sep = '')
			} else {
				s <- lapply(bdm$wtt, function(wtt) wtt$s)
				k <- paste('_k', formatC(s[[1]], width = 2, flag = '0'), sep = '')
			}
			fName <- paste(fName, k, sep = '')
		}
	}
	return(paste(fName, '.RData', sep = ''))
}


#' Save \var{bdm} instance
#'
#' Saves a \var{bdm} instance with default path/file names, as given by \code{bdm.mybdm()/bdm.fName(bdm)}. Default file name is generated based on \code{bdm$dSet} and ptSNE main parameters (threads, layers, boost, rounds, perplexity).  The purpose of functions \code{bdm.save()} and \code{bdm.scp()} used with \code{bdm.fName()} is to ease the task of working/organizing multiple runs on the same dataset.
#'
#' @param ... A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @return None
#'
#' @examples
#'
#' # --- get a matrix with raw-data
#' mydata <- cbind(rnorm(10000, mean = 0, sd = 3),  ncol = 2)
#' mylabels <- apply(mydata, 1, function(row) round(sqrt(sum(row**2)), 0))
#' # --- create a \var{bdm} instance with our raw-data matrix
#' mybdm <- bdm.init('mydataset', mydata, labels = mylabels)
#' str(mybdm)
#' # --- save it
#' \dontrun{
#' bdm.save(mybdm)
#' }

bdm.save <- function( ... ){
	bdm <- get(deparse(substitute( ... )), envir = parent.frame())
	fName <- bdm.fName(bdm)
	if (!file.exists(bdm.mybdm())) {
		cat('+++ Error: default path', bdm.mybdm(), 'does not exist. \n')
	}
	else {
		save(..., envir = parent.frame(), file = fName)
		cat('+++ saved to ', fName, '\n', sep='')
	}
}


#' Transfer \var{bdm} instance to a remote machine.
#'
#' Transfers a \var{bdm} instance to a remote machine. By default a file name is generated based on \code{bdm$dSet} and t-SNE main parameters (threads, layers, rounds, perplexity). The purpose of functions \code{bdm.save()} and \code{bdm.scp()} used with \code{bdm.fName()} is to ease the task of working/organizing multiple runs on the same dataset.
#'
#' @param ... A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param dest The name or IP address of a remote machine where to transfer the file of the \var{bdm} instance. By default is send to \var{bdm.local()} environment variable.
#'
#' @return None
#'
#' @examples
#'
#' \dontrun{
#' # --- load example
#' bdm.example()
#' # --- scp to \var{bdm.local()} with default file name
#' bdm.scp(exMap)
#' # --- scp to IP address 'xxx.xxx.0.0' with default file name
#' bdm.scp(exMap, dest = 'xxx.xxx.0.0')
#' }

bdm.scp <- function( ... , dest = NULL){
	bdm <- get(deparse(substitute( ... )), envir = parent.frame())
	if (is.null(dest)){
		dest <- bdm.local()
	}
	if (substr(dest, 1, 7) == 'xxx.xxx'){
		cat('+++ WARNING: bdm.local() not set !! \n')
	}
	else {
		fName <- bdm.fName(bdm)
		dest.fName <- paste(dest, fName, sep = ':')
		bdm.save( ... )
		system(paste('scp', fName, dest.fName))
	}
}


#' Parallelized t-SNE
#'
#' Starts the ptSNE algorithm (first step of the mapping protocol).
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param threads The number of parallel threads (in principle only limited by hardware resources, \code{i.e.} number of cores and available memory)
#'
#' @param type The type of cluster: 'SOCK' (default) for intra-node parallelization, 'MPI' (\code{message passing interface}) for inter-node parallelization.
#'
#' @param layers The number of layers (\code{minimum} 2, \code{maximum} the number of threads).
#'
#' @param rounds The number of rounds (2 by default).
#'
#' @param boost A running time accelerator factor. By default (\code{boost == 1}). See details.
#'
#' @param whiten Preprocessing of raw data. If \code{whiten = 4} (default value) raw data is transformed to principal components (PCA) and whitened afterwards. If \code{whiten = 3} only PCA is performed with NO whitening. If \code{whiten = 2} raw data is only centered and scaled. If \code{whiten = 1} raw data is only centered. If \code{whiten = 0} no preprocessing is performed at all.
#'
#' @param input.dim If raw data is given as (or is transformed to) principal components, \var{input.dim} sets the number of principal components to be used as input dimensions. Otherwise all data columns are used as input dimensions. By default \code{input.dim = ncol(bdm$data)}.
#'
#' @param ppx The value of perplexity to compute similarities (100 by default).
#'
#' @param itr The number of iterations for computing input similarities (100 by default).
#'
#' @param tol The tolerance lower bound for computing input similarities (1e-05 by default).
#'
#' @param alpha The momentum factor (0.5 by default).
#'
#' @param Y.init A \code{nx2} matrix with initial mapping positions. By default (\code{NULL}) will use random initial positions)
#'
#' @param info Progress output information: 1 yields inter-round results for progressive analytics, 0 disables intermediate results. Default value is 1.
#'
#' @return A copy of the input \var{bdm} instance with new element \var{bdm$ptsne} (t-SNE output).
#'
#' @details By default the algorithm is structured in \eqn{\sqrt{n}} epochs of \eqn{\sqrt{z}} iterations each, where \var{n} is the dataset size and \var{z} is the thread-size (\eqn{z=n*layers/threads}). The running time of the algorithm is then determined by \eqn{epochs*iters*t_i+ epochs*t_e} where \var{t_{i}} is the running time of a single iteration and \var{t_{e}} is the inter-epoch running time.
#'
#'The \var{boost} factor is meant to reduce the running time. With \eqn{boost > 1} the algorithm is structured in \eqn{n/boost} epochs with \eqn{z*boost} iterations each. This structure performs the same total number of iterations but arranged into a lower number of epochs, thus decreasing the total running time to \eqn{epochs*iters*t_i + 1/boost*epochs*t_e}. When the number of threads is high, the inter-epoch time can be high, in particular when using 'MPI' parallelization, thus, reducing the number of epochs can result in a significant reduction of the total running time. The counterpart is that increasing the number of iterations per epoch might result in a lack of convergence, thus the \var{boost} factor must be used with caution. To the most of our knowledge using values up to \eqn{boost=2.5} is generally safe.
#'
#' In case of extremely large datasets, we strongly recommend to initialize the \var{bdm} instance with already preprocessed data and use \code{whiten = 0}. Fast principal components approximations can be computed by means of \var{e.g.} \code{flashpcaR} or \code{scater} R packages.
#'
#' @examples
#'
#' # --- load example dataset
#' bdm.example()
#' # --- perform ptSNE
#' \dontrun{
#' exMap <- bdm.ptsne(exMap, threads = 10, layers = 2, rounds = 2, ppx = 200)
#' }
#' # --- plot the Cost function
#' bdm.cost(exMap)
#' # --- plot ptSNE output
#' bdm.ptsne.plot(exMap)

bdm.ptsne <- function(bdm, threads = 3, type = 'SOCK', layers = 2, rounds = 1, boost = 2.0, whiten = 4, input.dim = NULL, ppx=100, itr=100, tol=1e-5, alpha=0.5, Y.init = NULL, info = 1)
{
	# +++ check t-SNE parameters
	if (layers > threads) {
		cat('+++ WARNING: layers set to ', threads, ' !!\n', sep='')
		layers <- threads
	}
	# +++ start cluster of workers
	cl <- cluster.start(threads, type)
	if (is.null(cl)) return(bdm)
	# +++ check if betas are already computed and can be reused
	compute.Betas <- (is.null(bdm$Xbeta) || bdm$Xbeta$ppx != ppx || bdm$Xbeta$itr != itr || bdm$Xbeta$tol != tol || (!is.null(input.dim) && bdm$Xdata$input.dim != input.dim))
	# +++ process/export raw data
	Xdata <- Xdata.get(bdm$data, whiten = whiten, input.dim = input.dim, is.distance = bdm$is.distance)
	if (is.null(Xdata$X)) return(bdm)
	bdm$Xdata <- list(whiten = Xdata$whiten, input.dim = Xdata$input.dim)
	Xdata.exp(cl, Xdata$X, bdm$is.distance)
	# +++ compute/export similarities
	if (compute.Betas){
		bdm$Xbeta <- Xbeta.get(cl, Xdata$X, ppx = ppx, itr = itr, tol = tol, is.distance = bdm$is.distance)
	}
	if (is.null(bdm$Xbeta)) return(bdm)
	Xbeta.exp(cl, bdm$Xbeta$B)
	# +++ perform t-SNE
	bdm$ptsne <- list(threads = threads, layers = layers, rounds = rounds, boost = boost, alpha = alpha)
	bdm <- ptsne.get(bdm, cl, Y.init = Y.init, info = info)
	# +++ stop cluster
	stopCluster(cl)
	return(bdm)
}


#' Perplexity-adaptive kernel density estimation
#'
#' Starts the paKDE algorithm (second step of the mapping protocol).
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param layer The number of the t-SNE layer (1 by default).
#'
#' @param threads The number of parallel threads (in principle only limited by hardware resources, \code{i.e.} number of cores and available memory)
#'
#' @param type The type of cluster: 'SOCK' (default) for intra-node parallelization, 'MPI' (\code{message passing interface}) for inter-node parallelization.
#'
#' @param ppx The value of perplexity to compute similarities in the low-dimensional embedding (100 by default).
#'
#' @param itr The number of iterations for computing input similarities (100 by default).
#'
#' @param tol The tolerance lower bound for computing input similarities (1e-05 by default).
#'
#' @param g The resolution of the density space grid (\eqn{g*g} cells, 200 by default).
#'
#' @param g.exp A numeric factor to avoid border effects. The grid limits will be expanded so as to enclose the density of the kernel of the most extreme embedded datapoints up to \code{g.exp} times \eqn{\sigma}. By default, (\code{g.exp = 3}) the grid limits are expanded so as to enclose the 0.9986 of the probability mass of the most extreme kernels.
#'
#' @details When computing the \var{paKDE} the embedding area is discretized as a grid of size \code{g*g} cells. In order to avoid border effects, the limits of the grid are expanded by default so as to enclose at least the 0.9986 of the cumulative distribution function (\eqn{3 \sigma}) of the kernels of the most extreme mapped points in each direction.
#'
#' The presence of outliers in the embedding can lead to undesired expansion of the grid limits. We can overcome this using lower values of \var{g.exp}. By setting \code{g.exp = 0} the grid limits will be equal to the range of the embedding.
#'
#' The values \var{g.exp = c(1, 2, 3, 4, 5, 6)} enclose cdf values of \var{0.8413, 0.9772, 0.9986, 0.99996, 0.99999, 1.0} respectively.
#'
#' @return A copy of the input \var{bdm} instance with new element \var{bdm$pakde} (paKDE output). \code{bdm$pakde[[layer]]$layer = 'NC'} stands for not computed layers.
#'

#' @examples
#'
#' # --- load mapped dataset
#' bdm.example()
#' # --- run paKDE
#' \dontrun{
#' exMap <- bdm.pakde(exMap, threads = 4, ppx = 200, g = 200, g.exp = 3)
#' }
#' # --- plot paKDE output
#' bdm.pakde.plot(exMap)

bdm.pakde <- function(bdm, layer = 1, threads = 2, type = 'SOCK', ppx = 100, itr = 100, tol = 1e-5, g = 200, g.exp = 3)
{
	# start cluster of workers
	cl <- cluster.start(threads, type)
	if (is.null(cl)) return(bdm)
	# initialize bdm$pakde
	if (is.null(bdm$pakde)) bdm$pakde <- list()
	# compute kde
	l <- c(1, 2) + (layer- 1) *2
	if (!is.null(bdm$ptsne$Y[ , l])) {
		cat('\n')
		cat('+++ paKDE for layer ', layer, '/', bdm$ptsne$layers, ' +++ \n', sep='')
		bdm$pakde[[layer]] <- pakde.get(cl, bdm$ptsne$Y[ , l], ppx, itr, tol, g, g.exp)
	}
	else cat('+++ Error: up-stream step bdm.ptsne(layer = ', layer, ') not found ! \n', sep = '')
	# stop cluster
	stopCluster(cl)
	return(bdm)
}


#' Watertrack transform (WTT)
#'
#' Starts the WTT algorithm (third setp of the mapping protocol).
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param layer The number of the t-SNE layer (1 by default).
#'
#' @return A copy of the input \var{bdm} instance with \var{bdm$wtt} (WTT output). \code{bdm$wtt[[layer]]$layer = 'NC'} stands for not computed layers.
#'
#' @details This function requires the up-stream step \code{bdm.pakde()}.
#'
#' @examples
#'
#' # --- load mapped dataset
#' bdm.example()
#' # --- perform WTT
#' exMap <- bdm.wtt(exMap)
#' # --- plot WTT output
#' bdm.wtt.plot(exMap)

bdm.wtt <- function(bdm, layer = 1)
{
	# initialize bdm$wtt
	if (is.null(bdm$wtt)) bdm$wtt <- list()
	# compute WTT
	if (!is.null(bdm$pakde[[layer]]$z))
	{
		cat('\n')
		cat('+++ WTT for layer ', layer, '/', bdm$ptsne$layers, ' +++ \n', sep='')
		bdm$wtt[[layer]] <- wtt.get(bdm$pakde[[layer]])
	}
	else cat('+++ Error: up-stream step bdm.pakde(layer = ', layer, ') not found ! \n', sep = '')
	return(bdm)
}


#' Get data-point clustering labels.
#'
#' Given that clusters are computed at grid-cell level, this function returns the clustering label for each data-point.
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param layer The number of the t-SNE layer (1 by default).
#'
#' @param merged A logical value. If TRUE (default value) and the \var{bdm} has been merged, the data-point labelling indicate the number of the merged clusters. If \var{merged} is set to FALSE or the \var{bdm} has not been merged the data-point labels correspond to the top-level clustering.
#'
#' @return A vector of data-point clustering labels.
#'
#' @examples
#'
#' bdm.example()
#' exMap.labels <- bdm.labels(exMap)

bdm.labels <- function(bdm, merged = T, layer = 1){
	# At.!!! there is an internal version of this funcion (for simplicity)
	# check merge.labels() in bdm_merge.R if any change is to be made here !!
	if (!is.null(bdm$wtt[[layer]]))
	{
		C <- bdm$wtt[[layer]]$C
		if (merged && !is.null(bdm$merge)) {
			C <- bdm$merge$C
		}
		l <- c(1, 2) + (layer -1) *2
		D2c <- grid_D2cell(bdm$ptsne$Y[ , l], bdm$wtt[[layer]]$grid) +1
		x.size <- bdm$wtt[[layer]]$grid[1, 1]
		lbls <- C[(D2c[, 2] - 1) *x.size +D2c[, 1]]
	}
	else {
		lbls <- rep(1, nrow(bdm$data))
	}
	return(lbls)
}

Try the bigMap package in your browser

Any scripts or data that you put into this service are public.

bigMap documentation built on July 8, 2020, 6:41 p.m.