bigMap: Big Data Mapping

Documented in bdm.knp bdm.knp.plot

# +++ The bigMap Package for R.

# Copyright (c) 2018, Joan Garriga <jgarriga@ceab.csic.es>, Frederic Bartumeus <fbartu@ceab.csic.es> (Blanes Centre for Advanced Studies, CEAB-CSIC).

# bigMap is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

# bigMap is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses.

# +++

#' k-ary Neighborhood Preservation
#'
#' A measure of matching between HD and LD neighborhoods ('Multi-scale similarities in stochastic neighbour embedding: Reducing dimensionality while preserving both local and global structure', Lee et. al 2015).
#'
#' @param data Input data (a matrix, a big.matrix or a .csv file name).
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.ptsne()}.
#'
#' @param k.max Maximum neighborhood size to check. (By default \code{k.max=NULL} neighborhood sizes are checked up to n-1).
#'
#' @param sampling Fraction of data points to check for each neighborhood size. (Default value is \code{sampling=0.9}).
#'
#' @param mpi.cl An MPI (inter-node parallelization) cluster as generated by \code{bdm.mpi.start()}. (By default \code{mpi.cl = NULL} a 'SOCK' (intra-node parallelization) cluster is generated).
#'
#' @param threads The number of parallel threads (according to data size and hardware resources, \code{i.e.} number of cores and available memory. Default value is \code{threads = 4}).
#'
#' @return A copy of the input \var{bdm} instance with new element \var{bdm$knP}.
#'#'
#' @examples
#'
#' # --- load example dataset
#' bdm.example()
#' \dontrun{
#' # --- compte the kNP
#' m <- bdm.knp(ex$data, ex$map, threads = 4)
#' # --- plot the kNP
#' bdm.knp.plot(m)
#' }
#' 

bdm.knp <- function(data, bdm, k.max = NULL, sampling = 0.9, threads = 4, mpi.cl = NULL)
{
	#. start cluster of workers
	cl <- cluster.start(threads, mpi.cl)
	if (is.null(cl)) return(bdm)
	# . input data (if using mpi.cl it might have been already exported)
	if (is.null(mpi.cl) || !is.null(data)) {
		cat('+++ exporting input data \n')
		Xdata.exp(cl, data, bdm$is.distance, bdm$is.sparse, bdm$normalize)
	}
	#. output data
	cat('+++ exporting output data \n')
	Ydata.exp(cl, t(bdm$ptsne$Y[, 1:2]))
	cat('+++ computing k-ary neighbourhood preservation \n')
	nY <- nrow(bdm$ptsne$Y)
	if (is.null(k.max)) k.max <- nY -1
	t <- system.time({
		bdm$kNP <- kNP.get(cl, nY, k.max, sampling)
	})
	cat('+++ linAUC', bdm$kNP$AUC[1], ', logAUC', bdm$kNP$AUC[2], '\n', sep = ' ')
	print(t)
	bdm$t$kNP <- t
	cluster.stop(cl)
	return(bdm)
}

# ------------------------------------------------------------------------------
# +++ knp internal functions
# ------------------------------------------------------------------------------

kNP.get <- function(cl, nY, k.max, sampling)
{
	#. k-ary neighborhoods
	K <- unique(ceiling(2**(seq(1, log2(k.max -4), length.out = 100))))
	# clusterExport(cl, c('K', 'sampling'), envir = environment()) ?? seems to not work ??
	clusterExport(cl, c('k.max', 'sampling'), envir = environment())
	#. k-ary Q
	Q.list <- clusterCall(cl, thread.kNP)
	Q <- sapply(Q.list[2: length(Q.list)], function(chnk.Q) apply(chnk.Q, 2, sum))
	n <- sum(sapply(Q.list[2: length(Q.list)], function(chnk.Q) nrow(chnk.Q)))
	Q <- apply(Q, 1, sum) /(K *n)
	R <- sapply((ceiling((nY -1) *Q) -K), function(q) max(q, 0)) /(nY -1 -K)
	linAUC <- sum(R * (K -c(1, K[1:(length(K) -1)]))) / K[length(K)]
	logAUC <- sum(R /log10(K)) / sum(1 /log10(K))
	return(list(k.max = k.max, sampling = sampling, K = K, Q = Q, R = R, AUC = c(linAUC, logAUC)))
}

thread.kNP <- function()
{
	if (thread.rank != 0) {
		#. k-ary neighborhoods
		K <- unique(ceiling(2**(seq(1, log2(k.max -4), length.out = 100))))
		z_kNP(thread.rank, threads, Xbm@address, Ybm@address, is.distance, is.sparse, K, sampling)
	}
}


#' k-ary Neighborhood Preservation plot
#'
#' Log and linear plots of the k-ary Neighborhood Preservation
#'
#' @param bdm A \var{bdm} data mapping instance, or a list of them to make a comparative plot.
#'
#' @param ppxfrmt Format of \var{ppx} in the legend. If \code{ppxfrmt > 1} then \var{ppx} is shown as a fraction of the data set size and \var{ppxfrmt} is the number of decimal digits. Default value is \code{ppxfrmt = 0} and then \var{ppx} is experessed in absolute value.
#'
#' @examples
#'
#' # --- load example dataset
#' bdm.example()
#' \dontrun{
#' # --- compte the kNP
#' m <- bdm.knp(ex$data, ex$map, threads = 4)
#' # --- plot the kNP
#' bdm.knp.plot(m, ppxfrmt = 0)
#' }
#' 

bdm.knp.plot <- function(bdm, ppxfrmt = 0)
{
	if (is.null(names(bdm))) kNP.plot.list(bdm, ppxfrmt = ppxfrmt)
	else kNP.plot(bdm)
}

kNP.plot <- function(bdm, par.set = T)
{
	if (par.set) {
		parbdm.set(oma = c(1.0, 1.0, 1.0, 1.0), mar = c(3.0, 3.0, 2.0, 1.5))
		layout(matrix(1:2, nrow = 2))
	}
	# plot R vs K
	plot(bdm$kNP$K, bdm$kNP$R, type = 'l', col = 4, xlab = 'K', ylab = 'Rnx(K)', ylim= c(0.0, 1.0))
	# plot R vs log10(K)
	plot(log10(bdm$kNP$K), bdm$kNP$R, type = 'l', col = 4, xlab = 'log10(K)', ylab = 'Rnx(K)', ylim= c(0.0, 1.0))
	# title AUC
	linAUC <- knformat(bdm$kNP$AUC[1], 4)
	logAUC <- knformat(bdm$kNP$AUC[2], 4)
	title(main = paste('linAUC', linAUC, ', logAUC', logAUC, ', ppx', bdm$ppx$ppx), cex.main = 0.8)
	#
	if (par.set) parbdm.def()
}

kNP.plot.list <- function(m.list, ppxfrmt = 1, log.k = T, par.set = T)
{
	if (par.set) {
		parbdm.set(oma = c(3.0, 2.0, 3.0, 1.0))
		layout(matrix(c(1, 2, 3, 3), nrow = 2), widths = c(0.7, 0.3))
	}
	par(mar = c(3.0, 3.0, 1.5, 1.0))
	# palette
	pltt <- pltt.class(length(m.list))
	# plot R vs K
	K <- m.list[[1]]$kNP$K
	plot(K, rep(0, length(K)), type = 'n', xlab = 'K', ylab = 'Rnx(K)', ylim= c(0.0, 1.0))
	nulL <- lapply(seq_along(m.list), function(i) {
		bdm <- m.list[[i]]
		K <- bdm$kNP$K
		points(K, bdm$kNP$R, col = pltt[i], type = 'l')
	})
	# plot R vs log10(K)
	K <- log10(m.list[[1]]$kNP$K)
	plot(K, rep(0, length(K)), type = 'n', xlab = 'log10(K)', ylab = 'Rnx(K)', ylim= c(0.0, 1.0))
	nulL <- lapply(seq_along(m.list), function(i) {
		bdm <- m.list[[i]]
		K <- log10(bdm$kNP$K)
		points(K, bdm$kNP$R, col = pltt[i], type = 'l')
	})
	# plot legend
	par(mar = c(3.0, 0.5, 3.0, 0))
	plot(1, 1, xlab = '', ylab = '', xaxt = "n", yaxt = "n", bty = "n", type = "n")
	# legend text
	pltt <- c('#FFFFFF', pltt)
	legend.txt <- lapply(seq(0, length(m.list)), function(i) {
		if (i == 0) ' linAUC logAUC   ppx. '
		else {
			bdm <- m.list[[i]]
			linAUC <- knformat(bdm$kNP$AUC[1], 4)
			logAUC <- knformat(bdm$kNP$AUC[2], 4)
			if (ppxfrmt == 0) {
				ppx <- format(bdm$ppx$ppx, width = 6)
			} else {
				ppx <- format(round(bdm$ppx$ppx /nrow(bdm$ptsne$Y), ppxfrmt), scientific = T)
			}
			paste(' ', linAUC, '    ', logAUC, '    ', ppx, sep = '')
		}
	})
	legend('left', legend = legend.txt, bty = 'n', pch = 15, cex = 1.0, pt.cex = 1.0, col = pltt)
	#
	if (par.set) parbdm.def()
}

knformat <- function(x, d) paste('.', formatC(round(x *10**d, 0), width=d, flag=0), sep = '')