R/bdm_merge.R
In bigMap: Big Data Mapping

Documented in bdm.merge.s2nr bdm.optk.plot bdm.optk.s2nr

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# The bigMap Package for R.

# Copyright (c) 2018, Joan Garriga <jgarriga@ceab.csic.es>, Frederic Bartumeus <fbartu@ceab.csic.es> (Blanes Centre for Advanced Studies, CEAB-CSIC).

# bigMap is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

# bigMap is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses.
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


#' Find optimal number of clusters based on signal-to-noise-ratio.
#'
#' Performs a recursive merging of clusters based on minimum loss of signal-to-noise-ratio (S2NR). The S2NR is the explained/unexplained variance ratio measured in the high dimensional space based on the given low dimensional clustering. Merging is applied recursively until reaching a configuration of only 2 clusters and the S2NR is measured at each step.
#'
#' @param bdm A clustered \var{bdm} instance (\var{i.e.} all up-stream steps performed: \code{bdm.ptse(), bdm.pakde() and bdm.wtt()}.
#'
#' @param plot.optk Logical value. If TRUE, this function plots the heuristic measure versus the number of clusters (default value is \code{plot.optk = TRUE})
#'
#' @param ret.optk Logical value. For large datasets this computation can take a while and it might be interesting to save it. If TRUE, the function returns a copy of the \var{bdm} instance with the values of S2NR attached as \var{bdm$optk} (default value is \code{ret.optk = FALSE}).
#'
#' @param info Logical value. If TRUE, all merging steps are shown (default value is \code{info = FALSE}).
#'
#' @param layer The \var{bdm$ptsne} layer to be used (default value is \code{layer = 1}).
#'
#' @return None if \code{ret.optk = FALSE}. Else, a copy of the input \var{bdm} instance with new element \var{bdm$optk} (a matrix).
#'
#' @details The logic under this heuristic is that neigbouring clusters in the embedding correspond to close clusters in the high dimensional space, \var{i.e.} it is a merging heuristic based on the spatial distribution of clusters. For each cluster (child cluster) we choose the neighboring cluster with steepest gradient along their common border (father cluster). Thus, we get a set of pairs of clusters (child/father) as potential mergings. Given this set of candidates, the merging is performed recursively choosing, at each step, the pair of child/father clusters that results in a minimum loss of S2NR.
#' A typical situation is that some clusters dominate over all of their neighboring clusters. This clusters have no \var{father}. Thus, once all candidate mergings have been performed we reach a \var{blocked} state where only the dominant clusters remain. This situation identifies a hierarchy level in the clustering. When this situation is reached, the algorithm starts a new merging round, identifying the child/father relations at that level of hierarchy. The process stops when only two clusters remain.
#' Usually, the clustering hierarchy is clearly depicted by singular points in the S2NR function. This is a hint that the low dimensional clustering configuration is an image of a hierarchycal spatial configuration in the high dimensional space. See \code{bdm.optk.plot()}.
#'
#' @examples
#'
#' # --- load mapped dataset
#' bdm.example()
#' # --- compute optimal number of clusters and attach the computation
#' bdm.optk.s2nr(exMap, plot.optk = TRUE, ret.optk = FALSE)

bdm.optk.s2nr <- function(bdm, info = T, plot.optk = T, ret.optk = F, layer = 1)
{
	if (is.null(bdm$wtt[[layer]]$C)) {
		return(message('+++ Error: up-stream step bdm.wtt(layer = ', layer, ') not found !', sep = ''))
	}
	else {
		bdm$optk <- s2nr.optk(bdm, verbose = info, layer = layer)
		if (plot.optk) bdm.optk.plot(bdm)
		if (ret.optk) return(bdm)
	}
}


#' Plots the signal-to-nois-ratio as a function of the number of clusters.
#'
#' The function \code{bdm.optk.sn2r()} computes the S2NR that results from recursively merging clusters and, by deafult, makes a plot of these values. For large datasets this computation can take a while, so we can save this result by setting \code{ret.optk = TRUE}. If this result is saved, we can plot it again at any time using this funcion.
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @return None.
#'
#' @examples
#'
#' bdm.example()
#' exMap <- bdm.optk.s2nr(exMap, ret.optk = TRUE)
#' bdm.optk.plot(exMap)

bdm.optk.plot <- function(bdm)
{
	if (is.null(bdm$optk))
		return(message('+++ Error: up-stream step bdm.optk() is not computed ! \n'))

	optk <- bdm$optk
	s <- length(optk$H) +1
	# gradient function
	dffH <- c(diff(optk$H), 0)

	parbdm.set(oma = c(1,1,1,1), mar = c(3,3,2,0.5))
	layout(matrix(seq(2), c(2, 1)))

	plot(optk$H, xlab = '#clusters', ylab = 'S2NR', xaxt = 'n', type = 'l', col = '4')
	axis(1, at = seq(0, s, 10), labels = seq(s, 0, -10) +1, cex.axis = 0.6)
	grid()

	if (!is.null(optk$lvls))
	{
		L <- s -optk$lvls +1
		title('S2NR', adj = 0, cex = 0.7)
		abline(v = L, lwd = 2.0, col = "#BBBBBB")
		if (length(optk$lvls) < 4) {
			axis(3, at = L, labels = optk$lvls, cex.axis = 0.6, tick = FALSE)
		} else {
			sel <- seq(1, length(optk$lvls), by = 2)
			axis(3, at = L[sel], labels=optk$lvls[sel], cex.axis=0.6, tick=FALSE)
			sel <- seq(2, length(optk$lvls), by=2)
			axis(3, line=0.5, at=L[sel], labels=optk$lvls[sel], cex.axis=0.6, tick=FALSE)
		}
	}

	# if (!is.null(optk$tRate))
	# {
	# 	title('S2NR by transition.rate optimization', adj=0, cex=0.7)
	# 	points(optk$tRate, type='l', col='6')
	# 	legend("bottomleft", legend=c('S2NR', 'tRate'), col=c(4,6), cex=0.8, lwd=3, text.font=1, bty='n')
	# }

	plot(dffH, xlab = '#clusters', ylab = 'd(S2NR)', xaxt = 'n', type = 'l', col = '2')
	title('S2NR loss gradient', adj = 0, cex = 0.7)
	axis(1, at = seq(0, s, 10), labels = seq(s, 0, -10) +1, cex.axis = 0.6)
	grid()

	L <- s -optk$loss +1
	abline(v = L, lwd=2.0, col="#BBBBBB")
	if (length(optk$loss) < 4) {
		axis(3, at = L, labels = optk$loss, cex.axis = 0.6, tick = FALSE)
	} else{
		sel <- seq(1, length(optk$loss), by = 2)
		axis(3, at=L[sel], labels = optk$loss[sel], cex.axis = 0.6, tick = FALSE)
		sel <- seq(2, length(optk$loss), by = 2)
		axis(3, line = 0.5, at = L[sel], labels = optk$loss[sel], cex.axis = 0.6, tick = FALSE)
	}

	parbdm.def()
}

#' Merging of clusters based on signal-to-noise-ratio.
#'
#' Performs a recursive merging of clusters based on minimum loss of signal-to-noise-ratio (S2NR) until reaching the desired number of clusters. The S2NR is the explained/unexplained variance ratio measured in the high dimensional space based on the given low dimensional clustering.
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param k The number of desired clusters. The clustering will be recursively merged until reaching this number of clusters (default value is \code{k = 10}). By setting \code{k < 0} we can specify the number of clusters that we are willing to merge.
#'
#' @param plot.merge Logical value. If TRUE, the merged clustering is plotted (default value is \code{plot.merge = TRUE})
#'
#' @param ret.merge Logical value. If TRUE, the function returns a copy of the input \var{bdm} instance with the merged clustering attached as \var{bdm$merge} (default value is \code{ret.merge = FALSE})
#'
#' @param info Logical value. If TRUE, all merging steps are shown (default value is \code{info = FALSE}).
#'
#' @param layer The \var{bdm$ptsne} layer to be used (default value is \code{layer = 1}).
#'
#' @param ... If \var{plot.merge} is TRUE, you can set the \code{bdm.wtt.plot()} parameters to control the plot.
#'
#' @return None if \code{ret.merge = FALSE}. Else, a copy of the input \var{bdm} instance with new element \var{bdm$merge}.
#'
#' @details See details in \code{bdm.optk.s2nr()}.
#'
#' @examples
#'
#' bdm.example()
#' exMap.labels <- bdm.labels(exMap)

bdm.merge.s2nr <- function(bdm, k = 10, plot.merge = T, ret.merge = F, info = T, layer = 1, ...){
	bdm$merge <- s2nr.merge(bdm, k = k, verbose = info, layer = layer)
	if (length(bdm$merge$steps) == 0)
		return(message('+++ Merging error !!'))
	# show clustering after merge
	if (plot.merge) bdm.wtt.plot(bdm, layer = layer, ...)
	# return new hdd
	if (ret.merge) return(bdm)
}


# -----------------------------------------------------------------------------
# +++ Merge auxiliary functions
# -----------------------------------------------------------------------------

merge.labels <- function(Y, C, G){
	D2c <- grid_D2cell(Y, G) +1
	#g.size <- G[1, 1]
	lbls <- C[(D2c[, 2] - 1) *G[1, 1] +D2c[, 1]]
	return(lbls)
}