R/bdm_dMap.R
In bigMap: Big Data Mapping

Documented in bdm.dMap bdm.dMap.plot

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# The bigMap Package for R.

# Copyright (c) 2018, Joan Garriga <jgarriga@ceab.csic.es>, Frederic Bartumeus <fbartu@ceab.csic.es> (Blanes Centre for Advanced Studies, CEAB-CSIC).

# bigMap is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

# bigMap is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses.
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#' Class density maps
#'
#' Compute the class density maps of a set of classes on the embedding grid. This function returns a fuzzy mapping of the set of classes on the grid cells.  The classes can be whatever set of classes of interest and must be given as a vector of point-wise discrete labels (either numeric, string or factor).
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param threads The number of parallel threads (in principle only limited by hardware resources, \code{i.e.} number of cores and available memory)
#'
#' @param type The type of cluster: 'SOCK' (default) for intra-node parallelization, 'MPI' for inter-node parallelization (\code{message passing interface} parallel environment).
#'
#' @param data A vector of discret covariates or class labels. The covariate values can be of any factorizable type. By default (\code{data=NULL}) the function computes the density maps based on the clustering labels (i.e. equivalent to \code{data=bdm.labels(bdm)})
#'
#' @param layer The number of the t-SNE layer (1 by default).
#'
#' @return A copy of the input \var{bdm} instance with element \var{$dMap}, a matrix with a soft clustering of the grid cells.
#'
#' @details \code{bdm.dMap()} computes the join distribution \eqn{P(V=v_{i},C=c_{j})} where \eqn{V={v_{1},\dots,v_{l}}} is the discrete covariate and \eqn{C={c_{1},\dots, c_{g}}} are the grid cells of the paKDE raster. That is, this function recomputes the paKDE but keeping track of the covariate (or class) label of each data-point. This results in a fuzzy distribution of the covariate (class) at each cell.
#'
#' Usually, figuring out the join distribution \eqn{P(V=v_{i},C=c_{j})} entails an intensive computation. Thus \code{bdm.dMap()} performs the computation and stores the result in a dedicated element named \var{$dMap}. Afterwards the class density maps can be visualized with the \code{bdm.dMap.plot()} function.
#'
#' @examples
#'
#' # --- load example dataset
#' bdm.example()
#' \dontrun{
#' exMap <- bdm.dMap(exMap, threads = 4)
#' }

bdm.dMap <- function(bdm, threads = 2, type = 'SOCK', data = NULL, layer = 1){

	if (is.null(bdm$wtt[[layer]])){
		return(message('+++ Error: up-stream step WTT not computed.'))
	}

	# +++ start cluster of workers
	cl <- cluster.start(threads, type)
	if (is.null(cl)) return(bdm)

	# get class labels
	if (is.null(data) && !is.null(bdm$wtt[[layer]])) {
		# use top-level clustering
		data <- bdm.labels(bdm, layer = layer)
		L <- data
	}
	else {
		# use user supplied labels
		L <- as.numeric(as.factor(data))
	}
	L.names <- levels(as.factor(data))

    # export class labels to workers
    clusterExport(cl, c('L'), envir = environment())

	# t-SNE mapping positions
	l <- c(1, 2) + (layer -1) * 2
	Y <- bdm$ptsne$Y[, l]
    # export tSNE-mapping to workers
    clusterExport(cl, c('Y'), envir = environment())

    # export grid & betas to workers
	pakde <- bdm$pakde[[layer]]
    clusterExport(cl, c('pakde'), envir = environment())

	# compute the density mapping
	dMap.list <- clusterCall(cl, thread.dMap)

	s <- length(unique(L))
	grid.size <- length(pakde$x) * length(pakde$y)
	cell.size <- (pakde$x[2]-pakde$x[1]) * (pakde$y[2]-pakde$y[1])

	bdm$dMap <- matrix(rep(0, grid.size * s), ncol= s)
	colnames(bdm$dMap) <- L.names
	for (i in seq(length(dMap.list))) {
		bdm$dMap <- bdm$dMap + dMap.list[[i]]
	}

	bdm$dMap <- bdm$dMap  * cell.size /pi /nrow(Y)
	cat('+++ cdf ', round(sum(bdm$dMap), 4), '\n', sep='')

	stopCluster(cl)

	return(bdm)
}


# compute mapped densities
thread.dMap <- function()
{
	if (thread.rank == 0)
	{
		return(0)
	}
	else
	{
		# number of classes
		s <- length(unique(L))
		grid.size <- length(pakde$x) * length(pakde$y)
		z <- matrix(rep(0, grid.size * s), ncol= s)

		chnk.brks <- round(seq(1, nrow(Y)+1, length.out=(threads+1)), 0)
		for (i in chnk.brks[thread.rank]:(chnk.brks[thread.rank+1] -1))
		{
			# grid cell distances to sample-point-i
			sqdst2i <- as.numeric(outer((Y[i, 1] - pakde$x)^2, (Y[i, 2] - pakde$y)^2, '+'))
			# grid cell densities by kernel-i
			z[, L[i]] <- z[, L[i]] + pakde$beta[i] * exp(-pakde$beta[i] * sqdst2i)
		}
		return(z)
	}
}


#' Class density maps plot.
#'
#' @param bdm A \var{bdm} instance as generated by \code{bdm.init()}.
#'
#' @param classes A vector with a subset of class names or covariate values. Default value is \code{classes=NULL}. If no classes are specified (default value) all classes are plotted.
#'
#' @param join Logical value. If FALSE (default value), class mapping is based on the class conditional distributions. If TRUE, class mapping is based on the overall classes join distribution.

#' @param class.pltt A colour palette to show class labels in the hard mapping. By default (\code{class.pltt = NULL}) the default palette is used.
#'
#' @param pakde.lvls  The number of levels of the heat-map when plotting class density maps (16 by default).
#'
#' @param pakde.pltt A palette of colours to indicate the levels of the class density maps. The length of the colour palette should be at least the number of levels specified in \var{pakde.lvls}.
#'
#' @param wtt.lwd The width of the watertrack lines (as set in \code{par()}).
#'
#' @param plot.peaks Logical value (TRUE by default). If set to TRUE and the up-stream step \code{bdm$wtt()} is computed the peak of each cluster is depicted.
#'
#' @param labels.cex If \var{plot.peaks} is TRUE, the size of the labels of the clusters (as set in \code{par()}). By default \code{labels.cex=0.0} and the labels of the clusters are not depicted.
#'
#' @param layer The number of the layer from which the class density maps are computed (1 by default).
#'
#' @return None.
#'
#' @details \code{bdm.dMap.plot()} yields a multi-plot layout where the first plot shows the dominating value of the covariate (or dominating class) in each cell, and the rest of the plots show the density map of each covariate value (or class).
#'
#' The join distribution \eqn{P(V=v_{i},C=c_{j})} will be affected by the bias present in the marginal distribution of the covariate. Therefore, the join distribution \eqn{P(V=v_{i},C=c_{j})} is transformed, by default, into a conditional distribution \eqn{P(c_{j}|V=v_{i})} (where the \eqn{c_{j}} are the grid cells of the embedding and V is the covariate (or class)). Thus, the first plot shows a hard classification of grid-cells, (cells are coloured based on the dominating value of the covariate (or dominating class), \emph{i.e}. the \eqn{v_{i}} for which \eqn{P(c_{j}|V=v_{i})} is maximum), and the rest of the plots show the conditional distributions \eqn{P(C=c_{j}|V=v_{i})}. This makes the plots of the different classes not directly comparable but the dominant areas of each class can be more easily identified.
#'
#' However, the same plots can be depicted based on the join distribution by setting \code{join = TRUE}. This makes sense when the bias in the covariate values (or classes) is not significant. In this case the hard clustering shows the real dominance of each covariate value (or class) over the embedding area and the density maps are comparable one to each other (although, individually, they are not real density functions as they do not add up to one).
#'
#' The multi-plot layout can be limited to a subset of the values of the covariate (or subset of classes) specified in parameter \code{classes}.
#'
#' @examples
#'
#' # --- load example dataset
#' bdm.example()
#' \dontrun{
#' exMap <- bdm.dMap(exMap, threads = 4)
#' bdm.dMap.plot(exMap)
#' }

bdm.dMap.plot <- function(bdm, classes = NULL, join = FALSE, class.pltt = NULL, pakde.pltt = NULL, pakde.lvls = 16, wtt.lwd = 1.0, plot.peaks = T, labels.cex = 1.0, layer = 1)
{
    if (is.null(bdm$dMap)) {
        stop('+++ Error: bdm.dMap() not computed ! \n')
    }

    class.names <- sapply(colnames(bdm$dMap), function(name) {
		if (all(strsplit(as.character(name), '')[[1]] %in% as.character(0:9)))
			paste('class:', name, sep = ' ')
		else
			name
	})

	if (is.null(class.pltt)) {
        class.pltt <- pltt.get(ncol(bdm$dMap))
	}
    else {
		if (length(class.pltt) != ncol(bdm$dMap)) {
			stop('+++ class.pltt length must be ', ncol(bdm$dMap), '! \n')
		}
    }
    if (!is.null(classes)) {
        class.cols <- which(colnames(bdm$dMap) %in% classes)
		class.pltt[which(!(colnames(bdm$dMap) %in% classes))] <- "#999999FF"
    }
    else {
        class.cols <- seq(ncol(bdm$dMap))
    }
	# add background colour to class palette
	class.pltt <- c("#999999FF", class.pltt)

	# get pakde colour palette
    if (is.null(pakde.pltt)) {
        pakde.pltt <- pltt.pakde(pakde.lvls)
    }

    parbdm.set(oma = c(1.0, 1.0, 1.0, 1.0), mar = c(1.0, 1.0, 1.5, 1.0))
    layout(layout.get(length(class.cols) +1))

    pakde <- bdm$pakde[[layer]]
    g <- bdm$wtt[[layer]]$grid[1, 1]

    # convert join to class conditional
    if (join) kdMap <- bdm$dMap
    else kdMap <- bdm$dMap %*% diag(1 /colSums(bdm$dMap))

    # plot class hard mapping
    z.min <- seq(min(pakde$z), max(pakde$z), length.out=pakde.lvls+1)[2]
    hardC <- sapply(seq(nrow(kdMap)), function(n) {
        cell <- kdMap[n, ]
        if (sum(cell) > z.min && (any(cell != cell[1]))) which.max(cell)
        else 0
    })
    image(pakde$x, pakde$y, matrix(hardC, nrow=g), col = class.pltt, xaxt = 'n', yaxt = 'n', xlab = '', ylab = '')
    dMap.plot.wtt(bdm, wtt.lwd, plot.peaks, labels.cex, layer)

    # plot class density mapping
    nulL <- sapply(class.cols, function(l) {
        fzzyC <- apply(kdMap, 1, function(cell) {
            ifelse(all(cell == cell[1]), 0, cell[l])
        })
        image(pakde$x, pakde$y, matrix(fzzyC, nrow=g), col = pakde.pltt, xaxt = 'n', yaxt = 'n', xlab = '', ylab = '', main = class.names[l], cex.main = 1.2, col.main = class.pltt[(l+1)])
        dMap.plot.wtt(bdm, wtt.lwd, F, 0.0, layer)
    })

    nullPlots <- max(layout.get(length(class.cols +1))) - length(class.cols) -1
	if (nullPlots > 0){
        nulL <- sapply(seq(nullPlots), function(p) plot.null())
    }

    parbdm.def()
}

# add WTT lines
dMap.plot.wtt <- function(bdm, wtt.lwd, plot.peaks, labels.cex, layer)
{
	if (!is.null(bdm$wtt[[layer]])) {
		pakde <- bdm$pakde[[layer]]
		wtt <- bdm$wtt[[layer]]
		if (!is.null(bdm$merge)){
			plot.wtt(pakde, bdm$merge$C, wtt$grid, 2*wtt.lwd, '#222222')
		}
		if (!is.null(bdm$wtt[[layer]])){
			plot.wtt(pakde, wtt$C, wtt$grid, 0.5*wtt.lwd, '#EEEEEE')
		}
		if (plot.peaks){
			if (!is.null(bdm$merge))
				wtt.peaks(pakde, wtt, bdm$merge$C, labels.cex)
			else
				wtt.peaks(pakde, wtt, wtt$C, labels.cex)
		}
	}
}