R/gene-relevance.r
In destiny: Creates diffusion maps

#' @include diffusionmap.r s4-unions.r
NULL

#' Gene relevances for entire data set
#' 
#' The relevance map is cached insided of the \code{\link{DiffusionMap}}.
#' 
#' @param coords           A \code{\link{DiffusionMap}} object or a cells \eqn{\times} dims \code{\link{matrix}}.
#' @param exprs            An cells \eqn{\times} genes \code{\link{matrix}}. Only provide if \code{coords} is no \code{\link{DiffusionMap}}.
#' @param ...              Unused. All parameters to the right of the \code{...} have to be specified by name.
#' @param k                Number of nearest neighbors to use
#' @param dims             Index into columns of \code{coord}
#' @param distance         Distance measure to use for the nearest neighbor search.
#' @param smooth           Smoothing parameters \code{c(window, alpha)} (see \code{\link[smoother]{smth.gaussian}}).
#'                         Alternatively \code{\link{TRUE}} to use the \link[smoother]{smoother} \link[smoother:smth.options]{defaults}
#'                         or \code{\link{FALSE}} to skip smoothing,
#' @param remove_outliers  Remove cells that are only within one other cell's nearest neighbor, as they tend to get large norms.
#' @param pcs              A cell \eqn{\times} \code{n_pcs} matrix of principal components to use for the distances.
#' @param knn_params       A \code{\link{list}} of parameters for \code{\link{find_knn}}.
#' @param weights          Weights for the partial derivatives. A vector of the same length as \code{dims}.
#' @param verbose          If TRUE, log additional info to the console
#' 
#' @return A \code{GeneRelevance} object:
#' 
#' @slot coords         A cells \eqn{\times} dims \code{\link{matrix}} or \code{\link[Matrix:sparseMatrix-class]{sparseMatrix}}
#'                      of coordinates (e.g. diffusion components), reduced to the dimensions passed as \code{dims}
#' @slot exprs          A cells \eqn{\times} genes matrix of expressions
#' @slot partials       Array of partial derivatives wrt to considered dimensions in reduced space
#'                      (genes \eqn{\times} cells \eqn{\times} dimensions)
#' @slot partials_norm  Matrix with norm of aforementioned derivatives. (n\_genes \eqn{\times} cells)
#' @slot nn_index       Matrix of k nearest neighbor indices. (cells \eqn{\times} k)
#' @slot dims           Column index for plotted dimensions. Can \code{\link{character}}, \code{\link{numeric}} or \code{\link{logical}}
#' @slot distance       Distance measure used in the nearest neighbor search. See \code{\link{find_knn}}
#' @slot smooth_window  Smoothing window used (see \code{\link[smoother]{smth.gaussian}})
#' @slot smooth_alpha   Smoothing kernel width used (see \code{\link[smoother]{smth.gaussian}})
#' 
#' @seealso \link{Gene Relevance methods}, \link{Gene Relevance plotting}: \code{plot_differential_map}/\code{plot_gene_relevance}
#' 
#' @examples
#' data(guo_norm)
#' dm <- DiffusionMap(guo_norm)
#' gr <- gene_relevance(dm)
#' 
#' m <- t(Biobase::exprs(guo_norm))
#' gr_pca <- gene_relevance(prcomp(m)$x, m)
#' # now plot them!
#' 
#' @rdname Gene-Relevance
#' @export
setClass('GeneRelevance', slots = c(
	coords = 'matrix',
	exprs = 'dMatrixOrMatrix',
	partials = 'array',
	partials_norm = 'matrix',
	nn_index = 'matrix',  # k = ncol(nn_index)
	dims = 'ColIndex',
	distance = 'character',
	smooth_window = 'numeric',
	smooth_alpha = 'numeric'))

#' @rdname Gene-Relevance
#' @export
setGeneric('gene_relevance', function(
	coords, exprs, ...,
	k = 20L, dims = 1:2, distance = NULL, smooth = TRUE, remove_outliers = FALSE, verbose = FALSE
) standardGeneric('gene_relevance'))

#' @importFrom Biobase updateObject
#' @rdname Gene-Relevance
#' @export
setMethod('gene_relevance', c('DiffusionMap', 'missing'), function(
	coords, exprs, ..., k,
	dims, distance, smooth, remove_outliers, verbose
) {
	dm <- updateObject(coords)
	relevance_map <- updateObject(dm@data_env$relevance_map)
	smooth <- get_smoothing(smooth)
	if (is.null(relevance_map) ||
		ncol(relevance_map@nn_index) != k ||
		!identical(relevance_map@dims, dims) ||
		!identical(relevance_map@smooth_window, smooth[[1L]]) ||
		!identical(relevance_map@smooth_alpha,  smooth[[2L]])
	) {
		coords <- eigenvectors(dm)
		if (dm@distance == 'custom') stop('Gene relevance cannot be computed from a diffusion map with a custom distance matrix.')
		exprs <- dataset_extract_doublematrix(dataset(dm))
		pcs <- get_pca(exprs, dataset(dm), dm@n_pcs, verbose)
		weights <- eigenvalues(dm)[dims]
		if (is.null(distance)) distance <- dm@distance
		else if (!identical(distance, dm@distance)) stop('the specified distance ', distance,' is not the same as the one used for the diffusion map: ', dm@distance)
		relevance_map <- gene_relevance(
			coords, exprs, ...,
			k = k, dims = dims, distance = distance, smooth = smooth, remove_outliers = remove_outliers, verbose = verbose,
			pcs = pcs, knn_params = dm@knn_params, weights = weights
		)
		dm@data_env$relevance_map <- relevance_map
	} else chkDots(...)
	relevance_map
})

#' @importFrom methods is
#' @importFrom Biobase rowMedians
#' @importFrom Matrix nnzero
#' @rdname Gene-Relevance
#' @export
setMethod('gene_relevance', c('matrix', 'dMatrixOrMatrix'), function(
	coords, exprs, ...,
	pcs = NULL, knn_params = list(), weights = 1,
	k, dims, distance, smooth, remove_outliers, verbose
) {
	chkDots(...)
	distance <- match.arg(distance, c('euclidean', 'cosine', 'rankcor', 'l2'))
	coords_used <- coords[, dims, drop = FALSE]
	n_dims <- ncol(coords_used)
	if (length(weights) == 1L) weights <- rep(weights, n_dims)
	smooth <- get_smoothing(smooth)
	
	if (is.null(colnames(exprs))) stop('The expression matrix columns need to be named but are NULL')
	if (n_dims != length(weights)) stop(n_dims, ' dimensions, but ', length(weights), ' weights were provided')
	
	nn_index <- do.call(find_knn, c(list(if (is.null(pcs)) exprs else pcs, k, distance = distance), knn_params))$index
	
	k <- ncol(nn_index)
	n_cells <- nrow(coords_used)
	n_genes <- ncol(exprs)
	partials <- array(
		NA,
		dim = c(n_cells, n_genes, n_dims),
		dimnames = list(rownames(exprs), colnames(exprs), if (is.character(dims)) dims else colnames(coords_used)))
	
	# a very small value to subtract from the differential
	values <- if (is(exprs, 'Matrix')) exprs@x else exprs
	small <- min(values[values != 0]) / (length(exprs) - nnzero(exprs))
	if (verbose) cat('Calculating expression differential\n')
	gene_differential <- function(expr_gene) {
		# Compute change in expression
		# Do not compute if reference is zero, could be drop-out
		expr_masked <- expr_gene
		expr_masked[expr_masked == 0] <- small
		differential_expr <- apply(nn_index, 2, function(nn) expr_gene[nn] - expr_masked)
		differential_expr[differential_expr == 0] <- NA  # Cannot evaluate partial
		#stopifnot(identical(dim(differential_expr), c(n_cells, k)))
		differential_expr
	}
	differential_exprs <- apply(exprs, 2L, gene_differential)
	#stopifnot(identical(dim(differential_exprs), c(n_cells * k, n_genes)))
	# apply only handles returning vectors, so we have to reshape the return value
	dim(differential_exprs) <- c(n_cells, k, n_genes)
	dimnames(differential_exprs)[[3L]] <- if (length(colnames(exprs)) > 1L) colnames(exprs) else list(colnames(exprs))
	#stopifnot(identical(gene_differential(exprs[, 1L]), differential_exprs[, , 1L]))
	
	for (d in seq_len(n_dims)) {
		# Compute partial derivatives in direction of current dimension
		
		if (verbose) cat('Calculating partial derivatives of dimension ', d, '/', n_dims, '\n')
		# We could optionaly add normalization by max(coords_used[, d]) - min(coords_used[, d])
		differential_coord <- apply(nn_index, 2L, function(nn) coords_used[nn, d] - coords_used[, d])
		
		partials_unweighted <- apply(differential_exprs, 3L, function(grad_gene_exprs) {
			# Compute median of difference quotients to NN
			difference_quotients <- grad_gene_exprs / differential_coord
			# Only compute differential if at least two observations are present!
			stable_cells <- rowSums(!is.na(difference_quotients)) >= 2L
			ifelse(stable_cells, rowMedians(difference_quotients, na.rm = TRUE), NA)
		})
		colnames(partials_unweighted) <- colnames(exprs)
		
		if (!any(is.na(smooth))) {
			order_coor <- order(coords_used[, d])
			order_orig <- order(order_coor)
			# Smooth the partials for each gene across all cells
			partials_unweighted <- apply(partials_unweighted, 2L, function(partials_gene) {
				ordered <- partials_gene[order_coor]
				smoothed <- smth.gaussian(ordered, smooth[[1L]], smooth[[2L]], tails = TRUE)
				smoothed[order_orig]
			})
			colnames(partials_unweighted) <- colnames(exprs)
		}
		
		partials[, , d] <- weights[[d]] * partials_unweighted
	}
	
	# Compute norm over partial derivates: Frobenius
	partials_norm <- apply(partials, c(1, 2), function(z) sqrt(sum(z^2, na.rm = TRUE)))
	colnames(partials_norm) <- colnames(partials)
	
	# Find outlier cells: Not in NN of more than 1 other cell
	# Remove these as they tend to receive very large norms
	if (remove_outliers) {
		outliers <- sapply(seq_len(n_cells), function(cell) sum(nn_index == cell) > 1)
		partials_norm[, outliers] <- NA
	}
	
	# Prepare output
	rownames(partials_norm) <- rownames(partials)
	colnames(partials_norm) <- colnames(partials)
	new('GeneRelevance',
		coords = coords,
		exprs = exprs,
		partials = partials,
		partials_norm = partials_norm,
		nn_index = nn_index,
		dims = dims,
		smooth_window = smooth[[1L]],
		smooth_alpha  = smooth[[2L]],
		distance = distance)
})

get_smoothing <- function(smooth) {
	if (isTRUE(smooth)) c(getOption('smoother.window'), getOption('smoother.gaussianwindow.alpha'))
	else if (identical(smooth, FALSE)) c(NA_real_, NA_real_)
	else if (!is.numeric(smooth) || length(smooth) != 2L)
		stop('`smooth` needs to be TRUE, FALSE or a numeric c(window, alpha), not', capture.output(str(smooth)))
	else smooth
}