R/nnmf.R

#' Non-negative matrix factorization
#'
#' Non-negative matrix factorization (NMF or NNMF) using sequential coordinate-wise descent or multiplicative updates
#'
#' The problem of non-negative matrix factorization is to find \eqn{W, H, W_1, H_1}, such that \cr
#' \deqn{A = W H + W_0 H_1 + W_1 H_0 + \varepsilon = [W, W_0, W_1] [H', H'_1, H'_0]' + \varepsilon}\cr
#' where \eqn{W_0}, \eqn{H_0} are known matrices (NULL in most applications) and \eqn{\varepsilon} is noise.
#' In tumor content deconvolution, \eqn{W_0} can be thought of as a known healthy profile, while \eqn{W}
#' is the desired pure cancer profile. One can also set \eqn{H_0} to a row matrix of 1s, so that \eqn{W_1}
#' can be treated as a common profile among samples. Use \code{init} to specify \eqn{W_0} and \eqn{H_0}.
#'
#' Argument \code{init}, if used, must be a list with entries named 'W', 'H', 'W0', 'W1', 'H1', 'H0'.
#' One can specify only a few of them. Use 'W0' (and its correspondent 'H1') or 'H0' (and its correspondent 'W1')
#' only for known matrices/profiles, as shown in the sketch below.
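#'
#' For instance, a deconvolution-style call might look like the following sketch, where
#' \code{W.healthy} is a hypothetical known profile matrix (the name is illustrative,
#' not part of the package):
#' \preformatted{
#' # W0 = known healthy profile; H0 = row of 1s, so that W1 is a common profile
#' decomp <- nnmf(A, k = 3,
#'     init = list(W0 = W.healthy, H0 = matrix(1, 1, ncol(A))));
#' }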
#'
#' Similarly, argument \code{mask}, if used, must be a list with entries named 'W', 'H', 'W0', 'W1', 'H1', 'H0',
#' each either NULL (not specified) or a logical matrix. If a mask for a matrix is specified, the
#' masked entries are fixed to their initial values if initialized (skipped during iteration), or to 0 if not
#' initialized.
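#'
#' A minimal masking sketch (the mask matrix below is illustrative): entries in the first
#' column of \eqn{W} are fixed, to their initial values if \code{init$W} is given, or to 0
#' otherwise.
#' \preformatted{
#' Wm <- matrix(FALSE, nrow(A), 3);
#' Wm[, 1] <- TRUE;  # fix the first hidden feature
#' decomp <- nnmf(A, k = 3, mask = list(W = Wm));
#' }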
#'
#' To simplify the notation, we denote the right hand side of the above equation as \eqn{W H}.
#' The problem to be solved is\cr
#' \deqn{argmin_{W \ge 0, H \ge 0} L(A, W H) + J(W, \alpha) + J(H', \beta)}\cr
#' where \eqn{L(x, y)} is a loss function, either a square loss
#' \deqn{\frac{1}{2} ||x-y||_2^2}
#' or a Kullback-Leibler divergence
#' \deqn{x \log (x/y) - x + y,}
#'
#' and
#' \deqn{J(X, \alpha) = \alpha_1 J_1(X) + \alpha_2 J_2(X) + \alpha_3 J_3(X),}
#' \deqn{J_1(X) = \frac12 ||X||_F^2 = \frac{1}{2} tr(XX^T),}
#' \deqn{J_2(X) = \sum_{i < j} (X_{\cdot i})^TX_{\cdot j} = \frac12 tr(X(E-I)X^T),}
#' \deqn{J_3(X) = \sum_{i,j} |x_{ij}| = tr(XE).}
#'
#' The former is usually better for symmetric distributions, while
#' the latter is more suitable for skewed distributions, especially for count data, as it can be derived from
#' Poisson distributed observations. The penalty function \eqn{J} is a composition of three types of penalties,
#' which aim to minimize the L2 norm, maximize the angles between hidden features (columns of W and rows of H) and
#' minimize the L1 norm (sparsity). The parameters \eqn{\alpha}, \eqn{\beta} of length 3 indicate the amounts of these penalties.
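#'
#' For example, to add a small L2 shrinkage on \eqn{W} and encourage sparsity in \eqn{H}
#' (the values below are illustrative, not recommendations):
#' \preformatted{
#' decomp <- nnmf(A, k = 3, alpha = c(0.01, 0, 0), beta = c(0, 0, 0.1));
#' }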
#'
#' When \code{method == 'scd'}, a sequential coordinate-wise descent algorithm is used to solve \eqn{W}
#' and \eqn{H} alternately, each a non-negative regression problem. The \code{inner.max.iter} and
#' \code{inner.rel.tol} arguments control the number of iterations for these non-negative regressions.
#' This also applies to \code{method == 'lee'} (the original algorithm iterates through all entries only
#' once per outer iteration), which is usually faster than the original algorithm when \code{loss == 'mse'}.
#' When \code{loss == 'mkl'}, a quadratic approximation to the KL-divergence is used when \code{method == 'scd'}.
#' Generally, in run time, 'scd' is faster than 'lee' and 'mse' is faster than 'mkl'.
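#'
#' The four method/loss combinations can be requested as in this sketch:
#' \preformatted{
#' d1 <- nnmf(A, 3, method = 'scd', loss = 'mse');  # default, typically fastest
#' d2 <- nnmf(A, 3, method = 'scd', loss = 'mkl');  # quadratic approximation to KL
#' d3 <- nnmf(A, 3, method = 'lee', loss = 'mse');
#' d4 <- nnmf(A, 3, method = 'lee', loss = 'mkl');  # original multiplicative update
#' }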
#'
#' @param A              A matrix to be decomposed
#' @param k              An integer of decomposition rank
#' @param alpha          [L2, angle, L1] regularization on W (non-masked entries)
#' @param beta           [L2, angle, L1] regularization on H (non-masked entries)
#' @param method         Decomposition algorithms, either 'scd' for sequential coordinate-wise descent (default)
#'                       or 'lee' for Lee's multiplicative algorithm
#' @param loss           Loss function to use, either 'mse' for mean square error (default) or 'mkl' for mean KL-divergence
#' @param init           A list of initial matrices for W and H. One can also supply known matrices W0, H0, and initialize
#'                       their correspondent matrices H1 and W1. See details
#' @param mask           A list of mask matrices for W, H, H1 (if \code{init$W0} supplied), W1 (if \code{init$H0} supplied), which
#'                       should have the same shapes as W, H, H1 and W1 if specified. If initial matrices are not specified, masked entries
#'                       are fixed to 0. See details
#' @param W.norm         A numeric value 'p' indicating the \eqn{L_p}-norm (can be infinity) used to normalize the outcome \code{W} matrix.
#'                       No normalization is performed if 0 or negative (the default). This argument has no effect on the outcome
#'                       corresponding to known profiles \code{W0, H0}. When set to 1, each column of \code{W} sums up to 1, which can be
#'                       interpreted as a "distribution" or "proportion"
#' @param check.k        Whether to check if \eqn{k \le nm/(n+m)}, where (n,m) = dim(A), or if k is smaller than the column-wise
#'                       and row-wise minimum numbers of complete observations
#' @param max.iter       Maximum iteration of alternating NNLS solutions to H and W
#' @param rel.tol        Stop criterion, which is relative tolerance between two successive iterations, = |e2-e1|/avg(e1, e2)
#' @param n.threads      An integer number of threads/CPUs to use. Default to 1 (no parallel). Specify 0 for all cores
#' @param trace          An integer indicating how frequently the error should be checked. MSE and MKL errors are computed every
#'                       \code{trace} iterations. If 0 or negative, trace is set to a very large number and only the errors of the
#'                       first and last iterations are checked.
#' @param verbose        Either 0/FALSE, 1/TRUE or 2, with 0/FALSE (or any other value) = no tracking, 1/TRUE = progression bar, 2 = print iteration info.
#' @param show.warning   Whether to show warnings when the targeted \code{rel.tol} is not reached
#' @param inner.max.iter Maximum number of iterations passed to each inner W or H matrix updating loop
#' @param inner.rel.tol  Stop criterion for the inner loop, which is relative tolerance passed to inner W or H matrix updating
#'                       i.e., |e2-e1|/avg(e1, e2)
#' @return A list with components
#' 	\itemize{
#' 		\item W              : left matrix, including known W0 and W1 if available, i.e., column stacked as \eqn{[W, W0, W1]}
#' 		\item H              : right matrix, including H1 and known H0 if available, i.e. row stacked as \eqn{[H', H1', H0']'}
#' 		\item mse            : a vector of mean squared errors through iterations
#' 		\item mkl            : a vector of mean KL-divergence through iterations
#' 		\item target.loss    : target for minimization, which is mean KL-divergence (if \code{loss == 'mkl'}) or half of mean squared error
#'                             (if \code{loss == 'mse'}), plus penalties
#' 		\item average.epochs : a vector of average epochs (one epoch = one complete sweep over W and H)
#' 		\item n.iteration    : total number of iterations
#' 		\item run.time       : running time
#' 		\item options        : list of information of input arguments
#' 		\item call           : function call
#' 	}
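#'
#' A short sketch of using the returned components (names as documented above):
#' \preformatted{
#' decomp <- nnmf(A, 3);
#' A.hat <- decomp$W \%*\% decomp$H;  # reconstruction of A
#' tail(decomp$mse, 1);             # final mean squared error
#' }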
#'
#' @references
#'
#' Franc, V. C., Hlavac, V. C., Navara, M. (2005). Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem.
#' Proc. Int'l Conf. Computer Analysis of Images and Patterns. Lecture Notes in Computer Science 3691. p. 407.\cr
#' \cr
#' Lee, Daniel D., and H. Sebastian Seung. 1999. "Learning the Parts of Objects by Non-Negative Matrix Factorization."
#' Nature 401: 788-91.\cr
#' \cr
#' Pascual-Montano, Alberto, J.M. Carazo, Kieko Kochi, Dietrich Lehmann, and Roberto D. Pascual-Marqui. 2006.
#' "Nonsmooth Nonnegative Matrix Factorization (NsNMF)." IEEE Transactions on Pattern Analysis and Machine Intelligence 28 (3): 403-14.\cr
#'
#' @author Eric Xihui Lin, \email{xihuil.silence@@gmail.com}
#' @seealso \code{\link{nnlm}}, \code{\link{predict.nnmf}}
#'
#' @examples
#'
#' # Pattern extraction, meta-gene
#' set.seed(123);
#'
#' data(nsclc, package = 'NNLM')
#' str(nsclc)
#'
#' decomp <- nnmf(nsclc[, 1:80], 3, rel.tol = 1e-5);
#'
#' heatmap(decomp$W, Colv = NA, xlab = 'Meta-gene', ylab = 'Gene', margins = c(2,2),
#' 	labRow = '', labCol = '', scale = 'column', col = cm.colors(100));
#' heatmap(decomp$H, Rowv = NA, ylab = 'Meta-gene', xlab = 'Patient', margins = c(2,2),
#' 	labRow = '', labCol = '', scale = 'row', col = cm.colors(100));
#'
#' # missing value imputation
#' set.seed(123);
#' nsclc2 <- nsclc;
#' index <- sample(length(nsclc2), length(nsclc2)*0.3);
#' nsclc2[index] <- NA;
#'
#' # impute using NMF
#' system.time(nsclc2.nmf <- nnmf(nsclc2, 2));
#' nsclc2.hat.nmf <- with(nsclc2.nmf, W %*% H);
#'
#' mse.mkl(nsclc[index], nsclc2.hat.nmf[index])
#'
#' @export
nnmf <- function(
	A, k = 1L, alpha = rep(0,3), beta = rep(0,3), method = c('scd', 'lee'),
	loss = c('mse', 'mkl'), init = NULL, mask = NULL, W.norm = -1L, check.k = TRUE,
	max.iter = 500L, rel.tol = 1e-4, n.threads = 1L, trace = 100/inner.max.iter,
	verbose = 1L, show.warning = TRUE, inner.max.iter = ifelse('mse' == loss, 50L, 1L),
	inner.rel.tol = 1e-9
	) {
	method <- match.arg(method);
	loss <- match.arg(loss);
	check.matrix(A, input.name = 'A');
	if (!is.double(A))
		storage.mode(A) <- 'double';
	n <- nrow(A);
	m <- ncol(A);

	init.mask <- reformat.input(init, mask, n, m, k);
	k <- init.mask$K;

	alpha <- c(as.double(alpha), rep(0., 3))[1:3];
	beta <- c(as.double(beta), rep(0., 3))[1:3];
	method.code <- get.method.code(method, loss);

	min.k <- min(dim(A));
	A.isNA <- is.na(A);
	A.anyNA <- any(A.isNA); # anyNA() is only available in newer versions of R
	if (A.anyNA) {
		min.k <- min(min.k, ncol(A) - rowSums(A.isNA), nrow(A) - colSums(A.isNA));
		}
	rm(A.isNA);
	if (check.k && k > min.k && all(c(alpha, beta) == 0))
		stop(paste("k larger than", min.k, "is not recommended, unless properly masked or regularized.",
			"Set check.k = FALSE if you want to skip this check."));

	if (n.threads < 0L) n.threads <- 0L; # let openMP decide
	if (is.logical(verbose)) {
		verbose <- as.integer(verbose);
		}
	if (trace <= 0) {
		trace <- 999999L; # only compute error of the 1st and last iteration
		}

	run.time <- system.time(
		out <- c_nnmf(A, as.integer(k),
			init.mask$Wi, init.mask$Hi, init.mask$Wm, init.mask$Hm,
			alpha, beta, as.integer(max.iter), as.double(rel.tol),
			as.integer(n.threads), as.integer(verbose), as.logical(show.warning),
			as.integer(inner.max.iter), as.double(inner.rel.tol), as.integer(method.code),
			as.integer(trace))
		);
	names(out) <- c('W', 'H', 'mse', 'mkl', 'target.loss', 'average.epochs', 'n.iteration');
	out$mse <- as.vector(out$mse);
	out$mkl <- as.vector(out$mkl);
	out$target.loss <- as.vector(out$target.loss);
	out$average.epochs <- as.vector(out$average.epochs);

	# add row/col names back
	colnames(out$W) <- colnames(init.mask$Wi);
	rownames(out$H) <- rownames(init.mask$Hi);
	if (!is.null(rownames(A))) rownames(out$W) <- rownames(A);
	if (!is.null(colnames(A))) colnames(out$H) <- colnames(A);
	rm(init.mask);

	if (W.norm > 0) {
		if (is.finite(W.norm)) {
			W.scale <- apply(out$W, 2, function(x) sum(x^W.norm)^(1./W.norm));
		} else {
			# infinite norm: column-wise maximum (entries of W are non-negative)
			W.scale <- apply(out$W, 2, max);
			}
		# rescale columns of W and rows of H so that W %*% H is unchanged
		out$W <- out$W %*% diag(1./W.scale);
		out$H <- diag(W.scale) %*% out$H;
		}

	out$run.time <- run.time;
	out$options <- list(
		method = method,
		loss = loss,
		alpha = alpha,
		beta = beta,
		init = init,
		mask = mask,
		n.threads = n.threads,
		trace = trace,
		verbose = verbose,
		max.iter = max.iter,
		rel.tol = rel.tol,
		inner.max.iter = inner.max.iter,
		inner.rel.tol = inner.rel.tol
		);
	out$call <- match.call();
	return(structure(out, class = 'nnmf'));
	}
