NMF-vignette.R
In NMF: Algorithms and Framework for Nonnegative Matrix Factorization (NMF)

## ----pkgmaker_preamble, echo=FALSE, results='asis'----------------------------
library(NMF)
latex_preamble()
if(!requireNamespace("Biobase")) BiocManager::install("Biobase")

## ----bibliofile, echo=FALSE, results='asis'-----------------------------------
latex_bibliography('NMF')

## ----options, echo=FALSE------------------------------------------------------
set.seed(123456)
library(knitr)

# Helper functions:
hook_try <- function(before, options, envir){
	
	.try_defined <- FALSE
	
	# remove hacked version of try
	if( !before ){
		if( .try_defined && exists('try', envir = envir, inherits = FALSE) ){
			remove(list = 'try', envir = envir)
		}
		.try_defined <<- FALSE
		return(invisible())
	}
	
	if( !is.null(options$try) ){
		
		# signal
		do.signal <- isFALSE(options$try)
		if( isManualVignette() && isTRUE(options$try) ){
			do.signal <- TRUE
		}
		# define hacked version of try()
		.try <- try_message(do.signal)
		assign('try', .try, envir)
		.try_defined <<- TRUE
	}
}

chunkOutputHook <- function(name, hook, type = c('output', 'source', 'chunk')){
	type <- match.arg(type)
	function(){
		
		if( !requireNamespace('knitr', quietly = TRUE) ) 
			stop("Package 'knitr' is required to setup knit hook '", name, "'")
		
		.hook_bkp <- NULL
		function(before, options, envir){
			# do nothing if the option is not ON
			if( is.null(options[[name]]) ) return()
			
			# set/unset hook
			if( before ){
				# store current hook function
				if( is.null(.hook_bkp) ) .hook_bkp <<- knitr::knit_hooks$get(type)
				
				# define hook wrapper
				hook_wrapper <- function(x, options){
					res <- .hook_bkp(x, options)
					hook(res, options)
				}
				
				args <- list()
				args[[type]] <- hook_wrapper
				do.call(knitr::knit_hooks$set, args)
			}else{
				args <- list()
				args[[type]] <- .hook_bkp
				do.call(knitr::knit_hooks$set, args)
				.hook_bkp <<- NULL
			}
		}
	}
}


hook_backspace <- chunkOutputHook('backspace', 
		function(x, options){
			if( !isTRUE(options$backspace) ) x
			str_bs(x)
		}
)

str_bs <- function(x){
	# remove leading backspaces
	x <- gsub("^\b+", "", x)
	# remove backspaces at beginning of line
	x <- gsub("\n\b+", '\n', x)
	while( length(grep('\b', x, fixed = TRUE)) ) 
		x <- gsub('[^\n\b][\b]', '', x)
	
	x
}

isManualVignette <- function(){
	isTRUE(getOption('R_RUNNING_MANUAL_VIGNETTE'))
}

try_message <- function(signal = FALSE){
	function(expr){
		tryCatch(expr, error = function(e){
					if( signal ) message(e)
					else message('Error: ', conditionMessage(e))
					invisible()
				})
	}
}

knit_hooks$set(try = hook_try, backspace = hook_backspace())

## ----load_library, echo=FALSE, include=FALSE----------------------------------
# Load
library(NMF)
# limit number of cores used
nmf.options(cores = 2)

## ----load_library_fake, eval=FALSE--------------------------------------------
#  # Install
#  install.packages('NMF')
#  # Load
#  library(NMF)

## ----updateObject, eval=FALSE-------------------------------------------------
#  # eg., load from some RData file
#  load('object.RData')
#  # update class definition
#  object <- nmfObject(object)

## ----features, echo=FALSE-----------------------------------------------------
nalgo <- length(nmfAlgorithm())
nseed <- length(nmfSeed())

## ----nmfAlgorithm-------------------------------------------------------------
# list all available algorithms
nmfAlgorithm()
# retrieve a specific algorithm: 'brunet' 
nmfAlgorithm('brunet')
# partial match is also fine
identical(nmfAlgorithm('br'), nmfAlgorithm('brunet')) 

## ----nmfSeed------------------------------------------------------------------
# list all available seeding methods
nmfSeed()
# retrieve a specific method: 'nndsvd' 
nmfSeed('nndsvd')
# partial match is also fine
identical(nmfSeed('nn'), nmfSeed('nndsvd'))

## ----show_Rversions-----------------------------------------------------------
nmfAlgorithm(all=TRUE)

# to get all the algorithms that have a secondary R version
nmfAlgorithm(version='R')

## ----perftable_setup, cache=TRUE----------------------------------------------
# retrieve all the methods that have a secondary R version
meth <- nmfAlgorithm(version='R')
meth <- c(names(meth), meth)
meth

if(requireNamespace("Biobase", quietly=TRUE)){
# load the Golub data
data(esGolub)

# compute NMF for each method
res <- nmf(esGolub, 3, meth, seed=123456)

# extract only the elapsed time
t <- sapply(res, runtime)[3,]
}

## ----perftable, echo=FALSE, results='asis'------------------------------------
# speed-up
m <- length(res)/2
su <- cbind( C=t[1:m], R=t[-(1:m)], Speed.up=t[-(1:m)]/t[1:m])

library(xtable)
xtable(su, caption='Performance speed up achieved by the optimized C++ implementation for some of the NMF algorithms.', label='tab:perf')

## ----citations, eval=FALSE----------------------------------------------------
#  # plain text
#  citation('NMF')
#  
#  # or to get the bibtex entries
#  #toBibtex(citation('NMF'))

## ----esGolub------------------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
data(esGolub)
esGolub
esGolub <- esGolub[1:200,]

# remove the uneeded variable 'Sample' from the phenotypic data
esGolub$Sample <- NULL
}

## ----algo_default, cache=TRUE-------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# default NMF algorithm
res <- nmf(esGolub, 3)
}

## ----single_show--------------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
res 
}

## ----single_show_model--------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
fit(res)
}

## ----single_show_estimate-----------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
V.hat <- fitted(res)
dim(V.hat)
}

## ----singlerun_summary--------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
summary(res)

# More quality measures are computed, if the target matrix is provided: 
summary(res, target=esGolub)
}

## ----singlerun_summary_factor-------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
summary(res, class=esGolub$Cell)
}

## ----get_matrices-------------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# get matrix W
w <- basis(res)
dim(w)

# get matrix H
h <- coef(res)
dim(h)
}

## ----subset-------------------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# keep only the first 10 features
res.subset <- res[1:10,] 
class(res.subset)
dim(res.subset)
# keep only the first 10 samples 
dim(res[,1:10])
# subset both features and samples:
dim(res[1:20,1:10])
}

## ----single_extract-----------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# only compute the scores
s <- featureScore(res)
summary(s)

# compute the scores and characterize each metagene
s <- extractFeatures(res)
str(s)
}

## ----algo_list----------------------------------------------------------------
nmfAlgorithm()

## ----algo_lee, cache=TRUE-----------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# using Lee and Seung's algorithm
res <- nmf(esGolub, 3, 'lee')
algorithm(res)
}

## ----algo_ns, cache=TRUE------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# using the Nonsmooth NMF algorithm with parameter theta=0.7
res <- nmf(esGolub, 3, 'ns', theta=0.7)
algorithm(res)
fit(res)
}

## ----algo_pe, cache=TRUE------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# using the PE-NMF algorithm with parameters alpha=0.01, beta=1
res <- nmf(esGolub, 3, 'pe', alpha=0.01, beta=1)
res
}

## ----seed_list----------------------------------------------------------------
nmfSeed()

## ----seed, cache=TRUE---------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
res <- nmf(esGolub, 3, seed='nndsvd')
res
}

## ----seed_numeric, cache=TRUE-------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# single run and single numeric seed
res <- nmf(esGolub, 3, seed=123456)
showRNG(res)

# multiple runs and single numeric seed
res <- nmf(esGolub, 3, seed=123456, nrun=2)
showRNG(res)

# single run with a 6-length seed
res <- nmf(esGolub, 3, seed=rep(123456, 6))
showRNG(res)
}

## ----seed_WH------------------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# initialize a "constant" factorization based on the target dimension
init <- nmfModel(3, esGolub, W=0.5, H=0.3)
head(basis(init))

# fit using this NMF model as a seed
res <- nmf(esGolub, 3, seed=init)
}

## ----algo_multirun, cache=TRUE------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
res.multirun <- nmf(esGolub, 3, nrun=5)
res.multirun
}

## ----multirun_keep, cache=TRUE------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# explicitly setting the option keep.all to TRUE
res <- nmf(esGolub, 3, nrun=5, .options=list(keep.all=TRUE))
res
}

## ----multirun_keep_alt, eval=FALSE--------------------------------------------
#  if(requireNamespace("Biobase", quietly=TRUE)){
#  # or using letter code 'k' in argument .options
#  nmf(esGolub, 3, nrun=5, .options='k')
#  }

## ----parallel_multicore_alt, eval=FALSE---------------------------------------
#  if(requireNamespace("Biobase", quietly=TRUE)){
#  # the default call will try to run in parallel using all the cores
#  # => will be in parallel if all the requirements are satisfied
#  nmf(esGolub, 3, nrun=5, .opt='v')
#  
#  # request a certain number of cores to use => no error if not possible
#  nmf(esGolub, 3, nrun=5, .opt='vp8')
#  
#  # force parallel computation: use option 'P'
#  nmf(esGolub, 3, nrun=5, .opt='vP')
#  
#  # require an improbable number of cores => error
#  nmf(esGolub, 3, nrun=5, .opt='vP200')
#  }

## ----mpi, eval=FALSE----------------------------------------------------------
#  # file: mpi.R
#  
#  if(requireNamespace("Biobase", quietly=TRUE)){
#  ## 0. Create and register an MPI cluster
#  library(doMPI)
#  cl <- startMPIcluster()
#  registerDoMPI(cl)
#  library(NMF)
#  
#  # run on all workers using the current parallel backend
#  data(esGolub)
#  res <- nmf(esGolub, 3, 'brunet', nrun=n, .opt='p', .pbackend=NULL)
#  
#  # save result
#  save(res, file='result.RData')
#  
#  ## 4. Shutdown the cluster and quit MPI
#  closeCluster(cl)
#  mpi.quit()
#  }

## ----force_seq, cache=TRUE, backspace = TRUE----------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# parallel execution on 2 cores (if possible)
res1 <- nmf(esGolub, 3, nrun=5, .opt='vp2', seed=123)

# or use the doParallel with single core
res2 <- nmf(esGolub, 3, nrun=5, .opt='vp1', seed=123)

# force sequential computation by sapply: use option '-p' or .pbackend=NA  
res3 <- nmf(esGolub, 3, nrun=5, .opt='v-p', seed=123)
res4 <- nmf(esGolub, 3, nrun=5, .opt='v', .pbackend=NA, seed=123)

# or use the SEQ backend of foreach: .pbackend='seq'
res5 <- nmf(esGolub, 3, nrun=5, .opt='v', .pbackend='seq', seed=123)

# all results are all identical
nmf.equal(list(res1, res2, res3, res4, res5))
}

## ----estimate_rank, cache=TRUE------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# perform 10 runs for each value of r in range 2:6
estim.r <- nmf(esGolub, 2:6, nrun=10, seed=123456)
}

## ----estimate_rank_plot, fig.width=10, fig.height=6---------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
plot(estim.r)
}

## ----estimate_rank_hm_include, fig.width=14, fig.height=7, fig.keep='last'----
if(requireNamespace("Biobase", quietly=TRUE)){
consensusmap(estim.r, annCol=esGolub, labCol=NA, labRow=NA)
}

## ----estimate_rank_random, cache=TRUE, fig.width=10, fig.height=6, fig.keep='last'----
if(requireNamespace("Biobase", quietly=TRUE)){
# shuffle original data
V.random <- randomize(esGolub)
# estimate quality measures from the shuffled data (use default NMF algorithm)
estim.r.random <- nmf(V.random, 2:6, nrun=10, seed=123456)
# plot measures on same graph
plot(estim.r, estim.r.random)
}

## ----multimethod, cache=TRUE--------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# fit a model for several different methods 
res.multi.method <- nmf(esGolub, 3, list('brunet', 'lee', 'ns'), seed=123456, .options='t')
}

## ----compare------------------------------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
compare(res.multi.method)

# If prior knowledge of classes is available
compare(res.multi.method, class=esGolub$Cell)
}

## ----errorplot_compute, cache=TRUE--------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
# run nmf with .option='t'
res <- nmf(esGolub, 3, .options='t')
# or with .options=list(track=TRUE)
}

## ----errorplot, out.width="0.5\\textwidth", fig.show='hold'-------------------
if(requireNamespace("Biobase", quietly=TRUE)){
plot(res)
plot(res.multi.method)
}

## ----heatmap_coef_basis_inc, fig.width=14, fig.height=7, fig.keep='last'------
if(requireNamespace("Biobase", quietly=TRUE)){
layout(cbind(1,2))
# basis components
basismap(res, subsetRow=TRUE)
# mixture coefficients
coefmap(res)
}

## ----heatmap_consensus_inc, out.width="0.49\\textwidth", crop=TRUE, echo=1:2----
if(requireNamespace("Biobase", quietly=TRUE)){
# The cell type is used to label rows and columns 
consensusmap(res.multirun, annCol=esGolub, tracks=NA)
plot(1:10)
f2 <- fig_path("2.pdf")
}

## ----hack_consensus, include=FALSE--------------------------------------------
if(requireNamespace("Biobase", quietly=TRUE)){
file.copy('consensus.pdf', f2, overwrite=TRUE)
}

## ----custom_algo_sig----------------------------------------------------------
my.algorithm <- function(x, seed, param.1, param.2){
	# do something with starting point
	# ...
	
	# return updated starting point
	return(seed)
}

## ----custom_algo--------------------------------------------------------------
my.algorithm <- function(x, seed, scale.factor=1){
	# do something with starting point
	# ...
	# for example: 
	# 1. compute principal components	
	pca <- prcomp(t(x), retx=TRUE)
	
	# 2. use the absolute values of the first PCs for the metagenes
	# Note: the factorization rank is stored in object 'start'	
	factorization.rank <- nbasis(seed)
	basis(seed) <- abs(pca$rotation[,1:factorization.rank])
	# use the rotated matrix to get the mixture coefficient
	# use a scaling factor (just to illustrate the use of extra parameters)
	coef(seed) <- t(abs(pca$x[,1:factorization.rank])) / scale.factor
	
	# return updated data
	return(seed)
}

## ----define_V-----------------------------------------------------------------
n <- 50; r <- 3; p <- 20
V <-syntheticNMF(n, r, p)

## ----custom_algo_run----------------------------------------------------------
nmf(V, 3, my.algorithm, scale.factor=10)

## ----custom_algo_run_obj------------------------------------------------------
# based on Kullback-Leibler divergence
nmf(V, 3, my.algorithm, scale.factor=10, objective='KL')
# based on custom distance metric
nmf(V, 3, my.algorithm, scale.factor=10
	, objective=function(model, target, ...){
            ( sum( (target-fitted(model))^4 ) )^{1/4} 
		}
)

## ----custom_algo_run_mixed, error = TRUE, try = TRUE--------------------------
# put some negative input data 
V.neg <- V; V.neg[1,] <- -1;

# this generates an error
try( nmf(V.neg, 3, my.algorithm, scale.factor=10) )

# this runs my.algorithm without error
nmf(V.neg, 3, my.algorithm, mixed=TRUE, scale.factor=10)

## ----nmf_models---------------------------------------------------------------
nmfModel()

## ----custom_algo_NMFoffset----------------------------------------------------
my.algorithm.offset <- function(x, seed, scale.factor=1){
	# do something with starting point
	# ...
	# for example: 
	# 1. compute principal components	
	pca <- prcomp(t(x), retx=TRUE)
	
	# retrieve the model being estimated
	data.model <- fit(seed)
	
	# 2. use the absolute values of the first PCs for the metagenes
	# Note: the factorization rank is stored in object 'start'	
	factorization.rank <- nbasis(data.model)
	basis(data.model) <- abs(pca$rotation[,1:factorization.rank])	
	# use the rotated matrix to get the mixture coefficient
	# use a scaling factor (just to illustrate the use of extra parameters)
	coef(data.model) <- t(abs(pca$x[,1:factorization.rank])) / scale.factor
	
	# 3. Compute the offset as the mean expression
	data.model@offset <- rowMeans(x)	
	
	# return updated data
	fit(seed) <- data.model
	seed
}

## ----custom_algo_NMFOffset_run------------------------------------------------
# run custom algorithm with NMF model with offset
nmf(V, 3, my.algorithm.offset, model='NMFOffset', scale.factor=10)

## ----custom_seed--------------------------------------------------------------

# start: object of class NMF
# target: the target matrix
my.seeding.method <- function(model, target){
	
	# use only the largest columns for W
	w.cols <- apply(target, 2, function(x) sqrt(sum(x^2)))
	basis(model) <- target[,order(w.cols)[1:nbasis(model)]]
	
	# initialize H randomly
	coef(model) <- matrix(runif(nbasis(model)*ncol(target))
						, nbasis(model), ncol(target))

	# return updated object
	return(model)
}

## ----custom_seed_run----------------------------------------------------------
nmf(V, 3, 'snmf/r', seed=my.seeding.method)

## ----options_algo, eval=1:6---------------------------------------------------
#show default algorithm and seeding method
nmf.options('default.algorithm', 'default.seed')

# retrieve a single option
nmf.getOption('default.seed')

# All options
nmf.options()

## ----print_options------------------------------------------------------------
nmf.printOptions()

## ----sessionInfo, echo=FALSE, results='asis'----------------------------------
toLatex(sessionInfo())