R/simQLData.R
In qle: Simulation-Based Quasi-Likelihood Estimation

Documented in setQLdata simQLdata

# Copyright (C) 2018 Markus Baaske. All Rights Reserved.
# This code is published under the GPL (>=3).
#
# File: 	simQLData.R
# Date:  	14/03/2018
# Author: 	Markus Baaske
#
# Functions to collect the simulation results

# internal, default: create and use a local cluster object of size one
#' @importFrom digest digest
doInParallel <- function(X, FUN, ... , cl = NULL, iseed = NULL,
					cores = getOption("mc.cores",1L), 						# force sequential processing if cores=1L
					  cache = getOption("qle.cache",FALSE),
			  		   	  fun = getOption("qle.multicore","lapply"))
{
	SIM <- if(cache) {
			for(i in 1:length(X))
			  attr(X[[i]],"id") <- i
			tryCatch({
			  function(x,...) {			
				dg <- digest::digest(list(FUN, x, attr(x,"id")))				
				cache.fn <- sprintf("cached_%s.rda",dg)				
				if (file.exists(cache.fn)){					
					load(cache.fn)
				} else {
					var <- FUN(x,...)					
					save(var, file = cache.fn)
				}
				return(var)
			  }
		    }, error = function(e) {
			   structure(
					   list(message=.makeMessage("Failed to load/save/execute cached results.",
								   conditionMessage(e)),
						    call=sys.call()),
					 class=c("error","condition"), error=e)
		        })
		   } else FUN	    
    
   noCluster <- is.null(cl)
   if(!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) runif(1)
   
   tryCatch({
		if(cores > 1L || !noCluster) {
			if(noCluster && fun == "mclapply" ){
				# Only for L'Ecuyer-CMRG we get reproducible results
				if(!is.null(iseed) && RNGkind()[1L] == "L'Ecuyer-CMRG")
				  set.seed(iseed)		   
			    if(.Platform$OS.type != "windows")
					parallel::mclapply(X, SIM, ..., mc.cores = cores)
				else {
				  options(mc.cores=1L)
				  lapply(X, SIM, ...)
				}
			} else {
				if(noCluster){
					## only a 'local' cluster is supported here
					type <- if(Sys.info()["sysname"] == "Linux")
								"FORK" else "PSOCK"
					cl <- parallel::makeCluster(cores,type=type)				
			    }
				if(any(class(cl) %in% c("MPIcluster","SOCKcluster","cluster"))){
					# Only for L'Ecuyer-CMRG we get reproducible results for a cluster				
					if(!is.null(iseed) && RNGkind()[1L] == "L'Ecuyer-CMRG")
					  parallel::clusterSetRNGStream(cl,iseed)
				    parallel::parLapply(cl, X = X, fun = SIM, ...)
				} else {
				   stop(paste0("Failed to initialize cluster: unsupported cluster class: ",class(cl)))
			    }
		    }
		} else {
			noCluster <- FALSE
			lapply(X, SIM, ...)	
		}
		
   },error = function(e) {
			return(
			  structure(
				 list(message=.makeMessage("Calling `",deparse(substitute(SIM)), "` failed: ",conditionMessage(e)),
						 call=sys.call()),
			   class = c("error", "condition"), error = e))
   },finally = {
	   if(noCluster && !is.null(cl)){
		   if(inherits(try(parallel::stopCluster(cl),silent=TRUE),"try-error"))
			  message(.makeMessage("Failed to stop cluster object."))
		   cl <- NULL
		   invisible(gc)
	  }
   })
}

#' @name 	simQLdata
#'
#' @title 	Simulate the statistical model 
#'
#' @description The function runs simulations of the user-defined statistical model
#'  either for a single parameter or a list of parameters. 
#'
#' @param sim			user supplied simulation function
#' @param X				list or matrix of model parameters 
#' @param ...			arguments passed to `\code{sim}`
#' @param nsim			number of simulation replications at each parameter
#' @param mode			type of return value: "\code{list}", "\code{matrix}", "\code{mean}"
#' @param cl			cluster object, \code{NULL} (default), of class \code{MPIcluster}, \code{SOCKcluster}, \code{cluster} 
#' @param iseed			integer seed for initializing the cluster workers
#' @param na.rm			whether to remove \code{NA} values from simulation results
#' @param verbose		if \code{TRUE}, then print intermediate output 
#'
#' @return List of (aggregated) simulation results of class \code{simQL} and the following attributes:    
#' 	\item{X}{ matrix of sample points (equal to `\code{X}` if supplied) or the result of \code{\link{multiDimLHS}}}
#'  \item{nsim}{ number of simulation runs at each parameter} 
#'  \item{iseed}{ an integer seed value to initialize cluster workers}
#'  \item{error}{ only in case of errors detected in simulation function}  				  
#'
#' @details Basically, the given simulation function `\code{sim}` is called for each parameter value
#'  `\code{nsim}` times and the results are returned depending on the chosen type `\code{mode}`.
#'  
#'  If `\code{X}` is \code{NULL} (default), then the design, that is, the matrix of sample points, is first generated
#'  by \code{\link{multiDimLHS}} and the result is stored as an attribute named `\code{X}`. In this case, the arguments
#'  `\code{N}`,`\code{bounds}` and `\code{method}` must be given as named input arguments as required by \code{\link{multiDimLHS}}.
#'  In case of catched errors, the function tries to omit the corresponding simulation runs from the final results if possible.
#' 
#'  Depending on the complexity of the model simulations we strongly encourage the use of a cluster object `\code{cl}`. Make sure
#'  to export all functions to the cluster environment beforehand which are required by the user-defined simulation function.
#'  Also, using the option `\code{qle.cache}` caches results of a function call to be stored in the current working directory.
#'  The filename is generated by a hash code using the \code{\link[digest]{digest}} package. Setting an integer seed \code{iseed} before
#'  stores each result in a separate file and makes the data reproducible while loading the data again.
#' 
#' @author M. Baaske 
#' 
#' @examples 
#' # generate design points, simulate and return the sample means
#' sim <- simQLdata(sim=function(x) rlnorm(1,x[1],x[2]),
#'          nsim=10,N=8, method="maximinLHS",
#'          lb=c(-1.5,0),ub=c(2,1), mode="mean") 
#'   
#' @importFrom stats na.omit
#' @rdname simQLdata
#' @export
simQLdata <-
  function(sim, ..., nsim, X = NULL, mode = c("list","matrix","mean"), cl = NULL,
			 iseed = NULL, na.rm = getOption("na.rm",TRUE), verbose = FALSE)
{
	args <- list(...)	
	if(is.null(X)) {	  
	   my.args <- c("N","method","lb","ub")
	   id <- which(is.na(pmatch(my.args,names(args))))	  	   
       if(length(id)>0L)	  
		 stop(paste(c("Arguments of `multiDimLHS` ",my.args[id]," not found."),collapse=" "))	  
	   # generate random sample    
	   X <- multiDimLHS(args$N,args$lb,args$ub,method=args$method)	   
	   args <- args[-pmatch(my.args,names(args))]
	} else {
	 if(!is.list(X))
	   X <- .ROW2LIST(X)
	}		
	# check on input values	
	simFun <-
	 if(!inherits(sim,"simObj")){
		sim <- match.fun(sim)
		.checkfun(sim,args,remove=TRUE)		
		 structure(
			function(x) {				
				try(do.call(sim,c(list(x),args)))			
			},
	  	 class=c("simObj","function") )
	} else sim
	
  	# run simulations	
	stopifnot(!missing(nsim) || is.numeric(nsim))
	arg.list <- list(X=lapply(rep(X,each=nsim),unlist),
			      FUN=simFun, cl=cl, iseed=iseed)
			
    # simulate model
	if(verbose)
	  cat("Simulating the model...\n")
 	res <- do.call(doInParallel, arg.list)	
	
	# check results from user function simulation
	if(inherits(res,"error")) {
		msg <- .makeMessage("Error in `simQLdata`: ",conditionMessage(res))
		message(msg)
		return(.qleError(message=msg,call=match.call(),error=res))
	}
	errId <- which(sapply(res,function(x) .isError(x)))
	if(length(errId) > 0L) {
		msg <- .makeMessage("Simulation error (first error): ", res[[errId[1]]])
		message(msg)
		return(.qleError(message=msg,call=match.call(),error=res))
	}	
	np <- length(X)
	res <-
	  if(np > 1)
		split(res, cut(seq_along(res),np,labels=FALSE))
	  else list(res)

	mode <- match.arg(mode)	
	RES <- lapply(res,
			function(x) {			
				 # remove errors in results				 
				 err <- sapply(x,.isError)
				 ok <- which(!err)
				 if(length(ok) == 0L)
					return (x) 
				 
				 structure(
					 switch(mode,
							 "list" = {								
								 x[ok]
							 },
							 "mean"= {
								  if(na.rm)
								   apply(na.omit(as.data.frame(do.call(rbind,x[ok]))),2,mean)
								  else apply(do.call(rbind,x[ok]),2,mean)
							  },
							 "matrix" = {
								 data.matrix(
								  if(na.rm)
								   na.omit(as.data.frame(do.call(rbind,x[ok])))
								  else do.call(rbind,x[ok]))
		 					  }
			  		 ), error = if(sum(err) > 0L) {
								 id <- which(err)
								 message(.makeMessage("A total of ",length(id), " errors detected in user defined simulation function."))
								 structure(x[id],nErr = length(id))								 
						} else NULL)				
			 }
		  )
	names(RES) <- NULL
	nErr <- which(sapply(RES, function(x) !is.null(attr(x,"error"))))
	
	structure(RES,
			  nsim=nsim,			 
			  X=.LIST2ROW(X),
			  iseed=iseed,
		 	  error=if(length(nErr) > 0L) nErr else NULL,
	  class="simQL", call=sys.call())
	
}

# Cholesky decomposition of covariance matrices
varCHOLdecomp <- function(matList) {
	# n <- NROW(matList[[1]])
	# lx <- n*(n+1)/2
	if(!is.matrix(matList[[1]]))
	 stop("Elements of list must be matrices.\n", call.=TRUE)
	doIt <- function(i,x) {
		m <- try(chol(x[[i]],pivot = FALSE), silent=TRUE )
		if(inherits(m,"try-error") || anyNA(m) )
		 return(.qleError(message="Cholseky decomposition failed: ",
				   call=match.call(),error=m))
		as.vector(m[col(m)>=row(m)])
	}
	lapply(1:length(matList),doIt,x=matList)
}


.resampleCov <- function(T,num){
	n <- nrow(T)
	apply(
	  sapply(1:num,
		function(i){
			TS <- apply(T,2,sample,replace=TRUE)
			stopifnot(NROW(TS)==n)
			m <- var(TS,na.rm=TRUE)
			m[col(m)>=row(m)]
		}
	  ),1,var)
}


#' @name 	setQLdata
#'
#' @title	Setup of quasi-likelihood data for estimation
#'
#' @description Aggregate the data for quasi-likelihood estimation 
#'
#' @param runs		list or matrix of simulation results obtained from \code{\link{simQLdata}}
#' @param X			list or matrix of (design) points, i.e. model parameters
#' @param var.type	name of variance matrix approximation type: "\code{cholMean}" (default)
#' @param Nb		number of bootstrap samples, \code{=0} (default, no bootstrap used), to be generated for kriging the variance matrix,
#' 				    only if `\code{var.type}`=`\code{kriging}`
#' @param na.rm 	if \code{TRUE} (default), remove `NA` values from simulation results
#' @param verbose	if \code{TRUE}, print intermediate output
#'
#' @return
#'  An object of class \code{QLdata} as a data frame with columns:
#'   \item{X}{ Model parameters (\code{n=1},...,\code{q}) }
#'   \item{mean}{ Results of simulation runs (\code{m=1},...,\code{p}) }
#'   \item{var}{ Simulation variances of statistics (\code{m=1},...,\code{p}) }
#'   \item{L}{ if applicable, Cholesky decomposed terms of variance matrices of statistics (k=1,...,(m*(m+1)/2))}
#'   \item{Lb}{ if applicable, bootstrap variances of covariances}
#' 	where `\code{p}` denotes the number of user defined statistics and `\code{q}` the problem dimension, that is,
#'  the number of statistical model parameters to be estimated.	
#' 
#' 	The following items are stored as attributes:
#' 
#'   \item{type}{ see above}
#' 	 \item{nsim}{ number of simulations spent at each (design) point}
#' 	 \item{xdim}{ length of model parameter}
#' 	 \item{nWarnings}{ Number of warnings during simulation runs}
#'	 \item{nErrors}{ Number of errors during simulation runs}
#' 	 \item{nIgnored}{ List of parameters ignored due to errors}
#'
#' @details
#'  The function aggregates all neccessary data for quasi-likelihood estimation storing the
#' 	sample points and the corresponding simulation results of the statistics. If `\code{X}` equals \code{NULL},
#'  then the sample points are taken from the object `\code{runs}`.
#' 
#'  The most critical part is the decomposition of variance matrices for each sample point unless `\code{var.type}`
#'  equals "\code{const}" in which case a constant variance matrix approximation is expected to be given by the user in function \code{\link{qle}}.
#'  The Cholesky decompositions are used for an average approximation of the variance matrices of the statistics when calculating the
#'  quasi-score vector or any type of function criterion. If these fail for any reason we try to ignore, if possible, the corresponding sample
#'  points and exclude these from all subsequent computations. Unless a constant variance matrix estimate is used, the default is to approximate the
#'  variance matrix at any model parameter by either a kriging approximation of the \emph{Cholesky} terms (kriging the variance matrix) or as an average
#'  over all sampled variance matrices (variance matrix average approximation) also based on the decomposed Cholesky terms (see vignette).
#' 
#' @examples 
#' # simulate model statistics at LHS design
#' sim <- simQLdata(sim =
#'          function(x,cond) {
#'            X <- rlnorm(cond$n,x[1],x[2])
#'            c("MED"=median(X),"MAD"=mad(X))
#'          },
#'          cond=list("n"=10),
#'          nsim=10, N=10, method="maximinLHS",
#'          lb=c("mu"=-1.5,"sd"=0), ub=c("mu"=2,"sd"=1))
#' 
#' # setup the QL data model using defaults
#' qldata <- setQLdata(sim,verbose=TRUE) 
#'   
#' @author M. Baaske
#' @rdname 	setQLdata
#' @export
setQLdata <- function(runs, X = NULL, var.type="cholMean",
						Nb = 0, na.rm = TRUE, verbose = FALSE) {		
	nErrors <- 0
	nWarnings <- 0	
	nsim <- attr(runs,"nsim")
	stopifnot(!is.null(nsim) || is.numeric(nsim))
	
	if(class(runs) != "simQL" || !is.list(runs))
	  stop("Argument `runs` must be of class `simQL` and `list` type.")
	
    if(is.null(X))
	 X <- attr(runs,"X")
	if(!is.matrix(X)) 
   	 X <- .LIST2ROW(X)
 	 
	.extract <-  function(dataT) {
		if( !is.null(attr(dataT,"error")) || inherits(dataT,"error")) {
			msg <- paste0("Value of statistics has errors: ","\n")
			message(msg)
			return(.qleError(message=msg,call=match.call(),error=dataT))
	  	}
		if(is.list(dataT))
		 dataT <- do.call(rbind,dataT)
	  	if(na.rm)
		 dataT <- dataT[!rowSums(is.na(dataT)) > 0L,,drop=FALSE]

		V <- try( var(dataT, na.rm=na.rm),silent=TRUE)
	    if (is.na(V) || !is.numeric(V) || inherits(V,"try-error") ) {
			msg <- paste0("Failed to get variance matrix of statistics.","\n")
			message(msg)
	        return(.qleError(message=msg,call=match.call(),error=V))
	    }	
		is.pos <- .isPosDef(V)
		if(is.pos != 0L) {
		  warning("Covariance matrix not positive definite!")
	  	}
		vmat <- as.matrix(V)		
		ret <- list("V" = vmat,
					"mstat" = colMeans(dataT),
					"vars" = diag(vmat),					  
					"is.pos" = is.pos)
		
		if(Nb > 0 && var.type == "kriging"){
		  ret$Lb <- try(.resampleCov(dataT,Nb),silent=TRUE)
		  if(inherits(ret$Lb,"try-error"))
			attr(ret,"error") <- TRUE			  
		} 
		ret
	}

	res <- tryCatch(
			  lapply(runs,.extract),
			error = function(e) e)
	
	if(.isError(res)){
		msg <- .makeMessage("Extracting values of statistics failed.")
		message(msg)
		return(.qleError(message=msg,call=sys.call(),error=res))
	}
	
	ok <- which(
		   sapply(res,
		    function(x){
			 if(.isError(x)) {
				ok <- FALSE # try to ignore this location
				nErrors <<- nErrors+1				
			 } else {
				ok <- (x$is.pos == 0L)
				if(!ok)
				 nWarnings <<- nWarnings+1			   
			 }
			 ok
	})) 
	
	notOk <- integer(0)	
	msg <- "Normal completion"
	
	nIgnored <-
	 if(length(ok) < length(res)) {
		 notOk <- which(!(1:length(res) %in% ok))
		 X[notOk,,drop=FALSE]
	 } else NULL	
    
	if(nErrors > 0L) {
		msg <- paste0(c("Errors at sample points: ", notOk), collapse = " ")
		message(msg)		
	} else if(nWarnings > 0L) {		
		msg <- paste(c("Variance matrix is not positive definite at sample points: ", notOk) ,collapse=" ")
		message(msg)
	}
	if(!is.null(nIgnored) && nrow(nIgnored) > 0L){
		message(.makeMessage("Ignored a total of ", nrow(nIgnored), " sample points.","\n"))		
	}	
	if(length(ok) == 0L ) {
	 	msg <- .makeMessage("No parameters match the simulation conditions: ","\n")
		message(msg)
	    return(
		  .qleError(message=msg,
				    call=match.call(),
			 	    xdim=ncol(X), 
					nsim=nsim,
					nWarnings=nWarnings,
				    nErrors=nErrors,
					nIgnored=nIgnored,
					error=res)
		)
  	}
	# Cholesky decompositions	
	cvmats <- 
	 if(var.type != "const") {
		# covariance decompositions	
	    if(verbose)
		 cat("Cholesky decompositon of covariance matrices...\n")
	
	 	vmats <- lapply(res[ok],"[[","V" )
		L <- try(varCHOLdecomp(vmats),silent=TRUE)
		if(.isError(L)) {
			msg <- paste0("Cholseky decomposition failed: ","\n")
			message(msg)
			stop(.qleError(message=msg,call=match.call(),error=L))
		}
		ok2 <- which(sapply(L,function(x) !.isError(x)))
		if(length(ok2) == 0L){
			msg <- paste0("All Cholesky decompositions failed: ","\n")
			message(msg)
			return(.qleError(message=msg,call=match.call(),error=L))
	 	} else if(length(ok2) < length(L))
		   warning(paste0("A total of ",length(L)-length(ok2), " Cholesky decomposition(s) failed."))
	    
	    cvmats <- as.data.frame(do.call(rbind, L[ok2]))
	    colnames(cvmats) <- paste("L", rep(1:NCOL(cvmats)),sep="")
	   
		# bootstrap variances of covariances 
		# a local nugget variances
		if(Nb > 0 && var.type == "kriging"){
			Lb <- lapply(res[ok],"[[","Lb")			
			Lbmats <- as.data.frame(do.call(rbind,Lb[ok2]))
			colnames(Lbmats) <- paste("Lb", rep(1:NCOL(Lbmats)),sep="")
			cvmats <- cbind(cvmats,Lbmats)
		}
		if(length(ok) != length(ok2))
		  ok <- ok2
	    cvmats		
	} else NULL
	
	X <- X[ok,,drop=FALSE]
	mstats <- do.call(rbind,lapply(res[ok],"[[","mstat"))
	mvars  <- do.call(rbind,lapply(res[ok],"[[","vars"))
	
	qld <- as.data.frame(cbind(rbind(X),mstats,mvars))
	nms <- 
	 if(is.null(colnames(X)) | is.null(colnames(mstats)) | is.null(colnames(mvars))) {
		 c(colnames(qld[seq(ncol(X))]),
		   paste("mean.T",rep(1:NCOL(mstats)),sep=""),
		   paste("var.T",rep(1:NCOL(mvars)),sep=""))			   
	 } else {
		 c(colnames(X),
		   paste0("mean.",colnames(mstats)),
		   paste0("var.",colnames(mvars)))			   
	 }			
	colnames(qld) <- nms			  
	if(!is.null(cvmats)){
   	 qld <- cbind(qld,data.frame(cvmats))
 	}

	structure(qld,
			  xdim=ncol(X),
			  nsim=nsim,
			  Nb=if(var.type != "kriging") 0 else Nb,
			  nWarnings=nWarnings,
			  nErrors=nErrors,
			  nIgnored=nIgnored,
			  message=msg,
			  call=match.call(),
			  class=c("QLdata","data.frame"))
}