R/simQLData.R

Defines functions doInParallel simQLdata varCHOLdecomp .resampleCov setQLdata

Documented in setQLdata simQLdata

# Copyright (C) 2017 Markus Baaske. All Rights Reserved.
# This code is published under the GPL (>=3).
#
# File: 	simQLData.R
# Date:  	25/10/2017
# Author: 	Markus Baaske
#
# Functions to collect the simulation results

# internal
#' @importFrom digest digest
doInParallel <- function(X, FUN, ... , cl = NULL, iseed = NULL,
							cache = getOption("qle.cache",FALSE) )
{
	SIM <- if(cache) {
			for(i in 1:length(X))
			  attr(X[[i]],"id") <- i
			tryCatch({
			  function(x,...) {			
				dg <- digest::digest(list(FUN, x, attr(x,"id")))				
				cache.fn <- sprintf("cached_%s.rda",dg)				
				if (file.exists(cache.fn)){					
					load(cache.fn)
				} else {
					var <- FUN(x,...)					
					save(var, file = cache.fn)
				}
				return(var)
			  }
		    }, error = function(e) {
			   structure(
					   list(message=.makeMessage("Failed to load/save/execute cached results.",
								   conditionMessage(e)),
						    call=sys.call()),
					 class=c("error","condition"), error=e)
		        })
		   } else FUN	    
    
   tryCatch({
		noCluster <- is.null(cl)
		cores <- getOption("mc.cores",1L)
	    if(noCluster && (length(X)==1L || cores==1L)){
			noCluster <- FALSE
			lapply(X, SIM, ...)		  
		} else {
			if(noCluster){
				type <- if(Sys.info()["sysname"]=="Linux")
							"FORK" else "PSOCK"
				try(cl <- parallel::makeCluster(cores,type=type),silent=FALSE)				
		    }
			if(is.null(cl))
			  stop("Could not initialize cluster object.")	
			if(any(class(cl) %in% c("MPIcluster","SOCKcluster","cluster"))){
			  clusterSetRNGStream(cl,iseed)
			  parallel::parLapplyLB(cl, X = X, fun = SIM, ...)
			} else stop(paste0("Unsupported cluster object: ",class(cl)))			
	    }
   },error = function(e) {
			return(
			  structure(
				 list(message=.makeMessage("Calling `",deparse(substitute(SIM)), "` failed: ",conditionMessage(e)),
						 call=sys.call()),
			   class = c("error", "condition"), error = e))
   },finally = {
	   if(noCluster && !is.null(cl)){
		   if(inherits(try(stopCluster(cl),silent=TRUE),"try-error")){
			   rm(cl)			   
			   message("Error in stopping cluster.")
		   } else {
			   cl <- NULL
			   invisible(gc())
		   } 
	  }
   })
}

#' @name 	simQLdata
#'
#' @title 	Simulate the statistical model 
#'
#' @description The function runs simulations of the user-defined statistical model
#'  either for a single parameter or a list of parameters. 
#'
#' @param sim			user supplied simulation function
#' @param X				list or matrix of model parameters 
#' @param ...			arguments passed to `\code{sim}`
#' @param nsim			number of simulation replications at each parameter
#' @param mode			type of return value
#' @param cl			cluster object, \code{NULL} (default), of class "\code{MPIcluster}", "\code{SOCKcluster}", "\code{cluster}" 
#' @param iseed			integer seed for initializing the cluster workers
#' @param na.rm			whether to remove `\code{NA}` values from simulation results
#' @param verbose		if \code{TRUE}, then print intermediate output 
#'
#' @return List of (aggregated) simulation results of class \code{simQL} and the following attributes:    
#' 	\item{X}{ matrix of sample points equal to `\code{X}`, if supplied, or
#' 				 the result of \code{\link{multiDimLHS}}}
#'  \item{nsim}{ number of simulation runs at each parameter} 
#'  \item{iseed}{ an integer seed value to initialize cluster workers}
#'  \item{error}{ only in case of errors detected in simulation function}  				  
#'
#' @details Basically, the given simulation function `\code{sim}` is called for each parameter value
#'  `\code{nsim}` times and the results are returned depending on the chosen type `\code{mode}`. This is either 
#'  a list or a matrix as the outcome of function `\code{sim}` or the mean of `\code{nsim}` simulation replications at each parameter.
#'  
#'  If `\code{X}` equals \code{NULL} (default), then the design, that is, the matrix of sample points, is first generated
#'  by function \code{\link{multiDimLHS}} and the result is stored as an attribute named `\code{X}`. In this case, the arguments
#'  `\code{N}`,`\code{bounds}` and `\code{method}` must be given as named input arguments which are then passed to \code{\link{multiDimLHS}}.
#'  In case of errors catched, the corresponding simulation runs are omitted from the results if possible.
#' 
#'  We recommend using a cluster object `\code{cl}` depending on the complexity of model simulations. If using a cluster make sure
#'  to export all functions to the cluster environment beforehand which are required by the user-defined simulation function.
#'  Also, using the option `\code{qle.cache}` caches results of a function call to be stored in the current working directory.
#'  The filename is generated by a hash code using the \code{\link[digest]{digest}} package. Setting an integer seed \code{iseed} before
#'  stores each result in a separate file and makes the data reproducible while loading the data again.
#' 
#' @author M. Baaske 
#' 
#' @examples 
#' # generate design points, simulate and return the sample means
#' sim <- simQLdata(sim=function(x) rlnorm(1,x[1],x[2]),
#'          nsim=10,N=8, method="maximinLHS",
#'          lb=c(-1.5,0),ub=c(2,1), mode="mean") 
#'   
#' @importFrom stats na.omit
#' @rdname simQLdata
#' @export
simQLdata <-
  function(sim, ..., nsim, X = NULL, mode = c("list","matrix","mean"), cl = NULL,
			 iseed = NULL, na.rm = getOption("na.rm",TRUE), verbose = FALSE)
{
	args <- list(...)	
	if(is.null(X)) {	  
	   my.args <- c("N","method","lb","ub")
	   id <- which(is.na(pmatch(my.args,names(args))))	  	   
       if(length(id)>0L)	  
		 stop(paste(c("Arguments of `multiDimLHS` ",my.args[id]," not found."),collapse=" "))	  
	   # generate random sample    
	   X <- multiDimLHS(args$N,args$lb,args$ub,method=args$method)	   
	   args <- args[-pmatch(my.args,names(args))]
	} else {
	 if(!is.list(X))
	   X <- .ROW2LIST(X)
	}		
	# check on input values	
	simFun <-
	 if(!inherits(sim,"simObj")){
		sim <- match.fun(sim)
		.checkfun(sim,args,remove=TRUE)		
		 structure(
			function(x) {				
				try(do.call(sim,c(list(x),args)))			
			},
	  	 class=c("simObj","function") )
	} else sim
	
  	# run simulations	
	stopifnot(!missing(nsim) || is.numeric(nsim))
	arg.list <- list(X=lapply(rep(X,each=nsim),unlist),
			      FUN=simFun, cl=cl, iseed=iseed)
			
    # simulate model
	if(verbose)
	  cat("Simulating the model...\n")
 	res <- do.call(doInParallel, arg.list)	
	
	# check results from user function simulation
	if(inherits(res,"error")) {
		msg <- .makeMessage("Error in `simQLdata`: ",conditionMessage(res))
		message(msg)
		return(.qleError(message=msg,call=match.call(),error=res))
	}
	errId <- which(sapply(res,function(x) .isError(x)))
	if(length(errId) > 0L) {
		msg <- .makeMessage("Simulation error (first error): ", res[[errId[1]]])
		message(msg)
		return(.qleError(message=msg,call=match.call(),error=res))
	}	
	np <- length(X)
	res <-
	  if(np > 1)
		split(res, cut(seq_along(res),np,labels=FALSE))
	  else list(res)

	mode <- match.arg(mode)	
	RES <- lapply(res,
			function(x) {			
				 # remove errors in results				 
				 err <- sapply(x,.isError)
				 ok <- which(!err)
				 if(length(ok) == 0L)
					return (x) 
				 
				 structure(
					 switch(mode,
							 "list" = {								
								 x[ok]
							 },
							 "mean"= {
								  if(na.rm)
								   apply(na.omit(as.data.frame(do.call(rbind,x[ok]))),2,mean)
								  else apply(do.call(rbind,x[ok]),2,mean)
							  },
							 "matrix" = {
								 data.matrix(
								  if(na.rm)
								   na.omit(as.data.frame(do.call(rbind,x[ok])))
								  else do.call(rbind,x[ok]))
		 					  }),
					 error = if(sum(err) > 0L) {
								 id <- which(err)
								 message(.makeMessage("A total of ",length(id), " errors detected in user defined simulation function."))
								 structure(x[id],nErr = length(id))								 
							 } else NULL)				
			 }
		  )
	names(RES) <- NULL
	nErr <- which(sapply(RES, function(x) !is.null(attr(x,"error"))))
	
	structure(RES,
			  nsim=nsim,			 
			  X=.LIST2ROW(X),
			  iseed=iseed,
		 	  error=if(length(nErr) > 0L) nErr else NULL,
	  class="simQL", call=sys.call())
	
}

# Cholesky decomposition of covariance matrices
varCHOLdecomp <- function(matList) {
	# n <- NROW(matList[[1]])
	# lx <- n*(n+1)/2
	if(!is.matrix(matList[[1]]))
	 stop("Elements of list must be matrices.\n", call.=TRUE)
	doIt <- function(i,x) {
		m <- try(chol(x[[i]],pivot = FALSE), silent=TRUE )
		if(inherits(m,"try-error") || anyNA(m) )
		 return( .qleError(message="Cholseky decomposition failed: ",
				   call=match.call(),error=m))
		as.vector(m[col(m)>=row(m)])
	}
	lapply(1:length(matList),doIt,x=matList)
}


.resampleCov <- function(T,num){
	n <- nrow(T)
	apply(
	  sapply(1:num,
		function(i){
			TS <- apply(T,2,sample,replace=TRUE)
			stopifnot(NROW(TS)==n)
			m <- var(TS,na.rm=TRUE)
			m[col(m)>=row(m)]
		}
	  ),1,var)
}


#' @name 	setQLdata
#'
#' @title	Set quasi-likelihood (QL) data
#'
#' @description Aggregate the data for quasi-likelihood estimation 
#'
#' @param runs		list or matrix of simulation results from \code{\link{simQLdata}}
#' @param X			list or matrix of model parameters
#' @param var.type	character, "\code{cholMean}" (default), whether to Cholesky decompose variance
#' 					matrices either for sample average variance approximation or kriging variance matrices
#' @param Nb		numeric, number of bootstrap samples for kriging the variance matrix,
#' 				    only if `\code{var.type}`=`\code{kriging}`, default is zero which uses no bootstrapping
#' @param na.rm 	if \code{TRUE} (default), remove `NA` values from simulation results
#' @param verbose	if \code{TRUE}, print intermediate output
#'
#' @return
#'  An object of class \code{QLdata}  as a data frame with columns:
#'   \item{X}{ Model parameters (\code{n=1},...,\code{q}) }
#'   \item{mean}{ Results of simulations (\code{m=1},...,\code{p}) }
#'   \item{var}{ Simulation variances of statistics (\code{m=1},...,\code{p}) }
#'   \item{L}{ if applicable, Cholesky decomposed terms of variance matrices of statistics (k=1,...,(m*(m+1)/2))}
#' 	where `\code{p}` denotes the number of user defined statistics and `\code{q}` the problem dimension, that is,
#'  the number of parameters to be estimated.	
#' 
#' 	The following items are stored as attributes:
#' 
#'   \item{type}{ see above}
#' 	 \item{nsim}{ number of simulations at each point }
#' 	 \item{xdim}{ parameter dimension}
#' 	 \item{nWarnings}{ Number of warnings during simulations}
#'	 \item{nErrors}{ Number of errors during simulations}
#' 	 \item{nIgnored}{ List of parameters ignored (because of failures)}
#'
#' @details
#'  The function aggregates all neccessary data for quasi-likelihood estimation storing the
#' 	sample locations and the corresponding simulation results of the statistics.
#'  If `\code{X}` equals \code{NULL}, then the sample points are taken from the object `\code{runs}`.
#' 
#'  The most critical part is the decomposition of variance matrices for each sample location unless `\code{var.type}`
#'  equals "\code{const}" in which case a constant variance matrix approximation is expected later by function \code{\link{qle}}.
#'  The Cholesky decompositions are used for average approximations of the variance matrix of the statistics when calculating the
#'  quasi-score vector or any type of function criterion based on the Mahalanobis distance or quasi-deviance.
#'  If these fail for any reason we try to ignore, if possible, the corresponding sample points and exclude them
#'  from all following calculations. Unless a constant estimate of the variance matrix, the default is to approximate the
#'  variance at any model parameter by either a kriging interpolation of the \emph{Cholesky} terms or as an average over
#'  all sampled variance matrices also based on the decomposed Cholesky terms (see vignette).
#' 
#' @examples 
#' # simulate model statistics at LHS design
#' sim <- simQLdata(sim =
#'          function(x,cond) {
#'            X <- rlnorm(cond$n,x[1],x[2])
#'            c("MED"=median(X),"MAD"=mad(X))
#'          },
#'          cond=list("n"=10),
#'          nsim=10, N=10, method="maximinLHS",
#'          lb=c("mu"=-1.5,"sd"=0), ub=c("mu"=2,"sd"=1))
#' 
#' # setup the QL data model using defaults
#' qldata <- setQLdata(sim,verbose=TRUE) 
#'   
#' @author M. Baaske
#' @rdname 	setQLdata
#' @export
setQLdata <- function(runs, X = NULL, var.type="cholMean",
						Nb = 0, na.rm = TRUE, verbose = FALSE) {		
	nErrors <- 0
	nWarnings <- 0	
	nsim <- attr(runs,"nsim")
	stopifnot(!is.null(nsim) || is.numeric(nsim))
	
	if(class(runs) != "simQL" || !is.list(runs))
	  stop("Argument `runs` must be of class `simQL` and `list` type.")
	
    if(is.null(X))
	 X <- attr(runs,"X")
	if(!is.matrix(X)) 
   	 X <- .LIST2ROW(X)
 	 
	.extract <-  function(dataT) {
		if( !is.null(attr(dataT,"error")) || inherits(dataT,"error")) {
			msg <- paste0("Value of statistics has errors: ","\n")
			message(msg)
			return(.qleError(message=msg,call=match.call(),error=dataT))
	  	}
		if(is.list(dataT))
		 dataT <- do.call(rbind,dataT)
	  	if(na.rm)
		 dataT <- dataT[!rowSums(is.na(dataT)) > 0L,,drop=FALSE]

		V <- try( var(dataT, na.rm=na.rm),silent=TRUE)
	    if (is.na(V) || !is.numeric(V) || inherits(V,"try-error") ) {
			msg <- paste0("Failed to get variance matrix of statistics.","\n")
			message(msg)
	        return(.qleError(message=msg,call=match.call(),error=V))
	    }	
		is.pos <- .isPosDef(V)
		if(is.pos != 0L) {
		  warning("Covariance matrix not positive definite!")
	  	}
		vmat <- as.matrix(V)		
		ret <- list("V" = vmat,
					"mstat" = colMeans(dataT),
					"vars" = diag(vmat),					  
					"is.pos" = is.pos)
		
		if(Nb > 0 && var.type == "kriging"){
		  ret$Lb <- try(.resampleCov(dataT,Nb),silent=TRUE)
		  if(inherits(ret$Lb,"try-error"))
			attr(ret,"error") <- TRUE			  
		} 
		ret
	}

	res <- tryCatch(
			  lapply(runs,.extract),
			error = function(e) e)
	
	if(.isError(res)){
		msg <- .makeMessage("Extracting values of statistics failed.")
		message(msg)
		return(.qleError(message=msg,call=sys.call(),error=res))
	}
	
	ok <- which(
		   sapply(res,
		    function(x){
			 if(.isError(x)) {
				ok <- FALSE # try to ignore this location
				nErrors <<- nErrors+1				
			 } else {
				ok <- (x$is.pos == 0L)
				if(!ok)
				 nWarnings <<- nWarnings+1			   
			 }
			 ok
	})) 
	
	notOk <- integer(0)	
	msg <- "Normal completion"
	
	nIgnored <-
	 if(length(ok) < length(res)) {
		 notOk <- which(!(1:length(res) %in% ok))
		 X[notOk,,drop=FALSE]
	 } else NULL	
    
	if(nErrors > 0L) {
		msg <- paste0(c("Errors at sample points: ", notOk), collapse = " ")
		message(msg)		
	} else if(nWarnings > 0L) {		
		msg <- paste(c("Variance matrix is not positive definite at sample points: ", notOk) ,collapse=" ")
		message(msg)
	}
	if(!is.null(nIgnored) && nrow(nIgnored) > 0L){
		message(.makeMessage("Ignored a total of ", nrow(nIgnored), " sample points.","\n"))		
	}	
	if(length(ok) == 0L ) {
	 	msg <- .makeMessage("No parameters match the simulation conditions: ","\n")
		message(msg)
	    return(
		  .qleError(message=msg,
				    call=match.call(),
			 	    xdim=ncol(X), 
					nsim=nsim,
					nWarnings=nWarnings,
				    nErrors=nErrors,
					nIgnored=nIgnored,
					error=res)
		)
  	}
	# Cholesky decompositions	
	cvmats <- 
	 if(var.type != "const") {
		# covariance decompositions	
	    if(verbose)
		 cat("Cholesky decompositon of covariance matrices...\n")
	
	 	vmats <- lapply(res[ok],"[[","V" )
		L <- try(varCHOLdecomp(vmats),silent=TRUE)
		if(.isError(L)) {
			msg <- paste0("Cholseky decomposition failed: ","\n")
			message(msg)
			stop(.qleError(message=msg,call=match.call(),error=L))
		}
		ok2 <- which(sapply(L,function(x) !.isError(x)))
		if(length(ok2) == 0L){
			msg <- paste0("All Cholesky decompositions failed: ","\n")
			message(msg)
			return(.qleError(message=msg,call=match.call(),error=L))
	 	} else if(length(ok2) < length(L))
		   warning(paste0("A total of ",length(L)-length(ok2), " Cholesky decomposition(s) failed."))
	    
	    cvmats <- as.data.frame(do.call(rbind, L[ok2]))
	    colnames(cvmats) <- paste("L", rep(1:NCOL(cvmats)),sep="")
	   
		# bootstrap variances of covariances 
		# a local nugget variances
		if(Nb > 0 && var.type == "kriging"){
			Lb <- lapply(res[ok],"[[","Lb")			
			Lbmats <- as.data.frame(do.call(rbind,Lb[ok2]))
			colnames(Lbmats) <- paste("Lb", rep(1:NCOL(Lbmats)),sep="")
			cvmats <- cbind(cvmats,Lbmats)
		}
		if(length(ok) != length(ok2))
		  ok <- ok2
	    cvmats		
	} else NULL
	
	X <- X[ok,,drop=FALSE]
	mstats <- do.call(rbind,lapply(res[ok],"[[","mstat"))
	mvars  <- do.call(rbind,lapply(res[ok],"[[","vars"))
	
	qld <- as.data.frame(cbind(rbind(X),mstats,mvars))
	nms <- 
	 if(is.null(colnames(X)) | is.null(colnames(mstats)) | is.null(colnames(mvars))) {
		 c(colnames(qld[seq(ncol(X))]),
		   paste("mean.T",rep(1:NCOL(mstats)),sep=""),
		   paste("var.T",rep(1:NCOL(mvars)),sep=""))			   
	 } else {
		 c(colnames(X),
		   paste0("mean.",colnames(mstats)),
		   paste0("var.",colnames(mvars)))			   
	 }			
	colnames(qld) <- nms			  
	if(!is.null(cvmats)){
   	 qld <- cbind(qld,data.frame(cvmats))
 	}

	structure(qld,
			  xdim=ncol(X),
			  nsim=nsim,
			  Nb=if(var.type != "kriging") 0 else Nb,
			  nWarnings=nWarnings,
			  nErrors=nErrors,
			  nIgnored=nIgnored,
			  message=msg,
			  call=sys.call(),
			  class=c("QLdata","data.frame"))
}
mbaaske/qle documentation built on May 27, 2019, midnight