R/model.comparison.R

Defines functions pairwise.mod.xval.ttest get.sig.groups ttest.all.mod.xvals head2headMod whichMod get.nrow check.partition check.rep check.partitions.arg check.for.output.files check.parallel.args check.xval.dist.args check.partitions.file check.xval.misc.args check.xval.call post.process.par.cov get.par.cov parallelizing end.parallelization parallel.prespecify.check write.xvals announce.xval.procedure load.partitions.file wishLnL makePostParCov fitToTest getInits checkCov makeXvalBlock xvalBedassle passDist plot.mod.xval.summary summarize.mod.xval standardize.xval.results read.xval.results check.mod.cols.arg check.mod.names.arg check.mod.order.arg check.xval.files.arg check.compare.mod.xvals.call compare.model.xvals makePartitions xValidation

Documented in compare.model.xvals makePartitions xValidation

#' Run a BEDASSLE cross-validation analysis
#' 
#' \code{xValidation} runs a BEDASSLE cross-validation analysis
#' 
#' This function initiates a k-fold cross-validation analysis 
#' to determine the statistical support for the specified model.
#' 
#' @param partsFile A filename (in quotes, 
#'		with the full file path) to the data partitions object to be used 
#'		in the k-fold cross-validation procedure. This object can be created 
#'		using the \code{\link{makePartitions}} function in this package.
#' @param nReplicates An \code{integer} giving the number of cross-validation
#'		replicates to be run. This should be the same as the length of the list 
#'		specified in the \code{partsFile}.
#' @param nPartitions An \code{integer} giving the number of data folds 
#'		within each run. This should be the same as the length of each the 
#'		list specified for each replicate in the \code{partsFile}.
#' @param genDist A \code{matrix} of pairwise pi measured between 
#'		all pairs of samples.
#' @param geoDist A \code{matrix} of pairwise geographic distances 
#'		measured between all pairs of samples. A value of 
#'		\code{NULL} runs a model without geographic distance 
#'		as a predictor of genetic differentiation.
#' @param envDist A \code{matrix} of pairwise environmental distances 
#'		measured between all pairs of samples. If there 
#'		are multiple environmental distance measures, this 
#'		argument should be a \code{list} of distance matrices. 
#'		A value of \code{NULL} runs a model without geographic 
#'		distance as a predictor of genetic differentiation.
#' @param nLoci The total number of independent loci used to calculate 
#'		pairwise pi (\code{genDist}) in the dataset.
#' @param prefix A character vector giving the prefix to be attached 
#'		to all output files.
#' @param nIter An \code{integer} giving the number of iterations each MCMC 
#'		chain is run. Default is 2e3.  If the number of iterations 
#'		is greater than 500, the MCMC is thinned so that the number 
#'		of retained iterations is 500 (before burn-in).
#' @param parallel A \code{logical} value indicating whether or not to run the 
#'		different cross-validation replicates in parallel. Default is \code{FALSE}.
#'		For more details on how to set up runs in parallel, see the model 
#'		comparison vignette.
#' @param nNodes Number of nodes to run parallel analyses on. Default is 
#'		\code{NULL}. Ignored if \code{parallel} is \code{FALSE}. For more details 
#'		in how to set up runs in parallel, see the model comparison vignette. 
#' @param saveFiles A \code{logical} value indicating whether to automatically 
#'		save the output files from each cross-validation replicate. 
#'		Default is \code{FALSE}.
#' @param ... Further options to be passed to rstan::sampling (e.g., adapt_delta).
#'	
#' @return This function returns a matrix with \code{nReplicates} columns and 
#'		\code{nPartitions} columns giving the likelihood of each data partition 
#'		(averaged over the posterior distribution of the MCMC) in each replicate 
#'		analysis. The mean of these values gives an estimate of the predictive 
#'		accuracy of the specified model given the data provided. The mean and 
#'		standard error of the data partition likelihoods across replicates can be 
#' 		used for comparing models (e.g., with a t-test).
#'
#' In addition, this function saves a text file ("..._xval_results.txt"), 
#' containing the returned likelihoods for each replicate and data partition.
#'
#'@export
xValidation <- function(partsFile,nReplicates,nPartitions,genDist,geoDist=NULL,envDist=NULL,nLoci,prefix,nIter=2e3,parallel=FALSE,nNodes=1,saveFiles=FALSE,...){
	check.xval.call(args <- as.list(environment()))
	parts <- load.partitions.file(partsFile)
	check.partitions.arg(parts,nReplicates,nPartitions,genDist)
	message(announce.xval.procedure(nReplicates,nPartitions,genDist,geoDist,envDist))
	prespecified <- parallel.prespecify.check(args <- as.list(environment()))
	`%d%` <- parallelizing(args <- as.list(environment()))
	parts <- unlist(parts,recursive=FALSE)
	N <- nReplicates*nPartitions
    xvals <- foreach::foreach(n=1:N) %d% {
    				r <- (n-1) %/% nPartitions + 1;
    				p <- n - (r-1) * nPartitions;
				    message(sprintf("\nk-fold cross-validation: analyzing replicate %s/%s, partition %s/%s\n",
				    				r,nReplicates,p,nPartitions));
       			tryCatch(
	       			xvalBedassle(test = parts[[n]],
	       							 genDist = genDist,
	     							 geoDist = geoDist,
	       							 envDist = envDist,
	       							 nLoci = nLoci,
	       							 prefix = paste0(prefix,"rep",n),
	       							 nIter = nIter,
	       							 saveFiles,
	       							 ...),
	       			error=function(e){
	       				sprintf("replicate %s/%s, partition %s/%s experienced an error and could not complete\n",
	       					r,nReplicates,p,nPartitions)
	       				return(NA)
	       			})
       		 }
	xvals <- round(unlist(xvals),4)
	xvals <- matrix(xvals,nrow=nPartitions,ncol=nReplicates)
	colnames(xvals) <- paste0("rep_",1:nReplicates)
	write.xvals(xvals,nReplicates,nPartitions,prefix)
	tmp <- end.parallelization(prespecified)
    return(xvals)
}

utils::globalVariables("n")

#' Make the partitions used to run a BEDASSLE cross-validation analysis
#' 
#' \code{makePartitions} makes the partitions used to run a BEDASSLE cross-validation analysis
#' 
#' This function initiates a k-fold cross-validation analysis 
#' to determine the statistical support for the specified model.
#' 
#' @param prefix A \code{character} object giving the prefix to be attached 
#'		to the output data partitions object to be used in the k-fold 
#'		cross-validation procedure.
#' @param nReplicates An \code{integer} giving the number of cross-validation
#'		replicates to be run. The default value is 10.
#' @param nPartitions An \code{integer} giving the number of data folds 
#'		within each run. The default value is 5.
#' @param N An \code{integer} giving the number of samples in the dataset.
#'		This should have the same value as \code{nrow(geoDist)}.
#'
#' @details The number of samples \code{N} must be divisible by the specified 
#'		number of partitions (\code{nPartitions}).
#' @return This function generates and saves a nested list R object (".Robj" file) 
#'		that can be used to run a cross-validation analysis. This R object is a list 
#'		of length \code{nReplicates}, each element of which is a list of length 
#'		\code{nPartitions}. Each of those \code{nPartitions} element 
#'		is a vector of indices (between 1 and \code{N}) of the random subset 
#'		of individuals assigned to that cross-validation partition. 
#'		Each of the \code{N} individuals appears in exactly one partition per replicate.
#'
#'@export
makePartitions <- function(prefix,nReplicates=10,nPartitions=5,N){
	if(N%%nPartitions != 0){
		stop("\n\nThe number of samples (N) must be divisible by the number of partitions\n\n")
	}
	partitions <- lapply(1:nReplicates,
				function(n){
					reorderedSamples <- sample(1:N,N,replace=FALSE)
					nInPart <- N/nPartitions
					repParts <- lapply(1:nPartitions,
									function(i){
										reorderedSamples[((i-1)*nInPart+1):(i*nInPart)]
									})
					stats::setNames(repParts,paste0("partition",1:nPartitions))
	})
	partitions <- stats::setNames(partitions,paste0("rep",1:nReplicates))
	save(partitions,file=paste0(prefix,"_partitions.Robj"))
	message("\npartitions file saved\n")
	return(invisible("done"))
}

#' Compare the cross-validation output of different models
#' 
#' \code{compare.model.xvals} compares the outputs of different BEDASSLE cross-validation analyses
#' 
#' This function compares outputs of different BEDASSLE cross-validation analyses 
#' to determine the relative statistical support for each model and identify the best model.
#' 
#' @param xval.files A \code{character} vector giving the filenames (in quotes, 
#'		with the full file path) to the output files of a \code{\link{xValidation}} 
#'		analysis.
#' @param mod.order An \code{integer} vector giving the ordering of the model hypotheses 
#'		to be compared. Lower numbers indicate less complex or more parsimonious models.
#'		No ties.
#' @param mod.names An \code{character} vector giving the names associated with the models 
#'		run in the different cross-validation analyses.
#' @param mod.cols A vector containing the colors to be used in plotting the output of 
#'		the different cross-validation analyses. Default is black.
#'
#' @details This function compares the cross-validation predictive accuracy of different models 
#'		applied to the same data partitions (generated using \code{\link{makePartitions}}). 
#'		Going in the order of model complexity specified by \code{mod.order}, starting from the 
#'		least complex model, this function compares the performance of a model across all 
#'		cross-validation data partitions against that of the next most complex model. The model  
#'		that is determined to be "best" is the least complicated model whose performance is 
#'		statistically indistinguishable from adjacent model (ie, the model that is one rung 
#'		higher in complexity, as specified by \code{mod.order}.
#'
#' @return This function generates a figure comparing the cross-validation predictive accuracy 
#'		of different models used to analyze the same data partitions. The "best" model among the 
#'		set being compared is indicated with a golden arrow.  The significance of the pairwise 
#'		comparisons is indicated using letter groupings, as in a Tukey post-hoc test. The 
#'		function also returns the name of the "best" model. 
#'
#'@export
compare.model.xvals <- function(xval.files,mod.order,mod.names,mod.cols=NULL){
	check.compare.mod.xvals.call(args <- as.list(environment()))
	n.models <- length(xval.files)
	if(is.null(mod.cols)){
		mod.cols <- rep(1,n.models)
	}
	xval.results <- lapply(xval.files,function(n){read.xval.results(n)})
	nReplicates <- ncol(xval.results[[1]])
	xval.results <- lapply(xval.results,function(x){colMeans(x,na.rm=TRUE)})
	yLim <- range(unlist(xval.results),na.rm=TRUE)+c(0,diff(range(unlist(xval.results),na.rm=TRUE))/5)
	plot(0,xlim=c(0.5,length(xval.files)+0.5),
		   ylim=yLim,
		   xlab="models",
		   xaxt="n",
		   ylab="standardized predictive accuracy",
		   main="cross-validation model comparison",
		   type="n")
	graphics::axis(side=1,at=1:n.models,labels=mod.names)
	pairwise.sigs <- ttest.all.mod.xvals(xval.results,n.models)[mod.order,mod.order]
	group.labels <- get.sig.groups(pairwise.sigs,n.models)[mod.order]
	lab.ycoord <- graphics::par("usr")[4] - diff(range(unlist(xval.results),na.rm=TRUE))/10
	invisible(lapply(1:n.models,
		function(i){
			j <- mod.order[i]
			plot.mod.xval.summary(j,summarize.mod.xval(xval.results[[j]]),mod.cols[j])
			graphics::points(x=jitter(rep(j,nReplicates)),
					y=xval.results[[j]],
					pch=20,col=grDevices::adjustcolor(mod.cols[j],0.5))
			graphics::text(x=j,y=lab.ycoord,labels= group.labels[[j]],col=mod.cols[j])
		}))
	bestMod <- whichMod(xval.files,mod.order,mod.names)
	bestMod <- which(mod.names==bestMod)
	if(!is.null(bestMod)){
		graphics::arrows(x0=bestMod,
			  			 x1=bestMod,
						 y0=min(xval.results[[bestMod]])-2*diff(yLim)/10,
						 y1=min(xval.results[[bestMod]])-diff(yLim)/10,
						 col="black",lwd=2,length=0.03)
		graphics::arrows(x0=bestMod,
						 x1=bestMod,
						 y0=min(xval.results[[bestMod]])-2*diff(yLim)/10,
						 y1=min(xval.results[[bestMod]])-diff(yLim)/10,
						 col="goldenrod1",lwd=1.5,length=0.03)
	}
	return(mod.names[bestMod])
}

check.compare.mod.xvals.call <- function(args){
	check.xval.files.arg(args[["xval.files"]])
	check.mod.order.arg(args)
	check.mod.names.arg(args)
	check.mod.cols.arg(args)
	return(invisible("compare model xval args checked"))
}

check.xval.files.arg <- function(xval.files){
	if(length(xval.files) < 2){
		stop("\nyou must specify more than 1 xValidation output file to compare\n")
	}
	if(any(!is.character(xval.files))){
		stop("\nyou must specify a character vector for the \"xval.files\" argument\n")
	}
	if(any(!file.exists(xval.files))){
		stop("\nfunction is unable to find an \"xval.file\" specified\n")
	}
	return(invisible("xval.files checked"))
}

check.mod.order.arg <- function(args){
	nModels <- length(args[["xval.files"]])
	modOrder <- args[["mod.order"]]
	if(length(modOrder) != nModels){
		stop("\nthe length of \"mod.order\" does not match the number of files in \"xval.files\"\n")
	}
	if(!is.numeric(modOrder)){
		stop("\nthe \"mod.order\" argument must be of type \"numeric\"\n")	
	}
	if(length(modOrder) != length(unique(modOrder))){
		stop("\nthe \"mod.order\" argument cannot contain duplicate values\n")	
	}
	if(sum(abs(sort(modOrder)-1:(max(modOrder)))) != 0){
		stop("\nthe \"mod.order\" argument must contain all values between 1 and max(mod.order)\n")	
	}
	return(invisible("mod.order checked"))
}

check.mod.names.arg <- function(args){
	nModels <- length(args[["xval.files"]])
	modNames <- args[["mod.names"]]
	if(length(modNames) != nModels){
		stop("\nthe length of \"mod.names\" does not match the number of files in \"xval.files\"\n")
	}
	if(!is.character(modNames)){
		stop("\nthe \"mod.names\" must be of type \"character\"\n")	
	}
	if(length(modNames) != length(unique(modNames))){
		stop("\nthe \"mod.names\" argument cannot contain duplicate values\n")	
	}
	return(invisible("mod.names checked"))
}

check.mod.cols.arg <- function(args){
	if(!is.null(args[["mod.cols"]])){
		if(length(args[["mod.cols"]]) != length(args[["xval.files"]])){
			stop("\nyou must specify one plotting color for each model\n")
		}
	}
	return(invisible("mod.cols checked"))
}

read.xval.results <- function(xval.file){
	xval.result <- data.matrix(utils::read.table(xval.file,stringsAsFactors=FALSE,header=TRUE))
	if(any(!is.finite(xval.result))){
		xval.result[which(!is.finite(xval.result))] <- NA
	}
	return(xval.result)
}

standardize.xval.results <- function(xval.results){
	n.models <- length(xval.results)
	nReplicates <- ncol(xval.results[[1]])
	nPartitions <- nrow(xval.results[[1]])
	std.xval.array <- array(NA,dim=c(nPartitions,nReplicates,n.models))
	for(i in 1:n.models){
		std.xval.array[,,i] <- xval.results[[i]]
	}
	for(i in 1:nPartitions){
		for(j in 1:nReplicates){
			std.xval.array[i,j,] <- std.xval.array[i,j,] - max(std.xval.array[i,j,]) 
		}
	}
	std.xval.results <- lapply(1:n.models,function(i){std.xval.array[,,i]})
	return(std.xval.results)
}

summarize.mod.xval <- function(mod.xval.results){
	xval.mean <- mean(mod.xval.results,na.rm=TRUE)
	xval.std.err <- stats::sd(mod.xval.results)/sqrt(length(mod.xval.results))
	xval.CI <- xval.mean + c(-1.96,1.96) * xval.std.err
	mod.xval.summary <- list("mean" = xval.mean,
							 "CI" = xval.CI)
	return(mod.xval.summary)
}

plot.mod.xval.summary <- function(i,mod.xval.summary,mod.col=1){
	graphics::segments(x0=i,x1=i,y0=mod.xval.summary$CI[1],y1=mod.xval.summary$CI[2],lwd=1.8,col=mod.col)
	graphics::points(i,mod.xval.summary$mean,pch=19,cex=1.8,col=mod.col)
	return(invisible("plotted"))
}

passDist <- function(a,B=NULL){
	X <- NULL
	if(!is.null(B)){
		if(inherits(B,"matrix")){
			X <- B[a,a]
		} else if(inherits(B,"array")){
			X <- array(NA,dim=c(dim(B)[1],length(a),length(a)))
			for(i in 1:dim(B)[1]){
				X[i,,] <- B[i,a,a]
			}
		}
	}
	return(X)
}

xvalBedassle <- function(test,genDist,geoDist,envDist,nLoci,prefix,nIter,saveFiles,...){
	trainBlock <- makeXvalBlock(inSamples=c(1:nrow(genDist))[-test],genDist,geoDist,envDist,nLoci,silent=TRUE)
	testBlock <- makeXvalBlock(inSamples=test,genDist,geoDist,envDist,nLoci,silent=TRUE)
	stan.model <- pick.stan.model(trainBlock$geoDist,trainBlock$envDist)
	trainFit <- rstan::sampling(object = stanmodels[[stan.model]],
							 	 refresh = min(nIter/10,500),
							 	 data = trainBlock,
							 	 iter = nIter,
							 	 chains = 1,
							 	 init=getInits(trainBlock,stan.model),
							 	 thin = ifelse(nIter/500 > 1,nIter/500,1),
							 	 save_warmup = FALSE,
							 	 ...)
	if(saveFiles){
		trainBlock <- unstandardize.distances(trainBlock)
		save(trainBlock,file=paste(prefix,"data.block.Robj",sep="_"))
		save(trainFit,file=paste(prefix,"model.fit.Robj",sep="_"))
		bedassleResults <- get.bedassle.results(trainBlock,trainFit,nChains=1)
		write.bedassle.results(trainBlock,bedassleResults,prefix,nChains=1)
		make.all.bedassle.plots(paste0(prefix,"_posterior_chain",1,".txt"),paste(prefix,"data.block.Robj",sep="_"),prefix)
	}
	testFit <- fitToTest(trainFit,stan.model,testBlock)
	return(testFit)
}

makeXvalBlock <- function(inSamples,genDist,geoDist,envDist,nLoci,silent=TRUE){
	trainCov <- pwp2allelicCov(genDist[inSamples,inSamples])
	if(any(eigen(trainCov)$values < 0)){
		stop("\nthe allelic covariance calculated from the specified genetic distance matrix is not positive definite\n")
	}
	envDist <- make.envDist.list(envDist)
	sd.geoDist.list <- standardize.distances(geoDist)
	sd.envDist.list <- standardize.envDist(envDist)
	envDist.array <- make.envDist.array(sd.envDist.list)
	envDist.array <- passDist(a=inSamples,B=envDist.array)
	trainBlock <- list("N" = nrow(trainCov),
					   "L" = nLoci,
					   "obsCov" = trainCov,
					   "geoDist" = sd.geoDist.list$std.X[inSamples,inSamples],
					   "sd.geoDist" = sd.geoDist.list$stdev.X,
					   "envDist" = envDist.array,
					   "sd.envDist" = lapply(sd.envDist.list,"[[", "stdev.X"),
					   "nE" = ifelse(!is.null(envDist),length(envDist),0))
	trainBlock <- validate.data.block(trainBlock,silent=silent)
	return(trainBlock)
}

checkCov <- function(inits,dataBlock,modName){
	if(modName=="NULL"){
		parCov <- matrix(inits$alpha0,nrow=dataBlock$N,ncol=dataBlock$N) + diag(inits$nugget,dataBlock$N)
	}
	if(modName=="GEO"){
		parCov <- inits$alpha0 * exp(-(inits$alphaD*dataBlock$geoDist)^inits$alpha2) + diag(inits$nugget,dataBlock$N)
	}
	if(modName=="ENV"){
		eDist <- Reduce("+",
					lapply(1:dataBlock$nE,
						function(e){
							inits$alphaE[e]*dataBlock$envDist[e,,]
					}))
		parCov <- inits$alpha0 * exp(-eDist^inits$alpha2) + diag(inits$nugget,dataBlock$N)
	}
	if(modName=="GEOENV"){
		eDist <- Reduce("+",
					lapply(1:dataBlock$nE,
						function(e){
							inits$alphaE[e]*dataBlock$envDist[e,,]
					}))
		parCov <- inits$alpha0 * exp(-(eDist+inits$alphaD*dataBlock$geoDist)^inits$alpha2) + diag(inits$nugget,dataBlock$N)
	}
	posdef <- all(eigen(parCov)$values > 0)
	return(posdef)
}

getInits <- function(dataBlock,modName){
	posdef <- FALSE
	while(!posdef){
		if(modName=="NULL"){
			inits <- list("alpha0" = abs(stats::rnorm(1)),
						  "nugget"=abs(stats::rnorm(1,mean=0)))
		}
		if(modName=="GEO"){
			inits <- list("alpha0" = abs(stats::rnorm(1)),
						  "alphaD" = abs(stats::rnorm(1)),
						  "alpha2" = stats::runif(1,0,2),
						  "nugget"=abs(stats::rnorm(1,mean=0)))	
		}
		if(modName=="ENV"){
			inits <- list("alpha0" = abs(stats::rnorm(1)),
						  "alphaE" = as.array(abs(stats::rnorm(dataBlock$nE))),
						  "alpha2" = stats::runif(1,0,2),
						  "nugget"=abs(stats::rnorm(1,mean=0)))		
		}
		if(modName=="GEOENV"){
			inits <- list("alpha0" = abs(stats::rnorm(1)),
						  "alphaD" = abs(stats::rnorm(1)),
						  "alphaE" = as.array(abs(stats::rnorm(dataBlock$nE))),
						  "alpha2" = stats::runif(1,0,2),
						  "nugget"=abs(stats::rnorm(1,mean=0)))		
		}
		posdef <- checkCov(inits,dataBlock,modName)
	}
	inits <- list(inits)
	return(inits)
}

fitToTest <- function(trainFit,modName,testBlock){
	pars <- list("a0" = rstan::extract(trainFit,"alpha0",permute=FALSE),
				 "nugget" = rstan::extract(trainFit,"nugget",permute=FALSE))
	if(modName=="GEO"){
		pars[["aD"]] <- rstan::extract(trainFit,"alphaD",permute=FALSE)
		pars[["a2"]] <- rstan::extract(trainFit,"alpha2",permute=FALSE)
	}
	if(modName =="ENV"){
		pars[["aE"]] <- rstan::extract(trainFit,"alphaE",permute=FALSE)
		pars[["a2"]] <- rstan::extract(trainFit,"alpha2",permute=FALSE)
	}
	if(modName =="GEOENV"){
		pars[["aD"]] <- rstan::extract(trainFit,"alphaD",permute=FALSE)
		pars[["aE"]] <- rstan::extract(trainFit,"alphaE",permute=FALSE)
		pars[["a2"]] <- rstan::extract(trainFit,"alpha2",permute=FALSE)
	}
	ppc <- makePostParCov(nIter=length(pars$a0),
						  N=testBlock$N,
						  modName=modName,
						  pars=pars,
						  nE=testBlock$nE,
						  D=testBlock$geoDist,
						  E=testBlock$envDist)
	lnL <- mean(unlist(
			lapply(ppc,
				function(x){
					wishLnL(df=testBlock$L,parCov=x,obsCov=testBlock$obsCov)})))
	return(lnL)
}

makePostParCov <- function(nIter,N,modName,pars,nE,D=NULL,E=NULL){
	if(modName=="NULL"){
		pcPost <- lapply(1:nIter,
					function(i){
						pars$a0[i] + diag(pars$nugget[i],N)
					})
	} else if (modName=="GEO"){
		pcPost <- lapply(1:nIter,
					function(i){
						pars$a0[i] * exp(-(pars$aD[i]*D)^pars$a2[i]) + diag(pars$nugget[i],N)
					})
	} else if (modName=="ENV"){
		pcPost <- lapply(1:nIter,
					function(i){
						eDist <- Reduce("+",
									lapply(1:nE,
										function(e){
											pars$aE[i,,e]*E[e,,]
									}))
						pars$a0[i] * exp(-(eDist)^pars$a2[i]) + diag(pars$nugget[i],N)
					})	
	} else if (modName=="GEOENV"){
		pcPost <- lapply(1:nIter,
					function(i){
						eDist <- Reduce("+",
									lapply(1:nE,
										function(e){
											pars$aE[i,,e]*E[e,,]
									}))
						pars$a0[i] * exp(-(pars$aD[i]*D + eDist)^pars$a2[i]) + diag(pars$nugget[i],N)
					})
	}
	return(pcPost)
}

wishLnL <- function(df,parCov,obsCov){
	invParCov <- chol2inv(chol(parCov))
	logDet <- determinant(parCov)$modulus[[1]]
	lnL <- -0.5 * (sum(invParCov * obsCov) + df * logDet)
	return(lnL)
}

load.partitions.file <- function(partsFile){
	tmpenv <- environment()
	tmp <- load(partsFile,envir=tmpenv)
	rep.partitions <- lapply(tmp,get,envir=tmpenv)
	names(rep.partitions) <- tmp
	return(unlist(rep.partitions,recursive=FALSE))
}

announce.xval.procedure <- function(nReplicates,nPartitions,genDist,geoDist,envDist){
	N <- nrow(genDist)
	xval.proc <- sprintf("running %s %s of k-fold cross-validation on:\n\n\t",nReplicates, ifelse(nReplicates>1,"replicates","replicate"))
	if(is.null(geoDist) & is.null(envDist)){
		xval.proc <- paste0(xval.proc,"the \"NULL\" model (neither geographic or environmental distance)")
	}
	if(!is.null(geoDist) & is.null(envDist)){
		xval.proc <- paste0(xval.proc,"the \"GEO\" model (geographic, but not environmental distance)")
	}
	if(is.null(geoDist) & !is.null(envDist)){
		xval.proc <- paste0(xval.proc,"the \"ENV\" model (environmental, but not geographic distance)")
	}
	if(!is.null(geoDist) & !is.null(envDist)){
		xval.proc <- paste0(xval.proc,"the \"GEOENV\" model (geographic and environmental distance)")
	}
	if(!is.null(envDist)){
		nE <- ifelse(is.list(envDist),length(envDist),1)
		vars <- ifelse(nE == 1,"matrix","matrices")
		envAddendum <- sprintf("\n\twith %s environmental distance %s\n",nE,vars)
		xval.proc <- paste0(xval.proc,envAddendum)
	}
	xval.proc <- paste0(xval.proc,
					sprintf("\n\nwith %s data partitions, each comprised of %s individuals",
							nPartitions,N/nPartitions))
	return(xval.proc)
}

write.xvals <- function(xvals,nReplicates,nPartitions,prefix){
	utils::write.table(xvals,
						file=paste0(prefix,"_xval_results.txt"),
						row.names=FALSE,
						quote=FALSE)
	return(invisible("xval results written"))
}

parallel.prespecify.check <- function(args){
	prespecified <- FALSE
	if(args[["parallel"]] & foreach::getDoParRegistered()){
		prespecified <- TRUE
	}
	return(prespecified)
}

end.parallelization <- function(prespecified){
	if(!prespecified){
		doParallel::stopImplicitCluster()
		message("\nParallel workers terminated\n\n")
	}
	return(invisible("if not prespecified, parallelization ended"))
}

parallelizing <- function(args){
	if(args[["parallel"]]){
		if(!foreach::getDoParRegistered()){
			if(is.null(args[["nNodes"]])){
				nNodes <- parallel::detectCores()-1
			} else {
				nNodes <- args[["nNodes"]]
			}
			cl <- parallel::makeCluster(nNodes)
			doParallel::registerDoParallel(cl)
			message("\nRegistered doParallel with ",nNodes," workers\n")
		} else {
			message("\nUsing ",foreach::getDoParName()," with ", foreach::getDoParWorkers(), " workers")
		}
		d <- foreach::`%dopar%`
	} else {
		message("\nRunning sequentially with a single worker\n")
		d <- foreach::`%do%`
	}
	return(d)
}

get.par.cov <- function(model.fit,chain.no,N){
	par.cov <- array(NA,dim=c(model.fit@sim$n_save[chain.no],N,N))
	for(i in 1:N){
		for(j in 1:N){
			my.par <- sprintf("parCov[%s,%s]",i,j)
			par.cov[,i,j] <- rstan::extract(model.fit,pars=my.par,inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
		}
	}
	return(par.cov)
}

post.process.par.cov <- function(parCov){
	pp.cov.list <- lapply(1:dim(parCov)[1],
							function(i){
								list("inv" = chol2inv(chol(parCov[i,,])),
									 "log.det" = determinant(parCov[i,,])$modulus[[1]])
							})
	return(pp.cov.list)
}

check.xval.call <- function(args){
	check.partitions.file(args)
	check.xval.misc.args(args)
	check.xval.dist.args(args)
	check.parallel.args(args)
	check.for.output.files(args)
	return(invisible("args checked"))
}

check.xval.misc.args <- function(args){
	if(!is.whole.number(args[["nReplicates"]])){
		stop("\nyou must specify an integer value for the \"nReplicates\" argument\n")	
	}
	if(!is.whole.number(args[["nPartitions"]])){
		stop("\nyou must specify an integer value for the \"nPartitions\" argument\n")	
	}
	if(!is.whole.number(args[["nLoci"]])){
		stop("\nyou must specify an integer value for the \"nLoci\" argument\n")	
	}
	if(!is.character(args[["prefix"]])){
		stop("\nyou must specify a character value for the \"prefix\" argument\n")		
	}
	if(!is.whole.number(args[["nIter"]])){
		stop("\nyou must specify an integer value for the \"nIter\" argument\n")	
	}
	if(!is.logical(args[["saveFiles"]])){
		stop("\nyou must specify a logical value (TRUE or FALSE) for the \"saveFiles\" argument\n")
	}
	return(invisible("misc args checked"))
}

check.partitions.file <- function(args){
	if(!is.character(args[["partsFile"]])){
		stop("\nyou must specify a character vector for the \"partsFile\" argument\n")
	}
	if(!file.exists(args[["partsFile"]])){
		stop("\nfunction is unable to find the \"partsFile\" specified\n")
	}
	return(invisible("partition files checked"))
}

check.xval.dist.args <- function(args){
	check.geoDist.arg(args)
	check.envDist.arg(args)
	return(invisible("dist args checked"))
}

check.parallel.args <- function(args){
	if(args[["parallel"]] & args[["nNodes"]]==1){
		stop("\nyou have specified the \"parallel\" option with \"nNodes\" set to 1.\n\n")
	}
	if(!args[["parallel"]] & args[["nNodes"]] > 1){
		stop("\nyou have are running with \"parallel\" set to FALSE but with \"nNodes\" greater than 1.\n\n")
	}
	if(!args[["parallel"]] & foreach::getDoParWorkers() > 1){
		stop("\nyou are running with more than 1 worker but you have set the \"parallel\" option to FALSE\n\n")
	}
	return(invisible("parallel args checked"))
}

check.for.output.files <- function(args){
	if(file.exists(paste0(args[["prefix"]],"_xval_results.txt"))){
		stop("\noutput files will be overwritten if you proceed with this analysis\n\n")
	}
	return(invisible("files checked for"))
}

check.partitions.arg <- function(parts,nReplicates,nPartitions,genDist){
	N <- nrow(genDist)
	n <- N/nPartitions
	if(length(parts) != nReplicates){
		stop("\nthe number of replicates in the \"partitions\" argument does not match the number of replicates specified\n")
	}
	if(!inherits(parts,"list")){
		stop("\nthe partitions object must be of class \"list\"\\n")
	}
	lapply(1:length(nPartitions),
		function(i){
			check.rep(parts[[i]],sprintf("rep%s",i),nPartitions,n,genDist)
		})
	return(invisible("rep partitions arg checked"))
}

check.rep <- function(rep,rep.name,nPartitions,n,genDist){
	if(length(rep) != nPartitions){
		stop(sprintf("\nthe number of partitions in %s of the \"partsFile\" argument does not match the number of partitions specified\n",rep.name))
	}
	if(!inherits(rep,"list")){
		stop("\nthe data partitions object within each cross-validation replicate must be of class \"list\"\n")
	}
	lapply(1:length(rep),
		function(k){
			check.partition(rep[[k]],rep.name,sprintf("partition_%s",k),n,genDist)
		})
	return(invisible("rep checked"))
}

check.partition <- function(partition,rep.name,partition.name,n,genDist){
	if(length(partition)!=n){
		stop(
			sprintf("\nyou have specified a data partition (%s in %s) with the incorrect dimensions\n",
					partition.name,rep.name))
	}
	allCov <- pwp2allelicCov(genDist[partition,partition])
	if(any(eigen(allCov)$values <= 0)){
		stop(sprintf("\nthe allelic covariance calculated from the specified genetic distance matrix by %s in %s is not positive definite\n",partition.name,rep.name))
	}
	return(invisible("partition checked"))
}

get.nrow <- function(geoDist,envDist){
	n.row <- NULL
	if(!is.null(geoDist)){
		n.row <- nrow(geoDist)
	}
	if(!is.null(envDist)){
		if(inherits(envDist,"matrix")){
			n.row <- nrow(envDist)
		}
		if(inherits(envDist,"list")){
			n.row <- nrow(envDist[[1]])
		}
	}
	return(n.row)
}

whichMod <- function(xval.files,mod.order,mod.names){
	n.models <- length(xval.files)
	xval.results <- lapply(xval.files,function(n){read.xval.results(n)})
	xval.results <- xval.results
	nReplicates <- ncol(xval.results[[1]])
	xval.results <- lapply(xval.results,
						function(x){
							if(any(!is.finite(x))){
								z <- x
								z[which(!is.finite(x))] <- NA
								colMeans(z,na.rm=TRUE)
							} else {
								colMeans(x)
							}
					})
	modComps <- lapply(1:n.models,function(i){
					sapply(c(1:n.models)[-i],function(j){
						if(mod.order[i] < mod.order[j]){
							c(mod.names[i],mod.names[j])[head2headMod(xval.results[[i]],xval.results[[j]])]
						} else {
							c(mod.names[j],mod.names[i])[head2headMod(xval.results[[j]],xval.results[[i]])]
						}
					})
				})
	z <- 0
	halt <- FALSE
	while(!halt){
		z <- z + 1
		halt <- !any(modComps[[z]] != mod.names[z],na.rm=TRUE) | z == n.models
	}
	return(mod.names[z])
}

head2headMod <- function(x1,x2){
	p <- stats::t.test(x=x1,y=x2,paired=TRUE)$p.value
	meanDiff <- mean(x1) - mean(x2)
	if (p > 0.05){
		betterMod <- 1
	} else if(p < 0.05 & meanDiff > 0){
		betterMod <- 1
	} else if (p < 0.05 & meanDiff < 0){
		betterMod <- 2
	}
	return(betterMod) 
}

ttest.all.mod.xvals <- function(xval.results,n.models){
	pairwise.sigs <- matrix(NA,nrow=n.models,ncol=n.models)
	for(i in 1:n.models){
		for(j in 1:n.models){
			if(i != j){
				pairwise.sigs[i,j] <- pairwise.mod.xval.ttest(xval.results[[i]],xval.results[[j]])
			}
		}
	}
	return(pairwise.sigs)
}

get.sig.groups <- function(pairwise.sigs,n.models){
	groupLetters <- c("A",rep("",n.models-1))
	groupLetters[which(pairwise.sigs[1,] > 0.05)] <- "A"
	for(i in 2:n.models){
		if(groupLetters[i]==""){
			groupLetters[i] <- paste0(groupLetters[i],LETTERS[i],collapse="")
			if(any(pairwise.sigs[i,] > 0.05,na.rm=TRUE)){
				inGroup <- which(pairwise.sigs[i,] > 0.05)
				for(j in inGroup){
					groupLetters[j] <- paste0(groupLetters[j],LETTERS[i],collapse="")
				}
			}
		}
	}
	return(groupLetters)
}

pairwise.mod.xval.ttest <- function(xvals1,xvals2){
	pval <- stats::t.test(x=xvals2,y=xvals1,paired=TRUE,alternative="two.sided")$p.value
	return(pval)
}
gbradburd/bedassle documentation built on May 20, 2022, 1 p.m.