paleotree: Paleontological and Phylogenetic Analyses of Evolution

Documented in seqTimeList

#' Construct a Stochastic Sequenced Time-List from an Unsequenced Time-List
#' 
#' This function randomly samples from a \code{timeList} object (i.e. a list composed of a matrix of interval start and end
#' dates and a matrix of taxon first and last intervals), to find a set of taxa and intervals that do not overlap,
#' output as a new \code{timeList} object.

#' @details
#' Many analyses of diversification and sampling in the fossil record require a dataset composed of sequential non-overlapping intervals,
#' but the nature of the geologic record often makes this difficult, with taxa from different regions, environments and sedimentary basins
#' having first and last appearances placed in entirely in-congruent systems of chronostratigraphic intervals. While one option is to convert
#' such occurrences to a single, global stratigraphic system, this may still result in overlapping intervals when fossil collections are poorly
#' constrained stratigraphically. (For example, this may often be the case in global datasets.) 

#' This function offers an approach to avoid this issue in large datasets by randomly subsampling
#' the available taxa and intervals to produce stochastic
#' sets of ranges composed of data drawn from non-overlapping intervals. 
#' 
#' \code{seqTimeList} is stochastic and thus should be set for many runs to produce many such solutions. Additionally,
#' all solutions found are returned, and users may wish to sort amongst these to maximize the number of intervals and 
#' number of taxa returned. A single solution which maximizes returned taxa and intervals may not be a precise enough approach
#' to estimating sampling rates, however, given the uncertainty in data. Thus, many runs should always be considered.
#' 
#' By default, solutions are searched for without consideration to the length of intervals used (i.e. the selection of intervals is 'unweighted').
#' Alternatively, we can 'weight' selection toward the smallest intervals in the set, using the argument \code{weightSampling}. Smaller
#' intervals presumably overlap less and thus should retain more taxa and intervals of more equal length. However, in practice with empirical datasets,
#' the package author finds these approaches do not seem to produce very different estimates.
#' 
#' For some datasets, many solutions found using \code{seqTimeList} may return infinite sampling values. This is often due to saving too many taxa
#' found in single intervals to the exclusion of longer-ranging taxa (see the example). This excess of single interval taxa is a clear artifact
#' of the randomized \code{seqTimeList} procedure and such solutions should probably be ignored.
 
#' @param timeList A list composed of two matrices, giving interval start and end 
#' dates and taxon first and last occurrences within those intervals. Some intervals
#' are expected to overlap (thus necessitating the use of this function), and datasets
#' lacking overlapping intervals will return an error message.

#' @param nruns Number of new \code{timeList} composed of non-overlapping intervals produced.

#' @param weightSampling If \code{TRUE}, weight sampling of new intervals toward smaller intervals. 
#' \code{FALSE} by default.

#' @return
#' A list, composed of three elements: \code{nIntervals}, a vector of the
#' number of intervals in each solution, \code{nTaxa}, a vector of the number of
#' taxa in each solution, and \code{timeLists}, a list composed of each new
#' \code{timeList} object as an element.

#' @seealso 
#' Resulting time-lists can be analyzed with \code{\link{freqRat}}, \code{\link{durationFreq}}, etc.
#' 
#' Additionally, \code{\link{binTimeData}} can be useful for simulating interval data.

#' @author David W. Bapst

#' @examples
#' # Simulate some fossil ranges with simFossilRecord
#' set.seed(444)
#' record <- simFossilRecord(
#'     p = 0.1, 
#'     q = 0.1, 
#'     nruns = 1,
#'     nTotalTaxa = c(60,80), 
#'     nExtant = 0
#'     )
#' taxa <- fossilRecord2fossilTaxa(record)
#' # simulate a fossil record with imperfect sampling with sampleRanges()
#' rangesCont <- sampleRanges(taxa,r = 0.1)
#' 
#' # Now let's use binTimeData to get ranges in discrete overlapping intervals
#'     # via pre-set intervals input
#' presetIntervals <- cbind(
#'   c(1000, 995, 990, 980, 970, 975, 960, 950, 940, 930, 900, 890, 888, 879, 875),
#'    c(995, 989, 960, 975, 960, 950, 930, 930, 930, 900, 895, 888, 880, 875, 870)
#'   )
#' rangesDisc1 <- binTimeData(rangesCont, int.times = presetIntervals)
#' 
#' seqLists <- seqTimeList(rangesDisc1, nruns = 10)
#' seqLists$nTaxa
#' seqLists$nIntervals
#' 
#' #apply freqRat as an example analysis
#' sapply(seqLists$timeLists, freqRat)
#' 
#' # notice the zero and infinite freqRat estimates? What's going on?
#' 
#' freqRat(seqLists$timeLists[[4]], plot = TRUE)
#' 
#' # too few taxa of two or three interval durations for the ratio to work properly
#'     # perhaps ignore these estimates
#' 
#' # with weighted selection of intervals
#' seqLists <- seqTimeList(rangesDisc1, nruns = 10, weightSampling = TRUE)
#' 
#' seqLists$nTaxa
#' seqLists$nIntervals
#' sapply(seqLists$timeLists, freqRat)
#' 
#' # didn't have much effect in this simulated example

#' @export seqTimeList
seqTimeList <- function(
                   timeList,
                   nruns = 100,
                   weightSampling = FALSE
                   ){
  #############################################
	timeList[[1]] <- as.matrix(timeList[[1]])
	timeList[[2]] <- as.matrix(timeList[[2]])
	#let's try to get sampling rates out of a timeList with overlapping intervals
	#test for overlap
	lap <- any(!apply(timeList[[1]],1,function(x)
		2>sum(x[1]>(timeList[[1]][,2]) & x[2]<(timeList[[1]][,1]))))
	if(!lap){stop("Your timeList doesn't have any overlap??")}
	#let's make a list of valid intervals
		#first, all non-overlapping intervals
	noLap <- which(apply(timeList[[1]],1,function(x) 
		2>sum(x[1]>(timeList[[1]][,2]) & x[2]<(timeList[[1]][,1]))))
	#now let's search iteratively for interval solutions
	nInts <- nTaxa <- numeric()
	savedLists <- list(1:nruns)
	for(i in 1:nruns){
		#set it up!
		valid <- noLap
		invalid <- numeric()
		if(length(valid)>0){
			candidate <- (1:nrow(timeList[[1]]))[-valid]
			}else{
			candidate <- (1:nrow(timeList[[1]]))
			}
		#LOOP
		while(length(candidate)>1){
			#next, let's pull out a random candidate interval
			if(weightSampling){
				weights <- order(apply(timeList[[1]][candidate,,drop = FALSE],1,diff))
				weights <- weights/sum(weights)
				draw <- sample(1:length(candidate),1,prob = weights)
			}else{
				draw <- sample(1:length(candidate),1)
				}
			#remove from candidate, add to valid
			valid <- c(valid,candidate[draw])		
			drawInt <- timeList[[1]][candidate[draw],]
			candidate <- candidate[-draw]
			#add all overlapping to invalid and remove from candidate
			candInt <- timeList[[1]][candidate,,drop = FALSE]
			badInt <- apply(candInt,1,function(x) drawInt[1]>x[2] & drawInt[2]<x[1])	
			invalid <- c(invalid,candidate[badInt])
			candidate <- candidate[!badInt]
			#
			#newInts <- timeList[[1]][valid,]
			#if(any(!apply(newInts,1,function(x)
			#	2>sum(x[1]>(newInts[,2]) & x[2]<(newInts[,1]))))){
			#		stop("WHAT!")}
			}
		if(length(candidate) == 1){valid <- c(valid,candidate)}
		valid <- sort(valid)
		newInts <- timeList[[1]][valid,]
		# test that there be no overlaps
		if(any(!apply(newInts,1,function(x)	
			2>sum(x[1]>(newInts[,2]) & x[2]<(newInts[,1]))))){stop("WHAT!")}
		#
		validTaxa <- apply(timeList[[2]],1,function(x) any(x[1] == valid) & any(x[2] == valid))
		cropTaxa <- timeList[[2]][validTaxa,]
		newTaxa <- cbind(sapply(cropTaxa[,1],function(x) which(x == valid)),
		sapply(cropTaxa[,2],function(x) which(x == valid)))
		nInts[i] <- nrow(newInts)
		nTaxa[i] <- nrow(newTaxa)
		savedLists[[i]] <- list(intTimes = newInts,taxonTimes = newTaxa)
		}
	results <- list(nIntervals = nInts,nTaxa = nTaxa,timeLists = savedLists)
	return(results)
	}