R/phrapl-info.R


#Functions for getting or setting info about runs
SetMaxK<-function(popVector,nTrees=1,taxaPerParam=5,force=FALSE) {
  if (exists("maxK") && !force) {
    print(paste("note: setMaxK called, but maxK already exists, so using the existing one of ",maxK,". To change this behavior, give setMaxK the option force=TRUE",sep=""))
    return(maxK)
  }
  else {
  	maxK<-max(1,floor(sum(popVector)/(nTrees*taxaPerParam)))
  	return(maxK)
  }
}
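
#Usage sketch (hypothetical numbers): with 30 sampled individuals across three
#populations, 5 trees, and the default 5 taxa per parameter,
#maxK = max(1, floor(30 / (5 * 5))) = 1:
#  maxK <- SetMaxK(popVector=c(10, 10, 10), nTrees=5, force=TRUE)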

MsIndividualParameters<-function(migrationIndividual) {
	collapseMatrix<-migrationIndividual$collapseMatrix
	complete<-migrationIndividual$complete
	n0multiplierMap<-migrationIndividual$n0multiplierMap
	growthMap<-migrationIndividual$growthMap
	migrationArray<-migrationIndividual$migrationArray
	parameterList<-c()
	if (max(collapseMatrix,na.rm=TRUE)>0) {
		for (i in sequence(KCollapseMatrix(collapseMatrix))) {
			parameterList<-append(parameterList,paste("collapse_",i,sep=""))
		}
	}
	for (i in sequence(max(n0multiplierMap,na.rm=TRUE))) {
		parameterList<-append(parameterList,paste("n0multiplier_",i,sep=""))
	}
	for (i in sequence(max(growthMap,na.rm=TRUE))) {
		parameterList<-append(parameterList,paste("growth_",i,sep=""))
	}
	for (i in sequence(max(migrationArray,na.rm=TRUE))) {
		parameterList<-append(parameterList,paste("migration_",i,sep=""))
	}
	return(parameterList)
}
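
#For a standard two-population isolation-with-migration model (one collapse event,
#a shared n0multiplier, no growth, one migration rate), this returns, for example,
#c("collapse_1", "n0multiplier_1", "migration_1")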

KAll<-function(migrationIndividual) {
  return(length(MsIndividualParameters(migrationIndividual)) - 1) #-1 since first n0 is not free
}

KPopInterval<-function(popInterval) {
#returns the number of free parameters needed for that interval object.
#For example, having everything collapse in one step requires one param (the tMRCA).
#Having one collapse then a second requires two. Having no collapse requires 0
	maxVector<-ColMax(popInterval$collapseMatrix)
	return(length(which(maxVector>0)))
}

KCollapseMatrix<-function(collapseMatrix) {
#returns the number of free parameters needed for that interval object.
#For example, having everything collapse in one step requires one param (the tMRCA).
#Having one collapse then a second requires two. Having no collapse requires 0
	firstTime<-TRUE
	#Need to toss columns of added events (all NAs), if present
	for(i in 1:dim(collapseMatrix)[2]){
		if(sum(!is.na(collapseMatrix[,i])) > 0){
			if(max(collapseMatrix[,i],na.rm=TRUE) > 0){
				if(firstTime==TRUE){
					collapseMatrixTemp<-matrix(collapseMatrix[,i],ncol=1,nrow=nrow(collapseMatrix))
					firstTime=FALSE
				}else{
					collapseMatrixTemp<-cbind(collapseMatrixTemp,collapseMatrix[,i])
				}
			}
		}
	}
	if(firstTime == TRUE){
		return(0)
	}else{
		maxVector<-ColMax(collapseMatrixTemp)
		return(length(which(maxVector > 0)))
	}
}
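
#Illustrative sketch (ColMax, defined elsewhere in the package, is assumed to return
#column maxima): a zero column contributes no free parameter, so only interval 1 counts:
#  cm <- cbind(c(1, 1), c(0, 0)) #two populations collapse in interval 1; nothing in interval 2
#  KCollapseMatrix(cm)  #returns 1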

KN0multiplierMap<-function(n0multiplierMap) {
	return(max(n0multiplierMap,na.rm=TRUE)-1) #-1 since first n0 is not free
}

KGrowthMap<-function(growthMap) {
	return(max(growthMap,na.rm=TRUE))
}

FractionNonZeroMigration <- function(migrationIndividual) {
	return(sum(migrationIndividual$migrationArray[!is.na(migrationIndividual$migrationArray)]>0) / sum(!is.na(migrationIndividual$migrationArray)) )
}

NumberPopulationsAtRoot <- function(migrationIndividual) {
	last.interval <- dim(migrationIndividual$collapseMatrix)[2]
	number.ones <- length(which(migrationIndividual$collapseMatrix[,last.interval]==1))
	number.na <- length(which(is.na(migrationIndividual$collapseMatrix[,last.interval])))
	number.all <- dim(migrationIndividual$collapseMatrix)[1]
	number.alive <- number.all - number.na
	number.unmerged <- number.alive
	if (number.ones > 0) {
		number.unmerged <- number.alive - number.ones + 1
	}
	return(number.unmerged)
}
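
#Illustrative sketch, assuming the usual collapseMatrix convention (one row per
#population, one column per interval, NA marking an already-merged lineage):
#  cm <- cbind(c(1, 1, 0),  #interval 1: populations 1 and 2 collapse
#              c(1, NA, 1)) #interval 2: the merged lineage collapses with population 3
#  NumberPopulationsAtRoot(list(collapseMatrix=cm))  #returns 1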

CreateAssignment<-function(popVector,file="assign.txt") {
	alphabet<-strsplit("ABCDEFGHIJKLMNOPQRSTUVWXYZ","")[[1]]
	assignFrame<-data.frame()
	indivTotal<-0
	for(popIndex in 1:length(popVector)) {
		popLabel<-alphabet[popIndex]
		for(indivIndex in 1:popVector[popIndex]) {
			indivTotal<-indivTotal+1
			if(indivTotal==1) {
				assignFrame<-data.frame(popLabel,indivTotal)
			}
			else {
				assignFrame<-rbind(assignFrame,data.frame(popLabel, indivTotal))
			}
		}
	}
	utils::write.table(assignFrame,file=file,quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
}
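
#Example: CreateAssignment(popVector=c(3, 2)) writes a tab-delimited assign.txt
#mapping individuals 1-3 to population A and individuals 4-5 to population B:
#  A  1
#  A  2
#  A  3
#  B  4
#  B  5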

GuessAssignments <- function(observedTrees, popVector) {
	tips <- observedTrees[[1]]$tip.label
	# Levenshtein Distance
	distances  <- utils::adist(tips)
	rownames(distances) <- tips
	hc <- stats::hclust(stats::as.dist(distances))
	groupings <- stats::cutree(hc, k=length(popVector))
	if(sum(sort(table(groupings))==sort(popVector))!=length(popVector)) {
		warning("automatic grouping failed")
		return(NA)
	}
	alphabet<-strsplit("ABCDEFGHIJKLMNOPQRSTUVWXYZ","")[[1]]
	assignments<-data.frame(indiv=names(groupings), popLabel=groupings, row.names=sequence(length(groupings)), stringsAsFactors=FALSE)
	groupings.table <- table(groupings)
	for(i in sequence(length(groupings.table))) {
		matching.index<-which(popVector==groupings.table[i])
		if(length(matching.index)>0) {
			matching.index<-matching.index[1]
		} else {
			warning("automatic grouping failed")
			return(NA)
		}
		assignments$popLabel[which(assignments$popLabel==i)]<-alphabet[matching.index]
		popVector[matching.index]<-0

	}
	return(assignments)
}

CreateAssignment.df<-function(popVector) {
	alphabet<-strsplit("ABCDEFGHIJKLMNOPQRSTUVWXYZ","")[[1]]
	assignFrame<-data.frame()
	indivTotal<-0
	for(popIndex in 1:length(popVector)) {
		popLabel<-alphabet[popIndex]
		for(indivIndex in 1:popVector[popIndex]) {
			indivTotal<-indivTotal+1
			if(indivTotal==1) {
				assignFrame<-data.frame(popLabel,indivTotal)
			}
			else {
				assignFrame<-rbind(assignFrame,data.frame(popLabel, indivTotal))
			}
		}
	}
	return(assignFrame)
}


ReturnModel<-function(p,migrationArrayMap) {
   prunedResults<-subset(migrationArrayMap, migrationArrayMap$collapseMatrix.number==p[1])
   prunedResults<-subset(prunedResults, prunedResults$n0multiplierMap.number==p[2])
   prunedResults<-subset(prunedResults, prunedResults$growthMap.number==p[3])
   prunedResults<-subset(prunedResults, prunedResults$migrationArray.number==p[4])
   if(dim(prunedResults)[1]==1) {
      return(prunedResults$model)
   }
   else  {
   	  warning(paste("All these models match", prunedResults$model, "taking first one"))
   	  print(paste("All these models match", prunedResults$model, "taking first one"))
   	  print("Perhaps you sent ReturnModel a model ID rather than a vector?")
      return(prunedResults$model[1])
   }
}

ProportionDifference <- function(a,b) {
	return(abs((a-b)/min(c(a,b))))
}
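
#Example: ProportionDifference(1, 1.2) returns abs((1 - 1.2) / 1) = 0.2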

PassBounds <- function(parameterVector, parameterBounds) {
	#If any collapse values are Inf, just return(FALSE) here, so the Infs don't cause problems later on
	collapseParameters<-parameterVector[grep("collapse",names(parameterVector))]
	if(length(grep("Inf",collapseParameters)) > 0){
		return(FALSE)
	}

	#To make sure later taus remain larger than earlier taus...
	if(length(collapseParameters)>1) { #have at least two
		for(i in sequence(length(collapseParameters)-1)){
			if(collapseParameters[i] > collapseParameters[i+1]){
				return(FALSE)
			}
		}
	}

	#Now sort the parameter vectors in order to keep ratios within bounds
	n0multiplierParameters<-sort(parameterVector[grep("n0multiplier",names(parameterVector))])
	growthParameters<-sort(parameterVector[grep("growth",names(parameterVector))])
	migrationParameters<-sort(parameterVector[grep("migration",names(parameterVector))])
	collapseParameters<-sort(parameterVector[grep("collapse",names(parameterVector))])
	if(length(migrationParameters) > 0){
		if(min(migrationParameters) < parameterBounds$minMigrationRate) {
			return(FALSE)
		}
	}
	if(length(collapseParameters) > 0){
		if(min(collapseParameters) <  parameterBounds$minCollapseTime) {
			return(FALSE)
		}
	}
	if(length(n0multiplierParameters)>1) { #have at least two
		for (i in sequence(length(n0multiplierParameters)-1)) {
			if (ProportionDifference(n0multiplierParameters[i], n0multiplierParameters[i+1]) < parameterBounds$minN0Ratio) {
				return(FALSE)
			}
		}
	}
	if(length(growthParameters) > 0){
		if(min(abs(growthParameters)) <  parameterBounds$minGrowth) {
			return(FALSE)
		}
	}
	if(length(migrationParameters)>1) { #have at least two
		for (i in sequence(length(migrationParameters)-1)) {
			if (ProportionDifference(migrationParameters[i], migrationParameters[i+1]) < parameterBounds$minMigrationRatio) {
				return(FALSE)
			}
		}
	}
	if(length(collapseParameters)>1) { #have at least two
		for (i in sequence(length(collapseParameters)-1)) {
			if (ProportionDifference(collapseParameters[i], collapseParameters[i+1]) < parameterBounds$minCollapseRatio) {
				return(FALSE)
			}
		}
	}
	if(length(growthParameters)>1) { #have at least two
		for (i in sequence(length(growthParameters)-1)) {
			if (ProportionDifference(growthParameters[i], growthParameters[i+1]) < parameterBounds$minGrowthRatio) {
				return(FALSE)
			}
		}
	}
	return(TRUE)
}
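
#A minimal sketch of a call that passes all checks (parameter names follow
#MsIndividualParameters; the bounds mirror the defaults used by ReturnAIC):
#  pv <- c(collapse_1=0.5, collapse_2=1.5, n0multiplier_1=1, migration_1=0.2)
#  bounds <- list(minCollapseTime=0.1, minCollapseRatio=0, minN0Ratio=0.1,
#    minGrowth=0.1, minGrowthRatio=0.1, minMigrationRate=0.05, minMigrationRatio=0.1)
#  PassBounds(pv, bounds)  #TRUE: taus are ordered and all values are within bounds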

#This function checks that time values for events (inserted into a collapseMatrix using AddEventToMigrationArray)
#are neither more recent than preceding collapse times nor older than subsequent collapse times.
#If the desired history specified by collapseMatrix fits the addedEventTimes, this function returns TRUE
PassBounds.addedEvents<-function(parameterVector,migrationIndividual,addedEventTime,addedEventTimeAsScalar){

	#If there are added non-coalescence events, toss nonsensical combinations of collapse sequences and fixed (or scaled) event times
	if(!is.null(addedEventTime)){

		#First, get the relative position of each event
		collapseMatrix<-migrationIndividual$collapseMatrix
		positionNames<-c(1:dim(collapseMatrix)[2])
		numCollapses<-0
		eventCount<-1
		for(i in 1:dim(collapseMatrix)[2]){
			if(sum(!is.na(collapseMatrix[,i])) == 0){
				names(positionNames)[i]<-"event"
			}else{
				if(max(collapseMatrix[,i],na.rm=TRUE) > 0){
					names(positionNames)[i]<-"collapse"
					numCollapses<-numCollapses + 1
				}
			}
		}
		#For each event
		for(numGens in 1:length(positionNames)){
			#Get the correct event and collapse columns to compare
			if(names(positionNames)[numGens] == "event"){

				#If there is a collapse directly preceding the non-coalescence event...
				doPreceding<-FALSE
				if(numGens > 1){
					if(length(grep("collapse",names(positionNames[numGens - 1]))) > 0){
						doPreceding<-TRUE #Then must not only worry about the collapse following the event, but also the one preceding it
					}
				}

				positionsRemaining<-positionNames[numGens]:length(positionNames)
				collapseColToCompare<-numCollapses - (length(positionsRemaining[grep("collapse",
					names(positionNames)[positionsRemaining])]) - 1)

				#Only keep if the following collapse time is larger than the event time
				if(addedEventTimeAsScalar == TRUE){
					#This should not filter anything unless the scalar is greater than 1
					if(parameterVector[collapseColToCompare] <= parameterVector[collapseColToCompare] *
						addedEventTime[eventCount]){
						return(FALSE)
					}
				}else{
					if(parameterVector[collapseColToCompare] <= addedEventTime[eventCount]){
						return(FALSE)
					}
				}

				#Only keep if the preceding collapse time is smaller than the event time
				if(doPreceding == TRUE){
					if(addedEventTimeAsScalar == TRUE){
						if(parameterVector[collapseColToCompare - 1] >= parameterVector[collapseColToCompare] *
							addedEventTime[eventCount]){
							return(FALSE)
						}
					}else{
						if(parameterVector[collapseColToCompare - 1] >= addedEventTime[eventCount]){
							return(FALSE)
						}
					}
				}
				eventCount<-eventCount + 1
			}
		}
	}
	return(TRUE)
}

ScaleParameterVectorByNe <- function(parameterVector, NeScaling=1) { #e.g., for a mitochondrial locus, use NeScaling=1/4
#doing each class of parameters separately in case some aren't affected by the scaling
	parameterVector[which(grepl("collapse", names(parameterVector)))] <- parameterVector[which(grepl("collapse", names(parameterVector)))]/NeScaling
	parameterVector[which(grepl("n0", names(parameterVector)))] <- parameterVector[which(grepl("n0", names(parameterVector)))]/1
	parameterVector[which(grepl("migration", names(parameterVector)))] <- parameterVector[which(grepl("migration", names(parameterVector)))]/NeScaling
	parameterVector[which(grepl("growth", names(parameterVector)))] <- parameterVector[which(grepl("growth", names(parameterVector)))]/NeScaling
	return(parameterVector)
}
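
#Example: for a mitochondrial locus (NeScaling=1/4), collapse, migration, and growth
#parameters are divided by the scaling factor while n0multipliers are untouched:
#  pv <- c(collapse_1=1, n0multiplier_1=1, migration_1=0.4)
#  ScaleParameterVectorByNe(pv, NeScaling=0.25)  #collapse_1=4, n0multiplier_1=1, migration_1=1.6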

#Return AIC for a given model and tree
ReturnAIC<-function(par,migrationIndividual,nTrees=1,msPath=system.file("msdir","ms",package="P2C2M"),
	comparePath=system.file("extdata","comparecladespipe.pl", package="phrapl"),subsampleWeights.df=NULL,
	unresolvedTest=TRUE,print.results=TRUE, print.ms.string=FALSE,debug=FALSE,print.matches=FALSE,
	badAIC=100000000000000,ncores=1,maxParameterValue=20,numReps=0,parameterBounds=list(minCollapseTime=0.1,
	minCollapseRatio=0,minN0Ratio=0.1,minGrowth=0.1,minGrowthRatio=0.1,minMigrationRate=0.05,minMigrationRatio=0.1),
	subsamplesPerGene=1,totalPopVector,popAssignments,summaryFn="mean",saveNoExtrap=FALSE,doSNPs=FALSE,nEq=100,
	setCollapseZero=NULL,rm.n0=TRUE,popScaling,addedEventTime=NULL,addedEventTimeAsScalar=TRUE,optimization="grid", usePhyclust=TRUE){
	parameterVector<-exp(par)

	#If using optimization, the n0multiplier_1 is removed (so it is not optimized), so a "1" needs to be inserted
	if(length(parameterVector) < length(MsIndividualParameters(migrationIndividual))){
  		#First, remove n0, so can get names
  		names(parameterVector)<-MsIndividualParameters(migrationIndividual)[-grep("n0multiplier_1",
  			MsIndividualParameters(migrationIndividual))]
  		#Re-insert n0
  		parameterVector<-c(parameterVector[grep("collapse",names(parameterVector))],"n0multiplier_1"=1,
			parameterVector[grep("collapse",names(parameterVector),invert=TRUE)])
	}else{
		names(parameterVector)<-MsIndividualParameters(migrationIndividual)
	}

	#If using optimization and setCollapseZero is not NULL, then set the relevant values to zero
	if(!is.null(setCollapseZero)){
		parameterVector[setCollapseZero]<-0
	}

	#Weed out values outside of proposed bounds (only if doing numerical optimization)
	if(numReps > 0){
		if(!is.null(setCollapseZero)){
  			if(!PassBounds(parameterVector[-setCollapseZero], parameterBounds)){
				return(badAIC)
			}
		}else{
			if(!PassBounds(parameterVector, parameterBounds)){
				return(badAIC)
			}
 		}
		if(max(parameterVector) > maxParameterValue) {
			return(badAIC)
		}
		if(!is.null(addedEventTime)){
			if(!PassBounds.addedEvents(parameterVector,migrationIndividual,addedEventTime,addedEventTimeAsScalar))
				return(badAIC)
		}
	}

 	if(print.results){
 		if(rm.n0){
 			print(parameterVector[-(grep("n0multiplier",names(parameterVector)))])
 		}else{
 			print(parameterVector)
		}
 	}

 	#Do tree matching
 	likelihoodVector<-c()
	for(j in 1:length(popAssignments)){ #Do separately for each subsample size class
		currentPopAssign<-j
		likelihoodVectorCurrent<-PipeMS(migrationIndividual=migrationIndividual,parameterVector=parameterVector,
		nTrees=nTrees,subsamplesPerGene=subsamplesPerGene,popAssignments=popAssignments,msPath=msPath,comparePath=comparePath,
		ncores=ncores,currentPopAssign=currentPopAssign,print.ms.string=print.ms.string,debug=debug,unresolvedTest=unresolvedTest,
		doSNPs=doSNPs,popScaling=popScaling,addedEventTime=addedEventTime,addedEventTimeAsScalar=addedEventTimeAsScalar, usePhyclust=usePhyclust)

 	 	#Apply weights to matches
		if(!is.null(subsampleWeights.df)) {
			likelihoodVectorCurrent<-as.numeric(likelihoodVectorCurrent) * subsampleWeights.df[[j]][,1]
			subsampleWeightsVec<-subsampleWeights.df[[j]][,1]
		} else {
			likelihoodVectorCurrent<-as.numeric(likelihoodVectorCurrent)
			subsampleWeightsVec<-rep(1,length(likelihoodVectorCurrent)) #no weights supplied, so weight all subsamples equally
		}

	  	likelihoodVector<-append(likelihoodVector,likelihoodVectorCurrent)
  	}

  	#Convert matches to likelihoods (note that when calculating AIC, K is adjusted by subtracting off any
  	#fixed collapses, and also by subtracting 1 for the first n0multiplier, which is never free)
 	if(length(popAssignments) > 1){
  		lnLValue<-ConvertOutputVectorToLikelihood(outputVector=likelihoodVector,popAssignments=popAssignments,
  			nTrees=nTrees,subsamplesPerGene=subsamplesPerGene,totalPopVector=totalPopVector,saveNoExtrap=saveNoExtrap,
  			summaryFn=summaryFn,nEq=nEq)
  		AICValue<-2*(-lnLValue[1] + (KAll(migrationIndividual) - length(setCollapseZero) - 1))
  		colnames(AICValue)<-"AIC"
  		if(saveNoExtrap==TRUE){
  			AICValue.noExtrap<-2*(-lnLValue[2] + (KAll(migrationIndividual) - length(setCollapseZero) - 1))
  			colnames(AICValue.noExtrap)<-"AIC.lnL.noExtrap"
  		}
  	}else{
  		lnLValue<-ConvertOutputVectorToLikelihood.1sub(outputVector=likelihoodVector,
  			popAssignments=popAssignments,subsampleWeightsVec=subsampleWeightsVec,nTrees=nTrees,
  			subsamplesPerGene=subsamplesPerGene,totalPopVector=totalPopVector,summaryFn=summaryFn,
  			nEq=nEq,optimization=optimization)
  		AICValue<-2*(-lnLValue[1] + (KAll(migrationIndividual) - length(setCollapseZero)))
	}

	if(print.results) {
		parameterVector<-as.data.frame(matrix(nrow=1,ncol=length(parameterVector),parameterVector))
		resultsVector<-cbind(AICValue,lnLValue[1],parameterVector)
    	names(resultsVector)<-c("AIC","lnL",MsIndividualParameters(migrationIndividual))
#	    print(resultsVector)
		print(resultsVector[1:2])
		if(print.matches){
 	  		cat("\nmatches\n")
 	  		print(paste(round(likelihoodVector,4),collapse=" ",sep="")) #print matches for each observed tree
 	  	}
		cat("End Run\n\n") #Separator of different simulation runs

	   #print total number of matches
# 	   matches<-sum(as.numeric(likelihoodVector))
# 	   names(matches)<-"matchSum"
# 	   print(matches)

		#print number of matches per locus (summarized across subsamples)
#	    vector2print<-as.numeric(likelihoodVector)
#  		matches<-0
#    	matchesVec<-array()
#		localVector<-rep(NA, subsamplesPerGene)
#		print("matches per locus")
#		baseIndex<-1
#		for (i in sequence(length(vector2print))) {
#			localVector[baseIndex]<-vector2print[i]
#			baseIndex <- baseIndex+1
#			if(i%%subsamplesPerGene == 0) {
#				if(summaryFn=="SumDivScaledNreps"){
#					matchesVec<-rbind(matchesVec,get(summaryFn)(localVector=localVector,totalPopVector=totalPopVector,subNum=4,subsamplesPerGene=subsamplesPerGene))
#				}else{
#					matchesVec<-rbind(matchesVec,get(summaryFn)(localVector))
#				}
#				localVector<-rep(NA, subsamplesPerGene)
#				baseIndex<-1
#			}
#		}
 #   	print(matchesVec[-1])
	}
	if(length(popAssignments) > 1 && saveNoExtrap==TRUE){
		return(cbind(AICValue,AICValue.noExtrap,lnLValue))
	}else{
		names(AICValue)<-NULL
  		return(AICValue)
  	}
}

#This takes a grid output by the initial AIC search and produces ranges of values for each parameter that can be constructed into
#a new, finer-grained grid. This new grid is based on the range of values contained in the best supported combinations of parameter values.
#The output is a list of vectors, each containing a range of parameter values.
CreateFineGrid<-function(gridList=NULL,gridSizeVector=c(6,6,6,6)){
	#Create ranges of parameter values from which can construct new finer-grained grid
	gridSorted<-gridList[order(gridList$AIC),] #Sort current grid by AIC
	parmNames<-colnames(gridSorted)[-1]
	parmRanges<-data.frame(matrix(nrow=2,ncol=length(parmNames)))
	parmRangesExtendedTEMP<-data.frame(matrix(nrow=2,ncol=length(parmNames)))
	parmRangesExtended<-data.frame(matrix(nrow=2,ncol=length(parmNames)))
	fineGrid<-list()
	colnames(parmRanges)<-parmNames
	colnames(parmRangesExtendedTEMP)<-parmNames
	colnames(parmRangesExtended)<-parmNames
	for(rep in 1:length(parmNames)){ #for each parameter
		#Define ranges of the top 5 models
		parmRanges[1,rep]<-min(gridSorted[1:5,(rep + 1)])
		parmRanges[2,rep]<-max(gridSorted[1:5,(rep + 1)])
		uniqueParmsVec<-unique(gridSorted[,(rep + 1)])[order(unique(gridSorted[,(rep + 1)]))] #vector of unique parms

		#If the minimum best parameter is not the smallest parameter in the grid, extend the range a bit (to 50% of the distance
		#from the next parameter value). Ditto for the maximum best parameter.
		if(is.na(uniqueParmsVec[rev(order(uniqueParmsVec[which(uniqueParmsVec < parmRanges[1,rep])]))][1])){
			parmRangesExtendedTEMP[1,rep]<-parmRanges[1,rep]
		}else{
			parmRangesExtendedTEMP[1,rep]<-uniqueParmsVec[rev(order(uniqueParmsVec[which(uniqueParmsVec < parmRanges[1,rep])]))][1]
		}
		if(is.na(uniqueParmsVec[which(uniqueParmsVec > parmRanges[2,rep])][1])){
			parmRangesExtendedTEMP[2,rep]<-parmRanges[2,rep]
		}else{
			parmRangesExtendedTEMP[2,rep]<-uniqueParmsVec[which(uniqueParmsVec > parmRanges[2,rep])][1]
		}

		#Using the above range extensions, define grid ranges for each parameter
		parmRangesExtended[1,rep]<-parmRanges[1,rep] - ((parmRanges[1,rep] - parmRangesExtendedTEMP[1,rep]) / 2)
		parmRangesExtended[2,rep]<-parmRanges[2,rep] + ((parmRangesExtendedTEMP[2,rep] - parmRanges[2,rep]) / 2)

		#Now create vectors of new grid values for each parameter
		#First, define the number of grid points based on the type of parameter
		if(length(grep("collapse",colnames(parmRangesExtended)[rep])) != 0){
			gridSize<-gridSizeVector[1]
		}
		if(length(grep("n0multiplier",colnames(parmRangesExtended)[rep])) != 0){
			gridSize<-gridSizeVector[2]
		}
		if(length(grep("growth",colnames(parmRangesExtended)[rep])) != 0){
			gridSize<-gridSizeVector[3]
		}
		if(length(grep("migration",colnames(parmRangesExtended)[rep])) != 0){
			gridSize<-gridSizeVector[4]
		}

		#Calculate internal grid values (using the grid size chosen above for this parameter type)
		gridInterval<-(parmRangesExtended[2,rep] - parmRangesExtended[1,rep]) / (gridSize - 1)
		gridInternalVals<-array()
		currentValue<-parmRangesExtended[1,rep]
		for(rep1 in 1:(gridSize - 2)){
			gridInternalVals<-c(gridInternalVals,(currentValue + gridInterval))
			currentValue<-tail(gridInternalVals,n=1)
		}
		gridInternalVals<-gridInternalVals[!is.na(gridInternalVals)]
		#Concatenate min, max, and internal values and add to the grid list
		fineGrid[[length(fineGrid) + 1]]<-c(parmRangesExtended[1,rep],gridInternalVals,parmRangesExtended[2,rep])
		names(fineGrid)[length(fineGrid)]<-colnames(parmRangesExtended)[rep]
	}
	return(fineGrid)
}
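
#A hedged sketch with made-up AIC values: given a coarse grid over two parameters,
#CreateFineGrid proposes a finer range around the best-supported values:
#  coarse <- expand.grid(collapse_1=c(0.3, 1, 3), migration_1=c(0.1, 1))
#  gridList <- cbind(AIC=c(120, 118, 125, 119, 117, 130), coarse)
#  fineGrid <- CreateFineGrid(gridList=gridList, gridSizeVector=c(6, 6, 6, 6))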

#This takes vectors of parameter values and constructs a grid containing all possible combinations of parameter values, except
#that nonsensical collapse time combinations (e.g., taus at time 1 that are larger than taus at time 2) and identical migration
#rates and n0multipliers are filtered out.
#Also, if non-coalescence events have been added to the collapseMatrix, nonsensical combinations of fixed event times and
#collapse times can also be filtered out. For this, one must input the migrationIndividual, the event times (via addedEventTime),
#and a statement concerning whether these are scalar values (via addedEventTimeAsScalar).
#The grid is output as a list, which can be used to obtain AIC values using SearchContinuousModelSpaceNLoptr.
CreateStartGrid<-function(fineGrid,migrationIndividual=NULL,addedEventTime=NULL,addedEventTimeAsScalar=TRUE,startGridWithSetCollapseZero=FALSE){
	#If the purpose is to re-filter a grid based on added events after adding zero-filled collapse columns...
	if(startGridWithSetCollapseZero == TRUE){
		startGrid<-exp(fineGrid)
		startGrid<-list(startGrid)

	#Otherwise, run as normal
	}else{
		startGrid<-list()
		startGrid[[1]]<-expand.grid(fineGrid)
		#Toss nonsensical collapse combinations (e.g., when collapse 2 is smaller than collapse 1)
		howManyCollapses<-length(grep("collapse",names(startGrid[[1]])))
		if(howManyCollapses > 1){
			for(rep in 1:(howManyCollapses - 1)){
				startGrid[[1]]<-startGrid[[1]][which(startGrid[[1]][,rep] < startGrid[[1]][,(rep+1)]),]
			}
		}
	}

	#If there are added non-coalescence events, toss nonsensical combinations of collapse sequences and fixed (or scaled) event times
	if(!is.null(addedEventTime)){

		#First, get the relative position of each event
		collapseMatrix<-migrationIndividual$collapseMatrix
		positionNames<-c(1:dim(collapseMatrix)[2])
		numCollapses<-0
		eventCount<-1
		for(i in 1:dim(collapseMatrix)[2]){
			if(sum(!is.na(collapseMatrix[,i])) == 0){
				names(positionNames)[i]<-"event"
			}else{
				if(max(collapseMatrix[,i],na.rm=TRUE) > 0){
					names(positionNames)[i]<-"collapse"
					numCollapses<-numCollapses + 1
				}
			}
		}
		#For each event
		for(numGens in 1:length(positionNames)){
			#Get the correct event and collapse columns to compare
			if(names(positionNames)[numGens] == "event"){

				#If there is a collapse directly preceding the non-coalescence event...
				doPreceding<-FALSE
				if(numGens > 1){
					if(length(grep("collapse",names(positionNames[numGens - 1]))) > 0){
						doPreceding<-TRUE #Then must not only worry about the collapse following the event, but also the one preceding it
					}
				}

				positionsRemaining<-positionNames[numGens]:length(positionNames)
				collapseColToCompare<-numCollapses - (length(positionsRemaining[grep("collapse",
					names(positionNames)[positionsRemaining])]) - 1)

				#Only keep rows where the following collapse time is larger than the event time
				if(addedEventTimeAsScalar == TRUE){
					#This should not filter anything unless the scalar is greater than 1
					startGrid[[1]]<-startGrid[[1]][which(startGrid[[1]][,collapseColToCompare] >
						startGrid[[1]][,collapseColToCompare] * addedEventTime[eventCount]),]
				}else{
					startGrid[[1]]<-startGrid[[1]][which(startGrid[[1]][,collapseColToCompare] > addedEventTime[eventCount]),]
				}
				if(nrow(startGrid[[1]]) == 0){
					warning("Error: eventTimes need to be reduced such that they precede the next collapse event")
				}

				#Only keep rows where the preceding collapse time is smaller than the event time
				if(doPreceding == TRUE){
					if(addedEventTimeAsScalar == TRUE){
						startGrid[[1]]<-startGrid[[1]][which(startGrid[[1]][,(collapseColToCompare - 1)] <
							startGrid[[1]][,collapseColToCompare] * addedEventTime[eventCount]),]
					}else{
						startGrid[[1]]<-startGrid[[1]][which(startGrid[[1]][,(collapseColToCompare - 1)] < addedEventTime[eventCount]),]
					}
				}
				if(nrow(startGrid[[1]]) == 0){
					warning("Error: eventTimes need to be increased such that they are larger than the preceding collapse event")
				}
				eventCount<-eventCount + 1
			}
		}
	}

	#Toss migration parameters that are equal
	howManyMigrations<-length(grep("migration",names(startGrid[[1]])))
	if(howManyMigrations > 1){
		#Get vector flagging rows in which all migration rates are unique
		migrationCols<-startGrid[[1]][,grep("migration",names(startGrid[[1]]))] #mig columns only
		uniqueMigs<-c()
		for(r in 1:nrow(migrationCols)){
			uniqueMigs<-append(uniqueMigs,length(unique(simplify2array(migrationCols[r,])))==
				length(simplify2array(migrationCols[r,])))
		}
		#Toss rows with duplicate migration rates (if more than 1 free mig param)
		startGrid[[1]]<-startGrid[[1]][which(uniqueMigs==TRUE),]
	}

	#Toss n0multiplier parameters that are equal
	howManyN0s<-length(grep("n0multiplier",names(startGrid[[1]])))
	if(howManyN0s > 1){
		#Get vector flagging rows in which all n0multipliers are unique
		n0Cols<-startGrid[[1]][,grep("n0multiplier",names(startGrid[[1]]))] #n0 columns only
		uniqueN0s<-c()
		for(r in 1:nrow(n0Cols)){
			uniqueN0s<-append(uniqueN0s,length(unique(simplify2array(n0Cols[r,])))==
				length(simplify2array(n0Cols[r,])))
		}
		#Toss rows with duplicate n0multipliers (if more than 1 free n0 param)
		startGrid[[1]]<-startGrid[[1]][which(uniqueN0s==TRUE),]
	}

	#Toss growth parameters that are equal
	howManyGrowths<-length(grep("growth",names(startGrid[[1]])))
	if(howManyGrowths > 1){
		#Get vector flagging rows in which all growth values are unique
		growthCols<-startGrid[[1]][,grep("growth",names(startGrid[[1]]))] #growth columns only
		uniqueGrowths<-c()
		for(r in 1:nrow(growthCols)){
			uniqueGrowths<-append(uniqueGrowths,length(unique(simplify2array(growthCols[r,])))==
				length(simplify2array(growthCols[r,])))
		}
		#Toss rows with duplicate growths (if more than 1 free growth param)
		startGrid[[1]]<-startGrid[[1]][which(uniqueGrowths==TRUE),]
	}
	startGrid[[1]]<-log(startGrid[[1]]) #since we are optimizing in log space
	return(startGrid)
}
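
#A minimal sketch: given candidate values for each free parameter, build the
#(log-space) starting grid; rows in which collapse_2 <= collapse_1 are dropped:
#  fineGrid <- list(collapse_1=c(0.5, 1, 2), collapse_2=c(0.5, 1, 2),
#    n0multiplier_1=1, migration_1=c(0.05, 0.5))
#  startGrid <- CreateStartGrid(fineGrid)
#  exp(head(startGrid[[1]]))  #back-transform to inspect the retained combinations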

#This function returns the number of grid points in a grid constructed for a given model and
#grid start parameters (the value is not perfectly multiplicative because combinations in which
#a later collapse event has a smaller value than an earlier one are filtered out).
GetLengthGridList<-function(modelID=1,collapseStarts=NULL,
	migrationStarts=NULL,n0multiplierStarts=NULL,growthStarts=NULL,
	setCollapseZero=NULL){
	paramNames<-MsIndividualParameters(migrationArray[[modelID]]) #migrationArray is expected to exist in the calling environment

	startingVectorList<-list()
	nameCount<-1

	#Calculate number of collapse parameters to estimate (subtracting off those that are set to zero)
	numCollapse <- sum(grepl("collapse",paramNames)) - length(setCollapseZero)
	if(numCollapse < 0){ #If for some reason there are more fixed parameters than possible, reduce them
		setCollapseZero<-sequence(sum(grepl("collapse",paramNames)))
		numCollapse <- sum(grepl("collapse",paramNames)) - length(setCollapseZero)
	}
	if(!is.null(setCollapseZero)){ #If setCollapseZero is nonzero
		collapses2estimate<-grep("collapse",paramNames,value=TRUE)[which(grep("collapse",paramNames)!=
			setCollapseZero)]
	}else{
		collapses2estimate<-grep("collapse",paramNames,value=TRUE)
	}
	if(numCollapse > 0) {
		for (i in sequence(numCollapse)) {
			startingVectorList<-append(startingVectorList, list(collapseStarts))
			names(startingVectorList)[nameCount]<-collapses2estimate[i]
			nameCount<-nameCount + 1
		}
	}

	#Calculate number of n0 parameters (if just one, subtract it off)
	if(sum(grepl("n0multiplier",paramNames)) == 1){
	   numn0 <- sum(grepl("n0multiplier",paramNames)) - 1 # -1 since one is fixed to be 1
	}else{
	   numn0 <- sum(grepl("n0multiplier",paramNames))
	}
	if (numn0 > 0) {
		for (i in sequence(numn0)) {
			startingVectorList<-append(startingVectorList, list(n0multiplierStarts))
			names(startingVectorList)[nameCount]<-grep("n0multiplier",paramNames,value=TRUE)[i]
			nameCount<-nameCount + 1
		}
	}

	#Calculate number of growth parameters
	numGrowth <- sum(grepl("growth",paramNames))
	if (numGrowth > 0) {
		for (i in sequence(numGrowth)) {
			startingVectorList<-append(startingVectorList, list(growthStarts))
			names(startingVectorList)[nameCount]<-grep("growth",paramNames,value=TRUE)[i]
			nameCount<-nameCount + 1
		}
	}

	#Calculate number of migration parameters
	numMigration <- sum(grepl("migration",paramNames))
	if (numMigration > 0) {
		for (i in sequence(numMigration)) {
			startingVectorList<-append(startingVectorList, list(migrationStarts))
			names(startingVectorList)[nameCount]<-grep("migration",paramNames,value=TRUE)[i]
			nameCount<-nameCount + 1
		}
	}
	startGrid<-CreateStartGrid(startingVectorList)
	startGrid<-startGrid[[1]] #default grid shouldn't be list (as there is always one grid)

	return(nrow(startGrid))
}

##Match simulated trees to observed trees and export vector of matches
PipeMS<-function(migrationIndividual,parameterVector,popAssignments,nTrees=1,msPath=system.file("msdir","ms",package="P2C2M"),
		comparePath=system.file("extdata", "comparecladespipe.pl", package="phrapl"),unresolvedTest=TRUE,subsamplesPerGene,debug=FALSE,
		print.ms.string=FALSE,ncores=1,currentPopAssign=1, doSNPs=FALSE, popScaling=popScaling,addedEventTime=NULL,
		addedEventTimeAsScalar=TRUE, usePhyclust=TRUE){
	stored.wd<-getwd()
	setwd(tempdir())
	final.outputVector<-c()
	for (scaling.index in sequence(length(unique(popScaling)))) {
		observed<-paste(tempdir(),"/observed",currentPopAssign,".scaling.",unique(popScaling)[scaling.index],".tre",sep="")
		assign<-paste(tempdir(),"/assign",currentPopAssign,".txt",sep="")
		msCallInfo<-CreateMSstringSpecific(popAssignments[[currentPopAssign]],migrationIndividual=migrationIndividual,
			parameterVector=ScaleParameterVectorByNe(parameterVector,unique(popScaling)[scaling.index]),
			addedEventTime=addedEventTime,addedEventTimeAsScalar=addedEventTimeAsScalar,nTrees=ceiling(nTrees/ncores))

		systemMS<-function(stringname){
	#		outputVectorMS<-suppressWarnings(system(command="(stringname) & sleep 2 ; kill $!",intern=TRUE))
	#		http://stackoverflow.com/questions/687948/timeout-a-command-in-bash-without-unnecessary-delay
			outputVectorMS<-system(stringname,intern=TRUE)
			return(outputVectorMS)
		}
		systemPerl<-function(stringname){
			outputVectorPerl<-system(stringname,intern=TRUE)
			return(outputVectorPerl)
		}

		if(print.ms.string) {
			print(msCallInfo)
		}

		unresolvedFlag<-"-u"
		if (unresolvedTest==FALSE) {
			unresolvedFlag<-""
		}

		snpFlag<-"-d" #for doSNPs
		if (doSNPs==FALSE) {
			snpFlag<-""
		}


		#Simulate trees in MS and do matching in perl
		outputstringMS<-paste(msPath,sprintf("%i",msCallInfo$nsam),sprintf("%i",msCallInfo$nreps),msCallInfo$opts,
			" | grep ';' > mstrees.txt", sep=" ")
		outputstringPerl<-paste("cat mstrees.txt | perl",comparePath,unresolvedFlag, snpFlag, paste("-a",assign,sep=""),
			paste("-o",observed,sep=""),sep=" ")

		if(ncores==1){
			if(usePhyclust) {
				msOutput <- phyclust::ms(nsam=msCallInfo$nsam, nreps=msCallInfo$nreps, opts=msCallInfo$opts)
				msOutput <- msOutput[grepl(';', msOutput)]
				msOutput <- gsub(" ", "", msOutput)
				msOutput <- gsub("s", "", msOutput)

				cat(msOutput, file="mstrees.txt", sep="\n")
				outputVectorMS <- ""
			} else {
				outputVectorMS<-systemMS(stringname=outputstringMS)
			}
			outputVectorPerl<-systemPerl(stringname=outputstringPerl)
			outputVector<-paste(outputVectorMS,outputVectorPerl,sep=" ")
			#return(outputVector)
		}else{
			stop("something is wrong here")
			wrapOutput<-function(x,outputstring) {
				as.numeric(system(outputstring,intern=TRUE))
			}
			outputVector<-apply(simplify2array(mclapply(sequence(ncores),wrapOutput,outputstring=paste(outputstringMS, outputstringPerl, sep=" ", collapse=" "),mc.cores=ncores)),1,sum)
			#return(outputVector)
		}
		final.outputVector <- append(final.outputVector, outputVector)
	}
	#reorder to match input tree order
	orderVector <- c()
	for  (scaling.index in sequence(length(unique(popScaling)))) {
		orderVector <- append(orderVector, which(popScaling==unique(popScaling)[scaling.index]))
	}
	final.outputVector[orderVector]<-final.outputVector
	setwd(stored.wd)
	return(final.outputVector)
}

#What happens with zero matching trees? Is the probability of the data exactly
#zero under that model? No -- there is a nonzero probability of nearly every
#gene tree topology, and letting it be zero is a strong statement that a model
#is impossible. It's like flipping a coin 5 times, finding zero heads, and
#saying that the best estimate for P(heads) is zero: it's probably wrong.
#Similarly, we have prior information: each possible gene topology, absent
#other information, has a 1/howmanytrees(nTips) chance of matching. Simply
#plugging this in when we have no matches isn't ideal: shouldn't our estimate
#of this change if we do 1e9 simulated trees rather than 1e3 and still find no match?
#We use a beta-binomial distribution to do this; mean is set to
#1/howmanytrees(nTips), and the equivalent sample size of the prior
#(how much weight we give it) matters but is up to the user
#By default, we assume that it's equal to 100 data points
#This is several orders of magnitude smaller than the usual number of nTrees
#so using beta will have little effect with matches, but it does let lack of
#matches result in a finite likelihood
AdjustUsingBeta <- function(numMatches, nTrees, nTips, nEq=100) {
	#nEq = a + b + 1
	#1/howmanytrees(nTips) = a / (a + b) #the prior mean
	#b = nEq - a - 1
	#howmanytrees(nTips) = (a + b) / a = (a + nEq - a - 1) / a = (nEq -1) / a
	a <- (nEq - 1) /  howmanytrees(nTips)
	b <- nEq - a - 1
	return( (numMatches + a) / (nTrees + a + b))
}
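
#Worked example: with zero matches among 1e4 simulated 5-tip trees and nEq=100,
#howmanytrees(5) = 105 rooted labeled topologies, so a = 99/105 ~ 0.943,
#b = 99 - a ~ 98.06, and the adjusted match probability is
#(0 + a) / (1e4 + 99) ~ 9.3e-05 rather than exactly zero:
#  AdjustUsingBeta(numMatches=0, nTrees=1e4, nTips=5, nEq=100)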

#This function calculates lnL based on number of matches for each subsample. Then the likelihood of the
#full dataset is estimated from the subsampled dataset based on a linear relationship between sample size
#and lnL.
ConvertOutputVectorToLikelihood<-function(outputVector,popAssignments,nTrees=1,subsamplesPerGene=1,totalPopVector,
		saveNoExtrap=FALSE,summaryFn="mean", nEq=100){
	nLoci<-length(outputVector) / length(popAssignments) / subsamplesPerGene #number of loci

	start<-1
	end<-length(outputVector) / length(popAssignments)
	for(g in sequence(length(popAssignments))){ #for each subsample size class
		currentPopAssignments<-popAssignments[[g]]
		outputVector[start:end]<-as.numeric(outputVector[start:end])
		#probOfMissing<-1/((howmanytrees(sum(popAssignments[[g]]))) * 1000000)
		#outputVector[start:end][which(outputVector[start:end]==0)]<-probOfMissing
		outputVector[start:end]<-AdjustUsingBeta(numMatches=outputVector[start:end], nTrees=nTrees, nTips=sum(currentPopAssignments), nEq=nEq)

		start<-start + end
		end<-end + end
	}

	#Convert to log space
#	outputVector<-as.numeric(outputVector)/nTrees
	outputVector<-log(outputVector)

	#If the option to save non-extrapolated likelihood using the largest subsample size, do this
	if(saveNoExtrap==TRUE){
		lnL.noExtrap<-0
		localVector.noExtrap<-rep(NA,subsamplesPerGene)
		baseIndex<-1
		locusIndex<-1
		for (i in 1:(length(outputVector) / length(popAssignments))){
			localVector.noExtrap[baseIndex]<-outputVector[i]
			baseIndex <- baseIndex+1
			if(i%%subsamplesPerGene == 0) {
				if(summaryFn=="SumDivScaledNreps"){
					lnL.noExtrap<-lnL.noExtrap+get(summaryFn)(localVector=localVector.noExtrap,popAssignments,
						totalPopVector=totalPopVector[locusIndex],subsamplesPerGene=subsamplesPerGene)
				}else{
					lnL.noExtrap<-lnL.noExtrap+get(summaryFn)(localVector.noExtrap)
				}
				localVector.noExtrap<-rep(NA, subsamplesPerGene)
				baseIndex<-1
				locusIndex<-locusIndex + 1
			}
		}
	}

	#Index locations of loci within outputVector
	treesPerLocus<-subsamplesPerGene * length(popAssignments) #number of trees per locus
	outputIndex<-array(NA,length(outputVector))
	counter1<-1
	for(n in 1:length(popAssignments)){
		counter2<-1
		for(m in sequence(length(outputIndex) / length(popAssignments))){
			outputIndex[counter1]<-counter2
			counter1<-counter1+1
			if(m%%subsamplesPerGene==0){
				counter2<-counter2+1
			}
		}
	}

	#Calculate coefficients for ntaxa-lnL relationship to calculate lnL of full dataset
	slopes<-data.frame(matrix(NA,nrow=subsamplesPerGene,ncol=nLoci))
	intercepts<-data.frame(matrix(NA,nrow=subsamplesPerGene,ncol=nLoci))
	#For each locus
	for(i in 1:nLoci){
		localVector<-outputVector[which(outputIndex==i)]
		currentSlopes<-data.frame()
		currentIntercepts<-data.frame()
		#For each subsample (but across different subsample size classes)
		for(j in 1:(length(localVector) / length(popAssignments))){
			#Get indexes to find a given subsampled tree of different sizes
			currentSubsample<-array(NA,length(popAssignments))
			currentSubsampleSize<-j
			for(k in 1:length(popAssignments)){
				currentSubsample[k]<-currentSubsampleSize
				currentSubsampleSize<-currentSubsampleSize + subsamplesPerGene
			}
			#Get the slopes and intercepts for each subsample within the current locus (fit once, reuse the coefficients)
			subsampleFit<-stats::lm(localVector[currentSubsample]~sapply(popAssignments,sum))
			slopes[j,i]<-stats::coef(subsampleFit)[2]
			intercepts[j,i]<-stats::coef(subsampleFit)[1]
		}
		colnames(slopes)[i]<-paste("slopes.L",i,sep="")
		colnames(intercepts)[i]<-paste("intercepts.L",i,sep="")
	}

	#Get mean and 95%CI of coefficients
	meanSlopes<-sapply(slopes,mean,simplify = "array")
	meanIntercepts<-sapply(intercepts,mean,simplify = "array")
	coeffsOrdered<-sapply(cbind(slopes,intercepts),sort)
	if(subsamplesPerGene>1){
		coeffs95Lower<-coeffsOrdered[(round(subsamplesPerGene*0.025) + 1),]
		coeffs95Upper<-coeffsOrdered[round(subsamplesPerGene*0.975),]
	}else{
		coeffs95Lower<-unname(coeffsOrdered)
		coeffs95Upper<-unname(coeffsOrdered)
	}
	names(coeffs95Lower)<-paste("L95.",c(names(slopes),names(intercepts)),sep="")
	names(coeffs95Upper)<-paste("U95.",c(names(slopes),names(intercepts)),sep="")
	#Estimate lnL from full dataset
	lnL.byLocus<-unname(meanIntercepts + (totalPopVector * meanSlopes))
	names(lnL.byLocus)<-paste("lnL.L",c(1:length(lnL.byLocus)),sep="")
	lnL<-sum(lnL.byLocus)
	names(lnL)<-"lnL"
	#Prep to return
	if(saveNoExtrap==TRUE){
		lnL<-c(lnL,lnL.noExtrap=lnL.noExtrap,lnL.byLocus,meanSlopes,meanIntercepts,coeffs95Lower,coeffs95Upper)
	}else{
		lnL<-c(lnL,lnL.byLocus,meanSlopes,meanIntercepts,coeffs95Lower,coeffs95Upper)
	}
	lnL.mat<-data.frame(matrix(nrow=1,ncol=length(lnL),lnL))
	colnames(lnL.mat)<-names(lnL)

	return(lnL.mat)
}

#If only one set of subsamples is used, use this to calculate likelihoods (no extrapolation),
#although effective subsample sizes can be considered by using SumDivScaledNreps

#The way this works: Overall likelihoods are calculated by first taking the mean value across subsamples,
#and then summing these means across all loci. Prior to taking the mean value, subsamples that yielded zero
#matches must be assigned a likelihood. To do this, we assume that a single match occurred, calculate the
#likelihood of a single match (assuming the subsampleWeight of the tree and nTrees), and then apply an lnL penalty (to
#account for the fact that zero matches were actually observed). This penalty was optimized empirically to produce
#likelihood values observed using coal for both nTrees = 1e4 and 1e5 (nTrees should rarely be smaller or larger
#than this). Although the proportion of matchless trees in a dataset was also considered when optimizing the penalty,
#the best penalties excluded this information. The best penalties were -5 and -4 lnL points for 1e4 and 1e5 nTrees,
#respectively. The penalty applied by default is thus calculated assuming the slope derived from this fact.

#In the case where there are no matches across all subsamples and loci, the beta binomial will be used to estimate
#the lnL when nloptr or genoud optimization are being employed, as these methods require an lnL estimate. When the
#grid alone is being used to optimize, these completely matchless datasets can also be assigned an lnL using beta
#binomial if calculateBetaWithoutOptimization = TRUE; otherwise, lnL will be set to NA.

#propMatchedThreshold gives the proportion of the total loci that must have an estimated lnL (i.e., have some matches)
#before lnLs can be estimated based on the min matching value + penalty. The idea here is that if zero-matching is that
#pervasive, then who's to say the lnL isn't extremely low (e.g., approaching negative infinity)? If the proportion of
#matching loci falls below this threshold, then either beta binomial will be used or lnL will be assigned NA, as per above.
ConvertOutputVectorToLikelihood.1sub<-function(outputVector,popAssignments,subsampleWeightsVec,nTrees=1,
	subsamplesPerGene=1,totalPopVector,summaryFn="mean", nEq=100, propMatchedThreshold=0.05,penaltyAtIntercept=5.11,
	penaltySlope=0.000011,optimization="grid",calculateBetaWithoutOptimization=FALSE){

	##First get the Beta estimate, if desired, in case needed (for cases where no matches are found)
	if(calculateBetaWithoutOptimization == TRUE || optimization != "grid"){
		outputVectorBeta<-as.numeric(outputVector)
		outputVectorBeta<-AdjustUsingBeta(numMatches=outputVectorBeta, nTrees=nTrees, nTips=sum(popAssignments[[1]]), nEq=nEq)
		outputVectorBeta<-log(outputVectorBeta)
		lnLBeta<-0
		localVectorBeta<-rep(NA, subsamplesPerGene)
		baseIndex<-1
		locusIndex<-1
		for (i in sequence(length(outputVectorBeta))) {
			localVectorBeta[baseIndex]<-outputVectorBeta[i]
			baseIndex <- baseIndex+1
			if(i%%subsamplesPerGene == 0) {
				if(summaryFn=="SumDivScaledNreps"){
					lnLBeta<-lnLBeta+get(summaryFn)(localVector=localVectorBeta,popAssignments,totalPopVector=totalPopVector[locusIndex],
						subsamplesPerGene=subsamplesPerGene)
				}else{
					lnLBeta<-lnLBeta+get(summaryFn)(localVectorBeta)
				}
				localVectorBeta<-rep(NA, subsamplesPerGene)
				baseIndex<-1
				locusIndex<-locusIndex + 1
			}
		}
	}

	##Now get the likelihood using the "min possible" method (assuming 1 match minus an empirically derived penalty)
	#First, get summary value (e.g., mean) across subsamples
	outputVector<-as.numeric(outputVector)

	#Calculate log-likelihoods for each locus
	outputVector<-outputVector/nTrees #get likelihood
	outputVector<-log(outputVector)

	#Assign likelihood to matchless loci (assume 1 match)
	zeroMatchers<-which(outputVector == -Inf) #keep track of the distribution of zero-matching trees
	penaltyConstant<-penaltyAtIntercept - (nTrees * penaltySlope) #calculate penalty based on nTrees
	outputVector[which(outputVector == -Inf)]<-log(subsampleWeightsVec[which(outputVector == -Inf)] / nTrees) - penaltyConstant

	#Summarize likelihoods across subsamples for each locus
	summaryVector<-c()
	localVector<-rep(NA, subsamplesPerGene)
	matchlessLoci<-c()
	baseIndex<-1
	locusIndex<-1
	counterAll<-0
	currentCounter<-c()
	for (i in sequence(length(outputVector))) {
		localVector[baseIndex]<-outputVector[i]
		baseIndex <- baseIndex+1
		counterAll<-counterAll + 1
		currentCounter<-append(currentCounter,counterAll)
		if(i%%subsamplesPerGene == 0) {
			if(summaryFn=="SumDivScaledNreps"){
				summaryVector<-append(summaryVector,get(summaryFn)(localVector=localVector,popAssignments,
					totalPopVector=totalPopVector[locusIndex],subsamplesPerGene=subsamplesPerGene))
			}else{
				summaryVector<-append(summaryVector,get(summaryFn)(localVector))
			}

			#Build object that keeps track of which loci were characterized by completely matchless subsamples
			areThereMatches<-currentCounter %in% zeroMatchers
			if(sum(areThereMatches) == length(currentCounter)){
				matchlessLoci<-append(matchlessLoci,1)
			}else{
				matchlessLoci<-append(matchlessLoci,0)
			}

			#Reset
			localVector<-rep(NA, subsamplesPerGene)
			baseIndex<-1
			locusIndex<-locusIndex + 1
			currentCounter<-c()
		}
	}

	#Assign likelihood to matchless loci and then sum across loci
 	propMatched<-1 - (sum(matchlessLoci==1) / length(matchlessLoci)) #proportion of loci with matches
 	if(propMatched >= propMatchedThreshold){ #If there are enough matching loci, assign lnL to zero-matching loci
		lnL<-sum(summaryVector)
	}else{
		if(calculateBetaWithoutOptimization == TRUE || optimization != "grid"){
			lnL<-lnLBeta
		}else{
			lnL<-NA
		}
	}
	return(lnL)
}

#GetAndBindLabel and GetClades together get a list of all clades in the tree. For each clade, the descendant
#taxa are sorted alphabetically and concatenated into a string. These are slow, and have since been replaced by GetCladesQuickly
GetClades <- function(phy) {
	return(simplify2array(sapply(subtrees(phy), GetAndBindLabel)))
}

#Used with GetClades
GetAndBindLabel <- function(phy) {
	#note the sorting here
	return( paste( sort( phy$tip.label ), collapse="_" ) )
}
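
#Example (subtrees() comes from ape):
#  phy <- ape::read.tree(text="((A,B),C);")
#  GetClades(phy)  #returns c("A_B_C", "A_B")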

#This extracts clades from raw newick trees (implemented for efficiency). It also allows for
#relabeling of individuals with population letters. Thus, this replaces GetCladesQuickly (7 times
#slower) and ConvertAlleleNumbersToPopulationLetters
GetCladeNamesFromNewickString <- function(newick.string,assignFrame,convertIndivNumsToPopLetters=FALSE,
		getOutDegreeOfPolytomies=FALSE) {
	descendantCounts<-c() #for saving the degree of polytomies
	nobrlen.tree <- gsub(pattern=";", replacement="", x=gsub(pattern='\\:\\d+\\.*\\d*', replacement="",
		x= newick.string, perl=TRUE)) #remove branch lengths
	clade.names<-rep(NA, sum(grepl("\\(", strsplit(nobrlen.tree,"")[[1]]))) #num of clades = num of opening parentheses

	#Start the culling of clades
	for(clade.count in sequence(length(clade.names))) {
		m<-regexpr(pattern="\\([^(^)]+\\)", text=nobrlen.tree, perl=TRUE) #find location of first set of closed parentheses
		clade<-gsub("\\)", "", gsub("\\(","",regmatches(x=nobrlen.tree, m))) #everything in between is a clade
		taxa<-strsplit(clade, ",")[[1]]

		#Get the degree of polytomies for each clade with more than two taxa
		if(getOutDegreeOfPolytomies){
			if(length(taxa) > 2){ #if there are more than 2 members in a new clade...
				descendantCounts<-append(descendantCounts,length(taxa)) #record the degree
			}
		}

		#Convert numbers to letters
		if(convertIndivNumsToPopLetters){
			which.untransformed.alleles<-which(grepl("\\d", taxa, perl=TRUE)) #number of untransformed
			for(allele.index in sequence(length(which.untransformed.alleles))) {
				taxon.index<-which.untransformed.alleles[allele.index]
				taxa[taxon.index]<-as.character(assignFrame$popLabel[match(taxa[taxon.index],assignFrame$indivTotal)])
			}
		}

		clade.name<-paste(sort(taxa),collapse="-")
		regmatches(x=nobrlen.tree, m)<-clade.name
		clade.names[clade.count]<-clade.name
	}
	if(getOutDegreeOfPolytomies){
		return(list(clade.names,descendantCounts))
	}else{
		return(clade.names)
	}
}
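
#A small sketch: extract clades from a raw newick string, converting allele
#numbers to population letters via an assignment table (as built by
#CreateAssignment.df, with columns popLabel and indivTotal):
#  assignFrame <- CreateAssignment.df(c(2, 1))  #alleles 1-2 -> A, allele 3 -> B
#  GetCladeNamesFromNewickString("((1:0.1,2:0.1):0.2,3:0.3);", assignFrame,
#    convertIndivNumsToPopLetters=TRUE)  #returns c("A-A", "A-A-B")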

GetOutDegreeOfPolytomies <- function(phy) {
	descendantCounts <- table(phy$edge[,1])
	descendantCounts <- unname(descendantCounts[which(descendantCounts>2)])
	return(descendantCounts)
}
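
#Example: the root of this tree has four descendant edges, so its out-degree is recorded:
#  phy <- ape::read.tree(text="(A,B,C,(D,E));")
#  GetOutDegreeOfPolytomies(phy)  #returns 4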

#Match clades of two trees
#assumes:
#1. You have already run ConvertAlleleNumbersToPopulationLetters so these trees have letters
#2. You have already made them have the same size (do SubsampleMSTree if needed)
GetScoreOfSingleTree <- function(cladesMS, phyMS, cladesGene, phyGene, polytomyDegreesForCorrection=NULL) {
	numberCladesInMSOnly <- sum(!cladesMS%in%cladesGene)
	numberCladesInGeneOnly <- sum(!cladesGene%in%cladesMS)
	matchCount <- 0
	if(numberCladesInMSOnly == 0 && numberCladesInGeneOnly==0) {
		matchCount <- 1
	}
	if(!is.null(polytomyDegreesForCorrection)){
		if(numberCladesInMSOnly > 0 && numberCladesInGeneOnly == 0){
			descendantCounts<-polytomyDegreesForCorrection
			correction <- 1
			for (i in sequence(length(descendantCounts))) {
				correction <- correction * howmanytrees(descendantCounts[i], rooted=TRUE, binary=TRUE, labeled=TRUE)
			}
			matchCount <- 1 / correction #idea here is that the gene tree could have been resolved at each polytomy multiple ways
			#only one of these ways would match the given phyMS tree. So we figure out the number of ways to resolve polytomy 1,
			#multiply that by the number of ways to resolve polytomy 2, etc. A polytomy with three descendant edges has 3 ways to
			#resolve it, one with 4 descendant edges has 3 * 5 = 15, etc.
		}
	}
	return(matchCount)
}
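
#Sketch: identical clade sets score 1; any clade unique to the gene tree scores 0
#(unless a polytomy correction applies):
#  phyMS <- ape::read.tree(text="((A,B),C);")
#  phyGene <- ape::read.tree(text="((A,C),B);")
#  GetScoreOfSingleTree(GetClades(phyMS), phyMS, GetClades(phyGene), phyGene)  #returns 0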

#Drops tips from the simulated trees (to be used in accordance with popAssignments)
#Relies on using phylo objects, which makes it slow
SubsampleMSTree<-function(phy,popVectorOrig,popVectorFinal) {
	taxaToDrop<-c()
	minSample<-1
	maxSample<-0
	uniquePops<-unique(phy$tip.label)
	for(population in 1:length(uniquePops)){
		maxSample<-minSample + popVectorOrig[population] - 1
		sampleSizeDiff<-popVectorOrig[population] - popVectorFinal[population]
		if(sampleSizeDiff > 0){
			taxaToDrop<-append(taxaToDrop,
				which(phy$tip.label==uniquePops[population])[c(1:sampleSizeDiff)])
		}
		minSample<-maxSample + 1
	}
	phy<-drop.tip(phy,taxaToDrop)

	return(phy)
}
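
#Sketch (assumes tip labels are already population letters, one label per allele):
#  phy <- ape::read.tree(text="((A,(A,A)),(B,B));")
#  SubsampleMSTree(phy, popVectorOrig=c(3, 2), popVectorFinal=c(2, 1))
#  #drops one A tip and one B tip, returning a 3-tip tree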

#Drop tips from newick string. This is not presently working. If we just keep all the
#parentheses as is when deleting taxa and just drop labels and commas, this will be easy to get working.
SubsampleMSTree.raw<-function(phy,popVectorOrig,popVectorFinal) {
	#Get taxon labels from tree
	taxonLabels<-strsplit(phy, "[^\\w]", perl=TRUE)[[1]]
	taxonLabels<-taxonLabels[which(nchar(taxonLabels)>0)]

	#Get tree positions for labels to drop
	uniquePops<-unique(taxonLabels)

	#Split tree into characters and match to old taxon label positions
	split.tree<-strsplit(phy,"")[[1]]
	labelPositions<-c()
	for(i in 1:length(uniquePops)){
		labelPositions<-append(labelPositions,which(split.tree==uniquePops[i]))
	}

	#Select positions in the split tree of taxa to drop
	taxaToDrop<-c()
	startPos<-1
	for(population in 1:length(uniquePops)){
		endPos<-startPos + popVectorOrig[population] - 1
		sampleSizeDiff<-popVectorOrig[population] - popVectorFinal[population]
		if(sampleSizeDiff > 0){
			currentLabels<-labelPositions[startPos:endPos]
			taxaToDrop<-append(taxaToDrop,currentLabels[sample(1:popVectorOrig[population],sampleSizeDiff,replace=FALSE)])
		}
		startPos<-endPos + 1
	}

	#Drop one tip at a time (must be fully-resolved)
	#Toss the label and adjacent commas and opening brackets
	for(h in 1:length(taxaToDrop)){
		split.tree[taxaToDrop[h]]<-"drop"
		if(split.tree[taxaToDrop[h] - 1] == "("){
			split.tree[taxaToDrop[h] - 1]<-"drop"
			bracket="left"
		}
		if(split.tree[taxaToDrop[h] - 1] == ","){
			split.tree[taxaToDrop[h] - 1]<-"drop"
		}
		if(split.tree[taxaToDrop[h] + 1] == ")"){
			split.tree[taxaToDrop[h] + 1]<-"drop"
			bracket="right"
		}
		if(split.tree[taxaToDrop[h] + 1] == ","){
			split.tree[taxaToDrop[h] + 1]<-"drop"
		}

		#Find and toss the closing bracket
		if(bracket=="right"){
			leftCount<- 0 #when leftCount exceeds rightCount, drop parenthesis
			rightCount<- 0
			currentRemainder<-split.tree[1:(taxaToDrop[h] - 2)]
			rightPos<-which(currentRemainder==")")
			leftPos<-which(currentRemainder=="(")
			continue<-TRUE
			for(j in length(currentRemainder):1){
				if(continue==TRUE){
					if(j%in%leftPos){
						leftCount<-leftCount + 1
					}
					if(j%in%rightPos){
						rightCount<-rightCount + 1
					}
					if(leftCount > rightCount){
						split.tree[j]<-"drop"
						continue<-FALSE
					}
				}
			}
		}else{
			leftCount<-0 #when rightCount exceeds leftCount, drop parenthesis
			rightCount<-0
			currentRemainder<-split.tree[(taxaToDrop[h] + 2):length(split.tree)]
			positionRemainder<-length(split.tree[1:(taxaToDrop[h] + 2)])
			rightPos<-which(currentRemainder==")")
			leftPos<-which(currentRemainder=="(")
			continue<-TRUE
			for(j in 1:length(currentRemainder)){
				if(continue==TRUE){
					if(j%in%leftPos){
						leftCount<-leftCount + 1
					}
					if(j%in%rightPos){
						rightCount<-rightCount + 1
					}
					if(rightCount > leftCount){
						split.tree[j + (positionRemainder - 1)]<-"drop"
						continue<-FALSE
					}
				}
			}
		}
	}

	#Drop tips
	split.tree<-split.tree[which(split.tree != "drop")]

	#Get rid of extra commas
	if(split.tree[1]==","){
		split.tree[1]<-"drop"
	}
	if(split.tree[length(split.tree)]==","){
		split.tree[length(split.tree)]<-"drop"
	}
	for(l in 1:2){ #twice to be sure
		for(k in 1:(length(split.tree) - 1)){
			if(split.tree[k]=="(" && split.tree[k+1]==","){
				split.tree[k+1]<-"drop"
			}
			if(split.tree[k]=="," && split.tree[k+1]==")"){
				split.tree[k]<-"drop"
			}
			if(split.tree[k]=="," && split.tree[k+1]==","){
				split.tree[k]<-"drop"
			}
		}
		split.tree<-split.tree[which(split.tree != "drop")]
	}
	split.tree<-split.tree[which(split.tree != "drop")]
	split.tree.cat<-paste(split.tree,collapse="")


	#Get rid of brackets around singletons
	while(length(grep("\\(\\(\\w\\)",split.tree.cat)) == 1){
		m<-regexpr(pattern="\\(\\(\\w\\)",text=split.tree.cat, perl=TRUE)
		new.clade<-gsub("\\)","",regmatches(x=split.tree.cat, m))
		if((m[1] + 3) < length(split.tree)){
			split.tree<-c(split.tree[1:(m[1] - 1)],strsplit(new.clade,"")[[1]],
				split.tree[(m[1] + 4):length(split.tree)])
		}else{
			split.tree<-c(split.tree[1:(m[1] - 1)],strsplit(new.clade,"")[[1]])
		}
		split.tree.cat<-paste(split.tree,collapse="")
	}

	while(length(grep("\\(\\w\\)\\)",split.tree.cat)) == 1){
		m<-regexpr(pattern="\\(\\w\\)\\)",text=split.tree.cat,perl=TRUE)
		new.clade<-gsub("\\(","",regmatches(x=split.tree.cat,m))
		split.tree<-strsplit(split.tree.cat,"")[[1]]
		if((m[1]) > 1){
			split.tree<-c(split.tree[1:(m[1] - 1)],strsplit(new.clade,"")[[1]],
				split.tree[(m[1] + 4):length(split.tree)])
		}else{
			split.tree<-c(strsplit(new.clade,"")[[1]],split.tree[2:length(split.tree)])
		}
		split.tree.cat<-paste(split.tree,collapse="")
	}

	while(length(grep("\\(\\w\\)",split.tree.cat)) == 1){
		m<-regexpr(pattern="\\(\\w\\)",text=split.tree.cat,perl=TRUE)[1]
		split.tree.temp<-strsplit(split.tree.cat,"")[[1]]
		opening.l<-length(split.tree.temp[1:(m-1)][which(split.tree.temp=="(")])
		closing.l<-length(split.tree.temp[(m+3):length(split.tree.temp)][which(split.tree.temp==")")])
		if(opening.l > closing.l || m==length(split.tree)){
			split.tree.temp<-split.tree.temp[-m]
		}
		if(closing.l > opening.l || m==1){
			split.tree.temp<-split.tree.temp[-(m+2)]
		}
		split.tree.cat<-paste(split.tree.temp,collapse="")
	}

	#Get rid of double brackets
	#First get positions of double brackets
	openBracket<-c()
	closeBracket<-c()
	split.tree<-strsplit(split.tree.cat,"")[[1]]
	for(m in 1:(length(split.tree) - 1)){
		if(split.tree[m]=="(" && split.tree[m+1]=="("){
			openBracket<-append(openBracket,m)
		}
		if(split.tree[m]==")" && split.tree[m+1]==")"){
			closeBracket<-append(closeBracket,m + 1)
		}
	}

	#For different pairs, if there are as many or more brackets as taxa, toss them
	end<-FALSE
	while((length(grep("\\(",split.tree)) >= length(grep("\\w",split.tree)) ||
			length(grep("\\)",split.tree)) >= length(grep("\\w",split.tree))) &&
			end==FALSE){
		if(openBracket[length(openBracket)] > closeBracket[1]){
			removed<-FALSE
			substring<-split.tree[openBracket[1]:closeBracket[length(closeBracket)]]
			if(length(grep("\\(",split.tree)) >= length(grep("\\w",split.tree))){
				if(length(grep("\\(",substring)) >= length(grep("\\w",substring))){
					substring<-substring[-1]
					removed<-TRUE
				}
			}

			if(length(grep("\\)",split.tree)) >= length(grep("\\w",split.tree))){
				if(length(grep("\\)",substring)) >= length(grep("\\w",substring))){
					substring<-substring[-length(substring)]
					removed<-TRUE
				}
			}
			if(removed==TRUE){
				split.tree<-split.tree[-c(openBracket[1]:closeBracket[length(closeBracket)])]
				if(openBracket[1] > 1){
					split.tree<-c(split.tree[1:openBracket[1] - 1],substring,
						split.tree[openBracket[1]:length(split.tree)])
				}else{
					split.tree<-c(substring,split.tree)
				}
				if(length(openBracket) > 1 && length(closeBracket) > 1){
					openBracket<-openBracket[-1]
					closeBracket<-closeBracket[-(length(openBracket))]
				}else{
					end<-TRUE
				}
			}else{
				end<-TRUE
			}

		}else{
			removed<-FALSE
			reduce<-FALSE
			substring<-split.tree[openBracket[length(openBracket)]:closeBracket[1]]
			if(length(grep("\\(",split.tree)) >= length(grep("\\w",split.tree))){
				if(length(grep("\\(",substring)) >= length(grep("\\w",substring))){
					substring<-substring[-1]
					removed<-TRUE
				}
			}
			if(length(grep("\\)",split.tree)) >= length(grep("\\w",split.tree))){
				if(length(grep("\\)",substring)) >= length(grep("\\w",substring))){
					substring<-substring[-length(substring)]
					removed<-TRUE
					reduce<-TRUE
				}
			}

			if(removed==TRUE){
				split.tree<-split.tree[-c(openBracket[length(openBracket)]:closeBracket[1])]
				if(openBracket[length(openBracket)] > 1){
					split.tree<-c(split.tree[1:openBracket[length(openBracket)] - 1],substring,
					split.tree[openBracket[length(openBracket)]:length(split.tree)])
				}else{
					split.tree<-c(substring,split.tree)
				}
				if(length(openBracket) > 1 && length(closeBracket) > 1){
					openBracket<-openBracket[-(length(openBracket))]
					closeBracket<-closeBracket[-1]
					if(reduce==TRUE){
						closeBracket<-closeBracket - 1
						reduce<-FALSE
					}
				}else{
					end<-TRUE
				}
			}else{
				end<-TRUE
			}
		}
	}
	split.tree.cat<-paste(split.tree,collapse="")

	return(split.tree.cat)
}