R/post_sampletrees.R

Defines functions addTrees readTrees readOutput changeArgs.treeoutput print.treeoutput summary.treeoutput treeapply addTreeStat plot.treeoutput merge.treeoutput writeTreeoutput checkfile

Documented in addTrees addTreeStat changeArgs.treeoutput checkfile merge.treeoutput plot.treeoutput print.treeoutput readOutput readTrees summary.treeoutput treeapply writeTreeoutput

## This file contains the functions for processing the sampletrees 
## output


## This function is used to add the trees from a sampletrees run
## to the sampletrees output object
addTrees=function(output, all=TRUE, lines=NULL, start=1, end=NULL, nlines=NULL){
	
	#require(ape)
	
	# Check that trees aren't already read in
	if (class(output$rawdata$Trees)=="multiPhylo"){
		stop("Trees already saved in the tree output object.\n")
	} else {
		output$rawdata$Trees=readTrees(output=output, all=all, start=start, lines=lines, end=end, nlines=nlines)
	}
	return(output)
}


## This function is used to read in trees from a run of
## sampletrees. It is expected to mostly be used as a helper function by
## addTrees and treeapply
readTrees=function(output=NULL, filenames=NULL, all=TRUE,lines=NULL,start=1, end=NULL, nlines=NULL)
{
	#require(ape)
	if (class(output$rawdata$Trees)=="multiPhylo"){
		cat("Trees already read in to the tree output object")
		return(output$rawdata$Trees)
	} else {
		if(is.null(filenames)&&is.null(output)) {
			stop("At least one of filenames or output must not be null\n")
		}
		
		treesNames = c()
		if(!is.null(output)) {
			filenames=output$rawdata$Trees
			treesNames = as.character(output$rawdata$i)
		}
		# Check that file(s) exists
		if ( sum(!file.exists(filenames))>0 ){
			stop(paste("Tree file(s) ",filenames," don't exist\n"))
		}
		
		# Get data on the files
		numfiles  = length(filenames)
		numlines  = vector(length=length(numfiles))
		lineindex = vector(length=length(numfiles))
		total     = 0
		found     = FALSE
		filestart = start
		
		for (i in 1:numfiles){
			if(is.null(output)) {
				treesNames = c(treesNames, as.character(read.table(filenames[i])[,1]))
			}
			numlines[i]  = length(count.fields(filenames[i]))
			lineindex[i] = total
			total        = total+numlines[i]
			if ((start<total)&&(found==FALSE)){
				startfileindex=i
				if (startfileindex>1){
					filestart=start-sum(numlines[1:(i-1)])
				}
				found=TRUE
			}
		}
	
		if ((start<0)||(start>sum(numlines))){
			stop("The first line to be read in is either too low or too high\n")
		}
	
		if (all==TRUE) { # Read in all the trees
			for (i in 1:length(filenames)) {
				if (i==1){
					trees=read.tree(file=filenames[i]) # Problem here: needs to be file= but was text=
				} else {
					trees=c(trees, read.tree(file=filenames[i])) # Problem here: needs to be file= but was text=
				}
	
			}
			#if(!is.null(output)) names(trees)=as.character(output$rawdata$i)
			names(trees)=treesNames
				
		} else if (!is.null(lines)){
		
			# The lines to be read in are stored in the vector called lines
			if (!is.numeric(sum(lines))){
				# If a subset is to be read in, ensure that the number
				# of lines has to read in has been specified
				stop("Line values must be numeric\n")
			} else {
			
				toread=vector(length=length(lines))
				treedat=vector(length=length(lines))
				treenames=vector(length=length(lines))
			
				lower=1
				for (i in 1:length(numlines)){
					upper=lower+numlines[i]
					inrange=(lines>=lower)&(lines<upper)
					toread[inrange]=i
					lower=upper
				}
				if (sum(toread==0)>0){
					stop("More lines to be read in than are in tree files\n ")
				}
			

				for (i in 1:length(lines)){
					temp=scan(file=filenames[toread[i]], skip=(lines[i]-lineindex[toread[i]]-1), 
						nlines=1, what="character",quiet=TRUE)
					treedat[i]=temp[2]
				}
				#if(!is.null(output)) trees=read.tree(text=treedat, tree.names=output$rawdata$i[lines])
				#else				 trees=read.tree(text=treedat)
				trees=read.tree(text=treedat, tree.names=treesNames[lines])
			}
		
		} else if ((!is.null(nlines))||(!is.null(end))) {
		
		
			# The lines to be read in start at 'start', and end when 'nlines' has been
			# read in
			if (!is.null(end)){
				nlines=end-start+1
				if ( (end<start) || (end>sum(numlines)) ){
					stop("Last line to be read in must be greater than first and less than max number of trees in the files\n")
				}
			} else {
				end=start+nlines-1
				if ( (nlines>sum(numlines)) || (end>sum(numlines))){
					stop("More lines to be read in than are in tree files\n ")
				}
			}
		
			numtoread=nlines
			fileindex=startfileindex
			treedat=NULL
		
			while (numtoread>0){
				numfromfile=min(numlines[fileindex]-filestart+1,numtoread)
				temp=scan(file=filenames[fileindex], skip=filestart-1, 
					nlines=2*numfromfile, what="character",quiet=TRUE)
				treedat=c(treedat,temp[2*(1:numfromfile)])
				fileindex=fileindex+1
				numtoread=numtoread-numfromfile
				filestart=1
			}
			#if(!is.null(output)) trees=read.tree(text=treedat, tree.names=output$rawdata$i[start:end])
			#else				 trees=read.tree(text=treedat)
			trees=read.tree(text=treedat, tree.names=treesNames[start:end])
		
		} else {
			stop("If all=FALSE must specify values for lines OR nlines OR stop")
		}
	
		return(trees)
	}

}




# This function is used to read in sampletrees output. Note that it does not
# read in the trees as these are processed separately.
readOutput=function(treeobj=NULL, argobj=NULL, argfile=NULL, addtrees=FALSE)
{
	if( is.null(treeobj) ) {
		if( is.null(argobj)&&is.null(argfile)){
			stop("One of 'treeobj', 'argobj' or 'argfile' must not be NULL")
		} else if( is.null(argobj) ) {
			argobj=readArgs(argfile, check=FALSE)
		}
		argobj$clean=TRUE
		
		# Read in acceptance rates and save it as a list.
		# This list will be appended to by other functions
		accept=read.table(paste(argobj$RunName,"_accept.out",sep=""))
		colnames(accept)=c("Type","Total","Accepted")
		procResults=list(Accept=accept)
		
		# Check if there are processed results and read them in 
		fname=paste(argobj$RunName,"_proctrees.out",sep="")
		if (file.exists(fname)){
			procResults$TreeStats=read.table(fname,header=T)
		}
	}
     else {
     	argobj	  <- treeobj$runinfo
     	procResults <- treeobj$procdata
     }
     
	# Read in the raw results and create a list of the raw results
	dat=read.table(paste(argobj$RunName,"_samples.out",sep=""), header=T)
	rawResults=list(i=dat[,"i"])
	rawResults$Theta=dat[,"theta"]
	rawResults$Rho=dat[,"rho"]
	rawResults$Trees=paste(argobj$RunName,"_trees.out",sep="")
	
#	treeobj$rawdata=rawResults
	
	# Store all the results as a list of three elements. Each element of the list
	# is in turn a list
	# List consists of (1) metadata giving settings
	#                  (2) list of raw data from the run
	#                  (3) list of processed data from the run
	results = list(runinfo=argobj, rawdata=rawResults, procdata=procResults)
	class(results)="treeoutput"
	if (addtrees==TRUE){
		results=addTrees(results)
	}
	
	return(results)
}


# This function is used to change any of the options in the
# settings object. Because the settings object is a list, 
# one could just change the values of the list. But it is 
# safer to use this function because it will test that
# the option that is being changed is actually valid. 
changeArgs.treeoutput=function(object,...)
{
	results=object
	args=results$runinfo
	otherargs=list(...)
	if (length(otherargs)!=0){
		
		inboth=intersect(names(args),names(otherargs))
		args[inboth]=otherargs[inboth]
		
		notfound <- setdiff(names(otherargs),names(args))
		if (length(notfound)!=0) {
			warning(paste("The following are not valid settings: ", notfound))
		}
		
	}
	
	results$runinfo=args
	
	return(results)	
}


print.treeoutput=function(x, ...)
{
	output=x
	
	# Print the run settings
	info=output$runinfo
	class(info)="pars"
	print(info)	
	
	# Print the run statistics
	cat("\nSampletrees run information:\n")
	cat(paste("\tTotal number samples read in: ",length(output$rawdata$i),"\n"))
	cat(paste("\tFirst sample index: ",output$rawdata$i[1],"\n"))
	cat(paste("\tLast sample index: ",output$rawdata$i[length(output$rawdata$i)],"\n"))
	
	# Print information about processed results
		if (length(output$procdata)>1){
		cat(paste("\tNumber summary statistics available: ",ncol(output$procdata$TreeStats)-1,"\n"))
	}
	
}




summary.treeoutput=function(object, ...)
{
	
	output=object
	
	outmat=matrix(nrow=6,ncol=2)
	outmat=data.frame(outmat)
	colnames(outmat)=c("Name","Value")
	
	outmat[1:3,"Name"]=names(output$runinfo)[8:10]
	outmat[1:3,"Value"]=unlist(output$runinfo)[8:10]
	outmat[4,]=c("First Observation",output$rawdata$i[1])
	outmat[5,]=c("Last Observation",output$rawdata$i[length(output$rawdata$i)])
	outmat[6,]=c("Number Observations", length(output$rawdata$i))
	
	print(outmat)
	
	cat("\nAcceptance proportions:\n")
	temptable=output$procdata$Accept
	temptable[output$procdata$Accept[,"Type"]==3,"Total"]=1
	propns=temptable[,"Accepted"]/temptable[,"Total"]
	names(propns)=output$procdata$Accept[,"Type"]
	print(round(propns,5))
	
	cat("\nMCMC output summaries:\n")
	cat("Theta:\n")
	print(summary(output$rawdata$Theta))
	cat("\nRho:\n")
	print(summary(output$rawdata$Rho))
	
	if (!is.null(output$procdata$TreeStats)){
		for (i in 2:ncol(output$procdata$TreeStats)){
			cat(paste("\n",names(output$procdata$TreeStats)[i],":\n",sep=""))
			print(summary(output$procdata$TreeStats[,i]))
		}
		
	}
		
}



treeapply=function(output, myfunc, funcname=NULL, maxlines=1000, treerange=NULL, ...)
{
	#require(ape)

	# Compute the statistic on the trees
	if (class(output$rawdata$Trees)=="multiPhylo"){
		
		# The trees have been read in so use those
		treestat=unlist(lapply(output$rawdata$Trees, myfunc, ...))
		treenames=as.numeric(names(output$rawdata$Trees))
		rnames=1:length(treestat)
		
	} else{

		# Begin reading in trees. If there is a range, use the range; otherwise, apply 
		# to all trees in the file
		if (!is.null(treerange)){
			
			testrange=treerange[1]+0:(length(treerange)-1)
			if (sum(testrange-treerange)==0){
				
				# Range consists of consecutive numbers so faster use of scan:
				trees=readTrees(output=output,all=FALSE, lines=NULL, 
						      start=treerange[1],end=treerange[length(treerange)])
						
			} else {
				
				# Range consists of non-consecutive subset
				trees=readTrees(output=output,all=FALSE, lines=treerange)
				
			}
			
			treestat=unlist(lapply(trees, myfunc, ...))
			treenames=output$rawdata$i[treerange]
			rnames=treerange
			
		} else {
			
			treestat=NULL
			
			# No special range used
			for (i in 1:length(output$rawdata$Trees)){
				
				con=file(output$rawdata$Trees[i],open="rt")
				
				while (length(input <- readLines(con, n=maxlines)) > 0){ 
					trees=read.tree(text=input)
					substat=unlist(lapply(trees, myfunc, ...))
					treestat=c(treestat,substat)
				} 
				close(con)
			}
			
			treenames=output$rawdata$i
			rnames=1:length(treestat)
		}
			
	}


	df=data.frame(treenames,treestat)
	rownames(df)=rnames
	if (!is.null(funcname)){
		colnames(df)=c("index",funcname)
	} else {
		colnames(df)=c("index","stat")
	}
	
	return(df)
}


addTreeStat=function(output, myfunc, funcname=NULL, maxlines=1000, treerange=NULL, ...)
{
	if (!is.data.frame(output$procdata$TreeStats)){
			
		# This is the first tree statistic to be saved
		treestat=treeapply(output, myfunc, funcname=funcname, maxlines=maxlines,treerange=treerange, ...)
		ncols=ncol(treestat)
		output$procdata$TreeStats=treestat
			
	} else {
			
		# Tree stats have already been read in so use the same tree indices
		treeindex=as.numeric(rownames(output$procdata$TreeStats))
		nrows=nrow(output$procdata$TreeStats)
			
		if ( (!is.null(treerange))&&(!identical(treerange,treeindex)) ){
			warning("Provided treerange is different from those of the TreeStats data.frame. 
					 The tree indices will be read from TreeStats\n")
			treerange=treeindex
		} else if (!identical(1:nrows,treeindex)){
			treerange=treeindex
		}
		treestat=treeapply(output, myfunc, funcname=funcname, maxlines=maxlines,treerange=treerange, ...)
		
		
		# Add the statistic to the output object
		newdf=merge(output$procdata$TreeStats,treestat, by="index")
		rownames(newdf)=rownames(treestat)
		ncols=ncol(newdf)
		output$procdata$TreeStats=newdf
	}
	
	if (!is.null(funcname)){
		colnames(output$procdata$TreeStats)[ncols]=funcname
	} else {
		colnames(output$procdata$TreeStats)[ncols]=paste("Stat.",ncols-1,sep="")
	}
			

	return(output)
	
}

plot.treeoutput=function(x,oneperpage=FALSE,
		           asktoplot=FALSE, layoutmat=NULL,statnames=NULL, ...)
{	
	
	output=x
	
	# To modify settings to hit enter before next plot. This
	# must be re-set to initial value at end of function
	oldpar=par()$ask
	par(ask=asktoplot)
	
	
	# Set up the matrices for plotting
	if (!is.null(layoutmat)){
		if (is.matrix(layoutmat)){
			m=layoutmat
		} else{
			stop("Variable layoutmat must be of type matrix\n")
		}
	} else {
		if (is.data.frame(output$procdata$TreeStats)){
			m=matrix(c(1,2,3,4), byrow=T, nrow = 2, ncol = 2)
		} else {
			m=matrix(c(1,1,2,2,0,3,3,0), byrow=T, nrow = 2, ncol = 4)
		}
	}
	
	if (oneperpage==TRUE){
		plotdims=c(1,1)
		m=matrix(c(1),nrow=1,ncol=1)
	}
	layout(m)
	
	# Plot the acceptance rate
	newcounts=output$procdata$Accept[,2]
	newcounts[output$procdata$Accept[,1]==3]=1
	
	barplot(output$procdata$Accept[,3]/newcounts,ylim=c(0,1),
			ylab="acceptance proportion",xlab="update label",
			names=output$procdata$Accept[,1],main="Update acceptance proportions")
	
	# Plot theta and rho
	xs=output$rawdata$i
	
	plot(xs,output$rawdata$Theta, main="Traceplot of Theta", xlab="Iterations",ylab="Theta", type="l")
	lines(lowess(xs,output$rawdata$Theta),col="red")
	plot(xs,output$rawdata$Rho, main="Traceplot of Rho", xlab="Iterations",ylab="Rho",type="l")
	lines(lowess(xs,output$rawdata$Rho),col="red")


	# Plot the tree stats
	if (is.data.frame(output$procdata$TreeStats)){
		if (is.null(statnames)){
			plottitles=colnames(output$procdata$TreeStats)[2:ncol(output$procdata$TreeStats)]
		} else {
			if (length(statnames)!=(ncol(output$procdata$TreeStats)-1)){
				stop("Length of vector of tree statistics names doesn't match number of tree statistics\n ")
			} else {
				plottitles=statnames
			}
		}
		xs=output$procdata$TreeStats[,1]
	
		for (i in 2:ncol(output$procdata$TreeStats)){
			plot(xs,output$procdata$TreeStats[,i], main=paste("Traceplot of",plottitles[i-1]), 
					ylab=plottitles[i-1], xlab="Iterations",type="l")
			lines(lowess(xs,output$procdata$TreeStats[,i]),col="red")
		}
	}

	
	par(ask=oldpar)

}


merge.treeoutput=function(x,y,runname=NULL,...)
{


	output1=x
	output2=y
	
	## For a merged job, it is assumed that output2 started where
	## output2 finished and so many of the options should be the 
	## same between them
	
	## Error checking: Make sure that options that should be the same
	## are the same
	checkindex=c(3:7,12:13,15:16)
	for (i in checkindex){
		if (output1$runinfo[[i]]!=output2$runinfo[[i]]){
			stop(paste(names(output1$runinfo[[i]])," must have same value in both settings objects\n"))
		}
	}
	
	if ((output1$runinfo$DataType=='g')&&(output1$runinfo$HaploFreqFile!=output2$runinfo$HaploFreqFile)){
			stop(paste(names(output1$runinfo$HaploFreqFile)," must have same value in both settings objects\n"))
	}
	
	## This is not an error, but it will mess up calculating the first/last values 
	## and plotting
	if (output1$runinfo$Thinning!=output2$runinfo$Thinning){
		warning("Two different values used for thinning interval\n")
		newoutput1$runinfo$Thinning=NA
	}
	
	if (is.null(runname)){
		runname="Merge"
	}
	
	
	## The initial values are all those from output1. 
	newoutput=output1
	newoutput$runinfo$ChainLength=output1$runinfo$ChainLength+output2$runinfo$ChainLength
	newoutput$runinfo$RunName=runname
	
	
	## Merge the acceptance tables
	newtable=output1$procdata$Accept+output2$procdata$Accept
	newtable[,"Type"]=output1$procdata$Accept[,"Type"]
	
	if (sum(newtable[,"Type"]==3)==1){
		index=which(output1$procdata$Accept[,"Type"]==3)
		newtotal=output1$procdata$Accept[index,"Total"]+output2$procdata$Accept[index,"Total"]
		newpropn=1/newtotal*(output1$procdata$Accept[index,"Total"]*output1$procdata$Accept[index,"Accepted"]+
					output2$procdata$Accept[index,"Total"]*output2$procdata$Accept[index,"Accepted"])
	 	newtable[index,"Total"]=newtotal
		newtable[index,"Accepted"]=newpropn
	}
	newoutput$procdata$Accept=newtable
	
	
	## Merge the i, Theta and Rho values
	newoutput$rawdata$Theta=c(output1$rawdata$Theta,output2$rawdata$Theta)
	newoutput$rawdata$Rho=c(output1$rawdata$Rho,output2$rawdata$Rho)
	newoutput$rawdata$i=c(output1$rawdata$i,output1$rawdata$i[length(output1$rawdata$i)]+output2$rawdata$i)
	
	
	## Merge the Tree values
	newoutput$rawdata$Trees=c(output1$rawdata$Trees,output2$rawdata$Trees)
	
	
	## Merge the tree statistics
	treestatflag=c(is.data.frame(output1$procdata$TreeStats),is.data.frame(output2$procdata$TreeStats))
	if (sum(treestatflag)!=0){
		if (sum(treestatflag)==1){
			stop("Error: Tree statistics must be computed on both runs or on neither run")	
		} else {
			if (ncol(output1$procdata$TreeStats)!=ncol(output2$procdata$TreeStats)){
				stop("Error: The same sets of tree statistics must be computed for both runs")	
			} else {
				newoutput$procdata$TreeStats=rbind(output1$procdata$TreeStats,output2$procdata$TreeStats)
			}
		}
	}
	
	return(newoutput)
	
}



writeTreeoutput=function(output, argfile=NULL, ask=TRUE)
{
	
	#require("ape")
	runname=output$runinfo$RunName
	
	# Write out the settings
	newpars=output$runinfo
	newpars$clean=TRUE
	class(newpars)="pars"
	
	if (is.null(argfile)){
		argfile=paste(runname,"_Settings.par",sep="")
	}
	
	if ((ask==FALSE)||checkfile(argfile)){	
		writeArgs(newpars, outfile=argfile)
	} else {
		cat(paste("No output written to file ",argfile,"\n",sep=""))
	}
	
	
	# Write out theta and rho to file 

	samples=data.frame(output$rawdata$i, output$rawdata$Theta, output$rawdata$Rho)
	colnames(samples)=c("i","Theta","Rho")
	resfilename=paste(runname,"_samples.out", sep="")
	
	if ((ask==FALSE)||checkfile(resfilename)){
		write.table(samples,resfilename,quote=F,row.names=F)
	} else {
		cat(paste("No output written to ",resfilename,"\n",sep=""))
	}
	
	
	# Write out trees to file
	resfilename=paste(runname,"_trees.out", sep="")
	
	if ((ask==FALSE)||checkfile(resfilename)){
	
		if (class(output$rawdata$Trees)=="multiPhylo"){
			write.tree(output$rawdata$Trees,resfilename)
		} else if (length(output$rawdata$Trees)==1){
			file.copy(from=output$rawdata$Trees, to=resfilename, overwrite = FALSE)
		} else if (length(output$rawdata$Trees)>1){
			for (i in 1:length(output$rawdata$Trees)){
				file.append(resfilename, output$rawdata$Trees[i])
			}	
		}
		
	} else {
		cat(paste("No output written to ",resfilename,"\n",sep=""))
	}
	
	
	# Write out acceptances to file
	resfilename=paste(runname,"_accept.out", sep="")
	if ((ask==FALSE)||checkfile(resfilename)){
		write.table(output$procdata$Accept,resfilename,quote=F,row.names=F,col.names=F)
	} else {
		cat(paste("No output written to ",resfilename,"\n",sep=""))
	}
	
	
	# Write out any processed results to file. This assumes they are
	# stored as a dataframe
	if (!is.null(output$procdata$TreeStats)){
		
		resfilename=paste(runname,"_proctrees.out",sep="")
		
		if ((ask==FALSE)||checkfile(resfilename)){
			write.table(output$procdata[[2]],resfilename,quote=F,row.names=F)
		} else {
			cat(paste("No output written to ",resfilename,"\n",sep=""))
		}
		
	}
	
	
}

# Helper function
checkfile=function(filename){

	check=TRUE
	
	if (file.exists(filename)){
		check=FALSE
		response='x'
		while( !identical(response, 'y')&&!identical(response, 'n') ){
			cat(paste("Overwrite ",filename," (y or n)?\n",sep=""))
			response=scan(n=1, what="character",quiet=TRUE)
			if (identical(response, 'y')){
				check=TRUE
			}	 
		}
	}
	
	return(check)
}

Try the Rsampletrees package in your browser

Any scripts or data that you put into this service are public.

Rsampletrees documentation built on March 3, 2020, 1:07 a.m.