deprecated/phyloscan.oldfun.R

PR.PACKAGE			<- "phyloscan" 
PR.EVAL.FASTA		<- paste('Rscript', system.file(package=PR.PACKAGE, "phyloscan.evaluate.fasta.Rscript"))
PR.EVAL.EXAML		<- paste('Rscript', system.file(package=PR.PACKAGE, "phyloscan.read.trees.Rscript"))
PR.SCAN.STATISTICS	<- paste('Rscript', system.file(package=PR.PACKAGE, "phyloscan.scan.statistics.Rscript"))
PR.SCAN.SUPERINF	<- paste('Rscript', system.file(package=PR.PACKAGE, "phyloscan.scan.superinfections.Rscript"))
PR.ALIGNMENT.FILE	<- system.file(package=PR.PACKAGE, "HIV1_compendium_C_B_CPX.fasta")
PR.ALIGNMENT.ROOT	<- "AF460972"
PR.ALIGNMENT.TO		<- "K03455"
EPS					<- 1e-12

#' @title Combine output from multiple phyloscanner runs
#' @param in.dir Full directory path to output of phyloscanner runs
#' @param save.file If not NA, the combined output is saved to file.
#' @param postfix.trees Pattern at end of file names to identify short read tree output generated by the phyloscanner tools. By default 'trees.rda'.
#' @param postfix.trmwindowstats Pattern at end of file names to identify transmission statistics output per window, generated by the phyloscanner tools. By default 'trmStatsPerWindow.rda'.
#' @param regex.ind Regular expression that identifies the ID of query individuals
#' @param trmw.min.reads Minimum number of reads per individuals in order to make a transmission assignment that involves this individual
#' @return list of three R objects 'phs', 'dtrees', 'dtrms'. See description.
#' @description This function generates three R objects. 'phs' is a list of all short read trees in ape format.
#' 	  'dtrees' is a data.table that provides info on each short read tree. Columns are 'PTY_RUN' (phyloscanner run id), 'W_FROM' (start of window), 'W_TO' (end of window), 'IDX' (index of tree in phs).
#' 	  'dtrms' is a data.table that provides info on transmission assignments. Columns are 'PTY_RUN' (phyloscanner run id), 'ID1' (identifier of first individual), 'ID2' (identifier of second individual), 'TYPE' (window assignment), 
#'	  'WIN_OF_TYPE' (number of windows with that assignment), 'WIN_TOTAL' (Number of windows where reads from both individuals are present), 'PAIR_ID' (unique ID of each combination of PTY_RUN,ID1, ID2)
phsc.combine.phyloscanner.output<- function(in.dir, save.file=NA, postfix.trees='trees.rda', postfix.trmwindowstats='trmStatsPerWindow.rda', regex.ind="^[0-9]+_[0-9]_[0-9]+", trmw.min.reads=1, trmw.min.tips=1, trmw.close.brl=Inf, trmw.distant.brl=Inf, norm.file.name=NA)
{
	#	read trees
	cat('\nread trees')
	ptyr	<- data.table(FILE=list.files(in.dir, pattern=postfix.trees, full.names=TRUE))	
	phs		<- lapply(seq_len(nrow(ptyr)), function(i)
			{
				load(ptyr[i, FILE])
				phs
			})
	phs		<- do.call('c', phs)
	#	read tree info
	dtrees	<- lapply(seq_len(nrow(ptyr)), function(i)
			{
				load(ptyr[i, FILE])
				dfr
			})
	dtrees	<- do.call('rbind', dtrees)
	dtrees[, IDX:=seq_len(nrow(dtrees))]
	#
	#	read transmission window stats
	#	
	cat('\nread transmission window stats')
	ptyr	<- data.table(FILE=list.files(in.dir, pattern=postfix.trmwindowstats, full.names=TRUE))
	ptyr[, PTY_RUN:=ptyr[, as.integer(gsub('ptyr','',regmatches(FILE, regexpr('ptyr[0-9]+',FILE))))]]
	dwin		<- lapply(seq_len(nrow(ptyr)), function(i)
			{
				load(ptyr[i, FILE])
				tt[, PTY_RUN:= ptyr[i,PTY_RUN]]
				tt
			})
	dwin	<- do.call('rbind', dwin)
	setnames(dwin, 	c('PAT.1','PAT.2','PAT.1_TIPS','PAT.2_TIPS','PAT.1_READS','PAT.2_READS','PATHS.12','PATHS.21'),
			c('ID1','ID2','ID1_L','ID2_L','ID1_R','ID2_R','PATHS_12','PATHS_21'))
	set(dwin, NULL, 'ID1', dwin[, regmatches(ID1, regexpr(regex.ind, ID1))])
	set(dwin, NULL, 'ID2', dwin[, regmatches(ID2, regexpr(regex.ind, ID2))])
	set(dwin, NULL, 'PATRISTIC_DISTANCE', dwin[, as.numeric(PATRISTIC_DISTANCE)])
	#	create W_FROM W_TO from SUFFIX
	set(dwin, NULL, 'W_FROM', dwin[, as.integer(gsub('[^0-9]*([0-9]+)_to_([0-9]+).*','\\1', SUFFIX))])
	set(dwin, NULL, 'W_TO', dwin[, as.integer(gsub('[^0-9]*([0-9]+)_to_([0-9]+).*','\\2', SUFFIX))])
	set(dwin, NULL, 'SUFFIX', NULL)
	#	normalise if desired
	if(!is.na(norm.file.name))
	{
		tmp			<- load(norm.file.name)
		if(length(tmp)!=1)	stop("Expected one R data.table in file",norm.file.name)
		eval(parse(text=paste("norm.table<- ",tmp,sep='')))
		stopifnot( c('W_FROM','W_TO','MEDIAN_PWD')%in%colnames(norm.table) )	 
		setnames(norm.table, 'MEDIAN_PWD', 'NORM_CONST')
		norm.table[, W_MID:= (W_FROM+W_TO)/2]
		#	standardize to 1 on gag+pol ( prot + first part of RT in total 1300bp )
		#	790 - 3385
		cat('\nstandardise normalising constants to 1 on the gag+pol ( prot + first part of RT in total 1300bp pol ) region')
		tmp		<- subset(norm.table, W_MID>=790L & W_MID<=3385L)
		stopifnot( nrow(tmp)>0 )	# norm.table must contain gag+pol region	
		tmp		<- tmp[, mean(NORM_CONST)]
		stopifnot( is.finite(tmp) )
		set(norm.table, NULL, 'NORM_CONST', norm.table[, NORM_CONST/tmp])
		norm.table	<- subset(norm.table, select=c(W_MID, NORM_CONST))
		
		tmp		<- unique(subset(dwin, select=c(W_FROM, W_TO)))
		setkey(tmp, W_FROM)
		tmp		<- tmp[, {
					list( NORM_CONST=subset(norm.table, W_MID>=W_FROM & W_MID<=W_TO)[, mean(NORM_CONST)]	)
				}, by=c('W_FROM','W_TO')]
		dwin	<- merge(dwin, tmp, by=c('W_FROM','W_TO'))
		set(dwin, NULL, 'PATRISTIC_DISTANCE', dwin[, PATRISTIC_DISTANCE/NORM_CONST])		
	}
	#
	#	build transmission summary stats based on selection criteria
	#	
	cat('\nreduce transmission window stats to windows with at least',trmw.min.reads,'reads and at least',trmw.min.tips,'tips')
	dwin	<- subset(dwin, ID1_R>=trmw.min.reads & ID2_R>=trmw.min.reads & ID1_L>=trmw.min.tips & ID2_L>=trmw.min.tips)
	cat('\ntotal number of windows with trm assignments is',nrow(dwin))		
	#
	#	merge assignments (keeping as much detail as needed for full interpretation)
	#
	dwin[, TYPE_RAW:= TYPE]
	#	chains with no intermediate
	tmp		<- dwin[, which(TYPE=="anc_12" & ADJACENT)]
	cat('\nFound adjacent anc_12, n=', length(tmp),'--> chain with no intermediate')
	set(dwin, tmp, 'TYPE', 'chain_12_nointermediate')
	tmp		<- dwin[, which(TYPE=="multi_anc_12" & ADJACENT)]
	cat('\nFound adjacent multi_anc_12, n=', length(tmp),'--> chain with no intermediate')
	set(dwin, tmp, 'TYPE', 'chain_12_nointermediate')
	#	chains with no intermediate
	tmp		<- dwin[, which(TYPE=="anc_21" & ADJACENT)]
	cat('\nFound adjacent anc_21, n=', length(tmp),'--> chain with no intermediate')
	set(dwin, tmp, 'TYPE', 'chain_21_nointermediate')
	tmp		<- dwin[, which(TYPE=="multi_anc_21" & ADJACENT)]
	cat('\nFound adjacent multi_anc_21, n=', length(tmp),'--> chain with no intermediate')
	set(dwin, tmp, 'TYPE', 'chain_21_nointermediate')	
	#
	#	chains with intermediate
	tmp		<- dwin[, which(TYPE=="anc_12" & !ADJACENT)]
	cat('\nFound non-adjacent anc_12, n=', length(tmp),'--> chain with intermediate')
	set(dwin, tmp, 'TYPE', 'chain_12_withintermediate')
	tmp		<- dwin[, which(TYPE=="multi_anc_12" & !ADJACENT)]
	cat('\nFound non-adjacent multi_anc_12, n=', length(tmp),'--> chain with intermediate')
	set(dwin, tmp, 'TYPE', 'chain_12_withintermediate')
	#	chains with intermediate
	tmp		<- dwin[, which(TYPE=="anc_21" & !ADJACENT)]
	cat('\nFound non-adjacent anc_21, n=', length(tmp),'--> chain with intermediate')
	set(dwin, tmp, 'TYPE', 'chain_21_withintermediate')
	tmp		<- dwin[, which(TYPE=="multi_anc_21" & !ADJACENT)]
	cat('\nFound non-adjacent multi_anc_21, n=', length(tmp),'--> chain with intermediate')
	set(dwin, tmp, 'TYPE', 'chain_21_withintermediate')
	#
	#	intermingled with no intermediate
	tmp		<- dwin[, which(TYPE=="conflict" & ADJACENT)]
	cat('\nFound adjacent conflict, n=', length(tmp),'--> intermingled with no intermediate')
	set(dwin, tmp, 'TYPE', 'intermingled_nointermediate')
	#	intermingled with intermediate
	tmp		<- dwin[, which(TYPE=="conflict" & !ADJACENT)]
	cat('\nFound non-adjacent conflict, n=', length(tmp),'--> intermingled with intermediate')
	set(dwin, tmp, 'TYPE', 'intermingled_withintermediate')
	#
	#	other	
	tmp		<- dwin[, which(ADJACENT & PATHS_12==0 & PATHS_21==0)]
	cat('\nFound adjacent with no paths, n=', length(tmp),'--> other')
	set(dwin, tmp, 'TYPE', 'other_nointermediate')
	tmp		<- dwin[, which(!ADJACENT & PATHS_12==0 & PATHS_21==0)]
	cat('\nFound non-adjacent with no assignment, n=', length(tmp),'--> other')
	set(dwin, tmp, 'TYPE', 'other_withintermediate')
	#	check
	stopifnot( !nrow(subset(dwin, TYPE=='none'))	)
	#
	#	add distance as second dimension
	#
	if(!is.na(trmw.close.brl) & is.finite(trmw.close.brl))
	{
		cat('\nidentifying close pairwise assignments using distance=',trmw.close.brl)
		tmp		<- dwin[, which(PATRISTIC_DISTANCE<trmw.close.brl)]
		cat('\nFound close, n=', length(tmp))
		set(dwin, tmp, 'TYPE', dwin[tmp, paste0(TYPE,'_close')])		
	}
	if(!is.na(trmw.distant.brl) & is.finite(trmw.distant.brl))
	{
		cat('\nidentifying distant pairwise assignments using distance=',trmw.distant.brl)
		tmp		<- dwin[, which(PATRISTIC_DISTANCE>=trmw.distant.brl)]
		cat('\nFound distant, n=', length(tmp))
		set(dwin, tmp, 'TYPE', dwin[tmp, paste0(TYPE,'_distant')])	
	}
	#	
	#	summarise transmission stats
	#	
	dtrms	<- dwin[, list(WIN_OF_TYPE=length(W_FROM), ID1_R=ID1_R[1], ID1_L=ID1_L[1], ID2_R=ID2_R[1], ID2_L=ID2_L[1]), by=c('PTY_RUN','ID1','ID2','TYPE')]
	tmp		<- dtrms[, list(WIN_TOTAL=sum(WIN_OF_TYPE)), by=c('PTY_RUN','ID1','ID2')]
	dtrms	<- merge(dtrms, tmp, by=c('PTY_RUN','ID1','ID2'))
	#	set pair id	
	tmp		<- dtrms[, list(SCORE=sum(WIN_OF_TYPE[grepl('close|chain_12_nointermediate|chain_21_nointermediate|intermingled_nointermediate',TYPE)])), by=c('ID1','ID2','PTY_RUN')]
	dtrms	<- merge(dtrms, tmp, by=c('ID1','ID2','PTY_RUN'))
	#	give every pair an ID
	tmp		<- unique(dtrms, by=c('ID1','ID2'))
	tmp		<- tmp[order(-SCORE),]
	tmp[, PAIR_ID:= seq_len(nrow(tmp))]	
	tmp2	<- unique(dtrms, by=c('ID1','ID2','PTY_RUN'))
	tmp		<- merge(tmp2, subset(tmp, select=c(ID1,ID2,PAIR_ID)), by=c('ID1','ID2'))
	tmp		<- tmp[, list(PAIR_ID= paste(PAIR_ID,'-',seq_along(PAIR_ID),sep=''), PTY_RUN=PTY_RUN), by=c('ID1','ID2')]	
	dtrms	<- merge(dtrms, tmp, by=c('ID1','ID2', 'PTY_RUN'))	
	setkey(dtrms, PAIR_ID)
	#	save to file
	if(!is.na(save.file))
	{
		cat('\nwrite to file', save.file)
		save(phs, dtrees, dtrms, dwin, file=save.file)		
	}
	list(phs=phs, dtrees=dtrees, dtrms=dtrms, dwin=dwin)
}	

phsc.cmd.process.phyloscanner.output.in.directory.old<- function(ptyf, pty.args)
{
	cmds	<- ptyf[, {	
				#
				#	get bash commands to plot trees and calculate splits for each phylotype run
				#						
				cmd	<- phsc.cmd.SplitPatientsToSubtrees(	pty.args$prog.split, 
						file.path(dirname(pty.args$prog),'tools'),
						FILE_PTY_RUN,
						outputdir=dirname(FILE_PTY_RUN),																													
						outgroupName=paste('REF_CPX_',pty.args$alignments.root,'_read_1_count_0',sep=''), 
						pdfwidth=30, pdfrelheight=0.15)
				#
				#	add bash command to calculate patient stats
				#						
				pty_run			<- PTY_RUN
				file.patients	<- file.path(pty.args$out.dir, paste('ptyr',pty_run,'_patients.txt',sep=''))
				cat(subset(pty.runs, PTY_RUN==pty_run)[, paste(FILE_ID,'.bam',sep='')], sep='\n', file= file.patients)
				treeFiles		<- file.path(pty.args$out.dir, paste('ptyr',pty_run,'_InWindow_',sep=''))
				splitsFile		<- file.path(pty.args$out.dir, paste('Subtrees_r_','ptyr',pty_run,'_InWindow_',sep=''))
				outputBaseName	<- file.path(pty.args$out.dir, paste('ptyr',pty_run,sep=''))
				tmp				<- phsc.cmd.SummaryStatistics( 	pty.args$prog.smry, 
						file.path(dirname(pty.args$prog),'tools'), 
						paste('REF_CPX_',pty.args$alignments.root,'_read_1_count_0',sep=''), 
						file.patients, 
						treeFiles, 
						treeFiles,
						splitsFile, 
						outputBaseName)
				cmd				<- paste(cmd, tmp, sep='\n')
				#
				#	add bash command to get likely.transmissions 
				#
				tmp		<- phsc.cmd.LikelyTransmissions(	pty.args$prog.lkltrm, file.path(dirname(pty.args$prog),'tools'), 
						FILE_PTY_RUN, 
						file.path(dirname(FILE_PTY_RUN),paste('Subtrees_r_',basename(FILE_PTY_RUN),sep='')), 
						FILE_PTY_RUN, 
						root.name=paste('REF_CPX_',pty.args$alignments.root,'_read_1_count_0',sep=''), 
						zeroLengthTipsCount=FALSE, 
						dual.inf.thr=NA,
						romeroSeverson=TRUE)
				cmd		<- paste(cmd, tmp, sep='\n')
				#
				#	add bash command to get likely.transmissions.summary
				#						
				tmp	<- phsc.cmd.LikelyTransmissionsSummary(	pty.args$prog.lklsmry, file.path(dirname(pty.args$prog),'tools'),
						paste(FILE_PTY_RUN,'patients.txt',sep=''),
						paste(FILE_PTY_RUN,'patStatsFull.csv',sep=''),
						FILE_PTY_RUN, 
						paste(FILE_PTY_RUN,'trmStats.csv',sep=''),
						min.threshold=1, 
						allow.splits=TRUE)
				cmd	<- paste(cmd, tmp, sep='\n')	
				list(CMD=cmd)					
			}, by=c('PTY_RUN')]
	cmds
}

pty.cmd.evaluate.fasta<- function(indir, outdir=indir, strip.max.len=Inf, select='', min.ureads.individual=NA)
{
	cmd		<- paste('\n',PR.EVAL.FASTA, ' -indir=',indir, ' -strip.max.len=',strip.max.len, sep='')
	if(select!='')
		cmd	<- paste(cmd,' -select=',select,sep='')
	if(!is.na(min.ureads.individual))
		cmd	<- paste(cmd,' -min.ureads.individual=',min.ureads.individual,sep='')	
	cmd	<- paste(cmd,'\n',sep='')
	cmd
}


pty.cmd.scan.statistics<- function(indir, outdir=indir, select='')
{
	cmd		<- paste('\n',PR.SCAN.STATISTICS, ' -indir=',indir, sep='')
	if(select!='')
		cmd	<- paste(cmd,' -select=',select,sep='')
	if(!is.na(outdir))
		cmd	<- paste(cmd,' -outdir=',outdir,sep='')	
	cmd	<- paste(cmd,'\n',sep='')
	cmd
}



pty.cmd.scan.superinfections<- function(indir, outdir=indir, select='', references.pattern='REF', run.pattern='ptyr', plot.max.clade=0)
{
	cmd		<- paste('\n',PR.SCAN.SUPERINF, ' -indir=',indir, ' -plot.max.clade=', plot.max.clade, sep='')
	if(select!='')
		cmd	<- paste(cmd,' -select=',select,sep='')
	if(references.pattern!='')
		cmd	<- paste(cmd,' -references.pattern=',references.pattern,sep='')
	if(run.pattern!='')
		cmd	<- paste(cmd,' -run.pattern=',run.pattern,sep='')
	cmd	<- paste(cmd,'\n',sep='')
	cmd
}


pty.cmd.evaluate.examl<- function(indir, outdir=indir, select='', outgroup=NA, references.pattern='REF', run.pattern='ptyr', tree.pattern='\\.newick', rm.newick=0, rm.fasta=0, plot.trees.per.page=NA, plot.w=NA, plot.h=NA, infile=NA)
{
	cmd		<- paste('\n',PR.EVAL.EXAML, ' -indir=',indir, ' -rm.newick=', rm.newick, ' -rm.fasta=', rm.fasta, sep='')
	if(!is.na(infile))
		cmd	<- paste(cmd,' -infile=',infile,sep='')		
	if(select!='')
		cmd	<- paste(cmd,' -select=',select,sep='')
	if(!is.na(references.pattern))
		cmd	<- paste(cmd,' -references.pattern=',references.pattern,sep='')
	if(!is.na(run.pattern))
		cmd	<- paste(cmd,' -run.pattern=',run.pattern,sep='')
	if(!is.na(tree.pattern))
		cmd	<- paste(cmd,' -tree.pattern=',tree.pattern,sep='')	
	if(!is.na(outgroup))
		cmd	<- paste(cmd,' -outgroup=',outgroup,sep='')
	if(!is.na(plot.trees.per.page))
		cmd	<- paste(cmd,' -plot.trees.per.page=',plot.trees.per.page,sep='')
	if(!is.na(plot.w))
		cmd	<- paste(cmd,' -plot.w=',plot.w,sep='')
	if(!is.na(plot.h))
		cmd	<- paste(cmd,' -plot.h=',plot.h,sep='')
	cmd	<- paste(cmd,'\n',sep='')
	cmd
}



project.dualinfecions.phylotypes.mltrees.160115<- function() 
{
	require(ggtree)
	require(phytools)
	
	pty.infile		<- file.path(HOME,"data", "PANGEA_HIV_n5003_Imperial_v160110_ZA_examlbs500_ptyrunsinput.rda")
	indir.tr		<- file.path(HOME,"phylotypes_160120")
	pty.evaluate.tree(pty.infile, indir.tr)
	#	can now delete all newick files	
	
	pty.infile		<- file.path(HOME,"data", "PANGEA_HIV_n5003_Imperial_v160110_ZA_examlbs500_ptyrunstrees.rda")
	#
	#	get statistics
	#
	#	node heights
	tmp					<- ptyfiles[,{										
										ph		<- pty.ph[[FILE]]
										tmp		<- node.depth.edgelength(ph)[1:Ntip(ph)]
										list(BAM=ph$tip.label, HEIGHT=tmp)
									}, by='FILE']
	pty.stat			<- merge(ptyfiles, tmp, by='FILE')
	#	extract individual from read name
	pty.stat[, IND:= gsub('_read.*','',BAM)]
	#	extract number of identical reads from read name 
	pty.stat[, READ_N:= as.integer(gsub('count_','',regmatches(BAM, regexpr('count_[0-9]+',BAM))))]
	#	get number of reads per individual
	pty.stat			<- merge(pty.stat,pty.stat[, list(IND_N=sum(READ_N)),by=c('FILE','IND')],by=c('FILE','IND'))
	#	check if reads from same individual are monophyletic
	pty.stat			<- merge(pty.stat, pty.stat[,	list(MONOPH=is.monophyletic(pty.ph[[FILE]], BAM)), by=c('FILE','IND')], by=c('FILE','IND'))
	set(pty.stat, NULL, 'MONOPH', pty.stat[, factor(MONOPH, levels=c(FALSE,TRUE),labels=c('N','Y'))])
	
	
	
	ptyfiles				<- merge(ptyfiles, pty.stat[, list(HMX= max(HEIGHT)), by='FILE'], by='FILE')		
}


pty.evaluate.tree.root.nofill<- function(ptyfiles, pty.ph, pty.runs)
{
	pty.root	<- lapply(ptyfiles[, unique(PTY_RUN)], function(ptyr){	
				#print(ptyr)	
				#ptyr<- 15
				tmp					<- subset(ptyfiles, PTY_RUN==ptyr)[,FILE]
				phs					<- lapply(tmp, function(x) pty.ph[[x]])	
				names(phs)			<- tmp
				#	number of read counts per individual				
				phpd				<- do.call('rbind',lapply(seq_along(phs),function(i){
									#i			<- 1
									#print(i)		
									ph			<- phs[[i]]
									phb			<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label))
									phb[, COUNT:= as.numeric(gsub('count_','',regmatches(BAM, regexpr('count_[0-9]+',BAM))))]
									phb			<- merge(phb, subset(pty.runs[pty.runs$PTY_RUN==ptyr,],select=FILE_ID), by='FILE_ID', all=1)
									set(phb, phb[, which(is.na(COUNT))], 'COUNT', 0)
									phb			<- phb[, list(COUNT=sum(COUNT)), by='FILE_ID']
									phb[, FILE:= names(phs)[i]]
									phb					
								}))
				phpd				<- merge(phpd, subset(ptyfiles, PTY_RUN==ptyr, select=c(FILE, W_FROM)), by='FILE')
				setkey(phpd, W_FROM)
				outgroup.order		<- phpd[, list(COUNT=mean(COUNT)), by='FILE_ID']				
				outgroup.order		<- merge(outgroup.order, phpd[, list(PRESENT=length(which(COUNT>0))), by='FILE_ID'], by='FILE_ID')
				setkey(outgroup.order, COUNT)
				outgroup.order		<- subset(outgroup.order, PRESENT>0)
				#	ususally not all present throughout; pick in order
				ans					<- do.call('rbind',lapply(seq_along(phs),function(i){
							#i			<- 147
							#print(i)
							ph				<- phs[[i]]							
							phb				<- data.table( IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label))				
							phb				<- merge(phb, outgroup.order, by='FILE_ID')
							outgroup.ind	<- phb[which.min(COUNT), FILE_ID]							
							phgd			<- cophenetic.phylo(ph)
							tmp				<- subset(phb, FILE_ID==outgroup.ind)[, BAM]
							if(length(tmp)<nrow(phgd))
								phgd		<- phgd[ tmp, setdiff(rownames(phgd), tmp), drop=FALSE]
							outgroup.seq	<- rownames(phgd)[ which(min(phgd)==phgd, arr.ind=TRUE)[1,1] ]
							data.table(FILE=names(phs)[i], ROOT=outgroup.seq)					
						}))
				ans[, PTY_RUN:= ptyr]
				ans				
			})
	pty.root	<- do.call('rbind',pty.root)
	pty.root
}


pty.evaluate.tree.root.withfill<- function(ptyfiles, pty.ph, pty.runs)
{
	pty.root	<- lapply(ptyfiles[, unique(PTY_RUN)], function(ptyr){	
				#print(ptyr)	
				#ptyr<- 15
				tmp			<- subset(ptyfiles, PTY_RUN==ptyr)[,FILE]
				phs			<- lapply(tmp, function(x) pty.ph[[x]])	
				names(phs)	<- tmp
				#get patristic distances between target and fill
				#if no target, select target_ind with lowest taxon index (just makes an unambiguous selection when windows are considered one by one)
				phpd		<- do.call('rbind',lapply(seq_along(phs),function(i){
									#i			<- 1
									print(i)
									ph			<- phs[[i]]							
									phb			<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label))				
									phb			<- merge(phb, pty.runs[pty.runs$PTY_RUN==ptyr,], by='FILE_ID', all=1)
									phgd		<- cophenetic.phylo(ph)
									tmp			<- subset(phb, !FILL & !is.na(BAM))[, BAM]
									if(length(tmp)==0)
									{
										tmp		<- subset(phb, !is.na(BAM))[, FILE_ID[which.min(TX_IDX)]]
										tmp		<- subset(phb, !is.na(BAM) & FILE_ID==tmp)[, BAM]
									}								
									tmp			<- phgd[ tmp, setdiff(rownames(phgd), tmp), drop=FALSE]
									stopifnot(ncol(tmp)>0)								
									ans			<- as.data.table(melt(tmp, value.name='PATR'))
									setnames(ans, c('Var1','Var2'), c('TARGET','FILL'))
									ans[, FILE:= names(phs)[i]]	
									ans[, IDX:=i]							
									ans							
								}))
				#print(phpd[, unique(IDX)])				
				phpd[, FILL_IND:= gsub('_read.*','',FILL)]				
				#print(phpd)		
				#	try consider as root only individual present across all BAM files
				tmp		<- phpd[, list(FILL_IND_N=length(FILL)), by=c('FILL_IND','FILE')]
				tmp		<- dcast.data.table(tmp, FILE~FILL_IND, value.var='FILL_IND_N')
				tmp		<- apply(tmp[,-1, with=FALSE], 2, function(x) !any(is.na(x))	)
				tmp		<- data.table(FILL_IND=names(tmp)[tmp])
				if(nrow(tmp)==0)	# 	may be empty
				{
					print(PTY_RUN)
					stop()
				}				
				tmp		<- merge(phpd, tmp, by='FILL_IND')
				#	select individual with average largest distance
				tmp		<- tmp[, list(PATR= median(PATR)), by='FILL_IND']	
				root	<- tmp[which.max(PATR),][,FILL_IND]
				ans		<- subset(phpd, FILL_IND==root)[, list(ROOT=FILL[which.max(PATR)]), by=c('IDX','FILE')]
				tmp		<- ans[, list(CHECK= ROOT%in%phs[[IDX]]$tip.label) , by='IDX']
				stopifnot( tmp[, all(CHECK)] )		
				ans[, IDX:=NULL]
				ans[, PTY_RUN:= ptyr]
				ans
			})
	pty.root	<- do.call('rbind',pty.root)
	pty.root
}


pty.stat.different<- function(ph)
{
	phb			<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label))				
	phb[, COUNT:= as.numeric(gsub('count_','',regmatches(BAM, regexpr('count_[0-9]+',BAM))))]				
	phm			<- phb[, list(MRCA=as.numeric(getMRCA(ph, IDX))), by='FILE_ID']
	#	find longest branch and diversity in clade below
	phm			<- phm[, {
				#FILE_ID<- '13554_1_18'
				#MRCA<- 2331
				z	<- extract.clade(ph, MRCA, root.edge=1)
				#plot(z, show.tip.label=0)
				#add.scale.bar()
				# tips in same clade have consecutive tip indices in ape format
				df	<- data.table(IDX=seq_along(z$tip.label), BAM=z$tip.label, FILE_ID= gsub('_read.*','',z$tip.label))
				df	<- merge(df, data.table(IDX= df$IDX[which(df$FILE_ID!=FILE_ID)]), by='IDX')
				tmp	<- c(nrow(df), df[, length(unique(FILE_ID))])
				if(tmp[1])
				{
					
					df[, GROUP:= cumsum(c(1,as.numeric(diff(IDX)>1)))]
					df[, GROUP:= df[, as.numeric(factor(paste(FILE_ID,'-',GROUP,sep='')))]]
					df[, GROUP:= df[, cumsum(c(1,as.numeric(diff(GROUP)>1)))]]
					tmp[1]	<- df[, length(unique(GROUP))]					
				}
				list(DIFF=tmp[1], DIFF_IND=tmp[2] )								
			}, by='FILE_ID']
	phm
}


pty.stat.maxlocalsep<- function(ph, threshold.min.stem=0.01)
{
	phb			<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label))				
	phb[, COUNT:= as.numeric(gsub('count_','',regmatches(BAM, regexpr('count_[0-9]+',BAM))))]				
	phm			<- phb[, list(MRCA=as.numeric(getMRCA(ph, IDX))), by='FILE_ID']
	#	find longest branch and diversity in clade below
	phm			<- phm[, {
				#FILE_ID<- 'R3_res567_S22_L001'; MRCA<- 1213
				#print(MRCA)
				z		<- extract.clade(ph, MRCA, root.edge=1)
				z		<- drop.tip(z, z$tip.label[ !grepl(FILE_ID, z$tip.label) ], root.edge=1)
				#plot(z, cex=0.2)
				#add.scale.bar()
				# find largest internal branch							
				df		<- data.table(E_IDX=which(z$edge[,2]>Ntip(z)))
				df[, LEN:= z$edge.length[E_IDX]]
				df[, TO:= z$edge[E_IDX,2]]
				df		<- subset( df[order(-LEN),], LEN>threshold.min.stem )
				ans		<- data.table(E_IDX=NA_integer_, CL_MX_LOCAL_SEP=NA_real_, CL_AVG_HEIGHT_UNIQUE=NA_real_, CL_AVG_HEIGHT_ALL=NA_real_, CL_TAXA_N=NA_integer_, CL_COUNT_N=NA_real_)
				if(nrow(df))
					ans	<- df[, {
							tmp	<- extract.clade(z, TO, root.edge=0)
							tmp2<- node.depth.edgelength(tmp)[seq_len(Ntip(tmp))]	
							tmp3<- merge(phb,data.table(BAM=tmp$tip.label),by='BAM')[, COUNT]
							list(	CL_MX_LOCAL_SEP=LEN, 
									CL_AVG_HEIGHT_UNIQUE=mean(tmp2),
									CL_AVG_HEIGHT_ALL=mean(rep(tmp2,tmp3)),
									CL_TAXA_N=Ntip(tmp),
									CL_COUNT_N=sum(tmp3))
						}, by='E_IDX']
				ans[, MRCA:=MRCA]				 
				ans[, TAXA_N:=Ntip(z)]
				ans[, COUNT_N:=merge(phb,data.table(BAM=z$tip.label),by='BAM')[, sum(COUNT)]]
				ans
			}, by='FILE_ID']
	subset(phm, !is.na(E_IDX))
}


pty.stat.withinhost.diversity<- function(ph, coi.div.probs=c(0.02,0.25,0.5,0.75,0.98))
{
	phb			<- data.table( BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label))				
	phb[, COUNT:= as.numeric(gsub('count_','',regmatches(BAM, regexpr('count_[0-9]+',BAM))))]	
	phgd		<- cophenetic.phylo(ph)	
	phgd		<- as.data.table(melt(phgd, value.name='PATR'))
	setnames(phgd, c('Var1','Var2'), c('BAM','BAM2'))
	phgd		<- merge(phb, phgd, by='BAM')
	setnames(phgd, c('BAM','FILE_ID','COUNT','BAM2'), c('BAM1','FILE_ID1','COUNT1','BAM'))
	phgd		<- merge(phb, phgd, by='BAM')
	setnames(phgd, c('BAM','FILE_ID','COUNT'), c('BAM2','FILE_ID','COUNT2'))
	phgd		<- subset(phgd, FILE_ID1==FILE_ID)
	phgd[, FILE_ID1:= NULL]
	tmp			<- phgd[, list(READS='UNIQUE', P=coi.div.probs, QU=quantile(PATR,p=coi.div.probs)), by='FILE_ID']
	ans			<- phgd[, list(READS='ALL', P=coi.div.probs, QU=quantile(rep(PATR,COUNT1*COUNT2),p=coi.div.probs)), by='FILE_ID']
	ans			<- rbind(ans,tmp)
	set(ans, NULL, 'P', ans[, paste('WHD_p',P*100,sep='')])
	ans			<- dcast.data.table( ans, READS+FILE_ID~P, value.var='QU')
	tmp			<- phb[, list(READS='ALL', N=sum(COUNT)), by='FILE_ID']
	tmp			<- rbind(tmp, phb[, list(READS='UNIQUE', N=length(BAM)), by='FILE_ID'])
	ans			<- merge(ans,tmp,by=c('READS','FILE_ID'))	
}


pty.stat.collect	<- function(indir, outdir=indir, outfile=file.path(outdir,'ptyr_examl_stat.rda'))
{
	infiles		<- data.table(FILE=list.files(indir, pattern='examl.rda$'))
	infiles[, PTY_RUN:= as.numeric(gsub('ptyr','',sapply(strsplit(FILE,'_'),'[[',1)))]
	tmp			<- data.table(FILE_STAT=list.files(indir, pattern='examl_stat.rda$'))
	tmp[, FILE:= gsub('_stat','',FILE_STAT)]
	infiles		<- merge(infiles, tmp, by='FILE',all.x=1)	
	setkey(infiles, PTY_RUN)
	if( infiles[, any(is.na(FILE_STAT))] )
		cat('\nwarning stat files not available for=',paste( subset(infiles, is.na(FILE_STAT))[, FILE], collapse=', '))	
	infiles		<- subset(infiles, !is.na(FILE_STAT))	
	pty.stat	<- do.call('rbind',lapply(seq_len(nrow(infiles)), function(i){
						load( gsub('\\.rda','_stat\\.rda', file.path(indir, infiles[i,FILE])) )				
						pty.stat
					}))	
	save(pty.stat, file=outfile)
}



pty.stat.monophyletic.clades<- function(ph)
{
	phb			<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, IND= gsub('_read.*','',ph$tip.label), REF=grepl(references.pattern,ph$tip.label))
	set(phb, phb[, which(REF)],'IND','REFERENCE')
	#	for each patient define mrca (MRCA)
	#	for each patient, 
	#		determine MRCA (mrca)
	#		calculate number of other individuals in clade below MRCA (diff)	
	z			<- phb[, {
				#IND<- "BEE0016-1"
				#IDX<- subset(phb, IND=="BEE0016-1")[, IDX]
				print(IND)
				mrca	<- IDX		
				diff	<- 0L
				if(length(IDX)>1)
				{
					mrca	<- as.integer(getMRCA(ph, IDX))
					tmp		<- extract.clade(ph, mrca, root.edge=1)
					#print(tmp)
					diff	<- length(setdiff(gsub('_read.*','',tmp$tip.label), IND))								
				}												
				list(MRCA=mrca, DIFF_IND=diff)							
			}, by=c('IND')]	
	z			<- subset(z, IND!='REFERENCE')
	#	for all patients that are not monophyletic, there could be several clades
	#		trace back all ancestors between tips of other individuals and MRCA
	#		for each such tip, construct the unique path to MRCA that is not on a previous path
	#		find change points on these unique paths below which the subtree contains reads from the current patient
	#		the idea is that all monophyletic clades of this patient must break off from one of these paths	
	tmp			<- subset(z, DIFF_IND>0, c(IND, MRCA))
	if(!nrow(tmp))
		z[, CLADE:=NA_integer_]
	if(nrow(tmp))
	{
		#	find all tips that belong to another patient than the current individual
		tmp			<- tmp[, {
					if(MRCA<=Ntip(ph))
						tmp	<- ph$tip.label[MRCA]
					if(MRCA>Ntip(ph))
						tmp	<- extract.clade(ph,MRCA)$tip.label
					list(MRCA=MRCA, MRCA_IND=IND, BAM=tmp)	
				}, by='IND']
		tmp[, IND:=NULL]
		zz			<- merge(tmp, subset(phb, select=c(BAM, IND, IDX)), by='BAM')
		zz			<- subset(zz, MRCA_IND!=IND)
		#	determine change points
		zz			<- zz[, {
					#IDX<- c(980,910,912,950); MRCA<- 1580; IND<- 'R1_RES669_S20_L001'; MRCA_IND<- 'R1_RES827_S23_L001'					
					#	determine paths to MRCA from each tip
					anc.group		<- Ancestors(ph, IDX)	
					if(!is.list(anc.group))
						anc.group	<- list(anc.group)
					anc.group	<- lapply(seq_along(anc.group), function(i)  anc.group[[i]][ seq_len(which(anc.group[[i]]==MRCA[1])-1)] )							
					#	determine unique paths until we hit a path that is already visited
					anc.join	<- lapply(seq_along(anc.group), function(i){	unique(unlist(lapply(seq_len(i), function(j) anc.group[[j]])))	})
					anc.join	<- c(NA,anc.join)
					anc.group	<- lapply(seq_along(anc.group), function(i)	setdiff(anc.group[[i]],anc.join[[i]])	)
					#	check which clades defined by the mrcas on the ancestor path contain at least one read from MRCA_IND
					tmp			<- lapply(seq_along(anc.group), function(i) sapply(anc.group[[i]], function(j)	any(grepl(MRCA_IND, extract.clade(ph,j)$tip.label))		)	)				
					#	determine lowest node (counting from tips) which contains at least one read from MRCA_IND
					tmp			<- lapply(seq_along(tmp), function(i){
								ans		<- NA_integer_	
								tmp2	<- integer(0)
								if(length(tmp[[i]]))
									tmp2<- which(tmp[[i]]) 
								if(length(tmp2))
									ans	<- as.integer(anc.group[[i]][tmp2[1]])										
								ans
							})				
					list(IDX=IDX, CHANGE_NODE=unlist(tmp))
				},by=c('IND','MRCA','MRCA_IND')]
		zz			<- subset(zz, !is.na(CHANGE_NODE))
		#	each ancestor before a change node could have as one of its children a monophyletic clade from this patient
		setkey(zz, MRCA_IND, CHANGE_NODE)
		zz	<- unique(zz)
		zz	<- zz[, {
					#CHANGE_NODE<- 2053; MRCA<- 1580; MRCA_IND<- 'R1_RES827_S23_L001'
					#MRCA<- 1212; MRCA_IND<- 'R1_RES669_S20_L001'; CHANGE_NODE<- 1218
					#	the first two potentially monophyletic clades are the children of the CHANGE_NODE
					tmp			<- Children(ph, CHANGE_NODE)					
					#	define path from change node to just before mrca 					
					path		<- c(CHANGE_NODE, setdiff( Ancestors(ph,CHANGE_NODE), c(MRCA,Ancestors(ph,MRCA)) ))
					#	monophyletic clades could break off the path ending at MRCA
					#	the mrca's of these clades are the siblings of the path constructed above 
					pot.clade	<- c(tmp, unlist(Siblings(ph, path)))
					#	check if potential clades are monophyletic
					tmp			<- sapply(pot.clade, function(x){
								if(x<=Ntip(ph))
									ans	<- grepl(MRCA_IND, ph$tip.label[x])
								if(x>Ntip(ph))
									ans	<- all(grepl(MRCA_IND, extract.clade(ph,x)$tip.label))
								ans
							})	
					#	return monophyletic clades
					list(CLADE=as.integer(pot.clade[tmp]))
				}, by=c('MRCA_IND','CHANGE_NODE')]
		zz[, CHANGE_NODE:=NULL]
		setnames(zz,'MRCA_IND','IND')
		#	merge all monophyletic clades
		z	<- merge(z, unique(zz), all=1, by='IND', allow.cartesian=TRUE)
	}	
	tmp	<- z[, which(DIFF_IND==0)]
	set(z, tmp, 'CLADE', z[tmp,MRCA])
	#	double check CLADEs (the MONOPH_PA condition is key)
	if(nrow(z))
	{
		tmp	<- z[,{
					if(CLADE<=Ntip(ph))
						tmp	<- grepl(IND, ph$tip.label[CLADE])
					if(CLADE>Ntip(ph))
						tmp	<- all(grepl(IND, extract.clade(ph,CLADE)$tip.label))
					list(MONOPH=tmp, MONOPH_PA=all(grepl(IND, extract.clade(ph,Ancestors(ph, CLADE, type="parent"))$tip.label)))
				}, by=c('IND','CLADE')]
		stopifnot(tmp[, all(MONOPH)], tmp[, !any(MONOPH_PA)])	
	}	
	z	
}


pty.stat.monophyletic.clades.using.mxpars<- function(ph)
{
	phb			<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, IND= gsub('_read.*','',ph$tip.label), REF=grepl(references.pattern,ph$tip.label))
	set(phb, phb[, which(REF)],'IND','REFERENCE')
	#phb[, COUNT:= as.numeric(gsub('count_','',regmatches(BAM, regexpr('count_[0-9]+',BAM))))]
	#	consecutive tips of the same individual could define separate clades
	phb[, GROUP:=cumsum(c(1,as.numeric(diff(as.numeric(factor(IND)))!=0)))]
	#	for each potential clade, 
	#		determine MRCA (mrca)
	#		calculate number of other individuals in clade below MRCA (diff)
	#		get node index of ancestor of MRCA (mrcaa)
	z			<- phb[, {
				#print(GROUP)
				mrca	<- IDX		
				diff	<- 0L
				mrcaa	<- NA_real_
				if(length(IDX)>1)
				{
					mrca	<- as.integer(getMRCA(ph, IDX))
					tmp		<- extract.clade(ph, mrca, root.edge=1)
					#print(tmp)
					diff	<- length(setdiff(gsub('_read.*','',tmp$tip.label), IND))								
				}												
				#ancestor of MRCA
				mrcaa	<- ph$edge[which(ph$edge[,2]==mrca),1]																
				list(MRCA=mrca, DIFF_IND=diff, MRCA_ANC=mrcaa)							
			}, by=c('GROUP','IND')]
	z			<- subset(z, IND!='REFERENCE')
	#	get max parsimony assignment for edge ending in ancestor of MRCA
	set(z, NULL, 'MRCA_ANC_EDGE', z[, attr(ph, "INDIVIDUAL")[MRCA_ANC]])
	set(z, NULL, 'MONOPH', 0L)
	#	if DIFF_IND==0 and MRCA_ANC_EDGE!=IND, then we have a unique clade
	set(z, z[, which(DIFF_IND==0L & MRCA_ANC_EDGE!=IND)], 'MONOPH', 1L)
	#	no double counting for every individual: ignore nested clades
	tmp			<- z[, {
				tmp<- Descendants(ph, min(MRCA), type='all')
				list(MRCA=MRCA, SUBCLADE= MRCA%in%tmp)
			}, by='IND']
	z			<- merge(z, subset(tmp, !SUBCLADE), by=c('IND','MRCA'))
	z[, SUBCLADE:=NULL]	
	#	for each potential clade with >=1 other individuals within, there could be several clades
	#		trace back all ancestors between tips of other individuals and MRCA
	#		for each such tip, construct the unique path to MRCA that is not on a previous path
	#		find change points in max pars edge assignments along the unique paths
	#		the idea is that this finds the breaking edges that result in monophylies
	tmp			<- subset(z, DIFF_IND>0, c(GROUP, IND, MRCA))
	if(nrow(tmp))
	{
		tmp			<- tmp[, {
					if(MRCA<=Ntip(ph))
						tmp	<- ph$tip.label[MRCA]
					if(MRCA>Ntip(ph))
						tmp	<- extract.clade(ph,MRCA)$tip.label
					list(MRCA=MRCA, MRCA_IND=IND, BAM=tmp)	
				}, by='GROUP']
		zz			<- merge(tmp, subset(phb, select=c(BAM, IND, IDX)), by='BAM')
		zz			<- subset(zz, MRCA_IND!=IND)
		zz			<- zz[, {
					#IDX<- c(980,910); MRCA<- 1580; IND<- 'R1_RES669_S20_L001'; MRCA_IND<- 'R1_RES827_S23_L001'
					#IDX<- c(917); MRCA<- 2690; IND<- 'R10_105205_S83_L001'; MRCA_IND<- 'R1_RES669_S20_L001'
					#	determine paths to MRCA from each tip
					anc.group		<- Ancestors(ph, IDX)	
					if(!is.list(anc.group))
						anc.group	<- list(anc.group)
					anc.group	<- lapply(seq_along(anc.group), function(i)  anc.group[[i]][ seq_len(which(anc.group[[i]]==MRCA[1])-1)] )							
					#	determine unique paths until we hit a path that is already visited
					anc.join	<- lapply(seq_along(anc.group), function(i){	unique(unlist(lapply(seq_len(i), function(j) anc.group[[j]])))	})
					anc.join	<- c(NA,anc.join)
					anc.group	<- lapply(seq_along(anc.group), function(i)	setdiff(anc.group[[i]],anc.join[[i]])	)
					#	determine max parsimony assignment along unique branches
					#	to find patient-clades within the subtree containing all patient taxa, it does not matter if branches change *between* other individuals
					mxp.assign	<- as.character(attr(ph, "INDIVIDUAL"))
					mxp.assign[ mxp.assign!=MRCA_IND[1] ]	<- 'OTHER'
					tmp			<- ifelse(IND==MRCA_IND[1], IND, 'OTHER')
					tmp			<- lapply(seq_along(anc.group), function(i)	c(tmp, mxp.assign[ anc.group[[i]] ]) )
					#	count changes in max parsimony assignment along each branch
					mxp.changes	<- lapply(seq_along(tmp), function(i)	sum( as.numeric(diff(as.numeric(factor(tmp[[i]])))!=0) )	)							
					#	determine lowest node at which change occurs
					tmp			<- lapply(seq_along(tmp), function(i){
								tmp2<- tail(which(diff(as.numeric(factor(tmp[[i]])))!=0),1) #index of ancestor *below* which change occurs
								if(length(tmp2))
									tmp2<- as.integer(anc.group[[i]][tmp2])										
								if(!length(tmp2))
									tmp2<- NA_integer_
								tmp2
							})				
					list(IDX=IDX, MRCA_IND=MRCA_IND, CHANGES_ON_UNIQUE_PATH=unlist(mxp.changes), CHANGE_NODE=unlist(tmp))
				},by=c('GROUP','IND','MRCA')]
		zz			<- subset(zz, CHANGES_ON_UNIQUE_PATH>0)
		#
		#	each ancestor before a change node could have as one of its children a monophyletic clade
		#	
		setkey(zz, GROUP, CHANGE_NODE)
		zz	<- unique(zz)
		zz	<- zz[, {
					#CHANGE_NODE<- 2053; MRCA<- 1580; MRCA_IND<- 'R1_RES827_S23_L001'
					#cat(CHANGE_NODE)
					#	path from change node to just before mrca; we will be checking the sibling of these so this is why the mrca is excluded
					path		<- c(CHANGE_NODE, setdiff( Ancestors(ph,CHANGE_NODE), c(MRCA,Ancestors(ph,MRCA)) ))
					#	the first potentially monophyletic clade must be calculated in a different way							
					tmp			<- ph$edge[ph$edge[,1]==CHANGE_NODE,2]
					tmp2		<- which(as.character(attr(ph, "INDIVIDUAL"))[tmp]==MRCA_IND)
					#	collect the potentially monophyletic clades along the path
					pot.clade	<- c(tmp[tmp2], unlist(Siblings(ph, path)))
					#	check if potential clades are monophyletic
					tmp			<- sapply(pot.clade, function(x){
								if(x<=Ntip(ph))
									ans	<- grepl(MRCA_IND, ph$tip.label[x])
								if(x>Ntip(ph))
									ans	<- all(grepl(MRCA_IND, extract.clade(ph,x)$tip.label))
								ans
							})	
					#	return monophyletic clades
					list(CLADE=as.integer(pot.clade[tmp]))
				}, by=c('GROUP','CHANGE_NODE')]
		zz[, CHANGE_NODE:=NULL]
		#	merge all monophyletic clades
		z	<- merge(z, unique(zz), all=1, by='GROUP', allow.cartesian=TRUE)
	}	
	tmp	<- z[, which(MONOPH==1)]
	set(z, tmp, 'CLADE', z[tmp,MRCA])
	subset(z, select=c(IND, MRCA_ANC_EDGE, DIFF_IND, CLADE))
}


pty.stat.avgnodedepth.and.counts<- function(ph, mrca, select=NA)
{
	if(mrca<=Ntip(ph))
	{
		phb		<- data.table(	BAM=ph$tip.label[mrca],
								IND=gsub('_read.*','',ph$tip.label[mrca]),
								COUNT=as.numeric(gsub('count_','',regmatches(ph$tip.label[mrca], regexpr('count_[0-9]+',ph$tip.label[mrca])))),											
								DEPTH=0	)
	}
	if(mrca>Ntip(ph))
	{
		ph		<- extract.clade(ph,mrca)
		phb		<- data.table(	BAM=ph$tip.label,
								IND=gsub('_read.*','',ph$tip.label),
								COUNT= as.numeric(gsub('count_','',regmatches(ph$tip.label, regexpr('count_[0-9]+',ph$tip.label))))	)
		phb[, DEPTH:=node.depth.edgelength(ph)[seq_len(Ntip(ph))]]
	}		
	if(!is.na(select))
		phb		<- subset(phb, grepl(select,IND))
	list(AVG_ROOT2TIP=phb[, sum(DEPTH*COUNT) / sum(COUNT)], READ_N=phb[, sum(COUNT)], UREAD_N=phb[, length(COUNT)])	
}


pty.stat.superinfections.160208<- function(pty.ph, ptyfiles, references.pattern='REF')
{	
	#	@CF: with data.table, the df[, {}, by=BY] syntax specifies the columns BY to loop over
	#	@CF: the return variables of a data.table block {} are either a named list, or another data.table
	#	@CF: one difference to data.frame is that the column names are available as variable names inside the block {}
	
	#	For each phylotype run (PTY_RUN), each patient (IND), each window (W_FROM, W_TO):
	#	- find the root node (CLADE) defining all monophyletic clades		
	stat.clades	<- ptyfiles[, {
				#FILE		<- subset(ptyfiles, W_FROM==661)[,FILE]
				#FILE<- 'ptyr22_InWindow_481_to_540_dophy_examl.newick'
				cat('\n',FILE)
				ph			<- pty.ph[[FILE]]
				pty.stat.monophyletic.clades(ph)							
			}, by=c('PTY_RUN','W_FROM','W_TO','FILE')]
	gc()	
	#	For each monophyletic clade:
	#	- calculate the mean node root-to-tip within clade, weighted by read count
	#	- calculate the number of reads
	#	- calculate the number of unique reads
	tmp			<- stat.clades[, {
				#tmp<- subset(stat.clades, W_FROM==661); FILE<- tmp[,FILE]
				#FILE<- 'ptyr22_InWindow_661_to_720_dophy_examl.newick'; CLADE<- 2066
				#cat('\n',FILE)
				ph			<- pty.ph[[FILE]]
				tmp			<- pty.stat.avgnodedepth.and.counts(ph, CLADE, select=NA)	
				names(tmp)	<- paste(names(tmp),'_CLADE',sep='')				
				tmp
			}, by=c('PTY_RUN','W_FROM','W_TO','FILE','CLADE')]
	stat.clades	<- merge(stat.clades, tmp, by=c('PTY_RUN','W_FROM','W_TO','FILE','CLADE'))
	gc()	
	#	For each phylotype run (PTY_RUN), each patient (IND), each window (W_FROM, W_TO):
	#	- calculate the average root to tip distance, weighted by read count
	#	  this requires the MRCA which can be expensive but has already been computed
	#	- calculate the total number of reads 
	#	- calculate the number of unique reads
	setkey(stat.clades, PTY_RUN, W_FROM, W_TO, FILE, IND)
	tmp			<- subset(unique(stat.clades), select=c(PTY_RUN, W_FROM, W_TO, FILE, IND, MRCA))
	stat.ind	<- tmp[,{
				ph			<- pty.ph[[FILE]]
				pty.stat.avgnodedepth.and.counts(ph, MRCA, select=IND)				
			}, by=c('PTY_RUN','W_FROM','W_TO','FILE','IND')]
	set(stat.ind, NULL, 'UREAD_N', stat.ind[, as.numeric(UREAD_N)])
	#	For each phylotype run (PTY_RUN), each patient (IND), each window (W_FROM, W_TO):
	#	- calculate number of clades
	tmp			<- stat.clades[, list(CLADE_N=as.numeric(length(CLADE))), by=c('PTY_RUN','W_FROM','W_TO','FILE','IND')]	
	stat.ind	<- merge(stat.ind, tmp, by=c('PTY_RUN','W_FROM','W_TO','FILE','IND'))
	#	For each monophyletic clade:
	#	- calculate the proportion of reads in the clade
	#	- calculate clade order by proportion
	stat.clades	<- merge(stat.clades, subset(stat.ind, select=c(PTY_RUN, W_FROM, W_TO, FILE, IND, READ_N)), by=c('PTY_RUN','W_FROM','W_TO','FILE','IND'))	
	stat.clades[, READ_P_CLADE:= READ_N_CLADE/READ_N]
	stat.clades[, READ_N:= NULL]
	stat.clades	<- stat.clades[ order(PTY_RUN, W_FROM, W_TO, FILE, IND, -READ_P_CLADE), ]	
	tmp			<- stat.clades[, list(ORDER_CLADE= seq_along(CLADE), CLADE=CLADE), by=c('PTY_RUN','W_FROM','W_TO','FILE','IND')]
	stat.clades	<- merge(stat.clades, tmp, by=c('PTY_RUN','W_FROM','W_TO','FILE','IND','CLADE'))
	#
	#	check that all reads are in one clade
	#
	tmp			<- stat.clades[, list(READ_N_CHECK=sum(READ_N_CLADE)), by=c('PTY_RUN','W_FROM','W_TO','FILE','IND')]	
	tmp			<- merge(tmp, subset(stat.ind, select=c(PTY_RUN, W_FROM, W_TO, FILE, IND, READ_N)), by=c('PTY_RUN','W_FROM','W_TO','FILE','IND'))
	stopifnot( !nrow(subset(tmp, READ_N_CHECK<READ_N))	)
	list(stat.clades=stat.clades, stat.ind=stat.ind)
}


pty.stat.superinfections.160208.plot<- function(stat.clades, stat.ind, outfile, plot.max.clade=5)
{			
	#
	#	print by patient
	#
	
	#	@CF: To get a sub-plot for each statistic using 'facet_grid', I melt the data.table
	#	@CF: the 'set' command looks a bit weird, but is great because it avoids creating new R objects via '<-'
	
	#	melt the statistics on individuals
	stat.plot	<- melt(stat.ind, measure.vars=c('READ_N','UREAD_N','CLADE_N','AVG_ROOT2TIP'), id.vars=c('PTY_RUN','W_FROM','W_TO','FILE','IND'), value.name='V', variable.name='STAT')
	#	melt the statistics on clades
	tmp			<- melt(stat.clades, measure.vars=c('READ_P_CLADE','AVG_ROOT2TIP_CLADE'), id.vars=c('PTY_RUN','W_FROM','W_TO','FILE','IND','ORDER_CLADE'), value.name='V', variable.name='STAT')
	set(tmp, NULL, 'STAT', tmp[,gsub('_CLADE','',STAT)])
	#	select most prevalent clades and put all together
	stat.plot	<- rbind( stat.plot, subset(tmp, ORDER_CLADE<=plot.max.clade), use.names=TRUE, fill=TRUE )
	#	give things nice names
	set(stat.plot, NULL, 'ORDER_CLADE', stat.plot[, as.character(ORDER_CLADE)])
	tmp			<- stat.plot[, which(!is.na(ORDER_CLADE))]
	set(stat.plot, tmp, 'ORDER_CLADE', stat.plot[tmp, paste('Clade',ORDER_CLADE)])
	tmp			<- stat.plot[, sort(na.omit(unique(ORDER_CLADE)))]
	set(stat.plot, stat.plot[, which(is.na(ORDER_CLADE))], 'ORDER_CLADE', 'Individual')
	set(stat.plot, NULL, 'ORDER_CLADE', stat.plot[, factor(ORDER_CLADE, levels=c('Individual',tmp), labels=c('Individual',tmp))])	
	set(stat.plot, NULL, 'STAT', stat.plot[,factor(STAT, levels=c('READ_N','UREAD_N','CLADE_N','READ_P','AVG_ROOT2TIP'), labels=c('Reads\n(#)', 'Tips\n(#)','Monophyletic\nclades\n(#)','Reads\n(%)','Root-to-tip\npatristic distance\n(mean subst/site)'))])
	#	
	#	setup plots per patient	
	#
	ps	<- lapply(stat.plot[, unique(IND)], function(x){
				#x<- 'R1_RES669_S20_L001'
				cat('\n',x)
				tmp			<- subset(stat.plot, IND==x)
				set(tmp, NULL, 'IND', tmp[,paste('run:',PTY_RUN,', individual:',IND)])
				col			<- c('black',rainbow_hcl(tmp[, length(unique(ORDER_CLADE))-1], start = 270, end = -30, c=100, l=50))
				names(col)	<- tmp[, unique(ORDER_CLADE)]	
				setkey(tmp, PTY_RUN, W_FROM, IND, ORDER_CLADE)
				p			<- ggplot(tmp , aes(x=W_FROM, y=V)) +
						geom_point(data=subset(tmp, !grepl('Reads\n(%)',STAT)), size=0.5, aes(colour=ORDER_CLADE)) +
						geom_bar(data=subset(tmp, STAT=='Reads\n(%)'), stat='identity', aes(fill=ORDER_CLADE)) +
						geom_line(data=subset(tmp, grepl('Root-to-tip',STAT)), aes(colour=ORDER_CLADE, group=ORDER_CLADE)) +
						scale_colour_manual(values=col) +
						scale_fill_manual(values=col, guide=FALSE) +
						scale_x_continuous(breaks=seq(0,12e3,1e3), minor_breaks=seq(0,12e3,2e2), expand=c(0,0), lim=c(1,stat.plot[, max(W_TO)+1])) +
						theme_bw() + labs(x='Genome location', y='', colour='', fill='') +
						facet_grid(STAT~IND, scale='free_y')				
			})
	cat('\nPlot to',outfile)
	pdf(file=outfile, w=9, h=9)			
	for(p in ps)
		print(p)					
	dev.off()	
}


pty.stat.all.160128	<- function(pty.ph, ptyfiles)
{
	#
	#	quantiles of within individual diversity (patristic distance)
	#	calculated among unique reads, and all reads (expand by count of each read)		
	coi.div		<- ptyfiles[, {
				#FILE		<- subset(ptyfiles, W_FROM==6961)[,FILE]
				ph			<- pty.ph[[FILE]]
				pty.stat.withinhost.diversity(ph)
			}, by=c('PTY_RUN','W_FROM','W_TO','FILE')]
	coi.div[, N:=NULL]
	gc()
	tmp			<- subset(coi.div, READS=='ALL')
	setnames(tmp, c("WHD_p2","WHD_p25","WHD_p50","WHD_p75","WHD_p98"), c("WHDA_p2","WHDA_p25","WHDA_p50","WHDA_p75","WHDA_p98"))
	tmp[, READS:=NULL]
	coi.div		<- subset(coi.div, READS=='UNIQUE')
	setnames(coi.div, c("WHD_p2","WHD_p25","WHD_p50","WHD_p75","WHD_p98"), c("WHDU_p2","WHDU_p25","WHDU_p50","WHDU_p75","WHDU_p98"))
	coi.div[, READS:=NULL]
	coi.div		<- merge(coi.div, tmp, by=c('PTY_RUN','W_FROM','W_TO','FILE','FILE_ID'))
	#
	#	maximum branch length within individual, and mean path length of subtending clade
	#	calculated among unique reads, and all reads (expand by count of each read)
	coi.lsep	<- ptyfiles[, {
				#FILE		<- subset(ptyfiles, W_FROM==1021)[,FILE]						
				ph			<- pty.ph[[FILE]]
				pty.stat.maxlocalsep(ph)														
			}, by=c('PTY_RUN','W_FROM','W_TO','FILE')]
	coi.div		<- merge(coi.div, coi.lsep, by=c('PTY_RUN','W_FROM','W_TO','FILE','FILE_ID'))
	coi.lsep	<- NULL
	gc()
	#
	#	Different: number of clades from different individual
	coi.diff	<- ptyfiles[, {
				#FILE		<- subset(ptyfiles, W_FROM==1621)[,FILE]
				ph			<- pty.ph[[FILE]]
				pty.stat.different(ph)								
			}, by=c('PTY_RUN','W_FROM','W_TO','FILE')]
	pty.stat	<- merge(coi.div, coi.diff, by=c('PTY_RUN','W_FROM','W_TO','FILE','FILE_ID'))
	coi.diff	<- coi.div	<- NULL
	gc()
	pty.stat
}


pty.evaluate.tree.collapse.clusters<- function(ph, thresh.brl=8e-6)
{
	#	cluster by genetic distance
	dist.brl	<- hivc.clu.brdist.stats(ph, eval.dist.btw="leaf", stat.fun=hivc.clu.min.transmission.cascade)							
	clustering	<- hivc.clu.clusterbythresh(ph, thresh.brl=thresh.brl, dist.brl=dist.brl, retval="all")			
	#print(clustering)	
	df			<- data.table(BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label), COUNT= as.numeric(gsub('count_','',regmatches(ph$tip.label, regexpr('count_[0-9]+',ph$tip.label)))),  CLU_ID=clustering$clu.mem[seq_len(Ntip(ph))])
	#df			<- merge(df, data.table(CLU_ID=seq_along(clustering$clu.idx), MRCA=clustering$clu.idx), by='CLU_ID', all.x=1)
	df			<- subset(df,!is.na(CLU_ID))
	phc			<- copy(ph)
	if(nrow(df))
	{
		#	only collapse sequences from same individual
		tmp		<- subset(df[, list(CLU_N=length(unique(FILE_ID))), by='CLU_ID'], CLU_N==1)
		df		<- merge(df, tmp, by='CLU_ID')			
		df		<- merge(df, df[, list(TAXA_N=length(FILE_ID), BAMCLU= paste( FILE_ID[1],'_clu_',CLU_ID,'_count_',sum(COUNT), sep='' ) ), by='CLU_ID'], by='CLU_ID')
		#	do not collapse if this results in singleton
		if(Ntip(phc)==nrow(df) && df[, length(unique(CLU_ID))]==1)
			df	<- subset(df, CLU_ID<0)
		for(clu.id in df[, unique(CLU_ID)])
		{
			tmp			<- subset(df, CLU_ID==clu.id)			
			z			<- drop.tip(phc, tip=tmp[, BAM], subtree=TRUE)
			if(!is.null(z))
			{
				phc		<- z
				phc$tip.label[ which(grepl("[",phc$tip.label,fixed=TRUE)) ]	<- tmp[1,BAMCLU]	
			}			 				
		}
	}	
	phc
}

pty.evaluate.tree.groupindividuals<- function(ph, phb)
{
	tmp				<- lapply( phb[, unique(FILE_ID)], function(x)	subset(phb, FILE_ID==x)[, BAM]	)
	names(tmp)		<- phb[, unique(FILE_ID)]
	ph				<- groupOTU(ph, tmp, group='INDIVIDUAL')		
	z	<- merge(data.table(FROM=ph$edge[,1],IDX=ph$edge[,2]), phb, by='IDX', all=1)
	z[, GROUP:= attr(ph,'INDIVIDUAL')[1:nrow(ph$edge)]]
	z	<- unique(subset(z, !is.na(FILE_ID), select=c(FILE_ID, GROUP)))
	attr(ph,'INDIVIDUAL')	<- factor(attr(ph,'INDIVIDUAL'), levels=c(0,z[,as.character(GROUP)]), labels=c('not characterized',z[,FILE_ID]))
	ph
}

pty.evaluate.tree.groupcandidates<- function(ph, phb)
{
	set(phb, NULL, 'FILL', phb[, factor(FILL, levels=c(0,1), labels=c('target','filler'))])
	tmp				<- lapply( phb[, unique(FILL)], function(x)	subset(phb, FILL==x)[, BAM]	)
	names(tmp)		<- phb[, unique(FILL)]
	ph				<- groupOTU(ph, tmp, group='TYPE')				
	z	<- merge(data.table(FROM=ph$edge[,1],IDX=ph$edge[,2]), phb, by='IDX', all=1)
	z[, GROUP:= attr(ph,'TYPE')[1:nrow(ph$edge)]]
	z	<- unique(subset(z, !is.na(FILE_ID), select=c(FILL, GROUP)))
	attr(ph,'TYPE')	<- factor(attr(ph,'TYPE'), levels=c(0,z[,as.character(GROUP)]), labels=c('not characterized',z[,as.character(FILL)]))
	ph
}


pty.evaluate.tree.collapse<- function(pty.runs, ptyfiles, pty.phc, outdir, thresh.brl=8e-6)
{	
	#pty.phc<- copy(pty.ph)
	#length(pty.phc)<- 10
	#	collapse nodes to make browsing of plots easier
	cat('\ncalculating cladograms')
	for(i in seq_along(pty.phc))
	{
		print(i) 
		ph				<- pty.phc[[i]]		
		#FILE<- "ptyr1_InWindow_481_to_540_dophy_examl.newick"
		#ph		<- pty.phc[[FILE]]		
		ph				<- pty.evaluate.tree.collapse.clusters(ph, thresh.brl=thresh.brl)
		phb				<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*|_clu.*','',ph$tip.label))
		#	group edges by individual
		ph				<- pty.evaluate.tree.groupindividuals(ph, phb)
		#	group edges by FILL
		tmp				<- as.numeric(gsub('ptyr','',regmatches(names(pty.ph)[i], regexpr('ptyr[0-9]+',names(pty.ph)[i]))))
		phb				<- merge(phb, subset(pty.runs, PTY_RUN==tmp), by='FILE_ID')
		ph				<- pty.evaluate.tree.groupcandidates(ph, phb)	
		pty.phc[[i]]	<- ph		
	}
	#	save
	tmp		<- file.path( outdir, ptyfiles[1, gsub('\\.newick','\\collapsed.rda',gsub('_dophy','',gsub('_InWindow_[0-9]+_to_[0-9]+','',FILE)))] )
	cat('\nsave to file', tmp)
	save(pty.phc, file=tmp)
	#	need node heights for plotting
	tmp				<- ptyfiles[,	{
								#print(FILE)
								#FILE<- "ptyr1_InWindow_481_to_540_dophy_examl.newick"
								ph		<- pty.phc[[FILE]]
								tmp		<- node.depth.edgelength(ph)[1:Ntip(ph)]
								list(BAM=ph$tip.label, HEIGHT=tmp)
							}, by='FILE']	
	ptyfiles		<- merge(ptyfiles, tmp[, list(HMX= max(HEIGHT)), by='FILE'], by='FILE')
	setkey(ptyfiles, PTY_RUN, W_FROM)	
	#ptyfiles			<- subset(ptyfiles, PTY_RUN==1)
	for(ptyr in ptyfiles[, unique(PTY_RUN)])
	{
		#ptyr<- 15
		cat('\nplotting cladograms to R for',ptyr)
		tmp			<- subset(ptyfiles, PTY_RUN==ptyr)
		#	title
		tmp[, TITLE:=paste('run',PTY_RUN,', window [',W_FROM,',',W_TO,']',sep='')]
		setkey(tmp, W_FROM)
		phs			<- lapply(tmp[, FILE], function(x) pty.phc[[x]])
		names(phs)	<- tmp[, TITLE] 
		#	colours	
		tmp2		<- setdiff(sort(unique(unlist(lapply(seq_along(phs), function(i)	levels(attr(phs[[i]],'INDIVIDUAL'))	)))),'not characterized')
		col			<- c('black',rainbow_hcl(length(tmp2), start = 270, end = -30, c=100, l=50))
		names(col)	<- c('not characterized',tmp2)	
		phps		<- lapply(seq_along(phs), function(i){					
					max.node.height	<- tmp[i,][, HMX]
					df				<- data.table(	BAM=phs[[i]]$tip.label, IDX=seq_along(phs[[i]]$tip.label), 
													COUNT= as.numeric(gsub('count_','',regmatches(phs[[i]]$tip.label, regexpr('count_[0-9]+',phs[[i]]$tip.label)))), 
													CLU=grepl('_clu',phs[[i]]$tip.label), 
													FILE_ID= gsub('_read.*|_clu.*','',phs[[i]]$tip.label))
					p				<- ggtree(phs[[i]], aes(color=INDIVIDUAL, linetype=TYPE)) %<+% df + 
													geom_nodepoint(size=phs[[i]]$node.label/100*3) +
													geom_tippoint(aes(size=COUNT, shape=CLU)) +
													geom_tiplab(size=1.2,  hjust=-.1) +							 
													scale_color_manual(values=col, guide = FALSE) +
													scale_shape_manual(values=c(20,18), guide=FALSE) +
													scale_size_area(guide=FALSE) +							
													scale_linetype_manual(values=c('target'='solid','filler'='dotted'),guide = FALSE) +
													theme_tree2() +
													theme(legend.position="bottom") + ggplot2::xlim(0, max.node.height*1.3) +
													labs(x='subst/site', title=names(phs)[i])
					p					
				})	
		names(phps)	<- names(phs)
		file	<- file.path( outdir, tmp[1,gsub('.newick','collapsed.pdf',gsub('_dophy','',gsub('_InWindow_[0-9]+_to_[0-9]+','',FILE)))] )
		cat('\nplotting cladograms to file',file)
		if(1)		#for window length 60 (multiple pages)
		{				
			tmp			<- seq_len(ceiling(length(phps)/10))
			pdf(file=file, w=20, h=40)		#for win=60
			for(i in tmp)
			{		
				grid.newpage()
				pushViewport(viewport(layout=grid.layout(2, 5)))
				z	<- intersect(seq.int((i-1)*10+1, i*10), seq_len(length(phps)))
				for(j in z)
					print(phps[[j]], vp = viewport(layout.pos.row=(ceiling(j/5)-1)%%2+1, layout.pos.col=(j-1)%%5+1))				
			}
			dev.off()	
		}	
	}
}

#' @import ape data.table gridExtra colorspace ggtree

pty.evaluate.tree<- function(indir, pty.runs=NULL, outdir=indir, select='', outgroup=NA, references.pattern='REF', run.pattern='ptyr', tree.pattern='newick$', rm.newick=FALSE, rm.fasta=FALSE, plot.trees.per.page=10, plot.w=20, plot.h=40)
{	
	if(0)
	{
		#indir				<- "~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/coinf_ptoutput_150121"
		#indir				<- "~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/coinf_ptoutput_150201"
		indir				<- "~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/coinf_ptoutput_UG60"
		outdir				<- indir
		pty.runs			<- NULL
		#pty.infile			<- "~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/data/PANGEA_HIV_n5003_Imperial_v160110_UG_gag_coinfinput_160219.rda"
		select				<- '^ptyr1_In'
		outgroup			<- "CPX_AF460972"
		references.pattern	<- 'REF'
		run.pattern			<- 'ptyr'
		tree.pattern		<- '_dtbs10.newick$'
		rm.newick			<- FALSE
		rm.fasta			<- FALSE
		plot.trees.per.page	<- 4
		plot.w				<- 20
		plot.h				<- 40
	}
	#
	#	the first part pre-processes trees and saves them to file
	#	the second part plots the trees
	#	@CF: to do the plotting by hand, load the pre-processed rda file and continue with PART 2
	#
	#	PART 1
	#
	stopifnot(!is.null(pty.runs) || (is.null(pty.runs) & !is.na(outgroup)))
	#	collect ML tree file names
	ptyfiles		<- data.table(FILE=list.files(indir, tree.pattern))
	#	define phylotype run ID
	#	Just 1 if no run pattern provided
	if(nchar(run.pattern))
		ptyfiles[, PTY_RUN:= as.numeric(gsub(run.pattern,'',sapply(strsplit(FILE,'_'),'[[',1)))]
	if(!nchar(run.pattern))
		ptyfiles[, PTY_RUN:=1L]
	#	Extract window coordinates
	ptyfiles[, W_FROM:= as.numeric(gsub('InWindow_','',regmatches(FILE,regexpr('InWindow_[0-9]+',FILE))))] 
	ptyfiles[, W_TO:= as.numeric(gsub('to_','',regmatches(FILE,regexpr('to_[0-9]+',FILE))))]
	#	Select just a few files for processing; does nothing if select==''
	ptyfiles		<- subset(ptyfiles, grepl(select,FILE))
	setkey(ptyfiles, PTY_RUN, W_FROM)
	cat('\nfound tree files, n=',nrow(ptyfiles))
	#	
	#	read raw trees in newick format from tree file
	#	and set node labels to numeric
	#
	pty.ph		<- lapply( seq_len(nrow(ptyfiles)), function(i)
			{				
				ph			<- read.tree(file.path(indir,ptyfiles[i, FILE]))
				#	node labels
				if(is.null(ph$node.label))
					ph$node.label	<- rep('0',Nnode(ph))
				tmp				<- ph$node.label
				tmp[which(tmp=='Root'|tmp=='')]	<- '0'
				ph$node.label	<- as.numeric(tmp)
				ph
			})
	names(pty.ph)<- ptyfiles[, FILE]
	#	count number of individuals in each tree
	ptyfiles[, IND_N:= sapply(seq_along(pty.ph), function(i)  length(unique(gsub('_read.*','',pty.ph[[i]]$tip.label)))		)]
	#	remove trees with just one individual if no outgroup specified	
	if(is.na(outgroup) && pty.runs[, any(FILL==1)])
	{
		cat('\nignoring trees with only one individual (only when no root specified)', subset(ptyfiles, IND_N==1)[, paste(unique(FILE),collapse=', ')])
		ptyfiles		<- subset(ptyfiles, IND_N>1)
		pty.ph			<- lapply( ptyfiles[, FILE], function(x)	pty.ph[[x]]		)
		names(pty.ph)	<- ptyfiles[, FILE]			
	}	
	ptyfiles[, IDX:=seq_len(nrow(ptyfiles))]
	#
	#	GET ROOTS
	#	
	#	root specified by user; check if in tree; 
	#		if in tree, then set the ROOT column; 
	#		if not in tree, return missing 
	if(!is.na(outgroup))
	{
		cat('\nroot and evaluate trees')
		pty.root	<- ptyfiles[, {
					#FILE<- 'ptyr22_InWindow_241_to_300_dophy_examl.newick'
					ans		<- NA_character_
					tmp		<- which(grepl(outgroup,pty.ph[[FILE]]$tip.label))
					if(length(tmp)==0)
						cat('\ncannot find root', outgroup,'in tree',FILE,', ignoring file')
					if(length(tmp)==1)
						ans	<- pty.ph[[FILE]]$tip.label[tmp]
					if(length(tmp)>1)
						cat('\nfound >1 roots', outgroup,'in tree',FILE,', ignoring file')
					list(ROOT=ans)
				}, by= c('FILE','PTY_RUN')]
		# pty.root is a data.table with run ID PTY_RUN, file name FILE, and corresponding root ROOT
		pty.root	<- subset(pty.root, !is.na(ROOT))	
		ptyfiles	<- merge(ptyfiles,pty.root, by=c('FILE','PTY_RUN'))
	}
	#	root not specified; determine root for each run: find taxon with largest distance from BAM of selected individuals (these have FILL==0)
	#	this is only for olli
	if(is.na(outgroup) && pty.runs[, any(FILL==1)])
	{
		cat('\ndetermine root among fillers')
		pty.root	<- pty.evaluate.tree.root.withfill(ptyfiles, pty.ph, pty.runs)
		ptyfiles	<- merge(ptyfiles,pty.root, by=c('FILE','PTY_RUN'))
	}
	#	root not specified; determine root for each run: 
	#	this is only for olli
	if(is.na(outgroup) && pty.runs[, all(FILL==0)])
	{
		cat('\ndetermine root among all individuals')
		pty.root	<- pty.evaluate.tree.root.nofill(ptyfiles, pty.ph, pty.runs)
		ptyfiles	<- merge(ptyfiles,pty.root, by=c('FILE','PTY_RUN'))
	}	
	#
	#	pre-process trees: root, ladderize, associate edges to individuals
	#
	#	pty.ph.cp			<- copy(pty.ph)	
	#	pty.ph	<- copy(pty.ph.cp)
	cat('\nroot, ladderize, group trees')
	for(i in seq_along(pty.ph))
	{
		cat('\nprocess tree ',names(pty.ph)[i])
		#print(i)
		#i<- 44
		#i<- 5
		#	root
		root			<- subset(ptyfiles, FILE==names(pty.ph)[i])[, ROOT]
		ph				<- pty.ph[[i]]
		tmp										<- which(ph$tip.label==root)
		ph										<- reroot(ph, tmp, ph$edge.length[which(ph$edge[,2]==tmp)])
		ph$node.label[ph$node.label=='Root']	<- 0
		ph$node.label							<- as.numeric(ph$node.label)		
		#	ladderize
		ph				<- ladderize(ph)
		#	define individuals and references
		phb				<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*','',ph$tip.label), REF=grepl(references.pattern,ph$tip.label))
		set(phb, phb[, which(REF)],'FILE_ID','REFERENCE')
		#	homogenize taxon labels: add dummy "_read_1_count_0" to reach taxon name for which the read and count bit is missing
		#	this can only be the case for references, fail if this is not true
		tmp				<- phb[, which(!grepl('_read_[0-9]+_count_[0-9]+',BAM))]		
		if(length(tmp))
		{
			stopifnot( !nrow(subset(phb[tmp, ],!REF))	)	#incorrect taxon label
			set(phb, tmp, 'BAM', phb[tmp, paste(BAM,'_read_1_count_0',sep='')])
			setkey(phb, IDX)
			ph$tip.label	<- phb[,BAM]			
		}
		#	group edges by individual
		#	@CF: if you modify the trees, you need to re-run this function
		ph				<- pty.evaluate.tree.groupindividuals(ph, phb)
		#	group edges by FILL
		#tmp			<- as.numeric(gsub('ptyr','',regmatches(names(pty.ph)[i], regexpr('ptyr[0-9]+',names(pty.ph)[i]))))
		#phb			<- merge(phb, subset(pty.runs, PTY_RUN==tmp), by='FILE_ID')
		#ph				<- pty.evaluate.tree.groupindividuals(ph, phb)	
		pty.ph[[i]]		<- ph
	}		
	#
	#	save trees
	#	
	tmp		<- regmatches(tree.pattern,regexpr('\\.[^\\.]+$', tree.pattern)) 
	tmp		<- ptyfiles[1, gsub(tmp,'_preprtr.rda',gsub('_dophy','',gsub('\\.InWindow_[0-9]+_to_[0-9]+|_InWindow_[0-9]+_to_[0-9]+','',FILE)))]
	cat('\nsave trees to file ',tmp)
	save(pty.ph, ptyfiles, file=file.path(outdir,tmp))
	#
	#	clean up: remove newick / fasta files
	#
	if(rm.newick)
	{
		invisible( ptyfiles[, list(SUCCESS=file.remove(file.path(indir,FILE))), by='FILE'] )
		invisible( ptyfiles[, list(SUCCESS=file.remove(file.path(indir,gsub('newick','txt',FILE)))), by='FILE'] )
	}
	if(rm.fasta)
	{
		invisible( ptyfiles[, list(SUCCESS=file.remove(file.path(indir,gsub('_examl\\.newick','\\.fasta',FILE)))), by='FILE'] )
	}
	#
	#	PART 2
	#	
	#	plot the trees
	#	@CF: try this first for the saved trees as they are. once you follow the code, try drop.tip etc. 
	#		 Note that you will have to call 'ph<- pty.evaluate.tree.groupindividuals(ph, phb)' once more to setup the correct edge colours. See how the function is used a few lines higher up.
	#
	#	need node heights to determine x-axis range for plotting
	tmp					<- ptyfiles[,{										
				ph		<- pty.ph[[FILE]]
				tmp		<- node.depth.edgelength(ph)[1:Ntip(ph)]
				list(BAM=ph$tip.label, HEIGHT=tmp)	#BAM and HEIGHT are vectors
			}, by='FILE']	
	#	'tmp' is now a data.table containing for each tree in file FILE, all taxon names and associated node depths
	#	now extract the max node depth for each tree in file FILE, and merge this 
	#	as an extra column to the ptyfiles data.table
	ptyfiles				<- merge(ptyfiles, tmp[, list(HMX= max(HEIGHT)), by='FILE'], by='FILE')
	#	sort by run id and window, so the tree appear in order
	setkey(ptyfiles, PTY_RUN, W_FROM)	
	#	for each phylotype run, produce one pdf
	#ptyfiles			<- subset(ptyfiles, PTY_RUN==1)
	for(ptyr in ptyfiles[, unique(PTY_RUN)])
	{
		#@CF: since you don t have phylotype run IDs, just set ptyr<-1
		#ptyr<- 22
		cat('\nplot trees with run.pattern',ptyr,'\n\n')
		tmp			<- subset(ptyfiles, PTY_RUN==ptyr)
		#	setup title
		tmp[, TITLE:=paste('run',PTY_RUN,', window [',W_FROM,',',W_TO,']',sep='')]
		setkey(tmp, W_FROM)
		#	select trees for this pdf in 'phs'
		phs			<- lapply(tmp[, FILE], function(x) pty.ph[[x]])
		names(phs)	<- tmp[, TITLE] 
		#	setup colours: get all individual names from all selected trees	
		tmp2		<- setdiff(sort(unique(unlist(lapply(seq_along(phs), function(i)	levels(attr(phs[[i]],'INDIVIDUAL'))	)))),c('not characterized'))
		if(!'REFERENCE'%in%tmp2)
		{
			col			<- c('black',rainbow_hcl(length(tmp2), start = 270, end = -30, c=100, l=50))
			names(col)	<- c('not characterized',tmp2)						
		}
		if('REFERENCE'%in%tmp2)
		{
			tmp2		<- setdiff(tmp2,'REFERENCE')
			col			<- c('grey50','black',rainbow_hcl(length(tmp2), start = 270, end = -30, c=100, l=50))
			names(col)	<- c('not characterized','REFERENCE',tmp2)						
		}
		#
		#	setup ggtrees
		#
		#	this is the same as p<- ggplot... . the plot is stored in a list, and not plotted. Type print(p) to plot.
		phps		<- lapply(seq_along(phs), function(i){
		#phps		<- lapply(1, function(i){
					#	@CF just set i<-1 and then copy and paste to produce one tree for one window
					cat('\nsetup tree ',names(phs)[i])
					max.node.height	<- tmp[i,][, HMX]
					ph.title		<- names(phs)[i]
					ph				<- phs[[i]]			
					#	now get the counts and some other stuff that is irrelevant for this plot. CLU will always be FALSE.
					df				<- data.table(	BAM=ph$tip.label, IDX=seq_along(ph$tip.label), 
							COUNT= as.numeric(gsub('count_','',regmatches(ph$tip.label, regexpr('count_[0-9]+',ph$tip.label)))), 
							CLU=grepl('_clu',ph$tip.label), 
							FILE_ID= gsub('_read.*|_clu.*','',ph$tip.label))
					set(df, df[, which(grepl(references.pattern,FILE_ID))],'FILE_ID','REFERENCE')
					p				<- pty.evaluate.tree.plot.ph(ph, df, col, max.node.height, ph.title)
					p
				})	
		names(phps)	<- names(phs)
		file	<- file.path( indir, tmp[1,gsub(paste('.',tree.pattern,sep=''),'.pdf',gsub('_dophy','',gsub('.InWindow_[0-9]+_to_[0-9]+|_InWindow_[0-9]+_to_[0-9]+','',FILE)))] )
		cat('\nplot trees to file ',file)		
		tmp			<- seq_len(ceiling(length(phps)/plot.trees.per.page))
		#	single-page plot
		if(plot.trees.per.page==1)
		{
			pdf(file=file, w=plot.w, h=plot.h)		#for win=60
			for(i in tmp)
				print(phps[[i]])
			dev.off()
		}
		#	multi-page plot	not divisible by two
		if(plot.trees.per.page>1 & plot.trees.per.page%%2==1)
		{
			pdf(file=file, w=plot.w, h=plot.h)		#for win=60
			for(i in tmp)
			{		
				print(i)
				grid.newpage()
				pushViewport(viewport(layout=grid.layout(1, plot.trees.per.page)))
				z	<- intersect(seq.int((i-1)*plot.trees.per.page+1, i*plot.trees.per.page), seq_len(length(phps)))
				for(j in z)
					print(phps[[j]], vp = viewport(layout.pos.row=1, layout.pos.col=(j-1)%%(plot.trees.per.page)+1))				
			}
			dev.off()
		}	
		#	multi-page plot	divisible by two
		if(plot.trees.per.page>1 & plot.trees.per.page%%2==0)
		{
			pdf(file=file, w=plot.w, h=plot.h)		#for win=60
			for(i in tmp)
			{		
				print(i)
				grid.newpage()
				pushViewport(viewport(layout=grid.layout(2, plot.trees.per.page/2)))
				z	<- intersect(seq.int((i-1)*plot.trees.per.page+1, i*plot.trees.per.page), seq_len(length(phps)))
				for(j in z)
					print(phps[[j]], vp = viewport(layout.pos.row=(ceiling(j/(plot.trees.per.page/2))-1)%%2+1, layout.pos.col=(j-1)%%(plot.trees.per.page/2)+1))				
			}
			dev.off()
		}					
	}
}	

phsc.get.prior.parameter.n0.old<- function(n.states, n.type=2, n.obs=3, confidence.cut=0.5)
{
	phsc.find.n0.aux<- function(n0, n.states=3, n.type=2, n.obs=3, confidence.cut=0.5)
	{
		abs( pbeta(1/n.states+(1-1/n.states)/(n.states+1), (n0+n.states*n.type)/n.states, ((n.states-1)*n0+n.states*(n.obs-n.type))/n.states, lower.tail=FALSE)-confidence.cut ) 	
	}	
	ans	<- optimize(phsc.find.n0.aux, c(.01,100), n.states=n.states, n.type=n.type, n.obs=n.obs, confidence.cut=confidence.cut)
	ans	<- round(ans$minimum, d=4)
	ans
}


pty.evaluate.tree.plot.ph<- function(ph, df, col, max.node.height, ph.title)
{
	#	with ggtree, extra info can be supplied in a data.frame through the %<+% operator
	#	the first column must be the tip names
	#	here I use this extra info in 'df' to specify the size of the tippoints as a function of COUNT
	#	the aes(color=INDIVIDUAL) bit looks for the INDIVIDUAL attribute on 'ph' that defines the edge colours.						
	if(any(ph$node.label==0))
	{
		p			<- ggtree(ph, aes(color=INDIVIDUAL)) %<+% df +
				geom_tippoint(aes(size=COUNT, shape=CLU)) +
				#geom_text(aes(label=label), size=1.5,  hjust=-.1) +
				geom_tiplab(size=1.2,  hjust=-.1) +
				scale_color_manual(values=col, guide=FALSE) +
				scale_shape_manual(values=c(20,18), guide=FALSE) +
				scale_size_area(guide=FALSE) +							
				theme_tree2() +
				theme(legend.position="bottom") + ggplot2::xlim(0, max.node.height*1.3) +
				labs(x='subst/site', title=ph.title)
	}	
	if(any(ph$node.label>0))
	{	
		tmp						<- c(rep(0,Ntip(ph)),ph$node.label)
		tmp[tmp<=50]			<- ''		
		attr(ph,'NODE_LABEL')	<- tmp	
		p						<- ggtree(ph, aes(colour=INDIVIDUAL)) %<+% df +								
				geom_nodepoint(size=as.numeric(as.character(factor(ph$node.label>75, levels=c(FALSE,TRUE), labels=c('0','2')))), shape=23, fill='grey') +
				geom_text2(aes(label=NODE_LABEL), colour='grey', size=1.2,  hjust=1.5, vjust=-1.5) +
				geom_tiplab(size=1.2,  hjust=-.1) +
				scale_colour_manual(values=col, guide=FALSE) +
				scale_shape_manual(values=c(20,18), guide=FALSE) +
				scale_size_area(guide=FALSE) +							
				theme_tree2() +
				theme(legend.position="bottom") + ggplot2::xlim(0, max.node.height*1.3) +								
				labs(x='subst/site', title=ph.title)
	}		
	p
}

pty.evaluate.tree.plot.develstuff<- function()
{
	if(0)	#devel
	{
		tmp	 	<- matrix(seq(1, 2*ceiling(length(phps)/2)), ncol=ceiling(length(phps)/2), nrow=2)
		grid.newpage()
		pushViewport(viewport(layout = grid.layout(nrow(tmp), ncol(tmp))))
		for(i in seq_along(phps))
			print(phps[[i]], vp = viewport(layout.pos.row=which(tmp==i, arr.ind=TRUE)[1,'row'], layout.pos.col=which(tmp==i, arr.ind=TRUE)[1,'col']))	
		dev.off()
	}
	if(0)	#devel
	{	
		ph				<- pty.ph[[30]]
		ph				<- pty.evaluate.tree.collapse.clusters(ph, thresh.brl=8e-6)
		phb				<- data.table(IDX=seq_along(ph$tip.label), BAM=ph$tip.label, FILE_ID= gsub('_read.*|_clu.*','',ph$tip.label))
		#	group edges by individual
		ph				<- pty.evaluate.tree.groupindividuals(ph, phb)
		#	group edges by FILL
		tmp				<- as.numeric(gsub('ptyr','',regmatches(names(pty.ph)[i], regexpr('ptyr[0-9]+',names(pty.ph)[i]))))
		phb				<- merge(phb, subset(pty.runs, PTY_RUN==tmp), by='FILE_ID')
		ph				<- pty.evaluate.tree.groupcandidates(ph, phb)	
		pty.phc[[i]]	<- ph
		
		
		tmp2		<- levels(attr(ph,'INDIVIDUAL'))
		col			<- c('black',rainbow_hcl(length(tmp2)-1, start = 270, end = -30, c=100, l=50))
		names(col)	<- tmp2	
		df			<- data.table(	BAM=ph$tip.label, IDX=seq_along(ph$tip.label), 
				COUNT= as.numeric(gsub('count_','',regmatches(ph$tip.label, regexpr('count_[0-9]+',ph$tip.label)))), 
				CLU=grepl('_clu',ph$tip.label), 
				FILE_ID= gsub('_read.*|_clu.*','',ph$tip.label))
		set(df, df[, which(grepl(references.pattern,FILE_ID))],'FILE_ID','REFERENCE')
		ggtree(ph, aes(color=INDIVIDUAL)) %<+% df +
				geom_nodepoint(size=ph$node.label/100*3) +
				geom_tippoint(aes(size=COUNT, shape=CLU)) +
				#geom_point2(aes(subset=(node==42)), size=5, shape=23, fill='#068C00') +
				#geom_point2(data=df, aes(subset=(node==IDX)), size=5, shape=23, fill=df[,COL]) +
				#geom_text(aes(label=label), size=1.5,  hjust=-.1) +
				geom_tiplab(size=1.2,  hjust=-.1) +
				scale_color_manual(values=col, guide=FALSE) +
				scale_shape_manual(values=c(20,18), guide=FALSE) +
				scale_size_area(guide=FALSE) +
				#scale_linetype_manual(values=c('target'='solid','filler'='dotted'),guide = FALSE) +
				theme_tree2() +
				theme(legend.position="bottom") + ggplot2::xlim(0, 0.3) +
				labs(x='subst/site')
		
	}
}

pty.drop.tip<- function (phy, tip, trim.internal = TRUE, subtree = FALSE, root.edge = 0, 
		rooted = is.rooted(phy), interactive = FALSE, subtree.label=NA) 
{
	if (!inherits(phy, "phylo")) 
		stop("object \"phy\" is not of class \"phylo\"")
	Ntip <- length(phy$tip.label)
	if (interactive) {
		cat("Left-click close to the tips you want to drop; right-click when finished...\n")
		xy <- locator()
		nToDrop <- length(xy$x)
		tip <- integer(nToDrop)
		lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
		for (i in 1:nToDrop) {
			d <- sqrt((xy$x[i] - lastPP$xx)^2 + (xy$y[i] - lastPP$yy)^2)
			tip[i] <- which.min(d)
		}
	}
	else {
		if (is.character(tip)) 
			tip <- which(phy$tip.label %in% tip)
	}
	out.of.range <- tip > Ntip
	if (any(out.of.range)) {
		warning("some tip numbers were larger than the number of tips: they were ignored")
		tip <- tip[!out.of.range]
	}
	if (!length(tip)) 
		return(phy)
	if (length(tip) == Ntip) {
		warning("drop all tips of the tree: returning NULL")
		return(NULL)
	}
	wbl <- !is.null(phy$edge.length)
	if (!rooted && subtree) {
		phy <- root(phy, (1:Ntip)[-tip][1])
		root.edge <- 0
	}
	phy <- reorder(phy)
	NEWROOT <- ROOT <- Ntip + 1
	Nnode <- phy$Nnode
	Nedge <- dim(phy$edge)[1]
	if (subtree) {
		trim.internal <- TRUE
		tr <- reorder(phy, "postorder")
		N <- .C(node_depth, as.integer(Ntip), as.integer(Nnode), 
				as.integer(tr$edge[, 1]), as.integer(tr$edge[, 2]), 
				as.integer(Nedge), double(Ntip + Nnode), 1L)[[6]]
	}
	edge1 <- phy$edge[, 1]
	edge2 <- phy$edge[, 2]
	keep <- !logical(Nedge)
	keep[match(tip, edge2)] <- FALSE
	if (trim.internal) {
		ints <- edge2 > Ntip
		repeat {
			sel <- !(edge2 %in% edge1[keep]) & ints & keep
			if (!sum(sel)) 
				break
			keep[sel] <- FALSE
		}
		if (subtree) {
			subt <- edge1 %in% edge1[keep] & edge1 %in% edge1[!keep]
			keep[subt] <- TRUE
		}
		if (root.edge && wbl) {
			degree <- tabulate(edge1[keep])
			if (degree[ROOT] == 1) {
				j <- integer(0)
				repeat {
					i <- which(edge1 == NEWROOT & keep)
					j <- c(i, j)
					NEWROOT <- edge2[i]
					degree <- tabulate(edge1[keep])
					if (degree[NEWROOT] > 1) 
						break
				}
				keep[j] <- FALSE
				if (length(j) > root.edge) 
					j <- 1:root.edge
				NewRootEdge <- sum(phy$edge.length[j])
				if (length(j) < root.edge && !is.null(phy$root.edge)) 
					NewRootEdge <- NewRootEdge + phy$root.edge
				phy$root.edge <- NewRootEdge
			}
		}
	}
	if (!root.edge) 
		phy$root.edge <- NULL
	phy$edge <- phy$edge[keep, ]
	if (wbl) 
		phy$edge.length <- phy$edge.length[keep]
	TERMS <- !(phy$edge[, 2] %in% phy$edge[, 1])
	oldNo.ofNewTips <- phy$edge[TERMS, 2]
	if (subtree) {
		i <- which(tip %in% oldNo.ofNewTips)
		if (length(i)) {
			phy$tip.label[tip[i]] <- "[1_tip]"
			tip <- tip[-i]
		}
	}
	n <- length(oldNo.ofNewTips)
	phy$edge[TERMS, 2] <- rank(phy$edge[TERMS, 2])
	phy$tip.label <- phy$tip.label[-tip]
	if (subtree || !trim.internal) {
		node2tip <- oldNo.ofNewTips[oldNo.ofNewTips > Ntip]
		new.tip.label <- if (subtree) {
					paste("[", subtree.label[node2tip], "_tips]", sep = "")
				}
				else {
					if (is.null(phy$node.label)) 
						rep("NA", length(node2tip))
					else phy$node.label[node2tip - Ntip]
				}
		phy$tip.label <- c(phy$tip.label, new.tip.label)
	}
	phy$Nnode <- dim(phy$edge)[1] - n + 1L
	newNb <- integer(Ntip + Nnode)
	newNb[NEWROOT] <- n + 1L
	sndcol <- phy$edge[, 2] > n
	phy$edge[sndcol, 2] <- newNb[phy$edge[sndcol, 2]] <- (n + 
				2):(n + phy$Nnode)
	phy$edge[, 1] <- newNb[phy$edge[, 1]]
	storage.mode(phy$edge) <- "integer"
	if (!is.null(phy$node.label)) 
		phy$node.label <- phy$node.label[which(newNb > 0) - Ntip]
	collapse.singles(phy)
}

#' @import ape zoo data.table

pty.evaluate.fasta<- function(indir, outdir=indir, strip.max.len=Inf, select='', min.ureads.individual=NA, verbose=1)
{
	#select			<- '^ptyr1_InWindow'
	#strip.max.len	<- 350		
	infiles			<- data.table(FILE=list.files(indir, pattern='fasta$'))
	infiles			<- subset(infiles, !grepl('*',FILE,fixed=1))
	infiles			<- subset(infiles, grepl(select,FILE))
	set(infiles, NULL, 'PTY_RUN', as.numeric(gsub('ptyr','',sapply(strsplit(infiles$FILE,'_'),'[[',1))))
	set(infiles, NULL, 'W_FROM', as.numeric(gsub('InWindow_','',regmatches(infiles$FILE,regexpr('InWindow_[0-9]+',infiles$FILE))))) 
	set(infiles, NULL, 'W_TO', as.numeric(gsub('to_','',regmatches(infiles$FILE,regexpr('to_[0-9]+',infiles$FILE)))))	
	setkey(infiles, W_FROM)	
	#	read all fasta into R
	pty.seq.rw			<- lapply( infiles[, FILE], function(x) read.dna(file.path(indir,x), format='fasta')	)
	names(pty.seq.rw)	<- infiles[, FILE]
	pty.seq				<- lapply( infiles[, FILE], function(x) seq.strip.gap(pty.seq.rw[[x]], strip.max.len=strip.max.len)	)
	names(pty.seq)		<- infiles[, FILE]
	#	get statistics
	if(verbose==1)
		cat('\ncollecting alignment statistics\t',select,'\n')
	seqd	<- infiles[,{
				if(verbose==1)	cat('\n',FILE)
				#FILE	<- 'ptyr1_InWindow_1_to_60.fasta'
				seq		<- pty.seq[[FILE]]					
				tmp		<- rownames(seq)
				seqd	<- data.table(READ=tmp, FILE_ID= gsub('_read.*','',tmp)) 
				seqd[, READ_N:=as.integer(gsub('_count_','',regmatches(tmp,regexpr('_count_[0-9]+',tmp))))]			
				tmp		<- as.character(seq)
				seqd[, FIRST:= apply( tmp, 1, function(x) which(x!='-')[1] )]
				seqd[, LAST:= ncol(tmp)-apply( tmp, 1, function(x) which(rev(x)!='-')[1] ) + 1L]
				seqd[, LEN:= LAST-FIRST+1L]
				seqd	<- merge(seqd, seqd[, list(UNIQUE_N=length(READ)),by='FILE_ID'], by='FILE_ID')
				seqd[, TOTAL_N:= nrow(seqd)]
				seqd
			},by='FILE']
	seqd	<- merge(infiles, seqd, by='FILE')
	#tmp		<- subset(si, select=c(SANGER_ID, PANGEA_ID))
	#set(tmp, NULL, 'PANGEA_ID', tmp[, gsub('-','_',PANGEA_ID)])
	#setnames(tmp, c('PANGEA_ID','SANGER_ID'), c('TAXA','FILE_ID'))
	#seqd	<- merge(seqd, tmp, by='FILE_ID',all.x=1)
	#tmp		<- seqd[, which(is.na(TAXA))]
	#set(seqd, tmp, 'TAXA', seqd[tmp, FILE_ID])
	#seqd	<- merge(seqd, subset(pty.runs, select=c(FILE_ID,PTY_RUN,FILL)), by=c('PTY_RUN','FILE_ID'), all.x=1)
	#set(seqd, NULL, 'FILL', seqd[, factor(FILL, levels=c(0,1), labels=c('candidate','filler'))])	
	tmp		<- file.path(outdir, gsub('.fasta','_alignments.rda',gsub('_InWindow_[0-9]+_to_[0-9]+','',infiles[1,][,FILE])))
	if(verbose)
		cat('\nsave to',tmp)
	#	save and delete old fasta files
	save(pty.seq.rw, pty.seq, seqd, file=tmp)	
	invisible( seqd[, file.remove(file.path(indir,FILE))] )
	#
	#	prepare examl files
	#
	setkey(seqd, W_FROM)
	#	select
	if(!is.na(min.ureads.individual))	#	select individuals with x unique reads in each window
	{
		seqd	<- subset(seqd, grepl('REF',FILE_ID) | UNIQUE_N>=min.ureads.individual)
		seqd[, TOTAL_N:=NULL]
		seqd	<- merge(seqd, seqd[, list(TOTAL_N=length(READ)), by='FILE'], by='FILE')
	}
	#	write fasta
	pty.fa	<- seqd[, {	
				#FILE<- 'ptyr1_InWindow_1_to_60.fasta'
				#READ	<- tmp$READ[which(tmp$FILE==FILE)]
				z		<- file.path(outdir, gsub('\\.fasta','_dophy\\.fasta',FILE))
				if(!file.exists(z))
				{
					cat('\nwrite to',z)
					write.dna(pty.seq[[FILE]][READ,], file=z, format='fasta', colsep='', nbcol=-1)
				}										
				list(FILE=z, TAXA_N=length(READ), PTY_RUN=PTY_RUN[1], W_FROM=W_FROM[1], W_TO=W_TO[1])
			}, by='FILE']
	cat('\nwrote fasta files for trees, min taxa=',pty.fa[,min(TAXA_N)],'median taxa=',pty.fa[,median(TAXA_N)],'max taxa=',pty.fa[,max(TAXA_N)])
}


project.dualinfecions.phylotypes.countbam.150120<- function()
{
	require(ggplot2)
	require(data.table)
	require(Rsamtools)
	pty.infile	<- file.path(HOME,"data","PANGEA_HIV_n5003_Imperial_v160110_ZA_examlbs500_ptyrunsinput.rda")
	pty.data.dir	<- '/Users/Oliver/duke/2016_PANGEAphylotypes/data'
	#pty.data.dir	<- '/work/or105/PANGEA_mapout/data'
		
	load( pty.infile )	
	tmp			<- subset(si, select=c(SANGER_ID, PANGEA_ID))
	set(tmp, NULL, 'PANGEA_ID', tmp[, gsub('-','_',PANGEA_ID)])
	setnames(tmp, c('PANGEA_ID','SANGER_ID'), c('TAXA','FILE_ID'))
	pty.runs	<- merge(pty.runs, tmp, by='TAXA', all.x=1)
	tmp			<- pty.runs[, which(is.na(FILE_ID))]
	set(pty.runs, tmp,'FILE_ID', pty.runs[tmp, TAXA])	
	tmp			<- subset(pty.runs, FILL==0)	
	pty.runs	<- subset(pty.runs, FILL==1)
	setkey(pty.runs, FILE_ID)
	pty.runs	<- unique(pty.runs)
	pty.runs	<- pty.runs[ setdiff(pty.runs$FILE_ID, tmp$FILE_ID), ] 
	pty.runs	<- rbind(tmp, pty.runs)
	
	bfiles			<- data.table(FILE=list.files(pty.data.dir, pattern='bam$'))	
	bfiles			<- subset(bfiles, !grepl('Contam',FILE))
	#	get lengths of all reads in quality trimmed bam file
	#z	<- scanBam(file.path(pty.data.dir,FILE), param=ScanBamParam(what=c('qwidth','qual')))
	bam.len			<- bfiles[,{
				z	<- scanBam(file.path(pty.data.dir,FILE), param=ScanBamParam(what=c('qwidth')))
				list(QU=seq(0,320,20), CDF=ecdf(z[[1]][['qwidth']])(seq(0,320,20)))
				#list(PR=seq(0.01,0.99,0.01), QU=quantile(z[[1]][['qwidth']], p=seq(0.01,0.99,0.01)))				
			}, by='FILE']
	bam.len[, FILE_ID:= gsub('.bam','',FILE)]
	bam.len		<- merge(bam.len, subset(pty.runs, select=c(TAXA, FILL, FILE_ID)), by='FILE_ID', all.x=1 )
	set(bam.len, bam.len[, which(is.na(FILL))], 'FILL', 2)
	set(bam.len, NULL, 'FILL', bam.len[, factor(FILL, levels=c(0,1, 2), labels=c('candidate','filler','no consensus'))])
	setnames(bam.len, 'FILL', 'TYPE')
	save(bam.len, file= gsub('ptyrunsinput','bamlen',pty.infile))	
	#
	ggplot(bam.len, aes(y=CDF, x=factor(QU), fill=TYPE)) + geom_boxplot() + 
			theme_bw() + labs(y='cumulative frequency\nin one individual\n', x='\nlength of quality-trimmed short reads\n(nt)', fill='individual') +
			theme(legend.position='bottom')
	ggsave(file= gsub('ptyrunsinput.rda','bamlen.pdf',pty.infile), w=10, h=7)	
}


pty.cmdwrap.examl<- function(pty.args)
{
	indir					<- pty.args[['out.dir']]
	outdir					<- indir	
	stopifnot( pty.args[['exa.n.per.run']]>=0, pty.args[['bs.n']]>=0	)	
	infiles		<- data.table(FILE=list.files(indir, pattern='_alignments.rda$'))
	infiles[, PTY_RUN:= as.numeric(gsub('ptyr','',sapply(strsplit(FILE,'_'),'[[',1)))]	
	tmp				<- data.table(OUTFILE=list.files(indir, pattern='.newick$'))						
	tmp[, PTY_RUN:= as.numeric(gsub('ptyr','',sapply(strsplit(OUTFILE,'_'),'[[',1)))]
	infiles			<- merge(infiles, tmp, by='PTY_RUN', all.x=1, allow.cartesian=TRUE)		
	setkey(infiles, PTY_RUN)
	infiles			<- subset(unique(infiles), is.na(OUTFILE))
	if(!any(is.na(pty.args[['select']])))
		infiles		<- subset(infiles, PTY_RUN%in%pty.args[['select']])
	print(infiles)
	
	pty.fa		<- do.call('rbind',lapply( seq_len(nrow(infiles)), function(i)
					{
						#i<- 1							
						load(file.path(indir,infiles[i,FILE]))	#loads "pty.seq.rw" "pty.seq"    "seqd"
						setkey(seqd, W_FROM)
						#	select
						if(!is.na(pty.args[['min.ureads.individual']]))	#	select individuals with x unique reads in each window
						{
							seqd	<- subset(seqd, grepl('REF',FILE_ID) | UNIQUE_N>=pty.args[['min.ureads.individual']])
							seqd[, TOTAL_N:=NULL]
							seqd	<- merge(seqd, seqd[, list(TOTAL_N=length(READ)), by='FILE'], by='FILE')
						}
						#	write fasta
						pty.fa		<- seqd[, {	
									#FILE<- 'ptyr1_InWindow_1_to_60.fasta'
									#READ	<- tmp$READ[which(tmp$FILE==FILE)]
									z		<- file.path(outdir,gsub('\\.fasta','_dophy\\.fasta',FILE))
									#print(z)
									if(!file.exists(z))
									{
										write.dna(pty.seq[[FILE]][READ,], file=z, format='fasta', colsep='', nbcol=-1)
									}										
									list(TAXA_N=length(READ), PTY_RUN=PTY_RUN[1], W_FROM=W_FROM[1], W_TO=W_TO[1])
								}, by='FILE']
						cat('\nusing fasta files for trees',infiles[i,FILE],', min taxa=',pty.fa[,min(TAXA_N)],'median taxa=',pty.fa[,median(TAXA_N)],'max taxa=',pty.fa[,max(TAXA_N)])
						pty.fa
					}))	
	#
	#	get commands to reconstruct tree
	#
	if(pty.args[['bs.n']]>0)	#	bootstrap on one machine version 
	{
		exa.cmd			<- pty.fa[,	list(CMD=cmd.examl.bootstrap.on.one.machine(outdir, sub("\\.fasta$", "_dophy",FILE), bs.from=0, bs.to=pty.args[['bs.n']]-1, bs.n=pty.args[['bs.n']], opt.bootstrap.by="nucleotide", args.examl=pty.args[['args.examl']])), by='FILE']
		exa.cmd[, RUN_ID:=seq_len(nrow(exa.cmd))]
	}
	if(pty.args[['bs.n']]==0)	#	no bootstrap version
	{		
		exa.cmd			<- pty.fa[,{				 					
					cmd			<- cmd.examl.single(outdir, sub("\\.fasta$", "_dophy",FILE), args.examl=pty.args[['args.examl']])					
					list(CMD=cmd)					
				}, by=c('PTY_RUN','FILE')]
		if(!is.na(pty.args[['exa.n.per.run']]))
			exa.cmd[, RUN_ID:= ceiling(seq_len(nrow(exa.cmd))/pty.args[['exa.n.per.run']])]		
		if(is.na(pty.args[['exa.n.per.run']]))
			exa.cmd[, RUN_ID:=PTY_RUN]
		exa.cmd			<- exa.cmd[,	list(CMD=paste(CMD,collapse='\n',sep='\n')), 	by='RUN_ID']
	}
	exa.cmd
}	


pty.get.taxa.combinations<- function(pty.taxan, pty.sel.n)
{
	pty.maxn	<- ceiling( (pty.taxan-pty.sel.n) / (pty.sel.n-1) )
	pty.maxn	<- pty.maxn*(pty.sel.n-1)
	pty.idx		<- matrix( rev(seq_len(pty.maxn))+pty.sel.n, nrow=pty.sel.n-1 )
	#	for each col with lowest number x, repeat x-1 times
	tmp			<- lapply( seq_len(ncol(pty.idx)), function(j)
			{
				z	<- seq_len(min(pty.idx[,j])-1)
				z	<- t(matrix(data=z, nrow=length(z), ncol=nrow(pty.idx)+1))
				z[seq_len(nrow(pty.idx))+1,]	<- NA				
				z[seq_len(nrow(pty.idx))+1,]	<- pty.idx[,j]
				t(z)
			})
	pty.idx		<- do.call('rbind',tmp)
	pty.idx		<- rbind(matrix(seq_len(pty.sel.n),nrow=1), pty.idx)
	pty.idx		<- do.call('rbind',lapply( seq_len(nrow(pty.idx)), function(i) data.table(RUN=i, TX_IDX=pty.idx[i,])))
	pty.idx
}

phsc.get.basic.pairwise.relationships.170703<- function(df, trmw.close.brl, trmw.distant.brl)
{
	df[, TYPE_BASIC:= TYPE_RAW]
	
	stopifnot(c('ADJACENT','TYPE_BASIC','PATRISTIC_DISTANCE','PATHS_12','PATHS_21')%in%colnames(df))
	#	chains with no intermediate
	tmp		<- df[, which(TYPE_BASIC=="anc_12" & ADJACENT)]
	cat('\nFound adjacent anc_12, n=', length(tmp),'--> chain with no intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_12_nointermediate')
	tmp		<- df[, which(TYPE_BASIC=="multi_anc_12" & ADJACENT)]
	cat('\nFound adjacent multi_anc_12, n=', length(tmp),'--> chain with no intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_12_nointermediate')
	#	chains with no intermediate
	tmp		<- df[, which(TYPE_BASIC=="anc_21" & ADJACENT)]
	cat('\nFound adjacent anc_21, n=', length(tmp),'--> chain with no intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_21_nointermediate')
	tmp		<- df[, which(TYPE_BASIC=="multi_anc_21" & ADJACENT)]
	cat('\nFound adjacent multi_anc_21, n=', length(tmp),'--> chain with no intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_21_nointermediate')	
	#
	#	chains with intermediate
	#
	tmp		<- df[, which(TYPE_BASIC=="anc_12" & !ADJACENT)]
	cat('\nFound non-adjacent anc_12, n=', length(tmp),'--> chain with intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_12_withintermediate')
	tmp		<- df[, which(TYPE_BASIC=="multi_anc_12" & !ADJACENT)]
	cat('\nFound non-adjacent multi_anc_12, n=', length(tmp),'--> chain with intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_12_withintermediate')
	#	chains with intermediate
	tmp		<- df[, which(TYPE_BASIC=="anc_21" & !ADJACENT)]
	cat('\nFound non-adjacent anc_21, n=', length(tmp),'--> chain with intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_21_withintermediate')
	tmp		<- df[, which(TYPE_BASIC=="multi_anc_21" & !ADJACENT)]
	cat('\nFound non-adjacent multi_anc_21, n=', length(tmp),'--> chain with intermediate')
	set(df, tmp, 'TYPE_BASIC', 'chain_21_withintermediate')
	#
	#	intermingled with no intermediate
	tmp		<- df[, which(TYPE_BASIC=="conflict" & ADJACENT)]
	cat('\nFound adjacent conflict, n=', length(tmp),'--> intermingled with no intermediate')
	set(df, tmp, 'TYPE_BASIC', 'intermingled_nointermediate')
	#	intermingled with intermediate
	tmp		<- df[, which(TYPE_BASIC=="conflict" & !ADJACENT)]
	cat('\nFound non-adjacent conflict, n=', length(tmp),'--> intermingled with intermediate')
	set(df, tmp, 'TYPE_BASIC', 'intermingled_withintermediate')
	#
	#	other	
	tmp		<- df[, which(ADJACENT & PATHS_12==0 & PATHS_21==0)]
	cat('\nFound adjacent with no paths, n=', length(tmp),'--> other')
	set(df, tmp, 'TYPE_BASIC', 'other_nointermediate')
	tmp		<- df[, which(!ADJACENT & PATHS_12==0 & PATHS_21==0)]
	cat('\nFound non-adjacent with no assignment, n=', length(tmp),'--> other')
	set(df, tmp, 'TYPE_BASIC', 'other_withintermediate')
	#	check
	stopifnot( !nrow(subset(df, TYPE_BASIC=='none'))	)
	#
	#	add distance as second dimension
	#
	if(!is.na(trmw.close.brl) & is.finite(trmw.close.brl))
	{
		cat('\nidentifying close pairwise assignments using distance=',trmw.close.brl)
		tmp		<- df[, which(PATRISTIC_DISTANCE<trmw.close.brl)]
		cat('\nFound close, n=', length(tmp))
		set(df, tmp, 'TYPE_BASIC', df[tmp, paste0(TYPE_BASIC,'_close')])		
	}
	if(!is.na(trmw.distant.brl) & is.finite(trmw.distant.brl))
	{
		cat('\nidentifying distant pairwise assignments using distance=',trmw.distant.brl)
		tmp		<- df[, which(PATRISTIC_DISTANCE>=trmw.distant.brl)]
		cat('\nFound distant, n=', length(tmp))
		set(df, tmp, 'TYPE_BASIC', df[tmp, paste0(TYPE_BASIC,'_distant')])	
	}
	df
}

pty.various<- function()
{		
	if(1)
	{
		project.dualinfecions.phylotypes.evaluatereads.150119()
	}	
}
olli0601/Phyloscanner.R.utilities documentation built on April 16, 2024, 5:44 a.m.