deprecated/phyloscan.Rakai.couples.old.R

RakaiAll.addposteriors.pairs.170120<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/161219/RCCS_161219_w270_trmw_assignments_allpairs.rda')	
	rpw		<- subset(rpw, RUN%in%c("RCCS_161219_w270_d50_p001_mr20_mt1_cl2_d5") )
	setnames(rpw, c('TYPE','TYPE_PAIR'), c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('PTY_RUN','RUN','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp)
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define topo/distance types of phylogenetic relationship between pairs
	#
	#	given the bimodal distribution of patristic distances from phyloscanner
	#	I ususally consider 3 distance states: close, neither close nor distant, distant
	#
	#	TYPE_DIR_TODI7x3		7 topology states (this is the "basic topological characterization" on Slack that I derive from Matthew's output) 
	#							chain_12_nointermediate, chain_12_withintermediate, chain_21_nointermediate, chain_21_withintermediate, intermingled_nointermediate, intermingled_withintermediate, other  
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI3x3		3 topology states
	#							pair (chain_XX_nointermediate+intermingled_nointermediate), withintermediate (XX_withintermediate), other
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI2x2		2 topology states
	#							ancestral/intermingled, other
	#							times 
	#							2 distance states (close, not close)
	#
	#	TYPE_PAIR_TODI3			non-factorial design that combines distant or withintermediate, and close or ancestral/intermingled
	#							the idea here is that 
	#							evidence for extra-couple transmission comes from large patristic distance OR intermediates	
	#							evidence for extra-couple transmission comes from ancestral/intermingled OR short patristic distance	
	#
	#	TYPE_PAIR_DI			3 distance states
	#
	#
	#	TYPE_DIR_TO3			3 topological direction states
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled -> NA.
	#
	#	TYPE_DIR_TODI3			3 topological direction states if close.  
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled / distant / intermediate distance -> NA.
	#
	#	TYPE_CHAIN_TODI2		2 topological states if close.  
	#							chain	(xxx_withintermediate_close); pair	(xxx_nointermediate_close, other)
	#							Rest: distant / intermediate distance -> NA.
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','pair','pair_distant'))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI3:= 'with intermediate\nor distant']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','other_close'))], 'TYPE_PAIR_TODI3', 'no intermediate\n and close')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','other'))], 'TYPE_PAIR_TODI3', 'no intermediate\n but not close')
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))] 	
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close'))], 'TYPE_PAIR_TODI2x2', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','pair_distant'))], 'TYPE_PAIR_TODI2x2', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('withintermediate_close','other_close'))], 'TYPE_PAIR_TODI2x2', 'close other')	
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	rpw[, TYPE_CHAIN_TODI2:= NA_character_]
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('intermediate',' intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3))))])
	#	define effectively independent number of windows
	#	select only W_FROM that are > W_TO	
	tmp		<- rpw[, {
				z		<- 1
				repeat
				{
					zz		<- which(W_FROM>max(W_TO[z]))[1]
					if(length(zz)==0 | is.na(zz))	break
					z		<- c(z, zz)			
				}
				list(W_FROM=W_FROM[z], W_TO=W_TO[z])
			}, by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','RUN')]
	tmp[, OVERLAP:= 0L]
	rpw		<- merge(rpw, tmp, by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','W_FROM','W_TO','RUN'), all.x=1 )
	set(rpw, rpw[, which(is.na(OVERLAP))], 'OVERLAP', 1L)		
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_CHAIN_TODI2','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI3','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))
	#
	#	for each pair count windows by TYPE_PAIR_DI (k)
	#
	rplkl	<- rpw[, list(K=length(W_FROM)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE, value.var='K')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='K')
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	#	for each pair count total windows (n) and effective windows (neff)	
	tmp		<- rpw[, list(N=length(W_FROM), NEFF=sum(1-OVERLAP)), by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','RUN','GROUP')]
	rplkl	<- merge(rplkl, tmp, by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','RUN','GROUP'))
	#	define effective number of windows of type
	rplkl[, KEFF:= K/N*NEFF]	
	#
	#	add prior (equal probabilities to all types), posterior, and marginal probabilities
	#
	rplkl[, DIR_PRIOR:= 0.1]
	rplkl[, DIR_PO:= DIR_PRIOR+KEFF]
	tmp		<- rplkl[, {
				alpha	<- DIR_PO
				beta	<- sum(DIR_PO)-DIR_PO				
				list(	TYPE=TYPE, 
						POSTERIOR_ALPHA=alpha, 
						POSTERIOR_BETA=beta)	
			}, by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','RUN','GROUP')]
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','TYPE'))
	#	save
	save(rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/161219/RCCS_161219_w270_trmp_allpairs_posteriors.rda')
	
	#
	#	prepare colours
	#	
	cols.type	<- list()
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c("chain fm\nno intermediate\nclose","chain fm\nno intermediate","chain fm\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PiYG')[c(1,2,4)]),
					data.table(	TYPE= c("chain mf\nno intermediate\nclose","chain mf\nno intermediate","chain mf\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
					data.table(	TYPE= c("intermingled\nno intermediate\nclose","intermingled\nno intermediate","intermingled\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PRGn')[c(1,2,4)]),
					data.table(	TYPE= c("chain fm\nwith intermediate\nclose","chain fm\nwith intermediate","chain fm\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'BrBG'))[c(3,4,5)]),
					data.table(	TYPE= c("chain mf\nwith intermediate\nclose","chain mf\nwith intermediate","chain mf\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'PRGn'))[c(3,4,5)]),
					data.table(	TYPE= c("intermingled\nwith intermediate\nclose","intermingled\nwith intermediate","intermingled\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
					data.table(	TYPE= c("other close","other","other distant"),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
	cols.type[['TYPE_DIR_TODI7x3']]	<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c("pair close","pair","pair distant"),
							COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
					data.table(	TYPE= c("withintermediate close","withintermediate","withintermediate distant"),
							COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
					data.table(	TYPE= c("other close","other","other distant"),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI3x3']]	<- tmp2	
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c('close ancestral/\nintermingled', 'not close ancestral/\nintermingled'),
							COLS= brewer.pal(11, 'PRGn')[c(2,4)]),
					data.table(	TYPE= c('close other','not close other'),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,5)])))
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI2x2']]	<- tmp2	
	tmp2		<- data.table(	TYPE= c("no intermediate\n and close", "no intermediate\n but not close", "with intermediate\nor distant"),
			COLS= c(brewer.pal(11, 'RdBu')[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI3']]	<- tmp2
	tmp2		<- data.table(	TYPE= c("close", "intermediate\ndistance", "distant"),
			COLS= c(rev(brewer.pal(11, 'RdBu'))[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_DI']]	<- tmp2
	tmp2		<- data.table(	TYPE= c("ancestral/\nintermingled", "other"),
			COLS= c(rev(brewer.pal(9, 'Greens'))[2], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TO']]	<- tmp2	
	#
	#	make detailed plots for each pair
	#		topology assignments across genome
	#		distances across genome
	#		number of windows of certain type
	#		estimated posterior probabilities on unknown phylogenetic relationship
	#
	groups		<- c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI3','TYPE_PAIR_DI')
	group		<- c('TYPE_DIR_TODI7x3')
	
	widths	<- unit(c(4, 6), "null")
	heights	<- unit(c(2, 3.5, 4, 5), "null")
	height	<- 9
	if(group%in%c('TYPE_DIR_TODI7x3'))
	{
		widths	<- unit(c(4, 6), "null")
		heights	<- unit(c(2, 3.5, 4, 15), "null")
		height	<- 17
	}		
	if(group%in%c('TYPE_PAIR_TODI2x2'))
	{
		heights	<- unit(c(2, 3.5, 4, 3.75), "null")
		height	<- 8
	}
	if(group%in%c('TYPE_PAIR_TODI3','TYPE_PAIR_DI'))
	{
		heights	<- unit(c(2, 3.5, 4, 3.5), "null")
		height	<- 7
	}
	pdf(file=file.path(dir, paste(run,'-phsc-relationships_allpairs','_',group,'.pdf',sep='')), w=10, h=height)	
	plot.tmp	<- unique(subset(rplkl, GROUP==group, c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','LABEL')), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	setkey(plot.tmp, LABEL)		
	for(i in seq_len(nrow(plot.tmp)))
	{
		#pty_run	<- 38; id1		<- '16016_1_4'; id2		<- '15105_1_35'
		pty_run	<- plot.tmp[i, PTY_RUN]; id1		<- plot.tmp[i, FEMALE_SANGER_ID]; id2		<- plot.tmp[i, MALE_SANGER_ID]		
		tmp		<- subset(rpw, PTY_RUN==pty_run & GROUP==group & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
		p1		<- ggplot(tmp, aes(x=W_FROM)) +			
				geom_bar(aes(y=ID_R_MAX, colour=TYPE), stat='identity', fill='transparent') +
				geom_bar(aes(y=ID_R_MIN, fill=TYPE), stat='identity', colour='transparent') +
				labs(x='', y='number of reads', fill='phylogenetic\nrelationship\n', colour='phylogenetic\nrelationship\n') +
				scale_fill_manual(values=cols.type[[group]]) +
				scale_colour_manual(values=cols.type[[group]]) +
				scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw[, min(W_FROM)], rpw[, max(W_FROM)])) +
				scale_y_log10(breaks=c(10,100,1000,1e4,1e5)) +
				theme_bw() + theme(legend.position='left') +			
				guides(fill=FALSE, colour=FALSE)
		p2		<- ggplot(tmp, aes(x=W_FROM, y=PATRISTIC_DISTANCE)) +
				geom_point(size=1) +					
				labs(x='window start\n\n', y='patristic distance') +
				scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw[, min(W_FROM)], rpw[, max(W_FROM)])) +
				scale_y_log10(labels=percent, limits=c(0.001, 0.7), expand=c(0,0), breaks=c(0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25)) +
				theme_bw() + theme(legend.position='left')
		tmp		<- subset(rplkl, GROUP==group & PTY_RUN==pty_run & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
		p3		<- ggplot(tmp, aes(x=TYPE, y=KEFF, fill=TYPE)) + geom_bar(stat='identity') +
				scale_fill_manual(values=cols.type[[group]]) +
				theme_bw() + theme(legend.position='bottom') +
				coord_flip() + guides(fill=FALSE) +			
				labs(x='', y='\nnon-overlapping windows\n(number)', fill='phylogenetic\nrelationship\n')
		p4		<- ggplot(tmp, aes(x=TYPE, 	middle=qbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								ymin=qbeta(0.025, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								ymax=qbeta(0.975, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								lower=qbeta(0.25, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								upper=qbeta(0.75, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								fill=TYPE)) + 
				geom_boxplot(stat='identity') +
				scale_y_continuous(labels=percent, breaks=seq(0,1,0.2), limits=c(0,1), expand=c(0,0)) +
				scale_fill_manual(values=cols.type[[group]]) +
				theme_bw() + theme(legend.position='right', legend.margin=margin(0, .1, 0, 1, "cm")) +
				coord_flip() + guides(fill=guide_legend(ncol=1)) +
				labs(x='', y='\nposterior probability\n', fill='phylogenetic\nrelationship\n')				
		grid.newpage()
		pushViewport(viewport(layout = grid.layout(4, 2, heights=heights, widths=widths)))   
		grid.text(tmp[1,LABEL], gp=gpar(fontsize=10), vp=viewport(layout.pos.row = 1, layout.pos.col = 1:2))
		print(p1, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:2))
		print(p2, vp = viewport(layout.pos.row = 3, layout.pos.col = 1:2))         
		print(p3, vp = viewport(layout.pos.row = 4, layout.pos.col = 1))
		print(p4, vp = viewport(layout.pos.row = 4, layout.pos.col = 2))
	}
	dev.off()	
	
}

RakaiAll.addposteriors.pairs.170201<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/161219/RCCS_161219_w270_trmw_assignments_allpairs.rda')	
	rpw		<- subset(rpw, RUN%in%c("RCCS_161219_w270_d50_p001_mr20_mt1_cl2_d5") )
	setnames(rpw, c('TYPE','TYPE_PAIR'), c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('PTY_RUN','RUN','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp)
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define topo/distance types of phylogenetic relationship between pairs
	#
	#	given the bimodal distribution of patristic distances from phyloscanner
	#	I ususally consider 3 distance states: close, neither close nor distant, distant
	#
	#	TYPE_DIR_TODI7x3		7 topology states (this is the "basic topological characterization" on Slack that I derive from Matthew's output) 
	#							chain_12_nointermediate, chain_12_withintermediate, chain_21_nointermediate, chain_21_withintermediate, intermingled_nointermediate, intermingled_withintermediate, other  
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI3x3		3 topology states
	#							pair (chain_XX_nointermediate+intermingled_nointermediate), withintermediate (XX_withintermediate), other
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI2x2		2 topology states
	#							ancestral/intermingled, other
	#							times 
	#							2 distance states (close, not close)
	#
	#	TYPE_PAIR_TODI3			non-factorial design that combines distant or withintermediate, and close or ancestral/intermingled
	#							the idea here is that 
	#							evidence for extra-couple transmission comes from large patristic distance OR intermediates	
	#							evidence for extra-couple transmission comes from ancestral/intermingled OR short patristic distance	
	#
	#	TYPE_PAIR_DI			3 distance states
	#
	#
	#	TYPE_DIR_TO3			3 topological direction states
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled -> NA.
	#
	#	TYPE_DIR_TODI3			3 topological direction states if close.  
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled / distant / intermediate distance -> NA.
	#
	#	TYPE_CHAIN_TODI2		2 topological states if close.  
	#							chain	(xxx_withintermediate_close); pair	(xxx_nointermediate_close, other)
	#							Rest: distant / intermediate distance -> NA.
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','pair','pair_distant'))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI3:= 'with intermediate\nor distant']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','other_close'))], 'TYPE_PAIR_TODI3', 'no intermediate\n and close')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','other'))], 'TYPE_PAIR_TODI3', 'no intermediate\n but not close')
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))] 	
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close'))], 'TYPE_PAIR_TODI2x2', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','pair_distant'))], 'TYPE_PAIR_TODI2x2', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('withintermediate_close','other_close'))], 'TYPE_PAIR_TODI2x2', 'close other')	
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	rpw[, TYPE_CHAIN_TODI2:= NA_character_]
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('intermediate',' intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3))))])
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	melt because chunks are dependent on topology types: if there are NAs, then the chunks may change
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_CHAIN_TODI2','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI3','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))			
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, GROUP, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN','GROUP')]
	rpw		<- merge(rpw,tmp,by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN','W_FROM','W_TO','GROUP'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN','GROUP','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	#	add zeros
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	par.prior	<- 0.1	
	tmp		<- rplkl[, list(	TYPE=TYPE, POSTERIOR_ALPHA=KEFF+par.prior, POSTERIOR_BETA=sum(KEFF+par.prior)-(KEFF+par.prior))	, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','TYPE'))
	#	save
	save(rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/161219/RCCS_161219_w270_trmp_allpairs_posteriors.rda')
	#
	#	prepare colours
	#	
	cols.type	<- list()
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c("chain fm\nno intermediate\nclose","chain fm\nno intermediate","chain fm\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PiYG')[c(1,2,4)]),
					data.table(	TYPE= c("chain mf\nno intermediate\nclose","chain mf\nno intermediate","chain mf\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
					data.table(	TYPE= c("intermingled\nno intermediate\nclose","intermingled\nno intermediate","intermingled\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PRGn')[c(1,2,4)]),
					data.table(	TYPE= c("chain fm\nwith intermediate\nclose","chain fm\nwith intermediate","chain fm\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'BrBG'))[c(3,4,5)]),
					data.table(	TYPE= c("chain mf\nwith intermediate\nclose","chain mf\nwith intermediate","chain mf\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'PRGn'))[c(3,4,5)]),
					data.table(	TYPE= c("intermingled\nwith intermediate\nclose","intermingled\nwith intermediate","intermingled\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
					data.table(	TYPE= c("other close","other","other distant"),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
	cols.type[['TYPE_DIR_TODI7x3']]	<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c("pair close","pair","pair distant"),
							COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
					data.table(	TYPE= c("withintermediate close","withintermediate","withintermediate distant"),
							COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
					data.table(	TYPE= c("other close","other","other distant"),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI3x3']]	<- tmp2	
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c('close ancestral/\nintermingled', 'not close ancestral/\nintermingled'),
							COLS= brewer.pal(11, 'PRGn')[c(2,4)]),
					data.table(	TYPE= c('close other','not close other'),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,5)])))
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI2x2']]	<- tmp2	
	tmp2		<- data.table(	TYPE= c("no intermediate\n and close", "no intermediate\n but not close", "with intermediate\nor distant"),
			COLS= c(brewer.pal(11, 'RdBu')[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI3']]	<- tmp2
	tmp2		<- data.table(	TYPE= c("close", "intermediate\ndistance", "distant"),
			COLS= c(rev(brewer.pal(11, 'RdBu'))[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_DI']]	<- tmp2
	tmp2		<- data.table(	TYPE= c("ancestral/\nintermingled", "other"),
			COLS= c(rev(brewer.pal(9, 'Greens'))[2], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TO']]	<- tmp2	
	#
	#	make detailed plots for each pair
	#		topology assignments across genome
	#		distances across genome
	#		number of windows of certain type
	#		estimated posterior probabilities on unknown phylogenetic relationship
	#
	run		<- 'RCCS_161219_w270_dxxx'
	dir		<- rpw$DIR[1]	
	
	groups		<- c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI3','TYPE_PAIR_DI')
	group		<- c('TYPE_DIR_TODI7x3')
	
	widths	<- unit(c(4, 6), "null")
	heights	<- unit(c(2, 3.5, 4, 5), "null")
	height	<- 9
	if(group%in%c('TYPE_DIR_TODI7x3'))
	{
		widths	<- unit(c(4, 6), "null")
		heights	<- unit(c(2, 3.5, 4, 15), "null")
		height	<- 17
	}		
	if(group%in%c('TYPE_PAIR_TODI2x2'))
	{
		heights	<- unit(c(2, 3.5, 4, 3.75), "null")
		height	<- 8
	}
	if(group%in%c('TYPE_PAIR_TODI3','TYPE_PAIR_DI'))
	{
		heights	<- unit(c(2, 3.5, 4, 3.5), "null")
		height	<- 7
	}
	pdf(file=file.path(dir, paste(run,'-phsc-relationships_allpairs','_',group,'.pdf',sep='')), w=10, h=height)	
	plot.tmp	<- unique(subset(rplkl, GROUP==group, c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','LABEL')), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	setkey(plot.tmp, LABEL)		
	for(i in seq_len(nrow(plot.tmp)))
	{
		#pty_run	<- 38; id1		<- '16016_1_4'; id2		<- '15105_1_35'
		pty_run	<- plot.tmp[i, PTY_RUN]; id1		<- plot.tmp[i, FEMALE_SANGER_ID]; id2		<- plot.tmp[i, MALE_SANGER_ID]		
		tmp		<- subset(rpw, PTY_RUN==pty_run & GROUP==group & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
		p1		<- ggplot(tmp, aes(x=W_FROM)) +			
				geom_bar(aes(y=ID_R_MAX, colour=TYPE), stat='identity', fill='transparent') +
				geom_bar(aes(y=ID_R_MIN, fill=TYPE), stat='identity', colour='transparent') +
				labs(x='', y='number of reads', fill='phylogenetic\nrelationship\n', colour='phylogenetic\nrelationship\n') +
				scale_fill_manual(values=cols.type[[group]]) +
				scale_colour_manual(values=cols.type[[group]]) +
				scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw[, min(W_FROM)], rpw[, max(W_FROM)])) +
				scale_y_log10(breaks=c(10,100,1000,1e4,1e5)) +
				theme_bw() + theme(legend.position='left') +			
				guides(fill=FALSE, colour=FALSE)
		p2		<- ggplot(tmp, aes(x=W_FROM, y=PATRISTIC_DISTANCE)) +
				geom_point(size=1) +					
				labs(x='window start\n\n', y='patristic distance') +
				scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw[, min(W_FROM)], rpw[, max(W_FROM)])) +
				scale_y_log10(labels=percent, limits=c(0.001, 0.7), expand=c(0,0), breaks=c(0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25)) +
				theme_bw() + theme(legend.position='left')
		tmp		<- subset(rplkl, GROUP==group & PTY_RUN==pty_run & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
		p3		<- ggplot(tmp, aes(x=TYPE, y=KEFF, fill=TYPE)) + geom_bar(stat='identity') +
				scale_fill_manual(values=cols.type[[group]]) +
				theme_bw() + theme(legend.position='bottom') +
				coord_flip() + guides(fill=FALSE) +			
				labs(x='', y='\nnon-overlapping windows\n(number)', fill='phylogenetic\nrelationship\n')
		p4		<- ggplot(tmp, aes(x=TYPE, 	middle=qbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								ymin=qbeta(0.025, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								ymax=qbeta(0.975, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								lower=qbeta(0.25, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								upper=qbeta(0.75, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								fill=TYPE)) + 
				geom_boxplot(stat='identity') +
				scale_y_continuous(labels=percent, breaks=seq(0,1,0.2), limits=c(0,1), expand=c(0,0)) +
				scale_fill_manual(values=cols.type[[group]]) +
				theme_bw() + theme(legend.position='right', legend.margin=margin(0, .1, 0, 1, "cm")) +
				coord_flip() + guides(fill=guide_legend(ncol=1)) +
				labs(x='', y='\nposterior probability\n', fill='phylogenetic\nrelationship\n')				
		grid.newpage()
		pushViewport(viewport(layout = grid.layout(4, 2, heights=heights, widths=widths)))   
		grid.text(tmp[1,LABEL], gp=gpar(fontsize=10), vp=viewport(layout.pos.row = 1, layout.pos.col = 1:2))
		print(p1, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:2))
		print(p2, vp = viewport(layout.pos.row = 3, layout.pos.col = 1:2))         
		print(p3, vp = viewport(layout.pos.row = 4, layout.pos.col = 1))
		print(p4, vp = viewport(layout.pos.row = 4, layout.pos.col = 2))
	}
	dev.off()	
	
}

RakaiAll.addposteriors.pairs.170227<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/161219/RCCS_161219_w270_trmw_assignments_allpairs.rda')	
	tmp		<- subset(rpw, RUN%in%c("RCCS_161219_w270_d50_p001_mr20_mt1_cl2_d5") )
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170227/RCCS_170227_w270_trmw_assignments_allpairs.rda')
	set(tmp, NULL, setdiff(colnames(tmp),colnames(rpw)), NULL)
	rpw		<- rbind(rpw, tmp, fill=TRUE)	
	setnames(rpw, c('TYPE','TYPE_PAIR'), c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(!length(setdiff( 	subset(rpw, RUN=='RCCS_161219_w270_d50_p001_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],
							subset(rpw, RUN=='RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)))	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define topo/distance types of phylogenetic relationship between pairs
	#
	#	given the bimodal distribution of patristic distances from phyloscanner
	#	I ususally consider 3 distance states: close, neither close nor distant, distant
	#
	#	TYPE_DIR_TODI7x3		7 topology states (this is the "basic topological characterization" on Slack that I derive from Matthew's output) 
	#							chain_12_nointermediate, chain_12_withintermediate, chain_21_nointermediate, chain_21_withintermediate, intermingled_nointermediate, intermingled_withintermediate, other  
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI3x3		3 topology states
	#							pair (chain_XX_nointermediate+intermingled_nointermediate), withintermediate (XX_withintermediate), other
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI2x2		2 topology states
	#							ancestral/intermingled, other
	#							times 
	#							2 distance states (close, not close)
	#
	#	TYPE_PAIR_TODI3			non-factorial design that combines distant or withintermediate, and close or ancestral/intermingled
	#							the idea here is that 
	#							evidence for extra-couple transmission comes from large patristic distance OR intermediates	
	#							evidence for extra-couple transmission comes from ancestral/intermingled OR short patristic distance	
	#
	#	TYPE_PAIR_DI			3 distance states
	#
	#
	#	TYPE_DIR_TO3			3 topological direction states
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled -> NA.
	#
	#	TYPE_DIR_TODI3			3 topological direction states if close.  
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled / distant / intermediate distance -> NA.
	#
	#	TYPE_CHAIN_TODI2		2 topological states if close.  
	#							chain	(xxx_withintermediate_close); pair	(xxx_nointermediate_close, other)
	#							Rest: distant / intermediate distance -> NA.
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','pair','pair_distant'))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI3:= 'with intermediate\nor distant']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','other_close'))], 'TYPE_PAIR_TODI3', 'no intermediate\n and close')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','other'))], 'TYPE_PAIR_TODI3', 'no intermediate\n but not close')
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))] 	
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close'))], 'TYPE_PAIR_TODI2x2', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','pair_distant'))], 'TYPE_PAIR_TODI2x2', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('withintermediate_close','other_close'))], 'TYPE_PAIR_TODI2x2', 'close other')	
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	rpw[, TYPE_CHAIN_TODI2:= NA_character_]
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('intermediate',' intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3))))])
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	melt because chunks are dependent on topology types: if there are NAs, then the chunks may change
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_CHAIN_TODI2','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI3','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))			
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, GROUP, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO','GROUP'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	#	add zeros
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	par.prior	<- 0.1	
	tmp		<- rplkl[, list(	TYPE=TYPE, POSTERIOR_ALPHA=KEFF+par.prior, POSTERIOR_BETA=sum(KEFF+par.prior)-(KEFF+par.prior))	, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','TYPE'))
	#	save
	save(rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170227/RCCS_170227_w270_trmp_allpairs_posteriors.rda')
	#
	#	check runs missing?
	#
	tmp		<- unique(subset(rplkl, select=c(RUN, PTY_RUN)))
	tmp		<- merge(tmp,tmp[,list(N_RUNS=length(RUN)),by='PTY_RUN'],by='PTY_RUN')
	dcast.data.table(subset(tmp, N_RUNS<4), PTY_RUN~RUN, value.var='N_RUNS')	
	#	
	#	check presence/absence by dual
	#
	tmp		<- unique(subset(rplkl, select=c(RUN, PTY_RUN, FEMALE_SANGER_ID, MALE_SANGER_ID)))
	tmp		<- merge(tmp, tmp[, list(N_RUNS=length(RUN)),by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID')],by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID'))	
	subset(tmp, N_RUNS<4)[, table(PTY_RUN)]
	#
	#	prepare colours
	#	
	cols.type	<- list()
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c("chain fm\nno intermediate\nclose","chain fm\nno intermediate","chain fm\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PiYG')[c(1,2,4)]),
					data.table(	TYPE= c("chain mf\nno intermediate\nclose","chain mf\nno intermediate","chain mf\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
					data.table(	TYPE= c("intermingled\nno intermediate\nclose","intermingled\nno intermediate","intermingled\nno intermediate\ndistant"),
							COLS= brewer.pal(11, 'PRGn')[c(1,2,4)]),
					data.table(	TYPE= c("chain fm\nwith intermediate\nclose","chain fm\nwith intermediate","chain fm\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'BrBG'))[c(3,4,5)]),
					data.table(	TYPE= c("chain mf\nwith intermediate\nclose","chain mf\nwith intermediate","chain mf\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'PRGn'))[c(3,4,5)]),
					data.table(	TYPE= c("intermingled\nwith intermediate\nclose","intermingled\nwith intermediate","intermingled\nwith intermediate\ndistant"),
							COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
					data.table(	TYPE= c("other close","other","other distant"),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
	cols.type[['TYPE_DIR_TODI7x3']]	<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c("pair close","pair","pair distant"),
							COLS= brewer.pal(11, 'PuOr')[c(1,2,4)]),
					data.table(	TYPE= c("withintermediate close","withintermediate","withintermediate distant"),
							COLS= rev(brewer.pal(11, 'RdBu'))[c(3,4,5)]),
					data.table(	TYPE= c("other close","other","other distant"),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,4,5)])))
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI3x3']]	<- tmp2	
	tmp2		<- do.call('rbind',list(
					data.table(	TYPE= c('close ancestral/\nintermingled', 'not close ancestral/\nintermingled'),
							COLS= brewer.pal(11, 'PRGn')[c(2,4)]),
					data.table(	TYPE= c('close other','not close other'),
							COLS= rev(brewer.pal(11, 'RdGy'))[c(3,5)])))
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI2x2']]	<- tmp2	
	tmp2		<- data.table(	TYPE= c("no intermediate\n and close", "no intermediate\n but not close", "with intermediate\nor distant"),
			COLS= c(brewer.pal(11, 'RdBu')[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TODI3']]	<- tmp2
	tmp2		<- data.table(	TYPE= c("close", "intermediate\ndistance", "distant"),
			COLS= c(rev(brewer.pal(11, 'RdBu'))[c(2,4)], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_DI']]	<- tmp2
	tmp2		<- data.table(	TYPE= c("ancestral/\nintermingled", "other"),
			COLS= c(rev(brewer.pal(9, 'Greens'))[2], rev(brewer.pal(11, 'RdGy'))[4]))					
	tmp2		<- { tmp<- tmp2[, COLS]; names(tmp) <- tmp2[, TYPE]; tmp }
	cols.type[['TYPE_PAIR_TO']]	<- tmp2	
	#
	#	make detailed plots for each pair
	#		topology assignments across genome
	#		distances across genome
	#		number of windows of certain type
	#		estimated posterior probabilities on unknown phylogenetic relationship
	#
	run		<- 'RCCS_170227_w250_dxxx'
	dir		<- '/Users/Oliver/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170227'	
	
	groups		<- c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI3','TYPE_PAIR_DI')
	group		<- c('TYPE_DIR_TODI7x3')
	group		<- 'TYPE_PAIR_TODI3'
	
	widths	<- unit(c(4, 6), "null")
	heights	<- unit(c(2, 3.5, 4, 5), "null")
	height	<- 9
	if(group%in%c('TYPE_DIR_TODI7x3'))
	{
		widths	<- unit(c(4, 6), "null")
		heights	<- unit(c(2, 3.5, 4, 15), "null")
		height	<- 17
	}		
	if(group%in%c('TYPE_PAIR_TODI2x2'))
	{
		heights	<- unit(c(2, 3.5, 4, 3.75), "null")
		height	<- 8
	}
	if(group%in%c('TYPE_PAIR_TODI3','TYPE_PAIR_DI'))
	{
		heights	<- unit(c(2, 3.5, 4, 3.5), "null")
		height	<- 7
	}
	pdf(file=file.path(dir, paste(run,'-phsc-relationships_allpairs','_',group,'.pdf',sep='')), w=10, h=height)	
	plot.tmp	<- unique(subset(rplkl, GROUP==group, c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','LABEL')), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	setkey(plot.tmp, LABEL)		
	for(i in seq_len(nrow(plot.tmp)))
	{
		#pty_run	<- 38; id1		<- '16016_1_4'; id2		<- '15105_1_35'
		pty_run	<- plot.tmp[i, PTY_RUN]; id1		<- plot.tmp[i, FEMALE_SANGER_ID]; id2		<- plot.tmp[i, MALE_SANGER_ID]		
		tmp		<- subset(rpw, PTY_RUN==pty_run & GROUP==group & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
		p1		<- ggplot(tmp, aes(x=W_FROM)) +			
				geom_bar(aes(y=ID_R_MAX, colour=TYPE), stat='identity', fill='transparent') +
				geom_bar(aes(y=ID_R_MIN, fill=TYPE), stat='identity', colour='transparent') +
				labs(x='', y='number of reads', fill='phylogenetic\nrelationship\n', colour='phylogenetic\nrelationship\n') +
				scale_fill_manual(values=cols.type[[group]]) +
				scale_colour_manual(values=cols.type[[group]]) +
				scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw[, min(W_FROM)], rpw[, max(W_FROM)])) +
				scale_y_log10(breaks=c(10,100,1000,1e4,1e5)) +
				theme_bw() + theme(legend.position='left') +			
				guides(fill=FALSE, colour=FALSE)
		p2		<- ggplot(tmp, aes(x=W_FROM, y=PATRISTIC_DISTANCE)) +
				geom_point(size=1) +					
				labs(x='window start\n\n', y='patristic distance') +
				scale_x_continuous(breaks=seq(0,1e4,500), minor_breaks=seq(0,1e4,100), limits=c(rpw[, min(W_FROM)], rpw[, max(W_FROM)])) +
				scale_y_log10(labels=percent, limits=c(0.001, 0.7), expand=c(0,0), breaks=c(0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25)) +
				theme_bw() + theme(legend.position='left')
		tmp		<- subset(rplkl, GROUP==group & PTY_RUN==pty_run & FEMALE_SANGER_ID==id1 & MALE_SANGER_ID==id2)
		p3		<- ggplot(tmp, aes(x=TYPE, y=KEFF, fill=TYPE)) + geom_bar(stat='identity') +
				scale_fill_manual(values=cols.type[[group]]) +
				theme_bw() + theme(legend.position='bottom') +
				coord_flip() + guides(fill=FALSE) +			
				labs(x='', y='\nnon-overlapping windows\n(number)', fill='phylogenetic\nrelationship\n')
		p4		<- ggplot(tmp, aes(x=TYPE, 	middle=qbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								ymin=qbeta(0.025, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								ymax=qbeta(0.975, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								lower=qbeta(0.25, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								upper=qbeta(0.75, POSTERIOR_ALPHA, POSTERIOR_BETA), 
								fill=TYPE)) + 
				geom_boxplot(stat='identity') +
				scale_y_continuous(labels=percent, breaks=seq(0,1,0.2), limits=c(0,1), expand=c(0,0)) +
				scale_fill_manual(values=cols.type[[group]]) +
				theme_bw() + theme(legend.position='right', legend.margin=margin(0, .1, 0, 1, "cm")) +
				coord_flip() + guides(fill=guide_legend(ncol=1)) +
				labs(x='', y='\nposterior probability\n', fill='phylogenetic\nrelationship\n')				
		grid.newpage()
		pushViewport(viewport(layout = grid.layout(4, 2, heights=heights, widths=widths)))   
		grid.text(tmp[1,LABEL], gp=gpar(fontsize=10), vp=viewport(layout.pos.row = 1, layout.pos.col = 1:2))
		print(p1, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:2))
		print(p2, vp = viewport(layout.pos.row = 3, layout.pos.col = 1:2))         
		print(p3, vp = viewport(layout.pos.row = 4, layout.pos.col = 1))
		print(p4, vp = viewport(layout.pos.row = 4, layout.pos.col = 2))
	}
	dev.off()	
	
}

RakaiAll.confounded.RIDs.170309<- function()
{	
	tmp		<- RakaiCirc.epi.get.info.170208()
	rh		<- tmp$rh
	rd		<- tmp$rd
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/circumcision/RCCS_SeqInfo_160816.rda')		
	rs		<- subset(rs, !is.na(VISIT))
	tmp		<- unique(subset(rs, !is.na(SID), select=c(SID,RID)))
	setnames(tmp, 'SID', 'FILE_ID')
	tmp		<- merge(pty.runs, tmp, by='FILE_ID')
	ddup	<- subset(tmp[, list(N=length(FILE_ID)), by=c('PTY_RUN','RID')], N>1)
	ddup	<- merge(ddup, unique(subset(rs, !is.na(SID), select=c(SID,RID))), by='RID')
	setkey(ddup, PTY_RUN)
	ddup[, N:=NULL]
	save(ddup, file="~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns_confounded.rda")
}

RakaiAll.addposteriors.pairs.170309<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170309/RCCS_170309_w250_trmw_assignments_allpairs.rda')	
	tmp		<- copy(rpw)
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170227/RCCS_170227_w270_trmw_assignments_allpairs.rda')
	rpw[, UNINTERRUPTED:=NA]	
	rpw		<- rbind(rpw, tmp, fill=TRUE)	
	setnames(rpw, c('TYPE','TYPE_PAIR'), c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(!length(setdiff( 	subset(rpw, RUN=='RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],
							subset(rpw, RUN=='RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)))	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define topo/distance types of phylogenetic relationship between pairs
	#
	#	given the bimodal distribution of patristic distances from phyloscanner
	#	I ususally consider 3 distance states: close, neither close nor distant, distant
	#
	#	TYPE_DIR_TODI7x3		7 topology states (this is the "basic topological characterization" on Slack that I derive from Matthew's output) 
	#							chain_12_nointermediate, chain_12_withintermediate, chain_21_nointermediate, chain_21_withintermediate, intermingled_nointermediate, intermingled_withintermediate, other  
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI3x3		3 topology states
	#							pair (chain_XX_nointermediate+intermingled_nointermediate), withintermediate (XX_withintermediate), other
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI2x2		2 topology states
	#							ancestral/intermingled, other
	#							times 
	#							2 distance states (close, not close)
	#
	#	TYPE_PAIR_TODI3			non-factorial design that combines distant or withintermediate, and close or ancestral/intermingled
	#							the idea here is that 
	#							evidence for extra-couple transmission comes from large patristic distance OR intermediates	
	#							evidence for extra-couple transmission comes from ancestral/intermingled OR short patristic distance	
	#
	#	TYPE_PAIR_DI			3 distance states
	#
	#
	#	TYPE_DIR_TO3			3 topological direction states
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled -> NA.
	#
	#	TYPE_DIR_TODI3			3 topological direction states if close.  
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled / distant / intermediate distance -> NA.
	#
	#	TYPE_CHAIN_TODI2		2 topological states if close.  
	#							chain	(xxx_withintermediate_close); pair	(xxx_nointermediate_close, other)
	#							Rest: distant / intermediate distance -> NA.
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','pair','pair_distant'))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI3:= 'with intermediate\nor distant']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','other_close'))], 'TYPE_PAIR_TODI3', 'no intermediate\n and close')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','other'))], 'TYPE_PAIR_TODI3', 'no intermediate\n but not close')
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))] 	
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close'))], 'TYPE_PAIR_TODI2x2', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','pair_distant'))], 'TYPE_PAIR_TODI2x2', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('withintermediate_close','other_close'))], 'TYPE_PAIR_TODI2x2', 'close other')	
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	rpw[, TYPE_CHAIN_TODI2:= NA_character_]
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('intermediate',' intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3))))])
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	melt because chunks are dependent on topology types: if there are NAs, then the chunks may change
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_CHAIN_TODI2','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI3','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))			
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, GROUP, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO','GROUP'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	#	add zeros
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	par.prior	<- 0.1
	tmp			<- unique(rplkl, by=c('GROUP','TYPE'))[, list(N_TYPE=length(TYPE)), by='GROUP']
	tmp[, POSTERIOR_ALPHA:= par.prior/N_TYPE]
	tmp[, POSTERIOR_BETA:= par.prior*(1-1/N_TYPE)]
	set(tmp, NULL, 'N_TYPE', NULL)
	rplkl		<- merge(rplkl,tmp,by='GROUP')
	set(rplkl,NULL,'POSTERIOR_ALPHA', rplkl[,KEFF+POSTERIOR_ALPHA])
	set(rplkl,NULL,'POSTERIOR_BETA', rplkl[,NEFF-KEFF+POSTERIOR_BETA])
	#	save
	save(rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170309/RCCS_170309_w250_trmp_allpairs_posteriors.rda')
	#
	#	check runs missing?
	#
	if(0)
	{
		tmp		<- unique(subset(rplkl, select=c(RUN, PTY_RUN)))
		tmp		<- merge(tmp,tmp[,list(N_RUNS=length(RUN)),by='PTY_RUN'],by='PTY_RUN')
		dcast.data.table(subset(tmp, N_RUNS<4), PTY_RUN~RUN, value.var='N_RUNS')		
	}	
	
	#
	#	plot next to each other 
	#		'RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5' (using uninterrupted)
	#		'RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5' (not using uninterrupted)
	#
	outfile.base	<- '/Users/Oliver/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170309/RCCS_170309_w250_dxxx' 
	
	rps		<- subset(rplkl, GROUP%in%c('TYPE_PAIR_TODI3') & TYPE==c("no intermediate\n and close"))
	rps[, PROB:=pbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA)]
	rps		<- rps[, list(PROB_D=abs(diff(PROB))), by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID')]
	write.csv(subset(rps, PROB_D>.05), file=paste0(outfile.base,'-phsc-relationships_compare_contiguous_to_contiguousORuninterrupted.csv'))
	#	
	#
	group			<- 'TYPE_PAIR_TODI3'
	cols.run		<- c('black', 'blue')
	names(cols.run)	<- c('RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5','RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5')
	rplkl2		<- merge(subset(rps, PROB_D>.05), subset(rplkl, GROUP==group), by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID'))
	set(rplkl2, NULL, 'RUN', rplkl2[, factor(RUN, levels=rev(c('RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5','RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5')))])
	set(rplkl2, NULL, 'TYPE', rplkl2[, factor(TYPE, levels=rev(c("no intermediate\n and close","no intermediate\n but not close","with intermediate\nor distant")))])
	plot.file	<- paste0(outfile.base,'-phsc-relationships_compare_contiguous_to_contiguousORuninterrupted_EXP.pdf')
	plot.prob.select	<- "no intermediate\n and close"
	phsc.plot.windowassignments.by.runs(rplkl2, plot.file, plot.prob.select, cols.run, cols=NULL, group=group, height=40)
	#
	#
	
	#
	#	make detailed plots for each pair
	#		topology assignments across genome
	#		distances across genome
	#		number of windows of certain type
	#		estimated posterior probabilities on unknown phylogenetic relationship
	#
	
	#groups		<- c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI3','TYPE_PAIR_DI')
	run			<- 'RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5'
	run			<- 'RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5'
	group		<- c('TYPE_DIR_TODI7x3')		
	plot.select	<- unique(subset(rplkl, GROUP==group, c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','LABEL')), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	plot.select	<- merge(subset(rps, PROB_D>.05), plot.select, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	plot.file	<- paste0(outfile.base,'-phsc-relationships_allpairs','_',run,'_',group,'.pdf')
	rpw2		<- subset(rpw, RUN==run & GROUP==group)
	rplkl2		<- subset(rplkl, RUN==run & GROUP==group)
	phsc.plot.windowsummaries.for.pairs(plot.select, rpw2, rplkl2, plot.file, cols=NULL, group=group)
	
	group		<- 'TYPE_PAIR_TODI3'
	run			<- 'RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5'
	tmp			<- subset(rplkl, NEFF<5 & KEFF>2 & GROUP=='TYPE_PAIR_TODI3' & TYPE=='no intermediate\n and close', select=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	plot.select	<- unique(subset(merge(rplkl, tmp, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')), GROUP==group), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	rpw2		<- subset(rpw, RUN==run & GROUP==group)
	rplkl2		<- subset(rplkl, RUN==run & GROUP==group)
	plot.file	<- paste0(outfile.base,'-phsc-relationships_allpairsNEFF5KEFF2','_',run,'_',group,'.pdf')
	phsc.plot.windowsummaries.for.pairs(plot.select, rpw2, rplkl2, plot.file, cols=NULL, group=group)
	
	
	#
	#	plot phylogenies for all inconsistent pairs in either of the two runs
	#
	tmp		<- subset(rps, PROB_D>.05)
	set(tmp, NULL, 'FEMALE_SANGER_ID', tmp[, as.character(FEMALE_SANGER_ID)])
	set(tmp, NULL, 'MALE_SANGER_ID', tmp[, as.character(MALE_SANGER_ID)])
	#	check RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170309/RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5_phscout.rda')
	outfile	<- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170309/unint_RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5_'	
	invisible(sapply(seq_len(nrow(tmp)), function(ii)
					{	
						#ii<- 14
						ids			<- c(tmp[ii, MALE_SANGER_ID],tmp[ii, FEMALE_SANGER_ID])
						pty.run		<- tmp[ii, PTY_RUN]
						dfs			<- subset(dtrees, PTY_RUN==pty.run, select=c(PTY_RUN, W_FROM, W_TO, IDX))
						dfs[, TITLE:= dfs[, paste('male ', ids[1],'\nfemale ',ids[2],'\nrun ', pty.run, '\nwindow ', W_FROM,'-', W_TO,sep='')]]			
						plot.file	<- paste0(outfile, pty.run,'_M_',ids[1],'_F_', ids[2],'.pdf')						
						invisible(phsc.plot.phy.selected.individuals(phs, dfs, ids, plot.cols=c('red','blue'), group.redo=TRUE, plot.file=plot.file, pdf.h=150, pdf.rw=10, pdf.ntrees=20, pdf.title.size=40))					
					}))	
	invisible(sapply(seq_len(nrow(tmp)), function(ii)
					{	
						#ii<- 14
						ids			<- c(tmp[ii, MALE_SANGER_ID],tmp[ii, FEMALE_SANGER_ID])
						pty.run		<- tmp[ii, PTY_RUN]
						dfs			<- subset(dtrees, PTY_RUN==pty.run, select=c(PTY_RUN, W_FROM, W_TO, IDX))
						dfs[, TITLE:= dfs[, paste('male ', ids[1],'\nfemale ',ids[2],'\nrun ', pty.run, '\nwindow ', W_FROM,'-', W_TO,sep='')]]			
						plot.file	<- paste0(outfile, pty.run,'_M_',ids[1],'_F_', ids[2],'_droppedblacklisted.pdf')
						invisible(phsc.plot.phy.selected.individuals(phs, dfs, ids, plot.cols=c('red','blue'), drop.less.than.n.ids=2, drop.blacklisted=TRUE, plot.file=plot.file, pdf.h=30, pdf.rw=10, pdf.ntrees=20, pdf.title.size=40))					
					}))	
	#	test
	invisible(sapply(seq_len(nrow(tmp))[-c(1:7)], function(ii)
					{	
						#ii<- 1
						ids			<- c(tmp[ii, MALE_SANGER_ID],tmp[ii, FEMALE_SANGER_ID])
						pty.run		<- tmp[ii, PTY_RUN]
						dfs			<- subset(dtrees, PTY_RUN==pty.run, select=c(PTY_RUN, W_FROM, W_TO, IDX))
						dfs[, TITLE:= dfs[, paste('male ', ids[1],'\nfemale ',ids[2],'\nrun ', pty.run, '\nwindow ', W_FROM,'-', W_TO,sep='')]]								
						plot.file	<- paste0(outfile, pty.run,'_M_',ids[1],'_F_', ids[2],'_collapsed.pdf')					
						invisible(phsc.plot.phycollapsed.selected.individuals(phs, dfs, ids, plot.cols=c('red','blue'), drop.less.than.n.ids=2, plot.file=plot.file, pdf.h=10, pdf.rw=5, pdf.ntrees=20, pdf.title.size=10))					
					}))	
}

RakaiAll.addposteriors.pairs.170320<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170309/RCCS_170309_w250_trmw_assignments_allpairs.rda')	
	tmp		<- copy(rpw)
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170227/RCCS_170227_w270_trmw_assignments_allpairs.rda')
	rpw[, UNINTERRUPTED:=NA]	
	tmp		<- rbind(rpw, tmp, fill=TRUE)	
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/RCCS_170320_w250_trmw_assignments_allpairs.rda')		
	rpw		<- rbind(rpw, tmp)
	
	setnames(rpw, c('TYPE','TYPE_PAIR'), c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(!length(setdiff( 	subset(rpw, RUN=='RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],
							subset(rpw, RUN=='RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],
							subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)))	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define topo/distance types of phylogenetic relationship between pairs
	#
	#	given the bimodal distribution of patristic distances from phyloscanner
	#	I ususally consider 3 distance states: close, neither close nor distant, distant
	#
	#	TYPE_DIR_TODI7x3		7 topology states (this is the "basic topological characterization" on Slack that I derive from Matthew's output) 
	#							chain_12_nointermediate, chain_12_withintermediate, chain_21_nointermediate, chain_21_withintermediate, intermingled_nointermediate, intermingled_withintermediate, other  
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI3x3		3 topology states
	#							pair (chain_XX_nointermediate+intermingled_nointermediate), withintermediate (XX_withintermediate), other
	#							times 
	#							3 distance states
	#
	#	TYPE_PAIR_TODI2x2		2 topology states
	#							ancestral/intermingled, other
	#							times 
	#							2 distance states (close, not close)
	#
	#	TYPE_PAIR_TODI3			non-factorial design that combines distant or withintermediate, and close or ancestral/intermingled
	#							the idea here is that 
	#							evidence for extra-couple transmission comes from large patristic distance OR intermediates	
	#							evidence for extra-couple transmission comes from ancestral/intermingled OR short patristic distance	
	#
	#	TYPE_PAIR_DI			3 distance states
	#
	#
	#	TYPE_DIR_TO3			3 topological direction states
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled -> NA.
	#
	#	TYPE_DIR_TODI3			3 topological direction states if close.  
	#							chain_fm	(chain_fm_nointermediate regardless of distance); chain_mf	(chain_mf_nointermediate regardless of distance); other
	#							Rest: intermingled / distant / intermediate distance -> NA.
	#
	#	TYPE_CHAIN_TODI2		2 topological states if close.  
	#							chain	(xxx_withintermediate_close); pair	(xxx_nointermediate_close, other)
	#							Rest: distant / intermediate distance -> NA.
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','pair','pair_distant'))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI3:= 'with intermediate\nor distant']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close','other_close'))], 'TYPE_PAIR_TODI3', 'no intermediate\n and close')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','other'))], 'TYPE_PAIR_TODI3', 'no intermediate\n but not close')
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))] 	
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair_close'))], 'TYPE_PAIR_TODI2x2', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('pair','pair_distant'))], 'TYPE_PAIR_TODI2x2', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(TYPE_PAIR_TODI3x3%in%c('withintermediate_close','other_close'))], 'TYPE_PAIR_TODI2x2', 'close other')	
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	rpw[, TYPE_CHAIN_TODI2:= NA_character_]
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI2', 'pair')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')			
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('intermediate',' intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3))))])
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	melt because chunks are dependent on topology types: if there are NAs, then the chunks may change
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_CHAIN_TODI2','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI3','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))			
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, GROUP, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO','GROUP'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	#	add zeros
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	par.prior	<- 0.1
	tmp			<- unique(rplkl, by=c('GROUP','TYPE'))[, list(N_TYPE=length(TYPE)), by='GROUP']
	tmp[, POSTERIOR_ALPHA:= par.prior/N_TYPE]
	tmp[, POSTERIOR_BETA:= par.prior*(1-1/N_TYPE)]
	set(tmp, NULL, 'N_TYPE', NULL)
	rplkl		<- merge(rplkl,tmp,by='GROUP')
	set(rplkl,NULL,'POSTERIOR_ALPHA', rplkl[,KEFF+POSTERIOR_ALPHA])
	set(rplkl,NULL,'POSTERIOR_BETA', rplkl[,NEFF-KEFF+POSTERIOR_BETA])
	#	save
	save(rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/RCCS_170320_w250_trmp_allpairs_posteriors.rda')
	#
	#	check runs missing?
	#
	if(0)
	{
		tmp		<- unique(subset(rplkl, select=c(RUN, PTY_RUN)))
		tmp		<- merge(tmp,tmp[,list(N_RUNS=length(RUN)),by='PTY_RUN'],by='PTY_RUN')
		dcast.data.table(subset(tmp, N_RUNS<4), PTY_RUN~RUN, value.var='N_RUNS')		
	}	
	
	#
	#	plot next to each other 
	#		'RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5' (using uninterrupted)
	#		'RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5' (not using uninterrupted)
	#
	outfile.base	<- '/Users/Oliver/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/RCCS_170320_w250_dxxx' 
	
	rps		<- subset(rplkl, 	GROUP%in%c('TYPE_PAIR_TODI3') & 
					TYPE==c("no intermediate\n and close") &
					RUN%in%c('RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5','RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5'))
	rps[, PROB:=pbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA)]
	rps		<- rps[, list(PROB_D=abs(diff(PROB))), by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID')]
	write.csv(subset(rps, PROB_D>.05), file=paste0(outfile.base,'-phsc-relationships_compare_contiguous_to_contiguousORuninterrupted.csv'))
	#	
	#
	group			<- 'TYPE_PAIR_TODI3'
	cols.run		<- c('black', 'blue', 'green')
	names(cols.run)	<- c('RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5','RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5','RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')
	rplkl2		<- merge(subset(rps, PROB_D>.05), subset(rplkl, GROUP==group), by=c('PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID'))
	set(rplkl2, NULL, 'RUN', rplkl2[, factor(RUN, levels=rev(c('RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5','RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5','RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')))])
	set(rplkl2, NULL, 'TYPE', rplkl2[, factor(TYPE, levels=rev(c("no intermediate\n and close","no intermediate\n but not close","with intermediate\nor distant")))])
	plot.file	<- paste0(outfile.base,'-phsc-relationships_compare_contiguous_to_contiguousORuninterrupted_EXP.pdf')
	plot.prob.select	<- "no intermediate\n and close"
	phsc.plot.windowassignments.by.runs(rplkl2, plot.file, plot.prob.select, cols.run, cols=NULL, group=group, height=40)
	#
	#	make detailed plots for each pair with PROB_D > 0.05
	#	
	#groups		<- c('TYPE_DIR_TODI7x3','TYPE_PAIR_TODI3x3','TYPE_PAIR_TODI2x2','TYPE_PAIR_TODI3','TYPE_PAIR_DI')
	run			<- 'RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5'
	run			<- 'RCCS_170309_w250_d50_st20_mr20_mt1_cl2_d5'
	run			<- 'RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5'
	group		<- c('TYPE_DIR_TODI7x3')		
	plot.select	<- unique(subset(rplkl, GROUP==group, c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','LABEL')), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	plot.select	<- merge(subset(rps, PROB_D>.05), plot.select, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	plot.file	<- paste0(outfile.base,'-phsc-relationships_allpairs','_',run,'_',group,'.pdf')
	rpw2		<- subset(rpw, RUN==run & GROUP==group)
	rplkl2		<- subset(rplkl, RUN==run & GROUP==group)
	phsc.plot.windowsummaries.for.pairs(plot.select, rpw2, rplkl2, plot.file, cols=NULL, group=group)
	
	#
	#	check prior 
	#
	group		<- 'TYPE_DIR_TODI7x3'
	run			<- 'RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5'
	rps			<- subset(rplkl, NEFF<5 & KEFF>2 & GROUP=='TYPE_PAIR_TODI3' & TYPE=='no intermediate\n and close', select=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	plot.select	<- unique(subset(merge(rplkl, rps, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')), GROUP==group), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	rpw2		<- subset(rpw, RUN==run & GROUP==group)
	rplkl2		<- subset(rplkl, RUN==run & GROUP==group)
	plot.file	<- paste0(outfile.base,'-phsc-relationships_allpairsNEFF5KEFF2','_',run,'_',group,'.pdf')
	phsc.plot.windowsummaries.for.pairs(plot.select, rpw2, rplkl2, plot.file, cols=NULL, group=group)
	#	plot phylogenies for pairs with little evidence in either of the two runs	
	tmp			<- copy(rps)
	set(tmp, NULL, 'FEMALE_SANGER_ID', tmp[, as.character(FEMALE_SANGER_ID)])
	set(tmp, NULL, 'MALE_SANGER_ID', tmp[, as.character(MALE_SANGER_ID)])
	#	check RCCS_170227_w270_d50_st20_mr20_mt1_cl2_d5
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5_phscout.rda')
	outfile	<- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/prior_RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5_'		
	invisible(sapply(seq_len(nrow(tmp))[-c(1:7)], function(ii)
					{	
						#ii<- 1
						ids			<- c(tmp[ii, MALE_SANGER_ID],tmp[ii, FEMALE_SANGER_ID])
						pty.run		<- tmp[ii, PTY_RUN]
						dfs			<- subset(dtrees, PTY_RUN==pty.run, select=c(PTY_RUN, W_FROM, W_TO, IDX))
						dfs[, TITLE:= dfs[, paste('male ', ids[1],'\nfemale ',ids[2],'\nrun ', pty.run, '\nwindow ', W_FROM,'-', W_TO,sep='')]]								
						plot.file	<- paste0(outfile, pty.run,'_M_',ids[1],'_F_', ids[2],'_collapsed.pdf')					
						invisible(phsc.plot.phycollapsed.selected.individuals(phs, dfs, ids, plot.cols=c('red','blue'), drop.less.than.n.ids=2, plot.file=plot.file, pdf.h=10, pdf.rw=5, pdf.ntrees=20, pdf.title.size=10))					
					}))	
}


RakaiAll.addposteriors.pairs.170322<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	
	# load couples "rp"
	load("~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_info.rda")
	rc		<- copy(rp)	
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/circumcision/RCCS_SeqInfo_160816.rda')
	rs		<- subset(rs, !is.na(VISIT))		
	tmp		<- RakaiCirc.epi.get.info.170208()
	rh		<- tmp$rh
	rd		<- tmp$rd
	#	add sequence dates to rd
	tmp		<- unique(subset(rs, !is.na(PID), select=c(PID, DATE)),by='PID')
	setnames(tmp, 'DATE','SEQDATE')
	rd		<- merge(rd, tmp, by='PID',all.x=1)
	#	focus on those with PANGEA seqs
	rd		<- subset(rd, !is.na(PID))
	
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/RCCS_170320_w250_trmw_assignments_allpairs.rda')	
	tmp		<- copy(rpw)
	tmp[, TYPE_PAIR:=NULL]
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170322/RCCS_170322_w250_trmw_assignments_allpairs.rda')
	rpw		<- rbind(rpw, tmp)
	setnames(rpw, c('TYPE'), c('TYPE_DIR_TODI7x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170322_w250_d50_st20_trU_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170322_w250_d50_st20_trB_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170322_w250_d50_st20_trA_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)))	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define relationships
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))]
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close other')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close other')
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')
	rpw[, TYPE_PAIR_TODI:= 'distant']
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'likely pair')
	set(rpw, rpw[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'intermediate distance')	
	rpw[, TYPE_CHAIN_TODI:= 'distant']
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'chain')
	set(rpw, rpw[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'intermediate distance')		
	rpw[, TYPE_PAIR_TODI_NEW:= 'distant']
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI_NEW', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI_NEW', 'likely pair')	
	set(rpw, rpw[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI_NEW', 'intermediate distance')	
	#	TYPE_DIR_TODI7x3 condense other
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('other_withintermediate_distant','other_distant',gsub('other_withintermediate_close','other_close',gsub('other_withintermediate$','other',gsub('other_nointermediate$','other',gsub('other_nointermediate_distant','other_distant',TYPE_DIR_TODI7x3)))))])	
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('other_no','other\nno',gsub('([ho])intermediate','\\1 intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3)))))])	
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	melt because chunks are dependent on topology types: if there are NAs, then the chunks may change
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_PAIR_TODI','TYPE_PAIR_TODI_NEW','TYPE_CHAIN_TODI','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))			
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, GROUP, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO','GROUP'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	#	add zeros
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	rplkl[, PAR_PRIOR:=0.1]	
	tmp		<- copy(rplkl)
	set(tmp, NULL, 'PAR_PRIOR', 3)
	rplkl	<- rbind(rplkl, tmp)
	#
	tmp			<- unique(rplkl, by=c('GROUP','TYPE','PAR_PRIOR'))[, list(N_TYPE=length(TYPE)), by=c('GROUP','PAR_PRIOR')]
	tmp[, POSTERIOR_ALPHA:= PAR_PRIOR/N_TYPE]
	tmp[, POSTERIOR_BETA:= PAR_PRIOR*(1-1/N_TYPE)]
	set(tmp, NULL, 'N_TYPE', NULL)
	rplkl		<- merge(rplkl,tmp,by=c('GROUP','PAR_PRIOR'))
	set(rplkl,NULL,'POSTERIOR_ALPHA', rplkl[,KEFF+POSTERIOR_ALPHA])
	set(rplkl,NULL,'POSTERIOR_BETA', rplkl[,NEFF-KEFF+POSTERIOR_BETA])
	#	save
	save(rd, rh, ri, rs, rp, rpw, rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170322/RCCS_170322_w250_trmp_allpairs_posteriors_cmptoprv.rda')
}

RakaiAll.addposteriors.pairs.170405<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	
	# load couples "rp"
	load("~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_info.rda")
	rc		<- copy(rp)	
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/circumcision/RCCS_SeqInfo_160816.rda')
	rs		<- subset(rs, !is.na(VISIT))		
	tmp		<- RakaiCirc.epi.get.info.170208()
	rh		<- tmp$rh
	rd		<- tmp$rd
	#	add sequence dates to rd
	tmp		<- unique(subset(rs, !is.na(PID), select=c(PID, DATE)),by='PID')
	setnames(tmp, 'DATE','SEQDATE')
	rd		<- merge(rd, tmp, by='PID',all.x=1)
	#	focus on those with PANGEA seqs
	rd		<- subset(rd, !is.na(PID))
	
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170320/RCCS_170320_w250_trmw_assignments_allpairs.rda')	
	tmp		<- copy(rpw)
	tmp[, TYPE_PAIR:=NULL]
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170405/RCCS_170405_w250_trmw_assignments_allpairs.rda')
	rpw		<- rbind(rpw, tmp)
	setnames(rpw, c('TYPE'), c('TYPE_DIR_TODI7x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(	!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170405_w250_d50_st20_trB_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170405_w250_d50_st20_trB_blNormed_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170405_w250_d50_st20_trB_blNormedOnFly_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170405_w250_d50_st20_trU_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170320_w250_d50_st20_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170405_w250_d50_st20_trC_mr20_mt1_cl2_d5')[, sort(unique(PTY_RUN))]	)))	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	define relationships
	rpw[, TYPE_PAIR_DI:= cut(PATRISTIC_DISTANCE, breaks=c(1e-12,0.02,0.05,2), labels=c('close','intermediate\ndistance','distant'))]
	rpw[, TYPE_PAIR_TO:= 'other']
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'ancestral/\nintermingled')
	rpw[, TYPE_PAIR_TODI2x2:= 'not close other']
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('chain',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('intermingled',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'not close ancestral/\nintermingled')
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close other')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TO', 'close other')
	rpw[, TYPE_DIR_TO3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TO3', 'intermingled')		
	rpw[, TYPE_DIR_TODI3:= NA_character_]
	set(rpw, rpw[, which(grepl('chain_fm',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'fm')
	set(rpw, rpw[, which(grepl('chain_mf',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'mf')
	set(rpw, rpw[, which(grepl('intermingled',TYPE_DIR_TODI7x3) & grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_DIR_TODI3', 'intermingled')
	rpw[, TYPE_PAIR_TODI:= 'distant']
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'likely pair')
	set(rpw, rpw[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'intermediate distance')	
	rpw[, TYPE_CHAIN_TODI:= 'distant']
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'chain')
	set(rpw, rpw[, which(grepl('other',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI', 'chain')
	set(rpw, rpw[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_CHAIN_TODI', 'intermediate distance')		
	rpw[, TYPE_PAIR_TODI_NEW:= 'distant']
	set(rpw, rpw[, which(grepl('withintermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI_NEW', 'chain')
	set(rpw, rpw[, which(grepl('nointermediate',TYPE_DIR_TODI7x3) & grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI_NEW', 'likely pair')	
	set(rpw, rpw[, which(!grepl('distant',TYPE_DIR_TODI7x3) & !grepl('close',TYPE_DIR_TODI7x3))], 'TYPE_PAIR_TODI_NEW', 'intermediate distance')	
	#	TYPE_DIR_TODI7x3 condense other
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('other_withintermediate_distant','other_distant',gsub('other_withintermediate_close','other_close',gsub('other_withintermediate$','other',gsub('other_nointermediate$','other',gsub('other_nointermediate_distant','other_distant',TYPE_DIR_TODI7x3)))))])	
	set(rpw, NULL, 'TYPE_DIR_TODI7x3', rpw[, gsub('other_no','other\nno',gsub('([ho])intermediate','\\1 intermediate',gsub('intermediate_','intermediate\n',gsub('intermingled_','intermingled\n',gsub('(chain_[fm][mf])_','\\1\n',TYPE_DIR_TODI7x3)))))])	
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	melt because chunks are dependent on topology types: if there are NAs, then the chunks may change
	rpw		<- melt(rpw, measure.vars=c('TYPE_PAIR_TO','TYPE_PAIR_TODI','TYPE_PAIR_TODI_NEW','TYPE_CHAIN_TODI','TYPE_PAIR_DI','TYPE_PAIR_TODI2x2','TYPE_DIR_TODI7x3','TYPE_DIR_TO3','TYPE_DIR_TODI3'), variable.name='GROUP', value.name='TYPE')
	set(rpw, NULL, 'TYPE', rpw[,gsub('_',' ', TYPE)])	
	rpw		<- subset(rpw, !is.na(TYPE))			
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, GROUP, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO','GROUP'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','GROUP','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','GROUP','TYPE')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE')]	
	#	add zeros
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	for(x in colnames(rplkl)[grepl('TYPE_PD_MEAN',colnames(rplkl))])
		set(rplkl, which(rplkl[[x]]>0), x, 1L)	
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	rplkl[, PAR_PRIOR:=0.1]	
	tmp		<- copy(rplkl)
	set(tmp, NULL, 'PAR_PRIOR', 3)
	rplkl	<- rbind(rplkl, tmp)
	#
	tmp			<- unique(rplkl, by=c('GROUP','TYPE','PAR_PRIOR'))[, list(N_TYPE=length(TYPE)), by=c('GROUP','PAR_PRIOR')]
	tmp[, POSTERIOR_ALPHA:= PAR_PRIOR/N_TYPE]
	tmp[, POSTERIOR_BETA:= PAR_PRIOR*(1-1/N_TYPE)]
	set(tmp, NULL, 'N_TYPE', NULL)
	rplkl		<- merge(rplkl,tmp,by=c('GROUP','PAR_PRIOR'))
	set(rplkl,NULL,'POSTERIOR_ALPHA', rplkl[,KEFF+POSTERIOR_ALPHA])
	set(rplkl,NULL,'POSTERIOR_BETA', rplkl[,NEFF-KEFF+POSTERIOR_BETA])
	#	save
	save(rd, rh, ri, rs, rp, rpw, rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170405/RCCS_170405_w250_trmp_allpairs_posteriors_cmptoprv.rda')
}

RakaiAll.addposteriors.pairs.170410<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	
	# load couples "rp"
	load("~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_info.rda")
	rc		<- copy(rp)	
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/circumcision/RCCS_SeqInfo_160816.rda')
	rs		<- subset(rs, !is.na(VISIT))		
	tmp		<- RakaiCirc.epi.get.info.170208()
	rh		<- tmp$rh
	rd		<- tmp$rd
	#	add sequence dates to rd
	tmp		<- unique(subset(rs, !is.na(PID), select=c(PID, DATE)),by='PID')
	setnames(tmp, 'DATE','SEQDATE')
	rd		<- merge(rd, tmp, by='PID',all.x=1)
	#	focus on those with PANGEA seqs
	rd		<- subset(rd, !is.na(PID))
	
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170410/RCCS_170410_w250_trmw_assignments_allpairs.rda')	
	setnames(rpw, c('TYPE'), c('TYPE_DIR_TODI7x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(	!length(setdiff( 	subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trU_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trC_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_blInScriptNormed_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_blNormedOnFly_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))]	))
	)	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','TYPE_DIR_TODI7x3')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','TYPE_DIR_TODI7x3')]
	#
	#	define relationship groups
	#	
	relationship.groups	<- c('TYPE_PAIR_DI','TYPE_DIR_TODI3','TYPE_DIRSCORE_TODI3','TYPE_PAIR_TODI','TYPE_PAIRSCORE_TODI')
	rpw					<- phsc.get.pairwise.relationships(rpw, get.groups=relationship.groups, make.pretty.labels=TRUE)
	rplkl				<- phsc.get.pairwise.relationships(rplkl, get.groups=relationship.groups, make.pretty.labels=TRUE)
	#	melt relationship groups
	rpw		<- melt(rpw, measure.vars=c(relationship.groups,'TYPE_DIR_TODI7x3'), variable.name='GROUP', value.name='TYPE')		
	rpw		<- subset(rpw, !is.na(TYPE))			
	rplkl	<- melt(rplkl, measure.vars=c(relationship.groups,'TYPE_DIR_TODI7x3'), variable.name='GROUP', value.name='TYPE')
	rplkl	<- subset(rplkl, !is.na(TYPE))
	#	sum K and KEFF of same relationship state
	rplkl	<- rplkl[, list(V=sum(V)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE','STAT')]
	#	add zero-count relationship states (change to wide table and set NA's to zero's)
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	#	melt again
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])
	#	expand KEFF and K columns now that everything is done
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	#	prior depends on number of states (ie T).
	tmp			<- unique(subset(rplkl, select=c('GROUP','TYPE')))[, list(N_TYPE=length(TYPE)), by=c('GROUP')]
	tmp			<- tmp[, list(PAR_PRIOR=phsc.get.prior.parameter.n0(N_TYPE, n.type=2, n.obs=3, confidence.cut=0.5)), by=c('GROUP','N_TYPE')]
	rplkl		<- merge(rplkl, tmp, by=c('GROUP'))
	rplkl[, POSTERIOR_ALPHA:= PAR_PRIOR/N_TYPE+KEFF]
	rplkl[, POSTERIOR_BETA:= PAR_PRIOR*(1-1/N_TYPE)+NEFF-KEFF]	
	#	save
	save(rd, rh, ri, rs, rp, rpw, rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170410/RCCS_170410_w250_trmp_allpairs_posteriors_cmptoprv.rda')
}

RakaiAll.addposteriors.pairs.170426<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	
	# load couples "rp"
	load("~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_info.rda")
	rc		<- copy(rp)	
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/circumcision/RCCS_SeqInfo_160816.rda')
	rs		<- subset(rs, !is.na(VISIT))		
	tmp		<- RakaiCirc.epi.get.info.170208()
	rh		<- tmp$rh
	rd		<- tmp$rd
	#	add sequence dates to rd
	tmp		<- unique(subset(rs, !is.na(PID), select=c(PID, DATE)),by='PID')
	setnames(tmp, 'DATE','SEQDATE')
	rd		<- merge(rd, tmp, by='PID',all.x=1)
	#	focus on those with PANGEA seqs
	rd		<- subset(rd, !is.na(PID))
	
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load transmission summaries
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170410/RCCS_170410_w250_trmw_assignments_allpairs.rda')	
	tmp		<- subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_blInScriptNormed_mr20_mt1_cl3.5_d8')
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170426/RCCS_170426_w250_trmw_assignments_allpairs.rda')
	rpw		<- rbind(rpw, tmp)
	setnames(rpw, c('TYPE'), c('TYPE_DIR_TODI7x3'))
	#
	rpw[, table(RUN, useNA='if')]
	#	check if we have all pty.runs
	stopifnot(	!length(setdiff( 	subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_blInScriptNormed_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170426_w250_d50_p10_blNormed_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))]	)),
			!length(setdiff( 	subset(rpw, RUN=='RCCS_170410_w250_d50_st20_trB_blInScriptNormed_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))],subset(rpw, RUN=='RCCS_170426_w250_d50_p35_blNormed_mr20_mt1_cl3.5_d8')[, sort(unique(PTY_RUN))]	))
	)	
	#	define plotting order: largest number of trm assignments	
	tmp		<- rpw[, list( WIN_TR=length(which(grepl('close|anc',TYPE_DIR_TODI7x3))) ), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[, list(WIN_TR=max(WIN_TR)), by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	tmp		<- tmp[order(-WIN_TR),]
	tmp[, PLOT_ID:=seq_len(nrow(tmp))]	
	#	make pair data table
	rp		<- copy(rpw)
	set(rp, NULL, c('RUN','FILE','DIR','W_FROM','W_TO','TYPE_RAW','TYPE_DIR_TODI7x3','PATRISTIC_DISTANCE','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R'), NULL)
	rp		<- unique(rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]
	#	define label
	tmp		<- merge(tmp, rp, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))
	setkey(tmp, PLOT_ID)
	tmp[, LABEL_SH:= factor(PLOT_ID, levels=PLOT_ID, labels=paste(COUPID, ' ( M:', MALE_SANGER_ID,' F:',FEMALE_SANGER_ID, ' run:', PTY_RUN, ' )', sep=''))]
	tmp[, LABEL:= factor(PLOT_ID, levels=PLOT_ID, labels=paste('Pair ', COUPID,' -phsc.run=',PTY_RUN, '\nPerson M ', MALE_RID, ' ', MALE_SANGER_ID,' -loc:',MALE_REGION,',',MALE_COMM_NUM,',',MALE_HH_NUM,' -birth:',MALE_BIRTHDATE,' -neg:',MALE_LASTNEGDATE,' -pos:',MALE_FIRSTPOSDATE,' -seq:',MALE_SEQDATE,
							'\n<->', 
							'\nPerson F ', FEMALE_RID, ' ', FEMALE_SANGER_ID,' -loc:',FEMALE_REGION,',',FEMALE_COMM_NUM,',',FEMALE_HH_NUM,' -birth:',FEMALE_BIRTHDATE,' -neg:',FEMALE_LASTNEGDATE,' -pos:',FEMALE_FIRSTPOSDATE,' -seq:',FEMALE_SEQDATE,																				
							'\n',sep=''))]
	tmp		<- subset(tmp, select=c(PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, LABEL, LABEL_SH))
	rpw		<- merge(tmp, rpw, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))
	#	define min/max reads
	tmp		<- rpw[, list(	ID_R_MIN=min(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R),
					ID_R_MAX=max(MALE_SANGER_ID_R, FEMALE_SANGER_ID_R)), by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM')]
	rpw		<- merge(rpw, tmp, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM'))
	#
	#	identify chunks of contiguous windows
	#	
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	rpw.slide	<- rpw[, {
				ans	<- NA_integer_
				tmp	<- diff(W_FROM)
				if(length(tmp))
					ans	<- min(tmp)
				list(W_SLIDE=ans)
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','RUN')][, min(na.omit(W_SLIDE))]
	#	define chunks
	setkey(rpw, RUN, PTY_RUN, MALE_SANGER_ID, FEMALE_SANGER_ID, W_FROM)
	tmp		<- rpw[, {
				tmp<- as.integer( c(TRUE,(W_FROM[-length(W_FROM)]+rpw.slide)!=W_FROM[-1]) )
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK=cumsum(tmp))
			}, by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','W_FROM','W_TO'))
	#	define chunk length in terms of non-overlapping windows	& number of windows in chunk
	tmp		<- rpw[, {
				list(W_FROM=W_FROM, W_TO=W_TO, CHUNK_L=(max(W_TO+1L)-min(W_FROM))/(W_TO[1]+1L-W_FROM[1]), CHUNK_N=length(W_FROM))
			}, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK')]
	rpw		<- merge(rpw,tmp,by=c('RUN','PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID','CHUNK','W_FROM','W_TO'))	
	#	for each chunk, count: windows by type and effective length of chunk
	#	then sum chunks
	rplkl	<- rpw[, list(	K= length(W_FROM), KEFF= length(W_FROM)/CHUNK_N[1] * CHUNK_L[1]), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','CHUNK','LABEL','LABEL_SH','TYPE_DIR_TODI7x3')]	
	rplkl	<- rplkl[, list(STAT=c('K','KEFF'), V=c(sum(K),sum(KEFF))), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','TYPE_DIR_TODI7x3')]
	#
	#	define relationship groups
	#	
	relationship.groups	<- c('TYPE_PAIR_DI','TYPE_PAIRSCORE_DI','TYPE_DIR_TODI3','TYPE_DIRSCORE_TODI3','TYPE_PAIR_TODI','TYPE_PAIRSCORE_TODI')
	rpw					<- phsc.get.pairwise.relationships(rpw, get.groups=relationship.groups, make.pretty.labels=TRUE)
	rplkl				<- phsc.get.pairwise.relationships(rplkl, get.groups=relationship.groups, make.pretty.labels=TRUE)
	#	melt relationship groups
	rpw		<- melt(rpw, measure.vars=c(relationship.groups,'TYPE_DIR_TODI7x3'), variable.name='GROUP', value.name='TYPE')		
	rpw		<- subset(rpw, !is.na(TYPE))			
	rplkl	<- melt(rplkl, measure.vars=c(relationship.groups,'TYPE_DIR_TODI7x3'), variable.name='GROUP', value.name='TYPE')
	rplkl	<- subset(rplkl, !is.na(TYPE))
	#	sum K and KEFF of same relationship state
	rplkl	<- rplkl[, list(V=sum(V)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP','TYPE','STAT')]
	#	add zero-count relationship states (change to wide table and set NA's to zero's)
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH~GROUP+TYPE+STAT, value.var='V')
	for(x in setdiff(colnames(rplkl),c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH','GROUP')))
		set(rplkl, which(is.na(rplkl[[x]])), x, 0L)	
	#	melt again
	rplkl	<- melt(rplkl, id.vars=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','LABEL','LABEL_SH'), variable.name='GROUP', value.name='V')
	rplkl[, STAT:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])	
	rplkl[, TYPE:= gsub('.*_([^_]+)$','\\1',GROUP)]
	set(rplkl, NULL, 'GROUP', rplkl[, gsub('(.*)_[^_]+$','\\1',GROUP)])
	#	expand KEFF and K columns now that everything is done
	rplkl	<- dcast.data.table(rplkl, RUN+PTY_RUN+FEMALE_SANGER_ID+MALE_SANGER_ID+LABEL+LABEL_SH+GROUP+TYPE~STAT, value.var='V')	
	#	calculate N and NEFF
	tmp		<- rplkl[, list(N= sum(K), NEFF= sum(KEFF)), by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP')]	
	rplkl	<- merge(rplkl, tmp, by=c('RUN','PTY_RUN','FEMALE_SANGER_ID','MALE_SANGER_ID','GROUP'))
	#	add parameters for marginal posterior probabilities (alpha, beta)
	#	prior depends on number of states (ie T).
	tmp			<- unique(subset(rplkl, select=c('GROUP','TYPE')))[, list(N_TYPE=length(TYPE)), by=c('GROUP')]
	tmp			<- tmp[, list(PAR_PRIOR=phsc.get.prior.parameter.n0(N_TYPE, n.type=2, n.obs=3, confidence.cut=0.5)), by=c('GROUP','N_TYPE')]
	rplkl		<- merge(rplkl, tmp, by=c('GROUP'))
	rplkl[, POSTERIOR_ALPHA:= PAR_PRIOR/N_TYPE+KEFF]
	rplkl[, POSTERIOR_BETA:= PAR_PRIOR*(1-1/N_TYPE)+NEFF-KEFF]	
	#	save
	save(rd, rh, ri, rs, rp, rpw, rplkl, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170426/RCCS_170426_w250_trmp_allpairs_posteriors_cmptoprv.rda')
}

RakaiAll.analyze.pairs.170322.triangles<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	
	run		<- 'RCCS_170322_w250_trB_triangle_'
	dir		<- '/Users/Oliver/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170322'	
	#	load denominator
	tmp		<- RakaiCirc.epi.get.info.170208()
	ra		<- tmp$ra		
	# load couples "rp"
	load("~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_info.rda")
	rc		<- copy(rp)
	# load pty.run
	load( "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/Couples_PANGEA_HIV_n4562_Imperial_v151113_phscruns.rda" )
	# load rd, rh, rs, rp, rpw, rplkl, ptc
	load('~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/couples/170322/RCCS_170322_w250_trmp_allpairs_posteriors_cmptoprv.rda')
	#
	# select final run
	#
	rpw		<- subset(rpw, RUN%in%c("RCCS_170322_w250_d50_st20_trB_mr20_mt1_cl2_d5") )
	rplkl	<- subset(rplkl, RUN%in%c("RCCS_170322_w250_d50_st20_trB_mr20_mt1_cl2_d5") & PAR_PRIOR==3 )	
	#	add info on pair types to rplkl
	rp		<- copy(rpw)
	set(rp, NULL, c('DIR','FILE','RUN','W_FROM','W_TO','TYPE_RAW','TYPE','GROUP','PATRISTIC_DISTANCE','UNINTERRUPTED','CONTIGUOUS','PATHS_12','PATHS_21','MALE_SANGER_ID_L','MALE_SANGER_ID_R','FEMALE_SANGER_ID_L','FEMALE_SANGER_ID_R','CHUNK','CHUNK_L','CHUNK_N','ID_R_MIN','ID_R_MAX'), NULL)
	rp		<- unique(rp)
	#	make COUPID
	rp[, COUPID:= paste0(MALE_RID,':',FEMALE_RID)]	
	#	add PAIR_TYPE
	tmp		<- unique(subset(rc, select=c(COUPID, MALE_HH_NUM, FEMALE_HH_NUM, COUP_SC, PAIR_TYPE)))	
	setnames(tmp, 'COUP_SC', 'COUP_TYPE')
	rp	<- merge(rp, tmp, by=c('COUPID','MALE_HH_NUM','FEMALE_HH_NUM'),all.x=1)
	set(rp, rp[, which(is.na(PAIR_TYPE))], 'PAIR_TYPE', 'not registered as couple')	
	tmp		<- subset(rp, select=c(FEMALE_SANGER_ID, MALE_SANGER_ID, MALE_RID, FEMALE_RID, COUPID, PTY_RUN, COUP_TYPE, PAIR_TYPE))
	rplkl	<- merge(tmp, rplkl, by=c('MALE_SANGER_ID','FEMALE_SANGER_ID','PTY_RUN'))	
	#	select first couples for whom transmission cannot be excluded 
	#	no intermediate\n and close > 50%
	confidence.cut	<- 0.5
	rpd		<- subset(rplkl, GROUP%in%c('TYPE_PAIR_TODI_NEW') & TYPE=='likely pair' & pbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA, lower.tail=FALSE)>confidence.cut)		
	tmp		<- subset(rpd, select=c(PTY_RUN, FEMALE_SANGER_ID, MALE_SANGER_ID))
	tmp		<- merge(tmp, rplkl, by=c('PTY_RUN','MALE_SANGER_ID','FEMALE_SANGER_ID'))	
	rmf		<- subset(tmp, GROUP%in%c('TYPE_DIR_TODI3') & TYPE=='mf' & pbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA, lower.tail=FALSE)>confidence.cut)
	rfm		<- subset(tmp, GROUP%in%c('TYPE_DIR_TODI3') & TYPE=='fm' & pbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA, lower.tail=FALSE)>confidence.cut)
	rex		<- subset(rplkl, GROUP%in%c('TYPE_PAIR_TODI') & TYPE=='distant' & pbeta(0.5, POSTERIOR_ALPHA, POSTERIOR_BETA, lower.tail=FALSE)>confidence.cut)	
	cat('\ncouples with phyloscanner assessment, n=',				nrow(unique(rplkl, by='COUPID')))
	cat('\ncouples not implicated in transmission, n=',				nrow(unique(rex, by='COUPID')))
	cat('\ncouples that are likely pairs, n=',						nrow(unique(rpd, by='COUPID')))
	cat('\ncouples that are likely pairs with evidence M->F, n=',	nrow(unique(rmf, by='COUPID')))
	cat('\ncouples that are likely pairswith evidence F->M, n=',	nrow(unique(rfm, by='COUPID')))
	#	couples that are likely pairs, n= 160
	#	likely direction resolved, n= 113
	#	couples that are likely pairs with evidence M->F, n= 78
	#	couples that are likely pairs with evidence F->M, n= 35
	
	#	define two helper data.table
	rmf		<- merge(unique(subset(rmf, select=COUPID)), unique(rp, by='COUPID'),by='COUPID')
	rmf[, PHSC_DIR:='m->f']
	rfm		<- merge(unique(subset(rfm, select=COUPID)), unique(rp, by='COUPID'),by='COUPID')
	rfm[, PHSC_DIR:='f->m']
	rtr		<- rbind(rmf, rfm)	
	rtr[, AGEDIFF:= FEMALE_BIRTHDATE-MALE_BIRTHDATE]
	rtr[, AVGAGE:= (MALE_BIRTHDATE+FEMALE_BIRTHDATE)/2]	
	rtr2	<- copy(rmf)
	setnames(rtr2,colnames(rtr2),gsub('FEMALE','REC',colnames(rtr2)))
	setnames(rtr2,colnames(rtr2),gsub('MALE','TR',colnames(rtr2)))
	tmp		<- copy(rfm)
	setnames(tmp,colnames(tmp),gsub('FEMALE','TR',colnames(tmp)))
	setnames(tmp,colnames(tmp),gsub('MALE','REC',colnames(tmp)))
	rtr2	<- rbind(rtr2,tmp)
	
	#	get triangles
	require(igraph)
	tri		<- subset(rtr2, select=c(TR_RID, REC_RID))			
	tri.igr	<- graph.data.frame(tri, directed=TRUE, vertices=NULL)
	tri		<- data.table(	RID=V(tri.igr)$name, 
			CLU=clusters(tri.igr, mode="weak")$membership)
	tmp2	<- tri[, list(CLU_SIZE=length(RID)), by='CLU']
	setkey(tmp2, CLU_SIZE)	
	tri		<- merge(tri, tmp2, by='CLU')
	tri 	<- subset(tri, CLU_SIZE>2)
	setnames(tri, 'RID', 'TR_RID')
	tri.pr	<- merge(rtr2, tri, by='TR_RID')
	setnames(tri, 'TR_RID', 'RID')
	tri		<- merge(tri, tri[, list(TRANSMITTER= factor(RID%in%rtr2$TR_RID, levels=c(TRUE,FALSE), labels=c('transmitter','recipient'))), by='RID'], by='RID')	
	tri		<- merge(tri, unique(ri,by='RID'), by='RID')
	tmp		<- tri[, list(RID=RID, SAME_HH_AS_TRM=factor(HH_NUM%in%HH_NUM[TRANSMITTER=='transmitter'], levels=c(TRUE,FALSE), labels=c('y','n'))), by='CLU']
	tri		<- merge(tri, tmp, by=c('CLU','RID'))
	setkey(tri, CLU, TRANSMITTER, SAME_HH_AS_TRM, HH_NUM)
	tri[, IND:= 1:3]
	
	
	tri.t	<- suppressWarnings( dcast.data.table(melt(tri, id.vars=c('CLU','IND'), measure.vars=c('TRANSMITTER','SEX','BIRTHDATE','SAME_HH_AS_TRM','MARSTAT','OCCUP_OLLI','SEXP1YR','SEXP1OUT','FIRSTPOSDATE','RECENTCD4','RECENTCD4DATE','RECENTVL','RECENTVLDATE','ARVSTARTDATE')), CLU+variable~IND, value.var='value') )
	write.csv(tri.t, file=file.path(dir, paste0(run, 'tritable.csv')))
	
	
	tmp		<- graph.data.frame(subset(tri.pr, select=c(TR_RID, REC_RID)), directed=TRUE, vertices=NULL)
	plot(tmp, vertex.size=2, vertex.label.cex=0.25, edge.arrow.size=0.5, layout=layout.fruchterman.reingold(tmp, niter=1e3) )
	
	cat(paste('\nprocess cluster no',clu))
	tmp					<- subset(df.inds, IDCLU==clu & IDPOP>=0, select=c(IDPOP, GENDER, TIME_SEQ))
	tmp[, IS_SEQ:= tmp[, factor(!is.na(TIME_SEQ), label=c('N','Y'), levels=c(FALSE, TRUE))]]
	
	V(clu.igr)$color	<- ifelse( get.vertex.attribute(clu.igr, 'IS_SEQ')=='Y', 'green', 'grey90' )
	V(clu.igr)$shape	<- ifelse( get.vertex.attribute(clu.igr, 'GENDER')=='M', 'circle', 'square' )
	
	par(mai=c(0,0,1,0))
	plot(clu.igr, main=paste('IDCLU=',clu,sep=''), vertex.size=2, vertex.label.cex=0.25, edge.arrow.size=0.5, layout=layout.fruchterman.reingold(clu.igr, niter=1e3) )
	legend('bottomright', fill=c('green','grey90'), legend=c('sequence sampled','sequence not sampled'), bty='n')
	legend('bottomleft', legend=c('square= Female','circle= Male'), bty='n')
	
	
	setnames(tmp, 'IDPOP', 'IDREC')
	df.trms		<- merge( df.trms, tmp