misc/phyloscan.Rakai.phylogeography.R

RakaiFull.phylogeography.180521.prevalence.gender<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(ggmap)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	require(gtools)	#rdirichlet
	require(rethinking)	# STAN wrapper
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_withmetadata.rda"
	outfile.base			<- gsub('_withmetadata.rda','',infile)	
	load(infile)	
	setkey(rtp, MALE_RID, FEMALE_RID)
	rtp[, PAIRID:= seq_len(nrow(rtp))]
	rtpdm	<- subset(rtp, grepl('mf|fm',SELECT))
	rtpdm[, PAIR_COMM_TYPE:= FEMALE_COMM_TYPE]
	set(rtpdm, rtpdm[, which(FEMALE_COMM_TYPE!=MALE_COMM_TYPE)], 'PAIR_COMM_TYPE', 'mixed')
	set(rtpdm, rtpdm[, which(is.na(FEMALE_EDUCAT))], 'FEMALE_EDUCAT', 'Unknown')
	set(rtpdm, rtpdm[, which(is.na(MALE_EDUCAT))], 'MALE_EDUCAT', 'Unknown')
	rtpdm[, COUPLE2:= factor(COUPLE=='no couple', levels=c(TRUE,FALSE), labels=c('no couple','couple'))]
	rtpdm[, SAMEHH:= factor(FEMALE_HH_NUM==MALE_HH_NUM, levels=c(TRUE,FALSE), labels=c('same hh','different hh'))]	
	rtpdm[, PAIR_COMM:= MALE_COMM_NUM_A]
	set(rtpdm, rtpdm[, which(FEMALE_COMM_NUM_A!=MALE_COMM_NUM_A)], 'PAIR_COMM', 'mixed')
	rtpdm[, MALE_SEXP1OUT2:= factor(MALE_SEXP1OUT=='0', levels=c(TRUE,FALSE),labels=c('none','1+'))]
	set(rtpdm, rtpdm[, which(MALE_SEXP1OUT=='Unknown')], 'MALE_SEXP1OUT2', 'Unknown')
	rtpdm[, FEMALE_SEXP1OUT2:= factor(FEMALE_SEXP1OUT=='0', levels=c(TRUE,FALSE),labels=c('none','1+'))]
	set(rtpdm, rtpdm[, which(FEMALE_SEXP1OUT=='Unknown')], 'FEMALE_SEXP1OUT2', 'Unknown')
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/community_hivprev_byGenderStudyVisit.rda"
	load(infile)
	df	<- as.data.table(melt(female.negative, varnames=c('COMM_NUM', 'VISIT'), value.name='FEMALE_NEG'))	
	tmp	<- as.data.table(melt(male.negative, varnames=c('COMM_NUM', 'VISIT'), value.name='MALE_NEG'))	
	df	<- merge(df, tmp, by=c('COMM_NUM', 'VISIT'), all=TRUE)	
	tmp	<- as.data.table(melt(female.positive, varnames=c('COMM_NUM', 'VISIT'), value.name='FEMALE_POS'))
	df	<- merge(df, tmp, by=c('COMM_NUM', 'VISIT'), all=TRUE)
	tmp	<- as.data.table(melt(male.positive, varnames=c('COMM_NUM', 'VISIT'), value.name='MALE_POS'))
	df	<- merge(df, tmp, by=c('COMM_NUM', 'VISIT'), all=TRUE)
	#	add geo-locations
	load("~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/community_geography.rda")
	comgps	<- as.data.table(comgps)
	set(comgps, NULL, 'COMM_NUM', comgps[, as.integer(as.character(COMM_NUM))])
	df	<- merge(df, comgps, by='COMM_NUM', all.x=TRUE)	
	set(df, NULL, 'COMM_NUM', df[, gsub('^107$|^16$','16m',gsub('^776$|^51$','51m',gsub('^4$|^24$','24m',gsub('^1$|^22$','22m',as.character(COMM_NUM)))))])
	for(x in c('FEMALE_NEG','MALE_NEG','FEMALE_POS','MALE_POS'))
		set(df, which(is.na(df[[x]])), x, 0L)
	df	<- subset(df, FEMALE_NEG!=0 | MALE_NEG!=0 | FEMALE_POS!=0 | MALE_POS!=0)
	#	sum by merged communities that are essentially the same
	df	<- df[, list(FEMALE_NEG=sum(FEMALE_NEG), MALE_NEG=sum(MALE_NEG), FEMALE_POS=sum(FEMALE_POS), MALE_POS=sum(MALE_POS), longitude=mean(longitude), latitude=mean(latitude)), by= c('COMM_NUM','VISIT')]
	#	add anonymized ID
	tmp	<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv', stringsAsFactors=FALSE))
	tmp	<- unique(subset(tmp, select=c(COMM_NUM, COMM_NUM_A, COMM_TYPE)))
	df	<- merge(df, tmp, by='COMM_NUM', all.x=TRUE)
	#	subset(df, is.na(COMM_NUM_A))	#comms with missing anonymized IDs are some historic ones, not relevant
	df	<- subset(df, !is.na(COMM_NUM_A))
	
	#	select relevant communities
	if(0)
	{
		tmp	<- unique(subset(rtpdm, select=c(MALE_COMM_NUM,MALE_COMM_TYPE)))
		setnames(tmp, c('MALE_COMM_NUM','MALE_COMM_TYPE'), c('COMM_NUM','COMM_TYPE'))
		tmp2<- unique(subset(rtpdm, select=c(FEMALE_COMM_NUM,FEMALE_COMM_TYPE)))
		setnames(tmp2, c('FEMALE_COMM_NUM','FEMALE_COMM_TYPE'), c('COMM_NUM','COMM_TYPE'))
		tmp	<- unique(rbind(tmp, tmp2))
		df	<- merge(tmp, df, by='COMM_NUM')		
	}
	df[, FEMALE:= FEMALE_NEG+FEMALE_POS]
	df[, MALE:= MALE_NEG+MALE_POS]
	df[, POS:= MALE_POS+FEMALE_POS]
	df[, NEG:= MALE_NEG+FEMALE_NEG]
	#	new clean community numbers
	dg	<- subset(df, VISIT%in%c(15,15.1,16))
	tmp	<- unique(subset(dg, select=c(COMM_NUM,COMM_TYPE)))
	tmp[, COMM_NUM2:= seq_len(nrow(tmp))]
	tmp[, COMM_TYPE2:= as.integer(as.character(factor(COMM_TYPE,levels=c('agrarian','trading','fisherfolk'), labels=c('1','2','3'))))]
	dg	<- merge(dg, tmp, by=c('COMM_NUM','COMM_TYPE'))
	
	if(0)
	{
		#	any change in time in gender specific prevalence?
		#	not really
		dg	<- subset(df, VISIT%in%c(14,15,15.1,16,17))
		ggplot(dg, aes(x=factor(VISIT), group=COMM_NUM)) +
				geom_line(aes(y= MALE_POS/MALE), colour='blue') +
				geom_line(aes(y= FEMALE_POS/FEMALE), colour='hotpink2') +
				geom_point(aes(y= MALE_POS/MALE), colour='blue') +
				geom_point(aes(y= FEMALE_POS/FEMALE), colour='hotpink2') +
				theme_bw() +
				facet_wrap(~COMM_TYPE+COMM_NUM, ncol=4) +
				labs(x='\nvisit', y='gender specific HIV prevalence estimate\n')
		ggsave(file=paste0(outfile.base,'_trmMF_prevalenceratios.pdf'), w=9, h=9)		
	}
	
	#	estimate prevalences and prevalence ratio for each community		
	mpr.1 	<- map2stan(
			alist(
					FEMALE_POS ~ dbinom(FEMALE, seropos_f),
					MALE_POS ~ dbinom(MALE, seropos_m),
					logit(seropos_f) <- afc[COMM_NUM2],
					logit(seropos_m) <- amc[COMM_NUM2],					
					afc[COMM_NUM2] ~ dnorm(0, 10),
					amc[COMM_NUM2] ~ dnorm(0, 10)										
			),
			data=as.data.frame(dg), 
			start=list(	afc=rep(0,length(unique(dg$COMM_NUM2))), amc=rep(0,length(unique(dg$COMM_NUM2)))),
			warmup=5e2, iter=5e3, chains=1, cores=4
	)	
	post	<- extract.samples(mpr.1)			
	dgg		<- unique(subset(dg, select=c(COMM_NUM, COMM_TYPE, COMM_NUM2, COMM_NUM_A, longitude, latitude)))
	tmp		<- dgg[, 
			list(	STAT=c('M','CL','IL','IU','CU'),
					PF= as.numeric(quantile(logistic(post$afc[, COMM_NUM2]), prob=c(0.5,0.025,0.25,0.75,0.975))),
					PM= as.numeric(quantile(logistic(post$amc[, COMM_NUM2]), prob=c(0.5,0.025,0.25,0.75,0.975))),
					RFM= as.numeric(quantile(logistic(post$afc[, COMM_NUM2])/logistic(post$amc[, COMM_NUM2]), prob=c(0.5,0.025,0.25,0.75,0.975)))
			), 
			by='COMM_NUM2']
	tmp		<- melt(tmp, id.vars=c('COMM_NUM2','STAT'))	
	tmp		<- dcast.data.table(tmp, variable+COMM_NUM2~STAT, value.var='value')
	dgg		<- merge(dgg, tmp, by='COMM_NUM2')
	#
	#	 plot
	#
	tmp		<- subset(dgg, variable!='RFM')
	set(tmp, tmp[, which(COMM_TYPE!='fisherfolk')], 'COMM_TYPE', 'inland communities')
	set(tmp, tmp[, which(COMM_TYPE=='fisherfolk')], 'COMM_TYPE', 'fishing sites')
	set(tmp, tmp[, which(variable=='PM')], 'variable', 'men')
	set(tmp, tmp[, which(variable=='PF')], 'variable', 'women')
	tmp2	<- subset(tmp, variable=='men', c(COMM_NUM, COMM_NUM_A, COMM_TYPE, M))	
	tmp2	<- tmp2[order(COMM_TYPE, M),]
	tmp2[, DUMMY:= seq_len(nrow(tmp2))]
	set(tmp2, NULL, 'COMM_NUM_A', tmp2[, factor(DUMMY, levels=DUMMY, labels=COMM_NUM_A)])
	tmp2	<- subset(tmp2, select=c(COMM_NUM,COMM_NUM_A))
	tmp[, COMM_NUM_A:=NULL]
	tmp		<- merge(tmp, tmp2, by='COMM_NUM')
	ggplot(tmp, aes(x=COMM_NUM_A)) +
			geom_boxplot(aes(middle=M, lower=IL, upper=IU, ymin=CL, ymax=CU, fill=variable), stat='identity') +
			theme_bw() + 
			facet_grid(~COMM_TYPE, space='free', scale='free') +
			scale_fill_manual(values=c('women'='hotpink2', 'men'='deepskyblue')) +
			scale_y_continuous(labels=scales:::percent, expand=c(0,0), lim=c(0,0.6)) +
			labs(x='\ncommunity', y='HIV-1 prevalence\namong study participants\n', fill='gender')
	ggsave(file=paste0(outfile.base,'_trmMF_estimatedprevalence.pdf'), w=12, h=6)
	
	ggplot(subset(dgg, variable!='RFM'), aes(x=COMM_NUM3)) +
			geom_boxplot(aes(middle=M, lower=IL, upper=IU, ymin=CL, ymax=CU, fill=variable), stat='identity') +
			theme_bw() + 
			#facet_grid(~COMM_TYPE, space='free', scale='free') +
			scale_fill_manual(values=c('PF'='hotpink2', 'PM'='deepskyblue')) +
			scale_y_continuous(labels=scales:::percent) +
			labs(x='\ncommunity', y='HIV prevalence\nvisits 15, 15.1, 16\n', fill='gender')
	ggsave(file=paste0(outfile.base,'_trmMF_estimatedprevalence_orderedbyFMratio.pdf'), w=12, h=6)
	save(dgg, file=paste0(outfile.base,'_trmMF_estimatedFMprevalence.rda'))
	#
	#	plot on map
	#	
	require(ggmap)
	#zm		<- get_googlemap(center="rakai district uganda", zoom=10, maptype="hybrid")
	style	<- "feature:road|color:0x17202A&style=feature:water|color:0x677996&style=feature:landscape.natural|color:0xedecda&style=feature:administrative|visibility=off"
	zm		<- get_googlemap(c(lon=31.65, lat=-0.66), scale=2, size=c(550,550), zoom=10, maptype="road", style=style)
	#	plot number of observed recipients and observed transmitters
	ggmap(zm) +
			geom_point(data=subset(dgg, variable=='RFM'), aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=M), size=7) +
			geom_text(data=subset(dgg, variable=='RFM'), aes(x=longitude, y=latitude, label=COMM_NUM), nudge_x=0, nudge_y=0, size=3, colour='black') + 
			scale_colour_gradient2(trans='log', breaks=c(1,1.25,1.5,2,2.5,3), low="deepskyblue", mid="orange", high='red', midpoint=log(2.2)) +
			labs(x='\nlongitude',y='latitude\n',colour='female to male\nprevalence ratio', pch='community\ntype')
	ggsave(file=paste0(outfile.base,'trmMF_prevalenceratio_on_map.pdf'), w=7, h=7)	
	ggmap(zm) +
			geom_point(data=subset(dgg, variable=='PF'), aes(x=longitude, y=latitude, pch=factor(COMM_TYPE=='fisherfolk', levels=c(TRUE,FALSE),labels=c('fishing\nsite','inland\ncommunity')), colour=M), size=9) +
			scale_colour_gradient2(breaks=c(0,0.1,0.2,0.3,0.4,0.5,0.6), lim=c(0.04,0.56), labels=scales:::percent, low="cyan", mid='darkorchid2', high='firebrick1', midpoint=0.35) +
			scale_shape_manual(values=c('fishing\nsite'=17,'inland\ncommunity'=19)) +
			labs(x='\nlongitude',y='latitude\n',colour='female\nHIV-1 prevalence', pch='')
	ggsave(file=paste0(outfile.base,'trmMF_prevalenceF_on_map.pdf'), w=7, h=7, useDingbats=FALSE)
	ggmap(zm) +
			geom_point(data=subset(dgg, variable=='PM'), aes(x=longitude, y=latitude, pch=factor(COMM_TYPE=='fisherfolk', levels=c(TRUE,FALSE),labels=c('fishing\nsite','inland\ncommunity')), colour=M), size=9) +			 
			scale_colour_gradient2(breaks=c(0,0.1,0.2,0.3,0.4,0.5,0.6), lim=c(0.04,0.56), labels=scales:::percent, low="cyan", mid='darkorchid2', high='firebrick1', midpoint=0.35) +
			scale_shape_manual(values=c('fishing\nsite'=17,'inland\ncommunity'=19)) +
			labs(x='\nlongitude',y='latitude\n',colour='male\nHIV-1 prevalence', pch='')
	ggsave(file=paste0(outfile.base,'trmMF_prevalenceM_on_map.pdf'), w=7, h=7, useDingbats=FALSE)	
}



RakaiFull.phylogeography.170421<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(ggmap)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	require(gtools)	#rdirichlet
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_170428_withmetadata.rda"		
	outfile.base			<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_170428_"	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_170516_withmetadata.rda"		
	outfile.base			<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_170516_"	
	zm		<- get_googlemap(center="rakai district uganda", zoom=10, maptype="hybrid")
	zc		<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv', stringsAsFactors=FALSE))
	load(infile)	
	#set(rtpdm, NULL, c('MALE_FIRSTPOSVIS.y','MALE_FIRSTPOSDATE.y','FEMALE_FIRSTPOSVIS.y','FEMALE_FIRSTPOSDATE.y'), NULL)
	#setnames(rtpdm, c('MALE_FIRSTPOSVIS.x','MALE_FIRSTPOSDATE.x','FEMALE_FIRSTPOSVIS.x','FEMALE_FIRSTPOSDATE.x'), c('MALE_FIRSTPOSVIS','MALE_FIRSTPOSDATE','FEMALE_FIRSTPOSVIS','FEMALE_FIRSTPOSDATE'))
	nrow(rtpdm)
	#	stage 1: 307 transmissions with direction resolved to 307 recipients with unique transmitter
	#	stage 2: 252 transmissions with unique transmitters	
	
	rtpdm[, AGEDIFF:= rtpdm[, FEMALE_BIRTHDATE-MALE_BIRTHDATE]]
	set(rtpdm, NULL, 'PAIR_ID', rtpdm[, paste0(MALE_RID,'-',FEMALE_RID)])
	set(rtpdm, NULL, 'MALE_SEX', 'M')
	set(rtpdm, NULL, 'FEMALE_SEX', 'F')	
	
	subset(rsm, MIN_PNG_OUTPUT>0)[, quantile(MIN_PNG_OUTPUT/HIV, p=c(0,0.5,1))]
	
	#
	#	some helper data.tables
	rmf		<- subset(rtpdm, TYPE=='mf')
	rfm		<- subset(rtpdm, TYPE=='fm')
	rtr2	<- copy(rmf)
	setnames(rtr2,colnames(rtr2),gsub('FEMALE','REC',colnames(rtr2)))
	setnames(rtr2,colnames(rtr2),gsub('MALE','TR',colnames(rtr2)))
	tmp		<- copy(rfm)
	setnames(tmp,colnames(tmp),gsub('FEMALE','TR',colnames(tmp)))
	setnames(tmp,colnames(tmp),gsub('MALE','REC',colnames(tmp)))
	rtr2	<- rbind(rtr2,tmp)
	
	#
	#	Bayesian source attriution model
	#	adjust for incomplete sampling
	#
	dc	<- rtr2[, list(TR_OBS=length(PAIR_ID)), by=c('TR_COMM_NUM_A','REC_COMM_NUM_A')]	
	#	Bayesian model: add uniform prior
	if(0)
	{
		#	(which is Dirichlet 1 among all communities pairs that have a connection either way
		tmp	<- subset(dc, select=c(REC_COMM_NUM_A, TR_COMM_NUM_A))	
		setnames(tmp, c('REC_COMM_NUM_A','TR_COMM_NUM_A'), c('TR_COMM_NUM_A','REC_COMM_NUM_A'))
		tmp	<- merge(tmp, dc, all.x=1)
		tmp	<- subset(tmp, is.na(TR_OBS))
		set(tmp, NULL, 'TR_OBS', 0)
		dc	<- rbind(dc, tmp)		
	}
	if(1)
	{
		#	This is a bit non-standard, I just don t want the prior to have a large impact, so I chose a sparse one. 
		#	(which is Dirichlet 1 among all communities pairs that are closest)
		#	always add self if not present
		tmp	<- subset(dc[, list(UNOBSERVED_SELF=!any(REC_COMM_NUM_A==TR_COMM_NUM_A)), by='TR_COMM_NUM_A'],UNOBSERVED_SELF, TR_COMM_NUM_A)
		tmp[, REC_COMM_NUM_A:=TR_COMM_NUM_A]
		tmp[, TR_OBS:=0]
		dc	<- rbind(dc, tmp)
		#	ensure each community has at least 1 non-self community, if not add closest other community
		#	I really want to keep this sparse, so do not consider non-self to all communities
		tmp	<- unique(zc, by='COMM_NUM_A')
		tmp	<- as.data.table(t(sapply(seq_len(nrow(tmp)), function(i)
								{
									z<- sort( sqrt( (tmp[,longitude]-tmp[i,longitude])^2+(tmp[,latitude]-tmp[i,latitude])^2 ), index.return=TRUE)$ix
									c('TR_COMM_NUM_A'=tmp[i, COMM_NUM_A], 'REC_COMM_NUM_A'=tmp[z[2],COMM_NUM_A])
								})))	
		z	<- subset(dc[, list(REC_N_OBS=length(REC_COMM_NUM_A)), by='TR_COMM_NUM_A'], REC_N_OBS==1)
		tmp	<- merge(tmp, z, by='TR_COMM_NUM_A')
		tmp	<- merge(subset(tmp, select=c(TR_COMM_NUM_A, REC_COMM_NUM_A)), dc, all.x=1, by=c('REC_COMM_NUM_A','TR_COMM_NUM_A'))
		tmp	<- subset(tmp, is.na(TR_OBS))
		set(tmp, NULL, 'TR_OBS', 0)
		dc	<- rbind(dc, tmp)
	}
	#rsm[, list(ELIGIBLE_AVG=sum(ELIGIBLE_AVG)), by='COMM_TYPE']
	dc[, TR_PRIOR:= 0.5]
	#
	#	Bayesian model first hierarchy: define Beta posterior for sampling probabilities (all alpha and betas)
	#
	tmp	<- subset(rsm, select=c(COMM_NUM_A, ELIGIBLE_AVG, PARTICIPATED_AVG, HIV, MIN_PNG_OUTPUT))
	tmp[, P_PART_EMP:= PARTICIPATED_AVG/ELIGIBLE_AVG]
	tmp[, P_PART_ALPHA:= round(PARTICIPATED_AVG)+1]
	tmp[, P_PART_BETA:= round(ELIGIBLE_AVG-PARTICIPATED_AVG)+1]
	tmp[, P_SEQ_EMP:= MIN_PNG_OUTPUT/HIV]
	tmp[, P_SEQ_ALPHA:= round(MIN_PNG_OUTPUT)+1]
	tmp[, P_SEQ_BETA:= round(HIV-MIN_PNG_OUTPUT)+1]	
	setnames(tmp, colnames(tmp), paste0('TR_',colnames(tmp)))
	dc	<- merge(dc, tmp, by='TR_COMM_NUM_A')
	setnames(tmp, colnames(tmp), gsub('TR_','REC_',colnames(tmp)))
	dc	<- merge(dc, tmp, by='REC_COMM_NUM_A')
	#
	#	Bayesian model second hierarchy: draw unobserved data to augment likelihood
	#
	mc.it	<- 1e4
	dcb		<- dc[, {
				tmp	<- 	rbeta(mc.it, TR_P_PART_ALPHA, TR_P_PART_BETA)*
						rbeta(mc.it, TR_P_SEQ_ALPHA, TR_P_SEQ_BETA)*
						rbeta(mc.it, REC_P_PART_ALPHA, REC_P_PART_BETA)*
						rbeta(mc.it, REC_P_SEQ_ALPHA, REC_P_SEQ_BETA)
				#print(tmp)
				tmp	<- rnbinom(mc.it, TR_OBS+TR_PRIOR, tmp)
				#print(tmp)
				list(MONTE_CARLO_IT=seq_len(mc.it), TR_PRIOR=TR_PRIOR, TR_OBS=TR_OBS, TR_MISS= tmp)
			}, by=c('REC_COMM_NUM_A','TR_COMM_NUM_A')]	
	#
	#	Bayesian model second hierarchy: Dirichlet posterior for transmission from community i to j, pi_ij with pi_ij summing to 1
	#
	tmp		<- dcb[, list(	REC_COMM_NUM_A= REC_COMM_NUM_A, 
					TR_COMM_NUM_A= TR_COMM_NUM_A, 
					PI_IJ_ALPHA= TR_OBS+TR_MISS+TR_PRIOR				
			), by='MONTE_CARLO_IT']
	dcb		<- merge(dcb, tmp, by=c('REC_COMM_NUM_A','TR_COMM_NUM_A','MONTE_CARLO_IT'))
	#
	#	this is the end of the source attribution inference on the WAIFM matrix
	#
	
	#
	#	transmission hubs
	#	proportion of transmitters in community i that have a recipient outside community
	#	this is a summary on the estimated posterior density of the WAIFM matrix
	#	TODO missing community 94
	#
	#	get parameters of posterior under augmented likelihood
	z		<- dcb[, {
				z<- which(REC_COMM_NUM_A==TR_COMM_NUM_A)
				list(P_RECOUTSIDE_ALPHA= sum(PI_IJ_ALPHA[-z]), P_RECOUTSIDE_BETA= sum(PI_IJ_ALPHA[z]))
			}, by=c('TR_COMM_NUM_A','MONTE_CARLO_IT')]	
	#	aggregate and get quantiles
	mc.it	<- 1e2
	z		<- z[, {												
				tmp		<- quantile(rbeta(length(P_RECOUTSIDE_ALPHA)*mc.it, P_RECOUTSIDE_ALPHA, P_RECOUTSIDE_BETA), p=seq(0,1,0.01))
				list(P=seq(0,1,0.01), Q=unname(tmp))
			}, by='TR_COMM_NUM_A']
	#	subset to main quantities of interest
	z		<- dcast.data.table(subset(z, P%in%c(0.03, 0.25, 0.5, 0.75, 0.97)), TR_COMM_NUM_A~P, value.var='Q')
	setnames(z, c('0.03','0.25','0.5','0.75','0.97'), c('REC_P_OUTSIDE_CL','REC_P_OUTSIDE_IL','REC_P_OUTSIDE_M','REC_P_OUTSIDE_IU','REC_P_OUTSIDE_CU'))
	#	add TR_OBS from TR_COMM_NUM_A
	z		<- merge(z, dc[, list(REC_N_OBS=sum(TR_OBS)), by='TR_COMM_NUM_A'], by='TR_COMM_NUM_A')
	setnames(z, 'TR_COMM_NUM_A', 'COMM_NUM_A')	
	#	now after analysis, remove 94 with unknown long / lat
	#z		<- subset(z, COMM_NUM!='94')
	z	<- merge(z, unique(subset(zc, select=c(COMM_NUM_A, COMM_TYPE))), by='COMM_NUM_A')
	ggplot(z, aes(x=COMM_NUM_A, middle=REC_P_OUTSIDE_M, min=REC_P_OUTSIDE_CL, max=REC_P_OUTSIDE_CU, lower=REC_P_OUTSIDE_IL, upper=REC_P_OUTSIDE_IU, fill=REC_P_OUTSIDE_M)) + 
			geom_boxplot(outlier.shape=NA, stat='identity') +
			scale_fill_gradientn(colours=c("#2166AC","#F7F7F7","#B2182B"), values=rescale(c(0, .2, 1)), space = "Lab") +
			#geom_point(data=subset(z, BS==0), pch=2, colour='red') +
			scale_y_continuous(labels=scales:::percent, breaks=seq(0,1,0.2)) +
			facet_grid(~COMM_TYPE, scales='free_x', space='free_x') +
			labs(x='\ncommunity', y="proportion of transmitters from the community\nthat have a recipient outside the community") +
			theme_bw() + guides(fill='none')
	ggsave(file=paste0(outfile.base,'_hubs_transmitters_adjusted_confidenceintervals_anonmyized.pdf'), w=9, h=5)		
	
	#
	#	transmission hubs - central estimates
	#	among transmitters how many have a recipient outside community
	#	adjusted for sampling
	#	TODO missing community 94
	z[, REC_P_OUTSIDE_PREC:= 1 - REC_P_OUTSIDE_IU + REC_P_OUTSIDE_IL]	
	z	<- merge(unique(subset(zc, select=c(COMM_NUM_A, longitude, latitude, LONG_A, LAT_A)), by='COMM_NUM_A'), z, by='COMM_NUM_A')	
	ggmap(zm) +
			geom_point(data=z, aes(x=longitude, y=latitude, pch=COMM_TYPE, size=REC_N_OBS, fill=100*REC_P_OUTSIDE_M), stroke=1.5, alpha=0.8) +
			scale_fill_gradientn(colours=c("#2166AC","#F7F7F7","#B2182B"), values=rescale(c(0, .2, 1)), space = "Lab") +			
			scale_size(breaks=c(5,10,20,40,80), range=c(5,20))+
			scale_shape_manual(values=c('agrarian'=21, 'trading'=22, 'fisherfolk'=23)) +
			#scale_colour_brewer(palette='Dark2') +
			theme(legend.position='bottom', legend.box = "vertical") +			
			labs(	size="transmissions with transmitters from community",
					pch="community type",
					fill="proportion of transmitters from the community\nthat have a recipient outside the community")
	ggsave(file=paste0(outfile.base,'_hubs_transmitters_adjusted.pdf'), w=10, h=10)
	ggmap(zm) +
			geom_point(data=z, aes(x=LONG_A, y=LAT_A, pch=COMM_TYPE, size=REC_N_OBS, fill=100*REC_P_OUTSIDE_M), stroke=1.5, alpha=0.8) +
			scale_fill_gradientn(colours=c("#2166AC","#F7F7F7","#B2182B"), values=rescale(c(0, .2, 1)), space = "Lab") +			
			scale_size(breaks=c(5,10,20,40,80), range=c(5,20))+
			scale_shape_manual(values=c('agrarian'=21, 'trading'=22, 'fisherfolk'=23)) +
			#scale_colour_brewer(palette='Dark2') +
			theme(legend.position='bottom', legend.box = "vertical") +			
			labs(	size="transmissions with transmitters from community",
					pch="community type",
					fill="proportion of transmitters from the community\nthat have a recipient outside the community")
	ggsave(file=paste0(outfile.base,'_hubs_transmitters_adjusted_anonymized.pdf'), w=10, h=10)
	
	#
	#	recipient hubs - size is 1-IQR
	ggmap(zm) +
			geom_point(data=z, aes(x=longitude, y=latitude, pch=COMM_TYPE, fill=100*REC_P_OUTSIDE_M, size=REC_P_OUTSIDE_PREC), stroke=1.5, alpha=0.8) +
			scale_fill_gradientn(colours=c("#2166AC","#F7F7F7","#B2182B"), values=rescale(c(0, .2, 1)), space = "Lab") +
			#scale_fill_distiller(palette = "RdBu") +
			scale_size(breaks=c(.6,.7,.8,.9,.95), range=c(1,10))+
			scale_shape_manual(values=c('agrarian'=21, 'trading'=22, 'fisherfolk'=23)) +			
			theme(legend.position='bottom', legend.box = "vertical") +
			labs(	size="1 - interquantile range",
					pch="community type",
					fill="proportion of transmitters from the community\nthat have a recipient outside the community")
	ggsave(file=paste0(outfile.base,'_hubs_transmitters_adjusted_sizeIsUnctertainty.pdf'), w=10, h=10)
	ggmap(zm) +
			geom_point(data=z, aes(x=LONG_A, y=LAT_A, pch=COMM_TYPE, fill=100*REC_P_OUTSIDE_M, size=REC_P_OUTSIDE_PREC), stroke=1.5, alpha=0.8) +
			scale_fill_gradientn(colours=c("#2166AC","#F7F7F7","#B2182B"), values=rescale(c(0, .2, 1)), space = "Lab") +
			#scale_fill_distiller(palette = "RdBu") +
			scale_size(breaks=c(.6,.7,.8,.9,.95), range=c(1,10))+
			scale_shape_manual(values=c('agrarian'=21, 'trading'=22, 'fisherfolk'=23)) +			
			theme(legend.position='bottom', legend.box = "vertical") +
			labs(	size="1 - interquantile range",
					pch="community type",
					fill="proportion of transmitters from the community\nthat have a recipient outside the community")
	ggsave(file=paste0(outfile.base,'_hubs_transmitters_adjusted_sizeIsUnctertainty_anonymized.pdf'), w=10, h=10)
	
	#
	#	transmission hubs
	#	crude estimates		
	z	<- copy(rtr2)
	z	<- z[, list(REC_OUTSIDE_COMM=length(which(REC_COMM_NUM_A!=TR_COMM_NUM_A)), REC_IN_COMM=length(which(REC_COMM_NUM_A==TR_COMM_NUM_A)) ), by=c('TR_COMM_NUM_A','TR_COMM_TYPE')]
	z[, REC_N:=REC_OUTSIDE_COMM+REC_IN_COMM]
	setnames(z, 'TR_COMM_NUM_A', 'COMM_NUM_A')
	z	<- merge(unique(zc, by='COMM_NUM_A'), z, by='COMM_NUM_A')
	ggmap(zm) +
			geom_point(data=z, aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=COMM_TYPE, size=REC_N, fill=100*REC_OUTSIDE_COMM/REC_N), stroke=1.5, alpha=0.8) + 
			scale_fill_gradientn(colours=c("#2166AC","#F7F7F7","#B2182B"), values=rescale(c(0, .2, 1)), space = "Lab") +
			scale_size(breaks=c(5,10,20,40,80), range=c(5,20))+
			scale_shape_manual(values=c('agrarian'=21, 'trading'=22, 'fisherfolk'=23)) +
			scale_colour_brewer(palette='Dark2') +			
			theme(legend.position='bottom', legend.box = "vertical") +
			labs(	size="transmissions with transmitters from community", 
					fill="proportion of transmitters from the community\nthat have a recipient outside the community")
	ggsave(file=paste0(outfile.base,'_hubs_transmitters_crude.pdf'), w=10, h=10)
	
	
	#
	#	geography transmitter flows into agrarian/trading/fisherolk recipient communities
	#	crude	
	tmp		<- rtr2[,list(N=length(unique(PAIR_ID))), by=c('TR_COMM_TYPE','REC_COMM_TYPE')]
	tmp[, P_CELL:= N/sum(N)]
	tmp		<- merge(tmp, tmp[, list(P_REC= N/sum(N), REC_COMM_TYPE=REC_COMM_TYPE), by='TR_COMM_TYPE'], by=c('TR_COMM_TYPE','REC_COMM_TYPE'))
	tmp		<- merge(tmp, tmp[, list(P_TR= N/sum(N), TR_COMM_TYPE=TR_COMM_TYPE), by='REC_COMM_TYPE'], by=c('TR_COMM_TYPE','REC_COMM_TYPE'))
	#tmp[, LABEL:= paste0(N, ' (',round(P_CELL,d=2)*100,'%)\ntransmitters: ',round(P_TR,d=2)*100,'%\nrecipients: ',round(P_REC,d=2)*100,'%')]
	tmp[, LABEL:= paste0(N, '\n(',round(P_TR,d=2)*100,'%)')]
	ggplot(tmp, aes(x=REC_COMM_TYPE, y=P_TR, fill=TR_COMM_TYPE)) + 
			geom_bar(stat='identity', position='dodge') +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + 
			scale_y_continuous(labels=scales::percent, limits=c(0,1), expand=c(0,0), breaks=seq(0,1,0.2)) +			
			labs(x='\nlocation likely recipient',y='location likely transmitter\n',fill='transmitter from') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_sources_crude_prop.pdf'), w=6, h=5)
	ggplot(tmp, aes(x=REC_COMM_TYPE, y=N, fill=TR_COMM_TYPE)) + 
			geom_bar(stat='identity', position='dodge') +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + 
			scale_y_continuous() +			
			labs(x='\nlocation likely recipient',y='location likely transmitter\n',fill='transmitter from') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_sources_crude_count.pdf'), w=6, h=5)	
	ggplot(tmp, aes(x=TR_COMM_TYPE, y=P_REC, fill=REC_COMM_TYPE)) + 
			geom_bar(stat='identity', position='dodge') +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + 
			scale_y_continuous(labels=scales::percent, limits=c(0,1), expand=c(0,0), breaks=seq(0,1,0.2)) +			
			labs(x='\nlocation likely transmitters',y='location likely recipients\n',fill='recipient in') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_destinations_crude_prop.pdf'), w=6, h=5)
	ggplot(tmp, aes(x=TR_COMM_TYPE, y=N, fill=REC_COMM_TYPE)) + 
			geom_bar(stat='identity', position='dodge') +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + 
			scale_y_continuous() +			
			labs(x='\nlocation likely transmitter',y='location likely recipient\n',fill='recipient in') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_destinations_crude_count.pdf'), w=6, h=5)	
	
	#
	#	geography transmitter flows into agrarian/trading/fisherolk recipient communities
	#	adjusted		
	groups	<- c('agrarian','trading','fisherfolk')
	z		<- lapply(groups, function(group)
			{				
				tmp		<- subset(zc, COMM_TYPE==group)$COMM_NUM_A		
				z		<- subset(dcb, REC_COMM_NUM_A%in%tmp)[, list(PI_ITYPE_ALPHA= sum(PI_IJ_ALPHA)), by=c('TR_COMM_NUM_A','MONTE_CARLO_IT')]
				tmp		<- unique(subset(zc, select=c(COMM_NUM_A, COMM_TYPE)))
				setnames(tmp, c('COMM_NUM_A','COMM_TYPE'), c('TR_COMM_NUM_A','TR_COMM_TYPE'))
				z		<- merge(z, tmp, by='TR_COMM_NUM_A')
				z		<- z[, list(PI_TYPETYPE_ALPHA=sum(PI_ITYPE_ALPHA)), by=c('TR_COMM_TYPE','MONTE_CARLO_IT')]	
				#	aggregate and get quantiles
				mc.it	<- 1e2
				z		<- z[, {												
							tmp		<- rdirichlet(mc.it, PI_TYPETYPE_ALPHA)
							colnames(tmp)	<- TR_COMM_TYPE
							tmp		<- as.data.table(tmp)								
						}, by=c('MONTE_CARLO_IT')]
				z		<- melt(z, id.vars='MONTE_CARLO_IT', variable.name='TR_COMM_TYPE', value.name='PI_TYPETYPE')
				z		<- z[, list(P=seq(0,1,0.01), Q=unname(quantile(PI_TYPETYPE, p=seq(0,1,0.01)))), by='TR_COMM_TYPE']
				#	subset to main quantities of interest
				z		<- dcast.data.table(subset(z, P%in%c(0.03, 0.25, 0.5, 0.75, 0.97)), TR_COMM_TYPE~P, value.var='Q')
				z[, REC_COMM_TYPE:=group]
				z
			})
	z		<- do.call('rbind',z)
	setnames(z, c('0.03','0.25','0.5','0.75','0.97'), c('TR_CL','TR_IL','TR_M','TR_IU','TR_CU'))
	ggplot(z, aes(x=REC_COMM_TYPE, y=TR_M, ymin=TR_CL, lower=TR_IL, upper=TR_IU, ymax=TR_CU, fill=TR_COMM_TYPE)) + 
			#geom_boxplot(stat='identity', position=position_dodge(width=0.9)) +
			geom_bar(position='dodge', stat='identity') +
			geom_errorbar(width=0.2, position=position_dodge(width=0.9)) +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + theme(legend.position='bottom') +
			scale_y_continuous(labels=scales::percent, limits=c(0,1), expand=c(0,0), breaks=seq(0,1,0.2)) +			
			labs(x='\nlocation likely recipient',y='sources of transmissions\n',fill='transmitter from') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_sources_adjusted.pdf'), w=6, h=5)	
	#
	#	geography transmitter flows from agrarian/trading/fisherolk transmitter communities
	#	adjusted		
	groups	<- c('agrarian','trading','fisherfolk')
	z		<- lapply(groups, function(group)
			{				
				tmp		<- subset(zc, COMM_TYPE==group)$COMM_NUM_A		
				z		<- subset(dcb, TR_COMM_NUM_A%in%tmp)[, list(PI_ITYPE_ALPHA= sum(PI_IJ_ALPHA)), by=c('REC_COMM_NUM_A','MONTE_CARLO_IT')]
				tmp		<- unique(subset(zc, select=c(COMM_NUM_A, COMM_TYPE)))
				setnames(tmp, c('COMM_NUM_A','COMM_TYPE'), c('REC_COMM_NUM_A','REC_COMM_TYPE'))
				z		<- merge(z, tmp, by='REC_COMM_NUM_A')
				z		<- z[, list(PI_TYPETYPE_ALPHA=sum(PI_ITYPE_ALPHA)), by=c('REC_COMM_TYPE','MONTE_CARLO_IT')]	
				#	aggregate and get quantiles
				mc.it	<- 1e2
				z		<- z[, {												
							tmp		<- rdirichlet(mc.it, PI_TYPETYPE_ALPHA)
							colnames(tmp)	<- REC_COMM_TYPE
							tmp		<- as.data.table(tmp)								
						}, by=c('MONTE_CARLO_IT')]
				z		<- melt(z, id.vars='MONTE_CARLO_IT', variable.name='REC_COMM_TYPE', value.name='PI_TYPETYPE')
				z		<- z[, list(P=seq(0,1,0.01), Q=unname(quantile(PI_TYPETYPE, p=seq(0,1,0.01)))), by='REC_COMM_TYPE']
				#	subset to main quantities of interest
				z		<- dcast.data.table(subset(z, P%in%c(0.03, 0.25, 0.5, 0.75, 0.97)), REC_COMM_TYPE~P, value.var='Q')
				z[, TR_COMM_TYPE:=group]
				z
			})
	z		<- do.call('rbind',z)
	setnames(z, c('0.03','0.25','0.5','0.75','0.97'), c('REC_CL','REC_IL','REC_M','REC_IU','REC_CU'))
	ggplot(z, aes(x=TR_COMM_TYPE, y=REC_M, ymin=REC_CL, lower=REC_IL, upper=REC_IU, ymax=REC_CU, fill=REC_COMM_TYPE)) + 
			#geom_boxplot(stat='identity', position=position_dodge(width=0.9)) +
			geom_bar(position='dodge', stat='identity') +
			geom_errorbar(width=0.2, position=position_dodge(width=0.9)) +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + theme(legend.position='bottom') +
			scale_y_continuous(labels=scales::percent, limits=c(0,1), expand=c(0,0), breaks=seq(0,1,0.2)) +			
			labs(x='\nlocation likely transmitter',y='destination of transmissions\n',fill='recipients in') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_destinations_barplot_adjusted.pdf'), w=6, h=5)	
	
	
	
	#
	#	geography flows out > flows in for agrarian/trading/fisherolk communities
	#	adjusted		
	groups	<- c('agrarian','trading','fisherfolk')
	z		<- lapply(groups, function(group)
			{				
				tmp		<- unique(subset(zc, select=c(COMM_NUM_A, COMM_TYPE)))
				set(tmp, tmp[, which(COMM_TYPE!=group)], 'COMM_TYPE', paste0('non-',group))
				setnames(tmp, c('COMM_NUM_A','COMM_TYPE'), c('REC_COMM_NUM_A','REC_COMM_TYPE'))
				z		<- merge(dcb, tmp, by='REC_COMM_NUM_A')
				setnames(tmp, c('REC_COMM_NUM_A','REC_COMM_TYPE'), c('TR_COMM_NUM_A','TR_COMM_TYPE'))
				z		<- merge(z, tmp, by='TR_COMM_NUM_A')
				z		<- z[, list(PI_ST_ALPHA=sum(PI_IJ_ALPHA)), by=c('REC_COMM_TYPE','TR_COMM_TYPE','MONTE_CARLO_IT')]
				z[, FLOW:=paste0('from ',TR_COMM_TYPE,' to ',REC_COMM_TYPE)]
				#	draw random variables
				mc.it	<- 1e2
				z		<- z[, {												
							tmp		<- rdirichlet(mc.it, PI_ST_ALPHA)
							colnames(tmp)	<- FLOW
							tmp		<- as.data.table(tmp)								
						}, by=c('MONTE_CARLO_IT')]
				#	transform as desired
				z[, STAT:= paste0('from ',group,' to non-',group,'\n/\nfrom non-',group,' to ',group)]				
				setnames(z, c(paste0('from ',group,' to non-',group),paste0('from non-',group,' to ',group)), c('A','D'))
				#	get quantiles
				z		<- z[, list(P=seq(0,1,0.01), Q=unname(quantile(A/D, p=seq(0,1,0.01)))), by='STAT']
			})
	z		<- do.call('rbind',z)
	z		<- dcast.data.table(subset(z, P%in%c(0.03, 0.25, 0.5, 0.75, 0.97)), STAT~P, value.var='Q')
	setnames(z, c('0.03','0.25','0.5','0.75','0.97'), c('FLOW_CL','FLOW_IL','FLOW_M','FLOW_IU','FLOW_CU'))
	z[, COMM_TYPE:= gsub('^from ([a-z]+).*$', '\\1', STAT)]
	ggplot(z, aes(x=COMM_TYPE, middle=FLOW_M, min=FLOW_CL, lower=FLOW_IL, upper=FLOW_IU, max=FLOW_CU, fill=STAT)) +
			geom_hline(yintercept=1, colour='grey50', size=1.5) +
			geom_boxplot(stat='identity') +
			scale_fill_brewer(palette='Dark2') +
			theme_bw() + theme(legend.position='bottom') +
			scale_y_log10(expand=c(0,0), breaks=c(1/4,1/3,1/2,2/3,1,3/2,2,3,4), labels=c('1/4','1/3','1/2','2/3','1','3/2','2','3','4'), limits=c(1/4,5)) +			
			coord_flip() +
			labs(x='community type\n', y='\nflows out / flows in', fill='flow') 
	ggsave(file=paste0(outfile.base,'_phylogeography_aft_flowsoutin_adjusted.pdf'), w=6, h=4)
	
	
	#
	#	geography who infects whom matrix  
	#	crude
	tmp		<- rtr2[,list(N=length(unique(PAIR_ID))), by=c('TR_COMM_TYPE','REC_COMM_TYPE')]
	tmp[, P_CELL:= N/sum(N)]
	tmp		<- merge(tmp, tmp[, list(P_REC= N/sum(N), REC_COMM_TYPE=REC_COMM_TYPE), by='TR_COMM_TYPE'], by=c('TR_COMM_TYPE','REC_COMM_TYPE'))
	tmp		<- merge(tmp, tmp[, list(P_TR= N/sum(N), TR_COMM_TYPE=TR_COMM_TYPE), by='REC_COMM_TYPE'], by=c('TR_COMM_TYPE','REC_COMM_TYPE'))
	#tmp[, LABEL:= paste0(N, ' (',round(P_CELL,d=2)*100,'%)\ntransmitters: ',round(P_TR,d=2)*100,'%\nrecipients: ',round(P_REC,d=2)*100,'%')]
	tmp[, LABEL:= paste0(N, '\n(',round(P_TR,d=2)*100,'%)')]
	#tmp[, LABEL:= paste0(N, '\n(',round(P_CELL,d=2)*100,'%)')]
	ggplot(tmp, aes(x=factor(REC_COMM_TYPE),y=factor(TR_COMM_TYPE))) + 
			geom_point(aes(size=N), colour='grey80') +
			geom_text(aes(label=LABEL), nudge_x=0, nudge_y=0, size=3, colour='black') +			
			theme_bw() + 
			scale_size(range = c(5, 50)) +
			labs(x='\nlocation likely recipient',y='location likely transmitter\n') +
			guides(size='none')
	ggsave(file=paste0(outfile.base,'_commtype_3x3.pdf'), w=5, h=5)
	
	set(tmp, NULL, 'REC_COMM_TYPE', tmp[,paste0('to_',REC_COMM_TYPE)])
	set(tmp, NULL, 'TR_COMM_TYPE', tmp[,paste0('from_',TR_COMM_TYPE)])
	tmp		<- suppressWarnings(melt(tmp, id.vars=c('REC_COMM_TYPE','TR_COMM_TYPE'), measure.vars=c('N','P_CELL')))
	tmp		<- dcast.data.table(tmp, variable+TR_COMM_TYPE~REC_COMM_TYPE, value.var='value')
	write.csv(tmp, row.names=FALSE, paste0(outfile.base,'_commtype_3x3_raw.csv'))
	#
	#	geography who infects whom matrix  
	#	adjusted P
	tmp		<- unique(subset(zc, select=c(COMM_NUM_A, COMM_TYPE)))
	setnames(tmp, c('COMM_NUM_A','COMM_TYPE'), c('REC_COMM_NUM_A','REC_COMM_TYPE'))
	z		<- merge(dcb, tmp, by='REC_COMM_NUM_A')
	setnames(tmp, c('REC_COMM_NUM_A','REC_COMM_TYPE'), c('TR_COMM_NUM_A','TR_COMM_TYPE'))
	z		<- merge(z, tmp, by='TR_COMM_NUM_A')
	z		<- z[, list(TR_OBS=sum(TR_OBS), TR_MISS=sum(TR_MISS), PI_ST_ALPHA=sum(PI_IJ_ALPHA)), by=c('REC_COMM_TYPE','TR_COMM_TYPE','MONTE_CARLO_IT')]
	z[, FLOW:=paste0('from_',TR_COMM_TYPE,' to_',REC_COMM_TYPE)]
	mc.it	<- 1e2
	z		<- z[, {												
				tmp		<- rdirichlet(mc.it, PI_ST_ALPHA)
				colnames(tmp)	<- FLOW
				tmp		<- as.data.table(tmp)								
			}, by=c('MONTE_CARLO_IT')]
	z		<- melt(z, id.vars='MONTE_CARLO_IT')	
	z[, TR_COMM_TYPE:= gsub('(from_[a-z]+) (to_[a-z]+)','\\1',variable)]
	z[, REC_COMM_TYPE:= gsub('(from_[a-z]+) (to_[a-z]+)','\\2',variable)]	
	z		<- z[, list(P=seq(0,1,0.01), Q=unname(quantile(value, p=seq(0,1,0.01)))), by=c('TR_COMM_TYPE','REC_COMM_TYPE')]	
	z		<- dcast.data.table(subset(z, P%in%c(0.03, 0.25, 0.5, 0.75, 0.97)), TR_COMM_TYPE+REC_COMM_TYPE~P, value.var='Q')
	setnames(z, c('0.03','0.25','0.5','0.75','0.97'), c('PADJ_CL','PADJ_IL','PADJ_M','PADJ_IU','PADJ_CU'))		
	ans		<- melt(z, id.vars=c('TR_COMM_TYPE','REC_COMM_TYPE'), measure.vars=c('PADJ_M','PADJ_CL','PADJ_CU'))
	ans		<- dcast.data.table(ans, variable+REC_COMM_TYPE~TR_COMM_TYPE, value.var='value')
	#	adjusted N
	tmp		<- unique(subset(zc, select=c(COMM_NUM_A, COMM_TYPE)))
	setnames(tmp, c('COMM_NUM_A','COMM_TYPE'), c('REC_COMM_NUM_A','REC_COMM_TYPE'))
	z		<- merge(dcb, tmp, by='REC_COMM_NUM_A')
	setnames(tmp, c('REC_COMM_NUM_A','REC_COMM_TYPE'), c('TR_COMM_NUM_A','TR_COMM_TYPE'))
	z		<- merge(z, tmp, by='TR_COMM_NUM_A')
	z		<- z[, list(TR_ADJ=sum(TR_OBS)+sum(TR_MISS)), by=c('REC_COMM_TYPE','TR_COMM_TYPE','MONTE_CARLO_IT')]
	set(z, NULL, 'REC_COMM_TYPE', z[,paste0('to_',REC_COMM_TYPE)])
	set(z, NULL, 'TR_COMM_TYPE', z[,paste0('from_',TR_COMM_TYPE)])
	z		<- z[, list(P=seq(0,1,0.01), Q=unname(quantile(TR_ADJ, p=seq(0,1,0.01), type=1))), by=c('TR_COMM_TYPE','REC_COMM_TYPE')]	
	z		<- dcast.data.table(subset(z, P%in%c(0.03, 0.25, 0.5, 0.75, 0.97)), TR_COMM_TYPE+REC_COMM_TYPE~P, value.var='Q')
	setnames(z, c('0.03','0.25','0.5','0.75','0.97'), c('NADJ_CL','NADJ_IL','NADJ_M','NADJ_IU','NADJ_CU'))
	z		<- melt(z, id.vars=c('TR_COMM_TYPE','REC_COMM_TYPE'), measure.vars=c('NADJ_M','NADJ_CL','NADJ_CU'))
	z		<- dcast.data.table(z, variable+REC_COMM_TYPE~TR_COMM_TYPE, value.var='value')
	ans		<- rbind(z, ans)
	write.csv(ans, row.names=FALSE, paste0(outfile.base,'_commtype_3x3_adjusted.csv'))
	
	
	#
	#	geography transmitters from outside community
	z	<- copy(rtr2)
	z[, FROM_OUTSIDE:= factor(REC_COMM_NUM_A!=TR_COMM_NUM_A, levels=c(TRUE,FALSE), labels=c('outside community','same community'))]
	set(z, NULL, 'REC_COMM_TYPE', z[, factor(REC_COMM_TYPE)])
	ggplot(z, aes(x=REC_COMM_TYPE, fill=FROM_OUTSIDE)) + geom_bar() + 
			theme_bw() + 
			labs(x='\ncommunity type of recipient', fill='transmitter from')
	ggsave(file=paste0(outfile.base,'_extra_community.pdf'), w=5, h=5)
	ggplot(z, aes(x=REC_COMM_TYPE, fill=FROM_OUTSIDE)) + geom_bar(position='fill') + 
			theme_bw() + 
			scale_y_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.2)) +
			labs(x='\ncommunity type of recipient', y='transmitters\n', fill='transmitter from')
	ggsave(file=paste0(outfile.base,'_extra_community_props.pdf'), w=5, h=5)
	#
	#	odds for unequal transmission flows agrarian/fisherfolk
	z	<- z[, list(N=length(REC_RID)), by=c('REC_COMM_TYPE','FROM_OUTSIDE')]
	tmp	<- dcast.data.table(subset(z, REC_COMM_TYPE!='trading'), FROM_OUTSIDE~REC_COMM_TYPE, value.var='N')
	zz	<- as.matrix(tmp[, 2:3, with=FALSE])
	rownames(zz)	<- tmp[[1]]
	fisher.test(zz)
	#STAGE 1
	#data:  zz
	#p-value = 0.0113
	#alternative hypothesis: true odds ratio is not equal to 1
	#95 percent confidence interval:
	#1.137942 3.390227
	#sample estimates:
	#odds ratio 
	#1.963628
	
	#STAGE 2
	#p-value = 0.0002936
	#alternative hypothesis: true odds ratio is not equal to 1
	#95 percent confidence interval:
	#1.698675 7.052786
	#sample estimates:
	#odds ratio 
	#3.427573 
	
	#
	#	geography recipients in different community
	z	<- copy(rtr2)
	z[, FROM_OUTSIDE:= factor(REC_COMM_NUM_A!=TR_COMM_NUM_A, levels=c(TRUE,FALSE), labels=c('outside community','same community'))]
	set(z, NULL, 'TR_COMM_TYPE', z[, factor(TR_COMM_TYPE)])
	ggplot(z, aes(x=TR_COMM_TYPE, fill=FROM_OUTSIDE)) + geom_bar() + 
			theme_bw() + 
			labs(x='\ncommunity type of transmitters', fill='recipient')
	ggsave(file=paste0(outfile.base,'_extra_community_of_recipients.pdf'), w=5, h=5)	
	ggplot(z, aes(x=TR_COMM_TYPE, fill=FROM_OUTSIDE)) + geom_bar(position='fill') + 
			theme_bw() + 
			scale_y_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.2)) +
			labs(x='\ncommunity type of transmitters', y='recipients\n', fill='recipient')
	ggsave(file=paste0(outfile.base,'_extra_community_of_recipients_props.pdf'), w=5, h=5)	
	z	<- z[, list(N=length(TR_RID)), by=c('TR_COMM_TYPE','FROM_OUTSIDE')]
	tmp	<- dcast.data.table(subset(z, TR_COMM_TYPE!='trading'), FROM_OUTSIDE~TR_COMM_TYPE, value.var='N')
	zz	<- as.matrix(tmp[, 2:3, with=FALSE])
	rownames(zz)	<- tmp[[1]]
	fisher.test(zz)
	#STAGE 1
	#p-value = 0.05827
	#alternative hypothesis: true odds ratio is not equal to 1
	#95 percent confidence interval:
	#		0.9429887 3.1200202
	#sample estimates:
	#		odds ratio 
	#1.719531
	
	#STAGE 2
	#p-value = 0.003303
	#alternative hypothesis: true odds ratio is not equal to 1
	#95 percent confidence interval:
	# 1.346652 6.087743
	#sample estimates:
	#odds ratio 
	#  2.844896 
	
	
	#
	#	geography transmitters from outside community by gender of recipients
	z	<- copy(rtr2)
	z[, FROM_OUTSIDE:= factor(REC_COMM_NUM_A!=TR_COMM_NUM_A, levels=c(TRUE,FALSE), labels=c('outside community','same community'))]
	set(z, NULL, 'REC_COMM_TYPE', z[, factor(REC_COMM_TYPE)])
	set(z, NULL, 'REC_SEX', z[, paste0('gender recipient: ',REC_SEX)])	
	ggplot(z, aes(x=REC_COMM_TYPE, fill=FROM_OUTSIDE)) + geom_bar(position='fill') + 
			theme_bw() + 
			scale_y_continuous(labels=scales::percent, expand=c(0,0), breaks=seq(0,1,0.2)) +
			labs(x='\ncommunity type of recipient', y='transmitters\n', fill='transmitter from') +
			facet_grid(~REC_SEX)
	ggsave(file=paste0(outfile.base,'_extra_community_bygender_props.pdf'), w=7, h=5)	
	
	#
	#	community locations
	#		
	zc	<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/community_spatialLoc.csv'))
	set(zc, NULL, 'COMM_NUM_A', zc[,gsub('^107$|^16$','16m',gsub('^776$|^51$','51m',gsub('^4$|^24$','24m',gsub('^1$|^22$','22m',as.character(COMM_NUM)))))])
	zc	<- merge(zc, unique(subset(rh, select=c(COMM_NUM, COMM_TYPE))), by='COMM_NUM')	
	ggmap(zm) +
			geom_point(data=unique(zc, by='COMM_NUM'), aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=COMM_TYPE), size=8, alpha=0.8) +
			geom_text(data=unique(zc, by='COMM_NUM'), aes(x=longitude, y=latitude, label=COMM_NUM), nudge_x=0, nudge_y=0, size=3, colour='black')
	ggsave(file=paste0(outfile.base,'_hubs_comm_locations.pdf'), w=10, h=10)
	
	
}

RakaiFull.phylogeography.171122.samplinglocations<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(ggmap)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	require(gtools)	#rdirichlet
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl35_prior23_min30_withmetadata.rda"
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_prior23_min30_withmetadata.rda"
	outfile.base			<- gsub('_withmetadata.rda','',infile)
	
	zm		<- get_googlemap(center="rakai district uganda", zoom=10, maptype="hybrid")
	zc		<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv', stringsAsFactors=FALSE))
	load(infile)	
	setkey(rtp, MALE_RID, FEMALE_RID)
	rtp[, PAIRID:= seq_len(nrow(rtp))]
	rtpdm	<- subset(rtp, grepl('mf|fm',SELECT))
	nrow(rtpdm)
	#	284	with 0.035
	#	238	with 0.025
	
	#
	#	all locations
	ggmap(zm) +
			geom_point(data=unique(zc, by='COMM_NUM'), aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=COMM_TYPE), size=8, alpha=0.8) +
			geom_text(data=unique(zc, by='COMM_NUM'), aes(x=longitude, y=latitude, label=COMM_NUM), nudge_x=0, nudge_y=0, size=3, colour='black')
	ggsave(file=paste0(outfile.base,'_comm_all_locations.pdf'), w=10, h=10)
	
	#
	#	locations with number sequence samples
	ds	<- subset(rsm, MIN_PNG_OUTPUT>0, select=c(COMM_NUM, COMM_TYPE, COMM_NUM_A, ELIGIBLE_AVG, PARTICIPATED_AVG, HIV, MIN_PNG_OUTPUT))
	ds[, P_PART_EMP:= PARTICIPATED_AVG/ELIGIBLE_AVG]
	ds[, P_SEQ_EMP:= MIN_PNG_OUTPUT/HIV]
	ds[, P_SEQCOV:= P_PART_EMP*P_SEQ_EMP]
	ds[, P_SEQCOV_C:= cut(P_SEQCOV, breaks=seq(0,1,0.1), labels=c('<10%','10-19%','20-29%','30-39%','40-49%','50-59%','60-69%','70-79%','80-89%','90%-100%'))]
	tmp	<- unique(subset(zc, select=c(COMM_NUM, longitude, latitude)), by='COMM_NUM')
	ds	<- merge(ds, tmp, by='COMM_NUM')
	ggmap(zm) +
			geom_point(data=ds, aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=P_SEQCOV_C, size=MIN_PNG_OUTPUT), alpha=0.8) +
			geom_text(data=ds, aes(x=longitude, y=latitude, label=MIN_PNG_OUTPUT), nudge_x=0, nudge_y=0, size=3, colour='black') +			
			scale_colour_brewer(palette="YlOrRd") +
			scale_size(breaks=c(0,10,50,100,200,400,1000), range=c(5,15))+
			#scale_shape_manual(values=c('agrarian'=21, 'trading'=22, 'fisherfolk'=23)) +			
			theme(legend.position='bottom', legend.box = "vertical") +
			guides(size='none', colour=guide_legend(override.aes=list(size=7)), pch=guide_legend(override.aes=list(size=7))) +
			labs(	x='', y='', pch="community\ntype",
					colour="estimated\nsequence coverage\nof HIV infected population")
	ggsave(file=paste0(outfile.base,'_comm_sequencenumbers_on_map.pdf'), w=7, h=7)
	
	
	#	sequence coverage
	tmp	<- rsm[,  list(STAT=c('Median','CL','CU'), V=as.numeric(binconf(MIN_PNG_OUTPUT, round(HIV*6205/4928)))) , by=c('COMM_NUM_A','COMM_TYPE','MIN_PNG_OUTPUT')]
	tmp	<- dcast.data.table(tmp, COMM_NUM_A+COMM_TYPE+MIN_PNG_OUTPUT~STAT, value.var='V')
	ggplot(tmp, aes(x=COMM_NUM_A, y=Median, ymin=CL, ymax=CU)) +
			geom_point(aes(size=MIN_PNG_OUTPUT)) +
			#geom_errorbar(width=0.3) +
			theme_bw() +
			scale_y_continuous(labels=scales:::percent, lim=c(0.1,0.8)) +
			labs(x='\ncommunity', y='sequence coverage\nHIV infected who have a sequence\n', size='NGS output') +
			facet_grid(~COMM_TYPE, space='free', scales='free')
	ggsave(file=paste0(outfile.base,'_comm_seqcov.pdf'), w=10, h=5)
	
	set.seed(123)
	long <- rnorm(50, sd=100)
	lat <- rnorm(50, sd=50)
	d <- data.frame(long=long, lat=lat)
	d <- with(d, d[abs(long) < 150 & abs(lat) < 70,])
	n <- nrow(d)
	d$region <- factor(1:n)
	d$A <- abs(rnorm(n, sd=1))
	d$B <- abs(rnorm(n, sd=2))
	d$C <- abs(rnorm(n, sd=3))
	d$D <- abs(rnorm(n, sd=4))
	d$radius <- 6 * abs(rnorm(n))
	head(d)
	library(ggplot2)
	library(scatterpie)
	
	world <- map_data('world')
	p <- ggplot(world, aes(long, lat)) +
			geom_map(map=world, aes(map_id=region), fill=NA, color="black") +
			coord_quickmap()
	p + geom_scatterpie(aes(x=long, y=lat, group=region, r=radius),
					data=d, cols=LETTERS[1:4], color=NA, alpha=.8) +
			geom_scatterpie_legend(d$radius, x=-160, y=-55)
	
	infile	<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/popviraemia_data.rda"
	load(infile)
	df[, SLART_YES:= SLART_YES_M+SLART_YES_F]
	
}

RakaiFull.phylogeography.180618.samplinglocations<- function()
{	
	library(ggplot2)
	library(scatterpie)
	require(data.table)
	require(scales)
	require(ggplot2)
	require(ggmap)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	require(gtools)	#rdirichlet
	
	#
	#	get map
	#
	style	<- "feature:road|color:0x17202A&style=feature:water|color:0x677996&style=feature:landscape.natural|color:0xedecda&style=feature:administrative|visibility=off"
	zm		<- get_googlemap(c(lon=31.65, lat=-0.66), scale=2, size=c(550,550), zoom=10, maptype="road", style=style)
	zc		<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv', stringsAsFactors=FALSE))			
	zc		<- unique(zc, by='COMM_NUM')
	#
	#	get self reported ART, POS
	#
	infile			<- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/180322_sampling_by_gender_age.rda'		
	load(infile)
	#	get those infected before round 17 and who participated in at least one round 15-16
	df		<- subset(de, HIV_1517==1, select=c(COMM_NUM, RID, HIV_1517, SELFREPORTART_AT_FIRST_VISIT, MIN_PNG_OUTPUT))
	df		<- df[, list(MIN_PNG_OUTPUT=max(MIN_PNG_OUTPUT)), by=c('COMM_NUM','RID','HIV_1517','SELFREPORTART_AT_FIRST_VISIT')]	
	df		<- df[, list(	POS=sum(HIV_1517),
							SLART_YES=sum(SELFREPORTART_AT_FIRST_VISIT),
							DEEP_SEQ_1516=sum(MIN_PNG_OUTPUT)
					), by='COMM_NUM']
	df[, SLART_NO_NOSEQ:= POS-DEEP_SEQ_1516-SLART_YES]
	df		<- merge(df, zc, by='COMM_NUM')		
							
	df[, POS2:= POS/1000]
	df[, POS3:= POS^(1/3)/300]
	ggmap(zm) +
		geom_scatterpie(aes(x=longitude, y=latitude, group=COMM_NUM, r=POS3),
						data=df, cols=c('SLART_YES','SLART_NO_NOSEQ','DEEP_SEQ_1516')) +		
		scale_fill_manual(values=c('SLART_NO_NOSEQ'='turquoise2','DEEP_SEQ_1516'='firebrick2','SLART_YES'='grey50')) +
		geom_scatterpie_legend(df$POS3, x=31.4, y=-0.4, n=5, labeller=function(radius) round((radius*400)^3) )
	ggsave(file=paste0(outfile.base,'_comm_seqcov_map.pdf'), w=7, h=5)
}

RakaiCirc.epi.get.info.170208<- function()
{
	hivc.db.Date2numeric<- function( x )
	{
		if(!class(x)%in%c('Date','character'))	return( x )
		x	<- as.POSIXlt(x)
		tmp	<- x$year + 1900
		x	<- tmp + round( x$yday / ifelse((tmp%%4==0 & tmp%%100!=0) | tmp%%400==0,366,365), d=3 )
		x	
	}
	#
	infile				<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/alldat_r15or17_witharv.rda"
	load(infile)
	ra		<- as.data.table(alldat)
	ra		<- subset(ra, select=c(RCCS_studyid,REGION,COMM_NUM,HH_NUM,SEX,AGEYRS,visdate,visit,lastNegDate,hiv,firstPosDate,eversex, evermarr, currmarr, polymar, sexpever, sexp1yr, sexp1out, sexgift, sexyear, religion, educate, educyrs, edcat, occ, occ2, arvmed))
	#	a bit of clean up 	
	setnames(ra, colnames(ra), gsub('\\.','_',toupper(colnames(ra))))	
	set(ra, NULL, 'VISDATE', ra[, as.Date(VISDATE, format='%d/%m/%Y')])	
	for(x in colnames(ra))
		if(class(ra[[x]])=='Date')
			set(ra, NULL, x, hivc.db.Date2numeric(ra[[x]]))
	#	
	wdir				<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/circumcision"	
	infile				<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/RakaiPangeaMetaData_v2.rda"
	load(infile)
	#	a bit of clean up 
	rd		<- as.data.table(rccsData)
	setnames(rd, colnames(rd), gsub('\\.','_',toupper(colnames(rd))))	
	for(x in colnames(rd))
		if(class(rd[[x]])=='Date')
			set(rd, NULL, x, hivc.db.Date2numeric(rd[[x]]))
	rh		<- as.data.table(rccsHistory)
	setnames(rh, colnames(rh), gsub('\\.','_',toupper(colnames(rh))))
	rn		<- as.data.table(neuroData)
	setnames(rn, colnames(rn), gsub('\\.','_',toupper(colnames(rn))))
	for(x in colnames(rn))
		if(class(rn[[x]])=='Date')
			set(rn, NULL, x, hivc.db.Date2numeric(rn[[x]]))
	#rd[, table(VISIT)]
	#	make shorter
	setnames(ra, 'RCCS_STUDYID', 'RID')
	setnames(rd, 'RCCS_STUDYID', 'RID')
	setnames(rd, 'PANGEA_ID', 'PID')
	setnames(rd, 'STUDYID', 'SID')
	setnames(rh, 'RCCS_STUDYID', 'RID')	
	setnames(rn, 'STUDYID', 'RID')
	setnames(rn, 'PANGEA_ID', 'PID')
	#	get neuro into format of rd
	setnames(rn, c('SAMPLEDATE','GENDER','CD4','VIRALLOAD'), c('DATE','SEX','RECENTCD4','RECENTVL'))
	set(rn, NULL, c('RECENTVLDATE','RECENTCD4DATE'), rn[, DATE])
	set(rn, NULL, c('TIMESINCEVL','TIMESINCECD4'), 0)
	#	data checks
	setkey(rh, VISIT, RID)
	stopifnot(nrow(rh)==nrow(unique(rh)))
	#	define circumcision	
	set(rh, rh[, which(!CIRCUM%in%c(1,2))], 'CIRCUM', NA_integer_)
	set(rh, NULL, 'CIRCUM', rh[, factor(CIRCUM, levels=c(1,2), labels=c('Y','N'))])
	#	define sexual relationships
	set(rh, rh[, which(is.na(RLTN1))],'RLTN1',99L)
	set(rh, rh[, which(is.na(RLTN2))],'RLTN2',99L)
	set(rh, rh[, which(is.na(RLTN3))],'RLTN3',99L)
	set(rh, rh[, which(is.na(RLTN4))],'RLTN4',99L)
	setnames(rh, c('RLTN1','RLTN2','RLTN3','RLTN4'), c('RLTN1_','RLTN2_','RLTN3_','RLTN4_'))
	warning('undocumented RLTN codes 15 16 88 -> set to Unknown')
	tmp		<- as.data.table(matrix(data=c(	'1','Current wife (at the time)',
							'2','Current consensual partner (at the time)',
							'3','Former wife/consensual partner',
							'4','Girlfriend',
							'5','Occasional or casual friend',
							'6','Visitor (incl. wedding/funeral)',
							'7','Stranger',
							'8','Workmate',
							'9','Boss/work supervisor',
							'10','Employee',
							'11','Fellow student',
							'12','Sugar mummy',
							'13','Relative other than spouse',
							'14','Other non relative',
							'15','Unknown',
							'16','Unknown',
							'88','Unknown',
							'0','Unknown',
							'97','Unknown',
							'98','Unknown',
							'99','Unknown'
					), ncol=2, byrow=TRUE))
	setnames(tmp, colnames(tmp), c('RLTN1_','RLTN1'))
	set(tmp, NULL,'RLTN1_',tmp[, as.integer(RLTN1_)])
	rh		<- merge(rh,tmp,by='RLTN1_')
	setnames(tmp, c('RLTN1_','RLTN1'), c('RLTN2_','RLTN2'))	
	rh		<- merge(rh,tmp,by='RLTN2_')
	setnames(tmp, c('RLTN2_','RLTN2'), c('RLTN3_','RLTN3'))	
	rh		<- merge(rh,tmp,by='RLTN3_')
	setnames(tmp, c('RLTN3_','RLTN3'), c('RLTN4_','RLTN4'))	
	rh		<- merge(rh,tmp,by='RLTN4_')
	set(rh, NULL, c('RLTN1_','RLTN2_','RLTN3_','RLTN4_'), NULL)	
	tmp		<- rh[, which(RLTN1=='Unknown' & RLTN2!='Unknown')]
	set(rh, tmp, 'RLTN1', rh[tmp, RLTN2])
	set(rh, tmp, 'RLTN2', 'Unknown')
	tmp		<- rh[, which(RLTN2=='Unknown' & RLTN3!='Unknown')]
	set(rh, tmp, 'RLTN2', rh[tmp, RLTN3])
	set(rh, tmp, 'RLTN3', 'Unknown')
	tmp		<- rh[, which(RLTN3=='Unknown' & RLTN4!='Unknown')]
	set(rh, tmp, 'RLTN3', rh[tmp, RLTN4])
	set(rh, tmp, 'RLTN4', 'Unknown')
	#	define number named sexual relations last year
	tmp		<- melt(rh, id.vars=c('RID','VISIT'), measure.vars=c('RLTN1','RLTN2','RLTN3','RLTN4'))
	tmp		<- tmp[, list(RLTN_NAMED= length(which(value!='Unknown'))), by=c('RID','VISIT')]
	rh		<- merge(rh, tmp, by=c('RID','VISIT'))
	#	define occupation
	#	set back OCCUP codes according to patient sheet
	setnames(rh, c('OCCUP1','OCCUP2','OCCUP3','OCCUP4'), c('OCCUP11','OCCUP12','OCCUP13','OCCUP14'))
	setnames(rh, c('OCC','OCC2'), c('OCCUP1','OCCUP2'))	
	set(rh, rh[, which(is.na(OCCUP1))],'OCCUP1',99L)
	set(rh, rh[, which(is.na(OCCUP2))],'OCCUP2',99L)
	set(rh, NULL, 'OCCUP1', rh[, as.integer(OCCUP1)])	
	setnames(rh, c('OCCUP1','OCCUP2'), c('OCCUP1_','OCCUP2_'))
	#	handle ra
	setnames(ra, c('OCC','OCC2'), c('OCCUP1','OCCUP2'))	
	set(ra, ra[, which(is.na(OCCUP1))],'OCCUP1',99L)
	set(ra, ra[, which(is.na(OCCUP2))],'OCCUP2',99L)
	set(ra, NULL, 'OCCUP1', ra[, as.integer(OCCUP1)])	
	setnames(ra, c('OCCUP1','OCCUP2'), c('OCCUP1_','OCCUP2_'))	
	tmp		<- as.data.table(matrix(data=c(	'1','Agriculture for home use/barter',
							'2','Agriculture for selling',
							'3','Housework in your own home',
							'4','Housekeeper',
							'5','Home brewing',
							'6','Government/clerical/teaching',
							'7','Fishing',
							'8','Student',
							'9','Military/police',
							'10','Shopkeeper',
							'11','Trading/vending',
							'12','Bar worker or owner',
							'13','Trucker',
							'14','Unemployed',
							'15','Other',
							'88','No additional occupation',
							'16','Medical worker',
							'17','Casual laborer',
							'18','Waitress/Waiter/restaurant owner',
							'19','Hair dresser/Salon owner',
							'20','Construction (brick maker, builder, porter, painter, roofing)',
							'21','Mechanic (automobiles, bicycles, electronics)',
							'22','Boda Boda',
							'23','Client/Sex worker',
							'0','Unknown',
							'98','Unknown',
							'99','Unknown'), ncol=2, byrow=TRUE))
	setnames(tmp, colnames(tmp), c('OCCUP1_','OCCUP1'))
	set(tmp, NULL,'OCCUP1_',tmp[, as.integer(OCCUP1_)])
	rh		<- merge(rh,tmp,by='OCCUP1_')
	setnames(tmp, c('OCCUP1_','OCCUP1'), c('OCCUP2_','OCCUP2') )
	rh		<- merge(rh,tmp,by='OCCUP2_')	
	set(rh, NULL, c('OCCUP1_','OCCUP2_'), NULL)
	setnames(tmp, c('OCCUP2_','OCCUP2'), c('OCCUP1_','OCCUP1') )
	#	handle ra
	ra		<- merge(ra,tmp,by='OCCUP1_')
	setnames(tmp, c('OCCUP1_','OCCUP1'), c('OCCUP2_','OCCUP2') )
	ra		<- merge(ra,tmp,by='OCCUP2_')	
	set(ra, NULL, c('OCCUP1_','OCCUP2_'), NULL)
	tmp		<- rh[, which(OCCUP1=='Unknown' & OCCUP2!='Unknown')]
	set(rh, tmp, 'OCCUP1', rh[tmp, OCCUP2])
	set(rh, tmp, 'OCCUP2', 'Unknown')
	tmp		<- ra[, which(OCCUP1=='Unknown' & OCCUP2!='Unknown')]
	set(ra, tmp, 'OCCUP1', ra[tmp, OCCUP2])
	set(ra, tmp, 'OCCUP2', 'Unknown')	
	#	condense OCCUP1 and OCCUP2
	for(x in c('OCCUP1','OCCUP2'))
	{
		set(rh, which(rh[[x]]%in%c('Boda Boda','Trucker')), x, 'Boda/Trucking')
		set(rh, which(rh[[x]]%in%c('Government/clerical/teaching','Military/police','Medical worker')), x, 'Government/clerical/related')
		set(rh, which(rh[[x]]%in%c('Trading/vending','Shopkeeper','Hair dresser/Salon owner')), x, 'Trading/Shopkeeper/Hair')
		set(rh, which(rh[[x]]%in%c('Agriculture for home use/barter','Agriculture for selling','Housekeeper','Housework in your own home','Home brewing')), x, 'Agro/House')
		set(rh, which(rh[[x]]%in%c('Waitress/Waiter/restaurant owner','Bar worker or owner')), x, 'Bar/waitress')
		set(rh, which(rh[[x]]%in%c('Casual laborer','Unemployed')), x, 'Casual laborer/unemployed')
		set(rh, which(rh[[x]]%in%c('Construction (brick maker, builder, porter, painter, roofing)','Mechanic (automobiles, bicycles, electronics)')), x, 'Construction/Mechanic')
		
		set(ra, which(ra[[x]]%in%c('Boda Boda','Trucker')), x, 'Boda/Trucking')
		set(ra, which(ra[[x]]%in%c('Government/clerical/teaching','Military/police','Medical worker')), x, 'Government/clerical/related')
		set(ra, which(ra[[x]]%in%c('Trading/vending','Shopkeeper','Hair dresser/Salon owner')), x, 'Trading/Shopkeeper/Hair')
		set(ra, which(ra[[x]]%in%c('Agriculture for home use/barter','Agriculture for selling','Housekeeper','Housework in your own home','Home brewing')), x, 'Agro/House')
		set(ra, which(ra[[x]]%in%c('Waitress/Waiter/restaurant owner','Bar worker or owner')), x, 'Bar/waitress')
		set(ra, which(ra[[x]]%in%c('Casual laborer','Unemployed')), x, 'Casual laborer/unemployed')
		set(ra, which(ra[[x]]%in%c('Construction (brick maker, builder, porter, painter, roofing)','Mechanic (automobiles, bicycles, electronics)')), x, 'Construction/Mechanic')
	}
	set(rh, rh[, which(OCCUP1=='No additional occupation' & OCCUP2=='Unknown')], 'OCCUP1', 'Unknown')
	#	refine OCCUP1 OCCUP2 when OCAT==Student
	set(rh, rh[, which(OCAT%in%c('Student'))], 'OCCUP1', 'Student')
	set(rh, rh[, which(OCAT%in%c('Student'))], 'OCCUP2', 'Student')
	#	define own OCCUP at time of RCCS visit
	rh[, OCCUP_OLLI:= OCCUP1]
	for(x in c('Student','Boda/Trucking','Bar/waitress','Client/Sex worker'))
		set(rh, rh[, which(OCCUP1==x | OCCUP2==x )],'OCCUP_OLLI', x)
	#	handle ra
	set(ra, ra[, which(OCCUP1=='No additional occupation' & OCCUP2=='Unknown')], 'OCCUP1', 'Unknown')
	ra[, OCCUP_OLLI:= OCCUP1]
	for(x in c('Student','Boda/Trucking','Bar/waitress','Client/Sex worker'))
		set(ra, ra[, which(OCCUP1==x | OCCUP2==x )],'OCCUP_OLLI', x)
	#	OK this is getting complicated: Kate is using all OCC codes to override OCCUP1 
	#	as deemed sensible
	#	just OCAT for simplicity
	#subset(rh, OCAT=='Bar/waitress' & OCCUP1!='Bar/waitress')
	#subset(rh, RID=='G030852')
	#	define SEXWORK
	set(rh, NULL, 'SEXWORK', rh[, as.character(factor(SEXWORK,levels=c(0,1),labels=c('N','Y')))])
	#	define SEXBAR (either work in bar or have sexual contact with barworker)
	set(rh, NULL, 'SEXBAR', rh[, as.character(factor(SEXBAR,levels=c(0,1),labels=c('N','Y')))])	
	#	extend MARSTAT for rh
	tmp		<- rh[, which((is.na(MARSTAT)|MARSTAT=='Never Married') & (RLTN1=='Current wife (at the time)'|RLTN2=='Current wife (at the time)'|RLTN3=='Current wife (at the time)'|RLTN4=='Current wife (at the time)'))]
	set(rh, tmp, 'EVERMARR', 1L)
	set(rh, tmp, 'CURRMARR', 1L)
	set(rh, tmp, 'MARSTAT', 'Monogamous')
	tmp		<- rh[, which(is.na(MARSTAT) & (RLTN1=='Former wife/consensual partner'|RLTN2=='Former wife/consensual partner'|RLTN3=='Former wife/consensual partner'|RLTN4=='Former wife/consensual partner'))]
	set(rh, tmp, 'EVERMARR', 1L)
	set(rh, tmp, 'PREVMAR', 1L)
	set(rh, tmp, 'MARSTAT', 'Previously Married')
	tmp		<- rh[, which(is.na(MARSTAT) & (!RLTN1%in%c('Unknown','Current consensual partner (at the time)')))]
	set(rh, tmp, 'EVERMARR', 0L)
	set(rh, tmp, 'PREVMAR', 0L)
	set(rh, tmp, 'CURRMARR', 0L)
	set(rh, tmp, 'MARSTAT', 'Never Married')
	set(rh, rh[, which(is.na(MARSTAT))], 'MARSTAT', 'Unknown')
	#	define MARSTAT for ra
	ra[, MARSTAT:='Unknown']
	set(ra, ra[, which(EVERMARR==2)], 'MARSTAT', 'Never Married')
	set(ra, ra[, which(EVERMARR==1 & CURRMARR==2)], 'MARSTAT', 'Previously Married')
	set(ra, ra[, which(EVERMARR==1 & CURRMARR==1)], 'MARSTAT', 'Monogamous')
	set(ra, ra[, which(POLYMAR>1 & !POLYMAR%in%c(97,98))], 'MARSTAT', 'Polygamous')
	#	define SEXYEAR
	set(rh, rh[, which(is.na(SEXYEAR))],'SEXYEAR', 99)
	set(rh, NULL, 'SEXYEAR', rh[, gsub('_[0-9]$','',as.character(factor(SEXYEAR, levels=c(0,1,2,8,99), labels=c('Unknown_1','Y','N','Unknown_2','Unknown_3'))))])
	stopifnot( !nrow(subset(rh, is.na(SEXYEAR))) )
	set(ra, ra[, which(is.na(SEXYEAR))],'SEXYEAR', 99)
	set(ra, NULL, 'SEXYEAR', ra[, gsub('_[0-9]$','',as.character(factor(SEXYEAR, levels=c(0,1,2,8,9,99), labels=c('Unknown_1','Y','N','Unknown_2','Unknown_3','Unknown_4'))))])
	stopifnot( !nrow(subset(ra, is.na(SEXYEAR))) )		
	#	define SEXP1YR 
	setnames(rh, c('SEXP1YR'), c('SEXP1YR_'))	
	rh[, SEXP1YR:= as.character(SEXP1YR_)]
	set(rh, rh[, which(SEXP1YR_==92)],'SEXP1YR','<3')
	set(rh, rh[, which(SEXP1YR_==93)],'SEXP1YR','3+')
	set(rh, rh[, which(SEXP1YR_%in%c(97,98,99) | is.na(SEXP1YR_))],'SEXP1YR','Unknown')
	set(rh, NULL, 'SEXP1YR_', NULL)
	setnames(ra, c('SEXP1YR'), c('SEXP1YR_'))	
	ra[, SEXP1YR:= as.character(SEXP1YR_)]
	set(ra, ra[, which(SEXP1YR_==92)],'SEXP1YR','<3')
	set(ra, ra[, which(SEXP1YR_==93)],'SEXP1YR','3+')
	set(ra, ra[, which(SEXP1YR_%in%c(97,98,99) | is.na(SEXP1YR_))],'SEXP1YR','Unknown')
	set(ra, NULL, 'SEXP1YR_', NULL)	
	#	revisit SEXP1YR based on relationships
	set(rh, rh[, which(SEXP1YR%in%c('Unknown') & RLTN_NAMED==1)], 'SEXP1YR', '1')
	set(rh, rh[, which(SEXP1YR%in%c('Unknown') & RLTN_NAMED==2)], 'SEXP1YR', '2')
	set(rh, rh[, which(SEXP1YR%in%c('Unknown') & RLTN_NAMED==3)], 'SEXP1YR', '3')
	set(rh, rh[, which(SEXP1YR%in%c('Unknown') & RLTN_NAMED==4)], 'SEXP1YR', '3+')
	tmp		<- rh[, which(SEXP1YR%in%c('0') & RLTN_NAMED>0)]
	if( nrow(rh[tmp,]) )
		warning("found SEXP1YR%in%c('0') & RLTN_NAMED>0  --> set to had sex last year, n=", length(tmp))
	set(rh, rh[, which(SEXP1YR%in%c('0') & RLTN_NAMED==1)], 'SEXP1YR', '1')
	set(rh, rh[, which(SEXP1YR%in%c('0') & RLTN_NAMED==2)], 'SEXP1YR', '2')
	set(rh, rh[, which(SEXP1YR%in%c('0') & RLTN_NAMED==3)], 'SEXP1YR', '3')
	set(rh, rh[, which(SEXP1YR%in%c('0') & RLTN_NAMED==4)], 'SEXP1YR', '3+')
	#	revisit SEXYEAR based on SEXP1YR
	set(rh, rh[, which(SEXYEAR%in%c('Unknown') & !SEXP1YR%in%c('0','Unknown'))], 'SEXYEAR', 'Y')
	tmp		<- rh[, which(SEXYEAR%in%c('N') & !SEXP1YR%in%c('0','Unknown'))]
	if( nrow(rh[tmp,]) )
		warning("found SEXYEAR%in%c('N') & !SEXP1YR%in%c('0','Unknown') --> set to had sex last year, n=", length(tmp))	
	set(rh, tmp, 'SEXYEAR', 'Y')		
	set(ra, ra[, which(SEXYEAR%in%c('Unknown') & !SEXP1YR%in%c('0','Unknown'))], 'SEXYEAR', 'Y')
	tmp		<- ra[, which(SEXYEAR%in%c('N') & !SEXP1YR%in%c('0','Unknown'))]
	if( nrow(ra[tmp,]) )
		warning("found SEXYEAR%in%c('N') & !SEXP1YR%in%c('0','Unknown') --> set to had sex last year, n=", length(tmp))	
	set(ra, tmp, 'SEXYEAR', 'Y')		
	#	define SEXP1OUT 
	setnames(rh, c('SEXP1OUT'), c('SEXP1OUT_'))	
	rh[, SEXP1OUT:= as.character(SEXP1OUT_)]
	set(rh, rh[, which(SEXP1OUT_==92)],'SEXP1OUT','<3')
	set(rh, rh[, which(SEXP1OUT_==93)],'SEXP1OUT','3+')
	set(rh, rh[, which(SEXP1OUT_%in%c(97,98,99) | is.na(SEXP1OUT_))],'SEXP1OUT','Unknown')
	set(rh, NULL, 'SEXP1OUT_', NULL)	
	setnames(ra, c('SEXP1OUT'), c('SEXP1OUT_'))	
	ra[, SEXP1OUT:= as.character(SEXP1OUT_)]
	set(ra, ra[, which(SEXP1OUT_==92)],'SEXP1OUT','<3')
	set(ra, ra[, which(SEXP1OUT_==93)],'SEXP1OUT','3+')
	set(ra, ra[, which(SEXP1OUT_%in%c(97,98,99) | is.na(SEXP1OUT_))],'SEXP1OUT','Unknown')
	set(ra, NULL, 'SEXP1OUT_', NULL)	
	#	revisit SEXYEAR based on SEXP1OUT
	set(rh, rh[, which(SEXYEAR%in%c('Unknown') & !SEXP1OUT%in%c('0','Unknown'))], 'SEXYEAR', 'Y')
	tmp		<- rh[, which(SEXYEAR%in%c('N') & !SEXP1OUT%in%c('0','Unknown'))]
	if( nrow(rh[tmp,]) )
		warning("found SEXYEAR%in%c('N') & !SEXP1OUT%in%c('0','Unknown') --> set to had sex last year, n=", length(tmp))
	set(rh, tmp, 'SEXYEAR', 'Y')
	set(ra, ra[, which(SEXYEAR%in%c('Unknown') & !SEXP1OUT%in%c('0','Unknown'))], 'SEXYEAR', 'Y')
	tmp		<- ra[, which(SEXYEAR%in%c('N') & !SEXP1OUT%in%c('0','Unknown'))]
	if( nrow(ra[tmp,]) )
		warning("found SEXYEAR%in%c('N') & !SEXP1OUT%in%c('0','Unknown') --> set to had sex last year, n=", length(tmp))
	set(ra, tmp, 'SEXYEAR', 'Y')	
	#	revisit SEXP1YR based on SEXP1OUT
	set(rh, rh[, which(SEXP1OUT=='3+' & SEXP1YR%in%c('0','1','2','<3','Unknown'))], 'SEXP1YR', '3+')
	set(rh, rh[, which(SEXP1OUT=='<3' & SEXP1YR%in%c('0','Unknown'))], 'SEXP1YR', '<3')
	rh[, DUMMY:= seq_len(nrow(rh))]
	warning("set(rh, rh[, which(RID=='G013746' & VISIT==14)], 'SEXP1OUT', 1) --> think this is typo")
	set(rh, rh[, which(RID=='G013746' & VISIT==14)], 'SEXP1OUT', '1')	
	tmp		<- rh[, which(	!SEXP1OUT%in%c('3+','<3','Unknown') & !SEXP1YR%in%c('3+','<3','Unknown'))]  
	tmp		<- subset(rh[tmp, ], as.numeric(SEXP1OUT)>as.numeric(SEXP1YR))[, DUMMY]
	warning("as.numeric(SEXP1OUT)>as.numeric(SEXP1YR), set to SEXP1OUT, n=", length(tmp))
	set(rh, tmp, 'SEXP1YR', rh[tmp, SEXP1OUT])	
	set(ra, ra[, which(SEXP1OUT=='3+' & SEXP1YR%in%c('0','1','2','<3','Unknown'))], 'SEXP1YR', '3+')
	set(ra, ra[, which(SEXP1OUT=='<3' & SEXP1YR%in%c('0','Unknown'))], 'SEXP1YR', '<3')
	ra[, DUMMY:= seq_len(nrow(ra))]
	tmp		<- ra[, which(	!SEXP1OUT%in%c('3+','<3','Unknown') & !SEXP1YR%in%c('3+','<3','Unknown'))]  
	tmp		<- subset(ra[tmp, ], as.numeric(SEXP1OUT)>as.numeric(SEXP1YR))[, DUMMY]
	warning("as.numeric(SEXP1OUT)>as.numeric(SEXP1YR), set to SEXP1OUT, n=", length(tmp))
	set(ra, tmp, 'SEXP1YR', ra[tmp, SEXP1OUT])	
	#	revisit SEXP1YR based on SEXYEAR
	set(rh, rh[, which(SEXYEAR=='Unknown' & SEXP1YR=='0')], 'SEXP1YR', 'Unknown')
	set(rh, rh[, which(SEXYEAR=='Y' & SEXP1YR=='0')], 'SEXP1YR', 'Unknown')
	set(ra, ra[, which(SEXYEAR=='Unknown' & SEXP1YR=='0')], 'SEXP1YR', 'Unknown')
	set(ra, ra[, which(SEXYEAR=='Y' & SEXP1YR=='0')], 'SEXP1YR', 'Unknown')	
	#	define SEXPEVER
	setnames(rh, c('SEXPEVER'), c('SEXPEVER_'))	
	rh[, SEXPEVER:= as.character(SEXPEVER_)]
	set(rh, rh[, which(SEXPEVER_==92)],'SEXPEVER','<3')
	set(rh, rh[, which(SEXPEVER_==93)],'SEXPEVER','3+')
	set(rh, rh[, which(SEXPEVER_%in%c(97,98,99) | is.na(SEXPEVER_))],'SEXPEVER','Unknown')
	set(rh, NULL, 'SEXPEVER_', NULL)
	stopifnot( !nrow(subset(rh, is.na(SEXPEVER))) )	
	setnames(ra, c('SEXPEVER'), c('SEXPEVER_'))	
	ra[, SEXPEVER:= as.character(SEXPEVER_)]
	set(ra, ra[, which(SEXPEVER_==92)],'SEXPEVER','<3')
	set(ra, ra[, which(SEXPEVER_==93)],'SEXPEVER','3+')
	set(ra, ra[, which(SEXPEVER_%in%c(97,98,99) | is.na(SEXPEVER_))],'SEXPEVER','Unknown')
	set(ra, NULL, 'SEXPEVER_', NULL)
	stopifnot( !nrow(subset(ra, is.na(SEXPEVER))) )	
	#	revisit SEXPEVER based on SEXP1YR
	set(rh, rh[, which(SEXP1YR=='3+' & SEXPEVER%in%c('0','1','2','<3','Unknown'))], 'SEXPEVER', '3+')
	set(rh, rh[, which(SEXP1YR=='<3' & SEXPEVER%in%c('0','Unknown'))], 'SEXPEVER', '<3')	
	tmp		<- rh[, which(	!SEXP1YR%in%c('3+','<3','Unknown') & !SEXPEVER%in%c('3+','<3','Unknown'))]  
	tmp		<- subset(rh[tmp, ], as.numeric(SEXP1YR)>as.numeric(SEXPEVER))[, DUMMY]
	warning("as.numeric(SEXP1YR)>as.numeric(SEXPEVER), set to SEXP1YR, n=", length(tmp))
	set(rh, tmp, 'SEXPEVER', rh[tmp, SEXP1YR])	
	set(ra, ra[, which(SEXP1YR=='3+' & SEXPEVER%in%c('0','1','2','<3','Unknown'))], 'SEXPEVER', '3+')
	set(ra, ra[, which(SEXP1YR=='<3' & SEXPEVER%in%c('0','Unknown'))], 'SEXPEVER', '<3')	
	tmp		<- ra[, which(	!SEXP1YR%in%c('3+','<3','Unknown') & !SEXPEVER%in%c('3+','<3','Unknown'))]  
	tmp		<- subset(ra[tmp, ], as.numeric(SEXP1YR)>as.numeric(SEXPEVER))[, DUMMY]
	warning("as.numeric(SEXP1YR)>as.numeric(SEXPEVER), set to SEXP1YR, n=", length(tmp))
	set(ra, tmp, 'SEXPEVER', ra[tmp, SEXP1YR])	
	#	define EVERSEX 
	set(rh, rh[, which(is.na(EVERSEX))],'EVERSEX', 99)
	set(rh, NULL, 'EVERSEX', rh[, gsub('_[0-9]$','',as.character(factor(EVERSEX, levels=c(0,1,2,3,8,99), labels=c('Unknown_1','Y','N','Unknown_2','Unknown_3','Unknown_4'))))])
	stopifnot( !nrow(subset(rh, is.na(EVERSEX))) )	
	set(rh, rh[, which(EVERSEX%in%c('Unknown') & SEXYEAR=='Y')], 'EVERSEX', 'Y')
	set(rh, rh[, which(EVERSEX%in%c('Unknown') & CURRMARR==1)], 'EVERSEX', 'Y')
	set(rh, rh[, which(EVERSEX%in%c('Unknown') & EVERMARR==1)], 'EVERSEX', 'Y')
	set(rh, rh[, which(EVERSEX%in%c('Unknown') & RLTN_NAMED>0)], 'EVERSEX', 'Y')	
	set(ra, ra[, which(is.na(EVERSEX))],'EVERSEX', 99)
	set(ra, NULL, 'EVERSEX', ra[, gsub('_[0-9]$','',as.character(factor(EVERSEX, levels=c(0,1,2,3,8,99), labels=c('Unknown_1','Y','N','Unknown_2','Unknown_3','Unknown_4'))))])
	stopifnot( !nrow(subset(ra, is.na(EVERSEX))) )	
	set(ra, ra[, which(EVERSEX%in%c('Unknown') & SEXYEAR=='Y')], 'EVERSEX', 'Y')
	set(ra, ra[, which(EVERSEX%in%c('Unknown') & CURRMARR==1)], 'EVERSEX', 'Y')
	set(ra, ra[, which(EVERSEX%in%c('Unknown') & EVERMARR==1)], 'EVERSEX', 'Y')		
	#	revisit EVERSEX based on SEXPEVER
	set(rh, rh[, which(EVERSEX%in%c('Unknown') & !SEXPEVER%in%c('0','Unknown'))], 'EVERSEX', 'Y')
	tmp		<- rh[, which(EVERSEX%in%c('N') & !SEXPEVER%in%c('0','Unknown'))]
	if( nrow(rh[tmp,]) )
		warning("found EVERSEX%in%c('N') & !SEXPEVER%in%c('0','Unknown') --> set to had sex ever, n=", length(tmp))
	set(rh, tmp, 'EVERSEX', 'Y')
	set(ra, ra[, which(EVERSEX%in%c('Unknown') & !SEXPEVER%in%c('0','Unknown'))], 'EVERSEX', 'Y')
	tmp		<- ra[, which(EVERSEX%in%c('N') & !SEXPEVER%in%c('0','Unknown'))]
	if( nrow(ra[tmp,]) )
		warning("found EVERSEX%in%c('N') & !SEXPEVER%in%c('0','Unknown') --> set to had sex ever, n=", length(tmp))
	set(ra, tmp, 'EVERSEX', 'Y')	
	#	revisit SEXPEVER based on EVERSEX
	set(rh, rh[, which(EVERSEX=='Unknown' & SEXPEVER=='0')], 'SEXP1YR', 'Unknown')
	set(rh, rh[, which(EVERSEX=='Y' & SEXPEVER=='0')], 'SEXP1YR', 'Unknown')
	set(ra, ra[, which(EVERSEX=='Unknown' & SEXPEVER=='0')], 'SEXP1YR', 'Unknown')
	set(ra, ra[, which(EVERSEX=='Y' & SEXPEVER=='0')], 'SEXP1YR', 'Unknown')		
	#	check SEXPEVER	
	tmp		<- subset(rh, SEXPEVER=='0' & SEXACTIVE==1)
	if( nrow(tmp) )
		warning("found SEXPEVER=='0' & SEXACTIVE==1 --> report only, n=", nrow(tmp))	
	#	add extra-marital partner to MARSTAT
	tmp	<- rh[, which(MULTIPART>0 & MARSTAT!='Previously Married')]
	set(rh, tmp, 'MARSTAT', rh[tmp, paste0(MARSTAT,' + casual partner')])
	tmp	<- rh[, which(MARSTAT%in%c('Monogamous') & !SEXP1YR%in%c('1','Unknown'))]
	set(rh, tmp, 'MARSTAT', rh[tmp, paste0(MARSTAT,' + casual partner')])	
	tmp	<- rh[, which(MARSTAT%in%c('Polygamous') & !SEXP1YR%in%c('1','Unknown') & POLYMAR<as.numeric(gsub('Unknown','0',gsub('+','',SEXP1YR,fixed=1))) ) ]
	set(rh, tmp, 'MARSTAT', rh[tmp, paste0(MARSTAT,' + casual partner')])
	tmp	<- ra[, which(MARSTAT%in%c('Monogamous') & !SEXP1YR%in%c('1','Unknown'))]
	set(ra, tmp, 'MARSTAT', ra[tmp, paste0(MARSTAT,' + casual partner')])	
	tmp	<- ra[, which(MARSTAT%in%c('Polygamous') & !SEXP1YR%in%c('1','Unknown') & POLYMAR<as.numeric(gsub('Unknown','0',gsub('+','',SEXP1YR,fixed=1))) ) ]
	set(ra, tmp, 'MARSTAT', ra[tmp, paste0(MARSTAT,' + casual partner')])
	#	define ever alcohol use during sex
	set(rh, NULL, 'ALC', rh[, as.character(factor(ALC, levels=c(0,1), labels=c('N','Y')))])
	#	redefine SEXP1YR
	set(rh, rh[, which(!SEXP1YR%in%c('0','1','2','Unknown'))], 'SEXP1YR','3+')
	set(ra, ra[, which(!SEXP1YR%in%c('0','1','2','Unknown'))], 'SEXP1YR','3+')
	#	redefine SEXP1OUT
	set(rh, rh[, which(!SEXP1OUT%in%c('0','1','2','Unknown'))], 'SEXP1OUT','3+')
	set(ra, ra[, which(!SEXP1OUT%in%c('0','1','2','Unknown'))], 'SEXP1OUT','3+')
	#	check circumcision
	tmp		<- rh[, which(CIRCUM=='Y' & SEX=='F')]
	cat('\nWarning: found female circumcised --> set to NA' ,rh[tmp, paste(RID, collapse=' ')])	
	set(rh, tmp, 'CIRCUM', NA_integer_)
	#	define COMM_TYPE
	set(rh, NULL, 'COMM_NUM', rh[, as.character(COMM_NUM)])
	set(ra, NULL, 'COMM_NUM', ra[, as.character(COMM_NUM)])
	tmp		<- data.table(	COMM_NUM=	c("1","2","3","4","5","6","7","8","9","14","15","16","18","19","22","23","24","25","29","32","33","34","35","36","38","40","44","45","46","51","52","53","54","55", "56","57","58","59","60","61","62","65","67","74","77","81","84","89","94","95","103","106","107","108","109","120","177", "183", "256", "370","391","401","451", "468","602", "754", "755", "760", "770","771","772","773","774","776"),
			COMM_TYPE=	c("T","A","A","T","A","A","A","A","A", "A", "A", "T", "A", "A", "T", "A", "T", "A", "A", "A", "T", "A", "A", "A", "F", "A", "A", "A", "A", "T", "A", "A", "A", "A",  "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",  "A","A",   "A",  "T",  "A",  "A",  "A",  "A",   "A",   "A",   "A",  "A", "A",  "A",    "A",  "A",  "A",    "A",   "A", "F",  "F",  "A",  "A",  "F",   "T"))
	set(tmp, NULL, 'COMM_TYPE', tmp[, as.character(factor(COMM_TYPE, levels=c('A','T','F'), labels=c('agrarian','trading','fisherfolk')))])		
	stopifnot(!length(setdiff( rh[, COMM_NUM], tmp[, COMM_NUM] )))
	stopifnot(!length(setdiff( ra[, COMM_NUM], tmp[, COMM_NUM] )))
	rh		<- merge(rh, tmp, by='COMM_NUM')
	ra		<- merge(ra, tmp, by='COMM_NUM')	
	#	merge community numbers for same community
	set(rh, NULL, 'COMM_NUM', rh[, gsub('^107$|^16$','16m',gsub('^776$|^51$','51m',gsub('^4$|^24$','24m',gsub('^1$|^22$','22m',COMM_NUM))))])
	set(ra, NULL, 'COMM_NUM', ra[, gsub('^107$|^16$','16m',gsub('^776$|^51$','51m',gsub('^4$|^24$','24m',gsub('^1$|^22$','22m',COMM_NUM))))])
	set(rd, NULL, 'COMM_NUM', rd[, gsub('^107$|^16$','16m',gsub('^776$|^51$','51m',gsub('^4$|^24$','24m',gsub('^1$|^22$','22m',COMM_NUM))))])
	#	add anonymized IDs
	dc		<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv'))
	dc		<- unique(subset(dc, select=c('COMM_NUM','COMM_NUM_A')))	
	rh		<- merge(rh, dc, by='COMM_NUM')
	ra		<- merge(ra, dc, by='COMM_NUM')
	rd		<- merge(rd, dc, by='COMM_NUM')
	#	set to NULL
	set(rh, NULL, c('VDEX','EVERMARR','CURRMARR','RELIGION','POLYMAR','DUMMY','RLTN1','RLTN2','RLTN3','RLTN4','RLTN_NAMED'), NULL)
	set(rh, NULL, c('BVDEX','EVERSEX','SEXGIFT','SEXYEAR','EDUCATE','EDUCYRS','EDCAT','ARVMED','CNDEVER1','RNYRCON1','CNDEVER2','RNYRCON2','CNDEVER3','RNYRCON3','CNDEVER4','RNYRCON4','RLTNCON1'),NULL)
	set(rh, NULL, c('RLTNCON2','RLTNCON3','RLTNCON4','ALC1B','ALC2B','ALC3B','ALC4B','ALC1F','ALC2F','ALC3F','ALC4F','OCCUP11','OCCUP12','OCCUP13','OCCUP14','OCCUP21','OCCUP22','OCCUP23','OCCUP24','SEXHIGH','SEXOUT'),NULL)
	set(rh, NULL, c('SEXCAT','PREVMAR','AGECAT','AGECAT2','HIVPREV2','UNDER25','AGE15TO19','AGE20TO24','AGE25TO29','AGE30TO34','AGE35TO39','AGE40TO44','AGE45TO49','OCCLAG1','SUM_ALC'),NULL)
	set(rh, NULL, c('SEXACTIVE','MULTIPART','CAS','SUMCON','CONCON','NEVERSEX','OCCUP1','OCCUP2','SEXPEVER'),NULL)
	set(rn, NULL, c('SAMPLEREASON'), NULL)
	rd[, COHORT:= 'RCCS']
	rh[, COHORT:= 'RCCS']
	ra[, COHORT:= 'RCCS']
	#
	list(rd=rd, rh=rh, ra=ra, rn=rn)
}

RakaiFull.phylogeography.180322.get.data.eligibility.participation.sequenced<- function()
{
	require(data.table)
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/rakai_elibility.rda"
	outfile.base			<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/"
	load(infile)
	
	#	subset to data of interest
	de	<- as.data.table(eldat)	
	de	<- subset(de, status%in%c('_Participated','Away','Blood refusal','Missing data','Other','Refused','urine sample'))
	de	<- subset(de, visit<17)
	de	<- subset(de, select=c(PERM_ID, CURR_ID, visit, RESIDENT, MOBILITY, REGION, COMM_NUM, HH_NUM, SEX, STUDY_ID, status, date, AGEYRS, inmigrant))
	setnames(de, colnames(de), toupper(colnames(de)))
	#	define PARTICIPATED as "participated, missing data ok"
	#	TODO take out missing data
	de[, PARTICIPATED:= as.integer(STATUS%in%c('_Participated'))]
	de[, DATE:= hivc.db.Date2numeric(DATE)]	
	
	#	merge communities that are very close / identical
	setnames(de, 'COMM_NUM', 'COMM_NUM_RAW')
	set(de, NULL, 'COMM_NUM', de[, gsub('^107$|^16$','16m',gsub('^776$|^51$','51m',gsub('^4$|^24$','24m',gsub('^1$|^22$','22m',COMM_NUM_RAW))))])
	
	#	fixup missing permanent IDs with Study IDs
	stopifnot( nrow(subset(de, is.na(PERM_ID) & is.na(STUDY_ID)))==0 )
	tmp		<- de[, which(is.na(PERM_ID))]
	set(de, tmp, 'PERM_ID', de[tmp, STUDY_ID])
	#	find study participants with multiple permanent IDs and fixup
	tmp		<- subset(de, !is.na(STUDY_ID))[, list(N_PERM=length(unique(PERM_ID))), by='STUDY_ID']
	tmp		<- subset(tmp, N_PERM>1, STUDY_ID)
	tmp[, DUMMY:=1]
	de		<- merge(de, tmp, by='STUDY_ID', all.x=1)
	tmp		<- de[, which(DUMMY==1)]
	set(de, tmp, 'PERM_ID', de[tmp, STUDY_ID])
	de[, DUMMY:=NULL]
	
	#	we care about wether individuals participated in any of the visits 15, 15.1, 16
	#	the best we can do is look at permanent ids, even though they are not unique
	tmp		<- de[, list(PARTICIPATED_ANY_VISIT=as.integer(any(PARTICIPATED==1))), by='PERM_ID']
	de		<- merge(de, tmp, by='PERM_ID')
	#	to those of ever participated, find study id
	tmp		<- subset(de, PARTICIPATED_ANY_VISIT==1)
	stopifnot( nrow(subset(tmp, PARTICIPATED==1 & is.na(STUDY_ID)))==0 )
	tmp		<- tmp[, list(DUMMY= STUDY_ID[PARTICIPATED==1][1]), by='PERM_ID']
	de		<- merge(de, tmp, by='PERM_ID', all.x=TRUE)
	tmp		<- de[, which(is.na(STUDY_ID) & !is.na(DUMMY))]
	set(de, tmp, 'STUDY_ID', de[tmp,DUMMY])
	de[, DUMMY:=NULL]
	setnames(de, 'STUDY_ID','RID')
	#	there are a few individuals with Study ID who did not "participate" when missing not included
	#subset(de, PARTICIPATED_ANY_VISIT==0 & !is.na(STUDY_ID))
	
	#	prepare HIV status
	tmp		<- RakaiCirc.epi.get.info.170208()
	rd		<- tmp$rd
	rneuro	<- tmp$rn
	ra		<- tmp$ra
	#	prepare self report ART
	#	find those who report to be on ART at their first visit in 15-17
	rart	<- merge(ra, ra[VISIT>=14 & VISIT<17, list(VISIT=min(VISIT)), by='RID'], by=c('RID','VISIT'))	
	set(rart, rart[, which(is.na(ARVMED))], 'ARVMED', 2L)
	setnames(rart, 'ARVMED', 'SELFREPORTART_AT_FIRST_VISIT')
	rart	<- subset(rart, select=c(RID, SELFREPORTART_AT_FIRST_VISIT))
	#set(rart, NULL,  'SELFREPORTART', rart[, factor(SELFREPORTART, levels=c(0,1,2), labels=c('no','yes','unknown'))])
	#	select meta-data closest to first pos date	
	tmp		<- rd[, list(VISIT=VISIT[which.min(abs(DATE-FIRSTPOSDATE))]), by='RID']
	tmp2	<- rd[, list(PANGEA=as.integer(any(PANGEA==1))), by='RID']
	rd		<- merge(rd, tmp, by=c('RID','VISIT'))
	rd[, HIV:= 1L]
	rd[, PANGEA:=NULL]
	rd		<- merge(rd, tmp2, by='RID')
	#
	ra		<- unique(subset(ra, !is.na(FIRSTPOSDATE), select=c(RID, SEX, VISIT, VISDATE, FIRSTPOSDATE, ARVMED, COMM_NUM, COMM_NUM_A)))
	tmp		<- ra[, list(VISIT=VISIT[which.min(abs(VISDATE-FIRSTPOSDATE))]), by='RID']	
	ra		<- merge(ra, tmp, by=c('RID','VISIT'))
	ra[, HIV_1517:= 1L]
	#rd		<- subset(rd, BIRTHYR>2010-50 & (is.na(EST_DATEDIED) | (!is.na(EST_DATEDIED) & EST_DATEDIED>2010)))
		
	#	add HIV status
	tmp		<- rd[FIRSTPOSDATE<2015+2/12, list(HIV=max(HIV), PANGEA=max(PANGEA)), by='RID'] 			
	de		<- merge(de, tmp, by='RID', all.x=1)	
	tmp		<- unique(subset(ra, FIRSTPOSDATE<2015+2/12, select=c(RID, HIV_1517, FIRSTPOSDATE)))
	de		<- merge(de, tmp, by='RID', all.x=1)
	
	#	add ART status
	de		<- merge(de, rart, by='RID', all.x=1)
	
	#	get individuals with at least 750nt overlap with another individual at 20X
	infile	<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/close_pairs_170704_cl35_withmedian.rda"
	load(infile)
	rn	<- subset(rn, NEFF>3)	
	rn	<- data.table(RID=rn[, unique(c(ID1, ID2))],MIN_PNG_OUTPUT=1)		
	#	add to rn individuals with any sequence data
	load('~/Dropbox (SPH Imperial College)/2015_PANGEA_DualPairsFromFastQIVA/RakaiAll_input_170301/Rakai_phyloscanner_170301_b75_part2.rda')
	tmp		<- unique(subset(dc, !is.na(SID), select=RID))
	tmp[, BAM_OUTPUT:=1L]
	rn		<- merge(tmp, rn, all=1, by='RID')
	set(rn, rn[, which(is.na(MIN_PNG_OUTPUT))],'MIN_PNG_OUTPUT',0L)
	#	add
	de		<- merge(de, rn, by='RID', all.x=1)
	rd		<- merge(rd, rn, by='RID', all.x=1)
	rneuro	<- merge(rneuro, rn, by='RID', all.x=1)
		
	#	tmp above has 4074 individuals
	#	of those, 3758 are in 'de', ie participated in rounds 15-17
	
	#	for every individual that ever participated, keep first record of when participated
	#	for every individual that never participated, keep last record
	tmp		<- subset(de, PARTICIPATED_ANY_VISIT==0)[, list(VISIT=max(VISIT)), by='PERM_ID']
	tmp		<- merge(de, tmp, by=c('PERM_ID','VISIT'))
	tmp2	<- subset(de, PARTICIPATED_ANY_VISIT==1)[, list(VISIT=min(VISIT[PARTICIPATED==1])), by='PERM_ID']
	de		<- merge(de, tmp2, by=c('PERM_ID','VISIT'))
	de		<- rbind(de, tmp)
	
	#	fill in missing HIV etc
	tmp		<- de[, which(PARTICIPATED_ANY_VISIT==1 & is.na(HIV))]
	set(de, tmp, 'HIV', 0L)
	tmp		<- de[, which(PARTICIPATED_ANY_VISIT==1 & is.na(HIV_1517))]
	set(de, tmp, 'HIV_1517', 0L)
	tmp		<- de[, which(PARTICIPATED_ANY_VISIT==1 & is.na(BAM_OUTPUT))]
	set(de, tmp, c('BAM_OUTPUT','MIN_PNG_OUTPUT'), 0L)
	tmp		<- rd[, which(is.na(BAM_OUTPUT))]
	set(rd, tmp, c('BAM_OUTPUT','MIN_PNG_OUTPUT'), 0L)
	
	de[, table(HIV, HIV_1517)]	
	# 672 with HIV_1517==1 and HIV==0
	#	0 with HIV_1517==0 and HIV==1
	# use HIV_1517 below
		
	rneuro[, sum(MIN_PNG_OUTPUT)]			# 224	
	rd[, sum(MIN_PNG_OUTPUT)]				# 2746
	de[, sum(MIN_PNG_OUTPUT, na.rm=TRUE)]	# 2689
	
	#	calculate number for whom we have deep seq output
	tmp		<- subset(rd, MIN_PNG_OUTPUT==1)[, list(DEEP_SEQ=length(RID)), by=c('COMM_NUM')]	
	tmp		<- subset(tmp, DEEP_SEQ>0)
	tmp[, DEEP_SEQ:=NULL]	
	tmp[, COMM_ANY_MIN_PNG_OUTPUT:= 1]
	de		<- merge(de, tmp, by='COMM_NUM', all.x=1)
	rd		<- merge(rd, tmp, by='COMM_NUM', all.x=1)
	set(de, de[, which(is.na(COMM_ANY_MIN_PNG_OUTPUT))], 'COMM_ANY_MIN_PNG_OUTPUT', 0L)
	set(rd, rd[, which(is.na(COMM_ANY_MIN_PNG_OUTPUT))], 'COMM_ANY_MIN_PNG_OUTPUT', 0L)
	tmp		<- unique(subset(rd, select=c(COMM_NUM,COMM_NUM_A)))
	de		<- merge(de, tmp, by='COMM_NUM')
	
	#	subset to only those communities with seq data
	#	4 fishing, 34 inland
	#def		<- copy(de)
	#de		<- subset(de, !is.na(DUMMY))
	#de[, DUMMY:=NULL]
	#rd		<- subset(rd, !is.na(DUMMY))
	#rd[, DUMMY:=NULL]	
	#

	#	add age category at midpoint of observation period, 2013.25
	set(de, de[, which(is.na(DATE))], 'DATE', hivc.db.Date2numeric(as.Date("2011-09-02")))	
	de[, AGE_AT_MID:= 2013.25 - (hivc.db.Date2numeric(DATE)-AGEYRS)]
	de[, AGE_AT_MID_C:= cut(AGE_AT_MID, breaks=c(10,25,35,65), labels=c('15-24','25-34','35+'), right=FALSE)]
	stopifnot( nrow(subset(de, is.na(AGE_AT_MID_C)))==0 )
	
	
	
	#	prepare inmigrant -- identify inmigrants from fishing communities and from external
	infile		<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/RakaiPangeaMetaData_v2.rda"
	load(infile)
	inmigrant	<- as.data.table(inmigrant)	
	#	plot fisherfolk	to figure out how much of a radius we need
	if(0)
	{
		zf		<- data.table(longitude=c(31.763,31.7968,31.754,31.838), latitude=c(-0.915, -0.6518, -0.703, -0.497), ID= c('Kasensero','Bukyanju','NearBwende','Fish4'))
		make_circles <- function(centers, radius, nPoints = 100){
			# centers: the data frame of centers with ID
			# radius: radius measured in kilometer
			#
			meanLat <- mean(centers$latitude)
			# length per longitude changes with lattitude, so need correction
			radiusLon <- radius /111 / cos(meanLat/57.3) 
			radiusLat <- radius / 111
			circleDF <- data.frame(ID = rep(centers$ID, each = nPoints))
			angle <- seq(0,2*pi,length.out = nPoints)
			
			circleDF$lon <- unlist(lapply(centers$longitude, function(x) x + radius * cos(angle)))
			circleDF$lat <- unlist(lapply(centers$latitude, function(x) x + radius * sin(angle)))
			return(circleDF)
		}
		zc <- make_circles(zf, 0.01)
		ggmap(zm) +			
				geom_point(data=zf, aes(x=longitude, y=latitude, pch=ID), stroke=1.5, alpha=0.8) +
				geom_polygon(data=zc, aes(lon, lat, group = ID), color = "red", alpha = 0)
		#	radius of length 0.01 should catch					
		tmp		<- inmigrant[, list( 	DIST_KASENSERO= sqrt( (inmig_lon- 31.763)^2 + (inmig_lat - (-0.915))^2),
						DIST_BUKYANJU= sqrt( (inmig_lon- 31.7968)^2 + (inmig_lat - (-0.6518))^2),
						DIST_NEARBWENDE= sqrt( (inmig_lon- 31.754)^2 + (inmig_lat - (-0.703))^2),
						DIST_FISH4= sqrt( (inmig_lon- 31.838)^2 + (inmig_lat - (-0.497))^2)
				), by=c('RCCS_studyid','visit')]
		tmp		<- melt(tmp, id.vars=c('RCCS_studyid','visit'))
		ggplot(subset(tmp, value<0.3), aes(x=value)) +
				geom_histogram(binwidth=0.01) +
				facet_grid(variable~.)
		#	1 looks good
		zfd		<- merge(inmigrant, subset(tmp, value<0.01, c(RCCS_studyid, visit)), by=c('RCCS_studyid','visit'))
		#	so fishing sites are MALEMBO DIMU KASENSERO NAMIREMBE 
		#	but there are spelling mistakes		
	}
	#
	#	clean up inmigrant	
	#
	#	inmigrant[, unique(sort(inmig_place))]
	set(inmigrant, NULL, 'inmig_place', inmigrant[, gsub('DDIMO|DDIMU|DIMO|DIMU','DIMU',inmig_place)])
	set(inmigrant, inmigrant[, which(grepl('MALEMBO',inmig_place))], 'inmig_place', 'MALEMBO')
	set(inmigrant, NULL, 'inmig_place', inmigrant[, gsub("KASEMSERO","KASENSERO",inmig_place)])
	set(inmigrant, inmigrant[, which(grepl('KASENSERO',inmig_place))], 'inmig_place', 'KASENSERO')
	
	#	define from_fishing and from_outside and from_inland
	inmigrant[, INMIG_LOC:= 'inland' ]
	set(inmigrant, inmigrant[, which(grepl('MALEMBO|DIMU|KASENSERO|NAMIREMBE',inmig_place))], 'INMIG_LOC','fisherfolk')
	set(inmigrant, inmigrant[, which(inmig_admin0!='Uganda')], 'INMIG_LOC','external')
	set(inmigrant, inmigrant[, which(inmig_admin1!='Rakai')], 'INMIG_LOC','external')
	set(inmigrant, inmigrant[, which(is.na(inmig_admin1))], 'INMIG_LOC','origin_unknown')
	inmigrant[, table(INMIG_LOC)]
	#	  external fisherfolk     inland    unknown 
    #   	762         40       1571        381 
	inmigrant	<- subset(inmigrant, select=c(RCCS_studyid, date, INMIG_LOC))	
	inmigrant[, date:= hivc.db.Date2numeric(date)]	
	setnames(inmigrant, colnames(inmigrant), gsub('DATE','INMIGRATE_DATE',gsub('RCCS_STUDYID','RID',toupper(colnames(inmigrant)))))
	inmigrant	<- merge(inmigrant, de, by='RID', all.x=TRUE)
	#	ignore inmigrants not seen in 15-16
	inmigrant	<- subset(inmigrant, !is.na(STATUS))
	#	only inmigration date closest to first visit and before the first visit
	tmp			<- inmigrant[, list(FIRSTVISITDATE=min(DATE)), by='RID']
	inmigrant	<- merge(inmigrant, tmp, by='RID')	
	tmp			<- inmigrant[INMIGRATE_DATE<=FIRSTVISITDATE, list(INMIGRATE_DATE= INMIGRATE_DATE[which.min(FIRSTVISITDATE-INMIGRATE_DATE)]), by='RID']
	inmigrant	<- merge(inmigrant, tmp, by=c('RID','INMIGRATE_DATE'))
	inmigrant	<- unique(inmigrant, by=c('RID','VISIT'))
	#	select those inmigrated in the last 2 years
	tmp			<- subset(inmigrant, (FIRSTVISITDATE-INMIGRATE_DATE)<=2, c(RID, INMIG_LOC))
	setnames(tmp, 'INMIG_LOC', 'INMIG_2YRS_LOC')
	tmp[, INMIG_2YRS:=1]
	de			<- merge(de, tmp, by='RID', all.x=TRUE)
	set(de, de[, which(is.na(INMIG_2YRS))], 'INMIG_2YRS', 0)
	set(de, de[, which(is.na(INMIG_2YRS_LOC))], 'INMIG_2YRS_LOC', 'resident')	
	
	#	some fixup
	set(de, de[, which(PARTICIPATED_ANY_VISIT==0 & MIN_PNG_OUTPUT==1)], 'PARTICIPATED_ANY_VISIT', 1L)
	set(de, de[, which(SELFREPORTART_AT_FIRST_VISIT==1 & MIN_PNG_OUTPUT==1)], 'SELFREPORTART_AT_FIRST_VISIT', 0L)
	
	
	de[, table(HIV_1517, SELFREPORTART_AT_FIRST_VISIT, useNA='if')]
	#	        SELFREPORTART
	#	HIV_1517     0         1     2  <NA>
    #		    0    20715     8     7    10
    #			1     3884  1256     2     0
    #			<NA>     0     0     0 11763
		
	#	now calculate participation by community and gender
	des		<- de[, list(N=length(CURR_ID)), by=c('COMM_NUM','COMM_NUM_A','COMM_ANY_MIN_PNG_OUTPUT','SEX','PARTICIPATED_ANY_VISIT')]
	set(des, NULL, 'PARTICIPATED_ANY_VISIT', des[, factor(PARTICIPATED_ANY_VISIT, levels=c('1','0'), labels=c('PART_EVER','PART_NEVER'))])
	des		<- dcast.data.table(des, COMM_NUM+COMM_NUM_A+COMM_ANY_MIN_PNG_OUTPUT+SEX~PARTICIPATED_ANY_VISIT, value.var='N')
	
	#	now calculate HIV pos by end of round 16, and sequenced by end of round 16, by community and gender
	#	among those that participated in rounds 15 - 16
	tmp		<- subset(de, PARTICIPATED_ANY_VISIT==1)
	tmp		<- tmp[, list(	HIV_1516_YES=length(which(HIV_1517==1)), 
							HIV_1516_NO=length(which(HIV_1517==0)),
							SLART_AT_FIRST_VISIT=length(which(SELFREPORTART_AT_FIRST_VISIT==1)),
							DEEP_SEQ_1516=length(which(MIN_PNG_OUTPUT==1)) ), by=c('COMM_NUM','SEX')]	
	des		<- merge(des, tmp, by=c('COMM_NUM','SEX'), all=TRUE)
	
	#	add community type
	des[, COMM_TYPE:= as.character(factor(substr(COMM_NUM_A,1,1)=='f',levels=c(TRUE,FALSE),labels=c('fisherfolk','inland')))]	
	
	#	now do the same by community and gender and age category
	desa	<- de[, list(N=length(CURR_ID)), by=c('COMM_NUM','COMM_NUM_A','COMM_ANY_MIN_PNG_OUTPUT','SEX','AGE_AT_MID_C','PARTICIPATED_ANY_VISIT')]
	set(desa, NULL, 'PARTICIPATED_ANY_VISIT', desa[, factor(PARTICIPATED_ANY_VISIT, levels=c('1','0'), labels=c('PART_EVER','PART_NEVER'))])
	desa	<- dcast.data.table(desa, COMM_NUM+COMM_NUM_A+COMM_ANY_MIN_PNG_OUTPUT+SEX+AGE_AT_MID_C~PARTICIPATED_ANY_VISIT, value.var='N')
	tmp		<- subset(de, PARTICIPATED_ANY_VISIT==1)
	tmp		<- tmp[, list(	HIV_1516_YES=length(which(HIV_1517==1)), 
							HIV_1516_NO=length(which(HIV_1517==0)), 
							SLART_AT_FIRST_VISIT=length(which(SELFREPORTART_AT_FIRST_VISIT==1)),
							DEEP_SEQ_1516=length(which(MIN_PNG_OUTPUT==1)) ), by=c('COMM_NUM','SEX','AGE_AT_MID_C')]	
	desa	<- merge(desa, tmp, by=c('COMM_NUM','SEX','AGE_AT_MID_C'), all=TRUE)
	desa[, COMM_TYPE:= as.character(factor(substr(COMM_NUM_A,1,1)=='f',levels=c(TRUE,FALSE),labels=c('fisherfolk','inland')))]
	#	check for missing and replace by 0		
	for(x in c('PART_EVER','PART_NEVER','HIV_1516_YES','HIV_1516_NO','SLART_AT_FIRST_VISIT','DEEP_SEQ_1516'))
		set(desa, which(is.na(desa[[x]])), x, 0)
	
	#	now do the same by community and gender and migration category
	desm	<- de[, list(N=length(CURR_ID)), by=c('COMM_NUM','COMM_NUM_A','COMM_ANY_MIN_PNG_OUTPUT','SEX','INMIG_2YRS_LOC','PARTICIPATED_ANY_VISIT')]
	set(desm, NULL, 'PARTICIPATED_ANY_VISIT', desm[, factor(PARTICIPATED_ANY_VISIT, levels=c('1','0'), labels=c('PART_EVER','PART_NEVER'))])
	desm	<- dcast.data.table(desm, COMM_NUM+COMM_NUM_A+COMM_ANY_MIN_PNG_OUTPUT+SEX+INMIG_2YRS_LOC~PARTICIPATED_ANY_VISIT, value.var='N')
	tmp		<- subset(de, PARTICIPATED_ANY_VISIT==1)
	tmp		<- tmp[, list(	HIV_1516_YES=length(which(HIV_1517==1)), 
							HIV_1516_NO=length(which(HIV_1517==0)),
							SLART_AT_FIRST_VISIT=length(which(SELFREPORTART_AT_FIRST_VISIT==1)),
							DEEP_SEQ_1516=length(which(MIN_PNG_OUTPUT==1)) ), by=c('COMM_NUM','SEX','INMIG_2YRS_LOC')]	
	desm	<- merge(desm, tmp, by=c('COMM_NUM','SEX','INMIG_2YRS_LOC'), all=TRUE)
	desm[, COMM_TYPE:= as.character(factor(substr(COMM_NUM_A,1,1)=='f',levels=c(TRUE,FALSE),labels=c('fisherfolk','inland')))]
	#	check for missing and replace by 0		
	for(x in c('PART_EVER','PART_NEVER','HIV_1516_YES','HIV_1516_NO','SLART_AT_FIRST_VISIT','DEEP_SEQ_1516'))
		set(desm, which(is.na(desm[[x]])), x, 0)
	
	
	
	#	now calculate HIV pos by end of round 16, and sequenced by end of round 16, by community and gender
	#	among those that ever participated
	#	excluding those that died before 2010, or that reached age 60 before 2010
	tmp		<- subset(rd, is.na(EST_DATEDIED) | (!is.na(EST_DATEDIED) & EST_DATEDIED>=2010))	
	tmp		<- subset(tmp, (2010-BIRTHYR)<60)
	rds		<- tmp[, list(HIV_EVER_YES=length(which(HIV==1)), DEEP_SEQ_EVER=length(which(MIN_PNG_OUTPUT==1)) ), by=c('COMM_NUM','SEX')]	
	rds[, sum(DEEP_SEQ_EVER)]	
	# 2665 (among those in rd who are not too old and did not die before 2010)
	# now up to 2700

	save(des, desa, desm, rds, de, df, rd, file='~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/180322_sampling_by_gender_age.rda')
}

RakaiFull.phylogeography.180322.table1<- function()
{
	require(data.table)
	require(Hmisc)	
	
	infile.allpairs				<- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_networksallpairs.rda'	
	infile	<- '~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/180322_sampling_by_gender_age.rda'
	outfile.base	<- gsub('_networksallpairs.rda','',infile.allpairs)
	load(infile)
	
	#	there s just 1 person ever sequenced in comms 401 and 55, ignore
	#	this means we are just working with des, not rds
	des		<- merge(des, rds, by=c('COMM_NUM','SEX'), all.x=TRUE)
	subset(des[, list(DEEP_SEQ_1516=sum(DEEP_SEQ_1516)), by='COMM_NUM'], DEEP_SEQ_1516==0)
	#	36, 55, 401
	#des		<- subset(des, !COMM_NUM%in%c('36','55','401'))
	#desa	<- subset(desa, !COMM_NUM%in%c('36','55','401'))
	
	length( setdiff( subset(tmp, !is.na(RID) & HIV==1 & MIN_PNG_OUTPUT==1)[, RID], subset(de, !is.na(RID) & HIV==1 & MIN_PNG_OUTPUT==1)[, RID] ) )
	#	57 individuals that have a sequence who are not in de but in rd
			
	#	overall sequence coverage 
	des[, sum(DEEP_SEQ_1516)]	#2652
	des[, sum(SLART_AT_FIRST_VISIT)]	#1264
	des[, sum(HIV_1516_YES)]	#5142
	des[, sum(HIV_EVER_YES)]	#5620
	des[, sum(DEEP_SEQ_1516)/sum(HIV_EVER_YES)]							# 	0.4761135
	des[, sum(DEEP_SEQ_1516)/sum(HIV_1516_YES)]							# 	0.5156584
	des[, sum(DEEP_SEQ_1516)/sum(HIV_1516_YES-SLART_AT_FIRST_VISIT)]	#	0.6838577
	des[, sum(DEEP_SEQ_1516)/( sum(HIV_EVER_YES * (PART_EVER+PART_NEVER)/PART_EVER) )]	# 0.3295489
	des[, sum(DEEP_SEQ_1516)/( sum(HIV_1516_YES * (PART_EVER+PART_NEVER)/PART_EVER) )]	# 0.3618695
	des[, sum(DEEP_SEQ_1516)/( sum((HIV_1516_YES-SLART_AT_FIRST_VISIT) * (PART_EVER+PART_NEVER)/PART_EVER) )]	# 0.481056
	
	#	make supplementary plots of participation and sequencing by gender
	des[, P_PART_RAW:= PART_EVER/(PART_EVER+PART_NEVER)]
	des[, P_SEQ_RAW_1516:= DEEP_SEQ_1516/(HIV_1516_YES-SLART_AT_FIRST_VISIT)]
		
	tmp		<- melt(des, id.vars=c('COMM_TYPE','COMM_NUM_A','SEX'), measure.vars=c('P_PART_RAW','P_SEQ_RAW_1516'))	
	set(tmp, NULL, 'COMM_TYPE', tmp[, factor(COMM_TYPE, levels=c('fisherfolk','inland'), labels=c('fishing site','inland community'))])
	set(tmp, NULL, 'SEX', tmp[, factor(SEX, levels=c('M','F'), labels=c('male','female'))])
	ggplot(tmp, aes(x=COMM_NUM_A, fill=SEX, y=value)) +
			geom_bar(position='dodge', stat='identity') +
			scale_fill_manual(values=c('female'='hotpink2', 'male'='steelblue2')) +
			scale_y_continuous(labels=scales:::percent, expand=c(0,0), lim=c(0,1)) +
			facet_grid(variable~COMM_TYPE, space='free', scales='free') +
			labs(x='\ncommunity', fill='gender') +
			theme_bw()
	ggsave(file=paste0(outfile.base,'sampling_differences_180322.pdf'), w=12, h=6)
	#
	
	#	number communities with some sequences
	des[, length(unique(COMM_NUM))]
	#	36


	#	make table 1 for paper
	#	get all transmission events part of transmission chains regardless of gender combinations		
	confidence.cut		<- 0.6
	load(infile.allpairs)		
	rtnn[, SXO:= paste0(ID1_SEX,ID2_SEX)]
	rtnn[, SELECT:= NA_character_]
	set(rtnn, rtnn[, which(is.na(PTY_RUN))], 'SELECT', 'insufficient deep sequence data for at least one partner of couple')
	set(rtnn, rtnn[, which(!is.na(PTY_RUN) & is.na(LINK_12) & is.na(LINK_21))], 'SELECT', 'couple most likely not a pair')
	set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED<=confidence.cut)], 'SELECT', 'couple ambiguous if pair or not pair')
	set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED>confidence.cut)], 'SELECT', 'couple most likely a pair direction not resolved')
	set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED>confidence.cut & POSTERIOR_SCORE_12>confidence.cut)], 'SELECT', 'couple most likely a pair direction resolved to 12')
	set(rtnn, rtnn[, which(!is.na(POSTERIOR_SCORE_LINKED) & POSTERIOR_SCORE_LINKED>confidence.cut & POSTERIOR_SCORE_21>confidence.cut)], 'SELECT', 'couple most likely a pair direction resolved to 21')	
	rtnn	<- subset(rtnn, select=c(ID1, ID2, ID1_SEX, ID2_SEX, LINK_12, LINK_21, SXO, SELECT))
	#	determine first concordant pos visit; add metadata at first concordant pos visit
	tmp		<- unique(subset(rd, select=c(RID, FIRSTPOSVIS, FIRSTPOSDATE)))
	setnames(tmp, colnames(tmp), gsub('ID1_RID','ID1',paste0('ID1_',colnames(tmp))))
	rtnn	<- merge(rtnn, tmp, by='ID1')	
	setnames(tmp, colnames(tmp), gsub('ID1','ID2',colnames(tmp)))
	rtnn	<- merge(rtnn, tmp, by='ID2')			
	rtnn[, VISIT_FIRSTCONCPOS:= rtnn[, pmax(ID1_FIRSTPOSVIS,ID2_FIRSTPOSVIS)]]
	set(rtnn, NULL, c('ID1_FIRSTPOSVIS','ID1_FIRSTPOSDATE','ID2_FIRSTPOSVIS','ID2_FIRSTPOSDATE'), NULL)
	#	add in date of birth, comm type from nearest visit we have data on
	tmp		<- unique(subset(rd, select=c(RID, VISIT)))
	setnames(tmp, 'RID', 'ID1')
	tmp		<- unique(merge(rtnn, tmp, by='ID1')[, list(VISIT= VISIT[which.min(abs(VISIT-VISIT_FIRSTCONCPOS))]), by=c('ID1','VISIT_FIRSTCONCPOS')])
	setnames(tmp, 'ID1', 'RID')
	tmp2	<- unique(subset(rd, select=c('RID','VISIT', "COMM_NUM", "COMM_NUM_A","BIRTHDATE")), by=c('RID','VISIT'))
	tmp		<- merge(tmp, tmp2, by=c('RID','VISIT'))
	setnames(tmp, colnames(tmp), gsub('ID1_VISIT_FIRSTCONCPOS','VISIT_FIRSTCONCPOS',gsub('ID1_RID','ID1',paste0('ID1_',colnames(tmp)))))
	set(tmp, NULL, 'ID1_VISIT', NULL)	
	rtnn	<- merge(rtnn, tmp, by=c('ID1','VISIT_FIRSTCONCPOS'), all.x=1)	
	tmp		<- unique(subset(rd, select=c(RID, VISIT)))
	setnames(tmp, 'RID', 'ID2')
	tmp		<- unique(merge(rtnn, tmp, by='ID2')[, list(VISIT= VISIT[which.min(abs(VISIT-VISIT_FIRSTCONCPOS))]), by=c('ID2','VISIT_FIRSTCONCPOS')])
	setnames(tmp, 'ID2', 'RID')
	tmp2	<- unique(subset(rd, select=c('RID','VISIT', "COMM_NUM", "COMM_NUM_A","BIRTHDATE")), by=c('RID','VISIT'))
	tmp		<- merge(tmp, tmp2, by=c('RID','VISIT'))
	setnames(tmp, colnames(tmp), gsub('ID2_VISIT_FIRSTCONCPOS','VISIT_FIRSTCONCPOS',gsub('ID2_RID','ID2',paste0('ID2_',colnames(tmp)))))
	set(tmp, NULL, 'ID2_VISIT', NULL)	
	rtnn	<- merge(rtnn, tmp, by=c('ID2','VISIT_FIRSTCONCPOS'), all.x=1)
	#	get list of unique individuals in these events with metadata
	rtnn[, SELECT2:= 1L]
	set(rtnn, rtnn[, which(grepl('most likely a pair',SELECT) & SXO=='MF')], 'SELECT2', 2L)
	set(rtnn, rtnn[, which(grepl('mf|fm|12|21',SELECT) & SXO=='MF')], 'SELECT2', 3L)
	
	tmp	<- subset(rtnn, LINK_12==1 | LINK_21==1, select=c(ID1, ID1_SEX, ID1_COMM_NUM_A, ID1_BIRTHDATE, SELECT2))
	setnames(tmp, c('ID1','ID1_SEX','ID1_COMM_NUM_A','ID1_BIRTHDATE'), c('RID','SEX','COMM_NUM_A','BIRTHDATE'))	
	tmp2<- subset(rtnn, LINK_12==1 | LINK_21==1, select=c(ID2, ID2_SEX, ID2_COMM_NUM_A, ID2_BIRTHDATE, SELECT2))
	setnames(tmp2, c('ID2','ID2_SEX','ID2_COMM_NUM_A','ID2_BIRTHDATE'), c('RID','SEX','COMM_NUM_A','BIRTHDATE'))	
	tmp	<- rbind(tmp, tmp2)
	set(tmp, NULL, 'COMM_TYPE', tmp[, as.character(factor(substr(COMM_NUM_A,1,1)=='f', levels=c(TRUE,FALSE), labels=c('fishing','inland')))])
	set(tmp, tmp[, which(is.na(BIRTHDATE))],'BIRTHDATE',tmp[, mean(BIRTHDATE, na.rm=TRUE)])
	#	add if person is recent inmigrant
	tmp	<- merge(tmp, subset(de, !is.na(RID), c(RID, INMIG_2YRS_LOC, INMIG_2YRS)), by='RID', all.x=TRUE)
	set(tmp, tmp[, which(is.na(INMIG_2YRS_LOC))], 'INMIG_2YRS_LOC', 'resident')
	dr	<- tmp[, 	{
						z<- which.max(SELECT2)
						list(SEX=SEX[z], COMM_TYPE=COMM_TYPE[z], SELECT2=SELECT2[z], BIRTHDATE=BIRTHDATE[z], INMIG_2YRS_LOC=INMIG_2YRS_LOC[z])
					}, by='RID']
	dr[, AGE_AT_MID:= 2013.25-BIRTHDATE]
	dr[, AGE_AT_MID_C:= cut(AGE_AT_MID, breaks=c(10,25,35,65), labels=c('15-24','25-34','35+'), right=FALSE)]
	
	
	
	#
	# table by community type, sex
	#	
	ans		<- dr[, list(IN_TRM_CHAIN= length(RID[SELECT2>=1])), by=c('COMM_TYPE','SEX')]
	ans		<- merge(ans, dr[, list(LINKED= length(RID[SELECT2>=2])), by=c('COMM_TYPE','SEX')], by=c('COMM_TYPE','SEX'))
	ans		<- merge(ans, dr[, list(LINKED_DIR= length(RID[SELECT2>=3])), by=c('COMM_TYPE','SEX')], by=c('COMM_TYPE','SEX'))
	set(ans, ans[, which(grepl('fish',COMM_TYPE))], 'COMM_TYPE', 'fisherfolk')
	tmp		<- des[, list(	ELIGIBLE_1516= sum(PART_EVER+PART_NEVER), 
							PART_1516= sum(PART_EVER),
							HIV_1516= sum(HIV_1516_YES),
							ARTNAIVE= sum(HIV_1516_YES-SLART_AT_FIRST_VISIT),
							DEEP_SEQ_1516= sum(DEEP_SEQ_1516)
							), by=c('COMM_TYPE','SEX')]
	ans		<- merge(tmp, ans, by=c('COMM_TYPE','SEX'))	
	
	
	#
	# table by community type, sex, age
	#
	ansa	<- dr[, list(IN_TRM_CHAIN= length(RID[SELECT2>=1])), by=c('COMM_TYPE','SEX','AGE_AT_MID_C')]
	ansa	<- merge(ansa, dr[, list(LINKED= length(RID[SELECT2>=2])), by=c('COMM_TYPE','SEX','AGE_AT_MID_C')], by=c('COMM_TYPE','SEX','AGE_AT_MID_C'))
	ansa	<- merge(ansa, dr[, list(LINKED_DIR= length(RID[SELECT2>=3])), by=c('COMM_TYPE','SEX','AGE_AT_MID_C')], by=c('COMM_TYPE','SEX','AGE_AT_MID_C'))
	set(ansa, ansa[, which(grepl('fish',COMM_TYPE))], 'COMM_TYPE', 'fisherfolk')
	tmp		<- desa[, list(	ELIGIBLE_1516= sum(PART_EVER+PART_NEVER), 
							PART_1516= sum(PART_EVER),
							HIV_1516= sum(HIV_1516_YES),
							ARTNAIVE= sum(HIV_1516_YES-SLART_AT_FIRST_VISIT),
							DEEP_SEQ_1516= sum(DEEP_SEQ_1516)
							), by=c('COMM_TYPE','SEX','AGE_AT_MID_C')]
	ansa	<- merge(tmp, ansa, by=c('COMM_TYPE','SEX','AGE_AT_MID_C'), all=TRUE)
	# add total to ansa
	ansa	<- melt(ansa, id.vars=c('COMM_TYPE','SEX','AGE_AT_MID_C'))
	tmp		<- ansa[, list(SEX=SEX, AGE_AT_MID_C=AGE_AT_MID_C, prop=value/sum(value), LEGEND=paste(value,' (',100*round(value/sum(value),d=2),'%)',sep='')), by=c('COMM_TYPE','variable')]
	ansa	<- merge(ansa, tmp, by=c('variable','COMM_TYPE','SEX','AGE_AT_MID_C'))	
	tmp		<- ansa[, list(SEX='Any', AGE_AT_MID_C='Any', value=sum(value), LEGEND=as.character(sum(value))), by=c('COMM_TYPE','variable')]
	tmp2	<- ansa[, list(COMM_TYPE='Any',SEX='Any', AGE_AT_MID_C='Any', value=sum(value), LEGEND=as.character(sum(value))), by=c('variable')]
	ansa	<- rbind(ansa,tmp,tmp2, fill=TRUE)
	ansa[, LABEL:= paste(COMM_TYPE, SEX, AGE_AT_MID_C, sep='-')]
	ansa	<- dcast.data.table(ansa, LABEL+COMM_TYPE+SEX+AGE_AT_MID_C~variable, value.var='LEGEND')	
	write.csv(ansa, row.names=FALSE, file=paste0(outfile.base, '_table1.csv'))	
	
	#
	# table by community type, sex, migration status
	#
	ansm	<- dr[, list(IN_TRM_CHAIN= length(RID[SELECT2>=1])), by=c('COMM_TYPE','SEX','INMIG_2YRS_LOC')]
	ansm	<- merge(ansm, dr[, list(LINKED= length(RID[SELECT2>=2])), by=c('COMM_TYPE','SEX','INMIG_2YRS_LOC')], by=c('COMM_TYPE','SEX','INMIG_2YRS_LOC'))	
	set(ansm, ansm[, which(grepl('fish',COMM_TYPE))], 'COMM_TYPE', 'fisherfolk')
	tmp		<- desm[, list(	#ELIGIBLE_1516= sum(PART_EVER+PART_NEVER), 
							#PART_1516= sum(PART_EVER),
							HIV_1516= sum(HIV_1516_YES),
							ARTNAIVE= sum(HIV_1516_YES-SLART_AT_FIRST_VISIT),
							DEEP_SEQ_1516= sum(DEEP_SEQ_1516)
			), by=c('COMM_TYPE','SEX','INMIG_2YRS_LOC')]
	ansm	<- merge(tmp, ansm, by=c('COMM_TYPE','SEX','INMIG_2YRS_LOC'), all=TRUE)
	# add total to ansm
	ansm	<- melt(ansm, id.vars=c('COMM_TYPE','SEX','INMIG_2YRS_LOC'))
	set(ansm, ansm[, which(is.na(value))], 'value', 0L)
	tmp		<- ansm[, list(SEX=SEX, INMIG_2YRS_LOC=INMIG_2YRS_LOC, prop=value/sum(value), LEGEND=paste(value,' (',100*round(value/sum(value),d=2),'%)',sep='')), by=c('COMM_TYPE','variable')]
	ansm	<- merge(ansm, tmp, by=c('variable','COMM_TYPE','SEX','INMIG_2YRS_LOC'))	
	tmp		<- ansm[, list(SEX='Any', INMIG_2YRS_LOC='Any', value=sum(value), LEGEND=as.character(sum(value))), by=c('COMM_TYPE','variable')]
	tmp2	<- ansm[, list(COMM_TYPE='Any',SEX='Any', INMIG_2YRS_LOC='Any', value=sum(value), LEGEND=as.character(sum(value))), by=c('variable')]
	ansm	<- rbind(ansm,tmp,tmp2, fill=TRUE)
	ansm[, LABEL:= paste(COMM_TYPE, SEX, INMIG_2YRS_LOC, sep='-')]
	set(ansm, NULL, 'INMIG_2YRS_LOC', ansm[, factor(INMIG_2YRS_LOC, levels=c('Any','resident','inland','fisherfolk','external','origin_unknown'))])
	ansm	<- dcast.data.table(ansm, LABEL+COMM_TYPE+SEX+INMIG_2YRS_LOC~variable, value.var='LEGEND')
	setkey(ansm, COMM_TYPE, SEX, INMIG_2YRS_LOC)	
	write.csv(ansm, row.names=FALSE, file=paste0(outfile.base, '_table1_commtype_sex_migration.csv'))	
	
	
	#
	#	table by sex, age
	#
	ansa	<- dr[, list(IN_TRM_CHAIN= length(RID[SELECT2>=1])), by=c('SEX','AGE_AT_MID_C')]
	ansa	<- merge(ansa, dr[, list(LINKED= length(RID[SELECT2>=2])), by=c('SEX','AGE_AT_MID_C')], by=c('SEX','AGE_AT_MID_C'))
	ansa	<- merge(ansa, dr[, list(LINKED_DIR= length(RID[SELECT2>=3])), by=c('SEX','AGE_AT_MID_C')], by=c('SEX','AGE_AT_MID_C'))		
	tmp		<- desa[, list(	ELIGIBLE_1516= sum(PART_EVER+PART_NEVER), 
							PART_1516= sum(PART_EVER),
							HIV_1516= sum(HIV_1516_YES),
							ARTNAIVE= sum(HIV_1516_YES-SLART_AT_FIRST_VISIT),
							DEEP_SEQ_1516= sum(DEEP_SEQ_1516)
					), by=c('SEX','AGE_AT_MID_C')]
	ansa	<- merge(tmp, ansa, by=c('SEX','AGE_AT_MID_C'))
	# add proportions to ansa
	ansa	<- melt(ansa, id.vars=c('SEX','AGE_AT_MID_C'))
	tmp		<- ansa[, list(SEX=SEX, AGE_AT_MID_C=AGE_AT_MID_C, prop=value/sum(value), LEGEND=paste(value,' (',100*round(value/sum(value),d=2),'%)',sep='')), by=c('variable')]
	ansa	<- merge(ansa, tmp, by=c('variable','SEX','AGE_AT_MID_C'))
	# add total to ansa
	tmp		<- ansa[, list(SEX='Any', AGE_AT_MID_C='Any', value=sum(value), LEGEND=as.character(sum(value))), by=c('variable')]
	tmp2	<- ansa[, list(AGE_AT_MID_C='Any', value=sum(value), LEGEND=as.character(sum(value))), by=c('variable','SEX')]
	ansa	<- rbind(ansa,tmp,tmp2, fill=TRUE)
	ansa[, LABEL:= paste(SEX, AGE_AT_MID_C, sep='-')]
	ansa	<- dcast.data.table(ansa, LABEL+SEX+AGE_AT_MID_C~variable, value.var='LEGEND')
	set(ansa, NULL, 'LABEL', ansa[, factor(LABEL, levels=c("Any-Any","F-Any","F-15-24","F-25-34","F-35+","M-Any","M-15-24","M-25-34","M-35+"))])
	setkey(ansa, LABEL)
	write.csv(ansa, row.names=FALSE, file=paste0(outfile.base, '_table1_nocommtype.csv'))	
}


RakaiFull.phylogeography.171122.recpients.on.map<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(ggmap)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	require(gtools)	#rdirichlet
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl35_prior23_min30_withmetadata.rda"
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_prior23_min30_withmetadata.rda"
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_withmetadata.rda"
	outfile.base			<- gsub('_withmetadata.rda','',infile)
	
	style	<- "feature:road|color:0x17202A&style=feature:water|color:0x677996&style=feature:landscape.natural|color:0xedecda&style=feature:administrative|visibility=off"
	zm		<- get_googlemap(c(lon=31.65, lat=-0.66), scale=2, size=c(550,550), zoom=10, maptype="road", style=style)
	zc		<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv', stringsAsFactors=FALSE))
	load(infile)	
	setkey(rtp, MALE_RID, FEMALE_RID)
	rtp[, PAIRID:= seq_len(nrow(rtp))]
	rtpdm	<- subset(rtp, grepl('mf|fm',SELECT))
	set(rtpdm, NULL, 'MALE_COMM_TYPE', rtpdm[, factor(MALE_COMM_TYPE=='fisherfolk',levels=c(TRUE,FALSE),labels=c('fishing site','inland community') )])
	set(rtpdm, NULL, 'FEMALE_COMM_TYPE', rtpdm[, factor(FEMALE_COMM_TYPE=='fisherfolk',levels=c(TRUE,FALSE),labels=c('fishing site','inland community') )])
	nrow(rtpdm)
	#	284	with 0.035
	#	238	with 0.025
	
	#	locations with number sequence samples
	ds	<- subset(rsm, MIN_PNG_OUTPUT>0, select=c(COMM_NUM, COMM_TYPE, COMM_NUM_A, ELIGIBLE_AVG, PARTICIPATED_AVG, HIV, MIN_PNG_OUTPUT))
	set(ds, NULL, 'COMM_TYPE', ds[, factor(COMM_TYPE=='fisherfolk',levels=c(TRUE,FALSE),labels=c('fishing site','inland community') )])
	ds[, P_PART_EMP:= PARTICIPATED_AVG/ELIGIBLE_AVG]
	ds[, P_SEQ_EMP:= MIN_PNG_OUTPUT/HIV]
	ds[, P_SEQCOV:= P_PART_EMP*P_SEQ_EMP]
	ds[, P_SEQCOV_C:= cut(P_SEQCOV, breaks=seq(0,1,0.1), labels=c('<10%','10-19%','20-29%','30-39%','40-49%','50-59%','60-69%','70-79%','80-89%','90%-100%'))]
	tmp	<- unique(subset(zc, select=c(COMM_NUM, longitude, latitude)), by='COMM_NUM')
	ds	<- merge(ds, tmp, by='COMM_NUM')
	
	#	show location of transmitters, recipients
	rtpdm[, AGEDIFF:= rtpdm[, FEMALE_BIRTHDATE-MALE_BIRTHDATE]]	
	set(rtpdm, NULL, 'MALE_SEX', 'M')
	set(rtpdm, NULL, 'FEMALE_SEX', 'F')	
	rmf		<- subset(rtpdm, grepl('mf',SELECT) )
	rfm		<- subset(rtpdm, grepl('fm',SELECT) )
	rtr2	<- copy(rmf)
	setnames(rtr2,colnames(rtr2),gsub('FEMALE','REC',colnames(rtr2)))
	setnames(rtr2,colnames(rtr2),gsub('MALE','TR',colnames(rtr2)))
	tmp		<- copy(rfm)
	setnames(tmp,colnames(tmp),gsub('FEMALE','TR',colnames(tmp)))
	setnames(tmp,colnames(tmp),gsub('MALE','REC',colnames(tmp)))
	rtr2	<- rbind(rtr2,tmp)	
	dc.tr	<- rtr2[, list(TR_OBS=length(PAIRID)), by=c('TR_COMM_NUM_A')]
	setnames(dc.tr, 'TR_COMM_NUM_A', 'COMM_NUM_A')
	dc.tr	<- merge(dc.tr, ds, by='COMM_NUM_A')
	dc.tr[, TR_OBS_C:= cut(TR_OBS, breaks=c(0,1,2,5,10,20,50,100))]
	dc.rec	<- rtr2[, list(REC_OBS=length(PAIRID)), by=c('REC_COMM_NUM_A')]
	setnames(dc.rec, 'REC_COMM_NUM_A', 'COMM_NUM_A')
	dc.rec	<- merge(dc.rec, ds, by='COMM_NUM_A')
	dc.rec[, REC_OBS_C:= cut(REC_OBS, breaks=c(0,1,2,5,10,20,50,100))]
	
	rtr2[, table(TR_COMM_TYPE, REC_COMM_TYPE)]
	#                  REC_COMM_TYPE
	#TR_COMM_TYPE       fishing site inland community
  	#fishing site              170                9
  	#inland community           14              100
	#
	#	plot number of observed recipients and observed transmitters
	ggmap(zm) +
			geom_point(data=dc.rec, aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=REC_OBS_C, size=REC_OBS), alpha=1) +
			geom_text(data=dc.rec, aes(x=longitude, y=latitude, label=REC_OBS), nudge_x=0, nudge_y=0, size=4, colour='black') +			
			scale_colour_brewer(palette="YlOrRd") +
			scale_shape_manual(values=c('fishing site'=18, 'inland community'=19)) +
			scale_size(breaks=c(1,2,5,10,20,50), range=1.3*c(5,15))+						
			theme(legend.position='bottom', legend.box = "vertical") +
			guides(size='none', colour='none', pch=guide_legend(override.aes=list(size=7))) +
			labs(	x='', y='', pch="community\ntype", colour="phylogenetically observed recipients")
	ggsave(file=paste0(outfile.base,'_comm_recipientnumbers_on_map.pdf'), w=7, h=7)
	ggmap(zm) +
			geom_point(data=dc.tr, aes(x=longitude, y=latitude, pch=COMM_TYPE, colour=TR_OBS_C, size=TR_OBS), alpha=1) +
			geom_text(data=dc.tr, aes(x=longitude, y=latitude, label=TR_OBS), nudge_x=0, nudge_y=0, size=4, colour='black') +			
			scale_colour_brewer(palette="YlOrRd") +
			scale_size(breaks=c(1,2,5,10,20,50), range=1.3*c(5,15))+						
			theme(legend.position='bottom', legend.box = "vertical") +
			guides(size='none', colour='none', pch=guide_legend(override.aes=list(size=7))) +
			labs(	x='', y='', pch="community\ntype", colour="phylogenetically observed transmitters")
	ggsave(file=paste0(outfile.base,'_comm_transmitternumbers_on_map.pdf'), w=7, h=7)

	
	subset(rtr2, TR_INMIGRANT==1)	
}

RakaiFull.phylogeography.180618.figure.flows.on.map<- function()
{
	require(data.table)
	require(scales)
	require(ggplot2)
	require(ggmap)
	require(grid)
	require(gridExtra)
	require(RColorBrewer)
	require(Hmisc)
	require(gtools)	#rdirichlet
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl35_prior23_min30_withmetadata.rda"
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_prior23_min30_withmetadata.rda"
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_171122_cl25_d50_prior23_min30_withmetadata.rda"
	
	infile					<- "~/Dropbox (SPH Imperial College)/Rakai Fish Analysis/full_run/todi_pairs_180522_cl25_d50_prior23_min30_phylogeography_data_with_inmigrants.rda"		
	outfile.base			<- gsub('_data_with_inmigrants.rda','',infile)
	load(infile)
	setnames(rtr2, c('TR_INMIGRATE_2YR','REC_INMIGRATE_2YR'), c('TR_INMIGRATE','REC_INMIGRATE'))
	
	#	define location from where inmigrants arrived 
	rtr3	<- subset(rtr2, grepl('inmigrant',TR_INMIGRATE) & !grepl('external',TR_INMIGRATE))
	

	#	update inmigrant with new resolved locations in inmigrant2
	infile		<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/RakaiPangeaMetaData_v2.rda"
	load(infile)
	inmigrant	<- as.data.table(inmigrant)
	infile		<- "~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/migrants_withMissingGPS.csv"
	inmigrant2	<- as.data.table(read.csv(infile))
	inmigrant	<- merge(inmigrant, subset(inmigrant2, select=c('RCCS_studyid','visit','X')), by=c('RCCS_studyid','visit'), all.x=TRUE)
	tmp			<- subset(inmigrant, !is.na(X), c(RCCS_studyid, visit, date, inmigrant))
	inmigrant2	<- merge(inmigrant2, tmp, by=c('RCCS_studyid','visit'))
	inmigrant	<- subset(inmigrant, is.na(X))
	set(inmigrant, NULL, 'X', NULL)
	set(inmigrant2, NULL, c('X','inmig_place_original','recode_city'), NULL)
	inmigrant	<- rbind(inmigrant, inmigrant2)
	#	prepare inmigrant -- identify inmigrants from fishing communities and from external
	set(inmigrant, NULL, 'inmig_place', inmigrant[, gsub('DDIMO|DDIMU|DIMO|DIMU','DIMU',inmig_place)])
	set(inmigrant, inmigrant[, which(grepl('MALEMBO',inmig_place))], 'inmig_place', 'MALEMBO')
	set(inmigrant, NULL, 'inmig_place', inmigrant[, gsub("KASEMSERO","KASENSERO",inmig_place)])
	set(inmigrant, inmigrant[, which(grepl('KASENSERO',inmig_place))], 'inmig_place', 'KASENSERO')
	set(inmigrant, NULL, 'date', inmigrant[, hivc.db.Date2numeric(date)])	
	#	define from_fishing and from_outside and from_inland
	inmigrant[, INMIG_LOC:= 'inland' ]
	set(inmigrant, inmigrant[, which(grepl('MALEMBO|DIMU|KASENSERO|NAMIREMBE',inmig_place))], 'INMIG_LOC','fisherfolk')
	set(inmigrant, inmigrant[, which(inmig_admin0!='Uganda')], 'INMIG_LOC','external')
	set(inmigrant, inmigrant[, which(inmig_admin1!='Rakai')], 'INMIG_LOC','external')
	set(inmigrant, inmigrant[, which(is.na(inmig_admin1))], 'INMIG_LOC','unknown')	
	setnames(inmigrant, c('RCCS_studyid','date'), c('TR_RID','TR_VISIT_DATE'))	
	rtr3	<- merge(unique(subset(rtr3, select=c(TR_RID, REC_RID, VISIT_FIRSTCONCPOS, DATE_FIRSTCONCPOS))), inmigrant, by='TR_RID', all.x=TRUE)
	setnames(rtr3, c('inmig_lon','inmig_lat','INMIG_LOC'), c('TR_INMIG_LON','TR_INMIG_LAT','TR_INMIG_LOC'))
	#	find inmigration event of transmitter that is at or before first time both were recorded infected
	rtr3	<- subset(rtr3, DATE_FIRSTCONCPOS >= TR_VISIT_DATE)
	tmp		<- rtr3[,  list(TR_VISIT_DATE=TR_VISIT_DATE[which.min(DATE_FIRSTCONCPOS-TR_VISIT_DATE)]), by=c('TR_RID','REC_RID','DATE_FIRSTCONCPOS')]
	rtr3	<- merge(rtr3,tmp,by=c('TR_RID','REC_RID','DATE_FIRSTCONCPOS','TR_VISIT_DATE'))	
	rtr3	<- subset(rtr3, select=c(TR_RID, REC_RID, VISIT_FIRSTCONCPOS, TR_INMIG_LON, TR_INMIG_LAT, TR_INMIG_LOC))
	#	add to rtr2
	rtr2	<- merge(rtr2,rtr3,by=c('TR_RID','REC_RID','VISIT_FIRSTCONCPOS'),all.x=TRUE)
	
	#	add coordinates to communities
	tmp		<- unique(subset(ds, select=c(COMM_NUM_A, LONG, LAT)))
	setnames(tmp, c('COMM_NUM_A','LONG','LAT'), c('TR_COMM_NUM_A','TR_X','TR_Y'))
	rtr2	<- merge(rtr2, tmp, by='TR_COMM_NUM_A')
	setnames(tmp, c('TR_COMM_NUM_A','TR_X','TR_Y'), c('REC_COMM_NUM_A','REC_X','REC_Y'))
	rtr2	<- merge(rtr2, tmp, by='REC_COMM_NUM_A')
	
	#
	#	get map
	#
	style	<- "feature:road|color:0x17202A&style=feature:water|color:0x677996&style=feature:landscape.natural|color:0xedecda&style=feature:administrative|visibility=off"
	zm		<- get_googlemap(c(lon=31.65, lat=-0.66), scale=2, size=c(550,550), zoom=10, maptype="road", style=style)
	zc		<- as.data.table(read.csv('~/Dropbox (SPH Imperial College)/Rakai Pangea Meta Data/Data for Fish Analysis Working Group/PANGEA_Rakai_community_anonymized_IDs.csv', stringsAsFactors=FALSE))			
	
	#
	#	to plot flows from migrants, set up fake communities from migrant locations
	#
	#	set up dummy location in corner for inmigration from external 
	#tmp		<- rtr2[, c(min(c(REC_X, TR_X)), max(c(REC_Y, TR_Y)))]
	tmp		<- c(31.3,-0.34)
	tmp2	<- rtr2[, which(grepl('external',TR_INMIGRATE))]
	set(rtr2, tmp2, 'TR_INMIG_LON', tmp[1])
	set(rtr2, tmp2, 'TR_INMIG_LAT', tmp[2])
	#	set up dummy location in corner for inmigration from unknown
	#tmp		<- rtr2[, c(max(c(REC_X, TR_X)), min(c(REC_Y, TR_Y)))]
	tmp		<- c(32.0,-1.0)
	tmp2	<- rtr2[, which(TR_INMIG_LOC=='unknown')]
	set(rtr2, tmp2, 'TR_INMIG_LON', tmp[1])
	set(rtr2, tmp2, 'TR_INMIG_LAT', tmp[2])
	#	define fake community IDs
	tmp		<- rtr2[, which(grepl('external',TR_INMIGRATE))]
	set(rtr2, tmp, 'TR_INMIG_LOC', 'external')	
	tmp		<- unique(subset(rtr2, !grepl('resident',TR_INMIGRATE), c(TR_INMIG_LON, TR_INMIG_LAT, TR_INMIG_LOC)))
	tmp[, TR_COMM_NUM_A_MIG:= paste0(substr(TR_INMIG_LOC,1,1),'mig',seq_len(nrow(tmp)))]
	rtr2	<- merge(rtr2, tmp, by=c('TR_INMIG_LON','TR_INMIG_LAT','TR_INMIG_LOC'), all.x=TRUE)
	
	#
	#	change locations of transmitter if transmitter is inmigrant and flow is not between same areas
	#
	tmp		<- rtr2[, which(TR_INMIGRATE!='resident')]	
	set(rtr2, tmp, 'TR_COMM_NUM_A', rtr2[tmp,TR_COMM_NUM_A_MIG])
	set(rtr2, tmp, 'TR_X', rtr2[tmp,TR_INMIG_LON])
	set(rtr2, tmp, 'TR_Y', rtr2[tmp,TR_INMIG_LAT])
	 	
	
	#
	#	plot number of observed transmission flows
	#
	
	#	overall parameters for plotting
	tr.edge.gap	<- 0
	rec.edge.gap<- 0.06
	node.size	<- 0.2
	edge.size	<- 1
	curvature	<- -0.2
	label.size	<- 5
	arrow		<- arrow(length=unit(0.04, "npc"), type="closed", angle=15)
	arrow2		<- arrow(length=unit(0.02, "npc"), type="closed", angle=15)
	curv.shift	<- 0.08
	#
	#	define flows
	#
	dc		<- rtr2[, list(FLOW=length(PAIRID)), by=c('TR_COMM_NUM_A','TR_INMIGRATE','REC_COMM_NUM_A','TR_X','TR_Y','REC_X','REC_Y')]
	tmp		<- dc[, which(substr(TR_COMM_NUM_A,1,1)=='u')]
	set(dc, tmp, 'TR_INMIGRATE', dc[tmp, gsub('inland|fisherfolk','unknownloc',TR_INMIGRATE)])
	set(dc, dc[, which(TR_INMIGRATE=='resident' & substr(TR_COMM_NUM_A,1,1)=='f')], 'TR_INMIGRATE', 'resident_fish')
	set(dc, dc[, which(TR_INMIGRATE=='resident' & substr(TR_COMM_NUM_A,1,1)!='f')], 'TR_INMIGRATE', 'resident_inland')
	#	count flows
	subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f')[, sum(FLOW)]
	subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(TR_COMM_NUM_A,1,1)!='e' & substr(TR_COMM_NUM_A,1,1)!='u' & substr(REC_COMM_NUM_A,1,1)!='f')[, sum(FLOW)]
	subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)!='f')
	subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(TR_COMM_NUM_A,1,1)!='e' & substr(TR_COMM_NUM_A,1,1)!='u' & substr(REC_COMM_NUM_A,1,1)=='f')
	
	#
	#	define stuff for plotting curved flows
	#	
	#dc[, which(substr(TR_COMM_NUM_A,1,1)=='a' & substr(REC_COMM_NUM_A,1,1)=='a' & TR_INMIGRATE!='resident')]
	set(dc, NULL, 'TR_COMM_NUM_A', dc[, as.character(TR_COMM_NUM_A)])
	set(dc, NULL, 'REC_COMM_NUM_A', dc[, as.character(REC_COMM_NUM_A)])
	dc[, EDGETEXT_X:= (TR_X+REC_X)/2]
	dc[, EDGETEXT_Y:= (TR_Y+REC_Y)/2]
	dc[, EDGE_LABEL:= FLOW ]
	dc[, MX:= (REC_X - TR_X)]	
	dc[, MY:= (REC_Y - TR_Y)]	
	set(dc, NULL, 'TR_X_EDGE', dc[, TR_X + MX*tr.edge.gap])
	set(dc, NULL, 'TR_Y_EDGE', dc[, TR_Y + MY*tr.edge.gap])
	set(dc, NULL, 'REC_X_EDGE', dc[, REC_X - MX*rec.edge.gap])
	set(dc, NULL, 'REC_Y_EDGE', dc[, REC_Y - MY*rec.edge.gap])		
	dc[, TX:= -MY]
	dc[, TY:= MX]
	set(dc, NULL, 'EDGETEXT_X', dc[, EDGETEXT_X + TX*curv.shift])
	set(dc, NULL, 'EDGETEXT_Y', dc[, EDGETEXT_Y + TY*curv.shift])
	#
	#	flows inland -> fishing
	#
	ggmap(zm) +		
			geom_point(	data=unique(subset(dc,  substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0, c(REC_X, REC_Y))), 
						aes(x=REC_X, y=REC_Y), 
						colour='firebrick1', size=4) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0), 
						aes(x=TR_X_EDGE, xend=REC_X_EDGE, y=TR_Y_EDGE, yend=REC_Y_EDGE, colour=TR_INMIGRATE), 
						curvature=curvature, arrow=arrow, lineend="butt", linejoin='mitre') +
			geom_text(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0), 
						aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=EDGE_LABEL, colour=TR_INMIGRATE), 
						size=label.size) +
			coord_cartesian() +
			scale_colour_manual(values=c('resident_inland'="#66BD63", 'inmigrant_from_inland'="#00441B",'inmigrant_from_unknownloc'='black', 'inmigrant_from_external'='deepskyblue1')) +
			#scale_size(breaks=c(1,2,4), range=c(0.6,1.2)) +
			labs(x='', y='') +
			theme(legend.position='bottom', legend.box = "vertical") + 
			guides(size='none', colour='none') 
	ggsave(file=paste0(outfile.base,'_flows_intofishing_on_map.pdf'), w=7, h=7, useDingbats=FALSE)
	#
	#	flows fishing -> inland
	#		
	ggmap(zm) +					
			geom_point(	data=unique(subset(dc, substr(REC_COMM_NUM_A,1,1)!='f' & FLOW>0, c(REC_X, REC_Y))), 
						aes(x=REC_X, y=REC_Y), 
						colour="#00441B", size=4) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)!='f' & FLOW>0), 
						aes(x=TR_X_EDGE, xend=REC_X_EDGE, y=TR_Y_EDGE, yend=REC_Y_EDGE, colour=TR_INMIGRATE), 
						curvature=curvature, arrow=arrow, lineend="butt", linejoin='mitre') +
			geom_text(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f'  & substr(REC_COMM_NUM_A,1,1)!='f' & FLOW>0), 
						aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=EDGE_LABEL, colour=TR_INMIGRATE), 
						size=label.size) +
			coord_cartesian() +
			#scale_size(breaks=c(1,2,4), range=c(0.6,1.2))+
			scale_colour_manual(values=c('resident_fish'="firebrick1", 'inmigrant_from_fisherfolk'="darkred",'inmigrant_from_unknownloc'='black', 'inmigrant_from_external'='deepskyblue1')) +
			labs(x='', y='') +
			theme(legend.position='bottom', legend.box = "vertical") + 
			guides(size='none', colour='none')
	ggsave(file=paste0(outfile.base,'_flows_fromfishing_on_map.pdf'), w=7, h=7, useDingbats=FALSE)
	#
	#	flows fishing -> fishing and inland -> inland
	#
	ggmap(zm) +		
			geom_point(	data=unique(subset(dc,  substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0, c(REC_X, REC_Y))), 
					aes(x=REC_X, y=REC_Y), 
					colour='firebrick1', size=4) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & REC_COMM_NUM_A!=TR_COMM_NUM_A & FLOW>0), 
					aes(x=TR_X_EDGE, xend=REC_X_EDGE, y=TR_Y_EDGE, yend=REC_Y_EDGE, colour=TR_INMIGRATE), 
					curvature=curvature, arrow=arrow, lineend="butt", linejoin='mitre') +			
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & REC_COMM_NUM_A==TR_COMM_NUM_A & FLOW>0), 
					aes(x=TR_X+0.005, xend=REC_X+0.03, y=TR_Y-0.005, yend=REC_Y, colour=TR_INMIGRATE), 
					curvature=1) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & REC_COMM_NUM_A==TR_COMM_NUM_A & FLOW>0), 
					aes(x=TR_X+0.03, xend=REC_X+0.005, y=TR_Y, yend=REC_Y+0.005, colour=TR_INMIGRATE), 
					curvature=1, arrow=arrow2, lineend="butt", linejoin='mitre') +
			geom_text(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0), 
					aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=EDGE_LABEL, colour=TR_INMIGRATE), 
					size=label.size) +
			coord_cartesian() +
			scale_colour_manual(values=c('resident_inland'="#66BD63",'resident_fish'="firebrick1", 'inmigrant_from_inland'="#00441B",'inmigrant_from_fisherfolk'="darkred",'inmigrant_from_unknownloc'='black', 'inmigrant_from_external'='deepskyblue1')) +
			labs(x='', y='') +
			theme(legend.position='bottom', legend.box = "vertical") + 
			guides(size='none', colour='none')
	ggsave(file=paste0(outfile.base,'_flows_fishing_on_map.pdf'), w=7, h=7, useDingbats=FALSE)
	ggmap(zm) +		
			geom_point(	data=unique(subset(dc,  substr(REC_COMM_NUM_A,1,1)!='f' & FLOW>0, c(REC_X, REC_Y))), 
					aes(x=REC_X, y=REC_Y), 
					colour="#00441B", size=4) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(REC_COMM_NUM_A,1,1)!='f' & REC_COMM_NUM_A!=TR_COMM_NUM_A & FLOW>0), 
					aes(x=TR_X_EDGE, xend=REC_X_EDGE, y=TR_Y_EDGE, yend=REC_Y_EDGE, colour=TR_INMIGRATE), 
					curvature=curvature, arrow=arrow, lineend="butt", linejoin='mitre') +			
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(REC_COMM_NUM_A,1,1)!='f' & REC_COMM_NUM_A==TR_COMM_NUM_A & FLOW>0), 
					aes(x=TR_X+0.005, xend=REC_X+0.03, y=TR_Y-0.005, yend=REC_Y, colour=TR_INMIGRATE), 
					curvature=1) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(REC_COMM_NUM_A,1,1)!='f' & REC_COMM_NUM_A==TR_COMM_NUM_A & FLOW>0), 
					aes(x=TR_X+0.03, xend=REC_X+0.005, y=TR_Y, yend=REC_Y+0.005, colour=TR_INMIGRATE), 
					curvature=1, arrow=arrow2, lineend="butt", linejoin='mitre') +
			geom_text(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)!='f' & substr(REC_COMM_NUM_A,1,1)!='f' & FLOW>0), 
					aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=EDGE_LABEL, colour=TR_INMIGRATE), 
					size=label.size) +			
			coord_cartesian() +
			scale_colour_manual(values=c('resident_inland'="#66BD63",'resident_fish'="firebrick1", 'inmigrant_from_inland'="#00441B",'inmigrant_from_fisherfolk'="darkred",'inmigrant_from_unknownloc'='black', 'inmigrant_from_external'='deepskyblue1')) +
			labs(x='', y='') +
			theme(legend.position='bottom', legend.box = "vertical") + 
			guides(size='none', colour='none')
	ggsave(file=paste0(outfile.base,'_flows_inland_on_map.pdf'), w=7, h=7, useDingbats=FALSE)
	ggmap(zm) +		
			geom_point(	data=unique(subset(dc,  substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0, c(REC_X, REC_Y))), 
						aes(x=REC_X, y=REC_Y), 
						colour='firebrick1', size=4) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & REC_COMM_NUM_A!=TR_COMM_NUM_A & FLOW>0), 
						aes(x=TR_X_EDGE, xend=REC_X_EDGE, y=TR_Y_EDGE, yend=REC_Y_EDGE, colour=TR_INMIGRATE), 
						curvature=curvature, arrow=arrow, lineend="butt", linejoin='mitre') +			
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & REC_COMM_NUM_A==TR_COMM_NUM_A & FLOW>0), 
						aes(x=TR_X+0.005, xend=REC_X+0.03, y=TR_Y-0.005, yend=REC_Y, colour=TR_INMIGRATE), 
						curvature=1) +
			geom_curve(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & REC_COMM_NUM_A==TR_COMM_NUM_A & FLOW>0), 
						aes(x=TR_X+0.03, xend=REC_X+0.005, y=TR_Y, yend=REC_Y+0.005, colour=TR_INMIGRATE), 
						curvature=1, arrow=arrow2, lineend="butt", linejoin='mitre') +
			geom_text(	data=subset(dc, substr(TR_COMM_NUM_A,1,1)=='f' & substr(REC_COMM_NUM_A,1,1)=='f' & FLOW>0), 
						aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=EDGE_LABEL, colour=TR_INMIGRATE), 
						size=label.size) +
			geom_point(	data=unique(subset(dc,  substr(REC_COMM_NUM_A,1,1)!='f' & FLOW>0, c(REC_X, REC_Y))), 
						aes(x=REC_X, y=REC_Y), 
						colour="#00441B", size=4) +
			geom_curve(	data=