R/bigphylo.fun.R

Defines functions seq.write.dna.phylip seq.align.based.on.common.reference seq.write.dna.phylip.triangular seq.mvr.d.and.v seq.strip.gap seq.find seq.unique seq.rm.drugresistance seq.rm.drugresistance.internal seq.dist seq.replace seq.rmgaps seq.create.referencepairs seq.write.dna.nexus seq.length seq.proportion.ambiguous seq.gc.content hivc.clu.polyphyletic.clusters hivc.clu.get.tiplabels hivc.clu.get.tiplabels.v1 hivc.clu.plot

Documented in seq.align.based.on.common.reference seq.dist seq.find seq.gc.content seq.length seq.proportion.ambiguous seq.replace seq.rm.drugresistance seq.rmgaps seq.strip.gap seq.unique seq.write.dna.nexus seq.write.dna.phylip seq.write.dna.phylip.triangular

#' @export seq.write.dna.phylip
#' @title Write a square phylip file
seq.write.dna.phylip<- function(seq.DNAbin.mat, file)
{		
	tmp<- cbind( rownames(seq.DNAbin.mat), apply( as.character( seq.DNAbin.mat ), 1, function(x) paste(x,sep='',collapse='')  ) )
	tmp<- paste(t(tmp),collapse='\n',sep='')	
	tmp<- paste( paste(c(nrow(seq.DNAbin.mat),ncol(seq.DNAbin.mat)),sep='',collapse=' '),'\n',tmp,'\n',collapse='',sep='' )
	cat(tmp, file=file)
}

#' @export seq.align.based.on.common.reference
#' @title Merge two alignments with common reference
#' @import plyr data.table ape
#' @description Merge two alignments by adding gaps to both alignments in such a way that the common reference in both alignments are aligned with each other.
#' @param in.s 	first sequence alignment in **matrix** DNAbin format (ape package)
#' @param sq 	second sequence alignment in **matrix** DNAbin format (ape package)
#' @param return.common.sites 	Flag if the merged alignment should be intersection of the input alignment (TRUE, default) or the union (FALSE).
#' @param regexpr.reference		Regular expression to identify the reference taxon in the rownames of the input alignments.
#' @param regexpr.nomatch		Regular expression to identify non-nucleotide characters that should not be matched against each other in the reference. By default, the characters '-' '?' are not mached.
#' @return Merged sequence alignment in matrix DNAbin format (ape package)	 	
seq.align.based.on.common.reference<- function(in.s, sq, return.common.sites=TRUE, regexpr.reference='HXB2', regexpr.nomatch='-|\\?')
{
	stopifnot( class(in.s)=='DNAbin', class(sq)=='DNAbin', is.matrix(in.s), is.matrix(sq))
	chars.nomatch		<- gsub('\\','',strsplit(regexpr.nomatch,'|',fixed=1)[[1]],fixed=1)
	#	find index of references in both alignments
	ref.in	<- which(grepl(regexpr.reference,rownames(in.s)))
	ref.p	<- which(grepl(regexpr.reference,rownames(sq)))	
	stopifnot(length(ref.in)==1, length(ref.p)==1)
	#	
	#	build SU alignment key
	#
	tmp2	<- gsub('?','',gsub('-','',paste(as.vector(as.character(in.s[ref.in,1:50])), collapse='')),fixed=1)		
	tmp2	<- paste(strsplit(tmp2, '')[[1]],'-*',sep='',collapse='')
	#	find start of SU alignment in PANGEA alignment
	startat	<- regexpr(tmp2, gsub('?','-',paste(as.vector(as.character(sq[ref.p,])), collapse=''),fixed=1))
	if(startat==-1)
	{
		#try reverse in.s and sq
		tmp2	<- gsub('?','',gsub('-','',paste(as.vector(as.character(sq[ref.p,1:50])), collapse='')),fixed=1)		
		tmp2	<- paste(strsplit(tmp2, '')[[1]],'-*',sep='',collapse='')
		#	find start of SU alignment in PANGEA alignment
		startat	<- regexpr(tmp2, gsub('?','-',paste(as.vector(as.character(in.s[ref.in,])), collapse=''),fixed=1))
		stopifnot(startat>=0)
		#	now reverse
		tmp		<- ref.in
		ref.in	<- ref.p
		ref.p	<- tmp
		tmp		<- in.s
		in.s	<- sq
		sq		<- tmp
	}	
	#
	#	determine the final sites in SU alignment / PANGEA alignment that are not gaps-only
	#
	in.s2	<- as.character(in.s)	#	need this later too
	in.d	<- data.table(	TAXA= rownames(in.s), 
			FIRST= apply( in.s2, 1, function(x) which(x!='-')[1] ),
			LAST= ncol(in.s)-apply( in.s2, 1, function(x) which(rev(x)!='-')[1] )+1L		)		 
	in.l	<- in.d[, max(LAST)]
	sq.s2	<- as.character(sq)		#	need this later too
	sq.d	<- data.table(	TAXA= rownames(sq), 
			FIRST= apply( sq.s2, 1, function(x) which(x!='-')[1] ),
			LAST= ncol(sq)-apply( sq.s2, 1, function(x) which(rev(x)!='-')[1] )+1L		)		 
	sq.l	<- sq.d[, max(LAST)]	
	#
	#	get stop position of the two references in the other alignment
	#
	#	for dev:
	#	x<- matrix(strsplit('tcaa---------aaattt','')[[1]], nrow=1)
	#	y<- matrix(strsplit('-tcaaaa---a-ttt----','')[[1]], nrow=1)		
	x		<- as.character(sq[ref.p, seq.int(startat, sq.l)])
	y		<- as.character(in.s[ref.in, seq.int(1, in.l)])		#y must be the longer sequence with extra gaps		
	tmp		<- gsub(regexpr.nomatch,'',paste(as.vector(x), collapse=''))
	tmp2	<- gsub(regexpr.nomatch,'',paste(as.vector(y), collapse=''))
	if(nchar(tmp2)<=nchar(tmp))
	{
		#	ref in SU alignment is smaller than ref in PANGEA alignment
		z			<- substr(tmp2, nchar(tmp2)-50, nchar(tmp2))
		z			<- paste(strsplit(z, '')[[1]],'-*',sep='',collapse='')
		z			<- regexpr(z, paste(as.vector(x), collapse=''))
		sq.stopat	<- as.integer( z + attr(z,"match.length") - 1L )
		in.stopat	<- in.l	
	}
	if(nchar(tmp2)>nchar(tmp))
	{
		#	ref in PANGEA alignment is smaller than ref in SU alignment
		z			<- substr(tmp, nchar(tmp)-50, nchar(tmp))
		z			<- paste(strsplit(z, '')[[1]],'-*',sep='',collapse='')
		z			<- regexpr(z, paste(as.vector(y), collapse=''))
		in.stopat	<- as.integer( z + attr(z,"match.length") - 1L )
		sq.stopat	<- sq.l-startat+1L	
	}	
	tmp		<- gsub(regexpr.nomatch,'',paste(as.vector(x[,1:sq.stopat]), collapse=''))
	tmp2	<- gsub(regexpr.nomatch,'',paste(as.vector(y[,1:in.stopat]), collapse=''))
	stopifnot( tmp==tmp2 )
	#
	# 	calculate offsets (ie number of new gap insertions after site i) in SU and PANGEA alignments
	#
	#	dev:
	#	substring(paste(as.vector(x), collapse=''),280,400)
	#	k<- 285
	#	rbind( z[1, ((k-5):(k+20))+offset.in.x[(k-5):(k+20)]], z[2, ((k-5):(k+20))+offset.in.y[(k-5):(k+20)]] )
	#	z[1, ((k-5):(k+20))]	
	z							<- rbind.fill.matrix(x[,1:sq.stopat,drop=FALSE],y[,1:in.stopat,drop=FALSE])
	z[1,which(is.na(z[1,]))]	<- '-'
	z[2,which(is.na(z[2,]))]	<- '-'
	zz							<- unname(z)	
	offset.to.x	<- rep(0, ncol(z))
	offset.to.y	<- rep(0, ncol(z))			
	k			<- seq_len(ncol(z))+offset.to.y
	k			<- k[ k>0 & k<=ncol(z)]
	k			<- which( z[1, seq_along(k)]!=z[2, k] )[1]
	while(!is.na(k) & k<20e3)
	{
		#print(k)		
		done	<- 0
		if( !zz[1,k]%in%chars.nomatch )
		{
			kn		<- k-offset.to.x[min(k,length(offset.to.x))]
			offset.to.x[ seq.int(kn,length(offset.to.x)) ] <- offset.to.x[ seq.int(kn,length(offset.to.x)) ]+1
			if(k==1)
				zz	<- rbind( c('-',zz[1,]), c(zz[2,],'-') )
			if(k>1)
				zz	<- rbind( c(zz[1, 1:(k-1)],'-',zz[1,k:ncol(zz)]), c(zz[2,],'-')	)
			done	<- 1
		}
		if( !done & zz[1,k]%in%chars.nomatch )
		{
			kn		<- k-offset.to.y[min(k,length(offset.to.x))]
			offset.to.y[ seq.int(kn,length(offset.to.x)) ] <- offset.to.y[ seq.int(kn,length(offset.to.y)) ]+1
			if(k==1)
				zz	<- rbind( c(zz[1,],'-'), c('-',zz[2,]) )
			if(k>1)
				zz	<- rbind( c(zz[1,],'-'), c(zz[2, 1:(k-1)],'-',zz[2,k:ncol(zz)])	)							
		}
		k	<- unname(which(zz[1,]!=zz[2,])[1])		
	}
	if(0)
	{
		#	DEV
		# to x add offset.to.x
		k			<- offset.to.x+seq_along(offset.to.x)
		xx			<- matrix('-',ncol=max(k),nrow=nrow(x))
		xx[,k]		<- x[, seq_along(k)]
		# to y add offset.to.y
		k			<- offset.to.y+seq_along(offset.to.y)
		yy			<- matrix('-',ncol=max(k),nrow=nrow(y))
		yy[,k]		<- y[, seq_along(k)]
		rbind(xx, yy)
	}
	#
	#	add gaps to PANGEA and SU alignment 
	#
	in.newcol		<- offset.to.y[seq.int(in.stopat)]+seq_len(in.stopat)
	sq.newcol		<- offset.to.x[seq.int(sq.stopat)]+seq_len(sq.stopat)
	new.ncol		<- max(max(in.newcol),max(sq.newcol))		
	tmp				<- matrix('-', ncol=new.ncol,nrow=nrow(in.s2), dimnames=dimnames(in.s2))
	tmp[,in.newcol]	<- in.s2[, seq_along(in.newcol)]
	in.s2			<- as.DNAbin(tmp)	
	tmp				<- matrix('-', ncol=new.ncol,nrow=nrow(sq.s2), dimnames=dimnames(sq.s2))
	tmp[,sq.newcol]	<- sq.s2[, startat-1L+seq_along(sq.newcol)]
	sq.s2			<- as.DNAbin(tmp)
	#	if merged alignment should contain all sites in PANGEA and SU alignment
	#	need to add remainder now
	if(!return.common.sites & startat>1)
	{
		#	add gap col to in.s2
		tmp			<- as.DNAbin(matrix('-', ncol=startat-1L,nrow=nrow(in.s2), dimnames=dimnames(in.s2)))
		in.s2		<- cbind(tmp, in.s2)
		#	add data to sq.s2
		sq.s2		<- cbind(sq[,1:(startat-1L)], sq.s2)		
	}
	if(!return.common.sites & ncol(sq.s2)<ncol(sq))	
	{
		sq.s2		<- cbind(sq.s2, sq[, seq.int(length(sq.newcol)+1L,ncol(sq))])		
		tmp			<- as.DNAbin( matrix('-', nrow=nrow(in.s2), ncol=ncol(sq.s2)-ncol(in.s2), dimnames=dimnames(in.s2)) )
		in.s2		<- cbind(in.s2, tmp)		
	}
	if(!return.common.sites & ncol(in.s2)<ncol(in.s))	
	{
		in.s2		<- cbind(in.s2, in.s[, seq.int(length(in.newcol)+1L,ncol(in.s))])		
		tmp			<- as.DNAbin( matrix('-', nrow=nrow(sq.s2), ncol=ncol(in.s2)-ncol(sq.s2), dimnames=dimnames(sq.s2)) )
		sq.s2		<- cbind(sq.s2, tmp)		
	}	
	# 	now merge!
	ans				<- rbind(in.s2, sq.s2)
	#	check for duplicates -- should be HXB2	
	tmp				<- which(grepl(regexpr.reference,rownames(ans)))
	stopifnot(length(tmp)==2)
	ans				<- ans[-tmp[2],]	
	ans
}

#' @export seq.write.dna.phylip.triangular
#' @title Write a triangular phylip file
seq.write.dna.phylip.triangular<- function(wd, file=NA)
{
	stopifnot(class(wd)=='matrix')
	stopifnot(!is.na(file))
	tmp	<- paste(sapply(seq_len(nrow(wd)), function(i)
					{
						z	<- sprintf('%.16f',wd[i, 1:i][-i])
						if(length(z))
							z	<- as.vector(rbind(z, rep(1, length(z))))						
						z	<- paste(z, collapse=' ')
						paste(rownames(wd)[i],' ',z,sep='')		
					}), collapse='\n')
	
	cat('\t', paste(nrow(wd), '\n', tmp, sep=''), file=file)	
}

#' @import data.table ape ggplot2
#' @export seq.mvr.d.and.v
seq.mvr.d.and.v<- function(tps, seed=42, v.mult=1.2, complete.distance.matrix=TRUE, name.gd.col='GD', reco.opts=c(dim=750, costp_l1=0, costp_l2=0.001, costq_l1=0, costq_l2=0.001, nthread=1, lrate=0.003, niter=120), outfile=NA, verbose=FALSE)
{	
	require(recosystem)
	#reco.opts	<- c(dim=500, costp_l1=0, costp_l2=0.01, costq_l1=0, costq_l2=0.01, nthread=1, lrate=0.003, niter=40)	
	stopifnot( c('TAXA1','TAXA2','ID1','ID2',name.gd.col,'GD_V')%in%colnames(tps) )
	#	tps				<- subset(tp, REP==1 & GENE=='gag+pol+env')
	tpc		<- subset(tps, select=c('ID1','ID2',name.gd.col))
	setnames(tpc, name.gd.col, 'GD')
	if(complete.distance.matrix)
	{
		#	add upper triangular
		tmp		<- copy(tpc)
		set(tmp, NULL, 'ID1', tpc[, ID2])
		set(tmp, NULL, 'ID2', tpc[, ID1])
		tpc		<- rbind(tpc, tmp)
		#	add zero diagonal
		tmp		<- tpc[, range(ID1)]
		tmp		<- data.table(ID1= seq.int(tmp[1], tmp[2]), ID2= seq.int(tmp[1], tmp[2]), GD=0)
		tpc		<- rbind(tpc, tmp)
		#	setup matrix completion
		if(verbose)
			cat('\nmatrix completion')
		tmp		<- subset(tpc, !is.na(GD))
		tmp		<- data_memory(tmp[,ID1], tmp[,ID2], rating=tmp[,GD], index1=TRUE)
		if(!is.na(seed))
			set.seed(seed)
		r		<- Reco()	
		r$train(tmp, opts=reco.opts)	
		tpc[, GDp:= r$predict(data_memory(tpc[,ID1], tpc[,ID2], index1=TRUE), out_memory())]
		# plot
		if(!is.na(outfile))
		{
			ggplot(subset(tpc, !is.na(GD)), aes(x=GD, y=GDp)) + geom_point(colour='grey80', size=0.5, pch=16) + geom_abline(slope=1, intercept=0)
			ggsave(file=paste(outfile, '_', paste(reco.opts,collapse='_'), '.pdf', sep=''), w=7, h=7)		
		}
		if(verbose)
			cat('\ngenerating completed distance matrix d')
		#	fill in distance matrix
		tpc[, GDf:= GD]
		tmp			<- tpc[, which(is.na(GDf))]
		set(tpc, tmp, 'GDf', tpc[tmp, GDp])
		#	convert to matrix (not necessarily symmetric)
		#tmp		<- dcast.data.table( subset(tpc, ID1<20 & ID2<20, select=c(ID1,ID2,GDf)), ID1~ID2, value.var='GDf' )
		tmp			<- dcast.data.table( subset(tpc, select=c(ID1,ID2,GDf)), ID1~ID2, value.var='GDf' )		
		d			<- as.matrix(tmp[, -1, with=FALSE])
		rownames(d)	<- colnames(d)
		#	make symmetric
		d			<- (d+t(d))/2	
		stopifnot( all(d>=0) )		
	}
	if(!complete.distance.matrix)
	{
		if(verbose)
			cat('\ngenerating incomplete distance matrix d')		
		tmp				<- dcast.data.table(tpc, ID1~ID2, value.var='GD')		
		d				<- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
		d				<- rbind(d, NA_real_)
		colnames(d)[1]	<- setdiff( as.character(tmp[, ID1]), colnames(d) )
		rownames(d)		<- colnames(d)
		diag(d)			<- 0
		#	complete lower triangular from upper triangular and vice versa
		tmp				<- lower.tri(d) & is.na(d)	
		d[tmp]			<- t(d)[tmp]
		tmp				<- upper.tri(d) & is.na(d)
		d[tmp]			<- t(d)[tmp]	
		d[is.nan(d)]	<- NA_real_
	}
	#	some rows/cols may have NAs only -- remove these as the matrix completion problem is ill-specified for these
	tmp			<- subset(subset(tpc, is.na(GD))[, list(GDM=length(GD)), by='ID1'], GDM==nrow(d)-1) #subtract one since diagonal is zero
	tmp			<- setdiff(rownames(d), tmp[, as.character(ID1)] )
	d			<- d[tmp, tmp]
	#	clean up
	tpc			<- r	<- NULL
	gc()
	#
	#	generate variance matrix
	#
	if(verbose)
		cat('\ngenerating variance matrix v')
	tmp				<- dcast.data.table(tps, ID1~ID2, value.var='GD_V')		
	v				<- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
	v				<- rbind(v, NA_real_)
	colnames(v)[1]	<- setdiff( as.character(tmp[, ID1]), colnames(v) )
	rownames(v)		<- colnames(v)
	diag(v)			<- 0
	#	complete lower triangular from upper triangular and vice versa
	tmp				<- lower.tri(v) & is.na(v)	
	v[tmp]			<- t(v)[tmp]
	tmp				<- upper.tri(v) & is.na(v)
	v[tmp]			<- t(v)[tmp]
	#	set missing variances to large default
	v[is.na(v)]		<- max(v, na.rm=TRUE)*v.mult
	v				<- v[rownames(d),colnames(d)]	
	stopifnot( all(v>=0) )
	#
	#	reset names
	#
	if(verbose)
		cat('\nsetting taxon names')
	tmp				<- subset( tps, select=c(TAXA1, ID1) )
	setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
	tmp				<- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
	setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )		
	tmp				<- merge(tmp, data.table(ID=as.integer(rownames(d))), by='ID')
	setkey(tmp, ID)
	rownames(d)		<- tmp[, TAXA]
	colnames(d)		<- tmp[, TAXA]		
	rownames(v)		<- tmp[, TAXA]
	colnames(v)		<- tmp[, TAXA]
	#				
	#	return
	#
	list(d=as.dist(d), v=as.dist(v))
}

#' @export seq.strip.gap
#' @title Strip gaps
seq.strip.gap<- function(seq.DNAbin.mat, strip.max.len=NA, strip.pc=NA, gap.chars='-')
{
	stopifnot(	!is.na(strip.max.len)&is.na(strip.pc) | is.na(strip.max.len)&!is.na(strip.pc), is.na(strip.pc)|strip.pc<1, is.na(strip.pc)|strip.pc>0, is.na(strip.max.len)|strip.max.len>0)	
	seq.DNAbin.mat	<- as.character(seq.DNAbin.mat)
	if(length(gap.chars)==1)
		tmp			<- apply(seq.DNAbin.mat,2,function(x) length(which(x==gap.chars)))
	if(length(gap.chars)>1)
		tmp			<- apply(seq.DNAbin.mat,2,function(x) length(which(x%in%gap.chars)))
	tmp				<- tmp/nrow(seq.DNAbin.mat)
	if(!is.na(strip.pc))
		return(as.DNAbin(seq.DNAbin.mat[,tmp<strip.pc,drop=FALSE]))
	if(!is.na(strip.max.len))
		for(strip.pc in rev(seq(0.01,1.00,0.01)))
		{
			z		<- which(tmp<strip.pc)
			if(length(z)<=strip.max.len)
			{
				cat('\nStripping gaps at pc=',strip.pc)
				return(as.DNAbin(seq.DNAbin.mat[,z,drop=FALSE]))
			}
		}
}

#' @export seq.find
#' @title Find the row numbers in a sequence matrix with a particular sequence pattern
seq.find<- function(char.matrix, pos0= NA, from= c(), verbose=1)
{
	if(is.na(pos0)) 	stop("start position of token to be replaced is missing")
	if(!length(from))	stop("token to be replaced is missing")
	query.colidx	<- seq.int(pos0,pos0+length(from)-1)
	query.yes		<- which( apply(char.matrix, 1, function(x)	all(x[query.colidx]==from) ) )
	query.yes	
}

#' @export seq.unique 
#' @title Compute a subset of unique sequences
seq.unique<- function(seq.DNAbin.matrix)
{
	x<- as.character(seq.DNAbin.matrix)
	x<- apply(x, 1, function(z) paste(z,collapse=''))
	seq.DNAbin.matrix[!duplicated(x),]			
}

#' @export seq.blast
#' @title Perform online BLAST search
#' @description see also package 'RFLPtools'
seq.blast<- function (x, database = "nr", hitListSize = "10", filter = "L", expect = "10", program = "blastn", organism= "HIV-1") 
{
	baseUrl <- "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"
	query <- paste("QUERY=", as.character(x), "&DATABASE=", database, "&ORGANISM=", organism,
			"&HITLIST_SIZE=", hitListSize, "&FILTER=", filter, "&EXPECT=", 
			expect, "&PROGRAM=", program, sep = "")
	url0 <- sprintf("%s?%s&CMD=Put", baseUrl, query)
	results <- tempfile()
	Sys.sleep(5)
	require(XML)
	post <- htmlTreeParse(url0, useInternalNodes = TRUE)
	x <- post[["string(//comment()[contains(., \"QBlastInfoBegin\")])"]]
	rid <- sub(".*RID = ([[:alnum:]]+).*", "\\1", x)
	rtoe <- as.integer(sub(".*RTOE = ([[:digit:]]+).*", "\\1", 
					x))
	url1 <- sprintf("%s?RID=%s&FORMAT_TYPE=XML&CMD=Get", baseUrl, 
			rid)
	Sys.sleep(rtoe)
	result <- annotate:::.tryParseResult(url1)
	qseq <- xpathApply(result, "//Hsp_qseq", xmlValue)
	hseq <- xpathApply(result, "//Hsp_hseq", xmlValue)
	require(Biostrings)
	res <- list()
	for (i in seq_len(length(qseq))) {
		res[i] <- DNAMultipleAlignment(c(hseq[[i]], qseq[[i]]), 
				rowmask = as(IRanges(), "NormalIRanges"), colmask = as(IRanges(), 
						"NormalIRanges"))
	}
	res
}

#' @export seq.blast.read
#' @title Read BLAST output
#' @description slight modification of read.blast() in pkg RFLPtools; expects blast was run with -outfmt 6
seq.blast.read<- function (file, sep = "\t") 
{
	require(data.table)
	x <- read.table(file = file, header = FALSE, sep = sep, quote = "\"", dec = ".", fill = TRUE, comment.char = "", stringsAsFactors = FALSE)
	if (ncol(x) != 12) 
		stop("Data in given file", basename(file), "has wrong dimension!")
	names(x) <- c("query.id", "subject.id", "identity", "alignment.length","mismatches", "gap.opens", "q.start", "q.end", "s.start","s.end", "evalue", "bit.score")
	data.table(x, key="query.id")
}

#' @export seq.rm.drugresistance
#' @title Remove drug resistance mutations
seq.rm.drugresistance<- function(seq, outfile=NA)
{
	require(data.table)
	stopifnot( any(rownames(seq)=='HXB2') )		#	expect HXB2 as reference in alignment
	load(system.file(package="big.phylo", 'AC_drugresistance_201508.rda'))	
	load(system.file(package="big.phylo", 'refseq_hiv1_hxb2.rda'))
	hxb2			<- paste(hxb2[, HXB2.K03455], collapse='')
	seq.hxb2		<- paste(as.character(seq['HXB2',]),collapse='')
	#	check that HXB2 in alignment is HXB2
	seq.hxb2.ng		<- gsub('-+','',seq.hxb2)
	stopifnot(nchar(seq.hxb2.ng)<=nchar(hxb2))					#	expect HXB2 in alignment is part of true HXB2
	seq.hxb2.st		<- as.integer(regexpr(substr(seq.hxb2.ng,1,20),hxb2))
	stopifnot(seq.hxb2.st>0)									#	expect HXB2 in alignment is part of true HXB2
	stopifnot(nchar(seq.hxb2.ng)+seq.hxb2.st-1L<=nchar(hxb2))	#	expect HXB2 in alignment is part of true HXB2	
	stopifnot( seq.hxb2.ng==substr(hxb2, seq.hxb2.st, nchar(seq.hxb2.ng)+seq.hxb2.st-1L) )
	#	find coordinates of real HXB2 in alignment
	seq.hxb2.pos	<- c()
	tmp				<- gregexpr('-+',seq.hxb2)[[1]]
	if(tmp[1]>-1)
	{
		seq.hxb2.pos<- data.table(GP_ST= as.integer(tmp), GP_LEN= attr(tmp,'match.length'))
		seq.hxb2.pos<- seq.hxb2.pos[, list(GP_POS= seq.int(GP_ST, length=GP_LEN)), by='GP_ST'][, GP_POS]		
	}
	seq.hxb2.pos 	<- data.table(HXB2INSEQ_POS= setdiff(seq_len(nchar(seq.hxb2)), seq.hxb2.pos))
	seq.hxb2.pos[, HXB2_POS:= seq_len(nrow(seq.hxb2.pos))]
	set(seq.hxb2.pos, NULL, 'HXB2_POS', seq.hxb2.pos[, HXB2_POS+seq.hxb2.st-1L])
	stopifnot( seq.hxb2.pos[, tail(HXB2_POS,1)]<=nchar(hxb2) )
	#	merge with dr
	setnames(dr, 'HXB2.pos', 'HXB2_POS')
	dr				<- merge(dr, seq.hxb2.pos, by='HXB2_POS')
	setnames(dr, 'HXB2INSEQ_POS', 'Alignment.nuc.pos')
	tmp			<- seq.rm.drugresistance.internal(as.character(seq), dr, verbose=1, rtn.DNAbin=1 )
	dr.info		<- tmp$nodr.info
	nodr.seq	<- tmp$nodr.seq
	#
	#	save
	#	
	if(!is.na(outfile))
		save(seq, nodr.seq, dr.info, file=outfile)
	list(nodr.info=dr.info, nodr.seq=nodr.seq)	
}

seq.rm.drugresistance.internal<- function(char.matrix, dr, verbose=1, rtn.DNAbin=1)
{
	if(verbose)	cat(paste("\nchecking for potential drug resistance mutations, n=",nrow(dr)))
	nodr.info	<- dr[, {
							query.yes	<- seq.find(char.matrix, Alignment.nuc.pos, unlist(strsplit(unlist(Mutant.NTs),'')))
							if(length(query.yes))
							{
								ans		<- rownames(char.matrix)[query.yes]
								#print( char.matrix[query.yes, seq.int(dr[i,Alignment.nuc.pos]-3, length.out=9)] ); stop()
							}	
							if(!length(query.yes))
								ans		<- NA_character_				
							list(TAXA=ans)
						}, by=c('HXB2_POS','DR.name','Gene.codon.number','Wild.type','Mutant.NTs','Alignment.nuc.pos')]
	nodr.info	<- subset(nodr.info, !is.na(TAXA))			
	for(i in nodr.info[, sort(unique(Alignment.nuc.pos))])
	{
		tmp		<- subset(nodr.info, Alignment.nuc.pos==i)[, TAXA]
		stopifnot(length(tmp)>0)
		cat(paste('\nsetting at pos',i,'to nnn for taxa, n=', length(tmp)))
		char.matrix[tmp,	seq.int(i, length.out=3) ]<- matrix("n", nrow=length(tmp), ncol=3)	
	}
	if(rtn.DNAbin)
		char.matrix	<- as.DNAbin(char.matrix)	
	list(nodr.seq=char.matrix, nodr.info=nodr.info)	
}	

#' @export seq.dist
#' @title Compute pairwise distance matrix
seq.dist<- function(seq.DNAbin.matrix, verbose=1)
{
	if(0)
	{
		require(ape)
		ans<- dist.dna(seq.DNAbin.matrix, model="raw", as.matrix=1)
	}
	if(1)
	{
		library(bigmemory)
		options(bigmemory.typecast.warning=FALSE)
		big.matrix.charmax<- 127
		dummy	<- 0
		ans<- big.matrix(nrow(seq.DNAbin.matrix),nrow(seq.DNAbin.matrix), dimnames=list(rownames(seq.DNAbin.matrix),c()), type="char", init=NA)
		if(nrow(seq.DNAbin.matrix)>5)
		{
			ans[1,1:6]<- 2^seq.int(6,11)-1
			if(is.na(ans[1,2]))		stop("unexpected behaviour of bigmemory")
			ans[1,1:6]<- NA
		}						
		for(i1 in seq.int(1,nrow(seq.DNAbin.matrix)-1))
		{			
			seq1<- seq.DNAbin.matrix[i1,]
			time<- system.time	(
					tmp	<- 1 - sapply(seq.int(i1+1,nrow(seq.DNAbin.matrix)),function(i2){		.C("hivc_dist_ambiguous_dna", seq1, seq.DNAbin.matrix[i2,], ncol(seq1), dummy )[[4]]			})
			)[3]
			if(verbose)	cat(paste("\ncompute distance of row",i1,"entries",nrow(seq.DNAbin.matrix)-i1,"took",time))			
			tmp												<- round(tmp*1e3,d=0)			
			tmp[tmp>big.matrix.charmax]						<- big.matrix.charmax
			ans[i1, seq.int(i1+1,nrow(seq.DNAbin.matrix))]	<- tmp
		}		
	}
	ans
}

#' @export seq.replace
#' @title Replace nucleotide codes in sequence alignment
seq.replace<- function(seq.DNAbin.matrix, code.from='?', code.to='n', verbose=0)
{
	seq.DNAbin.matrix	<- as.character(seq.DNAbin.matrix)		
	seq.DNAbin.matrix	<- apply(seq.DNAbin.matrix, 2, function(col) 		gsub(code.from,code.to,col,fixed=1)			)	
	as.DNAbin( seq.DNAbin.matrix )
}

#' @export seq.rmgaps
#' @title Remove gaps in sequence alignment
seq.rmgaps<- function(seq.DNAbin.matrix, rm.only.col.gaps=1, verbose=0)
{
	seq.DNAbin.matrix		<- as.character(seq.DNAbin.matrix)		
	if(!rm.only.col.gaps)
	{	
		if(is.matrix(seq.DNAbin.matrix))
		{
			tmp					<- lapply(seq_len(nrow(seq.DNAbin.matrix)), function(i){	seq.DNAbin.matrix[i, seq.DNAbin.matrix[i,]!="-" & seq.DNAbin.matrix[i,]!="?"]	})
			names(tmp)			<- rownames(seq.DNAbin.matrix)
		}
		else
		{
			tmp					<- lapply(seq_along(seq.DNAbin.matrix), function(i){	seq.DNAbin.matrix[[i]][ seq.DNAbin.matrix[[i]]!="-" & seq.DNAbin.matrix[[i]]!="?"]	})
			names(tmp)			<- names(seq.DNAbin.matrix)
		}		
		seq.DNAbin.matrix	<- tmp
	}
	else
	{		
		nogap				<- which( !apply(seq.DNAbin.matrix,2,function(x) all(x=="-" | x=="?")) )
		if(verbose)	cat(paste("\nremove gaps, n=",ncol(seq.DNAbin.matrix)-length(nogap)))
		seq.DNAbin.matrix	<- seq.DNAbin.matrix[,nogap]	
	}
	as.DNAbin( seq.DNAbin.matrix )
}

#' @export seq.read.GenBank
#' @title Read sequences from GenBank
seq.read.GenBank<- function (access.nb, seq.names = access.nb, species.names = TRUE, gene.names = FALSE, as.character = FALSE, attributes= c("origin")) 
{
	require(ape)
	N <- length(access.nb)
	nrequest <- N%/%400 + as.logical(N%%400)
	X <- character(0)
	for (i in 1:nrequest) {
		a <- (i - 1) * 400 + 1
		b <- 400 * i
		if (i == nrequest) 
			b <- N
		URL <- paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=", 
				paste(access.nb[a:b], collapse = ","), "&rettype=gb&retmode=text", 
				sep = "")
		X <- c(X, scan(file = URL, what = "", sep = "\n", quiet = TRUE))
	}
	cat(X)
	FI <- grep("^ {0,}ORIGIN", X) + 1
	LA <- which(X == "//") - 1
	obj <- vector("list", N)
	for (i in 1:N) {
		tmp <- gsub("[[:digit:] ]", "", X[FI[i]:LA[i]])
		obj[[i]] <- unlist(strsplit(tmp, NULL))
	}
	names(obj) <- seq.names
	if (!as.character) 
		obj <- as.DNAbin(obj)
	if(length(attributes)) 
	{
		attr.lines	<- lapply(attributes, function(attr) grep(paste("^ {0,}/",attr,sep=''), X) 	)
		attr.fields	<- lapply(seq_along(attr.lines),function(i)		gsub("\"$","",gsub(paste("^ {0,}/",attributes[i],"=\"",sep=''),"",X[attr.lines[[i]]]))		)						
		for(i in seq_along(attributes))
			attr(obj, attributes[[i]])<- attr.fields[[i]]
	}
	if (gene.names) {
		tmp <- character(N)
		sp <- grep(" +gene +<", X)
		for (i in 1:N) tmp[i] <- unlist(strsplit(X[sp[i + 1L]], 
							" +/gene=\""))[2]
		attr(obj, "gene") <- gsub("\"$", "", tmp)
	}
	obj
}

#' @export seq.create.referencepairs
seq.create.referencepairs<- function(dir.name= DATA)
{
	if(0)	#generate ATHENA_2013_hptn052.rda
	{
		gban				<- c( paste("JN",seq.int(247047,247075),sep=''),paste("JN",seq.int(634296,634492),sep='') )	
		tmp					<- hivc.read.GenBank(gban, as.character=0, attributes= c("isolate","country","collection_date"))		
		file				<- "ATHENA_2013_hptn052.fa"
		write.dna(tmp, paste(DATA,file,sep='/'), format = "fasta" )
		cmd					<- hiv.cmd.clustalo(paste(dir.name,"tmp",sep='/'), file, signat='', outdir=paste(dir.name,"tmp",sep='/'))
		cat(cmd)
		if(0) system(cmd)		
		file				<- paste(DATA,"tmp/ATHENA_2013_hptn052.fa.clustalo",sep='/')
		hptn052				<- read.dna(file, format = "fasta" )		
		rownames(hptn052)	<- attr(tmp,"isolate")		
		file				<- paste(DATA,"tmp/ATHENA_2013_hptn052.rda",sep='/')
		save(hptn052, file= file)
	}
	file<- paste(DATA,"tmp/ATHENA_2013_hptn052.rda",sep='/')		
	load(file)
	#select control sequences and compute pairwise distances between them
	hptn052.control		<- grep("control", rownames(hptn052))	
	hptn052.control.d	<- hivc.pwdist( hptn052[hptn052.control,] )
	hptn052.control.d	<- hptn052.control.d[ upper.tri(hptn052.control.d) ]
	#select positive sequences and compute pairwise distance between them	
	hptn052.sdc			<- c(grep("A", rownames(hptn052)), grep("B", rownames(hptn052)))
	hptn052.sdc			<- hptn052[hptn052.sdc,]
	hptn052.sdc.ind		<- sapply(strsplit(rownames(hptn052.sdc),'-'), function(x)
			{ 
				as.numeric(x[2]) + ifelse(x[3]=='I',0,0.5)
			})
	hptn052.sdc.ind		<- sapply( unique( hptn052.sdc.ind ), function(x){		hptn052.sdc[hptn052.sdc.ind==x,]	})
	hptn052.sdc.ind		<- lapply( which( sapply(hptn052.sdc.ind,nrow)==2 ), function(i)  hptn052.sdc.ind[[i]] )
	hptn052.sdc.ind.d	<- sapply( hptn052.sdc.ind, function(x) hivc.pwdist(x)[1,2] )
	
	list( gd.islink= hptn052.sdc.ind.d, gd.unlinked= hptn052.control.d )
}

#' @export seq.write.dna.nexus
#' @title Write a NEXUS file
seq.write.dna.nexus<- function(seq.DNAbin.mat, ph=NULL, file=NULL, nexus.format="DNA",nexus.gap='-', nexus.missing='?', nexus.interleave="NO")
{		
	tmp		<- cbind( rownames(seq.DNAbin.mat), apply( as.character( seq.DNAbin.mat ), 1, function(x) paste(x,sep='',collapse='')  ) )
	tmp		<- apply(tmp, 1, function(x) paste(x, collapse='\t', sep=''))
	tmp		<- paste(tmp, collapse='\n',sep='')
	header	<- paste( "#NEXUS\nBEGIN DATA;\nDIMENSIONS NTAX=",nrow(seq.DNAbin.mat)," NCHAR=",ncol(seq.DNAbin.mat),";\nFORMAT DATATYPE=",nexus.format," MISSING=",nexus.missing," GAP=",nexus.gap," INTERLEAVE=",nexus.interleave,";\nMATRIX\n", collapse='',sep='')
	tmp		<- paste(header, tmp, "\n;\nEND;\n", sep='')
	if(!is.null(ph))
	{
		tmp	<- paste(tmp,"#BEGIN TAXA;\nTAXLABELS ", paste(ph$tip.label, sep='',collapse=' '), ';\nEND;\n\n',sep='')
		tmp	<- paste(tmp, "BEGIN TREES;\nTREE tree1 = ", write.tree(ph), "\nEND;\n", sep='')
	}
	if(!is.null(file))
		cat(tmp, file=file)
	tmp
}		

#' @export seq.length
#' @title Compute the length of sequences
seq.length<- function(seq.DNAbin.mat, exclude=c('-','?'))
{
	counts	<- sapply(seq_len(nrow(seq.DNAbin.mat)), function(i) base.freq(seq.DNAbin.mat[i,], freq=1, all=1))
	len		<- apply(counts[ !rownames(counts)%in%exclude, ],2,sum)
	names(len)	<- rownames(seq.DNAbin.mat)
	len
}

#' @export seq.proportion.ambiguous
#' @title Compute the proportion of ambiguous nucleotides
seq.proportion.ambiguous<- function(seq.DNAbin.mat, exclude=c('-','?'))
{	
	counts	<- sapply(seq_len(nrow(seq.DNAbin.mat)), function(i) base.freq(seq.DNAbin.mat[i,], freq=1, all=1))	
	len		<- apply(counts[ !rownames(counts)%in%exclude, ],2,sum)
	pa		<- apply(counts[c("r", "m", "w", "s", "k", "y", "v", "h", "d", "b"),],2,sum)
	pa/len
}

#' @export seq.gc.content
#' @title Compute the GC content
seq.gc.content<- function(seq.DNAbin.mat)
{	
	rna.gc.fraction.n		<- c('a','c','g','t',	'r','m','w','s',	'k','y','v','h',		'd','b','n','-','?')
	rna.gc.fraction			<- c( 0, 1, 1, 0,		0.5, 0.5, 0, 1, 	1/2, 1/2, 2/3, 1/3,		1/3,2/3, 1/4, 0, 0)		#this fraction assumes that U is synonymous with T and that U does not occur in the code
	rna.gc.sum				<- c( T, T, T, T,       T, T, T, T,         T, T, T, T,				T, T, T, F, F )	
	counts					<- apply(seq.DNAbin.mat,1,function(x) base.freq(x, freq=1, all=1))
	apply(counts*rna.gc.fraction,2,sum) / apply(counts[rna.gc.sum,],2,sum)
}

######################################################################################
hivc.clu.polyphyletic.clusters<- function(cluphy.df, cluphy.subtrees=NULL, ph=NULL, clustering=NULL, verbose=1, plot.file=NA, pdf.scaley=25, pdf.xlim=NULL, cex.nodelabel=0.2, cex.tiplabel=0.2, adj.tiplabel= c(-0.15,0.5))
{
	if(!is.null(ph) && !is.null(clustering))
	{
		#get node corresponding to index of selected clusters		
		cluphy.cluidx				<- clustering[["clu.idx"]][ unique( cluphy.df[,cluster] ) ]		
		if(any(is.na(cluphy.cluidx)))	stop("unexcpected NA in cluphy.cluidx")
		#get selected clusters into single phylogeny
		cluphy.subtrees				<- lapply(cluphy.cluidx, function(x)		extract.clade(ph, x, root.edge= 1, interactive = FALSE) 		)
		names(cluphy.subtrees)		<- unique( cluphy.df[,cluster] )
	}
	else if(is.null(cluphy.subtrees))
		stop("expect either ('ph' and 'clustering') or 'cluphy.subtrees' as input")
	cluphy							<- eval(parse(text=paste('cluphy.subtrees[[',seq_along(cluphy.subtrees),']]', sep='',collapse='+')))
	print(cluphy)
	plot.coordinates				<- NULL
	#plot selected clusters
	if(!is.na(plot.file))
	{		
		cluphy.tiplabels						<- hivc.clu.get.tiplabels( cluphy, copy(cluphy.df) )
		if(verbose) cat(paste("\nwrite tree to file",plot.file))
		color.except.rootedge					<- rep(1, Nnode(cluphy, internal.only=F))
		color.except.rootedge[Ntip(cluphy)+1]	<- NA
		plot.coordinates						<- hivc.clu.plot(cluphy, color.except.rootedge, file=plot.file, pdf.scaley=pdf.scaley, pdf.off=0, pdf.xlim=pdf.xlim, cex.nodelabel=cex.nodelabel )		
		hivc.clu.plot.tiplabels( seq_len(Ntip(cluphy)), cluphy.tiplabels$text, cluphy.tiplabels$col, cex=cex.tiplabel, adj=adj.tiplabel, add.xinch=0, add.yinch=0 )
		dev.off()
	}	
	list(cluphy=cluphy, cluphy.subtrees=cluphy.subtrees, plot.coordinates=plot.coordinates)
}	
######################################################################################
hivc.clu.get.tiplabels<- function(ph, 	df.info, col.notmsm="#4EB3D3", col.Early="#EF9708", col.highVL="#FEE391", col.AfterTreat="#D4B9DA", col.green="#D9F0A3", col.latePres="#FA9FB5",
										select=c("CountryInfection","Trm","Sex","isAcute","lRNA.early","NegT","AnyPos_T1","PosSeqT","lRNA.hb4tr_LT","lRNA_bTS","lRNA_TS","lRNA_aTS","lRNAi_bTS","lRNAi_aTS","AnyT_T1","TrImo_bTS","TrImo_aTS","PosCD4_T1","CD4_T1","CD4_bTS","CD4_TS","CD4_aTS","Patient","RegionHospital") )
{
	require(colorspace)
	require(RColorBrewer)
	#
	#	PATIENT
	#
	#set colors CountryInfection
	tmp					<- rep("transparent",nrow(df.info))
	df.info[,CountryInfection.col:=tmp]
	set(df.info, which(df.info[,!is.na(CountryInfection) & CountryInfection!="NL"]), "CountryInfection.col", col.notmsm)
	#set colors Patient
	tmp					<- unique( df.info[,Patient] )
	#tmp2				<- diverge_hcl(length(tmp), h = c(246, 40), c = 96, l = c(85, 90))		
	tmp					<- data.table(Patient=tmp, Patient.col="transparent", key="Patient")
	df.info				<- merge(  df.info, tmp, all.x=1, by="Patient" )
	#set colors Sex
	df.info[,Sex.col:="transparent"]						
	set(df.info, which(df.info[,!is.na(Sex) & Sex!="M"]), "Sex.col", col.notmsm)
	#set colors MSM or BI	
	df.info[,Trm.col:="transparent"]						
	set(df.info, which(df.info[,!is.na(Trm) & Trm!="MSM"]), "Trm.col", col.notmsm)
	#set colors RegionHospital
	tmp					<- levels( df.info[,RegionHospital] )		
	tmp2				<- brewer.pal(length(tmp), "Dark2")
	tmp					<- data.table(RegionHospital=tmp, RegionHospital.col=tmp2, key="RegionHospital")
	df.info				<- merge(  df.info, tmp, all.x=1, by="RegionHospital" )
	#set colors isAcute
	df.info[,isAcute.col:="transparent"]						
	set(df.info, which(df.info[,isAcute%in%c("Yes","Maybe")]), "isAcute.col", col.Early)
	#
	#	TIMELINE
	#
	#	set transparent colors:		PosSeqT NegT AnyPos_T1	
	df.info[, NegT.col:="transparent"]
	df.info[, AnyPos_T1.col:="transparent"]
	#
	#	TREATMENT
	#
	#	set AnyT_T1.col				if 	PosSeqT<AnyT_T1 	orange 		else pink
	df.info[, AnyT_T1.col:="transparent"]
	select.seqb4tr		<- which(df.info[, !is.na(PosSeqT) & !is.na(AnyT_T1) & PosSeqT<=AnyT_T1])
	select.seqatr		<- which(df.info[, !is.na(PosSeqT) & !is.na(AnyT_T1) & PosSeqT>AnyT_T1])
	set(df.info, select.seqb4tr, "AnyT_T1.col", col.Early)
	set(df.info, select.seqatr, "AnyT_T1.col", col.AfterTreat)
	df.info[, TrImo_bTS.col:="transparent"]
	df.info[, TrImo_aTS.col:="transparent"]
	set(df.info, which(df.info[, TrImo_aTS>4]), "TrImo_aTS.col", col.AfterTreat)
	set(df.info, which(df.info[, TrImo_bTS>4]), "TrImo_bTS.col", col.AfterTreat)
	df.info[, PosSeqT.col:="transparent"]
	set(df.info, select.seqb4tr, "PosSeqT.col", col.Early)
	set(df.info, select.seqatr, "PosSeqT.col", col.AfterTreat)	
	#
	#	VIRAL LOAD
	#	
	df.info[, lRNA_bTS.col:="transparent"]
	set(df.info,	which(df.info[,lRNA_bTS>5]),	"lRNA_bTS.col",		col.highVL) 
	df.info[, lRNA_TS.col:="transparent"]
	set(df.info,	which(df.info[,lRNA_TS>5]),	"lRNA_TS.col",		col.highVL) 
	df.info[, lRNA_aTS.col:="transparent"]
	set(df.info,	which(df.info[,lRNA_aTS>3.5]),	"lRNA_aTS.col",		col.highVL) 
	df.info[, lRNAi_bTS.col:="transparent"]
	set(df.info,	which(df.info[,lRNAi_bTS>0.75]),	"lRNAi_bTS.col",		col.highVL) 
	df.info[, lRNAi_aTS.col:="transparent"]
	set(df.info,	which(df.info[,lRNAi_aTS>0.25]),	"lRNAi_aTS.col",		col.highVL) 
	df.info[, lRNA.hb4tr_LT.col:="transparent"]
	set(df.info,	which(df.info[,!is.na(lRNA.hb4tr_LT) & PosSeqT<=lRNA.hb4tr_LT]),	"lRNA.hb4tr_LT.col",		col.highVL) 
	df.info[, lRNA.early.col:="transparent"]
	set(df.info,	which(df.info[, lRNA.early]),	"lRNA.early.col",		col.Early) 
	#
	#	CD4
	#		
	df.info[, CD4_T1.col:="transparent"]
	set(df.info,	which(df.info[,CD4_T1>350]),	"CD4_T1.col",		col.green)	
	df.info[, PosCD4_T1.col:="transparent"]
	set(df.info,	which(df.info[,CD4_T1>350]),	"PosCD4_T1.col",		col.green)	
	df.info[, CD4_bTS.col:="transparent"]
	set(df.info,	which(df.info[,CD4_bTS>350]),	"CD4_bTS.col",		col.green)	
	df.info[, CD4_TS.col:="transparent"]
	set(df.info,	which(df.info[,CD4_TS>350]),	"CD4_TS.col",		col.green)	
	df.info[, CD4_aTS.col:="transparent"]
	set(df.info,	which(df.info[,CD4_aTS>350]),	"CD4_aTS.col",		col.green)
	#
	#	color late presenter
	#
	tmp<- which(df.info[,CD4_T1<350])
	set(df.info,	tmp,	"CD4_T1.col",			col.latePres)
	set(df.info,	tmp,	"PosCD4_T1.col",		col.latePres)
	set(df.info,	tmp,	"lRNA_bTS.col",			col.latePres)
	set(df.info,	tmp,	"lRNAi_bTS.col",		col.latePres)
	#	
	#	convert time to string	& handle inaccurate NegT or AnyPos_T1
	#
	tmp		<- which(df.info[, as.POSIXlt(NegT)$mday==1 & as.POSIXlt(NegT)$mon==0])				#since NegT were reset, these are the ones with inaccurate month
	set(df.info,NULL,	"NegT", 		as.character( df.info[,NegT], "%y.%m" ))
	set(df.info, tmp,	"NegT", 		paste(df.info[tmp,substr(NegT,1,3)],'??',sep='') )
	tmp		<- which(df.info[, as.POSIXlt(AnyPos_T1)$mday==31 & as.POSIXlt(AnyPos_T1)$mon==11])	#since AnyPos_T1 were reset, these are the ones with inaccurate month
	set(df.info,NULL,	"AnyPos_T1",	as.character( df.info[,AnyPos_T1], "%y.%m" ))
	set(df.info, tmp, 	"AnyPos_T1", 	paste(df.info[tmp,substr(AnyPos_T1,1,3)],'??',sep='') )
	set(df.info,NULL,	"PosSeqT", 		as.character( df.info[,PosSeqT], "%y.%m" ))
	set(df.info,NULL,	"lRNA.hb4tr_LT",as.character( df.info[,lRNA.hb4tr_LT], "%y.%m" ))
	set(df.info,NULL,	"PosCD4_T1", 	as.character( df.info[,PosCD4_T1], "%y.%m" ))
	set(df.info,NULL,	"AnyT_T1", 		as.character( df.info[,AnyT_T1], "%y.%m" ))
	#	
	#	set isAcute to either Y or M or N
	set(df.info,NULL,"isAcute", 		as.character( df.info[,isAcute]))
	set(df.info,which(df.info[,isAcute=="No"]),"isAcute",'N')
	set(df.info,which(df.info[,isAcute=="Maybe"]),"isAcute",'M')
	set(df.info,which(df.info[,isAcute=="Yes"]),"isAcute",'Y')
	#	set lRNA.early to either HVLE or ----
	set(df.info,NULL,"lRNA.early", 		as.character( df.info[, lRNA.early]))
	set(df.info,which(df.info[,lRNA.early=="TRUE"]),"lRNA.early","Y")
	set(df.info,which(df.info[,lRNA.early=="FALSE"]),"lRNA.early","N")
	#	set lRNAi_bTS lRNAi_aTS
	set(df.info,NULL,"lRNAi_bTS", 		as.character( round(df.info[, lRNAi_bTS],d=2)))
	set(df.info,NULL,"lRNAi_aTS", 		as.character( round(df.info[, lRNAi_aTS],d=2)))
	#	set TrImo_bTS TrImo_aTS
	set(df.info,NULL,"TrImo_bTS", 		as.character( round(df.info[, TrImo_bTS],d=1)))
	set(df.info,NULL,"TrImo_aTS", 		as.character( round(df.info[, TrImo_aTS],d=1)))
	#	set CD4_T1
	set(df.info,NULL,"CD4_T1", 		as.character( round(df.info[, CD4_T1],d=0)))
	#	set CD4_TS CD4_bTS CD4_aTS
	set(df.info,NULL,"CD4_TS", 		as.character( df.info[, CD4_TS]))
	set(df.info,NULL,"CD4_bTS", 		as.character( df.info[, CD4_bTS]))
	set(df.info,NULL,"CD4_aTS", 		as.character( df.info[, CD4_aTS]))
	#	set 'Amst' to 'A'
	set(df.info,NULL,"RegionHospital", 		as.character( df.info[,RegionHospital]))
	set(df.info,which(df.info[,RegionHospital=="Amst"]),"RegionHospital",'A')	
	#
	#	handle missing entries -- ensure that alignment is OK
	#
	setkey(df.info, Patient)
	tmp					<- which(is.na(df.info[,Patient]))
	set(df.info, tmp, "Patient", '')
	set(df.info, tmp, "Patient.col", "transparent")
	tmp					<- which(is.na(df.info[,RegionHospital]))
	set(df.info, tmp, "RegionHospital", '-')
	set(df.info, tmp, "RegionHospital.col", "transparent")		
	tmp					<- which(is.na(df.info[,CountryInfection]))
	set(df.info, tmp, "CountryInfection", "--")
	set(df.info, tmp, "CountryInfection.col", "transparent")				
	tmp					<- which(is.na(df.info[,Trm]))
	set(df.info, tmp, "Trm", '--')
	set(df.info, tmp, "Trm.col", "transparent")
	tmp					<- which(is.na(df.info[,Sex]))
	set(df.info, tmp, "Sex", '-')
	set(df.info, tmp, "Sex.col", "transparent")	
	set(df.info, which(df.info[,is.na(isAcute)]), "isAcute", 	'-')	
	tmp					<- which(is.na(df.info[,AnyPos_T1]))
	set(df.info, tmp, "AnyPos_T1", "--.--")	
	set(df.info, tmp, "AnyPos_T1.col", "transparent")
	tmp					<- which(is.na(df.info[,NegT]))
	set(df.info, tmp, "NegT", "--.--")	
	set(df.info, tmp, "NegT.col", "transparent")
	tmp					<- which(is.na(df.info[,PosSeqT]))
	set(df.info, tmp, "PosSeqT", "--.--")	
	set(df.info, tmp, "PosSeqT.col", "transparent")		
	tmp					<- which(is.na(df.info[,lRNA.hb4tr_LT]))
	set(df.info, tmp, "lRNA.hb4tr_LT", "--.--")		
	tmp					<- which(is.na(df.info[,PosCD4_T1]))
	set(df.info, tmp, "PosCD4_T1", "--.--")		
	tmp					<- which(is.na(df.info[,AnyT_T1]))
	set(df.info, tmp, "AnyT_T1", "--.--")		
	set(df.info, which(df.info[,is.na(TrImo_aTS)]), "TrImo_aTS", 	'-')
	set(df.info, which(df.info[,is.na(TrImo_bTS)]), "TrImo_bTS", 	'-')	
	set(df.info, which(df.info[,is.na(CD4_T1)]), "CD4_T1", 			'---')
	set(df.info, which(df.info[,is.na(PosCD4_T1)]), "PosCD4_T1", 	'---')
	set(df.info, which(df.info[,is.na(CD4_TS)]), "CD4_TS", 			'---')
	set(df.info, which(df.info[,is.na(CD4_bTS)]), "CD4_bTS", 		'---')
	set(df.info, which(df.info[,is.na(CD4_aTS)]), "CD4_aTS", 		'---')
	#
	#	add suffixes 
	#	
	set(df.info,NULL,"AnyPos_T1",		paste("d:",df.info[,AnyPos_T1],sep=''))
	set(df.info,NULL,"NegT", 			paste("n:",df.info[,NegT],sep=''))
	set(df.info,NULL,"PosSeqT", 		paste("s:",df.info[,PosSeqT],"   ",sep=''))
	set(df.info,NULL,"lRNA.hb4tr_LT",	paste("VLeh:",df.info[,lRNA.hb4tr_LT],sep=''))
	set(df.info,NULL,"AnyT_T1", 		paste("TR+:",df.info[,AnyT_T1],sep=''))
	set(df.info,NULL,"TrImo_bTS", 		paste("TR-:",df.info[,TrImo_bTS],sep=''))
	set(df.info,NULL,"TrImo_aTS", 		paste(":",df.info[,TrImo_aTS],"   ",sep=''))			
	set(df.info,NULL,"isAcute", 		paste("AC:",df.info[,isAcute],sep=''))
	set(df.info,NULL,"lRNA.early", 		paste(":",df.info[,lRNA.early],"   ",sep=''))
	set(df.info,NULL,"CountryInfection",paste("inf:",df.info[,CountryInfection],sep=''))
	set(df.info,NULL,"RegionHospital",	paste("",df.info[,RegionHospital],sep=''))
	set(df.info,NULL,"lRNA_bTS", 		paste("VL:",df.info[,lRNA_bTS],sep=''))
	set(df.info,NULL,"lRNA_TS", 		paste(":",df.info[,lRNA_TS],sep=''))
	set(df.info,NULL,"lRNA_aTS", 		paste(":",df.info[,lRNA_aTS],sep=''))
	set(df.info,NULL,"lRNAi_bTS", 		paste("VI:",df.info[,lRNAi_bTS],sep=''))
	set(df.info,NULL,"lRNAi_aTS", 		paste(":",df.info[,lRNAi_aTS],"   ",sep=''))	
	set(df.info,NULL,"PosCD4_T1", 		paste("CD4:",df.info[,PosCD4_T1],sep=''))
	set(df.info,NULL,"CD4_T1", 			paste(":",df.info[,CD4_T1],sep=''))	
	set(df.info,NULL,"CD4_bTS", 		paste(":",df.info[,CD4_bTS],sep=''))
	set(df.info,NULL,"CD4_TS", 			paste(":",df.info[,CD4_TS],sep=''))
	set(df.info,NULL,"CD4_aTS", 		paste(":",df.info[,CD4_aTS],"   ",sep=''))
	#
	#get df.info into order of tips
	#
	setkey(df.info, FASTASampleCode)
	df.info				<- df.info[ph$tip.label,]
	#select text and col matrix 
	text				<- t( as.matrix( df.info[,select, with=0] ) )
	colnames(text)		<- ph$tip.label
	col					<- t( as.matrix( df.info[,paste(select,".col",sep=''),with=0] ) )
	colnames(col)		<- ph$tip.label
	ans<- list(text=text, col=col)
	ans
}
######################################################################################
#prepare standard format of tip labels -- requires df.info to be sorted along the tips as they appear in a phylogeny
hivc.clu.get.tiplabels.v1<- function(ph, df.info)
{
	require(colorspace)
	require(RColorBrewer)
	#set colors CountryInfection
	tmp					<- rep("transparent",nrow(df.info))
	df.info[,CountryInfection.col:=tmp]
	set(df.info, which(df.info[,CountryInfection=="NL"]), "CountryInfection.col", "#EF9708")
	#set colors Patient
	tmp					<- unique( df.info[,Patient] )
	tmp2				<- diverge_hcl(length(tmp), h = c(246, 40), c = 96, l = c(85, 90))		
	tmp					<- data.table(Patient=tmp, Patient.col=tmp2, key="Patient")
	df.info				<- merge(  df.info, tmp, all.x=1, by="Patient" )
	#set colors Sex
	tmp					<- rep("transparent",nrow(df.info))
	df.info[,Sex.col:=tmp]						
	set(df.info, which(df.info[,Sex=="M"]), "Sex.col", "#EF9708")
	#set colors MSM or BI
	tmp					<- rep("transparent",nrow(df.info))
	df.info[,Trm.col:=tmp]						
	set(df.info, which(df.info[,Trm%in%c("MSM","BI")]), "Trm.col", "#EF9708")
	#set colors RegionHospital
	tmp					<- levels( df.info[,RegionHospital] )		
	tmp2				<- brewer.pal(length(tmp), "Dark2")
	tmp					<- data.table(RegionHospital=tmp, RegionHospital.col=tmp2, key="RegionHospital")
	df.info				<- merge(  df.info, tmp, all.x=1, by="RegionHospital" )		
	#set colors time		
	tmp					<- range( range(df.info[, NegT],na.rm=1), range(df.info[, AnyPos_T1],na.rm=1) )
	tmp					<- as.POSIXlt( seq.Date(tmp[1],tmp[2]+365,by="years") )$year
	tmp2				<- heat_hcl(length(tmp), h = c(0, -100), l = c(75, 40), c = c(40, 80), power = 1)
	yearcols			<- data.table(Year=tmp, Year.col=tmp2)
	#set colors PosSeqT
	tmp					<- data.table(PosSeqT= unique( df.info[,PosSeqT] ), key="PosSeqT" )
	tmp2				<- tmp[, as.POSIXlt(PosSeqT)$year]
	tmp[,"Year":=tmp2]
	tmp					<- subset( merge(tmp, yearcols, all.x=1, by="Year"), select=c(PosSeqT,Year.col))
	setnames(tmp,"Year.col","PosSeqT.col")
	df.info			<- merge(  df.info, tmp, all.x=1, by="PosSeqT" )
	#set colors NegT
	tmp					<- data.table(NegT= unique( df.info[,NegT] ), key="NegT" )
	tmp2				<- tmp[, as.POSIXlt(NegT)$year]
	tmp[,"Year":=tmp2]
	tmp					<- subset( merge(tmp, yearcols, all.x=1, by="Year"), select=c(NegT,Year.col))
	setnames(tmp,"Year.col","NegT.col")
	df.info			<- merge(  df.info, tmp, all.x=1, by="NegT" )
	#set colors AnyPos_T1
	tmp					<- data.table(AnyPos_T1= unique( df.info[,AnyPos_T1] ), key="AnyPos_T1" )
	tmp2				<- tmp[, as.POSIXlt(AnyPos_T1)$year]
	tmp[,"Year":=tmp2]
	tmp					<- subset( merge(tmp, yearcols, all.x=1, by="Year"), select=c(AnyPos_T1,Year.col))
	setnames(tmp,"Year.col","AnyPos_T1.col")
	df.info			<- merge(  df.info, tmp, all.x=1, by="AnyPos_T1" )									
	
	#convert time to string
	set(df.info,NULL,"AnyPos_T1",substr(as.character( df.info[,AnyPos_T1] ),1,7))
	set(df.info,NULL,"NegT",substr(as.character( df.info[,NegT] ),1,7))
	set(df.info,NULL,"PosSeqT",substr(as.character( df.info[,PosSeqT] ),1,7))
	
	#handle missing entries
	setkey(df.info, Patient)
	tmp					<- which(is.na(df.info[,Patient]))
	set(df.info, tmp, "Patient", '')
	set(df.info, tmp, "Patient.col", "transparent")
	tmp					<- which(is.na(df.info[,RegionHospital]))
	set(df.info, tmp, "RegionHospital", '-')
	set(df.info, tmp, "RegionHospital.col", "transparent")		
	tmp					<- which(is.na(df.info[,CountryInfection]))
	set(df.info, tmp, "CountryInfection", "--")
	set(df.info, tmp, "CountryInfection.col", "transparent")				
	tmp					<- which(is.na(df.info[,Trm]))
	set(df.info, tmp, "Trm", '')
	set(df.info, tmp, "Trm.col", "transparent")
	tmp					<- which(is.na(df.info[,Sex]))
	set(df.info, tmp, "Sex", '')
	set(df.info, tmp, "Sex.col", "transparent")
	tmp					<- which(is.na(df.info[,AnyPos_T1]))
	set(df.info, tmp, "AnyPos_T1", "-------")
	set(df.info, NULL, "AnyPos_T1", paste("HIV+:",df.info[,AnyPos_T1],sep=''))
	set(df.info, tmp, "AnyPos_T1.col", "transparent")
	tmp					<- which(is.na(df.info[,NegT]))
	set(df.info, tmp, "NegT", "-------")
	set(df.info, NULL, "NegT", paste("HIV-:",df.info[,NegT],sep=''))
	set(df.info, tmp, "NegT.col", "transparent")
	tmp					<- which(is.na(df.info[,PosSeqT]))
	set(df.info, tmp, "PosSeqT", "-------")
	set(df.info, NULL, "PosSeqT", paste("HIVS:",df.info[,PosSeqT],sep=''))
	set(df.info, tmp, "PosSeqT.col", "transparent")		
	#get df.info into order of tips
	setkey(df.info, FASTASampleCode)
	df.info				<- df.info[ph$tip.label,]
	#select text and col matrix 
	text				<- t( as.matrix( subset(df.info,select=c(CountryInfection, Trm, Sex, NegT, AnyPos_T1, PosSeqT, Patient, RegionHospital)) ) )
	colnames(text)		<- ph$tip.label
	col					<- t( as.matrix( subset(df.info,select=c(CountryInfection.col, Trm.col, Sex.col, NegT.col, AnyPos_T1.col, PosSeqT.col, Patient.col, RegionHospital.col)) ) )
	colnames(col)		<- ph$tip.label
	ans<- list(text=text, col=col)
	ans
}	
######################################################################################
hivc.clu.plot.tiplabels<- function (tip, text, col, xx=NULL, adj = c(-0.05, 0.5), cex=1, add.xinch= 0.03, add.yinch= 0.02) 
{		
	lastPP 			<- get("last_plot.phylo", envir = .PlotPhyloEnv)
	if(is.null(xx))
		xx 			<- lastPP$xx[tip]
	yy 				<- lastPP$yy[tip]
	if(length(tip)==1)
	{
		#assume vector of text and col of same length
		if(!is.vector(text) || !is.vector(col))	stop("expect vector text and vector col")
		if(length(text)!=length(col))			stop("expect same length of text and col")
		wh				<- sapply(text, function(x){	 c(xinch(strwidth(x, units = "inches", cex = cex)), yinch(strheight(x, units = "inches", cex = cex))) 		})
		wh.total		<- c(sum(wh[1,]),max(wh[2,]))
		coord			<- matrix(NA,6,length(text),dimnames=list(c("xl","xr","yb","yt","xx","yy"),c()))
		coord["xl",]	<- xx - wh.total[1] * adj[1] - xinch(add.xinch)
		tmp				<- cumsum(wh[1,]) + xinch(add.xinch) / ncol(wh)
		coord["xl",]	<- coord["xl",] + c(0, tmp[-ncol(coord)])
		coord["xr",]	<- coord["xl",] + wh[1,] + xinch(add.xinch)/ ncol(wh)
		coord["yb",]	<- yy - wh.total[2] * adj[2] - yinch(add.yinch)
		coord["yt",]	<- coord["yb",] + wh.total[2] + yinch(add.yinch)
		coord["xx",]	<- coord["xl",] + c( diff( coord[1,] ), diff(coord[1:2,ncol(wh)])  ) / 2
		coord["yy",]	<- rep(yy, ncol(coord))
#print(wh); print(wh.total); print(coord)		
	}
	else
	{
		#assume matrix of text and col of same ncol
		if(!is.matrix(text) || !is.matrix(col))				stop("expect matrix text and vector col")
		if(ncol(text)!=ncol(col) || nrow(text)!=nrow(col))	stop("expect same dimensions of text and col")
		if(ncol(text)!=length(tip))							stop("expect cols of text and col to correspond to tips")
		coord	<- lapply( seq_along(xx),function(i)
				{
					wh				<- sapply(text[,i], function(x){	 c(xinch(strwidth(x, units = "inches", cex = cex)), yinch(strheight(x, units = "inches", cex = cex))) 		})
					wh.total		<- c(sum(wh[1,]),max(wh[2,]))
					coord			<- matrix(NA,6,nrow(text),dimnames=list(c("xl","xr","yb","yt","xx","yy"),c()))
					coord["xl",]	<- xx[i] - wh.total[1] * adj[1] - xinch(add.xinch)
					tmp				<- cumsum(wh[1,]) + xinch(add.xinch) / ncol(wh)
					coord["xl",]	<- coord["xl",] + c(0, tmp[-ncol(coord)])
					coord["xr",]	<- coord["xl",] + wh[1,] + xinch(add.xinch)/ ncol(wh)
					coord["yb",]	<- yy[i] - wh.total[2] * adj[2] - yinch(add.yinch)
					coord["yt",]	<- coord["yb",] + wh.total[2] + yinch(add.yinch)
					coord["xx",]	<- coord["xl",] + c( diff( coord[1,] ), diff(coord[1:2,ncol(wh)])  ) / 2
					coord["yy",]	<- rep(yy[i], ncol(coord))
					coord
				})
		coord	<- do.call("cbind",coord)
		text	<- as.vector(text)
		col		<- as.vector(col)
#print(coords); print(text); print(col)
	}
	rect(coord["xl",], coord["yb",], coord["xr",], coord["yt",], col = col, border=NA)
	text(coord["xx",], coord["yy",], text, cex=cex)
}
######################################################################################
hivc.clu.plot<- function(	ph, clu, edge.col.basic="black", show.node.label= T, show.tip.label=F, file=NULL,  
							highlight.edge.of.tiplabel=NULL, highlight.edge.of.tiplabel.col="red", 
							highlight.cluster=NULL, highlight.cluster.col="red",							
							pdf.scaley=10, pdf.width= 7, pdf.height=pdf.scaley*7, pdf.off=1, pdf.xlim=NULL,
							cex.nodelabel=0.5, cex.edge.incluster=1, cex.edge.outcluster= cex.edge.incluster/3, no.margin=T, ...)
{
	require(colorspace)	
	clu.edge							<- clu[ ph$edge[,1] ]
	clu.edge							<- clu.edge+1				#set col index for non-clustering edges to 1
	clu.edge[is.na(clu.edge)]			<- 1
	#cols.n								<- length(unique(clu.edge))-1	
	#cols								<- c("black",rainbow_hcl(cols.n, start = 30, end = 300))	
	clu.edge.col						<- rep(edge.col.basic, nrow(ph$edge))	#cols[clu.edge]
	clu.edge.col[clu.edge==1]			<- "grey50" 
	if(!is.null(highlight.cluster))
	{
		if(length(highlight.cluster.col)==1)
			highlight.cluster.col<- rep(highlight.cluster.col,length(highlight.cluster))
		for(i in seq_along(highlight.cluster))
			clu.edge.col[clu.edge%in%(highlight.cluster[[i]]+1)]<- 	highlight.cluster.col[i]	
	}	
	if(!is.null(highlight.edge.of.tiplabel))
	{
		highlight.edge<- lapply(highlight.edge.of.tiplabel, function(x)		which( ph$edge[,2]%in%which( substr(ph$tip.label, 1, nchar(x))==x ) )		)					
		if(length(highlight.edge.of.tiplabel.col)==1)
			highlight.edge.of.tiplabel.col	<- rep(highlight.edge, length(highlight.edge.of.tiplabel.col) )		
		for(i in seq_along(highlight.edge))
			clu.edge.col[highlight.edge[[i]]]		<- highlight.edge.of.tiplabel.col[i]		
	}	
	clu.edge.width						<- rep(cex.edge.outcluster, length(clu.edge))
	clu.edge.width[clu.edge!=1]			<- cex.edge.incluster
	clu.edge.lty						<- rep(1, length(clu.edge))
	clu.edge.lty[clu.edge!=1]			<- 1
	if(class(file)=="character")
		pdf(file,width=pdf.width,height=pdf.height)
	if(no.margin) 	par(mar=c(0,0,0,0))
	plot.coordinates					<- plot(ph, show.tip.label=show.tip.label, show.node.label=show.node.label, cex=cex.nodelabel, edge.color=clu.edge.col, edge.width=clu.edge.width, edge.lty=clu.edge.lty, x.lim=pdf.xlim, ...)
	if(class(file)=="character" && pdf.off)
		dev.off()
	plot.coordinates
}
olli0601/big.phylo documentation built on Nov. 9, 2019, 9:25 a.m.