#' @export seq.write.dna.phylip
#' @title Write a square phylip file
seq.write.dna.phylip<- function(seq.DNAbin.mat, file)
{
tmp<- cbind( rownames(seq.DNAbin.mat), apply( as.character( seq.DNAbin.mat ), 1, function(x) paste(x,sep='',collapse='') ) )
tmp<- paste(t(tmp),collapse='\n',sep='')
tmp<- paste( paste(c(nrow(seq.DNAbin.mat),ncol(seq.DNAbin.mat)),sep='',collapse=' '),'\n',tmp,'\n',collapse='',sep='' )
cat(tmp, file=file)
}
#' @export seq.align.based.on.common.reference
#' @title Merge two alignments with common reference
#' @import plyr data.table ape
#' @description Merge two alignments by adding gaps to both alignments in such a way that the common reference in both alignments are aligned with each other.
#' @param in.s first sequence alignment in **matrix** DNAbin format (ape package)
#' @param sq second sequence alignment in **matrix** DNAbin format (ape package)
#' @param return.common.sites Flag if the merged alignment should be intersection of the input alignment (TRUE, default) or the union (FALSE).
#' @param regexpr.reference Regular expression to identify the reference taxon in the rownames of the input alignments.
#' @param regexpr.nomatch Regular expression to identify non-nucleotide characters that should not be matched against each other in the reference. By default, the characters '-' '?' are not mached.
#' @return Merged sequence alignment in matrix DNAbin format (ape package)
seq.align.based.on.common.reference<- function(in.s, sq, return.common.sites=TRUE, regexpr.reference='HXB2', regexpr.nomatch='-|\\?')
{
stopifnot( class(in.s)=='DNAbin', class(sq)=='DNAbin', is.matrix(in.s), is.matrix(sq))
chars.nomatch <- gsub('\\','',strsplit(regexpr.nomatch,'|',fixed=1)[[1]],fixed=1)
# find index of references in both alignments
ref.in <- which(grepl(regexpr.reference,rownames(in.s)))
ref.p <- which(grepl(regexpr.reference,rownames(sq)))
stopifnot(length(ref.in)==1, length(ref.p)==1)
#
# build SU alignment key
#
tmp2 <- gsub('?','',gsub('-','',paste(as.vector(as.character(in.s[ref.in,1:50])), collapse='')),fixed=1)
tmp2 <- paste(strsplit(tmp2, '')[[1]],'-*',sep='',collapse='')
# find start of SU alignment in PANGEA alignment
startat <- regexpr(tmp2, gsub('?','-',paste(as.vector(as.character(sq[ref.p,])), collapse=''),fixed=1))
if(startat==-1)
{
#try reverse in.s and sq
tmp2 <- gsub('?','',gsub('-','',paste(as.vector(as.character(sq[ref.p,1:50])), collapse='')),fixed=1)
tmp2 <- paste(strsplit(tmp2, '')[[1]],'-*',sep='',collapse='')
# find start of SU alignment in PANGEA alignment
startat <- regexpr(tmp2, gsub('?','-',paste(as.vector(as.character(in.s[ref.in,])), collapse=''),fixed=1))
stopifnot(startat>=0)
# now reverse
tmp <- ref.in
ref.in <- ref.p
ref.p <- tmp
tmp <- in.s
in.s <- sq
sq <- tmp
}
#
# determine the final sites in SU alignment / PANGEA alignment that are not gaps-only
#
in.s2 <- as.character(in.s) # need this later too
in.d <- data.table( TAXA= rownames(in.s),
FIRST= apply( in.s2, 1, function(x) which(x!='-')[1] ),
LAST= ncol(in.s)-apply( in.s2, 1, function(x) which(rev(x)!='-')[1] )+1L )
in.l <- in.d[, max(LAST)]
sq.s2 <- as.character(sq) # need this later too
sq.d <- data.table( TAXA= rownames(sq),
FIRST= apply( sq.s2, 1, function(x) which(x!='-')[1] ),
LAST= ncol(sq)-apply( sq.s2, 1, function(x) which(rev(x)!='-')[1] )+1L )
sq.l <- sq.d[, max(LAST)]
#
# get stop position of the two references in the other alignment
#
# for dev:
# x<- matrix(strsplit('tcaa---------aaattt','')[[1]], nrow=1)
# y<- matrix(strsplit('-tcaaaa---a-ttt----','')[[1]], nrow=1)
x <- as.character(sq[ref.p, seq.int(startat, sq.l)])
y <- as.character(in.s[ref.in, seq.int(1, in.l)]) #y must be the longer sequence with extra gaps
tmp <- gsub(regexpr.nomatch,'',paste(as.vector(x), collapse=''))
tmp2 <- gsub(regexpr.nomatch,'',paste(as.vector(y), collapse=''))
if(nchar(tmp2)<=nchar(tmp))
{
# ref in SU alignment is smaller than ref in PANGEA alignment
z <- substr(tmp2, nchar(tmp2)-50, nchar(tmp2))
z <- paste(strsplit(z, '')[[1]],'-*',sep='',collapse='')
z <- regexpr(z, paste(as.vector(x), collapse=''))
sq.stopat <- as.integer( z + attr(z,"match.length") - 1L )
in.stopat <- in.l
}
if(nchar(tmp2)>nchar(tmp))
{
# ref in PANGEA alignment is smaller than ref in SU alignment
z <- substr(tmp, nchar(tmp)-50, nchar(tmp))
z <- paste(strsplit(z, '')[[1]],'-*',sep='',collapse='')
z <- regexpr(z, paste(as.vector(y), collapse=''))
in.stopat <- as.integer( z + attr(z,"match.length") - 1L )
sq.stopat <- sq.l-startat+1L
}
tmp <- gsub(regexpr.nomatch,'',paste(as.vector(x[,1:sq.stopat]), collapse=''))
tmp2 <- gsub(regexpr.nomatch,'',paste(as.vector(y[,1:in.stopat]), collapse=''))
stopifnot( tmp==tmp2 )
#
# calculate offsets (ie number of new gap insertions after site i) in SU and PANGEA alignments
#
# dev:
# substring(paste(as.vector(x), collapse=''),280,400)
# k<- 285
# rbind( z[1, ((k-5):(k+20))+offset.in.x[(k-5):(k+20)]], z[2, ((k-5):(k+20))+offset.in.y[(k-5):(k+20)]] )
# z[1, ((k-5):(k+20))]
z <- rbind.fill.matrix(x[,1:sq.stopat,drop=FALSE],y[,1:in.stopat,drop=FALSE])
z[1,which(is.na(z[1,]))] <- '-'
z[2,which(is.na(z[2,]))] <- '-'
zz <- unname(z)
offset.to.x <- rep(0, ncol(z))
offset.to.y <- rep(0, ncol(z))
k <- seq_len(ncol(z))+offset.to.y
k <- k[ k>0 & k<=ncol(z)]
k <- which( z[1, seq_along(k)]!=z[2, k] )[1]
while(!is.na(k) & k<20e3)
{
#print(k)
done <- 0
if( !zz[1,k]%in%chars.nomatch )
{
kn <- k-offset.to.x[min(k,length(offset.to.x))]
offset.to.x[ seq.int(kn,length(offset.to.x)) ] <- offset.to.x[ seq.int(kn,length(offset.to.x)) ]+1
if(k==1)
zz <- rbind( c('-',zz[1,]), c(zz[2,],'-') )
if(k>1)
zz <- rbind( c(zz[1, 1:(k-1)],'-',zz[1,k:ncol(zz)]), c(zz[2,],'-') )
done <- 1
}
if( !done & zz[1,k]%in%chars.nomatch )
{
kn <- k-offset.to.y[min(k,length(offset.to.x))]
offset.to.y[ seq.int(kn,length(offset.to.x)) ] <- offset.to.y[ seq.int(kn,length(offset.to.y)) ]+1
if(k==1)
zz <- rbind( c(zz[1,],'-'), c('-',zz[2,]) )
if(k>1)
zz <- rbind( c(zz[1,],'-'), c(zz[2, 1:(k-1)],'-',zz[2,k:ncol(zz)]) )
}
k <- unname(which(zz[1,]!=zz[2,])[1])
}
if(0)
{
# DEV
# to x add offset.to.x
k <- offset.to.x+seq_along(offset.to.x)
xx <- matrix('-',ncol=max(k),nrow=nrow(x))
xx[,k] <- x[, seq_along(k)]
# to y add offset.to.y
k <- offset.to.y+seq_along(offset.to.y)
yy <- matrix('-',ncol=max(k),nrow=nrow(y))
yy[,k] <- y[, seq_along(k)]
rbind(xx, yy)
}
#
# add gaps to PANGEA and SU alignment
#
in.newcol <- offset.to.y[seq.int(in.stopat)]+seq_len(in.stopat)
sq.newcol <- offset.to.x[seq.int(sq.stopat)]+seq_len(sq.stopat)
new.ncol <- max(max(in.newcol),max(sq.newcol))
tmp <- matrix('-', ncol=new.ncol,nrow=nrow(in.s2), dimnames=dimnames(in.s2))
tmp[,in.newcol] <- in.s2[, seq_along(in.newcol)]
in.s2 <- as.DNAbin(tmp)
tmp <- matrix('-', ncol=new.ncol,nrow=nrow(sq.s2), dimnames=dimnames(sq.s2))
tmp[,sq.newcol] <- sq.s2[, startat-1L+seq_along(sq.newcol)]
sq.s2 <- as.DNAbin(tmp)
# if merged alignment should contain all sites in PANGEA and SU alignment
# need to add remainder now
if(!return.common.sites & startat>1)
{
# add gap col to in.s2
tmp <- as.DNAbin(matrix('-', ncol=startat-1L,nrow=nrow(in.s2), dimnames=dimnames(in.s2)))
in.s2 <- cbind(tmp, in.s2)
# add data to sq.s2
sq.s2 <- cbind(sq[,1:(startat-1L)], sq.s2)
}
if(!return.common.sites & ncol(sq.s2)<ncol(sq))
{
sq.s2 <- cbind(sq.s2, sq[, seq.int(length(sq.newcol)+1L,ncol(sq))])
tmp <- as.DNAbin( matrix('-', nrow=nrow(in.s2), ncol=ncol(sq.s2)-ncol(in.s2), dimnames=dimnames(in.s2)) )
in.s2 <- cbind(in.s2, tmp)
}
if(!return.common.sites & ncol(in.s2)<ncol(in.s))
{
in.s2 <- cbind(in.s2, in.s[, seq.int(length(in.newcol)+1L,ncol(in.s))])
tmp <- as.DNAbin( matrix('-', nrow=nrow(sq.s2), ncol=ncol(in.s2)-ncol(sq.s2), dimnames=dimnames(sq.s2)) )
sq.s2 <- cbind(sq.s2, tmp)
}
# now merge!
ans <- rbind(in.s2, sq.s2)
# check for duplicates -- should be HXB2
tmp <- which(grepl(regexpr.reference,rownames(ans)))
stopifnot(length(tmp)==2)
ans <- ans[-tmp[2],]
ans
}
#' @export seq.write.dna.phylip.triangular
#' @title Write a triangular phylip file
seq.write.dna.phylip.triangular<- function(wd, file=NA)
{
stopifnot(class(wd)=='matrix')
stopifnot(!is.na(file))
tmp <- paste(sapply(seq_len(nrow(wd)), function(i)
{
z <- sprintf('%.16f',wd[i, 1:i][-i])
if(length(z))
z <- as.vector(rbind(z, rep(1, length(z))))
z <- paste(z, collapse=' ')
paste(rownames(wd)[i],' ',z,sep='')
}), collapse='\n')
cat('\t', paste(nrow(wd), '\n', tmp, sep=''), file=file)
}
#' @import data.table ape ggplot2
#' @export seq.mvr.d.and.v
seq.mvr.d.and.v<- function(tps, seed=42, v.mult=1.2, complete.distance.matrix=TRUE, name.gd.col='GD', reco.opts=c(dim=750, costp_l1=0, costp_l2=0.001, costq_l1=0, costq_l2=0.001, nthread=1, lrate=0.003, niter=120), outfile=NA, verbose=FALSE)
{
require(recosystem)
#reco.opts <- c(dim=500, costp_l1=0, costp_l2=0.01, costq_l1=0, costq_l2=0.01, nthread=1, lrate=0.003, niter=40)
stopifnot( c('TAXA1','TAXA2','ID1','ID2',name.gd.col,'GD_V')%in%colnames(tps) )
# tps <- subset(tp, REP==1 & GENE=='gag+pol+env')
tpc <- subset(tps, select=c('ID1','ID2',name.gd.col))
setnames(tpc, name.gd.col, 'GD')
if(complete.distance.matrix)
{
# add upper triangular
tmp <- copy(tpc)
set(tmp, NULL, 'ID1', tpc[, ID2])
set(tmp, NULL, 'ID2', tpc[, ID1])
tpc <- rbind(tpc, tmp)
# add zero diagonal
tmp <- tpc[, range(ID1)]
tmp <- data.table(ID1= seq.int(tmp[1], tmp[2]), ID2= seq.int(tmp[1], tmp[2]), GD=0)
tpc <- rbind(tpc, tmp)
# setup matrix completion
if(verbose)
cat('\nmatrix completion')
tmp <- subset(tpc, !is.na(GD))
tmp <- data_memory(tmp[,ID1], tmp[,ID2], rating=tmp[,GD], index1=TRUE)
if(!is.na(seed))
set.seed(seed)
r <- Reco()
r$train(tmp, opts=reco.opts)
tpc[, GDp:= r$predict(data_memory(tpc[,ID1], tpc[,ID2], index1=TRUE), out_memory())]
# plot
if(!is.na(outfile))
{
ggplot(subset(tpc, !is.na(GD)), aes(x=GD, y=GDp)) + geom_point(colour='grey80', size=0.5, pch=16) + geom_abline(slope=1, intercept=0)
ggsave(file=paste(outfile, '_', paste(reco.opts,collapse='_'), '.pdf', sep=''), w=7, h=7)
}
if(verbose)
cat('\ngenerating completed distance matrix d')
# fill in distance matrix
tpc[, GDf:= GD]
tmp <- tpc[, which(is.na(GDf))]
set(tpc, tmp, 'GDf', tpc[tmp, GDp])
# convert to matrix (not necessarily symmetric)
#tmp <- dcast.data.table( subset(tpc, ID1<20 & ID2<20, select=c(ID1,ID2,GDf)), ID1~ID2, value.var='GDf' )
tmp <- dcast.data.table( subset(tpc, select=c(ID1,ID2,GDf)), ID1~ID2, value.var='GDf' )
d <- as.matrix(tmp[, -1, with=FALSE])
rownames(d) <- colnames(d)
# make symmetric
d <- (d+t(d))/2
stopifnot( all(d>=0) )
}
if(!complete.distance.matrix)
{
if(verbose)
cat('\ngenerating incomplete distance matrix d')
tmp <- dcast.data.table(tpc, ID1~ID2, value.var='GD')
d <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
d <- rbind(d, NA_real_)
colnames(d)[1] <- setdiff( as.character(tmp[, ID1]), colnames(d) )
rownames(d) <- colnames(d)
diag(d) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
tmp <- upper.tri(d) & is.na(d)
d[tmp] <- t(d)[tmp]
d[is.nan(d)] <- NA_real_
}
# some rows/cols may have NAs only -- remove these as the matrix completion problem is ill-specified for these
tmp <- subset(subset(tpc, is.na(GD))[, list(GDM=length(GD)), by='ID1'], GDM==nrow(d)-1) #subtract one since diagonal is zero
tmp <- setdiff(rownames(d), tmp[, as.character(ID1)] )
d <- d[tmp, tmp]
# clean up
tpc <- r <- NULL
gc()
#
# generate variance matrix
#
if(verbose)
cat('\ngenerating variance matrix v')
tmp <- dcast.data.table(tps, ID1~ID2, value.var='GD_V')
v <- cbind(NA_real_, as.matrix(tmp[, -1, with=FALSE]))
v <- rbind(v, NA_real_)
colnames(v)[1] <- setdiff( as.character(tmp[, ID1]), colnames(v) )
rownames(v) <- colnames(v)
diag(v) <- 0
# complete lower triangular from upper triangular and vice versa
tmp <- lower.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
tmp <- upper.tri(v) & is.na(v)
v[tmp] <- t(v)[tmp]
# set missing variances to large default
v[is.na(v)] <- max(v, na.rm=TRUE)*v.mult
v <- v[rownames(d),colnames(d)]
stopifnot( all(v>=0) )
#
# reset names
#
if(verbose)
cat('\nsetting taxon names')
tmp <- subset( tps, select=c(TAXA1, ID1) )
setnames(tmp, c('TAXA1','ID1'), c('TAXA2','ID2') )
tmp <- unique(rbind( tmp, subset( tps, select=c(TAXA2, ID2) ) ))
setnames(tmp, c('TAXA2','ID2'), c('TAXA','ID') )
tmp <- merge(tmp, data.table(ID=as.integer(rownames(d))), by='ID')
setkey(tmp, ID)
rownames(d) <- tmp[, TAXA]
colnames(d) <- tmp[, TAXA]
rownames(v) <- tmp[, TAXA]
colnames(v) <- tmp[, TAXA]
#
# return
#
list(d=as.dist(d), v=as.dist(v))
}
#' @export seq.strip.gap
#' @title Strip gaps
seq.strip.gap<- function(seq.DNAbin.mat, strip.max.len=NA, strip.pc=NA, gap.chars='-')
{
stopifnot( !is.na(strip.max.len)&is.na(strip.pc) | is.na(strip.max.len)&!is.na(strip.pc), is.na(strip.pc)|strip.pc<1, is.na(strip.pc)|strip.pc>0, is.na(strip.max.len)|strip.max.len>0)
seq.DNAbin.mat <- as.character(seq.DNAbin.mat)
if(length(gap.chars)==1)
tmp <- apply(seq.DNAbin.mat,2,function(x) length(which(x==gap.chars)))
if(length(gap.chars)>1)
tmp <- apply(seq.DNAbin.mat,2,function(x) length(which(x%in%gap.chars)))
tmp <- tmp/nrow(seq.DNAbin.mat)
if(!is.na(strip.pc))
return(as.DNAbin(seq.DNAbin.mat[,tmp<strip.pc,drop=FALSE]))
if(!is.na(strip.max.len))
for(strip.pc in rev(seq(0.01,1.00,0.01)))
{
z <- which(tmp<strip.pc)
if(length(z)<=strip.max.len)
{
cat('\nStripping gaps at pc=',strip.pc)
return(as.DNAbin(seq.DNAbin.mat[,z,drop=FALSE]))
}
}
}
#' @export seq.find
#' @title Find the row numbers in a sequence matrix with a particular sequence pattern
seq.find<- function(char.matrix, pos0= NA, from= c(), verbose=1)
{
if(is.na(pos0)) stop("start position of token to be replaced is missing")
if(!length(from)) stop("token to be replaced is missing")
query.colidx <- seq.int(pos0,pos0+length(from)-1)
query.yes <- which( apply(char.matrix, 1, function(x) all(x[query.colidx]==from) ) )
query.yes
}
#' @export seq.unique
#' @title Compute a subset of unique sequences
seq.unique<- function(seq.DNAbin.matrix)
{
x<- as.character(seq.DNAbin.matrix)
x<- apply(x, 1, function(z) paste(z,collapse=''))
seq.DNAbin.matrix[!duplicated(x),]
}
#' @export seq.blast
#' @title Perform online BLAST search
#' @description see also package 'RFLPtools'
seq.blast<- function (x, database = "nr", hitListSize = "10", filter = "L", expect = "10", program = "blastn", organism= "HIV-1")
{
baseUrl <- "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"
query <- paste("QUERY=", as.character(x), "&DATABASE=", database, "&ORGANISM=", organism,
"&HITLIST_SIZE=", hitListSize, "&FILTER=", filter, "&EXPECT=",
expect, "&PROGRAM=", program, sep = "")
url0 <- sprintf("%s?%s&CMD=Put", baseUrl, query)
results <- tempfile()
Sys.sleep(5)
require(XML)
post <- htmlTreeParse(url0, useInternalNodes = TRUE)
x <- post[["string(//comment()[contains(., \"QBlastInfoBegin\")])"]]
rid <- sub(".*RID = ([[:alnum:]]+).*", "\\1", x)
rtoe <- as.integer(sub(".*RTOE = ([[:digit:]]+).*", "\\1",
x))
url1 <- sprintf("%s?RID=%s&FORMAT_TYPE=XML&CMD=Get", baseUrl,
rid)
Sys.sleep(rtoe)
result <- annotate:::.tryParseResult(url1)
qseq <- xpathApply(result, "//Hsp_qseq", xmlValue)
hseq <- xpathApply(result, "//Hsp_hseq", xmlValue)
require(Biostrings)
res <- list()
for (i in seq_len(length(qseq))) {
res[i] <- DNAMultipleAlignment(c(hseq[[i]], qseq[[i]]),
rowmask = as(IRanges(), "NormalIRanges"), colmask = as(IRanges(),
"NormalIRanges"))
}
res
}
#' @export seq.blast.read
#' @title Read BLAST output
#' @description slight modification of read.blast() in pkg RFLPtools; expects blast was run with -outfmt 6
seq.blast.read<- function (file, sep = "\t")
{
require(data.table)
x <- read.table(file = file, header = FALSE, sep = sep, quote = "\"", dec = ".", fill = TRUE, comment.char = "", stringsAsFactors = FALSE)
if (ncol(x) != 12)
stop("Data in given file", basename(file), "has wrong dimension!")
names(x) <- c("query.id", "subject.id", "identity", "alignment.length","mismatches", "gap.opens", "q.start", "q.end", "s.start","s.end", "evalue", "bit.score")
data.table(x, key="query.id")
}
#' @export seq.rm.drugresistance
#' @title Remove drug resistance mutations
seq.rm.drugresistance<- function(seq, outfile=NA)
{
require(data.table)
stopifnot( any(rownames(seq)=='HXB2') ) # expect HXB2 as reference in alignment
load(system.file(package="big.phylo", 'AC_drugresistance_201508.rda'))
load(system.file(package="big.phylo", 'refseq_hiv1_hxb2.rda'))
hxb2 <- paste(hxb2[, HXB2.K03455], collapse='')
seq.hxb2 <- paste(as.character(seq['HXB2',]),collapse='')
# check that HXB2 in alignment is HXB2
seq.hxb2.ng <- gsub('-+','',seq.hxb2)
stopifnot(nchar(seq.hxb2.ng)<=nchar(hxb2)) # expect HXB2 in alignment is part of true HXB2
seq.hxb2.st <- as.integer(regexpr(substr(seq.hxb2.ng,1,20),hxb2))
stopifnot(seq.hxb2.st>0) # expect HXB2 in alignment is part of true HXB2
stopifnot(nchar(seq.hxb2.ng)+seq.hxb2.st-1L<=nchar(hxb2)) # expect HXB2 in alignment is part of true HXB2
stopifnot( seq.hxb2.ng==substr(hxb2, seq.hxb2.st, nchar(seq.hxb2.ng)+seq.hxb2.st-1L) )
# find coordinates of real HXB2 in alignment
seq.hxb2.pos <- c()
tmp <- gregexpr('-+',seq.hxb2)[[1]]
if(tmp[1]>-1)
{
seq.hxb2.pos<- data.table(GP_ST= as.integer(tmp), GP_LEN= attr(tmp,'match.length'))
seq.hxb2.pos<- seq.hxb2.pos[, list(GP_POS= seq.int(GP_ST, length=GP_LEN)), by='GP_ST'][, GP_POS]
}
seq.hxb2.pos <- data.table(HXB2INSEQ_POS= setdiff(seq_len(nchar(seq.hxb2)), seq.hxb2.pos))
seq.hxb2.pos[, HXB2_POS:= seq_len(nrow(seq.hxb2.pos))]
set(seq.hxb2.pos, NULL, 'HXB2_POS', seq.hxb2.pos[, HXB2_POS+seq.hxb2.st-1L])
stopifnot( seq.hxb2.pos[, tail(HXB2_POS,1)]<=nchar(hxb2) )
# merge with dr
setnames(dr, 'HXB2.pos', 'HXB2_POS')
dr <- merge(dr, seq.hxb2.pos, by='HXB2_POS')
setnames(dr, 'HXB2INSEQ_POS', 'Alignment.nuc.pos')
tmp <- seq.rm.drugresistance.internal(as.character(seq), dr, verbose=1, rtn.DNAbin=1 )
dr.info <- tmp$nodr.info
nodr.seq <- tmp$nodr.seq
#
# save
#
if(!is.na(outfile))
save(seq, nodr.seq, dr.info, file=outfile)
list(nodr.info=dr.info, nodr.seq=nodr.seq)
}
seq.rm.drugresistance.internal<- function(char.matrix, dr, verbose=1, rtn.DNAbin=1)
{
if(verbose) cat(paste("\nchecking for potential drug resistance mutations, n=",nrow(dr)))
nodr.info <- dr[, {
query.yes <- seq.find(char.matrix, Alignment.nuc.pos, unlist(strsplit(unlist(Mutant.NTs),'')))
if(length(query.yes))
{
ans <- rownames(char.matrix)[query.yes]
#print( char.matrix[query.yes, seq.int(dr[i,Alignment.nuc.pos]-3, length.out=9)] ); stop()
}
if(!length(query.yes))
ans <- NA_character_
list(TAXA=ans)
}, by=c('HXB2_POS','DR.name','Gene.codon.number','Wild.type','Mutant.NTs','Alignment.nuc.pos')]
nodr.info <- subset(nodr.info, !is.na(TAXA))
for(i in nodr.info[, sort(unique(Alignment.nuc.pos))])
{
tmp <- subset(nodr.info, Alignment.nuc.pos==i)[, TAXA]
stopifnot(length(tmp)>0)
cat(paste('\nsetting at pos',i,'to nnn for taxa, n=', length(tmp)))
char.matrix[tmp, seq.int(i, length.out=3) ]<- matrix("n", nrow=length(tmp), ncol=3)
}
if(rtn.DNAbin)
char.matrix <- as.DNAbin(char.matrix)
list(nodr.seq=char.matrix, nodr.info=nodr.info)
}
#' @export seq.dist
#' @title Compute pairwise distance matrix
seq.dist<- function(seq.DNAbin.matrix, verbose=1)
{
if(0)
{
require(ape)
ans<- dist.dna(seq.DNAbin.matrix, model="raw", as.matrix=1)
}
if(1)
{
library(bigmemory)
options(bigmemory.typecast.warning=FALSE)
big.matrix.charmax<- 127
dummy <- 0
ans<- big.matrix(nrow(seq.DNAbin.matrix),nrow(seq.DNAbin.matrix), dimnames=list(rownames(seq.DNAbin.matrix),c()), type="char", init=NA)
if(nrow(seq.DNAbin.matrix)>5)
{
ans[1,1:6]<- 2^seq.int(6,11)-1
if(is.na(ans[1,2])) stop("unexpected behaviour of bigmemory")
ans[1,1:6]<- NA
}
for(i1 in seq.int(1,nrow(seq.DNAbin.matrix)-1))
{
seq1<- seq.DNAbin.matrix[i1,]
time<- system.time (
tmp <- 1 - sapply(seq.int(i1+1,nrow(seq.DNAbin.matrix)),function(i2){ .C("hivc_dist_ambiguous_dna", seq1, seq.DNAbin.matrix[i2,], ncol(seq1), dummy )[[4]] })
)[3]
if(verbose) cat(paste("\ncompute distance of row",i1,"entries",nrow(seq.DNAbin.matrix)-i1,"took",time))
tmp <- round(tmp*1e3,d=0)
tmp[tmp>big.matrix.charmax] <- big.matrix.charmax
ans[i1, seq.int(i1+1,nrow(seq.DNAbin.matrix))] <- tmp
}
}
ans
}
#' @export seq.replace
#' @title Replace nucleotide codes in sequence alignment
seq.replace<- function(seq.DNAbin.matrix, code.from='?', code.to='n', verbose=0)
{
seq.DNAbin.matrix <- as.character(seq.DNAbin.matrix)
seq.DNAbin.matrix <- apply(seq.DNAbin.matrix, 2, function(col) gsub(code.from,code.to,col,fixed=1) )
as.DNAbin( seq.DNAbin.matrix )
}
#' @export seq.rmgaps
#' @title Remove gaps in sequence alignment
seq.rmgaps<- function(seq.DNAbin.matrix, rm.only.col.gaps=1, verbose=0)
{
seq.DNAbin.matrix <- as.character(seq.DNAbin.matrix)
if(!rm.only.col.gaps)
{
if(is.matrix(seq.DNAbin.matrix))
{
tmp <- lapply(seq_len(nrow(seq.DNAbin.matrix)), function(i){ seq.DNAbin.matrix[i, seq.DNAbin.matrix[i,]!="-" & seq.DNAbin.matrix[i,]!="?"] })
names(tmp) <- rownames(seq.DNAbin.matrix)
}
else
{
tmp <- lapply(seq_along(seq.DNAbin.matrix), function(i){ seq.DNAbin.matrix[[i]][ seq.DNAbin.matrix[[i]]!="-" & seq.DNAbin.matrix[[i]]!="?"] })
names(tmp) <- names(seq.DNAbin.matrix)
}
seq.DNAbin.matrix <- tmp
}
else
{
nogap <- which( !apply(seq.DNAbin.matrix,2,function(x) all(x=="-" | x=="?")) )
if(verbose) cat(paste("\nremove gaps, n=",ncol(seq.DNAbin.matrix)-length(nogap)))
seq.DNAbin.matrix <- seq.DNAbin.matrix[,nogap]
}
as.DNAbin( seq.DNAbin.matrix )
}
#' @export seq.read.GenBank
#' @title Read sequences from GenBank
seq.read.GenBank<- function (access.nb, seq.names = access.nb, species.names = TRUE, gene.names = FALSE, as.character = FALSE, attributes= c("origin"))
{
require(ape)
N <- length(access.nb)
nrequest <- N%/%400 + as.logical(N%%400)
X <- character(0)
for (i in 1:nrequest) {
a <- (i - 1) * 400 + 1
b <- 400 * i
if (i == nrequest)
b <- N
URL <- paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=",
paste(access.nb[a:b], collapse = ","), "&rettype=gb&retmode=text",
sep = "")
X <- c(X, scan(file = URL, what = "", sep = "\n", quiet = TRUE))
}
cat(X)
FI <- grep("^ {0,}ORIGIN", X) + 1
LA <- which(X == "//") - 1
obj <- vector("list", N)
for (i in 1:N) {
tmp <- gsub("[[:digit:] ]", "", X[FI[i]:LA[i]])
obj[[i]] <- unlist(strsplit(tmp, NULL))
}
names(obj) <- seq.names
if (!as.character)
obj <- as.DNAbin(obj)
if(length(attributes))
{
attr.lines <- lapply(attributes, function(attr) grep(paste("^ {0,}/",attr,sep=''), X) )
attr.fields <- lapply(seq_along(attr.lines),function(i) gsub("\"$","",gsub(paste("^ {0,}/",attributes[i],"=\"",sep=''),"",X[attr.lines[[i]]])) )
for(i in seq_along(attributes))
attr(obj, attributes[[i]])<- attr.fields[[i]]
}
if (gene.names) {
tmp <- character(N)
sp <- grep(" +gene +<", X)
for (i in 1:N) tmp[i] <- unlist(strsplit(X[sp[i + 1L]],
" +/gene=\""))[2]
attr(obj, "gene") <- gsub("\"$", "", tmp)
}
obj
}
#' @export seq.create.referencepairs
seq.create.referencepairs<- function(dir.name= DATA)
{
if(0) #generate ATHENA_2013_hptn052.rda
{
gban <- c( paste("JN",seq.int(247047,247075),sep=''),paste("JN",seq.int(634296,634492),sep='') )
tmp <- hivc.read.GenBank(gban, as.character=0, attributes= c("isolate","country","collection_date"))
file <- "ATHENA_2013_hptn052.fa"
write.dna(tmp, paste(DATA,file,sep='/'), format = "fasta" )
cmd <- hiv.cmd.clustalo(paste(dir.name,"tmp",sep='/'), file, signat='', outdir=paste(dir.name,"tmp",sep='/'))
cat(cmd)
if(0) system(cmd)
file <- paste(DATA,"tmp/ATHENA_2013_hptn052.fa.clustalo",sep='/')
hptn052 <- read.dna(file, format = "fasta" )
rownames(hptn052) <- attr(tmp,"isolate")
file <- paste(DATA,"tmp/ATHENA_2013_hptn052.rda",sep='/')
save(hptn052, file= file)
}
file<- paste(DATA,"tmp/ATHENA_2013_hptn052.rda",sep='/')
load(file)
#select control sequences and compute pairwise distances between them
hptn052.control <- grep("control", rownames(hptn052))
hptn052.control.d <- hivc.pwdist( hptn052[hptn052.control,] )
hptn052.control.d <- hptn052.control.d[ upper.tri(hptn052.control.d) ]
#select positive sequences and compute pairwise distance between them
hptn052.sdc <- c(grep("A", rownames(hptn052)), grep("B", rownames(hptn052)))
hptn052.sdc <- hptn052[hptn052.sdc,]
hptn052.sdc.ind <- sapply(strsplit(rownames(hptn052.sdc),'-'), function(x)
{
as.numeric(x[2]) + ifelse(x[3]=='I',0,0.5)
})
hptn052.sdc.ind <- sapply( unique( hptn052.sdc.ind ), function(x){ hptn052.sdc[hptn052.sdc.ind==x,] })
hptn052.sdc.ind <- lapply( which( sapply(hptn052.sdc.ind,nrow)==2 ), function(i) hptn052.sdc.ind[[i]] )
hptn052.sdc.ind.d <- sapply( hptn052.sdc.ind, function(x) hivc.pwdist(x)[1,2] )
list( gd.islink= hptn052.sdc.ind.d, gd.unlinked= hptn052.control.d )
}
#' @export seq.write.dna.nexus
#' @title Write a NEXUS file
seq.write.dna.nexus<- function(seq.DNAbin.mat, ph=NULL, file=NULL, nexus.format="DNA",nexus.gap='-', nexus.missing='?', nexus.interleave="NO")
{
tmp <- cbind( rownames(seq.DNAbin.mat), apply( as.character( seq.DNAbin.mat ), 1, function(x) paste(x,sep='',collapse='') ) )
tmp <- apply(tmp, 1, function(x) paste(x, collapse='\t', sep=''))
tmp <- paste(tmp, collapse='\n',sep='')
header <- paste( "#NEXUS\nBEGIN DATA;\nDIMENSIONS NTAX=",nrow(seq.DNAbin.mat)," NCHAR=",ncol(seq.DNAbin.mat),";\nFORMAT DATATYPE=",nexus.format," MISSING=",nexus.missing," GAP=",nexus.gap," INTERLEAVE=",nexus.interleave,";\nMATRIX\n", collapse='',sep='')
tmp <- paste(header, tmp, "\n;\nEND;\n", sep='')
if(!is.null(ph))
{
tmp <- paste(tmp,"#BEGIN TAXA;\nTAXLABELS ", paste(ph$tip.label, sep='',collapse=' '), ';\nEND;\n\n',sep='')
tmp <- paste(tmp, "BEGIN TREES;\nTREE tree1 = ", write.tree(ph), "\nEND;\n", sep='')
}
if(!is.null(file))
cat(tmp, file=file)
tmp
}
#' @export seq.length
#' @title Compute the length of sequences
seq.length<- function(seq.DNAbin.mat, exclude=c('-','?'))
{
counts <- sapply(seq_len(nrow(seq.DNAbin.mat)), function(i) base.freq(seq.DNAbin.mat[i,], freq=1, all=1))
len <- apply(counts[ !rownames(counts)%in%exclude, ],2,sum)
names(len) <- rownames(seq.DNAbin.mat)
len
}
#' @export seq.proportion.ambiguous
#' @title Compute the proportion of ambiguous nucleotides
seq.proportion.ambiguous<- function(seq.DNAbin.mat, exclude=c('-','?'))
{
counts <- sapply(seq_len(nrow(seq.DNAbin.mat)), function(i) base.freq(seq.DNAbin.mat[i,], freq=1, all=1))
len <- apply(counts[ !rownames(counts)%in%exclude, ],2,sum)
pa <- apply(counts[c("r", "m", "w", "s", "k", "y", "v", "h", "d", "b"),],2,sum)
pa/len
}
#' @export seq.gc.content
#' @title Compute the GC content
seq.gc.content<- function(seq.DNAbin.mat)
{
rna.gc.fraction.n <- c('a','c','g','t', 'r','m','w','s', 'k','y','v','h', 'd','b','n','-','?')
rna.gc.fraction <- c( 0, 1, 1, 0, 0.5, 0.5, 0, 1, 1/2, 1/2, 2/3, 1/3, 1/3,2/3, 1/4, 0, 0) #this fraction assumes that U is synonymous with T and that U does not occur in the code
rna.gc.sum <- c( T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F )
counts <- apply(seq.DNAbin.mat,1,function(x) base.freq(x, freq=1, all=1))
apply(counts*rna.gc.fraction,2,sum) / apply(counts[rna.gc.sum,],2,sum)
}
######################################################################################
hivc.clu.polyphyletic.clusters<- function(cluphy.df, cluphy.subtrees=NULL, ph=NULL, clustering=NULL, verbose=1, plot.file=NA, pdf.scaley=25, pdf.xlim=NULL, cex.nodelabel=0.2, cex.tiplabel=0.2, adj.tiplabel= c(-0.15,0.5))
{
if(!is.null(ph) && !is.null(clustering))
{
#get node corresponding to index of selected clusters
cluphy.cluidx <- clustering[["clu.idx"]][ unique( cluphy.df[,cluster] ) ]
if(any(is.na(cluphy.cluidx))) stop("unexcpected NA in cluphy.cluidx")
#get selected clusters into single phylogeny
cluphy.subtrees <- lapply(cluphy.cluidx, function(x) extract.clade(ph, x, root.edge= 1, interactive = FALSE) )
names(cluphy.subtrees) <- unique( cluphy.df[,cluster] )
}
else if(is.null(cluphy.subtrees))
stop("expect either ('ph' and 'clustering') or 'cluphy.subtrees' as input")
cluphy <- eval(parse(text=paste('cluphy.subtrees[[',seq_along(cluphy.subtrees),']]', sep='',collapse='+')))
print(cluphy)
plot.coordinates <- NULL
#plot selected clusters
if(!is.na(plot.file))
{
cluphy.tiplabels <- hivc.clu.get.tiplabels( cluphy, copy(cluphy.df) )
if(verbose) cat(paste("\nwrite tree to file",plot.file))
color.except.rootedge <- rep(1, Nnode(cluphy, internal.only=F))
color.except.rootedge[Ntip(cluphy)+1] <- NA
plot.coordinates <- hivc.clu.plot(cluphy, color.except.rootedge, file=plot.file, pdf.scaley=pdf.scaley, pdf.off=0, pdf.xlim=pdf.xlim, cex.nodelabel=cex.nodelabel )
hivc.clu.plot.tiplabels( seq_len(Ntip(cluphy)), cluphy.tiplabels$text, cluphy.tiplabels$col, cex=cex.tiplabel, adj=adj.tiplabel, add.xinch=0, add.yinch=0 )
dev.off()
}
list(cluphy=cluphy, cluphy.subtrees=cluphy.subtrees, plot.coordinates=plot.coordinates)
}
######################################################################################
hivc.clu.get.tiplabels<- function(ph, df.info, col.notmsm="#4EB3D3", col.Early="#EF9708", col.highVL="#FEE391", col.AfterTreat="#D4B9DA", col.green="#D9F0A3", col.latePres="#FA9FB5",
select=c("CountryInfection","Trm","Sex","isAcute","lRNA.early","NegT","AnyPos_T1","PosSeqT","lRNA.hb4tr_LT","lRNA_bTS","lRNA_TS","lRNA_aTS","lRNAi_bTS","lRNAi_aTS","AnyT_T1","TrImo_bTS","TrImo_aTS","PosCD4_T1","CD4_T1","CD4_bTS","CD4_TS","CD4_aTS","Patient","RegionHospital") )
{
require(colorspace)
require(RColorBrewer)
#
# PATIENT
#
#set colors CountryInfection
tmp <- rep("transparent",nrow(df.info))
df.info[,CountryInfection.col:=tmp]
set(df.info, which(df.info[,!is.na(CountryInfection) & CountryInfection!="NL"]), "CountryInfection.col", col.notmsm)
#set colors Patient
tmp <- unique( df.info[,Patient] )
#tmp2 <- diverge_hcl(length(tmp), h = c(246, 40), c = 96, l = c(85, 90))
tmp <- data.table(Patient=tmp, Patient.col="transparent", key="Patient")
df.info <- merge( df.info, tmp, all.x=1, by="Patient" )
#set colors Sex
df.info[,Sex.col:="transparent"]
set(df.info, which(df.info[,!is.na(Sex) & Sex!="M"]), "Sex.col", col.notmsm)
#set colors MSM or BI
df.info[,Trm.col:="transparent"]
set(df.info, which(df.info[,!is.na(Trm) & Trm!="MSM"]), "Trm.col", col.notmsm)
#set colors RegionHospital
tmp <- levels( df.info[,RegionHospital] )
tmp2 <- brewer.pal(length(tmp), "Dark2")
tmp <- data.table(RegionHospital=tmp, RegionHospital.col=tmp2, key="RegionHospital")
df.info <- merge( df.info, tmp, all.x=1, by="RegionHospital" )
#set colors isAcute
df.info[,isAcute.col:="transparent"]
set(df.info, which(df.info[,isAcute%in%c("Yes","Maybe")]), "isAcute.col", col.Early)
#
# TIMELINE
#
# set transparent colors: PosSeqT NegT AnyPos_T1
df.info[, NegT.col:="transparent"]
df.info[, AnyPos_T1.col:="transparent"]
#
# TREATMENT
#
# set AnyT_T1.col if PosSeqT<AnyT_T1 orange else pink
df.info[, AnyT_T1.col:="transparent"]
select.seqb4tr <- which(df.info[, !is.na(PosSeqT) & !is.na(AnyT_T1) & PosSeqT<=AnyT_T1])
select.seqatr <- which(df.info[, !is.na(PosSeqT) & !is.na(AnyT_T1) & PosSeqT>AnyT_T1])
set(df.info, select.seqb4tr, "AnyT_T1.col", col.Early)
set(df.info, select.seqatr, "AnyT_T1.col", col.AfterTreat)
df.info[, TrImo_bTS.col:="transparent"]
df.info[, TrImo_aTS.col:="transparent"]
set(df.info, which(df.info[, TrImo_aTS>4]), "TrImo_aTS.col", col.AfterTreat)
set(df.info, which(df.info[, TrImo_bTS>4]), "TrImo_bTS.col", col.AfterTreat)
df.info[, PosSeqT.col:="transparent"]
set(df.info, select.seqb4tr, "PosSeqT.col", col.Early)
set(df.info, select.seqatr, "PosSeqT.col", col.AfterTreat)
#
# VIRAL LOAD
#
df.info[, lRNA_bTS.col:="transparent"]
set(df.info, which(df.info[,lRNA_bTS>5]), "lRNA_bTS.col", col.highVL)
df.info[, lRNA_TS.col:="transparent"]
set(df.info, which(df.info[,lRNA_TS>5]), "lRNA_TS.col", col.highVL)
df.info[, lRNA_aTS.col:="transparent"]
set(df.info, which(df.info[,lRNA_aTS>3.5]), "lRNA_aTS.col", col.highVL)
df.info[, lRNAi_bTS.col:="transparent"]
set(df.info, which(df.info[,lRNAi_bTS>0.75]), "lRNAi_bTS.col", col.highVL)
df.info[, lRNAi_aTS.col:="transparent"]
set(df.info, which(df.info[,lRNAi_aTS>0.25]), "lRNAi_aTS.col", col.highVL)
df.info[, lRNA.hb4tr_LT.col:="transparent"]
set(df.info, which(df.info[,!is.na(lRNA.hb4tr_LT) & PosSeqT<=lRNA.hb4tr_LT]), "lRNA.hb4tr_LT.col", col.highVL)
df.info[, lRNA.early.col:="transparent"]
set(df.info, which(df.info[, lRNA.early]), "lRNA.early.col", col.Early)
#
# CD4
#
df.info[, CD4_T1.col:="transparent"]
set(df.info, which(df.info[,CD4_T1>350]), "CD4_T1.col", col.green)
df.info[, PosCD4_T1.col:="transparent"]
set(df.info, which(df.info[,CD4_T1>350]), "PosCD4_T1.col", col.green)
df.info[, CD4_bTS.col:="transparent"]
set(df.info, which(df.info[,CD4_bTS>350]), "CD4_bTS.col", col.green)
df.info[, CD4_TS.col:="transparent"]
set(df.info, which(df.info[,CD4_TS>350]), "CD4_TS.col", col.green)
df.info[, CD4_aTS.col:="transparent"]
set(df.info, which(df.info[,CD4_aTS>350]), "CD4_aTS.col", col.green)
#
# color late presenter
#
tmp<- which(df.info[,CD4_T1<350])
set(df.info, tmp, "CD4_T1.col", col.latePres)
set(df.info, tmp, "PosCD4_T1.col", col.latePres)
set(df.info, tmp, "lRNA_bTS.col", col.latePres)
set(df.info, tmp, "lRNAi_bTS.col", col.latePres)
#
# convert time to string & handle inaccurate NegT or AnyPos_T1
#
tmp <- which(df.info[, as.POSIXlt(NegT)$mday==1 & as.POSIXlt(NegT)$mon==0]) #since NegT were reset, these are the ones with inaccurate month
set(df.info,NULL, "NegT", as.character( df.info[,NegT], "%y.%m" ))
set(df.info, tmp, "NegT", paste(df.info[tmp,substr(NegT,1,3)],'??',sep='') )
tmp <- which(df.info[, as.POSIXlt(AnyPos_T1)$mday==31 & as.POSIXlt(AnyPos_T1)$mon==11]) #since AnyPos_T1 were reset, these are the ones with inaccurate month
set(df.info,NULL, "AnyPos_T1", as.character( df.info[,AnyPos_T1], "%y.%m" ))
set(df.info, tmp, "AnyPos_T1", paste(df.info[tmp,substr(AnyPos_T1,1,3)],'??',sep='') )
set(df.info,NULL, "PosSeqT", as.character( df.info[,PosSeqT], "%y.%m" ))
set(df.info,NULL, "lRNA.hb4tr_LT",as.character( df.info[,lRNA.hb4tr_LT], "%y.%m" ))
set(df.info,NULL, "PosCD4_T1", as.character( df.info[,PosCD4_T1], "%y.%m" ))
set(df.info,NULL, "AnyT_T1", as.character( df.info[,AnyT_T1], "%y.%m" ))
#
# set isAcute to either Y or M or N
set(df.info,NULL,"isAcute", as.character( df.info[,isAcute]))
set(df.info,which(df.info[,isAcute=="No"]),"isAcute",'N')
set(df.info,which(df.info[,isAcute=="Maybe"]),"isAcute",'M')
set(df.info,which(df.info[,isAcute=="Yes"]),"isAcute",'Y')
# set lRNA.early to either HVLE or ----
set(df.info,NULL,"lRNA.early", as.character( df.info[, lRNA.early]))
set(df.info,which(df.info[,lRNA.early=="TRUE"]),"lRNA.early","Y")
set(df.info,which(df.info[,lRNA.early=="FALSE"]),"lRNA.early","N")
# set lRNAi_bTS lRNAi_aTS
set(df.info,NULL,"lRNAi_bTS", as.character( round(df.info[, lRNAi_bTS],d=2)))
set(df.info,NULL,"lRNAi_aTS", as.character( round(df.info[, lRNAi_aTS],d=2)))
# set TrImo_bTS TrImo_aTS
set(df.info,NULL,"TrImo_bTS", as.character( round(df.info[, TrImo_bTS],d=1)))
set(df.info,NULL,"TrImo_aTS", as.character( round(df.info[, TrImo_aTS],d=1)))
# set CD4_T1
set(df.info,NULL,"CD4_T1", as.character( round(df.info[, CD4_T1],d=0)))
# set CD4_TS CD4_bTS CD4_aTS
set(df.info,NULL,"CD4_TS", as.character( df.info[, CD4_TS]))
set(df.info,NULL,"CD4_bTS", as.character( df.info[, CD4_bTS]))
set(df.info,NULL,"CD4_aTS", as.character( df.info[, CD4_aTS]))
# set 'Amst' to 'A'
set(df.info,NULL,"RegionHospital", as.character( df.info[,RegionHospital]))
set(df.info,which(df.info[,RegionHospital=="Amst"]),"RegionHospital",'A')
#
# handle missing entries -- ensure that alignment is OK
#
setkey(df.info, Patient)
tmp <- which(is.na(df.info[,Patient]))
set(df.info, tmp, "Patient", '')
set(df.info, tmp, "Patient.col", "transparent")
tmp <- which(is.na(df.info[,RegionHospital]))
set(df.info, tmp, "RegionHospital", '-')
set(df.info, tmp, "RegionHospital.col", "transparent")
tmp <- which(is.na(df.info[,CountryInfection]))
set(df.info, tmp, "CountryInfection", "--")
set(df.info, tmp, "CountryInfection.col", "transparent")
tmp <- which(is.na(df.info[,Trm]))
set(df.info, tmp, "Trm", '--')
set(df.info, tmp, "Trm.col", "transparent")
tmp <- which(is.na(df.info[,Sex]))
set(df.info, tmp, "Sex", '-')
set(df.info, tmp, "Sex.col", "transparent")
set(df.info, which(df.info[,is.na(isAcute)]), "isAcute", '-')
tmp <- which(is.na(df.info[,AnyPos_T1]))
set(df.info, tmp, "AnyPos_T1", "--.--")
set(df.info, tmp, "AnyPos_T1.col", "transparent")
tmp <- which(is.na(df.info[,NegT]))
set(df.info, tmp, "NegT", "--.--")
set(df.info, tmp, "NegT.col", "transparent")
tmp <- which(is.na(df.info[,PosSeqT]))
set(df.info, tmp, "PosSeqT", "--.--")
set(df.info, tmp, "PosSeqT.col", "transparent")
tmp <- which(is.na(df.info[,lRNA.hb4tr_LT]))
set(df.info, tmp, "lRNA.hb4tr_LT", "--.--")
tmp <- which(is.na(df.info[,PosCD4_T1]))
set(df.info, tmp, "PosCD4_T1", "--.--")
tmp <- which(is.na(df.info[,AnyT_T1]))
set(df.info, tmp, "AnyT_T1", "--.--")
set(df.info, which(df.info[,is.na(TrImo_aTS)]), "TrImo_aTS", '-')
set(df.info, which(df.info[,is.na(TrImo_bTS)]), "TrImo_bTS", '-')
set(df.info, which(df.info[,is.na(CD4_T1)]), "CD4_T1", '---')
set(df.info, which(df.info[,is.na(PosCD4_T1)]), "PosCD4_T1", '---')
set(df.info, which(df.info[,is.na(CD4_TS)]), "CD4_TS", '---')
set(df.info, which(df.info[,is.na(CD4_bTS)]), "CD4_bTS", '---')
set(df.info, which(df.info[,is.na(CD4_aTS)]), "CD4_aTS", '---')
#
# add suffixes
#
set(df.info,NULL,"AnyPos_T1", paste("d:",df.info[,AnyPos_T1],sep=''))
set(df.info,NULL,"NegT", paste("n:",df.info[,NegT],sep=''))
set(df.info,NULL,"PosSeqT", paste("s:",df.info[,PosSeqT]," ",sep=''))
set(df.info,NULL,"lRNA.hb4tr_LT", paste("VLeh:",df.info[,lRNA.hb4tr_LT],sep=''))
set(df.info,NULL,"AnyT_T1", paste("TR+:",df.info[,AnyT_T1],sep=''))
set(df.info,NULL,"TrImo_bTS", paste("TR-:",df.info[,TrImo_bTS],sep=''))
set(df.info,NULL,"TrImo_aTS", paste(":",df.info[,TrImo_aTS]," ",sep=''))
set(df.info,NULL,"isAcute", paste("AC:",df.info[,isAcute],sep=''))
set(df.info,NULL,"lRNA.early", paste(":",df.info[,lRNA.early]," ",sep=''))
set(df.info,NULL,"CountryInfection",paste("inf:",df.info[,CountryInfection],sep=''))
set(df.info,NULL,"RegionHospital", paste("",df.info[,RegionHospital],sep=''))
set(df.info,NULL,"lRNA_bTS", paste("VL:",df.info[,lRNA_bTS],sep=''))
set(df.info,NULL,"lRNA_TS", paste(":",df.info[,lRNA_TS],sep=''))
set(df.info,NULL,"lRNA_aTS", paste(":",df.info[,lRNA_aTS],sep=''))
set(df.info,NULL,"lRNAi_bTS", paste("VI:",df.info[,lRNAi_bTS],sep=''))
set(df.info,NULL,"lRNAi_aTS", paste(":",df.info[,lRNAi_aTS]," ",sep=''))
set(df.info,NULL,"PosCD4_T1", paste("CD4:",df.info[,PosCD4_T1],sep=''))
set(df.info,NULL,"CD4_T1", paste(":",df.info[,CD4_T1],sep=''))
set(df.info,NULL,"CD4_bTS", paste(":",df.info[,CD4_bTS],sep=''))
set(df.info,NULL,"CD4_TS", paste(":",df.info[,CD4_TS],sep=''))
set(df.info,NULL,"CD4_aTS", paste(":",df.info[,CD4_aTS]," ",sep=''))
#
#get df.info into order of tips
#
setkey(df.info, FASTASampleCode)
df.info <- df.info[ph$tip.label,]
#select text and col matrix
text <- t( as.matrix( df.info[,select, with=0] ) )
colnames(text) <- ph$tip.label
col <- t( as.matrix( df.info[,paste(select,".col",sep=''),with=0] ) )
colnames(col) <- ph$tip.label
ans<- list(text=text, col=col)
ans
}
######################################################################################
#prepare standard format of tip labels -- requires df.info to be sorted along the tips as they appear in a phylogeny
hivc.clu.get.tiplabels.v1<- function(ph, df.info)
{
require(colorspace)
require(RColorBrewer)
#set colors CountryInfection
tmp <- rep("transparent",nrow(df.info))
df.info[,CountryInfection.col:=tmp]
set(df.info, which(df.info[,CountryInfection=="NL"]), "CountryInfection.col", "#EF9708")
#set colors Patient
tmp <- unique( df.info[,Patient] )
tmp2 <- diverge_hcl(length(tmp), h = c(246, 40), c = 96, l = c(85, 90))
tmp <- data.table(Patient=tmp, Patient.col=tmp2, key="Patient")
df.info <- merge( df.info, tmp, all.x=1, by="Patient" )
#set colors Sex
tmp <- rep("transparent",nrow(df.info))
df.info[,Sex.col:=tmp]
set(df.info, which(df.info[,Sex=="M"]), "Sex.col", "#EF9708")
#set colors MSM or BI
tmp <- rep("transparent",nrow(df.info))
df.info[,Trm.col:=tmp]
set(df.info, which(df.info[,Trm%in%c("MSM","BI")]), "Trm.col", "#EF9708")
#set colors RegionHospital
tmp <- levels( df.info[,RegionHospital] )
tmp2 <- brewer.pal(length(tmp), "Dark2")
tmp <- data.table(RegionHospital=tmp, RegionHospital.col=tmp2, key="RegionHospital")
df.info <- merge( df.info, tmp, all.x=1, by="RegionHospital" )
#set colors time
tmp <- range( range(df.info[, NegT],na.rm=1), range(df.info[, AnyPos_T1],na.rm=1) )
tmp <- as.POSIXlt( seq.Date(tmp[1],tmp[2]+365,by="years") )$year
tmp2 <- heat_hcl(length(tmp), h = c(0, -100), l = c(75, 40), c = c(40, 80), power = 1)
yearcols <- data.table(Year=tmp, Year.col=tmp2)
#set colors PosSeqT
tmp <- data.table(PosSeqT= unique( df.info[,PosSeqT] ), key="PosSeqT" )
tmp2 <- tmp[, as.POSIXlt(PosSeqT)$year]
tmp[,"Year":=tmp2]
tmp <- subset( merge(tmp, yearcols, all.x=1, by="Year"), select=c(PosSeqT,Year.col))
setnames(tmp,"Year.col","PosSeqT.col")
df.info <- merge( df.info, tmp, all.x=1, by="PosSeqT" )
#set colors NegT
tmp <- data.table(NegT= unique( df.info[,NegT] ), key="NegT" )
tmp2 <- tmp[, as.POSIXlt(NegT)$year]
tmp[,"Year":=tmp2]
tmp <- subset( merge(tmp, yearcols, all.x=1, by="Year"), select=c(NegT,Year.col))
setnames(tmp,"Year.col","NegT.col")
df.info <- merge( df.info, tmp, all.x=1, by="NegT" )
#set colors AnyPos_T1
tmp <- data.table(AnyPos_T1= unique( df.info[,AnyPos_T1] ), key="AnyPos_T1" )
tmp2 <- tmp[, as.POSIXlt(AnyPos_T1)$year]
tmp[,"Year":=tmp2]
tmp <- subset( merge(tmp, yearcols, all.x=1, by="Year"), select=c(AnyPos_T1,Year.col))
setnames(tmp,"Year.col","AnyPos_T1.col")
df.info <- merge( df.info, tmp, all.x=1, by="AnyPos_T1" )
#convert time to string
set(df.info,NULL,"AnyPos_T1",substr(as.character( df.info[,AnyPos_T1] ),1,7))
set(df.info,NULL,"NegT",substr(as.character( df.info[,NegT] ),1,7))
set(df.info,NULL,"PosSeqT",substr(as.character( df.info[,PosSeqT] ),1,7))
#handle missing entries
setkey(df.info, Patient)
tmp <- which(is.na(df.info[,Patient]))
set(df.info, tmp, "Patient", '')
set(df.info, tmp, "Patient.col", "transparent")
tmp <- which(is.na(df.info[,RegionHospital]))
set(df.info, tmp, "RegionHospital", '-')
set(df.info, tmp, "RegionHospital.col", "transparent")
tmp <- which(is.na(df.info[,CountryInfection]))
set(df.info, tmp, "CountryInfection", "--")
set(df.info, tmp, "CountryInfection.col", "transparent")
tmp <- which(is.na(df.info[,Trm]))
set(df.info, tmp, "Trm", '')
set(df.info, tmp, "Trm.col", "transparent")
tmp <- which(is.na(df.info[,Sex]))
set(df.info, tmp, "Sex", '')
set(df.info, tmp, "Sex.col", "transparent")
tmp <- which(is.na(df.info[,AnyPos_T1]))
set(df.info, tmp, "AnyPos_T1", "-------")
set(df.info, NULL, "AnyPos_T1", paste("HIV+:",df.info[,AnyPos_T1],sep=''))
set(df.info, tmp, "AnyPos_T1.col", "transparent")
tmp <- which(is.na(df.info[,NegT]))
set(df.info, tmp, "NegT", "-------")
set(df.info, NULL, "NegT", paste("HIV-:",df.info[,NegT],sep=''))
set(df.info, tmp, "NegT.col", "transparent")
tmp <- which(is.na(df.info[,PosSeqT]))
set(df.info, tmp, "PosSeqT", "-------")
set(df.info, NULL, "PosSeqT", paste("HIVS:",df.info[,PosSeqT],sep=''))
set(df.info, tmp, "PosSeqT.col", "transparent")
#get df.info into order of tips
setkey(df.info, FASTASampleCode)
df.info <- df.info[ph$tip.label,]
#select text and col matrix
text <- t( as.matrix( subset(df.info,select=c(CountryInfection, Trm, Sex, NegT, AnyPos_T1, PosSeqT, Patient, RegionHospital)) ) )
colnames(text) <- ph$tip.label
col <- t( as.matrix( subset(df.info,select=c(CountryInfection.col, Trm.col, Sex.col, NegT.col, AnyPos_T1.col, PosSeqT.col, Patient.col, RegionHospital.col)) ) )
colnames(col) <- ph$tip.label
ans<- list(text=text, col=col)
ans
}
######################################################################################
hivc.clu.plot.tiplabels<- function (tip, text, col, xx=NULL, adj = c(-0.05, 0.5), cex=1, add.xinch= 0.03, add.yinch= 0.02)
{
lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
if(is.null(xx))
xx <- lastPP$xx[tip]
yy <- lastPP$yy[tip]
if(length(tip)==1)
{
#assume vector of text and col of same length
if(!is.vector(text) || !is.vector(col)) stop("expect vector text and vector col")
if(length(text)!=length(col)) stop("expect same length of text and col")
wh <- sapply(text, function(x){ c(xinch(strwidth(x, units = "inches", cex = cex)), yinch(strheight(x, units = "inches", cex = cex))) })
wh.total <- c(sum(wh[1,]),max(wh[2,]))
coord <- matrix(NA,6,length(text),dimnames=list(c("xl","xr","yb","yt","xx","yy"),c()))
coord["xl",] <- xx - wh.total[1] * adj[1] - xinch(add.xinch)
tmp <- cumsum(wh[1,]) + xinch(add.xinch) / ncol(wh)
coord["xl",] <- coord["xl",] + c(0, tmp[-ncol(coord)])
coord["xr",] <- coord["xl",] + wh[1,] + xinch(add.xinch)/ ncol(wh)
coord["yb",] <- yy - wh.total[2] * adj[2] - yinch(add.yinch)
coord["yt",] <- coord["yb",] + wh.total[2] + yinch(add.yinch)
coord["xx",] <- coord["xl",] + c( diff( coord[1,] ), diff(coord[1:2,ncol(wh)]) ) / 2
coord["yy",] <- rep(yy, ncol(coord))
#print(wh); print(wh.total); print(coord)
}
else
{
#assume matrix of text and col of same ncol
if(!is.matrix(text) || !is.matrix(col)) stop("expect matrix text and vector col")
if(ncol(text)!=ncol(col) || nrow(text)!=nrow(col)) stop("expect same dimensions of text and col")
if(ncol(text)!=length(tip)) stop("expect cols of text and col to correspond to tips")
coord <- lapply( seq_along(xx),function(i)
{
wh <- sapply(text[,i], function(x){ c(xinch(strwidth(x, units = "inches", cex = cex)), yinch(strheight(x, units = "inches", cex = cex))) })
wh.total <- c(sum(wh[1,]),max(wh[2,]))
coord <- matrix(NA,6,nrow(text),dimnames=list(c("xl","xr","yb","yt","xx","yy"),c()))
coord["xl",] <- xx[i] - wh.total[1] * adj[1] - xinch(add.xinch)
tmp <- cumsum(wh[1,]) + xinch(add.xinch) / ncol(wh)
coord["xl",] <- coord["xl",] + c(0, tmp[-ncol(coord)])
coord["xr",] <- coord["xl",] + wh[1,] + xinch(add.xinch)/ ncol(wh)
coord["yb",] <- yy[i] - wh.total[2] * adj[2] - yinch(add.yinch)
coord["yt",] <- coord["yb",] + wh.total[2] + yinch(add.yinch)
coord["xx",] <- coord["xl",] + c( diff( coord[1,] ), diff(coord[1:2,ncol(wh)]) ) / 2
coord["yy",] <- rep(yy[i], ncol(coord))
coord
})
coord <- do.call("cbind",coord)
text <- as.vector(text)
col <- as.vector(col)
#print(coords); print(text); print(col)
}
rect(coord["xl",], coord["yb",], coord["xr",], coord["yt",], col = col, border=NA)
text(coord["xx",], coord["yy",], text, cex=cex)
}
######################################################################################
hivc.clu.plot<- function( ph, clu, edge.col.basic="black", show.node.label= T, show.tip.label=F, file=NULL,
highlight.edge.of.tiplabel=NULL, highlight.edge.of.tiplabel.col="red",
highlight.cluster=NULL, highlight.cluster.col="red",
pdf.scaley=10, pdf.width= 7, pdf.height=pdf.scaley*7, pdf.off=1, pdf.xlim=NULL,
cex.nodelabel=0.5, cex.edge.incluster=1, cex.edge.outcluster= cex.edge.incluster/3, no.margin=T, ...)
{
require(colorspace)
clu.edge <- clu[ ph$edge[,1] ]
clu.edge <- clu.edge+1 #set col index for non-clustering edges to 1
clu.edge[is.na(clu.edge)] <- 1
#cols.n <- length(unique(clu.edge))-1
#cols <- c("black",rainbow_hcl(cols.n, start = 30, end = 300))
clu.edge.col <- rep(edge.col.basic, nrow(ph$edge)) #cols[clu.edge]
clu.edge.col[clu.edge==1] <- "grey50"
if(!is.null(highlight.cluster))
{
if(length(highlight.cluster.col)==1)
highlight.cluster.col<- rep(highlight.cluster.col,length(highlight.cluster))
for(i in seq_along(highlight.cluster))
clu.edge.col[clu.edge%in%(highlight.cluster[[i]]+1)]<- highlight.cluster.col[i]
}
if(!is.null(highlight.edge.of.tiplabel))
{
highlight.edge<- lapply(highlight.edge.of.tiplabel, function(x) which( ph$edge[,2]%in%which( substr(ph$tip.label, 1, nchar(x))==x ) ) )
if(length(highlight.edge.of.tiplabel.col)==1)
highlight.edge.of.tiplabel.col <- rep(highlight.edge, length(highlight.edge.of.tiplabel.col) )
for(i in seq_along(highlight.edge))
clu.edge.col[highlight.edge[[i]]] <- highlight.edge.of.tiplabel.col[i]
}
clu.edge.width <- rep(cex.edge.outcluster, length(clu.edge))
clu.edge.width[clu.edge!=1] <- cex.edge.incluster
clu.edge.lty <- rep(1, length(clu.edge))
clu.edge.lty[clu.edge!=1] <- 1
if(class(file)=="character")
pdf(file,width=pdf.width,height=pdf.height)
if(no.margin) par(mar=c(0,0,0,0))
plot.coordinates <- plot(ph, show.tip.label=show.tip.label, show.node.label=show.node.label, cex=cex.nodelabel, edge.color=clu.edge.col, edge.width=clu.edge.width, edge.lty=clu.edge.lty, x.lim=pdf.xlim, ...)
if(class(file)=="character" && pdf.off)
dev.off()
plot.coordinates
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.